diff --git a/make/hotspot/gensrc/GensrcAdlc.gmk b/make/hotspot/gensrc/GensrcAdlc.gmk
index bb9721c8e..3774dd730 100644
--- a/make/hotspot/gensrc/GensrcAdlc.gmk
+++ b/make/hotspot/gensrc/GensrcAdlc.gmk
@@ -140,6 +140,12 @@ ifeq ($(call check-jvm-feature, compiler2), true)
       $d/os_cpu/$(HOTSPOT_TARGET_OS)_$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_OS)_$(HOTSPOT_TARGET_CPU_ARCH).ad \
     )))
+  ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
+    AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
+      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_sve.ad \
+      )))
+  endif
+
   ifeq ($(call check-jvm-feature, zgc), true)
     AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
       $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/gc/z/z_$(HOTSPOT_TARGET_CPU).ad \
     )))
diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 617b2b8fb..eab0101b0 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -69,7 +69,7 @@ register %{
 //
 // r0-r7,r10-r26 volatile (caller save)
 // r27-r32 system (no save, no allocate)
-// r8-r9 invisible to the allocator (so we can use them as scratch regs)
+// r8-r9 non-allocatable (so we can use them as scratch regs)
 //
 // as regards Java usage. we don't use any callee save registers
 // because this makes it difficult to de-optimise a frame (see comment
@@ -94,6 +94,10 @@ reg_def R6 ( SOC, SOC, Op_RegI, 6, r6->as_VMReg() );
 reg_def R6_H ( SOC, SOC, Op_RegI, 6, r6->as_VMReg()->next() );
 reg_def R7 ( SOC, SOC, Op_RegI, 7, r7->as_VMReg() );
 reg_def R7_H ( SOC, SOC, Op_RegI, 7, r7->as_VMReg()->next() );
+reg_def R8 ( NS, SOC, Op_RegI, 8, r8->as_VMReg() ); // rscratch1, non-allocatable
+reg_def R8_H ( NS, SOC, Op_RegI, 8, r8->as_VMReg()->next() );
+reg_def R9 ( NS, SOC, Op_RegI, 9, r9->as_VMReg() ); // rscratch2, non-allocatable
+reg_def R9_H ( NS, SOC, Op_RegI, 9, r9->as_VMReg()->next() );
 reg_def R10 ( SOC, SOC, Op_RegI, 10, r10->as_VMReg() );
 reg_def R10_H ( SOC, SOC, Op_RegI, 10, r10->as_VMReg()->next());
 reg_def R11 ( SOC, SOC, Op_RegI, 11, r11->as_VMReg() );
@@ -140,7 +144,7 @@ reg_def R31 ( NS, NS, Op_RegI, 31, r31_sp->as_VMReg() ); // sp
 reg_def R31_H ( NS, NS, Op_RegI, 31, r31_sp->as_VMReg()->next());
 
 // ----------------------------
-// Float/Double Registers
+// Float/Double/Vector Registers
 // ----------------------------
 
 // Double Registers
@@ -161,165 +165,316 @@ reg_def R31_H ( NS, NS, Op_RegI, 31, r31_sp->as_VMReg()->next());
 // the platform ABI treats v8-v15 as callee save).
float registers // v16-v31 are SOC as per the platform spec - reg_def V0 ( SOC, SOC, Op_RegF, 0, v0->as_VMReg() ); - reg_def V0_H ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next() ); - reg_def V0_J ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(2) ); - reg_def V0_K ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(3) ); - - reg_def V1 ( SOC, SOC, Op_RegF, 1, v1->as_VMReg() ); - reg_def V1_H ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next() ); - reg_def V1_J ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(2) ); - reg_def V1_K ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(3) ); - - reg_def V2 ( SOC, SOC, Op_RegF, 2, v2->as_VMReg() ); - reg_def V2_H ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next() ); - reg_def V2_J ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(2) ); - reg_def V2_K ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(3) ); - - reg_def V3 ( SOC, SOC, Op_RegF, 3, v3->as_VMReg() ); - reg_def V3_H ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next() ); - reg_def V3_J ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(2) ); - reg_def V3_K ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(3) ); - - reg_def V4 ( SOC, SOC, Op_RegF, 4, v4->as_VMReg() ); - reg_def V4_H ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next() ); - reg_def V4_J ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(2) ); - reg_def V4_K ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(3) ); - - reg_def V5 ( SOC, SOC, Op_RegF, 5, v5->as_VMReg() ); - reg_def V5_H ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next() ); - reg_def V5_J ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(2) ); - reg_def V5_K ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(3) ); - - reg_def V6 ( SOC, SOC, Op_RegF, 6, v6->as_VMReg() ); - reg_def V6_H ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next() ); - reg_def V6_J ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(2) ); - reg_def V6_K ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(3) ); - - reg_def V7 ( SOC, SOC, Op_RegF, 7, v7->as_VMReg() ); - reg_def V7_H ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next() ); - reg_def V7_J ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(2) ); - reg_def V7_K ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(3) ); - - reg_def V8 ( SOC, SOC, Op_RegF, 8, v8->as_VMReg() ); - reg_def V8_H ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next() ); - reg_def V8_J ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(2) ); - reg_def V8_K ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(3) ); - - reg_def V9 ( SOC, SOC, Op_RegF, 9, v9->as_VMReg() ); - reg_def V9_H ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next() ); - reg_def V9_J ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(2) ); - reg_def V9_K ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(3) ); - - reg_def V10 ( SOC, SOC, Op_RegF, 10, v10->as_VMReg() ); - reg_def V10_H( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next() ); - reg_def V10_J( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(2)); - reg_def V10_K( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(3)); - - reg_def V11 ( SOC, SOC, Op_RegF, 11, v11->as_VMReg() ); - reg_def V11_H( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next() ); - reg_def V11_J( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(2)); - reg_def V11_K( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(3)); - - reg_def V12 ( SOC, SOC, Op_RegF, 12, v12->as_VMReg() ); - reg_def V12_H( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next() ); - reg_def V12_J( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(2)); - reg_def V12_K( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(3)); - - reg_def V13 ( SOC, SOC, Op_RegF, 13, v13->as_VMReg() ); - reg_def V13_H( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next() ); - reg_def V13_J( SOC, SOC, Op_RegF, 13, 
v13->as_VMReg()->next(2)); - reg_def V13_K( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(3)); - - reg_def V14 ( SOC, SOC, Op_RegF, 14, v14->as_VMReg() ); - reg_def V14_H( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next() ); - reg_def V14_J( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(2)); - reg_def V14_K( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(3)); - - reg_def V15 ( SOC, SOC, Op_RegF, 15, v15->as_VMReg() ); - reg_def V15_H( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next() ); - reg_def V15_J( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(2)); - reg_def V15_K( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(3)); - - reg_def V16 ( SOC, SOC, Op_RegF, 16, v16->as_VMReg() ); - reg_def V16_H( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next() ); - reg_def V16_J( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(2)); - reg_def V16_K( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(3)); - - reg_def V17 ( SOC, SOC, Op_RegF, 17, v17->as_VMReg() ); - reg_def V17_H( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next() ); - reg_def V17_J( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(2)); - reg_def V17_K( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(3)); - - reg_def V18 ( SOC, SOC, Op_RegF, 18, v18->as_VMReg() ); - reg_def V18_H( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next() ); - reg_def V18_J( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(2)); - reg_def V18_K( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(3)); - - reg_def V19 ( SOC, SOC, Op_RegF, 19, v19->as_VMReg() ); - reg_def V19_H( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next() ); - reg_def V19_J( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(2)); - reg_def V19_K( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(3)); - - reg_def V20 ( SOC, SOC, Op_RegF, 20, v20->as_VMReg() ); - reg_def V20_H( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next() ); - reg_def V20_J( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(2)); - reg_def V20_K( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(3)); - - reg_def V21 ( SOC, SOC, Op_RegF, 21, v21->as_VMReg() ); - reg_def V21_H( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next() ); - reg_def V21_J( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(2)); - reg_def V21_K( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(3)); - - reg_def V22 ( SOC, SOC, Op_RegF, 22, v22->as_VMReg() ); - reg_def V22_H( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next() ); - reg_def V22_J( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(2)); - reg_def V22_K( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(3)); - - reg_def V23 ( SOC, SOC, Op_RegF, 23, v23->as_VMReg() ); - reg_def V23_H( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next() ); - reg_def V23_J( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(2)); - reg_def V23_K( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(3)); - - reg_def V24 ( SOC, SOC, Op_RegF, 24, v24->as_VMReg() ); - reg_def V24_H( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next() ); - reg_def V24_J( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(2)); - reg_def V24_K( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(3)); - - reg_def V25 ( SOC, SOC, Op_RegF, 25, v25->as_VMReg() ); - reg_def V25_H( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next() ); - reg_def V25_J( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(2)); - reg_def V25_K( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(3)); - - reg_def V26 ( SOC, SOC, Op_RegF, 26, v26->as_VMReg() ); - reg_def V26_H( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next() ); - reg_def V26_J( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(2)); - reg_def V26_K( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(3)); - - reg_def V27 ( SOC, SOC, Op_RegF, 27, v27->as_VMReg() ); - 
reg_def V27_H( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next() ); - reg_def V27_J( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(2)); - reg_def V27_K( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(3)); - - reg_def V28 ( SOC, SOC, Op_RegF, 28, v28->as_VMReg() ); - reg_def V28_H( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next() ); - reg_def V28_J( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(2)); - reg_def V28_K( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(3)); - - reg_def V29 ( SOC, SOC, Op_RegF, 29, v29->as_VMReg() ); - reg_def V29_H( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next() ); - reg_def V29_J( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(2)); - reg_def V29_K( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(3)); - - reg_def V30 ( SOC, SOC, Op_RegF, 30, v30->as_VMReg() ); - reg_def V30_H( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next() ); - reg_def V30_J( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(2)); - reg_def V30_K( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(3)); - - reg_def V31 ( SOC, SOC, Op_RegF, 31, v31->as_VMReg() ); - reg_def V31_H( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next() ); - reg_def V31_J( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(2)); - reg_def V31_K( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(3)); +// For SVE vector registers, we simply extend vector register size to 8 +// 'logical' slots. This is nominally 256 bits but it actually covers +// all possible 'physical' SVE vector register lengths from 128 ~ 2048 +// bits. The 'physical' SVE vector register length is detected during +// startup, so the register allocator is able to identify the correct +// number of bytes needed for an SVE spill/unspill. +// Note that a vector register with 4 slots denotes a 128-bit NEON +// register allowing it to be distinguished from the corresponding SVE +// vector register when the SVE vector length is 128 bits. 
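A rough standalone illustration of the comment above (an editorial sketch, not part of the patch and independent of HotSpot's own types): the allocator always accounts eight 32-bit slots per vector register once SVE is in play, while spill/unspill uses the physical length detected at startup, so the same bookkeeping holds for every legal SVE vector length.

    // Sketch only: logical allocator slots vs. physical spill size.
    #include <cstdio>
    int main() {
      const int logical_slots = 8;                 // nominal 256 bits per V register
      for (int sve_bits = 128; sve_bits <= 2048; sve_bits *= 2) {
        std::printf("SVE VL %4d bits: spill/unspill %3d bytes, allocator slots %d\n",
                    sve_bits, sve_bits / 8, logical_slots);
      }
      return 0;
    }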
+ + reg_def V0 ( SOC, SOC, Op_RegF, 0, v0->as_VMReg() ); + reg_def V0_H ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next() ); + reg_def V0_J ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(2) ); + reg_def V0_K ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(3) ); + reg_def V0_L ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(4) ); + reg_def V0_M ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(5) ); + reg_def V0_N ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(6) ); + reg_def V0_O ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(7) ); + + reg_def V1 ( SOC, SOC, Op_RegF, 1, v1->as_VMReg() ); + reg_def V1_H ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next() ); + reg_def V1_J ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(2) ); + reg_def V1_K ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(3) ); + reg_def V1_L ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(4) ); + reg_def V1_M ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(5) ); + reg_def V1_N ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(6) ); + reg_def V1_O ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(7) ); + + reg_def V2 ( SOC, SOC, Op_RegF, 2, v2->as_VMReg() ); + reg_def V2_H ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next() ); + reg_def V2_J ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(2) ); + reg_def V2_K ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(3) ); + reg_def V2_L ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(4) ); + reg_def V2_M ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(5) ); + reg_def V2_N ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(6) ); + reg_def V2_O ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(7) ); + + reg_def V3 ( SOC, SOC, Op_RegF, 3, v3->as_VMReg() ); + reg_def V3_H ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next() ); + reg_def V3_J ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(2) ); + reg_def V3_K ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(3) ); + reg_def V3_L ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(4) ); + reg_def V3_M ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(5) ); + reg_def V3_N ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(6) ); + reg_def V3_O ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(7) ); + + reg_def V4 ( SOC, SOC, Op_RegF, 4, v4->as_VMReg() ); + reg_def V4_H ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next() ); + reg_def V4_J ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(2) ); + reg_def V4_K ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(3) ); + reg_def V4_L ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(4) ); + reg_def V4_M ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(5) ); + reg_def V4_N ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(6) ); + reg_def V4_O ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(7) ); + + reg_def V5 ( SOC, SOC, Op_RegF, 5, v5->as_VMReg() ); + reg_def V5_H ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next() ); + reg_def V5_J ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(2) ); + reg_def V5_K ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(3) ); + reg_def V5_L ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(4) ); + reg_def V5_M ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(5) ); + reg_def V5_N ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(6) ); + reg_def V5_O ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(7) ); + + reg_def V6 ( SOC, SOC, Op_RegF, 6, v6->as_VMReg() ); + reg_def V6_H ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next() ); + reg_def V6_J ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(2) ); + reg_def V6_K ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(3) ); + reg_def V6_L ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(4) ); + reg_def V6_M ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(5) ); + reg_def V6_N ( SOC, SOC, Op_RegF, 6, 
v6->as_VMReg()->next(6) ); + reg_def V6_O ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(7) ); + + reg_def V7 ( SOC, SOC, Op_RegF, 7, v7->as_VMReg() ); + reg_def V7_H ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next() ); + reg_def V7_J ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(2) ); + reg_def V7_K ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(3) ); + reg_def V7_L ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(4) ); + reg_def V7_M ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(5) ); + reg_def V7_N ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(6) ); + reg_def V7_O ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(7) ); + + reg_def V8 ( SOC, SOC, Op_RegF, 8, v8->as_VMReg() ); + reg_def V8_H ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next() ); + reg_def V8_J ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(2) ); + reg_def V8_K ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(3) ); + reg_def V8_L ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(4) ); + reg_def V8_M ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(5) ); + reg_def V8_N ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(6) ); + reg_def V8_O ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(7) ); + + reg_def V9 ( SOC, SOC, Op_RegF, 9, v9->as_VMReg() ); + reg_def V9_H ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next() ); + reg_def V9_J ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(2) ); + reg_def V9_K ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(3) ); + reg_def V9_L ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(4) ); + reg_def V9_M ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(5) ); + reg_def V9_N ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(6) ); + reg_def V9_O ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(7) ); + + reg_def V10 ( SOC, SOC, Op_RegF, 10, v10->as_VMReg() ); + reg_def V10_H ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next() ); + reg_def V10_J ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(2) ); + reg_def V10_K ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(3) ); + reg_def V10_L ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(4) ); + reg_def V10_M ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(5) ); + reg_def V10_N ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(6) ); + reg_def V10_O ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(7) ); + + reg_def V11 ( SOC, SOC, Op_RegF, 11, v11->as_VMReg() ); + reg_def V11_H ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next() ); + reg_def V11_J ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(2) ); + reg_def V11_K ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(3) ); + reg_def V11_L ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(4) ); + reg_def V11_M ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(5) ); + reg_def V11_N ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(6) ); + reg_def V11_O ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(7) ); + + reg_def V12 ( SOC, SOC, Op_RegF, 12, v12->as_VMReg() ); + reg_def V12_H ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next() ); + reg_def V12_J ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(2) ); + reg_def V12_K ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(3) ); + reg_def V12_L ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(4) ); + reg_def V12_M ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(5) ); + reg_def V12_N ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(6) ); + reg_def V12_O ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(7) ); + + reg_def V13 ( SOC, SOC, Op_RegF, 13, v13->as_VMReg() ); + reg_def V13_H ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next() ); + reg_def V13_J ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(2) ); + reg_def V13_K ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(3) ); + reg_def V13_L 
( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(4) ); + reg_def V13_M ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(5) ); + reg_def V13_N ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(6) ); + reg_def V13_O ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(7) ); + + reg_def V14 ( SOC, SOC, Op_RegF, 14, v14->as_VMReg() ); + reg_def V14_H ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next() ); + reg_def V14_J ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(2) ); + reg_def V14_K ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(3) ); + reg_def V14_L ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(4) ); + reg_def V14_M ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(5) ); + reg_def V14_N ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(6) ); + reg_def V14_O ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(7) ); + + reg_def V15 ( SOC, SOC, Op_RegF, 15, v15->as_VMReg() ); + reg_def V15_H ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next() ); + reg_def V15_J ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(2) ); + reg_def V15_K ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(3) ); + reg_def V15_L ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(4) ); + reg_def V15_M ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(5) ); + reg_def V15_N ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(6) ); + reg_def V15_O ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(7) ); + + reg_def V16 ( SOC, SOC, Op_RegF, 16, v16->as_VMReg() ); + reg_def V16_H ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next() ); + reg_def V16_J ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(2) ); + reg_def V16_K ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(3) ); + reg_def V16_L ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(4) ); + reg_def V16_M ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(5) ); + reg_def V16_N ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(6) ); + reg_def V16_O ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(7) ); + + reg_def V17 ( SOC, SOC, Op_RegF, 17, v17->as_VMReg() ); + reg_def V17_H ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next() ); + reg_def V17_J ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(2) ); + reg_def V17_K ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(3) ); + reg_def V17_L ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(4) ); + reg_def V17_M ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(5) ); + reg_def V17_N ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(6) ); + reg_def V17_O ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(7) ); + + reg_def V18 ( SOC, SOC, Op_RegF, 18, v18->as_VMReg() ); + reg_def V18_H ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next() ); + reg_def V18_J ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(2) ); + reg_def V18_K ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(3) ); + reg_def V18_L ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(4) ); + reg_def V18_M ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(5) ); + reg_def V18_N ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(6) ); + reg_def V18_O ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(7) ); + + reg_def V19 ( SOC, SOC, Op_RegF, 19, v19->as_VMReg() ); + reg_def V19_H ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next() ); + reg_def V19_J ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(2) ); + reg_def V19_K ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(3) ); + reg_def V19_L ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(4) ); + reg_def V19_M ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(5) ); + reg_def V19_N ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(6) ); + reg_def V19_O ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(7) ); + + reg_def V20 ( SOC, SOC, Op_RegF, 20, 
v20->as_VMReg() ); + reg_def V20_H ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next() ); + reg_def V20_J ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(2) ); + reg_def V20_K ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(3) ); + reg_def V20_L ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(4) ); + reg_def V20_M ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(5) ); + reg_def V20_N ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(6) ); + reg_def V20_O ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(7) ); + + reg_def V21 ( SOC, SOC, Op_RegF, 21, v21->as_VMReg() ); + reg_def V21_H ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next() ); + reg_def V21_J ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(2) ); + reg_def V21_K ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(3) ); + reg_def V21_L ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(4) ); + reg_def V21_M ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(5) ); + reg_def V21_N ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(6) ); + reg_def V21_O ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(7) ); + + reg_def V22 ( SOC, SOC, Op_RegF, 22, v22->as_VMReg() ); + reg_def V22_H ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next() ); + reg_def V22_J ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(2) ); + reg_def V22_K ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(3) ); + reg_def V22_L ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(4) ); + reg_def V22_M ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(5) ); + reg_def V22_N ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(6) ); + reg_def V22_O ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(7) ); + + reg_def V23 ( SOC, SOC, Op_RegF, 23, v23->as_VMReg() ); + reg_def V23_H ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next() ); + reg_def V23_J ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(2) ); + reg_def V23_K ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(3) ); + reg_def V23_L ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(4) ); + reg_def V23_M ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(5) ); + reg_def V23_N ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(6) ); + reg_def V23_O ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(7) ); + + reg_def V24 ( SOC, SOC, Op_RegF, 24, v24->as_VMReg() ); + reg_def V24_H ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next() ); + reg_def V24_J ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(2) ); + reg_def V24_K ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(3) ); + reg_def V24_L ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(4) ); + reg_def V24_M ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(5) ); + reg_def V24_N ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(6) ); + reg_def V24_O ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(7) ); + + reg_def V25 ( SOC, SOC, Op_RegF, 25, v25->as_VMReg() ); + reg_def V25_H ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next() ); + reg_def V25_J ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(2) ); + reg_def V25_K ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(3) ); + reg_def V25_L ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(4) ); + reg_def V25_M ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(5) ); + reg_def V25_N ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(6) ); + reg_def V25_O ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(7) ); + + reg_def V26 ( SOC, SOC, Op_RegF, 26, v26->as_VMReg() ); + reg_def V26_H ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next() ); + reg_def V26_J ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(2) ); + reg_def V26_K ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(3) ); + reg_def V26_L ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(4) ); + 
reg_def V26_M ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(5) ); + reg_def V26_N ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(6) ); + reg_def V26_O ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(7) ); + + reg_def V27 ( SOC, SOC, Op_RegF, 27, v27->as_VMReg() ); + reg_def V27_H ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next() ); + reg_def V27_J ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(2) ); + reg_def V27_K ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(3) ); + reg_def V27_L ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(4) ); + reg_def V27_M ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(5) ); + reg_def V27_N ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(6) ); + reg_def V27_O ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(7) ); + + reg_def V28 ( SOC, SOC, Op_RegF, 28, v28->as_VMReg() ); + reg_def V28_H ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next() ); + reg_def V28_J ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(2) ); + reg_def V28_K ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(3) ); + reg_def V28_L ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(4) ); + reg_def V28_M ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(5) ); + reg_def V28_N ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(6) ); + reg_def V28_O ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(7) ); + + reg_def V29 ( SOC, SOC, Op_RegF, 29, v29->as_VMReg() ); + reg_def V29_H ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next() ); + reg_def V29_J ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(2) ); + reg_def V29_K ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(3) ); + reg_def V29_L ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(4) ); + reg_def V29_M ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(5) ); + reg_def V29_N ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(6) ); + reg_def V29_O ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(7) ); + + reg_def V30 ( SOC, SOC, Op_RegF, 30, v30->as_VMReg() ); + reg_def V30_H ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next() ); + reg_def V30_J ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(2) ); + reg_def V30_K ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(3) ); + reg_def V30_L ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(4) ); + reg_def V30_M ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(5) ); + reg_def V30_N ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(6) ); + reg_def V30_O ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(7) ); + + reg_def V31 ( SOC, SOC, Op_RegF, 31, v31->as_VMReg() ); + reg_def V31_H ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next() ); + reg_def V31_J ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(2) ); + reg_def V31_K ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(3) ); + reg_def V31_L ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(4) ); + reg_def V31_M ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(5) ); + reg_def V31_N ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(6) ); + reg_def V31_O ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(7) ); + + +// ---------------------------- +// SVE Predicate Registers +// ---------------------------- + reg_def P0 (SOC, SOC, Op_RegVMask, 0, p0->as_VMReg()); + reg_def P1 (SOC, SOC, Op_RegVMask, 1, p1->as_VMReg()); + reg_def P2 (SOC, SOC, Op_RegVMask, 2, p2->as_VMReg()); + reg_def P3 (SOC, SOC, Op_RegVMask, 3, p3->as_VMReg()); + reg_def P4 (SOC, SOC, Op_RegVMask, 4, p4->as_VMReg()); + reg_def P5 (SOC, SOC, Op_RegVMask, 5, p5->as_VMReg()); + reg_def P6 (SOC, SOC, Op_RegVMask, 6, p6->as_VMReg()); + reg_def P7 (SOC, SOC, Op_RegVMask, 7, p7->as_VMReg()); // ---------------------------- // Special Registers @@ -333,7 +488,6 @@ reg_def 
R31_H ( NS, NS, Op_RegI, 31, r31_sp->as_VMReg()->next()); reg_def RFLAGS(SOC, SOC, 0, 32, VMRegImpl::Bad()); - // Specify priority of register selection within phases of register // allocation. Highest priority is first. A useful heuristic is to // give registers a low priority when they are required by machine @@ -381,50 +535,64 @@ alloc_class chunk0( R29, R29_H, // fp R30, R30_H, // lr R31, R31_H, // sp + R8, R8_H, // rscratch1 + R9, R9_H, // rscratch2 ); alloc_class chunk1( // no save - V16, V16_H, V16_J, V16_K, - V17, V17_H, V17_J, V17_K, - V18, V18_H, V18_J, V18_K, - V19, V19_H, V19_J, V19_K, - V20, V20_H, V20_J, V20_K, - V21, V21_H, V21_J, V21_K, - V22, V22_H, V22_J, V22_K, - V23, V23_H, V23_J, V23_K, - V24, V24_H, V24_J, V24_K, - V25, V25_H, V25_J, V25_K, - V26, V26_H, V26_J, V26_K, - V27, V27_H, V27_J, V27_K, - V28, V28_H, V28_J, V28_K, - V29, V29_H, V29_J, V29_K, - V30, V30_H, V30_J, V30_K, - V31, V31_H, V31_J, V31_K, + V16, V16_H, V16_J, V16_K, V16_L, V16_M, V16_N, V16_O, + V17, V17_H, V17_J, V17_K, V17_L, V17_M, V17_N, V17_O, + V18, V18_H, V18_J, V18_K, V18_L, V18_M, V18_N, V18_O, + V19, V19_H, V19_J, V19_K, V19_L, V19_M, V19_N, V19_O, + V20, V20_H, V20_J, V20_K, V20_L, V20_M, V20_N, V20_O, + V21, V21_H, V21_J, V21_K, V21_L, V21_M, V21_N, V21_O, + V22, V22_H, V22_J, V22_K, V22_L, V22_M, V22_N, V22_O, + V23, V23_H, V23_J, V23_K, V23_L, V23_M, V23_N, V23_O, + V24, V24_H, V24_J, V24_K, V24_L, V24_M, V24_N, V24_O, + V25, V25_H, V25_J, V25_K, V25_L, V25_M, V25_N, V25_O, + V26, V26_H, V26_J, V26_K, V26_L, V26_M, V26_N, V26_O, + V27, V27_H, V27_J, V27_K, V27_L, V27_M, V27_N, V27_O, + V28, V28_H, V28_J, V28_K, V28_L, V28_M, V28_N, V28_O, + V29, V29_H, V29_J, V29_K, V29_L, V29_M, V29_N, V29_O, + V30, V30_H, V30_J, V30_K, V30_L, V30_M, V30_N, V30_O, + V31, V31_H, V31_J, V31_K, V31_L, V31_M, V31_N, V31_O, // arg registers - V0, V0_H, V0_J, V0_K, - V1, V1_H, V1_J, V1_K, - V2, V2_H, V2_J, V2_K, - V3, V3_H, V3_J, V3_K, - V4, V4_H, V4_J, V4_K, - V5, V5_H, V5_J, V5_K, - V6, V6_H, V6_J, V6_K, - V7, V7_H, V7_J, V7_K, + V0, V0_H, V0_J, V0_K, V0_L, V0_M, V0_N, V0_O, + V1, V1_H, V1_J, V1_K, V1_L, V1_M, V1_N, V1_O, + V2, V2_H, V2_J, V2_K, V2_L, V2_M, V2_N, V2_O, + V3, V3_H, V3_J, V3_K, V3_L, V3_M, V3_N, V3_O, + V4, V4_H, V4_J, V4_K, V4_L, V4_M, V4_N, V4_O, + V5, V5_H, V5_J, V5_K, V5_L, V5_M, V5_N, V5_O, + V6, V6_H, V6_J, V6_K, V6_L, V6_M, V6_N, V6_O, + V7, V7_H, V7_J, V7_K, V7_L, V7_M, V7_N, V7_O, // non-volatiles - V8, V8_H, V8_J, V8_K, - V9, V9_H, V9_J, V9_K, - V10, V10_H, V10_J, V10_K, - V11, V11_H, V11_J, V11_K, - V12, V12_H, V12_J, V12_K, - V13, V13_H, V13_J, V13_K, - V14, V14_H, V14_J, V14_K, - V15, V15_H, V15_J, V15_K, + V8, V8_H, V8_J, V8_K, V8_L, V8_M, V8_N, V8_O, + V9, V9_H, V9_J, V9_K, V9_L, V9_M, V9_N, V9_O, + V10, V10_H, V10_J, V10_K, V10_L, V10_M, V10_N, V10_O, + V11, V11_H, V11_J, V11_K, V11_L, V11_M, V11_N, V11_O, + V12, V12_H, V12_J, V12_K, V12_L, V12_M, V12_N, V12_O, + V13, V13_H, V13_J, V13_K, V13_L, V13_M, V13_N, V13_O, + V14, V14_H, V14_J, V14_K, V14_L, V14_M, V14_N, V14_O, + V15, V15_H, V15_J, V15_K, V15_L, V15_M, V15_N, V15_O, +); + +alloc_class chunk2 ( + P0, + P1, + P2, + P3, + P4, + P5, + P6, + P7, + // Only use P0~P7 here for performance ); -alloc_class chunk2(RFLAGS); +alloc_class chunk3(RFLAGS); //----------Architecture Description Register Classes-------------------------- // Several register classes are automatically defined based upon information in @@ -865,6 +1033,42 @@ reg_class double_reg( V31, V31_H ); +// Class for all SVE vector registers. 
+reg_class vectora_reg ( + V0, V0_H, V0_J, V0_K, V0_L, V0_M, V0_N, V0_O, + V1, V1_H, V1_J, V1_K, V1_L, V1_M, V1_N, V1_O, + V2, V2_H, V2_J, V2_K, V2_L, V2_M, V2_N, V2_O, + V3, V3_H, V3_J, V3_K, V3_L, V3_M, V3_N, V3_O, + V4, V4_H, V4_J, V4_K, V4_L, V4_M, V4_N, V4_O, + V5, V5_H, V5_J, V5_K, V5_L, V5_M, V5_N, V5_O, + V6, V6_H, V6_J, V6_K, V6_L, V6_M, V6_N, V6_O, + V7, V7_H, V7_J, V7_K, V7_L, V7_M, V7_N, V7_O, + V8, V8_H, V8_J, V8_K, V8_L, V8_M, V8_N, V8_O, + V9, V9_H, V9_J, V9_K, V9_L, V9_M, V9_N, V9_O, + V10, V10_H, V10_J, V10_K, V10_L, V10_M, V10_N, V10_O, + V11, V11_H, V11_J, V11_K, V11_L, V11_M, V11_N, V11_O, + V12, V12_H, V12_J, V12_K, V12_L, V12_M, V12_N, V12_O, + V13, V13_H, V13_J, V13_K, V13_L, V13_M, V13_N, V13_O, + V14, V14_H, V14_J, V14_K, V14_L, V14_M, V14_N, V14_O, + V15, V15_H, V15_J, V15_K, V15_L, V15_M, V15_N, V15_O, + V16, V16_H, V16_J, V16_K, V16_L, V16_M, V16_N, V16_O, + V17, V17_H, V17_J, V17_K, V17_L, V17_M, V17_N, V17_O, + V18, V18_H, V18_J, V18_K, V18_L, V18_M, V18_N, V18_O, + V19, V19_H, V19_J, V19_K, V19_L, V19_M, V19_N, V19_O, + V20, V20_H, V20_J, V20_K, V20_L, V20_M, V20_N, V20_O, + V21, V21_H, V21_J, V21_K, V21_L, V21_M, V21_N, V21_O, + V22, V22_H, V22_J, V22_K, V22_L, V22_M, V22_N, V22_O, + V23, V23_H, V23_J, V23_K, V23_L, V23_M, V23_N, V23_O, + V24, V24_H, V24_J, V24_K, V24_L, V24_M, V24_N, V24_O, + V25, V25_H, V25_J, V25_K, V25_L, V25_M, V25_N, V25_O, + V26, V26_H, V26_J, V26_K, V26_L, V26_M, V26_N, V26_O, + V27, V27_H, V27_J, V27_K, V27_L, V27_M, V27_N, V27_O, + V28, V28_H, V28_J, V28_K, V28_L, V28_M, V28_N, V28_O, + V29, V29_H, V29_J, V29_K, V29_L, V29_M, V29_N, V29_O, + V30, V30_H, V30_J, V30_K, V30_L, V30_M, V30_N, V30_O, + V31, V31_H, V31_J, V31_K, V31_L, V31_M, V31_N, V31_O, +); + // Class for all 64bit vector registers reg_class vectord_reg( V0, V0_H, @@ -1097,6 +1301,31 @@ reg_class v31_reg( V31, V31_H ); +// Class for all SVE predicate registers. +reg_class pr_reg ( + P0, + P1, + P2, + P3, + P4, + P5, + P6, + // P7, non-allocatable, preserved with all elements preset to TRUE. +); + +// Class for SVE governing predicate registers, which are used +// to determine the active elements of a predicated instruction. +reg_class gov_pr ( + P0, + P1, + P2, + P3, + P4, + P5, + P6, + // P7, non-allocatable, preserved with all elements preset to TRUE. +); + // Singleton class for condition codes reg_class int_flags(RFLAGS); @@ -1758,6 +1987,10 @@ void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { // branch if we need to invalidate the method later __ nop(); + if (UseSVE > 0 && C->max_vector_size() >= 16) { + __ reinitialize_ptrue(); + } + int bangsize = C->bang_size_in_bytes(); if (C->need_stack_bang(bangsize) && UseStackBanging) __ generate_stack_overflow_check(bangsize); @@ -1859,7 +2092,7 @@ int MachEpilogNode::safepoint_offset() const { // Figure out which register class each belongs in: rc_int, rc_float or // rc_stack. 
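The rc_class() hunk just below replaces the hard-coded slot boundaries with per-register-file slot counts and adds an rc_predicate class. A standalone sketch of the resulting slot ranges follows; the counts are read off the reg_def changes above and are assumptions (for example, one slot for each of P0-P7):

    // Sketch only: mirrors the slot-range classification rewritten below.
    #include <cstdio>
    enum RC { rc_bad, rc_int, rc_float, rc_predicate, rc_stack };
    static RC classify(int reg) {
      const int int_slots   = 32 * 2;  // 32 integer registers, 2 slots each
      const int float_slots = 32 * 8;  // 32 vector registers, 8 slots each (V0 .. V31_O)
      const int pred_slots  =  8 * 1;  // P0-P7, 1 slot each (assumed)
      if (reg < 0)                                    return rc_bad;
      if (reg < int_slots)                            return rc_int;
      if (reg < int_slots + float_slots)              return rc_float;
      if (reg < int_slots + float_slots + pred_slots) return rc_predicate;
      return rc_stack;  // flags register and stack slots follow
    }
    int main() {
      std::printf("%d %d %d\n", classify(0), classify(64), classify(320));  // int, float, predicate
      return 0;
    }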
-enum RC { rc_bad, rc_int, rc_float, rc_stack }; +enum RC { rc_bad, rc_int, rc_float, rc_predicate, rc_stack }; static enum RC rc_class(OptoReg::Name reg) { @@ -1867,19 +2100,25 @@ static enum RC rc_class(OptoReg::Name reg) { return rc_bad; } - // we have 30 int registers * 2 halves - // (rscratch1 and rscratch2 are omitted) + // we have 32 int registers * 2 halves + int slots_of_int_registers = RegisterImpl::max_slots_per_register * RegisterImpl::number_of_registers; - if (reg < 60) { + if (reg < slots_of_int_registers) { return rc_int; } - // we have 32 float register * 2 halves - if (reg < 60 + 128) { + // we have 32 float register * 8 halves + int slots_of_float_registers = FloatRegisterImpl::max_slots_per_register * FloatRegisterImpl::number_of_registers; + if (reg < slots_of_int_registers + slots_of_float_registers) { return rc_float; } - // Between float regs & stack is the flags regs. + int slots_of_predicate_registers = PRegisterImpl::max_slots_per_register * PRegisterImpl::number_of_registers; + if (reg < slots_of_int_registers + slots_of_float_registers + slots_of_predicate_registers) { + return rc_predicate; + } + + // Between predicate regs & stack is the flags. assert(OptoReg::is_stack(reg), "blow up if spilling flags"); return rc_stack; @@ -1918,8 +2157,28 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo if (bottom_type()->isa_vect() != NULL) { uint ireg = ideal_reg(); - assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector"); - if (cbuf) { + if (ireg == Op_VecA && cbuf) { + MacroAssembler _masm(cbuf); + int sve_vector_reg_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); + if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { + // stack->stack + __ spill_copy_sve_vector_stack_to_stack(src_offset, dst_offset, + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) { + __ spill_sve_vector(as_FloatRegister(Matcher::_regEncode[src_lo]), ra_->reg2offset(dst_lo), + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) { + __ unspill_sve_vector(as_FloatRegister(Matcher::_regEncode[dst_lo]), ra_->reg2offset(src_lo), + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) { + __ sve_orr(as_FloatRegister(Matcher::_regEncode[dst_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo])); + } else { + ShouldNotReachHere(); + } + } else if (cbuf) { + assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector"); MacroAssembler _masm(cbuf); assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity"); if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { @@ -1937,12 +2196,12 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo as_FloatRegister(Matcher::_regEncode[src_lo])); } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) { __ spill(as_FloatRegister(Matcher::_regEncode[src_lo]), - ireg == Op_VecD ? __ D : __ Q, - ra_->reg2offset(dst_lo)); + ireg == Op_VecD ? __ D : __ Q, + ra_->reg2offset(dst_lo)); } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) { __ unspill(as_FloatRegister(Matcher::_regEncode[dst_lo]), - ireg == Op_VecD ? __ D : __ Q, - ra_->reg2offset(src_lo)); + ireg == Op_VecD ? 
__ D : __ Q, + ra_->reg2offset(src_lo)); } else { ShouldNotReachHere(); } @@ -2027,9 +2286,24 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo st->print("%s", Matcher::regName[dst_lo]); } if (bottom_type()->isa_vect() != NULL) { - st->print("\t# vector spill size = %d", ideal_reg()==Op_VecD ? 64:128); + int vsize = 0; + switch (ideal_reg()) { + case Op_VecD: + vsize = 64; + break; + case Op_VecX: + vsize = 128; + break; + case Op_VecA: + vsize = Matcher::scalable_vector_reg_size(T_BYTE) * 8; + break; + default: + assert(false, "bad register type for spill"); + ShouldNotReachHere(); + } + st->print("\t# vector spill size = %d", vsize); } else { - st->print("\t# spill size = %d", is64 ? 64:32); + st->print("\t# spill size = %d", is64 ? 64 : 32); } } @@ -2188,19 +2462,32 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { - - // TODO - // identify extra cases that we might want to provide match rules for - // e.g. Op_ vector nodes and other intrinsics while guarding with vlen - bool ret_value = match_rule_supported(opcode); - // Add rules here. - - return ret_value; // Per default match rules are supported. + // Identify extra cases that we might want to provide match rules for vector nodes and + // other intrinsics guarded with vector length (vlen) and element type (bt). + const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { + if (!match_rule_supported(opcode) || !vector_size_supported(bt, vlen)) { + return false; + } + int bit_size = vlen * type2aelembytes(bt) * 8; + if (UseSVE == 0 && bit_size > 128) { + return false; + } + if (UseSVE > 0) { + return op_sve_supported(opcode); + } else { // NEON + // Special cases + switch (opcode) { + case Op_MulVL: + return false; + default: + break; + } + } + return true; // Per default match rules are supported. } const bool Matcher::has_predicated_vectors(void) { - return false; + return UseSVE > 0; } const int Matcher::float_pressure(int default_pressure_threshold) { @@ -2236,7 +2523,8 @@ const bool Matcher::convL2FSupported(void) { // Vector width in bytes. const int Matcher::vector_width_in_bytes(BasicType bt) { - int size = MIN2(16,(int)MaxVectorSize); + // The MaxVectorSize should have been set by detecting SVE max vector register size. + int size = MIN2((UseSVE > 0) ? 256 : 16, (int)MaxVectorSize); // Minimum 2 values in vector if (size < 2*type2aelembytes(bt)) size = 0; // But never < 4 @@ -2249,14 +2537,32 @@ const int Matcher::max_vector_size(const BasicType bt) { return vector_width_in_bytes(bt)/type2aelembytes(bt); } const int Matcher::min_vector_size(const BasicType bt) { -// For the moment limit the vector size to 8 bytes + int max_size = max_vector_size(bt); + if ((UseSVE > 0) && (MaxVectorSize >= 16)) { + // Currently vector length less than SVE vector register size is not supported. + return max_size; + } else { + // For the moment limit the vector size to 8 bytes with NEON. int size = 8 / type2aelembytes(bt); if (size < 2) size = 2; return size; + } +} + +const bool Matcher::supports_scalable_vector() { + return UseSVE > 0; +} + +// Actual max scalable vector register length. +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return Matcher::max_vector_size(bt); } // Vector ideal reg. 
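Taken together, the sizing hooks above and the ideal-register selection that follows can be summarised in a small standalone sketch; the concrete values (a 256-bit SVE machine, 4-byte elements) are assumptions for illustration, and MaxVectorSize stands for the byte width detected at startup:

    // Sketch only: NEON vs. SVE vector sizing as implied by the Matcher changes.
    #include <algorithm>
    #include <cstdio>
    int main() {
      const int UseSVE = 1, MaxVectorSize = 32;    // assumed: 256-bit SVE hardware
      const int elem_bytes = 4;                    // e.g. T_INT / T_FLOAT
      int width    = std::min(UseSVE > 0 ? 256 : 16, MaxVectorSize);   // vector_width_in_bytes
      int max_vlen = width / elem_bytes;                               // max_vector_size
      int min_vlen = (UseSVE > 0 && MaxVectorSize >= 16)
                         ? max_vlen                                    // SVE: only full-width vectors for now
                         : std::max(8 / elem_bytes, 2);                // NEON: limited to 8 bytes, >= 2 lanes
      const char* ideal = (UseSVE > 0 && width >= 16 && width <= 256) ? "VecA"
                        : (width == 16 ? "VecX" : "VecD");             // vector_ideal_reg (len in bytes)
      std::printf("width=%dB  max_vlen=%d  min_vlen=%d  ideal_reg=%s\n", width, max_vlen, min_vlen, ideal);
      return 0;
    }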
const uint Matcher::vector_ideal_reg(int len) { + if (UseSVE > 0 && 16 <= len && len <= 256) { + return Op_VecA; + } switch(len) { case 8: return Op_VecD; case 16: return Op_VecX; @@ -2266,6 +2572,9 @@ const uint Matcher::vector_ideal_reg(int len) { } const uint Matcher::vector_shift_count_ideal_reg(int size) { + if (UseSVE > 0 && 16 <= size && size <= 256) { + return Op_VecA; + } switch(size) { case 8: return Op_VecD; case 16: return Op_VecX; @@ -3419,6 +3728,11 @@ encode %{ if (call == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; + } else if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + // Only non uncommon_trap calls need to reinitialize ptrue. + if (uncommon_trap_request() == 0) { + __ reinitialize_ptrue(); + } } %} @@ -3429,6 +3743,8 @@ encode %{ if (call == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; + } else if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ reinitialize_ptrue(); } %} @@ -3465,6 +3781,9 @@ encode %{ __ bind(retaddr); __ add(sp, sp, 2 * wordSize); } + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ reinitialize_ptrue(); + } %} enc_class aarch64_enc_rethrow() %{ @@ -3474,6 +3793,11 @@ encode %{ enc_class aarch64_enc_ret() %{ MacroAssembler _masm(&cbuf); +#ifdef ASSERT + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ verify_ptrue(); + } +#endif __ ret(lr); %} @@ -4203,6 +4527,41 @@ operand immLoffset16() interface(CONST_INTER); %} +// 8 bit signed value. +operand immI8() +%{ + predicate(n->get_int() <= 127 && n->get_int() >= -128); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// 8 bit signed value (simm8), or #simm8 LSL 8. +operand immI8_shift8() +%{ + predicate((n->get_int() <= 127 && n->get_int() >= -128) || + (n->get_int() <= 32512 && n->get_int() >= -32768 && (n->get_int() & 0xff) == 0)); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// 8 bit signed value (simm8), or #simm8 LSL 8. +operand immL8_shift8() +%{ + predicate((n->get_long() <= 127 && n->get_long() >= -128) || + (n->get_long() <= 32512 && n->get_long() >= -32768 && (n->get_long() & 0xff) == 0)); + match(ConL); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + // 32 bit integer valid for add sub immediate operand immIAddSub() %{ @@ -4832,6 +5191,18 @@ operand vRegD() interface(REG_INTER); %} +// Generic vector class. This will be used for +// all vector operands, including NEON and SVE, +// but currently only used for SVE VecA. 
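For reference, the SVE rules added in aarch64_sve.ad further down in this patch consume this operand directly, for example instruct vaddI(vReg dst, vReg src1, vReg src2), while the fixed-width NEON rules continue to use the vecD/vecX operands declared below.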
+operand vReg() +%{ + constraint(ALLOC_IN_RC(vectora_reg)); + match(VecA); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + operand vecD() %{ constraint(ALLOC_IN_RC(vectord_reg)); @@ -5140,6 +5511,15 @@ operand vRegD_V31() interface(REG_INTER); %} +operand pRegGov() +%{ + constraint(ALLOC_IN_RC(gov_pr)); + match(RegVMask); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + // Flags register, used as output of signed compare instructions // note that on AArch64 we also use this register as the output for @@ -15477,7 +15857,7 @@ instruct loadV8(vecD dst, vmem8 mem) // Load Vector (128 bits) instruct loadV16(vecX dst, vmem16 mem) %{ - predicate(n->as_LoadVector()->memory_size() == 16); + predicate(UseSVE == 0 && n->as_LoadVector()->memory_size() == 16); match(Set dst (LoadVector mem)); ins_cost(4 * INSN_COST); format %{ "ldrq $dst,$mem\t# vector (128 bits)" %} @@ -15533,7 +15913,7 @@ instruct replicate8B(vecD dst, iRegIorL2I src) instruct replicate16B(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 16); + predicate(UseSVE == 0 && n->as_Vector()->length() == 16); match(Set dst (ReplicateB src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (16B)" %} @@ -15558,7 +15938,7 @@ instruct replicate8B_imm(vecD dst, immI con) instruct replicate16B_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 16); + predicate(UseSVE == 0 && n->as_Vector()->length() == 16); match(Set dst (ReplicateB con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(16B)" %} @@ -15583,7 +15963,7 @@ instruct replicate4S(vecD dst, iRegIorL2I src) instruct replicate8S(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 8); + predicate(UseSVE == 0 && n->as_Vector()->length() == 8); match(Set dst (ReplicateS src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (8S)" %} @@ -15608,7 +15988,7 @@ instruct replicate4S_imm(vecD dst, immI con) instruct replicate8S_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 8); + predicate(UseSVE == 0 && n->as_Vector()->length() == 8); match(Set dst (ReplicateS con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(8H)" %} @@ -15632,7 +16012,7 @@ instruct replicate2I(vecD dst, iRegIorL2I src) instruct replicate4I(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateI src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (4I)" %} @@ -15656,7 +16036,7 @@ instruct replicate2I_imm(vecD dst, immI con) instruct replicate4I_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateI con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(4I)" %} @@ -15668,7 +16048,7 @@ instruct replicate4I_imm(vecX dst, immI con) instruct replicate2L(vecX dst, iRegL src) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateL src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (2L)" %} @@ -15680,7 +16060,7 @@ instruct replicate2L(vecX dst, iRegL src) instruct replicate2L_zero(vecX dst, immI0 zero) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateI zero)); ins_cost(INSN_COST); format %{ "movi $dst, $zero\t# vector(4I)" %} @@ -15707,7 +16087,7 @@ instruct replicate2F(vecD dst, vRegF src) instruct replicate4F(vecX 
dst, vRegF src) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateF src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (4F)" %} @@ -15720,7 +16100,7 @@ instruct replicate4F(vecX dst, vRegF src) instruct replicate2D(vecX dst, vRegD src) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateD src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (2D)" %} @@ -17496,6 +17876,43 @@ instruct vsrl2L_imm(vecX dst, vecX src, immI shift) %{ ins_pipe(vshift128_imm); %} +instruct vpopcount4I(vecX dst, vecX src) %{ + predicate(UsePopCountInstruction && n->as_Vector()->length() == 4); + match(Set dst (PopCountVI src)); + format %{ + "cnt $dst, $src\t# vector (16B)\n\t" + "uaddlp $dst, $dst\t# vector (16B)\n\t" + "uaddlp $dst, $dst\t# vector (8H)" + %} + ins_encode %{ + __ cnt(as_FloatRegister($dst$$reg), __ T16B, + as_FloatRegister($src$$reg)); + __ uaddlp(as_FloatRegister($dst$$reg), __ T16B, + as_FloatRegister($dst$$reg)); + __ uaddlp(as_FloatRegister($dst$$reg), __ T8H, + as_FloatRegister($dst$$reg)); + %} + ins_pipe(pipe_class_default); +%} + +instruct vpopcount2I(vecD dst, vecD src) %{ + predicate(UsePopCountInstruction && n->as_Vector()->length() == 2); + match(Set dst (PopCountVI src)); + format %{ + "cnt $dst, $src\t# vector (8B)\n\t" + "uaddlp $dst, $dst\t# vector (8B)\n\t" + "uaddlp $dst, $dst\t# vector (4H)" + %} + ins_encode %{ + __ cnt(as_FloatRegister($dst$$reg), __ T8B, + as_FloatRegister($src$$reg)); + __ uaddlp(as_FloatRegister($dst$$reg), __ T8B, + as_FloatRegister($dst$$reg)); + __ uaddlp(as_FloatRegister($dst$$reg), __ T4H, + as_FloatRegister($dst$$reg)); + %} + ins_pipe(pipe_class_default); +%} //----------PEEPHOLE RULES----------------------------------------------------- // These must follow all instruction definitions as they use the names diff --git a/src/hotspot/cpu/aarch64/aarch64_sve.ad b/src/hotspot/cpu/aarch64/aarch64_sve.ad new file mode 100644 index 000000000..8d80cb37a --- /dev/null +++ b/src/hotspot/cpu/aarch64/aarch64_sve.ad @@ -0,0 +1,1366 @@ +// +// Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, Arm Limited. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// +// + +// This file is automatically generated by running "m4 aarch64_sve_ad.m4". 
Do not edit ---- + +// AArch64 SVE Architecture Description File + + +// 4 bit signed offset -- for predicated load/store + +operand vmemA_immIOffset4() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_int(), 4, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand vmemA_immLOffset4() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_long(), 4, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(ConL); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + + +operand vmemA_indOffI4(iRegP reg, vmemA_immIOffset4 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xffffffff); + scale(0x0); + disp($off); + %} +%} + +operand vmemA_indOffL4(iRegP reg, vmemA_immLOffset4 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xffffffff); + scale(0x0); + disp($off); + %} +%} + +opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4); + +source_hpp %{ + bool op_sve_supported(int opcode); +%} + +source %{ + + static inline BasicType vector_element_basic_type(const MachNode* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) { + int def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + const TypeVect* vt = def->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + typedef void (MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T, + PRegister Pg, const Address &adr); + + // Predicated load/store, with optional ptrue to all elements of given predicate register. + static void loadStoreA_predicate(MacroAssembler masm, bool is_store, + FloatRegister reg, PRegister pg, BasicType bt, + int opcode, Register base, int index, int size, int disp) { + sve_mem_insn_predicate insn = NULL; + Assembler::SIMD_RegVariant type = Assembler::B; + int esize = type2aelembytes(bt); + if (index == -1) { + assert(size == 0, "unsupported address mode: scale size = %d", size); + switch(esize) { + case 1: + insn = is_store ? &MacroAssembler::sve_st1b : &MacroAssembler::sve_ld1b; + type = Assembler::B; + break; + case 2: + insn = is_store ? &MacroAssembler::sve_st1h : &MacroAssembler::sve_ld1h; + type = Assembler::H; + break; + case 4: + insn = is_store ? &MacroAssembler::sve_st1w : &MacroAssembler::sve_ld1w; + type = Assembler::S; + break; + case 8: + insn = is_store ? 
&MacroAssembler::sve_st1d : &MacroAssembler::sve_ld1d; + type = Assembler::D; + break; + default: + assert(false, "unsupported"); + ShouldNotReachHere(); + } + (masm.*insn)(reg, type, pg, Address(base, disp / Matcher::scalable_vector_reg_size(T_BYTE))); + } else { + assert(false, "unimplemented"); + ShouldNotReachHere(); + } + } + + bool op_sve_supported(int opcode) { + switch (opcode) { + // No multiply reduction instructions + case Op_MulReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVI: + case Op_MulReductionVL: + // Others + case Op_Extract: + case Op_ExtractB: + case Op_ExtractC: + case Op_ExtractD: + case Op_ExtractF: + case Op_ExtractI: + case Op_ExtractL: + case Op_ExtractS: + case Op_ExtractUB: + return false; + default: + return true; + } + } + +%} + +definitions %{ + int_def SVE_COST (200, 200); +%} + + + + +// All SVE instructions + +// vector load/store + +// Use predicated vector load/store +instruct loadV(vReg dst, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_LoadVector()->memory_size() >= 16); + match(Set dst (LoadVector mem)); + ins_cost(SVE_COST); + format %{ "sve_ldr $dst, $mem\t # vector (sve)" %} + ins_encode %{ + FloatRegister dst_reg = as_FloatRegister($dst$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), false, dst_reg, ptrue, + vector_element_basic_type(this), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +instruct storeV(vReg src, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_StoreVector()->memory_size() >= 16); + match(Set mem (StoreVector mem src)); + ins_cost(SVE_COST); + format %{ "sve_str $mem, $src\t # vector (sve)" %} + ins_encode %{ + FloatRegister src_reg = as_FloatRegister($src$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), true, src_reg, ptrue, + vector_element_basic_type(this, $src), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +// vector add + +instruct vaddB(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (AddVB src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddS(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (AddVS src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddI(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (AddVI src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddL(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (AddVL src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); 
+%} + +instruct vaddF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (AddVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fadd $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fadd(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (AddVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fadd $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fadd(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector and + +instruct vand(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (AndV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_and $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_and(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector or + +instruct vor(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (OrV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_orr $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_orr(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector xor + +instruct vxor(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (XorV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_eor $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_eor(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector float div + +instruct vdivF(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (DivVF dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vdivD(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (DivVD dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fmla + +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD dst_src1 (Binary src2 src3))); + 
ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fmls + +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF dst_src1 (Binary (NegVF src2) src3))); + match(Set dst_src1 (FmaVF dst_src1 (Binary src2 (NegVF src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD dst_src1 (Binary (NegVD src2) src3))); + match(Set dst_src1 (FmaVD dst_src1 (Binary src2 (NegVD src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fnmla + +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary (NegVF src2) src3))); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 (NegVF src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary (NegVD src2) src3))); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 (NegVD src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fnmls + +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ 
"sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector mla + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaS(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (AddVS dst_src1 (MulVS src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaI(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (AddVI dst_src1 (MulVI src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaL(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (AddVL dst_src1 (MulVL src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector mls + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsS(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (SubVS dst_src1 (MulVS src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsI(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (SubVI dst_src1 (MulVI src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsL(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (SubVL dst_src1 (MulVL src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + + +// vector mul + +instruct vmulS(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (MulVS dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulI(vReg dst_src1, vReg 
src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (MulVI dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulL(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (MulVL dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (MulVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmul $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmul(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (MulVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmul $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmul(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fneg + +instruct vnegF(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (NegVF src)); + ins_cost(SVE_COST); + format %{ "sve_fneg $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fneg(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vnegD(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (NegVD src)); + ins_cost(SVE_COST); + format %{ "sve_fneg $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fneg(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// popcount vector + +instruct vpopcountI(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (PopCountVI src)); + format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %} + ins_encode %{ + __ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector add reduction + +instruct reduce_addI(iRegINoSp dst, iRegIorL2I src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + (n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT)); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) (S)\n\t" + "umov $dst, $tmp, S, 0\n\t" + "addw $dst, $dst, $src1\t # add reduction S" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ S, 0); + __ addw($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addL(iRegLNoSp dst, iRegL src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + 
(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG)); + match(Set dst (AddReductionVL src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) (D)\n\t" + "umov $dst, $tmp, D, 0\n\t" + "add $dst, $dst, $src1\t # add reduction D" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ D, 0); + __ add($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addF(vRegF src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst (AddReductionVF src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addD(vRegD src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst (AddReductionVD src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector replicate + +instruct replicateB(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (ReplicateB src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateS(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (ReplicateS src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateI(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateI src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateL(vReg dst, iRegL src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateL src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + + +instruct replicateB_imm8(vReg dst, immI8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (ReplicateB con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateS_imm8(vReg dst, immI8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (ReplicateS con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateI_imm8(vReg 
dst, immI8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateI con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateL_imm8(vReg dst, immL8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateL con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + + +instruct replicateF(vReg dst, vRegF src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateF src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateD(vReg dst, vRegD src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateD src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector shift + +instruct vasrB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (RShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (RShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (RShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (RShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (LShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (LShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + 
ins_pipe(pipe_slow); +%} + +instruct vlslI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (LShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (LShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (URShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (URShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (URShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (URShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (RShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) con = 7; + __ sve_asr(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (RShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 16) con = 15; + __ sve_asr(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (RShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, 
$src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_asr(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (RShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_asr(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (URShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (URShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (URShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (URShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (LShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# 
vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsl(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (LShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsl(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (LShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + __ sve_lsl(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (LShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + __ sve_lsl(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntB(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16 && + (n->bottom_type()->is_vect()->element_basic_type() == T_BYTE)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntS(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8 && + (n->bottom_type()->is_vect()->element_basic_type() == T_SHORT || + (n->bottom_type()->is_vect()->element_basic_type() == T_CHAR))); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntI(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4 && + (n->bottom_type()->is_vect()->element_basic_type() == T_INT)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntL(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + (n->bottom_type()->is_vect()->element_basic_type() == T_LONG)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector sqrt + +instruct vsqrtF(vReg dst, vReg src) %{ 
+ predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (SqrtVF src)); + ins_cost(SVE_COST); + format %{ "sve_fsqrt $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fsqrt(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsqrtD(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (SqrtVD src)); + ins_cost(SVE_COST); + format %{ "sve_fsqrt $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fsqrt(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector sub + +instruct vsubB(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (SubVB src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubS(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (SubVS src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubI(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (SubVI src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubL(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (SubVL src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (SubVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fsub $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fsub(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (SubVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fsub $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fsub(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} diff --git a/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 new file mode 100644 index 000000000..0323f2f8c --- /dev/null +++ b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 @@ -0,0 +1,727 @@ +// +// Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, Arm Limited. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// +// + +dnl Generate the warning +// This file is automatically generated by running "m4 aarch64_sve_ad.m4". Do not edit ---- +dnl + +// AArch64 SVE Architecture Description File + +dnl +dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET($1, $2, $3 ) +dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET(imm_type_abbr, imm_type, imm_len) +define(`OPERAND_VMEMORYA_IMMEDIATE_OFFSET', ` +operand vmemA_imm$1Offset$3() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_$2(), $3, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(Con$1); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%}') +dnl +// 4 bit signed offset -- for predicated load/store +OPERAND_VMEMORYA_IMMEDIATE_OFFSET(I, int, 4) +OPERAND_VMEMORYA_IMMEDIATE_OFFSET(L, long, 4) +dnl +dnl OPERAND_VMEMORYA_INDIRECT_OFFSET($1, $2 ) +dnl OPERAND_VMEMORYA_INDIRECT_OFFSET(imm_type_abbr, imm_len) +define(`OPERAND_VMEMORYA_INDIRECT_OFFSET', ` +operand vmemA_indOff$1$2(iRegP reg, vmemA_imm$1Offset$2 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + `index'(0xffffffff); + scale(0x0); + disp($off); + %} +%}') +dnl +OPERAND_VMEMORYA_INDIRECT_OFFSET(I, 4) +OPERAND_VMEMORYA_INDIRECT_OFFSET(L, 4) + +opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4); + +source_hpp %{ + bool op_sve_supported(int opcode); +%} + +source %{ + + static inline BasicType vector_element_basic_type(const MachNode* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) { + int def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + const TypeVect* vt = def->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + typedef void (MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T, + PRegister Pg, const Address &adr); + + // Predicated load/store, with optional ptrue to all elements of given predicate register. + static void loadStoreA_predicate(MacroAssembler masm, bool is_store, + FloatRegister reg, PRegister pg, BasicType bt, + int opcode, Register base, int index, int size, int disp) { + sve_mem_insn_predicate insn; + Assembler::SIMD_RegVariant type; + int esize = type2aelembytes(bt); + if (index == -1) { + assert(size == 0, "unsupported address mode: scale size = %d", size); + switch(esize) { + case 1: + insn = is_store ? &MacroAssembler::sve_st1b : &MacroAssembler::sve_ld1b; + type = Assembler::B; + break; + case 2: + insn = is_store ? 
&MacroAssembler::sve_st1h : &MacroAssembler::sve_ld1h; + type = Assembler::H; + break; + case 4: + insn = is_store ? &MacroAssembler::sve_st1w : &MacroAssembler::sve_ld1w; + type = Assembler::S; + break; + case 8: + insn = is_store ? &MacroAssembler::sve_st1d : &MacroAssembler::sve_ld1d; + type = Assembler::D; + break; + default: + assert(false, "unsupported"); + ShouldNotReachHere(); + } + (masm.*insn)(reg, type, pg, Address(base, disp / Matcher::scalable_vector_reg_size(T_BYTE))); + } else { + assert(false, "unimplemented"); + ShouldNotReachHere(); + } + } + + bool op_sve_supported(int opcode) { + switch (opcode) { + // No multiply reduction instructions + case Op_MulReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVI: + case Op_MulReductionVL: + // Others + case Op_Extract: + case Op_ExtractB: + case Op_ExtractC: + case Op_ExtractD: + case Op_ExtractF: + case Op_ExtractI: + case Op_ExtractL: + case Op_ExtractS: + case Op_ExtractUB: + return false; + default: + return true; + } + } + +%} + +definitions %{ + int_def SVE_COST (200, 200); +%} + + +dnl +dnl ELEMENT_SHORT_CHART($1, $2) +dnl ELEMENT_SHORT_CHART(etype, node) +define(`ELEMENT_SHORT_CHAR',`ifelse(`$1', `T_SHORT', + `($2->bottom_type()->is_vect()->element_basic_type() == T_SHORT || + ($2->bottom_type()->is_vect()->element_basic_type() == T_CHAR))', + `($2->bottom_type()->is_vect()->element_basic_type() == $1)')') +dnl + +// All SVE instructions + +// vector load/store + +// Use predicated vector load/store +instruct loadV(vReg dst, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_LoadVector()->memory_size() >= 16); + match(Set dst (LoadVector mem)); + ins_cost(SVE_COST); + format %{ "sve_ldr $dst, $mem\t # vector (sve)" %} + ins_encode %{ + FloatRegister dst_reg = as_FloatRegister($dst$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), false, dst_reg, ptrue, + vector_element_basic_type(this), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +instruct storeV(vReg src, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_StoreVector()->memory_size() >= 16); + match(Set mem (StoreVector mem src)); + ins_cost(SVE_COST); + format %{ "sve_str $mem, $src\t # vector (sve)" %} + ins_encode %{ + FloatRegister src_reg = as_FloatRegister($src$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), true, src_reg, ptrue, + vector_element_basic_type(this, $src), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +dnl +dnl UNARY_OP_TRUE_PREDICATE_ETYPE($1, $2, $3, $4, $5, %6 ) +dnl UNARY_OP_TRUE_PREDICATE_ETYPE(insn_name, op_name, element_type, size, min_vec_len, insn) +define(`UNARY_OP_TRUE_PREDICATE_ETYPE', ` +instruct $1(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5 && + n->bottom_type()->is_vect()->element_basic_type() == $3); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "$6 $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ $6(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +dnl +dnl BINARY_OP_UNPREDICATED($1, $2 $3, $4 $5 ) +dnl BINARY_OP_UNPREDICATED(insn_name, op_name, size, min_vec_len, insn) +define(`BINARY_OP_UNPREDICATED', ` +instruct $1(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 src1 src2)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src1, $src2\t # vector (sve) ($3)" %} + ins_encode %{ + __ 
$5(as_FloatRegister($dst$$reg), __ $3, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector add +BINARY_OP_UNPREDICATED(vaddB, AddVB, B, 16, sve_add) +BINARY_OP_UNPREDICATED(vaddS, AddVS, H, 8, sve_add) +BINARY_OP_UNPREDICATED(vaddI, AddVI, S, 4, sve_add) +BINARY_OP_UNPREDICATED(vaddL, AddVL, D, 2, sve_add) +BINARY_OP_UNPREDICATED(vaddF, AddVF, S, 4, sve_fadd) +BINARY_OP_UNPREDICATED(vaddD, AddVD, D, 2, sve_fadd) +dnl +dnl BINARY_OP_UNSIZED($1, $2, $3, $4 ) +dnl BINARY_OP_UNSIZED(insn_name, op_name, min_vec_len, insn) +define(`BINARY_OP_UNSIZED', ` +instruct $1(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= $3); + match(Set dst ($2 src1 src2)); + ins_cost(SVE_COST); + format %{ "$4 $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ $4(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector and +BINARY_OP_UNSIZED(vand, AndV, 16, sve_and) + +// vector or +BINARY_OP_UNSIZED(vor, OrV, 16, sve_orr) + +// vector xor +BINARY_OP_UNSIZED(vxor, XorV, 16, sve_eor) +dnl +dnl VDIVF($1, $2 , $3 ) +dnl VDIVF(name_suffix, size, min_vec_len) +define(`VDIVF', ` +instruct vdiv$1(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (DivV$1 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) ($2)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector float div +VDIVF(F, S, 4) +VDIVF(D, D, 2) + +dnl +dnl BINARY_OP_TRUE_PREDICATE_ETYPE($1, $2, $3, $4, $5, $6 ) +dnl BINARY_OP_TRUE_PREDICATE_ETYPE(insn_name, op_name, element_type, size, min_vec_len, insn) +define(`BINARY_OP_TRUE_PREDICATE_ETYPE', ` +instruct $1(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5 && + n->bottom_type()->is_vect()->element_basic_type() == $3); + match(Set dst_src1 ($2 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "$6 $dst_src1, $dst_src1, $src2\t # vector (sve) ($4)" %} + ins_encode %{ + __ $6(as_FloatRegister($dst_src1$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl + +dnl +dnl VFMLA($1 $2 $3 ) +dnl VFMLA(name_suffix, size, min_vec_len) +define(`VFMLA', ` +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmla$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fmla +VFMLA(F, S, 4) +VFMLA(D, D, 2) + +dnl +dnl VFMLS($1 $2 $3 ) +dnl VFMLS(name_suffix, size, min_vec_len) +define(`VFMLS', ` +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmls$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary (NegV$1 src2) src3))); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 (NegV$1 src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode 
%{
+    __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ $2,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector fmls
+VFMLS(F, S, 4)
+VFMLS(D, D, 2)
+
+dnl
+dnl VFNMLA($1 $2 $3 )
+dnl VFNMLA(name_suffix, size, min_vec_len)
+define(`VFNMLA', `
+// dst_src1 = -dst_src1 + -src2 * src3
+// dst_src1 = -dst_src1 + src2 * -src3
+instruct vfnmla$1(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3);
+  match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary (NegV$1 src2) src3)));
+  match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 (NegV$1 src3))));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ $2,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector fnmla
+VFNMLA(F, S, 4)
+VFNMLA(D, D, 2)
+
+dnl
+dnl VFNMLS($1 $2 $3 )
+dnl VFNMLS(name_suffix, size, min_vec_len)
+define(`VFNMLS', `
+// dst_src1 = -dst_src1 + src2 * src3
+instruct vfnmls$1(vReg dst_src1, vReg src2, vReg src3) %{
+  predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3);
+  match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ $2,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector fnmls
+VFNMLS(F, S, 4)
+VFNMLS(D, D, 2)
+
+dnl
+dnl VMLA($1 $2 $3 )
+dnl VMLA(name_suffix, size, min_vec_len)
+define(`VMLA', `
+// dst_src1 = dst_src1 + src2 * src3
+instruct vmla$1(vReg dst_src1, vReg src2, vReg src3)
+%{
+  predicate(UseSVE > 0 && n->as_Vector()->length() >= $3);
+  match(Set dst_src1 (AddV$1 dst_src1 (MulV$1 src2 src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_mla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_mla(as_FloatRegister($dst_src1$$reg), __ $2,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector mla
+VMLA(B, B, 16)
+VMLA(S, H, 8)
+VMLA(I, S, 4)
+VMLA(L, D, 2)
+
+dnl
+dnl VMLS($1 $2 $3 )
+dnl VMLS(name_suffix, size, min_vec_len)
+define(`VMLS', `
+// dst_src1 = dst_src1 - src2 * src3
+instruct vmls$1(vReg dst_src1, vReg src2, vReg src3)
+%{
+  predicate(UseSVE > 0 && n->as_Vector()->length() >= $3);
+  match(Set dst_src1 (SubV$1 dst_src1 (MulV$1 src2 src3)));
+  ins_cost(SVE_COST);
+  format %{ "sve_mls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %}
+  ins_encode %{
+    __ sve_mls(as_FloatRegister($dst_src1$$reg), __ $2,
+         ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+// vector mls
+VMLS(B, B, 16)
+VMLS(S, H, 8)
+VMLS(I, S, 4)
+VMLS(L, D, 2)
+
+dnl
+dnl BINARY_OP_TRUE_PREDICATE($1, $2, $3, $4, $5 )
+dnl BINARY_OP_TRUE_PREDICATE(insn_name, op_name, size, min_vec_len, insn)
+define(`BINARY_OP_TRUE_PREDICATE', `
+instruct $1(vReg dst_src1, vReg src2) %{
+  predicate(UseSVE > 0 && n->as_Vector()->length() >= $4);
+  match(Set dst_src1 ($2 dst_src1 src2));
+  ins_cost(SVE_COST);
+  format %{ "$5 $dst_src1, $dst_src1, $src2\t # vector (sve) ($3)" %}
+  ins_encode %{
+    __ $5(as_FloatRegister($dst_src1$$reg), __ $3,
+         ptrue, as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+
+// vector mul
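+// Why the split below: the integer multiplies use the destructive, predicated
+// BINARY_OP_TRUE_PREDICATE form (dst_src1 is both an input and the result,
+// governed by ptrue) because plain SVE appears to provide integer vector MUL
+// only as a predicated instruction, whereas FMUL also has an unpredicated
+// three-operand form, so the float/double multiplies reuse
+// BINARY_OP_UNPREDICATED instead. As a sketch of the expansion, the invocation
+//   BINARY_OP_TRUE_PREDICATE(vmulI, MulVI, S, 4, sve_mul)
+// produces the vmulI instruct seen in aarch64_sve.ad above: a match on
+// (MulVI dst_src1 src2) whose encoding is
+//   __ sve_mul(as_FloatRegister($dst_src1$$reg), __ S,
+//              ptrue, as_FloatRegister($src2$$reg));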
+BINARY_OP_TRUE_PREDICATE(vmulS, MulVS, H, 8, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulI, MulVI, S, 4, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulL, MulVL, D, 2, sve_mul) +BINARY_OP_UNPREDICATED(vmulF, MulVF, S, 4, sve_fmul) +BINARY_OP_UNPREDICATED(vmulD, MulVD, D, 2, sve_fmul) + +dnl +dnl UNARY_OP_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl UNARY_OP_TRUE_PREDICATE(insn_name, op_name, size, min_vec_bytes, insn) +define(`UNARY_OP_TRUE_PREDICATE', ` +instruct $1(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= $4); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src\t# vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fneg +UNARY_OP_TRUE_PREDICATE(vnegF, NegVF, S, 16, sve_fneg) +UNARY_OP_TRUE_PREDICATE(vnegD, NegVD, D, 16, sve_fneg) + +// popcount vector + +instruct vpopcountI(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (PopCountVI src)); + format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %} + ins_encode %{ + __ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +dnl +dnl REDUCE_ADD($1, $2, $3, $4, $5, $6, $7 ) +dnl REDUCE_ADD(insn_name, op_name, reg_dst, reg_src, size, elem_type, insn1) +define(`REDUCE_ADD', ` +instruct $1($3 dst, $4 src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + ELEMENT_SHORT_CHAR($6, n->in(2))); + match(Set dst ($2 src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) ($5)\n\t" + "umov $dst, $tmp, $5, 0\n\t" + "$7 $dst, $dst, $src1\t # add reduction $5" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ $5, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ $5, 0); + __ $7($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl REDUCE_ADDF($1, $2, $3, $4 ) +dnl REDUCE_ADDF(insn_name, op_name, reg_dst, size) +define(`REDUCE_ADDF', ` +instruct $1($3 src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst ($2 src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector add reduction +REDUCE_ADD(reduce_addI, AddReductionVI, iRegINoSp, iRegIorL2I, S, T_INT, addw) +REDUCE_ADD(reduce_addL, AddReductionVL, iRegLNoSp, iRegL, D, T_LONG, add) +REDUCE_ADDF(reduce_addF, AddReductionVF, vRegF, S) +REDUCE_ADDF(reduce_addD, AddReductionVD, vRegD, D) + +dnl +dnl REDUCE_FMINMAX($1, $2, $3, $4, $5 ) +dnl REDUCE_FMINMAX(min_max, name_suffix, element_type, size, reg_src_dst) +define(`REDUCE_FMINMAX', ` +instruct reduce_$1$2($5 dst, $5 src1, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == $3 && + n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set dst (translit($1, `m', `M')ReductionV src1 src2)); + ins_cost(INSN_COST); + effect(TEMP_DEF dst); + format %{ "sve_f$1v $dst, $src2 # vector (sve) (S)\n\t" + "f$1s $dst, $dst, $src1\t # $1 reduction $2" %} + ins_encode %{ + __ 
sve_f$1v(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + __ f`$1'translit($4, `SD', `sd')(as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg), as_FloatRegister($src1$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +// vector max reduction +REDUCE_FMINMAX(max, F, T_FLOAT, S, vRegF) +REDUCE_FMINMAX(max, D, T_DOUBLE, D, vRegD) + +// vector min reduction +REDUCE_FMINMAX(min, F, T_FLOAT, S, vRegF) +REDUCE_FMINMAX(min, D, T_DOUBLE, D, vRegD) + +dnl +dnl REPLICATE($1, $2, $3, $4, $5 ) +dnl REPLICATE(insn_name, op_name, reg_src, size, min_vec_len) +define(`REPLICATE', ` +instruct $1(vReg dst, $3 src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $4, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl REPLICATE_IMM8($1, $2, $3, $4, $5 ) +dnl REPLICATE_IMM8(insn_name, op_name, imm_type, size, min_vec_len) +define(`REPLICATE_IMM8', ` +instruct $1(vReg dst, $3 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $4, $con$$constant); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl FREPLICATE($1, $2, $3, $4, $5 ) +dnl FREPLICATE(insn_name, op_name, reg_src, size, min_vec_len) +define(`FREPLICATE', ` +instruct $1(vReg dst, $3 src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector replicate +REPLICATE(replicateB, ReplicateB, iRegIorL2I, B, 16) +REPLICATE(replicateS, ReplicateS, iRegIorL2I, H, 8) +REPLICATE(replicateI, ReplicateI, iRegIorL2I, S, 4) +REPLICATE(replicateL, ReplicateL, iRegL, D, 2) + +REPLICATE_IMM8(replicateB_imm8, ReplicateB, immI8, B, 16) +REPLICATE_IMM8(replicateS_imm8, ReplicateS, immI8_shift8, H, 8) +REPLICATE_IMM8(replicateI_imm8, ReplicateI, immI8_shift8, S, 4) +REPLICATE_IMM8(replicateL_imm8, ReplicateL, immL8_shift8, D, 2) + +FREPLICATE(replicateF, ReplicateF, vRegF, S, 4) +FREPLICATE(replicateD, ReplicateD, vRegD, D, 2) +dnl +dnl VSHIFT_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl VSHIFT_TRUE_PREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`VSHIFT_TRUE_PREDICATE', ` +instruct $1(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 dst shift)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $dst, $shift\t# vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VSHIFT_IMM_UNPREDICATE($1, $2, $3, $4, $5 ) +dnl VSHIFT_IMM_UNPREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`VSHIFT_IMM_UNPREDICATE', ` +instruct $1(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 src shift)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src, $shift\t# vector (sve) ($3)" %} + ins_encode %{ + int con = (int)$shift$$constant;dnl +ifelse(eval(index(`$1', `vasr') == 0 || index(`$1', `vlsr') == 0), 1, ` + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), 
as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + }')dnl +ifelse(eval(index(`$1', `vasr') == 0), 1, `ifelse(eval(index(`$3', `B') == 0), 1, ` + if (con >= 8) con = 7;')ifelse(eval(index(`$3', `H') == 0), 1, ` + if (con >= 16) con = 15;')')dnl +ifelse(eval((index(`$1', `vlsl') == 0 || index(`$1', `vlsr') == 0) && (index(`$3', `B') == 0 || index(`$3', `H') == 0)), 1, ` + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + }') + __ $5(as_FloatRegister($dst$$reg), __ $3, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VSHIFT_COUNT($1, $2, $3, $4 ) +dnl VSHIFT_COUNT(insn_name, size, min_vec_len, type) +define(`VSHIFT_COUNT', ` +instruct $1(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3 && + ELEMENT_SHORT_CHAR($4, n)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) ($2)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $2, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector shift +VSHIFT_TRUE_PREDICATE(vasrB, RShiftVB, B, 16, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrS, RShiftVS, H, 8, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrI, RShiftVI, S, 4, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrL, RShiftVL, D, 2, sve_asr) +VSHIFT_TRUE_PREDICATE(vlslB, LShiftVB, B, 16, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslS, LShiftVS, H, 8, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslI, LShiftVI, S, 4, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslL, LShiftVL, D, 2, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlsrB, URShiftVB, B, 16, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrS, URShiftVS, H, 8, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrI, URShiftVI, S, 4, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrL, URShiftVL, D, 2, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vasrB_imm, RShiftVB, B, 16, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrS_imm, RShiftVS, H, 8, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrI_imm, RShiftVI, S, 4, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrL_imm, RShiftVL, D, 2, sve_asr) +VSHIFT_IMM_UNPREDICATE(vlsrB_imm, URShiftVB, B, 16, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrS_imm, URShiftVS, H, 8, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrI_imm, URShiftVI, S, 4, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrL_imm, URShiftVL, D, 2, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlslB_imm, LShiftVB, B, 16, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslS_imm, LShiftVS, H, 8, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslI_imm, LShiftVI, S, 4, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslL_imm, LShiftVL, D, 2, sve_lsl) +VSHIFT_COUNT(vshiftcntB, B, 16, T_BYTE) +VSHIFT_COUNT(vshiftcntS, H, 8, T_SHORT) +VSHIFT_COUNT(vshiftcntI, S, 4, T_INT) +VSHIFT_COUNT(vshiftcntL, D, 2, T_LONG) + +// vector sqrt +UNARY_OP_TRUE_PREDICATE(vsqrtF, SqrtVF, S, 16, sve_fsqrt) +UNARY_OP_TRUE_PREDICATE(vsqrtD, SqrtVD, D, 16, sve_fsqrt) + +// vector sub +BINARY_OP_UNPREDICATED(vsubB, SubVB, B, 16, sve_sub) +BINARY_OP_UNPREDICATED(vsubS, SubVS, H, 8, sve_sub) +BINARY_OP_UNPREDICATED(vsubI, SubVI, S, 4, sve_sub) +BINARY_OP_UNPREDICATED(vsubL, SubVL, D, 2, sve_sub) +BINARY_OP_UNPREDICATED(vsubF, SubVF, S, 4, sve_fsub) +BINARY_OP_UNPREDICATED(vsubD, SubVD, D, 2, sve_fsub) diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp index 586743eb9..441ea4066 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp @@ -98,531 +98,617 @@ void entry(CodeBuffer *cb) { __ bind(back); // ArithOp - __ add(r19, r22, 
r7, Assembler::LSL, 28); // add x19, x22, x7, LSL #28 - __ sub(r16, r11, r10, Assembler::LSR, 13); // sub x16, x11, x10, LSR #13 - __ adds(r27, r13, r28, Assembler::ASR, 2); // adds x27, x13, x28, ASR #2 - __ subs(r20, r28, r26, Assembler::ASR, 41); // subs x20, x28, x26, ASR #41 - __ addw(r8, r19, r19, Assembler::ASR, 19); // add w8, w19, w19, ASR #19 - __ subw(r4, r9, r10, Assembler::LSL, 14); // sub w4, w9, w10, LSL #14 - __ addsw(r8, r11, r30, Assembler::LSL, 13); // adds w8, w11, w30, LSL #13 - __ subsw(r0, r25, r19, Assembler::LSL, 9); // subs w0, w25, w19, LSL #9 - __ andr(r20, r0, r21, Assembler::LSL, 19); // and x20, x0, x21, LSL #19 - __ orr(r21, r14, r20, Assembler::LSL, 17); // orr x21, x14, x20, LSL #17 - __ eor(r25, r28, r1, Assembler::LSL, 51); // eor x25, x28, x1, LSL #51 - __ ands(r10, r27, r11, Assembler::ASR, 15); // ands x10, x27, x11, ASR #15 - __ andw(r25, r5, r12, Assembler::ASR, 23); // and w25, w5, w12, ASR #23 - __ orrw(r18, r14, r10, Assembler::LSR, 4); // orr w18, w14, w10, LSR #4 - __ eorw(r4, r21, r5, Assembler::ASR, 22); // eor w4, w21, w5, ASR #22 - __ andsw(r21, r0, r5, Assembler::ASR, 29); // ands w21, w0, w5, ASR #29 - __ bic(r26, r30, r6, Assembler::ASR, 37); // bic x26, x30, x6, ASR #37 - __ orn(r3, r1, r13, Assembler::LSR, 29); // orn x3, x1, x13, LSR #29 - __ eon(r0, r28, r9, Assembler::LSL, 47); // eon x0, x28, x9, LSL #47 - __ bics(r29, r5, r28, Assembler::LSL, 46); // bics x29, x5, x28, LSL #46 - __ bicw(r9, r18, r7, Assembler::LSR, 20); // bic w9, w18, w7, LSR #20 - __ ornw(r26, r13, r25, Assembler::ASR, 24); // orn w26, w13, w25, ASR #24 - __ eonw(r25, r4, r19, Assembler::LSL, 6); // eon w25, w4, w19, LSL #6 - __ bicsw(r5, r26, r4, Assembler::LSR, 24); // bics w5, w26, w4, LSR #24 + __ add(r26, r23, r13, Assembler::LSL, 32); // add x26, x23, x13, LSL #32 + __ sub(r12, r24, r9, Assembler::LSR, 37); // sub x12, x24, x9, LSR #37 + __ adds(r28, r15, r8, Assembler::ASR, 39); // adds x28, x15, x8, ASR #39 + __ subs(r7, r28, r30, Assembler::ASR, 57); // subs x7, x28, x30, ASR #57 + __ addw(r9, r22, r27, Assembler::ASR, 15); // add w9, w22, w27, ASR #15 + __ subw(r3, r13, r18, Assembler::ASR, 30); // sub w3, w13, w18, ASR #30 + __ addsw(r14, r26, r8, Assembler::ASR, 17); // adds w14, w26, w8, ASR #17 + __ subsw(r0, r22, r12, Assembler::ASR, 21); // subs w0, w22, w12, ASR #21 + __ andr(r0, r15, r26, Assembler::LSL, 20); // and x0, x15, x26, LSL #20 + __ orr(r26, r5, r17, Assembler::LSL, 61); // orr x26, x5, x17, LSL #61 + __ eor(r24, r13, r2, Assembler::LSL, 32); // eor x24, x13, x2, LSL #32 + __ ands(r28, r3, r17, Assembler::ASR, 35); // ands x28, x3, x17, ASR #35 + __ andw(r25, r16, r29, Assembler::LSR, 18); // and w25, w16, w29, LSR #18 + __ orrw(r13, r18, r11, Assembler::LSR, 9); // orr w13, w18, w11, LSR #9 + __ eorw(r5, r5, r18, Assembler::LSR, 15); // eor w5, w5, w18, LSR #15 + __ andsw(r2, r23, r27, Assembler::ASR, 26); // ands w2, w23, w27, ASR #26 + __ bic(r27, r28, r16, Assembler::LSR, 45); // bic x27, x28, x16, LSR #45 + __ orn(r8, r25, r26, Assembler::ASR, 37); // orn x8, x25, x26, ASR #37 + __ eon(r29, r17, r13, Assembler::LSR, 63); // eon x29, x17, x13, LSR #63 + __ bics(r28, r24, r2, Assembler::LSR, 31); // bics x28, x24, x2, LSR #31 + __ bicw(r19, r26, r7, Assembler::ASR, 3); // bic w19, w26, w7, ASR #3 + __ ornw(r6, r24, r10, Assembler::ASR, 3); // orn w6, w24, w10, ASR #3 + __ eonw(r4, r21, r1, Assembler::LSR, 29); // eon w4, w21, w1, LSR #29 + __ bicsw(r16, r21, r0, Assembler::LSR, 19); // bics w16, w21, w0, LSR #19 // AddSubImmOp - __ 
addw(r7, r19, 340u); // add w7, w19, #340 - __ addsw(r8, r0, 401u); // adds w8, w0, #401 - __ subw(r29, r20, 163u); // sub w29, w20, #163 - __ subsw(r8, r23, 759u); // subs w8, w23, #759 - __ add(r1, r12, 523u); // add x1, x12, #523 - __ adds(r2, r11, 426u); // adds x2, x11, #426 - __ sub(r14, r29, 716u); // sub x14, x29, #716 - __ subs(r11, r5, 582u); // subs x11, x5, #582 + __ addw(r17, r12, 379u); // add w17, w12, #379 + __ addsw(r30, r1, 22u); // adds w30, w1, #22 + __ subw(r29, r5, 126u); // sub w29, w5, #126 + __ subsw(r6, r24, 960u); // subs w6, w24, #960 + __ add(r0, r13, 104u); // add x0, x13, #104 + __ adds(r8, r6, 663u); // adds x8, x6, #663 + __ sub(r10, r5, 516u); // sub x10, x5, #516 + __ subs(r1, r3, 1012u); // subs x1, x3, #1012 // LogicalImmOp - __ andw(r23, r22, 32768ul); // and w23, w22, #0x8000 - __ orrw(r4, r10, 4042322160ul); // orr w4, w10, #0xf0f0f0f0 - __ eorw(r0, r24, 4042322160ul); // eor w0, w24, #0xf0f0f0f0 - __ andsw(r19, r29, 2139127680ul); // ands w19, w29, #0x7f807f80 - __ andr(r5, r10, 4503599627354112ul); // and x5, x10, #0xfffffffffc000 - __ orr(r12, r30, 18445618178097414144ul); // orr x12, x30, #0xfffc0000fffc0000 - __ eor(r30, r5, 262128ul); // eor x30, x5, #0x3fff0 - __ ands(r26, r23, 4194300ul); // ands x26, x23, #0x3ffffc + __ andw(r6, r11, 4294049777ull); // and w6, w11, #0xfff1fff1 + __ orrw(r28, r5, 4294966791ull); // orr w28, w5, #0xfffffe07 + __ eorw(r1, r20, 134217216ull); // eor w1, w20, #0x7fffe00 + __ andsw(r7, r18, 1048576ull); // ands w7, w18, #0x100000 + __ andr(r14, r12, 9223372036854775808ull); // and x14, x12, #0x8000000000000000 + __ orr(r9, r11, 562675075514368ull); // orr x9, x11, #0x1ffc000000000 + __ eor(r17, r0, 18014398509481728ull); // eor x17, x0, #0x3fffffffffff00 + __ ands(r1, r8, 18446744073705357315ull); // ands x1, x8, #0xffffffffffc00003 // AbsOp - __ b(__ pc()); // b . - __ b(back); // b back - __ b(forth); // b forth - __ bl(__ pc()); // bl . - __ bl(back); // bl back - __ bl(forth); // bl forth + __ b(__ pc()); // b . + __ b(back); // b back + __ b(forth); // b forth + __ bl(__ pc()); // bl . + __ bl(back); // bl back + __ bl(forth); // bl forth // RegAndAbsOp - __ cbzw(r12, __ pc()); // cbz w12, . - __ cbzw(r12, back); // cbz w12, back - __ cbzw(r12, forth); // cbz w12, forth - __ cbnzw(r20, __ pc()); // cbnz w20, . - __ cbnzw(r20, back); // cbnz w20, back - __ cbnzw(r20, forth); // cbnz w20, forth - __ cbz(r12, __ pc()); // cbz x12, . - __ cbz(r12, back); // cbz x12, back - __ cbz(r12, forth); // cbz x12, forth - __ cbnz(r24, __ pc()); // cbnz x24, . - __ cbnz(r24, back); // cbnz x24, back - __ cbnz(r24, forth); // cbnz x24, forth - __ adr(r6, __ pc()); // adr x6, . - __ adr(r6, back); // adr x6, back - __ adr(r6, forth); // adr x6, forth - __ _adrp(r21, __ pc()); // adrp x21, . + __ cbzw(r10, __ pc()); // cbz w10, . + __ cbzw(r10, back); // cbz w10, back + __ cbzw(r10, forth); // cbz w10, forth + __ cbnzw(r8, __ pc()); // cbnz w8, . + __ cbnzw(r8, back); // cbnz w8, back + __ cbnzw(r8, forth); // cbnz w8, forth + __ cbz(r11, __ pc()); // cbz x11, . + __ cbz(r11, back); // cbz x11, back + __ cbz(r11, forth); // cbz x11, forth + __ cbnz(r29, __ pc()); // cbnz x29, . + __ cbnz(r29, back); // cbnz x29, back + __ cbnz(r29, forth); // cbnz x29, forth + __ adr(r19, __ pc()); // adr x19, . + __ adr(r19, back); // adr x19, back + __ adr(r19, forth); // adr x19, forth + __ _adrp(r19, __ pc()); // adrp x19, . // RegImmAbsOp - __ tbz(r1, 1, __ pc()); // tbz x1, #1, . 
- __ tbz(r1, 1, back); // tbz x1, #1, back - __ tbz(r1, 1, forth); // tbz x1, #1, forth - __ tbnz(r8, 9, __ pc()); // tbnz x8, #9, . - __ tbnz(r8, 9, back); // tbnz x8, #9, back - __ tbnz(r8, 9, forth); // tbnz x8, #9, forth + __ tbz(r22, 6, __ pc()); // tbz x22, #6, . + __ tbz(r22, 6, back); // tbz x22, #6, back + __ tbz(r22, 6, forth); // tbz x22, #6, forth + __ tbnz(r12, 11, __ pc()); // tbnz x12, #11, . + __ tbnz(r12, 11, back); // tbnz x12, #11, back + __ tbnz(r12, 11, forth); // tbnz x12, #11, forth // MoveWideImmOp - __ movnw(r12, 23175, 0); // movn w12, #23175, lsl 0 - __ movzw(r11, 20476, 16); // movz w11, #20476, lsl 16 - __ movkw(r21, 3716, 0); // movk w21, #3716, lsl 0 - __ movn(r29, 28661, 48); // movn x29, #28661, lsl 48 - __ movz(r3, 6927, 0); // movz x3, #6927, lsl 0 - __ movk(r22, 9828, 16); // movk x22, #9828, lsl 16 + __ movnw(r0, 6301, 0); // movn w0, #6301, lsl 0 + __ movzw(r7, 20886, 0); // movz w7, #20886, lsl 0 + __ movkw(r27, 18617, 0); // movk w27, #18617, lsl 0 + __ movn(r12, 22998, 16); // movn x12, #22998, lsl 16 + __ movz(r20, 1532, 16); // movz x20, #1532, lsl 16 + __ movk(r8, 5167, 32); // movk x8, #5167, lsl 32 // BitfieldOp - __ sbfm(r12, r8, 6, 22); // sbfm x12, x8, #6, #22 - __ bfmw(r19, r25, 25, 19); // bfm w19, w25, #25, #19 - __ ubfmw(r9, r12, 29, 15); // ubfm w9, w12, #29, #15 - __ sbfm(r28, r25, 16, 16); // sbfm x28, x25, #16, #16 - __ bfm(r12, r5, 4, 25); // bfm x12, x5, #4, #25 - __ ubfm(r0, r10, 6, 8); // ubfm x0, x10, #6, #8 + __ sbfm(r15, r17, 24, 28); // sbfm x15, x17, #24, #28 + __ bfmw(r15, r9, 14, 25); // bfm w15, w9, #14, #25 + __ ubfmw(r27, r25, 6, 31); // ubfm w27, w25, #6, #31 + __ sbfm(r19, r2, 23, 31); // sbfm x19, x2, #23, #31 + __ bfm(r12, r21, 10, 6); // bfm x12, x21, #10, #6 + __ ubfm(r22, r0, 26, 16); // ubfm x22, x0, #26, #16 // ExtractOp - __ extrw(r4, r13, r26, 24); // extr w4, w13, w26, #24 - __ extr(r23, r30, r24, 31); // extr x23, x30, x24, #31 + __ extrw(r3, r3, r20, 27); // extr w3, w3, w20, #27 + __ extr(r8, r30, r3, 54); // extr x8, x30, x3, #54 // CondBranchOp - __ br(Assembler::EQ, __ pc()); // b.EQ . - __ br(Assembler::EQ, back); // b.EQ back - __ br(Assembler::EQ, forth); // b.EQ forth - __ br(Assembler::NE, __ pc()); // b.NE . - __ br(Assembler::NE, back); // b.NE back - __ br(Assembler::NE, forth); // b.NE forth - __ br(Assembler::HS, __ pc()); // b.HS . - __ br(Assembler::HS, back); // b.HS back - __ br(Assembler::HS, forth); // b.HS forth - __ br(Assembler::CS, __ pc()); // b.CS . - __ br(Assembler::CS, back); // b.CS back - __ br(Assembler::CS, forth); // b.CS forth - __ br(Assembler::LO, __ pc()); // b.LO . - __ br(Assembler::LO, back); // b.LO back - __ br(Assembler::LO, forth); // b.LO forth - __ br(Assembler::CC, __ pc()); // b.CC . - __ br(Assembler::CC, back); // b.CC back - __ br(Assembler::CC, forth); // b.CC forth - __ br(Assembler::MI, __ pc()); // b.MI . - __ br(Assembler::MI, back); // b.MI back - __ br(Assembler::MI, forth); // b.MI forth - __ br(Assembler::PL, __ pc()); // b.PL . - __ br(Assembler::PL, back); // b.PL back - __ br(Assembler::PL, forth); // b.PL forth - __ br(Assembler::VS, __ pc()); // b.VS . - __ br(Assembler::VS, back); // b.VS back - __ br(Assembler::VS, forth); // b.VS forth - __ br(Assembler::VC, __ pc()); // b.VC . - __ br(Assembler::VC, back); // b.VC back - __ br(Assembler::VC, forth); // b.VC forth - __ br(Assembler::HI, __ pc()); // b.HI . - __ br(Assembler::HI, back); // b.HI back - __ br(Assembler::HI, forth); // b.HI forth - __ br(Assembler::LS, __ pc()); // b.LS . 
- __ br(Assembler::LS, back); // b.LS back - __ br(Assembler::LS, forth); // b.LS forth - __ br(Assembler::GE, __ pc()); // b.GE . - __ br(Assembler::GE, back); // b.GE back - __ br(Assembler::GE, forth); // b.GE forth - __ br(Assembler::LT, __ pc()); // b.LT . - __ br(Assembler::LT, back); // b.LT back - __ br(Assembler::LT, forth); // b.LT forth - __ br(Assembler::GT, __ pc()); // b.GT . - __ br(Assembler::GT, back); // b.GT back - __ br(Assembler::GT, forth); // b.GT forth - __ br(Assembler::LE, __ pc()); // b.LE . - __ br(Assembler::LE, back); // b.LE back - __ br(Assembler::LE, forth); // b.LE forth - __ br(Assembler::AL, __ pc()); // b.AL . - __ br(Assembler::AL, back); // b.AL back - __ br(Assembler::AL, forth); // b.AL forth - __ br(Assembler::NV, __ pc()); // b.NV . - __ br(Assembler::NV, back); // b.NV back - __ br(Assembler::NV, forth); // b.NV forth + __ br(Assembler::EQ, __ pc()); // b.EQ . + __ br(Assembler::EQ, back); // b.EQ back + __ br(Assembler::EQ, forth); // b.EQ forth + __ br(Assembler::NE, __ pc()); // b.NE . + __ br(Assembler::NE, back); // b.NE back + __ br(Assembler::NE, forth); // b.NE forth + __ br(Assembler::HS, __ pc()); // b.HS . + __ br(Assembler::HS, back); // b.HS back + __ br(Assembler::HS, forth); // b.HS forth + __ br(Assembler::CS, __ pc()); // b.CS . + __ br(Assembler::CS, back); // b.CS back + __ br(Assembler::CS, forth); // b.CS forth + __ br(Assembler::LO, __ pc()); // b.LO . + __ br(Assembler::LO, back); // b.LO back + __ br(Assembler::LO, forth); // b.LO forth + __ br(Assembler::CC, __ pc()); // b.CC . + __ br(Assembler::CC, back); // b.CC back + __ br(Assembler::CC, forth); // b.CC forth + __ br(Assembler::MI, __ pc()); // b.MI . + __ br(Assembler::MI, back); // b.MI back + __ br(Assembler::MI, forth); // b.MI forth + __ br(Assembler::PL, __ pc()); // b.PL . + __ br(Assembler::PL, back); // b.PL back + __ br(Assembler::PL, forth); // b.PL forth + __ br(Assembler::VS, __ pc()); // b.VS . + __ br(Assembler::VS, back); // b.VS back + __ br(Assembler::VS, forth); // b.VS forth + __ br(Assembler::VC, __ pc()); // b.VC . + __ br(Assembler::VC, back); // b.VC back + __ br(Assembler::VC, forth); // b.VC forth + __ br(Assembler::HI, __ pc()); // b.HI . + __ br(Assembler::HI, back); // b.HI back + __ br(Assembler::HI, forth); // b.HI forth + __ br(Assembler::LS, __ pc()); // b.LS . + __ br(Assembler::LS, back); // b.LS back + __ br(Assembler::LS, forth); // b.LS forth + __ br(Assembler::GE, __ pc()); // b.GE . + __ br(Assembler::GE, back); // b.GE back + __ br(Assembler::GE, forth); // b.GE forth + __ br(Assembler::LT, __ pc()); // b.LT . + __ br(Assembler::LT, back); // b.LT back + __ br(Assembler::LT, forth); // b.LT forth + __ br(Assembler::GT, __ pc()); // b.GT . + __ br(Assembler::GT, back); // b.GT back + __ br(Assembler::GT, forth); // b.GT forth + __ br(Assembler::LE, __ pc()); // b.LE . + __ br(Assembler::LE, back); // b.LE back + __ br(Assembler::LE, forth); // b.LE forth + __ br(Assembler::AL, __ pc()); // b.AL . + __ br(Assembler::AL, back); // b.AL back + __ br(Assembler::AL, forth); // b.AL forth + __ br(Assembler::NV, __ pc()); // b.NV . 
+ __ br(Assembler::NV, back); // b.NV back + __ br(Assembler::NV, forth); // b.NV forth // ImmOp - __ svc(12729); // svc #12729 - __ hvc(6788); // hvc #6788 - __ smc(1535); // smc #1535 - __ brk(16766); // brk #16766 - __ hlt(9753); // hlt #9753 + __ svc(12999); // svc #12999 + __ hvc(2665); // hvc #2665 + __ smc(9002); // smc #9002 + __ brk(14843); // brk #14843 + __ hlt(25964); // hlt #25964 // Op - __ nop(); // nop - __ eret(); // eret - __ drps(); // drps - __ isb(); // isb + __ nop(); // nop + __ eret(); // eret + __ drps(); // drps + __ isb(); // isb // SystemOp - __ dsb(Assembler::SY); // dsb SY - __ dmb(Assembler::ISHST); // dmb ISHST + __ dsb(Assembler::ST); // dsb ST + __ dmb(Assembler::OSHST); // dmb OSHST // OneRegOp - __ br(r2); // br x2 - __ blr(r5); // blr x5 + __ br(r16); // br x16 + __ blr(r20); // blr x20 // LoadStoreExclusiveOp - __ stxr(r20, r21, r2); // stxr w20, x21, [x2] - __ stlxr(r5, r29, r7); // stlxr w5, x29, [x7] - __ ldxr(r5, r16); // ldxr x5, [x16] - __ ldaxr(r27, r29); // ldaxr x27, [x29] - __ stlr(r0, r29); // stlr x0, [x29] - __ ldar(r21, r28); // ldar x21, [x28] + __ stxr(r10, r27, r8); // stxr w10, x27, [x8] + __ stlxr(r0, r1, r21); // stlxr w0, x1, [x21] + __ ldxr(r17, r29); // ldxr x17, [x29] + __ ldaxr(r29, r28); // ldaxr x29, [x28] + __ stlr(r1, r23); // stlr x1, [x23] + __ ldar(r21, r20); // ldar x21, [x20] // LoadStoreExclusiveOp - __ stxrw(r21, r24, r7); // stxr w21, w24, [x7] - __ stlxrw(r21, r26, r28); // stlxr w21, w26, [x28] - __ ldxrw(r21, r6); // ldxr w21, [x6] - __ ldaxrw(r15, r30); // ldaxr w15, [x30] - __ stlrw(r19, r3); // stlr w19, [x3] - __ ldarw(r22, r2); // ldar w22, [x2] + __ stxrw(r22, r27, r19); // stxr w22, w27, [x19] + __ stlxrw(r11, r16, r6); // stlxr w11, w16, [x6] + __ ldxrw(r18, r0); // ldxr w18, [x0] + __ ldaxrw(r4, r10); // ldaxr w4, [x10] + __ stlrw(r24, r22); // stlr w24, [x22] + __ ldarw(r10, r19); // ldar w10, [x19] // LoadStoreExclusiveOp - __ stxrh(r18, r15, r0); // stxrh w18, w15, [x0] - __ stlxrh(r11, r5, r28); // stlxrh w11, w5, [x28] - __ ldxrh(r29, r6); // ldxrh w29, [x6] - __ ldaxrh(r18, r7); // ldaxrh w18, [x7] - __ stlrh(r25, r28); // stlrh w25, [x28] - __ ldarh(r2, r19); // ldarh w2, [x19] + __ stxrh(r1, r5, r30); // stxrh w1, w5, [x30] + __ stlxrh(r8, r12, r17); // stlxrh w8, w12, [x17] + __ ldxrh(r9, r14); // ldxrh w9, [x14] + __ ldaxrh(r7, r1); // ldaxrh w7, [x1] + __ stlrh(r5, r16); // stlrh w5, [x16] + __ ldarh(r2, r12); // ldarh w2, [x12] // LoadStoreExclusiveOp - __ stxrb(r10, r30, r1); // stxrb w10, w30, [x1] - __ stlxrb(r20, r21, r22); // stlxrb w20, w21, [x22] - __ ldxrb(r25, r2); // ldxrb w25, [x2] - __ ldaxrb(r24, r5); // ldaxrb w24, [x5] - __ stlrb(r16, r3); // stlrb w16, [x3] - __ ldarb(r22, r29); // ldarb w22, [x29] + __ stxrb(r10, r12, r3); // stxrb w10, w12, [x3] + __ stlxrb(r28, r14, r26); // stlxrb w28, w14, [x26] + __ ldxrb(r30, r10); // ldxrb w30, [x10] + __ ldaxrb(r14, r21); // ldaxrb w14, [x21] + __ stlrb(r13, r9); // stlrb w13, [x9] + __ ldarb(r22, r27); // ldarb w22, [x27] // LoadStoreExclusiveOp - __ ldxp(r8, r2, r19); // ldxp x8, x2, [x19] - __ ldaxp(r7, r19, r14); // ldaxp x7, x19, [x14] - __ stxp(r8, r27, r28, r5); // stxp w8, x27, x28, [x5] - __ stlxp(r5, r8, r14, r6); // stlxp w5, x8, x14, [x6] + __ ldxp(r28, r19, r11); // ldxp x28, x19, [x11] + __ ldaxp(r30, r19, r2); // ldaxp x30, x19, [x2] + __ stxp(r2, r23, r1, r0); // stxp w2, x23, x1, [x0] + __ stlxp(r12, r16, r13, r15); // stlxp w12, x16, x13, [x15] // LoadStoreExclusiveOp - __ ldxpw(r25, r4, r22); // ldxp w25, w4, [x22] 
- __ ldaxpw(r13, r14, r15); // ldaxp w13, w14, [x15] - __ stxpw(r20, r26, r8, r10); // stxp w20, w26, w8, [x10] - __ stlxpw(r23, r18, r18, r18); // stlxp w23, w18, w18, [x18] + __ ldxpw(r18, r21, r13); // ldxp w18, w21, [x13] + __ ldaxpw(r11, r30, r8); // ldaxp w11, w30, [x8] + __ stxpw(r24, r13, r11, r1); // stxp w24, w13, w11, [x1] + __ stlxpw(r26, r21, r27, r13); // stlxp w26, w21, w27, [x13] -// base_plus_unscaled_offset +// base_plus_unscaled_offset // LoadStoreOp - __ str(r30, Address(r11, 99)); // str x30, [x11, 99] - __ strw(r23, Address(r25, -77)); // str w23, [x25, -77] - __ strb(r2, Address(r14, 3)); // strb w2, [x14, 3] - __ strh(r9, Address(r10, 5)); // strh w9, [x10, 5] - __ ldr(r20, Address(r15, 57)); // ldr x20, [x15, 57] - __ ldrw(r12, Address(r16, -78)); // ldr w12, [x16, -78] - __ ldrb(r22, Address(r26, -3)); // ldrb w22, [x26, -3] - __ ldrh(r30, Address(r19, -47)); // ldrh w30, [x19, -47] - __ ldrsb(r9, Address(r10, -12)); // ldrsb x9, [x10, -12] - __ ldrsh(r28, Address(r17, 14)); // ldrsh x28, [x17, 14] - __ ldrshw(r3, Address(r5, 10)); // ldrsh w3, [x5, 10] - __ ldrsw(r17, Address(r17, -91)); // ldrsw x17, [x17, -91] - __ ldrd(v2, Address(r20, -17)); // ldr d2, [x20, -17] - __ ldrs(v22, Address(r7, -10)); // ldr s22, [x7, -10] - __ strd(v30, Address(r18, -223)); // str d30, [x18, -223] - __ strs(v13, Address(r22, 21)); // str s13, [x22, 21] - -// pre + __ str(r11, Address(r20, -103)); // str x11, [x20, -103] + __ strw(r28, Address(r16, 62)); // str w28, [x16, 62] + __ strb(r27, Address(r9, -9)); // strb w27, [x9, -9] + __ strh(r2, Address(r25, -50)); // strh w2, [x25, -50] + __ ldr(r4, Address(r2, -241)); // ldr x4, [x2, -241] + __ ldrw(r30, Address(r20, -31)); // ldr w30, [x20, -31] + __ ldrb(r18, Address(r23, -23)); // ldrb w18, [x23, -23] + __ ldrh(r29, Address(r26, -1)); // ldrh w29, [x26, -1] + __ ldrsb(r1, Address(r9, 6)); // ldrsb x1, [x9, 6] + __ ldrsh(r11, Address(r12, 19)); // ldrsh x11, [x12, 19] + __ ldrshw(r11, Address(r1, -50)); // ldrsh w11, [x1, -50] + __ ldrsw(r19, Address(r24, 41)); // ldrsw x19, [x24, 41] + __ ldrd(v24, Address(r24, 95)); // ldr d24, [x24, 95] + __ ldrs(v15, Address(r5, -43)); // ldr s15, [x5, -43] + __ strd(v21, Address(r27, 1)); // str d21, [x27, 1] + __ strs(v23, Address(r13, -107)); // str s23, [x13, -107] + +// pre // LoadStoreOp - __ str(r9, Address(__ pre(r18, -112))); // str x9, [x18, -112]! - __ strw(r29, Address(__ pre(r23, 11))); // str w29, [x23, 11]! - __ strb(r18, Address(__ pre(r12, -1))); // strb w18, [x12, -1]! - __ strh(r16, Address(__ pre(r20, -23))); // strh w16, [x20, -23]! - __ ldr(r3, Address(__ pre(r29, 9))); // ldr x3, [x29, 9]! - __ ldrw(r25, Address(__ pre(r3, 19))); // ldr w25, [x3, 19]! - __ ldrb(r1, Address(__ pre(r29, -1))); // ldrb w1, [x29, -1]! - __ ldrh(r8, Address(__ pre(r29, -57))); // ldrh w8, [x29, -57]! - __ ldrsb(r5, Address(__ pre(r14, -13))); // ldrsb x5, [x14, -13]! - __ ldrsh(r10, Address(__ pre(r27, 1))); // ldrsh x10, [x27, 1]! - __ ldrshw(r11, Address(__ pre(r10, 25))); // ldrsh w11, [x10, 25]! - __ ldrsw(r4, Address(__ pre(r22, -92))); // ldrsw x4, [x22, -92]! - __ ldrd(v11, Address(__ pre(r23, 8))); // ldr d11, [x23, 8]! - __ ldrs(v25, Address(__ pre(r19, 54))); // ldr s25, [x19, 54]! - __ strd(v1, Address(__ pre(r7, -174))); // str d1, [x7, -174]! - __ strs(v8, Address(__ pre(r25, 54))); // str s8, [x25, 54]! - -// post + __ str(r11, Address(__ pre(r0, 8))); // str x11, [x0, 8]! + __ strw(r3, Address(__ pre(r0, 29))); // str w3, [x0, 29]! 
+ __ strb(r11, Address(__ pre(r14, 9))); // strb w11, [x14, 9]! + __ strh(r29, Address(__ pre(r24, -3))); // strh w29, [x24, -3]! + __ ldr(r13, Address(__ pre(r17, -144))); // ldr x13, [x17, -144]! + __ ldrw(r12, Address(__ pre(r22, -6))); // ldr w12, [x22, -6]! + __ ldrb(r13, Address(__ pre(r12, -10))); // ldrb w13, [x12, -10]! + __ ldrh(r0, Address(__ pre(r21, -21))); // ldrh w0, [x21, -21]! + __ ldrsb(r23, Address(__ pre(r7, 4))); // ldrsb x23, [x7, 4]! + __ ldrsh(r3, Address(__ pre(r7, -53))); // ldrsh x3, [x7, -53]! + __ ldrshw(r28, Address(__ pre(r5, -7))); // ldrsh w28, [x5, -7]! + __ ldrsw(r24, Address(__ pre(r9, -18))); // ldrsw x24, [x9, -18]! + __ ldrd(v14, Address(__ pre(r11, 12))); // ldr d14, [x11, 12]! + __ ldrs(v19, Address(__ pre(r12, -67))); // ldr s19, [x12, -67]! + __ strd(v20, Address(__ pre(r0, -253))); // str d20, [x0, -253]! + __ strs(v8, Address(__ pre(r0, 64))); // str s8, [x0, 64]! + +// post // LoadStoreOp - __ str(r5, Address(__ post(r11, 37))); // str x5, [x11], 37 - __ strw(r24, Address(__ post(r15, 19))); // str w24, [x15], 19 - __ strb(r15, Address(__ post(r26, -1))); // strb w15, [x26], -1 - __ strh(r18, Address(__ post(r18, -6))); // strh w18, [x18], -6 - __ ldr(r7, Address(__ post(r2, -230))); // ldr x7, [x2], -230 - __ ldrw(r27, Address(__ post(r11, -27))); // ldr w27, [x11], -27 - __ ldrb(r18, Address(__ post(r3, -25))); // ldrb w18, [x3], -25 - __ ldrh(r10, Address(__ post(r24, -32))); // ldrh w10, [x24], -32 - __ ldrsb(r22, Address(__ post(r10, 4))); // ldrsb x22, [x10], 4 - __ ldrsh(r17, Address(__ post(r12, 25))); // ldrsh x17, [x12], 25 - __ ldrshw(r8, Address(__ post(r7, -62))); // ldrsh w8, [x7], -62 - __ ldrsw(r23, Address(__ post(r22, -51))); // ldrsw x23, [x22], -51 - __ ldrd(v24, Address(__ post(r25, 48))); // ldr d24, [x25], 48 - __ ldrs(v21, Address(__ post(r12, -10))); // ldr s21, [x12], -10 - __ strd(v18, Address(__ post(r13, -222))); // str d18, [x13], -222 - __ strs(v16, Address(__ post(r1, -41))); // str s16, [x1], -41 - -// base_plus_reg + __ str(r4, Address(__ post(r28, -94))); // str x4, [x28], -94 + __ strw(r12, Address(__ post(r7, -54))); // str w12, [x7], -54 + __ strb(r27, Address(__ post(r10, -24))); // strb w27, [x10], -24 + __ strh(r6, Address(__ post(r8, 27))); // strh w6, [x8], 27 + __ ldr(r14, Address(__ post(r10, -202))); // ldr x14, [x10], -202 + __ ldrw(r16, Address(__ post(r5, -41))); // ldr w16, [x5], -41 + __ ldrb(r2, Address(__ post(r14, 9))); // ldrb w2, [x14], 9 + __ ldrh(r28, Address(__ post(r13, -20))); // ldrh w28, [x13], -20 + __ ldrsb(r9, Address(__ post(r13, -31))); // ldrsb x9, [x13], -31 + __ ldrsh(r3, Address(__ post(r24, -36))); // ldrsh x3, [x24], -36 + __ ldrshw(r20, Address(__ post(r3, 6))); // ldrsh w20, [x3], 6 + __ ldrsw(r7, Address(__ post(r19, -1))); // ldrsw x7, [x19], -1 + __ ldrd(v30, Address(__ post(r8, -130))); // ldr d30, [x8], -130 + __ ldrs(v25, Address(__ post(r15, 21))); // ldr s25, [x15], 21 + __ strd(v14, Address(__ post(r23, 90))); // str d14, [x23], 90 + __ strs(v8, Address(__ post(r0, -33))); // str s8, [x0], -33 + +// base_plus_reg // LoadStoreOp - __ str(r2, Address(r22, r15, Address::sxtw(0))); // str x2, [x22, w15, sxtw #0] - __ strw(r2, Address(r16, r29, Address::lsl(0))); // str w2, [x16, x29, lsl #0] - __ strb(r20, Address(r18, r14, Address::uxtw(0))); // strb w20, [x18, w14, uxtw #0] - __ strh(r6, Address(r19, r20, Address::sxtx(1))); // strh w6, [x19, x20, sxtx #1] - __ ldr(r14, Address(r29, r14, Address::sxtw(0))); // ldr x14, [x29, w14, sxtw #0] - __ ldrw(r16, 
Address(r20, r12, Address::sxtw(2))); // ldr w16, [x20, w12, sxtw #2] - __ ldrb(r9, Address(r12, r0, Address::sxtw(0))); // ldrb w9, [x12, w0, sxtw #0] - __ ldrh(r12, Address(r17, r3, Address::lsl(1))); // ldrh w12, [x17, x3, lsl #1] - __ ldrsb(r2, Address(r17, r3, Address::sxtx(0))); // ldrsb x2, [x17, x3, sxtx #0] - __ ldrsh(r7, Address(r1, r17, Address::uxtw(1))); // ldrsh x7, [x1, w17, uxtw #1] - __ ldrshw(r25, Address(r15, r18, Address::sxtw(1))); // ldrsh w25, [x15, w18, sxtw #1] - __ ldrsw(r23, Address(r21, r12, Address::lsl(0))); // ldrsw x23, [x21, x12, lsl #0] - __ ldrd(v5, Address(r13, r8, Address::lsl(3))); // ldr d5, [x13, x8, lsl #3] - __ ldrs(v3, Address(r10, r22, Address::lsl(2))); // ldr s3, [x10, x22, lsl #2] - __ strd(v14, Address(r2, r27, Address::sxtw(0))); // str d14, [x2, w27, sxtw #0] - __ strs(v20, Address(r6, r25, Address::lsl(0))); // str s20, [x6, x25, lsl #0] - -// base_plus_scaled_offset + __ str(r10, Address(r18, r21, Address::sxtw(3))); // str x10, [x18, w21, sxtw #3] + __ strw(r4, Address(r13, r22, Address::sxtw(2))); // str w4, [x13, w22, sxtw #2] + __ strb(r13, Address(r0, r19, Address::uxtw(0))); // strb w13, [x0, w19, uxtw #0] + __ strh(r12, Address(r27, r6, Address::sxtw(0))); // strh w12, [x27, w6, sxtw #0] + __ ldr(r0, Address(r8, r16, Address::lsl(0))); // ldr x0, [x8, x16, lsl #0] + __ ldrw(r0, Address(r4, r26, Address::sxtx(0))); // ldr w0, [x4, x26, sxtx #0] + __ ldrb(r14, Address(r25, r5, Address::sxtw(0))); // ldrb w14, [x25, w5, sxtw #0] + __ ldrh(r9, Address(r4, r18, Address::uxtw(0))); // ldrh w9, [x4, w18, uxtw #0] + __ ldrsb(r27, Address(r4, r7, Address::lsl(0))); // ldrsb x27, [x4, x7, lsl #0] + __ ldrsh(r15, Address(r17, r30, Address::sxtw(0))); // ldrsh x15, [x17, w30, sxtw #0] + __ ldrshw(r16, Address(r0, r22, Address::sxtw(0))); // ldrsh w16, [x0, w22, sxtw #0] + __ ldrsw(r22, Address(r10, r30, Address::sxtx(2))); // ldrsw x22, [x10, x30, sxtx #2] + __ ldrd(v29, Address(r21, r10, Address::sxtx(3))); // ldr d29, [x21, x10, sxtx #3] + __ ldrs(v3, Address(r11, r19, Address::uxtw(0))); // ldr s3, [x11, w19, uxtw #0] + __ strd(v13, Address(r28, r29, Address::uxtw(3))); // str d13, [x28, w29, uxtw #3] + __ strs(v23, Address(r29, r5, Address::sxtx(2))); // str s23, [x29, x5, sxtx #2] + +// base_plus_scaled_offset // LoadStoreOp - __ str(r30, Address(r7, 16256)); // str x30, [x7, 16256] - __ strw(r15, Address(r8, 7588)); // str w15, [x8, 7588] - __ strb(r11, Address(r0, 1866)); // strb w11, [x0, 1866] - __ strh(r3, Address(r17, 3734)); // strh w3, [x17, 3734] - __ ldr(r2, Address(r7, 14224)); // ldr x2, [x7, 14224] - __ ldrw(r5, Address(r9, 7396)); // ldr w5, [x9, 7396] - __ ldrb(r28, Address(r9, 1721)); // ldrb w28, [x9, 1721] - __ ldrh(r2, Address(r20, 3656)); // ldrh w2, [x20, 3656] - __ ldrsb(r22, Address(r14, 1887)); // ldrsb x22, [x14, 1887] - __ ldrsh(r8, Address(r0, 4080)); // ldrsh x8, [x0, 4080] - __ ldrshw(r0, Address(r30, 3916)); // ldrsh w0, [x30, 3916] - __ ldrsw(r24, Address(r19, 6828)); // ldrsw x24, [x19, 6828] - __ ldrd(v24, Address(r12, 13032)); // ldr d24, [x12, 13032] - __ ldrs(v8, Address(r8, 7452)); // ldr s8, [x8, 7452] - __ strd(v10, Address(r15, 15992)); // str d10, [x15, 15992] - __ strs(v26, Address(r19, 6688)); // str s26, [x19, 6688] - -// pcrel + __ str(r5, Address(r8, 12600)); // str x5, [x8, 12600] + __ strw(r29, Address(r24, 7880)); // str w29, [x24, 7880] + __ strb(r19, Address(r17, 1566)); // strb w19, [x17, 1566] + __ strh(r13, Address(r19, 3984)); // strh w13, [x19, 3984] + __ ldr(r19, Address(r23, 
13632)); // ldr x19, [x23, 13632] + __ ldrw(r23, Address(r29, 6264)); // ldr w23, [x29, 6264] + __ ldrb(r22, Address(r11, 2012)); // ldrb w22, [x11, 2012] + __ ldrh(r3, Address(r10, 3784)); // ldrh w3, [x10, 3784] + __ ldrsb(r8, Address(r16, 1951)); // ldrsb x8, [x16, 1951] + __ ldrsh(r23, Address(r20, 3346)); // ldrsh x23, [x20, 3346] + __ ldrshw(r2, Address(r1, 3994)); // ldrsh w2, [x1, 3994] + __ ldrsw(r4, Address(r17, 7204)); // ldrsw x4, [x17, 7204] + __ ldrd(v20, Address(r27, 14400)); // ldr d20, [x27, 14400] + __ ldrs(v25, Address(r14, 8096)); // ldr s25, [x14, 8096] + __ strd(v26, Address(r10, 15024)); // str d26, [x10, 15024] + __ strs(v9, Address(r3, 6936)); // str s9, [x3, 6936] + +// pcrel // LoadStoreOp - __ ldr(r10, forth); // ldr x10, forth - __ ldrw(r3, __ pc()); // ldr w3, . + __ ldr(r27, forth); // ldr x27, forth + __ ldrw(r11, __ pc()); // ldr w11, . // LoadStoreOp - __ prfm(Address(r23, 9)); // prfm PLDL1KEEP, [x23, 9] + __ prfm(Address(r3, -187)); // prfm PLDL1KEEP, [x3, -187] // LoadStoreOp - __ prfm(back); // prfm PLDL1KEEP, back + __ prfm(__ pc()); // prfm PLDL1KEEP, . // LoadStoreOp - __ prfm(Address(r3, r8, Address::uxtw(0))); // prfm PLDL1KEEP, [x3, w8, uxtw #0] + __ prfm(Address(r29, r14, Address::lsl(0))); // prfm PLDL1KEEP, [x29, x14, lsl #0] // LoadStoreOp - __ prfm(Address(r11, 15080)); // prfm PLDL1KEEP, [x11, 15080] + __ prfm(Address(r4, 13312)); // prfm PLDL1KEEP, [x4, 13312] // AddSubCarryOp - __ adcw(r13, r9, r28); // adc w13, w9, w28 - __ adcsw(r27, r19, r28); // adcs w27, w19, w28 - __ sbcw(r19, r18, r6); // sbc w19, w18, w6 - __ sbcsw(r14, r20, r3); // sbcs w14, w20, w3 - __ adc(r16, r14, r8); // adc x16, x14, x8 - __ adcs(r0, r29, r8); // adcs x0, x29, x8 - __ sbc(r8, r24, r20); // sbc x8, x24, x20 - __ sbcs(r12, r28, r0); // sbcs x12, x28, x0 + __ adcw(r21, r1, r7); // adc w21, w1, w7 + __ adcsw(r8, r5, r7); // adcs w8, w5, w7 + __ sbcw(r7, r27, r14); // sbc w7, w27, w14 + __ sbcsw(r27, r4, r17); // sbcs w27, w4, w17 + __ adc(r0, r28, r0); // adc x0, x28, x0 + __ adcs(r12, r24, r30); // adcs x12, x24, x30 + __ sbc(r0, r25, r15); // sbc x0, x25, x15 + __ sbcs(r1, r24, r3); // sbcs x1, x24, x3 // AddSubExtendedOp - __ addw(r23, r6, r16, ext::uxtb, 4); // add w23, w6, w16, uxtb #4 - __ addsw(r25, r25, r23, ext::sxth, 2); // adds w25, w25, w23, sxth #2 - __ sub(r26, r22, r4, ext::uxtx, 1); // sub x26, x22, x4, uxtx #1 - __ subsw(r17, r29, r19, ext::sxtx, 3); // subs w17, w29, w19, sxtx #3 - __ add(r11, r30, r21, ext::uxtb, 3); // add x11, x30, x21, uxtb #3 - __ adds(r16, r19, r0, ext::sxtb, 2); // adds x16, x19, x0, sxtb #2 - __ sub(r11, r9, r25, ext::sxtx, 1); // sub x11, x9, x25, sxtx #1 - __ subs(r17, r20, r12, ext::sxtb, 4); // subs x17, x20, x12, sxtb #4 + __ addw(r18, r24, r20, ext::uxtb, 2); // add w18, w24, w20, uxtb #2 + __ addsw(r13, r28, r10, ext::uxth, 1); // adds w13, w28, w10, uxth #1 + __ sub(r15, r16, r2, ext::sxth, 2); // sub x15, x16, x2, sxth #2 + __ subsw(r29, r13, r13, ext::uxth, 2); // subs w29, w13, w13, uxth #2 + __ add(r12, r20, r12, ext::sxtw, 3); // add x12, x20, x12, sxtw #3 + __ adds(r30, r27, r11, ext::sxtb, 1); // adds x30, x27, x11, sxtb #1 + __ sub(r14, r7, r1, ext::sxtw, 2); // sub x14, x7, x1, sxtw #2 + __ subs(r29, r3, r27, ext::sxth, 1); // subs x29, x3, x27, sxth #1 // ConditionalCompareOp - __ ccmnw(r13, r11, 3u, Assembler::LE); // ccmn w13, w11, #3, LE - __ ccmpw(r13, r12, 2u, Assembler::HI); // ccmp w13, w12, #2, HI - __ ccmn(r3, r2, 12u, Assembler::NE); // ccmn x3, x2, #12, NE - __ ccmp(r7, r21, 3u, 
Assembler::VS); // ccmp x7, x21, #3, VS + __ ccmnw(r0, r13, 14u, Assembler::MI); // ccmn w0, w13, #14, MI + __ ccmpw(r22, r18, 6u, Assembler::CC); // ccmp w22, w18, #6, CC + __ ccmn(r18, r30, 14u, Assembler::VS); // ccmn x18, x30, #14, VS + __ ccmp(r10, r19, 12u, Assembler::HI); // ccmp x10, x19, #12, HI // ConditionalCompareImmedOp - __ ccmnw(r2, 14, 4, Assembler::CC); // ccmn w2, #14, #4, CC - __ ccmpw(r17, 17, 6, Assembler::PL); // ccmp w17, #17, #6, PL - __ ccmn(r10, 12, 0, Assembler::CS); // ccmn x10, #12, #0, CS - __ ccmp(r21, 18, 14, Assembler::GE); // ccmp x21, #18, #14, GE + __ ccmnw(r6, 18, 2, Assembler::LE); // ccmn w6, #18, #2, LE + __ ccmpw(r9, 13, 4, Assembler::HI); // ccmp w9, #13, #4, HI + __ ccmn(r21, 11, 11, Assembler::LO); // ccmn x21, #11, #11, LO + __ ccmp(r4, 13, 2, Assembler::VC); // ccmp x4, #13, #2, VC // ConditionalSelectOp - __ cselw(r21, r13, r12, Assembler::GT); // csel w21, w13, w12, GT - __ csincw(r10, r27, r15, Assembler::LS); // csinc w10, w27, w15, LS - __ csinvw(r0, r13, r9, Assembler::HI); // csinv w0, w13, w9, HI - __ csnegw(r18, r4, r26, Assembler::VS); // csneg w18, w4, w26, VS - __ csel(r12, r29, r7, Assembler::LS); // csel x12, x29, x7, LS - __ csinc(r6, r7, r20, Assembler::VC); // csinc x6, x7, x20, VC - __ csinv(r22, r21, r3, Assembler::LE); // csinv x22, x21, x3, LE - __ csneg(r19, r12, r27, Assembler::LS); // csneg x19, x12, x27, LS + __ cselw(r12, r2, r22, Assembler::HI); // csel w12, w2, w22, HI + __ csincw(r24, r16, r17, Assembler::HS); // csinc w24, w16, w17, HS + __ csinvw(r6, r7, r16, Assembler::LT); // csinv w6, w7, w16, LT + __ csnegw(r11, r27, r22, Assembler::LS); // csneg w11, w27, w22, LS + __ csel(r10, r3, r29, Assembler::LT); // csel x10, x3, x29, LT + __ csinc(r12, r26, r27, Assembler::CC); // csinc x12, x26, x27, CC + __ csinv(r15, r10, r21, Assembler::GT); // csinv x15, x10, x21, GT + __ csneg(r30, r23, r9, Assembler::GT); // csneg x30, x23, x9, GT // TwoRegOp - __ rbitw(r0, r16); // rbit w0, w16 - __ rev16w(r17, r23); // rev16 w17, w23 - __ revw(r17, r14); // rev w17, w14 - __ clzw(r24, r30); // clz w24, w30 - __ clsw(r24, r22); // cls w24, w22 - __ rbit(r3, r17); // rbit x3, x17 - __ rev16(r12, r13); // rev16 x12, x13 - __ rev32(r9, r22); // rev32 x9, x22 - __ rev(r0, r0); // rev x0, x0 - __ clz(r5, r16); // clz x5, x16 - __ cls(r25, r22); // cls x25, x22 + __ rbitw(r30, r10); // rbit w30, w10 + __ rev16w(r29, r15); // rev16 w29, w15 + __ revw(r29, r30); // rev w29, w30 + __ clzw(r25, r21); // clz w25, w21 + __ clsw(r4, r0); // cls w4, w0 + __ rbit(r18, r21); // rbit x18, x21 + __ rev16(r29, r16); // rev16 x29, x16 + __ rev32(r21, r20); // rev32 x21, x20 + __ rev(r6, r19); // rev x6, x19 + __ clz(r30, r3); // clz x30, x3 + __ cls(r21, r19); // cls x21, x19 // ThreeRegOp - __ udivw(r29, r4, r0); // udiv w29, w4, w0 - __ sdivw(r0, r29, r29); // sdiv w0, w29, w29 - __ lslvw(r5, r17, r21); // lslv w5, w17, w21 - __ lsrvw(r9, r9, r18); // lsrv w9, w9, w18 - __ asrvw(r1, r27, r8); // asrv w1, w27, w8 - __ rorvw(r18, r20, r13); // rorv w18, w20, w13 - __ udiv(r8, r25, r12); // udiv x8, x25, x12 - __ sdiv(r7, r5, r28); // sdiv x7, x5, x28 - __ lslv(r5, r17, r27); // lslv x5, x17, x27 - __ lsrv(r23, r26, r20); // lsrv x23, x26, x20 - __ asrv(r28, r8, r28); // asrv x28, x8, x28 - __ rorv(r3, r29, r4); // rorv x3, x29, x4 + __ udivw(r11, r24, r0); // udiv w11, w24, w0 + __ sdivw(r27, r25, r14); // sdiv w27, w25, w14 + __ lslvw(r3, r14, r18); // lslv w3, w14, w18 + __ lsrvw(r7, r15, r24); // lsrv w7, w15, w24 + __ asrvw(r28, r17, r25); // 
asrv w28, w17, w25 + __ rorvw(r2, r26, r28); // rorv w2, w26, w28 + __ udiv(r5, r25, r26); // udiv x5, x25, x26 + __ sdiv(r27, r16, r18); // sdiv x27, x16, x18 + __ lslv(r6, r21, r12); // lslv x6, x21, x12 + __ lsrv(r0, r4, r12); // lsrv x0, x4, x12 + __ asrv(r27, r17, r28); // asrv x27, x17, x28 + __ rorv(r28, r2, r18); // rorv x28, x2, x18 // FourRegMulOp - __ maddw(r17, r14, r26, r21); // madd w17, w14, w26, w21 - __ msubw(r1, r30, r11, r11); // msub w1, w30, w11, w11 - __ madd(r1, r17, r6, r28); // madd x1, x17, x6, x28 - __ msub(r30, r6, r30, r8); // msub x30, x6, x30, x8 - __ smaddl(r21, r6, r14, r8); // smaddl x21, w6, w14, x8 - __ smsubl(r10, r10, r24, r19); // smsubl x10, w10, w24, x19 - __ umaddl(r20, r18, r14, r24); // umaddl x20, w18, w14, x24 - __ umsubl(r18, r2, r5, r5); // umsubl x18, w2, w5, x5 + __ maddw(r10, r15, r14, r14); // madd w10, w15, w14, w14 + __ msubw(r3, r25, r15, r19); // msub w3, w25, w15, w19 + __ madd(r14, r5, r16, r4); // madd x14, x5, x16, x4 + __ msub(r26, r25, r4, r2); // msub x26, x25, x4, x2 + __ smaddl(r2, r12, r29, r17); // smaddl x2, w12, w29, x17 + __ smsubl(r8, r7, r3, r4); // smsubl x8, w7, w3, x4 + __ umaddl(r25, r4, r26, r25); // umaddl x25, w4, w26, x25 + __ umsubl(r4, r17, r0, r26); // umsubl x4, w17, w0, x26 // ThreeRegFloatOp - __ fmuls(v8, v18, v13); // fmul s8, s18, s13 - __ fdivs(v2, v14, v28); // fdiv s2, s14, s28 - __ fadds(v15, v12, v28); // fadd s15, s12, s28 - __ fsubs(v0, v12, v1); // fsub s0, s12, s1 - __ fmuls(v15, v29, v4); // fmul s15, s29, s4 - __ fmuld(v12, v1, v23); // fmul d12, d1, d23 - __ fdivd(v27, v8, v18); // fdiv d27, d8, d18 - __ faddd(v23, v20, v11); // fadd d23, d20, d11 - __ fsubd(v8, v12, v18); // fsub d8, d12, d18 - __ fmuld(v26, v24, v23); // fmul d26, d24, d23 + __ fmuls(v17, v23, v15); // fmul s17, s23, s15 + __ fdivs(v21, v28, v17); // fdiv s21, s28, s17 + __ fadds(v27, v10, v3); // fadd s27, s10, s3 + __ fsubs(v0, v7, v25); // fsub s0, s7, s25 + __ fmuls(v9, v6, v15); // fmul s9, s6, s15 + __ fmuld(v29, v15, v10); // fmul d29, d15, d10 + __ fdivd(v2, v17, v7); // fdiv d2, d17, d7 + __ faddd(v11, v11, v23); // fadd d11, d11, d23 + __ fsubd(v7, v29, v23); // fsub d7, d29, d23 + __ fmuld(v14, v27, v11); // fmul d14, d27, d11 // FourRegFloatOp - __ fmadds(v21, v23, v13, v25); // fmadd s21, s23, s13, s25 - __ fmsubs(v22, v10, v1, v14); // fmsub s22, s10, s1, s14 - __ fnmadds(v14, v20, v2, v30); // fnmadd s14, s20, s2, s30 - __ fnmadds(v7, v29, v22, v22); // fnmadd s7, s29, s22, s22 - __ fmaddd(v13, v5, v15, v5); // fmadd d13, d5, d15, d5 - __ fmsubd(v14, v12, v5, v10); // fmsub d14, d12, d5, d10 - __ fnmaddd(v10, v19, v0, v1); // fnmadd d10, d19, d0, d1 - __ fnmaddd(v20, v2, v2, v0); // fnmadd d20, d2, d2, d0 + __ fmadds(v11, v4, v24, v12); // fmadd s11, s4, s24, s12 + __ fmsubs(v15, v14, v20, v11); // fmsub s15, s14, s20, s11 + __ fnmadds(v28, v13, v11, v12); // fnmadd s28, s13, s11, s12 + __ fnmadds(v23, v30, v26, v14); // fnmadd s23, s30, s26, s14 + __ fmaddd(v9, v13, v10, v7); // fmadd d9, d13, d10, d7 + __ fmsubd(v5, v29, v15, v3); // fmsub d5, d29, d15, d3 + __ fnmaddd(v11, v12, v15, v30); // fnmadd d11, d12, d15, d30 + __ fnmaddd(v30, v17, v19, v20); // fnmadd d30, d17, d19, d20 // TwoRegFloatOp - __ fmovs(v25, v9); // fmov s25, s9 - __ fabss(v20, v4); // fabs s20, s4 - __ fnegs(v3, v27); // fneg s3, s27 - __ fsqrts(v1, v2); // fsqrt s1, s2 - __ fcvts(v30, v0); // fcvt d30, s0 - __ fmovd(v12, v4); // fmov d12, d4 - __ fabsd(v1, v27); // fabs d1, d27 - __ fnegd(v8, v22); // fneg d8, d22 - __ fsqrtd(v11, 
v11); // fsqrt d11, d11 - __ fcvtd(v22, v28); // fcvt s22, d28 + __ fmovs(v27, v7); // fmov s27, s7 + __ fabss(v9, v21); // fabs s9, s21 + __ fnegs(v2, v9); // fneg s2, s9 + __ fsqrts(v27, v7); // fsqrt s27, s7 + __ fcvts(v29, v30); // fcvt d29, s30 + __ fmovd(v17, v1); // fmov d17, d1 + __ fabsd(v2, v6); // fabs d2, d6 + __ fnegd(v10, v3); // fneg d10, d3 + __ fsqrtd(v24, v11); // fsqrt d24, d11 + __ fcvtd(v7, v1); // fcvt s7, d1 // FloatConvertOp - __ fcvtzsw(r28, v22); // fcvtzs w28, s22 - __ fcvtzs(r20, v27); // fcvtzs x20, s27 - __ fcvtzdw(r14, v0); // fcvtzs w14, d0 - __ fcvtzd(r26, v11); // fcvtzs x26, d11 - __ scvtfws(v28, r22); // scvtf s28, w22 - __ scvtfs(v16, r10); // scvtf s16, x10 - __ scvtfwd(v8, r21); // scvtf d8, w21 - __ scvtfd(v21, r28); // scvtf d21, x28 - __ fmovs(r24, v24); // fmov w24, s24 - __ fmovd(r8, v19); // fmov x8, d19 - __ fmovs(v8, r12); // fmov s8, w12 - __ fmovd(v6, r7); // fmov d6, x7 + __ fcvtzsw(r11, v0); // fcvtzs w11, s0 + __ fcvtzs(r3, v18); // fcvtzs x3, s18 + __ fcvtzdw(r28, v6); // fcvtzs w28, d6 + __ fcvtzd(r22, v6); // fcvtzs x22, d6 + __ scvtfws(v0, r27); // scvtf s0, w27 + __ scvtfs(v26, r2); // scvtf s26, x2 + __ scvtfwd(v5, r7); // scvtf d5, w7 + __ scvtfd(v28, r11); // scvtf d28, x11 + __ fmovs(r25, v13); // fmov w25, s13 + __ fmovd(r11, v23); // fmov x11, d23 + __ fmovs(v19, r8); // fmov s19, w8 + __ fmovd(v18, r21); // fmov d18, x21 // TwoRegFloatOp - __ fcmps(v30, v16); // fcmp s30, s16 - __ fcmpd(v25, v11); // fcmp d25, d11 - __ fcmps(v11, 0.0); // fcmp s11, #0.0 - __ fcmpd(v11, 0.0); // fcmp d11, #0.0 + __ fcmps(v25, v20); // fcmp s25, s20 + __ fcmpd(v19, v18); // fcmp d19, d18 + __ fcmps(v2, 0.0); // fcmp s2, #0.0 + __ fcmpd(v29, 0.0); // fcmp d29, #0.0 // LoadStorePairOp - __ stpw(r29, r12, Address(r17, 128)); // stp w29, w12, [x17, #128] - __ ldpw(r22, r18, Address(r14, -96)); // ldp w22, w18, [x14, #-96] - __ ldpsw(r11, r16, Address(r1, 64)); // ldpsw x11, x16, [x1, #64] - __ stp(r0, r11, Address(r26, 112)); // stp x0, x11, [x26, #112] - __ ldp(r7, r1, Address(r26, 16)); // ldp x7, x1, [x26, #16] + __ stpw(r8, r21, Address(r19, 16)); // stp w8, w21, [x19, #16] + __ ldpw(r6, r15, Address(r20, 0)); // ldp w6, w15, [x20, #0] + __ ldpsw(r27, r14, Address(r3, -208)); // ldpsw x27, x14, [x3, #-208] + __ stp(r10, r12, Address(r11, -80)); // stp x10, x12, [x11, #-80] + __ ldp(r7, r14, Address(r7, -32)); // ldp x7, x14, [x7, #-32] // LoadStorePairOp - __ stpw(r10, r7, Address(__ pre(r24, 0))); // stp w10, w7, [x24, #0]! - __ ldpw(r7, r28, Address(__ pre(r24, -256))); // ldp w7, w28, [x24, #-256]! - __ ldpsw(r25, r28, Address(__ pre(r21, -240))); // ldpsw x25, x28, [x21, #-240]! - __ stp(r20, r18, Address(__ pre(r14, -16))); // stp x20, x18, [x14, #-16]! - __ ldp(r8, r10, Address(__ pre(r13, 80))); // ldp x8, x10, [x13, #80]! + __ stpw(r0, r22, Address(__ pre(r12, 112))); // stp w0, w22, [x12, #112]! + __ ldpw(r14, r7, Address(__ pre(r8, 48))); // ldp w14, w7, [x8, #48]! + __ ldpsw(r16, r2, Address(__ pre(r9, 0))); // ldpsw x16, x2, [x9, #0]! + __ stp(r20, r29, Address(__ pre(r1, -64))); // stp x20, x29, [x1, #-64]! + __ ldp(r21, r12, Address(__ pre(r5, 80))); // ldp x21, x12, [x5, #80]! 
// LoadStorePairOp - __ stpw(r26, r24, Address(__ post(r2, -128))); // stp w26, w24, [x2], #-128 - __ ldpw(r2, r25, Address(__ post(r21, -192))); // ldp w2, w25, [x21], #-192 - __ ldpsw(r17, r2, Address(__ post(r21, -144))); // ldpsw x17, x2, [x21], #-144 - __ stp(r12, r10, Address(__ post(r11, 96))); // stp x12, x10, [x11], #96 - __ ldp(r24, r6, Address(__ post(r17, -32))); // ldp x24, x6, [x17], #-32 + __ stpw(r24, r24, Address(__ post(r27, -112))); // stp w24, w24, [x27], #-112 + __ ldpw(r28, r22, Address(__ post(r18, 16))); // ldp w28, w22, [x18], #16 + __ ldpsw(r17, r6, Address(__ post(r13, -96))); // ldpsw x17, x6, [x13], #-96 + __ stp(r28, r26, Address(__ post(r5, -160))); // stp x28, x26, [x5], #-160 + __ ldp(r6, r21, Address(__ post(r26, -240))); // ldp x6, x21, [x26], #-240 // LoadStorePairOp - __ stnpw(r3, r30, Address(r14, -224)); // stnp w3, w30, [x14, #-224] - __ ldnpw(r15, r20, Address(r26, -144)); // ldnp w15, w20, [x26, #-144] - __ stnp(r22, r25, Address(r12, -128)); // stnp x22, x25, [x12, #-128] - __ ldnp(r27, r22, Address(r17, -176)); // ldnp x27, x22, [x17, #-176] + __ stnpw(r13, r20, Address(r30, 32)); // stnp w13, w20, [x30, #32] + __ ldnpw(r17, r11, Address(r5, 96)); // ldnp w17, w11, [x5, #96] + __ stnp(r13, r20, Address(r26, -96)); // stnp x13, x20, [x26, #-96] + __ ldnp(r29, r12, Address(r23, -80)); // ldnp x29, x12, [x23, #-80] + +// SpecialCases + __ sve_cpy(z0, __ S, p0, v1); // mov z0.s, p0/m, s1 + __ sve_inc(r0, __ S); // incw x0 + __ sve_dec(r1, __ H); // dech x1 + __ sve_lsl(z0, __ B, z1, 7); // lsl z0.b, z1.b, #7 + __ sve_lsl(z21, __ H, z1, 15); // lsl z21.h, z1.h, #15 + __ sve_lsl(z0, __ S, z1, 31); // lsl z0.s, z1.s, #31 + __ sve_lsl(z0, __ D, z1, 63); // lsl z0.d, z1.d, #63 + __ sve_lsr(z0, __ B, z1, 7); // lsr z0.b, z1.b, #7 + __ sve_asr(z0, __ H, z11, 15); // asr z0.h, z11.h, #15 + __ sve_lsr(z30, __ S, z1, 31); // lsr z30.s, z1.s, #31 + __ sve_asr(z0, __ D, z1, 63); // asr z0.d, z1.d, #63 + __ sve_addvl(sp, r0, 31); // addvl sp, x0, #31 + __ sve_addpl(r1, sp, -32); // addpl x1, sp, -32 + __ sve_cntp(r8, __ B, p0, p1); // cntp x8, p0, p1.b + __ sve_dup(z0, __ B, 127); // dup z0.b, 127 + __ sve_dup(z1, __ H, -128); // dup z1.h, -128 + __ sve_dup(z2, __ S, 32512); // dup z2.s, 32512 + __ sve_dup(z7, __ D, -32768); // dup z7.d, -32768 + __ sve_ld1b(z0, __ B, p0, Address(sp)); // ld1b {z0.b}, p0/z, [sp] + __ sve_ld1h(z10, __ H, p1, Address(sp, -8)); // ld1h {z10.h}, p1/z, [sp, #-8, MUL VL] + __ sve_ld1w(z20, __ S, p2, Address(r0, 7)); // ld1w {z20.s}, p2/z, [x0, #7, MUL VL] + __ sve_ld1b(z30, __ B, p3, Address(sp, r8)); // ld1b {z30.b}, p3/z, [sp, x8] + __ sve_ld1w(z0, __ S, p4, Address(sp, r28)); // ld1w {z0.s}, p4/z, [sp, x28, LSL #2] + __ sve_ld1d(z11, __ D, p5, Address(r0, r1)); // ld1d {z11.d}, p5/z, [x0, x1, LSL #3] + __ sve_st1b(z22, __ B, p6, Address(sp)); // st1b {z22.b}, p6, [sp] + __ sve_st1b(z31, __ B, p7, Address(sp, -8)); // st1b {z31.b}, p7, [sp, #-8, MUL VL] + __ sve_st1w(z0, __ S, p1, Address(r0, 7)); // st1w {z0.s}, p1, [x0, #7, MUL VL] + __ sve_st1b(z0, __ B, p2, Address(sp, r1)); // st1b {z0.b}, p2, [sp, x1] + __ sve_st1h(z0, __ H, p3, Address(sp, r8)); // st1h {z0.h}, p3, [sp, x8, LSL #1] + __ sve_st1d(z0, __ D, p4, Address(r0, r18)); // st1d {z0.d}, p4, [x0, x18, LSL #3] + __ sve_ldr(z0, Address(sp)); // ldr z0, [sp] + __ sve_ldr(z31, Address(sp, -256)); // ldr z31, [sp, #-256, MUL VL] + __ sve_str(z8, Address(r8, 255)); // str z8, [x8, #255, MUL VL] // FloatImmediateOp - __ fmovd(v0, 2.0); // fmov d0, #2.0 - __ fmovd(v0, 2.125); // 
fmov d0, #2.125 - __ fmovd(v0, 4.0); // fmov d0, #4.0 - __ fmovd(v0, 4.25); // fmov d0, #4.25 - __ fmovd(v0, 8.0); // fmov d0, #8.0 - __ fmovd(v0, 8.5); // fmov d0, #8.5 - __ fmovd(v0, 16.0); // fmov d0, #16.0 - __ fmovd(v0, 17.0); // fmov d0, #17.0 - __ fmovd(v0, 0.125); // fmov d0, #0.125 - __ fmovd(v0, 0.1328125); // fmov d0, #0.1328125 - __ fmovd(v0, 0.25); // fmov d0, #0.25 - __ fmovd(v0, 0.265625); // fmov d0, #0.265625 - __ fmovd(v0, 0.5); // fmov d0, #0.5 - __ fmovd(v0, 0.53125); // fmov d0, #0.53125 - __ fmovd(v0, 1.0); // fmov d0, #1.0 - __ fmovd(v0, 1.0625); // fmov d0, #1.0625 - __ fmovd(v0, -2.0); // fmov d0, #-2.0 - __ fmovd(v0, -2.125); // fmov d0, #-2.125 - __ fmovd(v0, -4.0); // fmov d0, #-4.0 - __ fmovd(v0, -4.25); // fmov d0, #-4.25 - __ fmovd(v0, -8.0); // fmov d0, #-8.0 - __ fmovd(v0, -8.5); // fmov d0, #-8.5 - __ fmovd(v0, -16.0); // fmov d0, #-16.0 - __ fmovd(v0, -17.0); // fmov d0, #-17.0 - __ fmovd(v0, -0.125); // fmov d0, #-0.125 - __ fmovd(v0, -0.1328125); // fmov d0, #-0.1328125 - __ fmovd(v0, -0.25); // fmov d0, #-0.25 - __ fmovd(v0, -0.265625); // fmov d0, #-0.265625 - __ fmovd(v0, -0.5); // fmov d0, #-0.5 - __ fmovd(v0, -0.53125); // fmov d0, #-0.53125 - __ fmovd(v0, -1.0); // fmov d0, #-1.0 - __ fmovd(v0, -1.0625); // fmov d0, #-1.0625 + __ fmovd(v0, 2.0); // fmov d0, #2.0 + __ fmovd(v0, 2.125); // fmov d0, #2.125 + __ fmovd(v0, 4.0); // fmov d0, #4.0 + __ fmovd(v0, 4.25); // fmov d0, #4.25 + __ fmovd(v0, 8.0); // fmov d0, #8.0 + __ fmovd(v0, 8.5); // fmov d0, #8.5 + __ fmovd(v0, 16.0); // fmov d0, #16.0 + __ fmovd(v0, 17.0); // fmov d0, #17.0 + __ fmovd(v0, 0.125); // fmov d0, #0.125 + __ fmovd(v0, 0.1328125); // fmov d0, #0.1328125 + __ fmovd(v0, 0.25); // fmov d0, #0.25 + __ fmovd(v0, 0.265625); // fmov d0, #0.265625 + __ fmovd(v0, 0.5); // fmov d0, #0.5 + __ fmovd(v0, 0.53125); // fmov d0, #0.53125 + __ fmovd(v0, 1.0); // fmov d0, #1.0 + __ fmovd(v0, 1.0625); // fmov d0, #1.0625 + __ fmovd(v0, -2.0); // fmov d0, #-2.0 + __ fmovd(v0, -2.125); // fmov d0, #-2.125 + __ fmovd(v0, -4.0); // fmov d0, #-4.0 + __ fmovd(v0, -4.25); // fmov d0, #-4.25 + __ fmovd(v0, -8.0); // fmov d0, #-8.0 + __ fmovd(v0, -8.5); // fmov d0, #-8.5 + __ fmovd(v0, -16.0); // fmov d0, #-16.0 + __ fmovd(v0, -17.0); // fmov d0, #-17.0 + __ fmovd(v0, -0.125); // fmov d0, #-0.125 + __ fmovd(v0, -0.1328125); // fmov d0, #-0.1328125 + __ fmovd(v0, -0.25); // fmov d0, #-0.25 + __ fmovd(v0, -0.265625); // fmov d0, #-0.265625 + __ fmovd(v0, -0.5); // fmov d0, #-0.5 + __ fmovd(v0, -0.53125); // fmov d0, #-0.53125 + __ fmovd(v0, -1.0); // fmov d0, #-1.0 + __ fmovd(v0, -1.0625); // fmov d0, #-1.0625 + +// SVEVectorOp + __ sve_add(z14, __ S, z16, z27); // add z14.s, z16.s, z27.s + __ sve_sub(z0, __ S, z6, z26); // sub z0.s, z6.s, z26.s + __ sve_fadd(z27, __ S, z12, z6); // fadd z27.s, z12.s, z6.s + __ sve_fmul(z30, __ S, z4, z19); // fmul z30.s, z4.s, z19.s + __ sve_fsub(z11, __ D, z16, z2); // fsub z11.d, z16.d, z2.d + __ sve_abs(z15, __ D, p0, z12); // abs z15.d, p0/m, z12.d + __ sve_add(z9, __ B, p5, z23); // add z9.b, p5/m, z9.b, z23.b + __ sve_asr(z30, __ S, p0, z26); // asr z30.s, p0/m, z30.s, z26.s + __ sve_cnt(z4, __ H, p2, z18); // cnt z4.h, p2/m, z18.h + __ sve_lsl(z25, __ S, p1, z11); // lsl z25.s, p1/m, z25.s, z11.s + __ sve_lsr(z10, __ B, p6, z8); // lsr z10.b, p6/m, z10.b, z8.b + __ sve_mul(z4, __ B, p5, z17); // mul z4.b, p5/m, z4.b, z17.b + __ sve_neg(z30, __ S, p3, z9); // neg z30.s, p3/m, z9.s + __ sve_not(z0, __ D, p3, z20); // not z0.d, p3/m, z20.d + __ sve_smax(z23, __ H, p7, 
z3); // smax z23.h, p7/m, z23.h, z3.h + __ sve_smin(z0, __ H, p2, z11); // smin z0.h, p2/m, z0.h, z11.h + __ sve_sub(z11, __ D, p6, z5); // sub z11.d, p6/m, z11.d, z5.d + __ sve_fabs(z16, __ S, p2, z17); // fabs z16.s, p2/m, z17.s + __ sve_fadd(z15, __ S, p0, z26); // fadd z15.s, p0/m, z15.s, z26.s + __ sve_fdiv(z10, __ S, p7, z19); // fdiv z10.s, p7/m, z10.s, z19.s + __ sve_fmax(z24, __ D, p0, z17); // fmax z24.d, p0/m, z24.d, z17.d + __ sve_fmin(z26, __ D, p4, z15); // fmin z26.d, p4/m, z26.d, z15.d + __ sve_fmul(z24, __ D, p2, z17); // fmul z24.d, p2/m, z24.d, z17.d + __ sve_fneg(z30, __ S, p5, z29); // fneg z30.s, p5/m, z29.s + __ sve_frintm(z18, __ S, p5, z10); // frintm z18.s, p5/m, z10.s + __ sve_frintn(z30, __ D, p2, z30); // frintn z30.d, p2/m, z30.d + __ sve_frintp(z6, __ S, p6, z30); // frintp z6.s, p6/m, z30.s + __ sve_fsqrt(z20, __ D, p6, z2); // fsqrt z20.d, p6/m, z2.d + __ sve_fsub(z9, __ S, p5, z29); // fsub z9.s, p5/m, z9.s, z29.s + __ sve_fmla(z18, __ D, p2, z3, z22); // fmla z18.d, p2/m, z3.d, z22.d + __ sve_fmls(z15, __ D, p2, z13, z12); // fmls z15.d, p2/m, z13.d, z12.d + __ sve_fnmla(z12, __ S, p0, z30, z30); // fnmla z12.s, p0/m, z30.s, z30.s + __ sve_fnmls(z7, __ D, p3, z21, z0); // fnmls z7.d, p3/m, z21.d, z0.d + __ sve_mla(z19, __ H, p2, z26, z20); // mla z19.h, p2/m, z26.h, z20.h + __ sve_mls(z16, __ D, p7, z1, z21); // mls z16.d, p7/m, z1.d, z21.d + __ sve_and(z21, z4, z18); // and z21.d, z4.d, z18.d + __ sve_eor(z12, z18, z7); // eor z12.d, z18.d, z7.d + __ sve_orr(z25, z15, z13); // orr z25.d, z15.d, z13.d + +// SVEReductionOp + __ sve_andv(v11, __ D, p4, z7); // andv d11, p4, z7.d + __ sve_orv(v11, __ D, p1, z9); // orv d11, p1, z9.d + __ sve_eorv(v28, __ D, p7, z0); // eorv d28, p7, z0.d + __ sve_smaxv(v16, __ H, p0, z7); // smaxv h16, p0, z7.h + __ sve_sminv(v12, __ B, p3, z29); // sminv b12, p3, z29.b + __ sve_fminv(v21, __ S, p6, z11); // fminv s21, p6, z11.s + __ sve_fmaxv(v6, __ D, p2, z4); // fmaxv d6, p2, z4.d + __ sve_fadda(v7, __ D, p0, z7); // fadda d7, p0, d7, z7.d + __ sve_uaddv(v12, __ B, p7, z29); // uaddv d12, p7, z29.b __ bind(forth); @@ -633,542 +719,642 @@ aarch64ops.o: file format elf64-littleaarch64 Disassembly of section .text: 0000000000000000 : - 0: 8b0772d3 add x19, x22, x7, lsl #28 - 4: cb4a3570 sub x16, x11, x10, lsr #13 - 8: ab9c09bb adds x27, x13, x28, asr #2 - c: eb9aa794 subs x20, x28, x26, asr #41 - 10: 0b934e68 add w8, w19, w19, asr #19 - 14: 4b0a3924 sub w4, w9, w10, lsl #14 - 18: 2b1e3568 adds w8, w11, w30, lsl #13 - 1c: 6b132720 subs w0, w25, w19, lsl #9 - 20: 8a154c14 and x20, x0, x21, lsl #19 - 24: aa1445d5 orr x21, x14, x20, lsl #17 - 28: ca01cf99 eor x25, x28, x1, lsl #51 - 2c: ea8b3f6a ands x10, x27, x11, asr #15 - 30: 0a8c5cb9 and w25, w5, w12, asr #23 - 34: 2a4a11d2 orr w18, w14, w10, lsr #4 - 38: 4a855aa4 eor w4, w21, w5, asr #22 - 3c: 6a857415 ands w21, w0, w5, asr #29 - 40: 8aa697da bic x26, x30, x6, asr #37 - 44: aa6d7423 orn x3, x1, x13, lsr #29 - 48: ca29bf80 eon x0, x28, x9, lsl #47 - 4c: ea3cb8bd bics x29, x5, x28, lsl #46 - 50: 0a675249 bic w9, w18, w7, lsr #20 - 54: 2ab961ba orn w26, w13, w25, asr #24 - 58: 4a331899 eon w25, w4, w19, lsl #6 - 5c: 6a646345 bics w5, w26, w4, lsr #24 - 60: 11055267 add w7, w19, #0x154 - 64: 31064408 adds w8, w0, #0x191 - 68: 51028e9d sub w29, w20, #0xa3 - 6c: 710bdee8 subs w8, w23, #0x2f7 - 70: 91082d81 add x1, x12, #0x20b - 74: b106a962 adds x2, x11, #0x1aa - 78: d10b33ae sub x14, x29, #0x2cc - 7c: f10918ab subs x11, x5, #0x246 - 80: 121102d7 and w23, w22, #0x8000 - 84: 
3204cd44 orr w4, w10, #0xf0f0f0f0 - 88: 5204cf00 eor w0, w24, #0xf0f0f0f0 - 8c: 72099fb3 ands w19, w29, #0x7f807f80 - 90: 92729545 and x5, x10, #0xfffffffffc000 - 94: b20e37cc orr x12, x30, #0xfffc0000fffc0000 - 98: d27c34be eor x30, x5, #0x3fff0 - 9c: f27e4efa ands x26, x23, #0x3ffffc - a0: 14000000 b a0 - a4: 17ffffd7 b 0 - a8: 1400017f b 6a4 - ac: 94000000 bl ac - b0: 97ffffd4 bl 0 - b4: 9400017c bl 6a4 - b8: 3400000c cbz w12, b8 - bc: 34fffa2c cbz w12, 0 - c0: 34002f2c cbz w12, 6a4 - c4: 35000014 cbnz w20, c4 - c8: 35fff9d4 cbnz w20, 0 - cc: 35002ed4 cbnz w20, 6a4 - d0: b400000c cbz x12, d0 - d4: b4fff96c cbz x12, 0 - d8: b4002e6c cbz x12, 6a4 - dc: b5000018 cbnz x24, dc - e0: b5fff918 cbnz x24, 0 - e4: b5002e18 cbnz x24, 6a4 - e8: 10000006 adr x6, e8 - ec: 10fff8a6 adr x6, 0 - f0: 10002da6 adr x6, 6a4 - f4: 90000015 adrp x21, 0 - f8: 36080001 tbz w1, #1, f8 - fc: 360ff821 tbz w1, #1, 0 - 100: 36082d21 tbz w1, #1, 6a4 - 104: 37480008 tbnz w8, #9, 104 - 108: 374ff7c8 tbnz w8, #9, 0 - 10c: 37482cc8 tbnz w8, #9, 6a4 - 110: 128b50ec movn w12, #0x5a87 - 114: 52a9ff8b movz w11, #0x4ffc, lsl #16 - 118: 7281d095 movk w21, #0xe84 - 11c: 92edfebd movn x29, #0x6ff5, lsl #48 - 120: d28361e3 movz x3, #0x1b0f - 124: f2a4cc96 movk x22, #0x2664, lsl #16 - 128: 9346590c sbfx x12, x8, #6, #17 - 12c: 33194f33 bfi w19, w25, #7, #20 - 130: 531d3d89 ubfiz w9, w12, #3, #16 - 134: 9350433c sbfx x28, x25, #16, #1 - 138: b34464ac bfxil x12, x5, #4, #22 - 13c: d3462140 ubfx x0, x10, #6, #3 - 140: 139a61a4 extr w4, w13, w26, #24 - 144: 93d87fd7 extr x23, x30, x24, #31 - 148: 54000000 b.eq 148 - 14c: 54fff5a0 b.eq 0 - 150: 54002aa0 b.eq 6a4 - 154: 54000001 b.ne 154 - 158: 54fff541 b.ne 0 - 15c: 54002a41 b.ne 6a4 - 160: 54000002 b.cs 160 - 164: 54fff4e2 b.cs 0 - 168: 540029e2 b.cs 6a4 - 16c: 54000002 b.cs 16c - 170: 54fff482 b.cs 0 - 174: 54002982 b.cs 6a4 - 178: 54000003 b.cc 178 - 17c: 54fff423 b.cc 0 - 180: 54002923 b.cc 6a4 - 184: 54000003 b.cc 184 - 188: 54fff3c3 b.cc 0 - 18c: 540028c3 b.cc 6a4 - 190: 54000004 b.mi 190 - 194: 54fff364 b.mi 0 - 198: 54002864 b.mi 6a4 - 19c: 54000005 b.pl 19c - 1a0: 54fff305 b.pl 0 - 1a4: 54002805 b.pl 6a4 - 1a8: 54000006 b.vs 1a8 - 1ac: 54fff2a6 b.vs 0 - 1b0: 540027a6 b.vs 6a4 - 1b4: 54000007 b.vc 1b4 - 1b8: 54fff247 b.vc 0 - 1bc: 54002747 b.vc 6a4 - 1c0: 54000008 b.hi 1c0 - 1c4: 54fff1e8 b.hi 0 - 1c8: 540026e8 b.hi 6a4 - 1cc: 54000009 b.ls 1cc - 1d0: 54fff189 b.ls 0 - 1d4: 54002689 b.ls 6a4 - 1d8: 5400000a b.ge 1d8 - 1dc: 54fff12a b.ge 0 - 1e0: 5400262a b.ge 6a4 - 1e4: 5400000b b.lt 1e4 - 1e8: 54fff0cb b.lt 0 - 1ec: 540025cb b.lt 6a4 - 1f0: 5400000c b.gt 1f0 - 1f4: 54fff06c b.gt 0 - 1f8: 5400256c b.gt 6a4 - 1fc: 5400000d b.le 1fc - 200: 54fff00d b.le 0 - 204: 5400250d b.le 6a4 - 208: 5400000e b.al 208 - 20c: 54ffefae b.al 0 - 210: 540024ae b.al 6a4 - 214: 5400000f b.nv 214 - 218: 54ffef4f b.nv 0 - 21c: 5400244f b.nv 6a4 - 220: d4063721 svc #0x31b9 - 224: d4035082 hvc #0x1a84 - 228: d400bfe3 smc #0x5ff - 22c: d4282fc0 brk #0x417e - 230: d444c320 hlt #0x2619 - 234: d503201f nop - 238: d69f03e0 eret - 23c: d6bf03e0 drps - 240: d5033fdf isb - 244: d5033f9f dsb sy - 248: d5033abf dmb ishst - 24c: d61f0040 br x2 - 250: d63f00a0 blr x5 - 254: c8147c55 stxr w20, x21, [x2] - 258: c805fcfd stlxr w5, x29, [x7] - 25c: c85f7e05 ldxr x5, [x16] - 260: c85fffbb ldaxr x27, [x29] - 264: c89fffa0 stlr x0, [x29] - 268: c8dfff95 ldar x21, [x28] - 26c: 88157cf8 stxr w21, w24, [x7] - 270: 8815ff9a stlxr w21, w26, [x28] - 274: 885f7cd5 ldxr w21, [x6] - 278: 885fffcf ldaxr w15, [x30] - 27c: 889ffc73 
stlr w19, [x3] - 280: 88dffc56 ldar w22, [x2] - 284: 48127c0f stxrh w18, w15, [x0] - 288: 480bff85 stlxrh w11, w5, [x28] - 28c: 485f7cdd ldxrh w29, [x6] - 290: 485ffcf2 ldaxrh w18, [x7] - 294: 489fff99 stlrh w25, [x28] - 298: 48dffe62 ldarh w2, [x19] - 29c: 080a7c3e stxrb w10, w30, [x1] - 2a0: 0814fed5 stlxrb w20, w21, [x22] - 2a4: 085f7c59 ldxrb w25, [x2] - 2a8: 085ffcb8 ldaxrb w24, [x5] - 2ac: 089ffc70 stlrb w16, [x3] - 2b0: 08dfffb6 ldarb w22, [x29] - 2b4: c87f0a68 ldxp x8, x2, [x19] - 2b8: c87fcdc7 ldaxp x7, x19, [x14] - 2bc: c82870bb stxp w8, x27, x28, [x5] - 2c0: c825b8c8 stlxp w5, x8, x14, [x6] - 2c4: 887f12d9 ldxp w25, w4, [x22] - 2c8: 887fb9ed ldaxp w13, w14, [x15] - 2cc: 8834215a stxp w20, w26, w8, [x10] - 2d0: 8837ca52 stlxp w23, w18, w18, [x18] - 2d4: f806317e str x30, [x11,#99] - 2d8: b81b3337 str w23, [x25,#-77] - 2dc: 39000dc2 strb w2, [x14,#3] - 2e0: 78005149 strh w9, [x10,#5] - 2e4: f84391f4 ldr x20, [x15,#57] - 2e8: b85b220c ldr w12, [x16,#-78] - 2ec: 385fd356 ldrb w22, [x26,#-3] - 2f0: 785d127e ldrh w30, [x19,#-47] - 2f4: 389f4149 ldrsb x9, [x10,#-12] - 2f8: 79801e3c ldrsh x28, [x17,#14] - 2fc: 79c014a3 ldrsh w3, [x5,#10] - 300: b89a5231 ldrsw x17, [x17,#-91] - 304: fc5ef282 ldr d2, [x20,#-17] - 308: bc5f60f6 ldr s22, [x7,#-10] - 30c: fc12125e str d30, [x18,#-223] - 310: bc0152cd str s13, [x22,#21] - 314: f8190e49 str x9, [x18,#-112]! - 318: b800befd str w29, [x23,#11]! - 31c: 381ffd92 strb w18, [x12,#-1]! - 320: 781e9e90 strh w16, [x20,#-23]! - 324: f8409fa3 ldr x3, [x29,#9]! - 328: b8413c79 ldr w25, [x3,#19]! - 32c: 385fffa1 ldrb w1, [x29,#-1]! - 330: 785c7fa8 ldrh w8, [x29,#-57]! - 334: 389f3dc5 ldrsb x5, [x14,#-13]! - 338: 78801f6a ldrsh x10, [x27,#1]! - 33c: 78c19d4b ldrsh w11, [x10,#25]! - 340: b89a4ec4 ldrsw x4, [x22,#-92]! - 344: fc408eeb ldr d11, [x23,#8]! - 348: bc436e79 ldr s25, [x19,#54]! - 34c: fc152ce1 str d1, [x7,#-174]! - 350: bc036f28 str s8, [x25,#54]! 
- 354: f8025565 str x5, [x11],#37 - 358: b80135f8 str w24, [x15],#19 - 35c: 381ff74f strb w15, [x26],#-1 - 360: 781fa652 strh w18, [x18],#-6 - 364: f851a447 ldr x7, [x2],#-230 - 368: b85e557b ldr w27, [x11],#-27 - 36c: 385e7472 ldrb w18, [x3],#-25 - 370: 785e070a ldrh w10, [x24],#-32 - 374: 38804556 ldrsb x22, [x10],#4 - 378: 78819591 ldrsh x17, [x12],#25 - 37c: 78dc24e8 ldrsh w8, [x7],#-62 - 380: b89cd6d7 ldrsw x23, [x22],#-51 - 384: fc430738 ldr d24, [x25],#48 - 388: bc5f6595 ldr s21, [x12],#-10 - 38c: fc1225b2 str d18, [x13],#-222 - 390: bc1d7430 str s16, [x1],#-41 - 394: f82fcac2 str x2, [x22,w15,sxtw] - 398: b83d6a02 str w2, [x16,x29] - 39c: 382e5a54 strb w20, [x18,w14,uxtw #0] - 3a0: 7834fa66 strh w6, [x19,x20,sxtx #1] - 3a4: f86ecbae ldr x14, [x29,w14,sxtw] - 3a8: b86cda90 ldr w16, [x20,w12,sxtw #2] - 3ac: 3860d989 ldrb w9, [x12,w0,sxtw #0] - 3b0: 78637a2c ldrh w12, [x17,x3,lsl #1] - 3b4: 38a3fa22 ldrsb x2, [x17,x3,sxtx #0] - 3b8: 78b15827 ldrsh x7, [x1,w17,uxtw #1] - 3bc: 78f2d9f9 ldrsh w25, [x15,w18,sxtw #1] - 3c0: b8ac6ab7 ldrsw x23, [x21,x12] - 3c4: fc6879a5 ldr d5, [x13,x8,lsl #3] - 3c8: bc767943 ldr s3, [x10,x22,lsl #2] - 3cc: fc3bc84e str d14, [x2,w27,sxtw] - 3d0: bc3968d4 str s20, [x6,x25] - 3d4: f91fc0fe str x30, [x7,#16256] - 3d8: b91da50f str w15, [x8,#7588] - 3dc: 391d280b strb w11, [x0,#1866] - 3e0: 791d2e23 strh w3, [x17,#3734] - 3e4: f95bc8e2 ldr x2, [x7,#14224] - 3e8: b95ce525 ldr w5, [x9,#7396] - 3ec: 395ae53c ldrb w28, [x9,#1721] - 3f0: 795c9282 ldrh w2, [x20,#3656] - 3f4: 399d7dd6 ldrsb x22, [x14,#1887] - 3f8: 799fe008 ldrsh x8, [x0,#4080] - 3fc: 79de9bc0 ldrsh w0, [x30,#3916] - 400: b99aae78 ldrsw x24, [x19,#6828] - 404: fd597598 ldr d24, [x12,#13032] - 408: bd5d1d08 ldr s8, [x8,#7452] - 40c: fd1f3dea str d10, [x15,#15992] - 410: bd1a227a str s26, [x19,#6688] - 414: 5800148a ldr x10, 6a4 - 418: 18000003 ldr w3, 418 - 41c: f88092e0 prfm pldl1keep, [x23,#9] - 420: d8ffdf00 prfm pldl1keep, 0 - 424: f8a84860 prfm pldl1keep, [x3,w8,uxtw] - 428: f99d7560 prfm pldl1keep, [x11,#15080] - 42c: 1a1c012d adc w13, w9, w28 - 430: 3a1c027b adcs w27, w19, w28 - 434: 5a060253 sbc w19, w18, w6 - 438: 7a03028e sbcs w14, w20, w3 - 43c: 9a0801d0 adc x16, x14, x8 - 440: ba0803a0 adcs x0, x29, x8 - 444: da140308 sbc x8, x24, x20 - 448: fa00038c sbcs x12, x28, x0 - 44c: 0b3010d7 add w23, w6, w16, uxtb #4 - 450: 2b37ab39 adds w25, w25, w23, sxth #2 - 454: cb2466da sub x26, x22, x4, uxtx #1 - 458: 6b33efb1 subs w17, w29, w19, sxtx #3 - 45c: 8b350fcb add x11, x30, w21, uxtb #3 - 460: ab208a70 adds x16, x19, w0, sxtb #2 - 464: cb39e52b sub x11, x9, x25, sxtx #1 - 468: eb2c9291 subs x17, x20, w12, sxtb #4 - 46c: 3a4bd1a3 ccmn w13, w11, #0x3, le - 470: 7a4c81a2 ccmp w13, w12, #0x2, hi - 474: ba42106c ccmn x3, x2, #0xc, ne - 478: fa5560e3 ccmp x7, x21, #0x3, vs - 47c: 3a4e3844 ccmn w2, #0xe, #0x4, cc - 480: 7a515a26 ccmp w17, #0x11, #0x6, pl - 484: ba4c2940 ccmn x10, #0xc, #0x0, cs - 488: fa52aaae ccmp x21, #0x12, #0xe, ge - 48c: 1a8cc1b5 csel w21, w13, w12, gt - 490: 1a8f976a csinc w10, w27, w15, ls - 494: 5a8981a0 csinv w0, w13, w9, hi - 498: 5a9a6492 csneg w18, w4, w26, vs - 49c: 9a8793ac csel x12, x29, x7, ls - 4a0: 9a9474e6 csinc x6, x7, x20, vc - 4a4: da83d2b6 csinv x22, x21, x3, le - 4a8: da9b9593 csneg x19, x12, x27, ls - 4ac: 5ac00200 rbit w0, w16 - 4b0: 5ac006f1 rev16 w17, w23 - 4b4: 5ac009d1 rev w17, w14 - 4b8: 5ac013d8 clz w24, w30 - 4bc: 5ac016d8 cls w24, w22 - 4c0: dac00223 rbit x3, x17 - 4c4: dac005ac rev16 x12, x13 - 4c8: dac00ac9 rev32 x9, x22 - 4cc: dac00c00 rev x0, x0 - 4d0: 
dac01205 clz x5, x16 - 4d4: dac016d9 cls x25, x22 - 4d8: 1ac0089d udiv w29, w4, w0 - 4dc: 1add0fa0 sdiv w0, w29, w29 - 4e0: 1ad52225 lsl w5, w17, w21 - 4e4: 1ad22529 lsr w9, w9, w18 - 4e8: 1ac82b61 asr w1, w27, w8 - 4ec: 1acd2e92 ror w18, w20, w13 - 4f0: 9acc0b28 udiv x8, x25, x12 - 4f4: 9adc0ca7 sdiv x7, x5, x28 - 4f8: 9adb2225 lsl x5, x17, x27 - 4fc: 9ad42757 lsr x23, x26, x20 - 500: 9adc291c asr x28, x8, x28 - 504: 9ac42fa3 ror x3, x29, x4 - 508: 1b1a55d1 madd w17, w14, w26, w21 - 50c: 1b0bafc1 msub w1, w30, w11, w11 - 510: 9b067221 madd x1, x17, x6, x28 - 514: 9b1ea0de msub x30, x6, x30, x8 - 518: 9b2e20d5 smaddl x21, w6, w14, x8 - 51c: 9b38cd4a smsubl x10, w10, w24, x19 - 520: 9bae6254 umaddl x20, w18, w14, x24 - 524: 9ba59452 umsubl x18, w2, w5, x5 - 528: 1e2d0a48 fmul s8, s18, s13 - 52c: 1e3c19c2 fdiv s2, s14, s28 - 530: 1e3c298f fadd s15, s12, s28 - 534: 1e213980 fsub s0, s12, s1 - 538: 1e240baf fmul s15, s29, s4 - 53c: 1e77082c fmul d12, d1, d23 - 540: 1e72191b fdiv d27, d8, d18 - 544: 1e6b2a97 fadd d23, d20, d11 - 548: 1e723988 fsub d8, d12, d18 - 54c: 1e770b1a fmul d26, d24, d23 - 550: 1f0d66f5 fmadd s21, s23, s13, s25 - 554: 1f01b956 fmsub s22, s10, s1, s14 - 558: 1f227a8e fnmadd s14, s20, s2, s30 - 55c: 1f365ba7 fnmadd s7, s29, s22, s22 - 560: 1f4f14ad fmadd d13, d5, d15, d5 - 564: 1f45a98e fmsub d14, d12, d5, d10 - 568: 1f60066a fnmadd d10, d19, d0, d1 - 56c: 1f620054 fnmadd d20, d2, d2, d0 - 570: 1e204139 fmov s25, s9 - 574: 1e20c094 fabs s20, s4 - 578: 1e214363 fneg s3, s27 - 57c: 1e21c041 fsqrt s1, s2 - 580: 1e22c01e fcvt d30, s0 - 584: 1e60408c fmov d12, d4 - 588: 1e60c361 fabs d1, d27 - 58c: 1e6142c8 fneg d8, d22 - 590: 1e61c16b fsqrt d11, d11 - 594: 1e624396 fcvt s22, d28 - 598: 1e3802dc fcvtzs w28, s22 - 59c: 9e380374 fcvtzs x20, s27 - 5a0: 1e78000e fcvtzs w14, d0 - 5a4: 9e78017a fcvtzs x26, d11 - 5a8: 1e2202dc scvtf s28, w22 - 5ac: 9e220150 scvtf s16, x10 - 5b0: 1e6202a8 scvtf d8, w21 - 5b4: 9e620395 scvtf d21, x28 - 5b8: 1e260318 fmov w24, s24 - 5bc: 9e660268 fmov x8, d19 - 5c0: 1e270188 fmov s8, w12 - 5c4: 9e6700e6 fmov d6, x7 - 5c8: 1e3023c0 fcmp s30, s16 - 5cc: 1e6b2320 fcmp d25, d11 - 5d0: 1e202168 fcmp s11, #0.0 - 5d4: 1e602168 fcmp d11, #0.0 - 5d8: 2910323d stp w29, w12, [x17,#128] - 5dc: 297449d6 ldp w22, w18, [x14,#-96] - 5e0: 6948402b ldpsw x11, x16, [x1,#64] - 5e4: a9072f40 stp x0, x11, [x26,#112] - 5e8: a9410747 ldp x7, x1, [x26,#16] - 5ec: 29801f0a stp w10, w7, [x24,#0]! - 5f0: 29e07307 ldp w7, w28, [x24,#-256]! - 5f4: 69e272b9 ldpsw x25, x28, [x21,#-240]! - 5f8: a9bf49d4 stp x20, x18, [x14,#-16]! - 5fc: a9c529a8 ldp x8, x10, [x13,#80]! 
- 600: 28b0605a stp w26, w24, [x2],#-128 - 604: 28e866a2 ldp w2, w25, [x21],#-192 - 608: 68ee0ab1 ldpsw x17, x2, [x21],#-144 - 60c: a886296c stp x12, x10, [x11],#96 - 610: a8fe1a38 ldp x24, x6, [x17],#-32 - 614: 282479c3 stnp w3, w30, [x14,#-224] - 618: 286e534f ldnp w15, w20, [x26,#-144] - 61c: a8386596 stnp x22, x25, [x12,#-128] - 620: a8755a3b ldnp x27, x22, [x17,#-176] - 624: 1e601000 fmov d0, #2.000000000000000000e+00 - 628: 1e603000 fmov d0, #2.125000000000000000e+00 - 62c: 1e621000 fmov d0, #4.000000000000000000e+00 - 630: 1e623000 fmov d0, #4.250000000000000000e+00 - 634: 1e641000 fmov d0, #8.000000000000000000e+00 - 638: 1e643000 fmov d0, #8.500000000000000000e+00 - 63c: 1e661000 fmov d0, #1.600000000000000000e+01 - 640: 1e663000 fmov d0, #1.700000000000000000e+01 - 644: 1e681000 fmov d0, #1.250000000000000000e-01 - 648: 1e683000 fmov d0, #1.328125000000000000e-01 - 64c: 1e6a1000 fmov d0, #2.500000000000000000e-01 - 650: 1e6a3000 fmov d0, #2.656250000000000000e-01 - 654: 1e6c1000 fmov d0, #5.000000000000000000e-01 - 658: 1e6c3000 fmov d0, #5.312500000000000000e-01 - 65c: 1e6e1000 fmov d0, #1.000000000000000000e+00 - 660: 1e6e3000 fmov d0, #1.062500000000000000e+00 - 664: 1e701000 fmov d0, #-2.000000000000000000e+00 - 668: 1e703000 fmov d0, #-2.125000000000000000e+00 - 66c: 1e721000 fmov d0, #-4.000000000000000000e+00 - 670: 1e723000 fmov d0, #-4.250000000000000000e+00 - 674: 1e741000 fmov d0, #-8.000000000000000000e+00 - 678: 1e743000 fmov d0, #-8.500000000000000000e+00 - 67c: 1e761000 fmov d0, #-1.600000000000000000e+01 - 680: 1e763000 fmov d0, #-1.700000000000000000e+01 - 684: 1e781000 fmov d0, #-1.250000000000000000e-01 - 688: 1e783000 fmov d0, #-1.328125000000000000e-01 - 68c: 1e7a1000 fmov d0, #-2.500000000000000000e-01 - 690: 1e7a3000 fmov d0, #-2.656250000000000000e-01 - 694: 1e7c1000 fmov d0, #-5.000000000000000000e-01 - 698: 1e7c3000 fmov d0, #-5.312500000000000000e-01 - 69c: 1e7e1000 fmov d0, #-1.000000000000000000e+00 - 6a0: 1e7e3000 fmov d0, #-1.062500000000000000e+00 + 0: 8b0d82fa add x26, x23, x13, lsl #32 + 4: cb49970c sub x12, x24, x9, lsr #37 + 8: ab889dfc adds x28, x15, x8, asr #39 + c: eb9ee787 subs x7, x28, x30, asr #57 + 10: 0b9b3ec9 add w9, w22, w27, asr #15 + 14: 4b9279a3 sub w3, w13, w18, asr #30 + 18: 2b88474e adds w14, w26, w8, asr #17 + 1c: 6b8c56c0 subs w0, w22, w12, asr #21 + 20: 8a1a51e0 and x0, x15, x26, lsl #20 + 24: aa11f4ba orr x26, x5, x17, lsl #61 + 28: ca0281b8 eor x24, x13, x2, lsl #32 + 2c: ea918c7c ands x28, x3, x17, asr #35 + 30: 0a5d4a19 and w25, w16, w29, lsr #18 + 34: 2a4b264d orr w13, w18, w11, lsr #9 + 38: 4a523ca5 eor w5, w5, w18, lsr #15 + 3c: 6a9b6ae2 ands w2, w23, w27, asr #26 + 40: 8a70b79b bic x27, x28, x16, lsr #45 + 44: aaba9728 orn x8, x25, x26, asr #37 + 48: ca6dfe3d eon x29, x17, x13, lsr #63 + 4c: ea627f1c bics x28, x24, x2, lsr #31 + 50: 0aa70f53 bic w19, w26, w7, asr #3 + 54: 2aaa0f06 orn w6, w24, w10, asr #3 + 58: 4a6176a4 eon w4, w21, w1, lsr #29 + 5c: 6a604eb0 bics w16, w21, w0, lsr #19 + 60: 1105ed91 add w17, w12, #0x17b + 64: 3100583e adds w30, w1, #0x16 + 68: 5101f8bd sub w29, w5, #0x7e + 6c: 710f0306 subs w6, w24, #0x3c0 + 70: 9101a1a0 add x0, x13, #0x68 + 74: b10a5cc8 adds x8, x6, #0x297 + 78: d10810aa sub x10, x5, #0x204 + 7c: f10fd061 subs x1, x3, #0x3f4 + 80: 120cb166 and w6, w11, #0xfff1fff1 + 84: 321764bc orr w28, w5, #0xfffffe07 + 88: 52174681 eor w1, w20, #0x7fffe00 + 8c: 720c0247 ands w7, w18, #0x100000 + 90: 9241018e and x14, x12, #0x8000000000000000 + 94: b25a2969 orr x9, x11, #0x1ffc000000000 + 98: 
d278b411 eor x17, x0, #0x3fffffffffff00 + 9c: f26aad01 ands x1, x8, #0xffffffffffc00003 + a0: 14000000 b a0 + a4: 17ffffd7 b 0 + a8: 140001cf b 7e4 + ac: 94000000 bl ac + b0: 97ffffd4 bl 0 + b4: 940001cc bl 7e4 + b8: 3400000a cbz w10, b8 + bc: 34fffa2a cbz w10, 0 + c0: 3400392a cbz w10, 7e4 + c4: 35000008 cbnz w8, c4 + c8: 35fff9c8 cbnz w8, 0 + cc: 350038c8 cbnz w8, 7e4 + d0: b400000b cbz x11, d0 + d4: b4fff96b cbz x11, 0 + d8: b400386b cbz x11, 7e4 + dc: b500001d cbnz x29, dc + e0: b5fff91d cbnz x29, 0 + e4: b500381d cbnz x29, 7e4 + e8: 10000013 adr x19, e8 + ec: 10fff8b3 adr x19, 0 + f0: 100037b3 adr x19, 7e4 + f4: 90000013 adrp x19, 0 + f8: 36300016 tbz w22, #6, f8 + fc: 3637f836 tbz w22, #6, 0 + 100: 36303736 tbz w22, #6, 7e4 + 104: 3758000c tbnz w12, #11, 104 + 108: 375ff7cc tbnz w12, #11, 0 + 10c: 375836cc tbnz w12, #11, 7e4 + 110: 128313a0 mov w0, #0xffffe762 // #-6302 + 114: 528a32c7 mov w7, #0x5196 // #20886 + 118: 7289173b movk w27, #0x48b9 + 11c: 92ab3acc mov x12, #0xffffffffa629ffff // #-1507196929 + 120: d2a0bf94 mov x20, #0x5fc0000 // #100401152 + 124: f2c285e8 movk x8, #0x142f, lsl #32 + 128: 9358722f sbfx x15, x17, #24, #5 + 12c: 330e652f bfxil w15, w9, #14, #12 + 130: 53067f3b lsr w27, w25, #6 + 134: 93577c53 sbfx x19, x2, #23, #9 + 138: b34a1aac bfi x12, x21, #54, #7 + 13c: d35a4016 ubfiz x22, x0, #38, #17 + 140: 13946c63 extr w3, w3, w20, #27 + 144: 93c3dbc8 extr x8, x30, x3, #54 + 148: 54000000 b.eq 148 // b.none + 14c: 54fff5a0 b.eq 0 // b.none + 150: 540034a0 b.eq 7e4 // b.none + 154: 54000001 b.ne 154 // b.any + 158: 54fff541 b.ne 0 // b.any + 15c: 54003441 b.ne 7e4 // b.any + 160: 54000002 b.cs 160 // b.hs, b.nlast + 164: 54fff4e2 b.cs 0 // b.hs, b.nlast + 168: 540033e2 b.cs 7e4 // b.hs, b.nlast + 16c: 54000002 b.cs 16c // b.hs, b.nlast + 170: 54fff482 b.cs 0 // b.hs, b.nlast + 174: 54003382 b.cs 7e4 // b.hs, b.nlast + 178: 54000003 b.cc 178 // b.lo, b.ul, b.last + 17c: 54fff423 b.cc 0 // b.lo, b.ul, b.last + 180: 54003323 b.cc 7e4 // b.lo, b.ul, b.last + 184: 54000003 b.cc 184 // b.lo, b.ul, b.last + 188: 54fff3c3 b.cc 0 // b.lo, b.ul, b.last + 18c: 540032c3 b.cc 7e4 // b.lo, b.ul, b.last + 190: 54000004 b.mi 190 // b.first + 194: 54fff364 b.mi 0 // b.first + 198: 54003264 b.mi 7e4 // b.first + 19c: 54000005 b.pl 19c // b.nfrst + 1a0: 54fff305 b.pl 0 // b.nfrst + 1a4: 54003205 b.pl 7e4 // b.nfrst + 1a8: 54000006 b.vs 1a8 + 1ac: 54fff2a6 b.vs 0 + 1b0: 540031a6 b.vs 7e4 + 1b4: 54000007 b.vc 1b4 + 1b8: 54fff247 b.vc 0 + 1bc: 54003147 b.vc 7e4 + 1c0: 54000008 b.hi 1c0 // b.pmore + 1c4: 54fff1e8 b.hi 0 // b.pmore + 1c8: 540030e8 b.hi 7e4 // b.pmore + 1cc: 54000009 b.ls 1cc // b.plast + 1d0: 54fff189 b.ls 0 // b.plast + 1d4: 54003089 b.ls 7e4 // b.plast + 1d8: 5400000a b.ge 1d8 // b.tcont + 1dc: 54fff12a b.ge 0 // b.tcont + 1e0: 5400302a b.ge 7e4 // b.tcont + 1e4: 5400000b b.lt 1e4 // b.tstop + 1e8: 54fff0cb b.lt 0 // b.tstop + 1ec: 54002fcb b.lt 7e4 // b.tstop + 1f0: 5400000c b.gt 1f0 + 1f4: 54fff06c b.gt 0 + 1f8: 54002f6c b.gt 7e4 + 1fc: 5400000d b.le 1fc + 200: 54fff00d b.le 0 + 204: 54002f0d b.le 7e4 + 208: 5400000e b.al 208 + 20c: 54ffefae b.al 0 + 210: 54002eae b.al 7e4 + 214: 5400000f b.nv 214 + 218: 54ffef4f b.nv 0 + 21c: 54002e4f b.nv 7e4 + 220: d40658e1 svc #0x32c7 + 224: d4014d22 hvc #0xa69 + 228: d4046543 smc #0x232a + 22c: d4273f60 brk #0x39fb + 230: d44cad80 hlt #0x656c + 234: d503201f nop + 238: d69f03e0 eret + 23c: d6bf03e0 drps + 240: d5033fdf isb + 244: d5033e9f dsb st + 248: d50332bf dmb oshst + 24c: d61f0200 br x16 + 250: d63f0280 blr x20 + 254: 
c80a7d1b stxr w10, x27, [x8] + 258: c800fea1 stlxr w0, x1, [x21] + 25c: c85f7fb1 ldxr x17, [x29] + 260: c85fff9d ldaxr x29, [x28] + 264: c89ffee1 stlr x1, [x23] + 268: c8dffe95 ldar x21, [x20] + 26c: 88167e7b stxr w22, w27, [x19] + 270: 880bfcd0 stlxr w11, w16, [x6] + 274: 885f7c12 ldxr w18, [x0] + 278: 885ffd44 ldaxr w4, [x10] + 27c: 889ffed8 stlr w24, [x22] + 280: 88dffe6a ldar w10, [x19] + 284: 48017fc5 stxrh w1, w5, [x30] + 288: 4808fe2c stlxrh w8, w12, [x17] + 28c: 485f7dc9 ldxrh w9, [x14] + 290: 485ffc27 ldaxrh w7, [x1] + 294: 489ffe05 stlrh w5, [x16] + 298: 48dffd82 ldarh w2, [x12] + 29c: 080a7c6c stxrb w10, w12, [x3] + 2a0: 081cff4e stlxrb w28, w14, [x26] + 2a4: 085f7d5e ldxrb w30, [x10] + 2a8: 085ffeae ldaxrb w14, [x21] + 2ac: 089ffd2d stlrb w13, [x9] + 2b0: 08dfff76 ldarb w22, [x27] + 2b4: c87f4d7c ldxp x28, x19, [x11] + 2b8: c87fcc5e ldaxp x30, x19, [x2] + 2bc: c8220417 stxp w2, x23, x1, [x0] + 2c0: c82cb5f0 stlxp w12, x16, x13, [x15] + 2c4: 887f55b2 ldxp w18, w21, [x13] + 2c8: 887ff90b ldaxp w11, w30, [x8] + 2cc: 88382c2d stxp w24, w13, w11, [x1] + 2d0: 883aedb5 stlxp w26, w21, w27, [x13] + 2d4: f819928b stur x11, [x20, #-103] + 2d8: b803e21c stur w28, [x16, #62] + 2dc: 381f713b sturb w27, [x9, #-9] + 2e0: 781ce322 sturh w2, [x25, #-50] + 2e4: f850f044 ldur x4, [x2, #-241] + 2e8: b85e129e ldur w30, [x20, #-31] + 2ec: 385e92f2 ldurb w18, [x23, #-23] + 2f0: 785ff35d ldurh w29, [x26, #-1] + 2f4: 39801921 ldrsb x1, [x9, #6] + 2f8: 7881318b ldursh x11, [x12, #19] + 2fc: 78dce02b ldursh w11, [x1, #-50] + 300: b8829313 ldursw x19, [x24, #41] + 304: fc45f318 ldur d24, [x24, #95] + 308: bc5d50af ldur s15, [x5, #-43] + 30c: fc001375 stur d21, [x27, #1] + 310: bc1951b7 stur s23, [x13, #-107] + 314: f8008c0b str x11, [x0, #8]! + 318: b801dc03 str w3, [x0, #29]! + 31c: 38009dcb strb w11, [x14, #9]! + 320: 781fdf1d strh w29, [x24, #-3]! + 324: f8570e2d ldr x13, [x17, #-144]! + 328: b85faecc ldr w12, [x22, #-6]! + 32c: 385f6d8d ldrb w13, [x12, #-10]! + 330: 785ebea0 ldrh w0, [x21, #-21]! + 334: 38804cf7 ldrsb x23, [x7, #4]! + 338: 789cbce3 ldrsh x3, [x7, #-53]! + 33c: 78df9cbc ldrsh w28, [x5, #-7]! + 340: b89eed38 ldrsw x24, [x9, #-18]! + 344: fc40cd6e ldr d14, [x11, #12]! + 348: bc5bdd93 ldr s19, [x12, #-67]! + 34c: fc103c14 str d20, [x0, #-253]! + 350: bc040c08 str s8, [x0, #64]! 
+ 354: f81a2784 str x4, [x28], #-94 + 358: b81ca4ec str w12, [x7], #-54 + 35c: 381e855b strb w27, [x10], #-24 + 360: 7801b506 strh w6, [x8], #27 + 364: f853654e ldr x14, [x10], #-202 + 368: b85d74b0 ldr w16, [x5], #-41 + 36c: 384095c2 ldrb w2, [x14], #9 + 370: 785ec5bc ldrh w28, [x13], #-20 + 374: 389e15a9 ldrsb x9, [x13], #-31 + 378: 789dc703 ldrsh x3, [x24], #-36 + 37c: 78c06474 ldrsh w20, [x3], #6 + 380: b89ff667 ldrsw x7, [x19], #-1 + 384: fc57e51e ldr d30, [x8], #-130 + 388: bc4155f9 ldr s25, [x15], #21 + 38c: fc05a6ee str d14, [x23], #90 + 390: bc1df408 str s8, [x0], #-33 + 394: f835da4a str x10, [x18, w21, sxtw #3] + 398: b836d9a4 str w4, [x13, w22, sxtw #2] + 39c: 3833580d strb w13, [x0, w19, uxtw #0] + 3a0: 7826cb6c strh w12, [x27, w6, sxtw] + 3a4: f8706900 ldr x0, [x8, x16] + 3a8: b87ae880 ldr w0, [x4, x26, sxtx] + 3ac: 3865db2e ldrb w14, [x25, w5, sxtw #0] + 3b0: 78724889 ldrh w9, [x4, w18, uxtw] + 3b4: 38a7789b ldrsb x27, [x4, x7, lsl #0] + 3b8: 78beca2f ldrsh x15, [x17, w30, sxtw] + 3bc: 78f6c810 ldrsh w16, [x0, w22, sxtw] + 3c0: b8bef956 ldrsw x22, [x10, x30, sxtx #2] + 3c4: fc6afabd ldr d29, [x21, x10, sxtx #3] + 3c8: bc734963 ldr s3, [x11, w19, uxtw] + 3cc: fc3d5b8d str d13, [x28, w29, uxtw #3] + 3d0: bc25fbb7 str s23, [x29, x5, sxtx #2] + 3d4: f9189d05 str x5, [x8, #12600] + 3d8: b91ecb1d str w29, [x24, #7880] + 3dc: 39187a33 strb w19, [x17, #1566] + 3e0: 791f226d strh w13, [x19, #3984] + 3e4: f95aa2f3 ldr x19, [x23, #13632] + 3e8: b9587bb7 ldr w23, [x29, #6264] + 3ec: 395f7176 ldrb w22, [x11, #2012] + 3f0: 795d9143 ldrh w3, [x10, #3784] + 3f4: 399e7e08 ldrsb x8, [x16, #1951] + 3f8: 799a2697 ldrsh x23, [x20, #3346] + 3fc: 79df3422 ldrsh w2, [x1, #3994] + 400: b99c2624 ldrsw x4, [x17, #7204] + 404: fd5c2374 ldr d20, [x27, #14400] + 408: bd5fa1d9 ldr s25, [x14, #8096] + 40c: fd1d595a str d26, [x10, #15024] + 410: bd1b1869 str s9, [x3, #6936] + 414: 58001e9b ldr x27, 7e4 + 418: 1800000b ldr w11, 418 + 41c: f8945060 prfum pldl1keep, [x3, #-187] + 420: d8000000 prfm pldl1keep, 420 + 424: f8ae6ba0 prfm pldl1keep, [x29, x14] + 428: f99a0080 prfm pldl1keep, [x4, #13312] + 42c: 1a070035 adc w21, w1, w7 + 430: 3a0700a8 adcs w8, w5, w7 + 434: 5a0e0367 sbc w7, w27, w14 + 438: 7a11009b sbcs w27, w4, w17 + 43c: 9a000380 adc x0, x28, x0 + 440: ba1e030c adcs x12, x24, x30 + 444: da0f0320 sbc x0, x25, x15 + 448: fa030301 sbcs x1, x24, x3 + 44c: 0b340b12 add w18, w24, w20, uxtb #2 + 450: 2b2a278d adds w13, w28, w10, uxth #1 + 454: cb22aa0f sub x15, x16, w2, sxth #2 + 458: 6b2d29bd subs w29, w13, w13, uxth #2 + 45c: 8b2cce8c add x12, x20, w12, sxtw #3 + 460: ab2b877e adds x30, x27, w11, sxtb #1 + 464: cb21c8ee sub x14, x7, w1, sxtw #2 + 468: eb3ba47d subs x29, x3, w27, sxth #1 + 46c: 3a4d400e ccmn w0, w13, #0xe, mi // mi = first + 470: 7a5232c6 ccmp w22, w18, #0x6, cc // cc = lo, ul, last + 474: ba5e624e ccmn x18, x30, #0xe, vs + 478: fa53814c ccmp x10, x19, #0xc, hi // hi = pmore + 47c: 3a52d8c2 ccmn w6, #0x12, #0x2, le + 480: 7a4d8924 ccmp w9, #0xd, #0x4, hi // hi = pmore + 484: ba4b3aab ccmn x21, #0xb, #0xb, cc // cc = lo, ul, last + 488: fa4d7882 ccmp x4, #0xd, #0x2, vc + 48c: 1a96804c csel w12, w2, w22, hi // hi = pmore + 490: 1a912618 csinc w24, w16, w17, cs // cs = hs, nlast + 494: 5a90b0e6 csinv w6, w7, w16, lt // lt = tstop + 498: 5a96976b csneg w11, w27, w22, ls // ls = plast + 49c: 9a9db06a csel x10, x3, x29, lt // lt = tstop + 4a0: 9a9b374c csinc x12, x26, x27, cc // cc = lo, ul, last + 4a4: da95c14f csinv x15, x10, x21, gt + 4a8: da89c6fe csneg x30, x23, x9, gt + 4ac: 5ac0015e 
rbit w30, w10 + 4b0: 5ac005fd rev16 w29, w15 + 4b4: 5ac00bdd rev w29, w30 + 4b8: 5ac012b9 clz w25, w21 + 4bc: 5ac01404 cls w4, w0 + 4c0: dac002b2 rbit x18, x21 + 4c4: dac0061d rev16 x29, x16 + 4c8: dac00a95 rev32 x21, x20 + 4cc: dac00e66 rev x6, x19 + 4d0: dac0107e clz x30, x3 + 4d4: dac01675 cls x21, x19 + 4d8: 1ac00b0b udiv w11, w24, w0 + 4dc: 1ace0f3b sdiv w27, w25, w14 + 4e0: 1ad221c3 lsl w3, w14, w18 + 4e4: 1ad825e7 lsr w7, w15, w24 + 4e8: 1ad92a3c asr w28, w17, w25 + 4ec: 1adc2f42 ror w2, w26, w28 + 4f0: 9ada0b25 udiv x5, x25, x26 + 4f4: 9ad20e1b sdiv x27, x16, x18 + 4f8: 9acc22a6 lsl x6, x21, x12 + 4fc: 9acc2480 lsr x0, x4, x12 + 500: 9adc2a3b asr x27, x17, x28 + 504: 9ad22c5c ror x28, x2, x18 + 508: 1b0e39ea madd w10, w15, w14, w14 + 50c: 1b0fcf23 msub w3, w25, w15, w19 + 510: 9b1010ae madd x14, x5, x16, x4 + 514: 9b048b3a msub x26, x25, x4, x2 + 518: 9b3d4582 smaddl x2, w12, w29, x17 + 51c: 9b2390e8 smsubl x8, w7, w3, x4 + 520: 9bba6499 umaddl x25, w4, w26, x25 + 524: 9ba0ea24 umsubl x4, w17, w0, x26 + 528: 1e2f0af1 fmul s17, s23, s15 + 52c: 1e311b95 fdiv s21, s28, s17 + 530: 1e23295b fadd s27, s10, s3 + 534: 1e3938e0 fsub s0, s7, s25 + 538: 1e2f08c9 fmul s9, s6, s15 + 53c: 1e6a09fd fmul d29, d15, d10 + 540: 1e671a22 fdiv d2, d17, d7 + 544: 1e77296b fadd d11, d11, d23 + 548: 1e773ba7 fsub d7, d29, d23 + 54c: 1e6b0b6e fmul d14, d27, d11 + 550: 1f18308b fmadd s11, s4, s24, s12 + 554: 1f14adcf fmsub s15, s14, s20, s11 + 558: 1f2b31bc nmadd s28, s13, s11, s12 + 55c: 1f3a3bd7 fnmadd s23, s30, s26, s14 + 560: 1f4a1da9 fmadd d9, d13, d10, d7 + 564: 1f4f8fa5 fmsub d5, d29, d15, d3 + 568: 1f6f798b fnmadd d11, d12, d15, d30 + 56c: 1f73523e fnmadd d30, d17, d19, d20 + 570: 1e2040fb fmov s27, s7 + 574: 1e20c2a9 fabs s9, s21 + 578: 1e214122 fneg s2, s9 + 57c: 1e21c0fb fsqrt s27, s7 + 580: 1e22c3dd fcvt d29, s30 + 584: 1e604031 fmov d17, d1 + 588: 1e60c0c2 fabs d2, d6 + 58c: 1e61406a fneg d10, d3 + 590: 1e61c178 fsqrt d24, d11 + 594: 1e624027 fcvt s7, d1 + 598: 1e38000b fcvtzs w11, s0 + 59c: 9e380243 fcvtzs x3, s18 + 5a0: 1e7800dc fcvtzs w28, d6 + 5a4: 9e7800d6 fcvtzs x22, d6 + 5a8: 1e220360 scvtf s0, w27 + 5ac: 9e22005a scvtf s26, x2 + 5b0: 1e6200e5 scvtf d5, w7 + 5b4: 9e62017c scvtf d28, x11 + 5b8: 1e2601b9 fmov w25, s13 + 5bc: 9e6602eb fmov x11, d23 + 5c0: 1e270113 fmov s19, w8 + 5c4: 9e6702b2 fmov d18, x21 + 5c8: 1e342320 fcmp s25, s20 + 5cc: 1e722260 fcmp d19, d18 + 5d0: 1e202048 fcmp s2, #0.0 + 5d4: 1e6023a8 fcmp d29, #0.0 + 5d8: 29025668 stp w8, w21, [x19, #16] + 5dc: 29403e86 ldp w6, w15, [x20] + 5e0: 6966387b ldpsw x27, x14, [x3, #-208] + 5e4: a93b316a stp x10, x12, [x11, #-80] + 5e8: a97e38e7 ldp x7, x14, [x7, #-32] + 5ec: 298e5980 stp w0, w22, [x12, #112]! + 5f0: 29c61d0e ldp w14, w7, [x8, #48]! + 5f4: 69c00930 ldpsw x16, x2, [x9, #0]! + 5f8: a9bc7434 stp x20, x29, [x1, #-64]! + 5fc: a9c530b5 ldp x21, x12, [x5, #80]! 
+ 600: 28b26378 stp w24, w24, [x27], #-112 + 604: 28c25a5c ldp w28, w22, [x18], #16 + 608: 68f419b1 ldpsw x17, x6, [x13], #-96 + 60c: a8b668bc stp x28, x26, [x5], #-160 + 610: a8f15746 ldp x6, x21, [x26], #-240 + 614: 280453cd stnp w13, w20, [x30, #32] + 618: 284c2cb1 ldnp w17, w11, [x5, #96] + 61c: a83a534d stnp x13, x20, [x26, #-96] + 620: a87b32fd ldnp x29, x12, [x23, #-80] + 624: 05a08020 mov z0.s, p0/m, s1 + 628: 04b0e3e0 incw x0 + 62c: 0470e7e1 dech x1 + 630: 042f9c20 lsl z0.b, z1.b, #7 + 634: 043f9c35 lsl z21.h, z1.h, #15 + 638: 047f9c20 lsl z0.s, z1.s, #31 + 63c: 04ff9c20 lsl z0.d, z1.d, #63 + 640: 04299420 lsr z0.b, z1.b, #7 + 644: 04319160 asr z0.h, z11.h, #15 + 648: 0461943e lsr z30.s, z1.s, #31 + 64c: 04a19020 asr z0.d, z1.d, #63 + 650: 042053ff addvl sp, x0, #31 + 654: 047f5401 addpl x1, sp, #-32 + 658: 25208028 cntp x8, p0, p1.b + 65c: 2538cfe0 mov z0.b, #127 + 660: 2578d001 mov z1.h, #-128 + 664: 25b8efe2 mov z2.s, #32512 + 668: 25f8f007 mov z7.d, #-32768 + 66c: a400a3e0 ld1b {z0.b}, p0/z, [sp] + 670: a4a8a7ea ld1h {z10.h}, p1/z, [sp, #-8, mul vl] + 674: a547a814 ld1w {z20.s}, p2/z, [x0, #7, mul vl] + 678: a4084ffe ld1b {z30.b}, p3/z, [sp, x8] + 67c: a55c53e0 ld1w {z0.s}, p4/z, [sp, x28, lsl #2] + 680: a5e1540b ld1d {z11.d}, p5/z, [x0, x1, lsl #3] + 684: e400fbf6 st1b {z22.b}, p6, [sp] + 688: e408ffff st1b {z31.b}, p7, [sp, #-8, mul vl] + 68c: e547e400 st1w {z0.s}, p1, [x0, #7, mul vl] + 690: e4014be0 st1b {z0.b}, p2, [sp, x1] + 694: e4a84fe0 st1h {z0.h}, p3, [sp, x8, lsl #1] + 698: e5f25000 st1d {z0.d}, p4, [x0, x18, lsl #3] + 69c: 858043e0 ldr z0, [sp] + 6a0: 85a043ff ldr z31, [sp, #-256, mul vl] + 6a4: e59f5d08 str z8, [x8, #255, mul vl] + 6a8: 1e601000 fmov d0, #2.000000000000000000e+00 + 6ac: 1e603000 fmov d0, #2.125000000000000000e+00 + 6b0: 1e621000 fmov d0, #4.000000000000000000e+00 + 6b4: 1e623000 fmov d0, #4.250000000000000000e+00 + 6b8: 1e641000 fmov d0, #8.000000000000000000e+00 + 6bc: 1e643000 fmov d0, #8.500000000000000000e+00 + 6c0: 1e661000 fmov d0, #1.600000000000000000e+01 + 6c4: 1e663000 fmov d0, #1.700000000000000000e+01 + 6c8: 1e681000 fmov d0, #1.250000000000000000e-01 + 6cc: 1e683000 fmov d0, #1.328125000000000000e-01 + 6d0: 1e6a1000 fmov d0, #2.500000000000000000e-01 + 6d4: 1e6a3000 fmov d0, #2.656250000000000000e-01 + 6d8: 1e6c1000 fmov d0, #5.000000000000000000e-01 + 6dc: 1e6c3000 fmov d0, #5.312500000000000000e-01 + 6e0: 1e6e1000 fmov d0, #1.000000000000000000e+00 + 6e4: 1e6e3000 fmov d0, #1.062500000000000000e+00 + 6e8: 1e701000 fmov d0, #-2.000000000000000000e+00 + 6ec: 1e703000 fmov d0, #-2.125000000000000000e+00 + 6f0: 1e721000 fmov d0, #-4.000000000000000000e+00 + 6f4: 1e723000 fmov d0, #-4.250000000000000000e+00 + 6f8: 1e741000 fmov d0, #-8.000000000000000000e+00 + 6fc: 1e743000 fmov d0, #-8.500000000000000000e+00 + 700: 1e761000 fmov d0, #-1.600000000000000000e+01 + 704: 1e763000 fmov d0, #-1.700000000000000000e+01 + 708: 1e781000 fmov d0, #-1.250000000000000000e-01 + 70c: 1e783000 fmov d0, #-1.328125000000000000e-01 + 710: 1e7a1000 fmov d0, #-2.500000000000000000e-01 + 714: 1e7a3000 fmov d0, #-2.656250000000000000e-01 + 718: 1e7c1000 fmov d0, #-5.000000000000000000e-01 + 71c: 1e7c3000 fmov d0, #-5.312500000000000000e-01 + 720: 1e7e1000 fmov d0, #-1.000000000000000000e+00 + 724: 1e7e3000 fmov d0, #-1.062500000000000000e+00 + 728: 04bb020e add z14.s, z16.s, z27.s + 72c: 04ba04c0 sub z0.s, z6.s, z26.s + 730: 6586019b fadd z27.s, z12.s, z6.s + 734: 6593089e fmul z30.s, z4.s, z19.s + 738: 65c2060b fsub z11.d, z16.d, z2.d + 73c: 04d6a18f abs 
z15.d, p0/m, z12.d + 740: 040016e9 add z9.b, p5/m, z9.b, z23.b + 744: 0490835e asr z30.s, p0/m, z30.s, z26.s + 748: 045aaa44 cnt z4.h, p2/m, z18.h + 74c: 04938579 lsl z25.s, p1/m, z25.s, z11.s + 750: 0411990a lsr z10.b, p6/m, z10.b, z8.b + 754: 04101624 mul z4.b, p5/m, z4.b, z17.b + 758: 0497ad3e neg z30.s, p3/m, z9.s + 75c: 04deae80 not z0.d, p3/m, z20.d + 760: 04481c77 smax z23.h, p7/m, z23.h, z3.h + 764: 044a0960 smin z0.h, p2/m, z0.h, z11.h + 768: 04c118ab sub z11.d, p6/m, z11.d, z5.d + 76c: 049caa30 fabs z16.s, p2/m, z17.s + 770: 6580834f fadd z15.s, p0/m, z15.s, z26.s + 774: 658d9e6a fdiv z10.s, p7/m, z10.s, z19.s + 778: 65c68238 fmax z24.d, p0/m, z24.d, z17.d + 77c: 65c791fa fmin z26.d, p4/m, z26.d, z15.d + 780: 65c28a38 fmul z24.d, p2/m, z24.d, z17.d + 784: 049db7be fneg z30.s, p5/m, z29.s + 788: 6582b552 frintm z18.s, p5/m, z10.s + 78c: 65c0abde frintn z30.d, p2/m, z30.d + 790: 6581bbc6 frintp z6.s, p6/m, z30.s + 794: 65cdb854 fsqrt z20.d, p6/m, z2.d + 798: 658197a9 fsub z9.s, p5/m, z9.s, z29.s + 79c: 65f60872 fmla z18.d, p2/m, z3.d, z22.d + 7a0: 65ec29af fmls z15.d, p2/m, z13.d, z12.d + 7a4: 65be43cc fnmla z12.s, p0/m, z30.s, z30.s + 7a8: 65e06ea7 fnmls z7.d, p3/m, z21.d, z0.d + 7ac: 04544b53 mla z19.h, p2/m, z26.h, z20.h + 7b0: 04d57c30 mls z16.d, p7/m, z1.d, z21.d + 7b4: 04323095 and z21.d, z4.d, z18.d + 7b8: 04a7324c eor z12.d, z18.d, z7.d + 7bc: 046d31f9 orr z25.d, z15.d, z13.d + 7c0: 04da30eb andv d11, p4, z7.d + 7c4: 04d8252b orv d11, p1, z9.d + 7c8: 04d93c1c eorv d28, p7, z0.d + 7cc: 044820f0 smaxv h16, p0, z7.h + 7d0: 040a2fac sminv b12, p3, z29.b + 7d4: 65873975 fminv s21, p6, z11.s + 7d8: 65c62886 fmaxv d6, p2, z4.d + 7dc: 65d820e7 fadda d7, p0, d7, z7.d + 7e0: 04013fac uaddv d12, p7, z29.b */ static const unsigned int insns[] = { - 0x8b0772d3, 0xcb4a3570, 0xab9c09bb, 0xeb9aa794, - 0x0b934e68, 0x4b0a3924, 0x2b1e3568, 0x6b132720, - 0x8a154c14, 0xaa1445d5, 0xca01cf99, 0xea8b3f6a, - 0x0a8c5cb9, 0x2a4a11d2, 0x4a855aa4, 0x6a857415, - 0x8aa697da, 0xaa6d7423, 0xca29bf80, 0xea3cb8bd, - 0x0a675249, 0x2ab961ba, 0x4a331899, 0x6a646345, - 0x11055267, 0x31064408, 0x51028e9d, 0x710bdee8, - 0x91082d81, 0xb106a962, 0xd10b33ae, 0xf10918ab, - 0x121102d7, 0x3204cd44, 0x5204cf00, 0x72099fb3, - 0x92729545, 0xb20e37cc, 0xd27c34be, 0xf27e4efa, - 0x14000000, 0x17ffffd7, 0x1400017f, 0x94000000, - 0x97ffffd4, 0x9400017c, 0x3400000c, 0x34fffa2c, - 0x34002f2c, 0x35000014, 0x35fff9d4, 0x35002ed4, - 0xb400000c, 0xb4fff96c, 0xb4002e6c, 0xb5000018, - 0xb5fff918, 0xb5002e18, 0x10000006, 0x10fff8a6, - 0x10002da6, 0x90000015, 0x36080001, 0x360ff821, - 0x36082d21, 0x37480008, 0x374ff7c8, 0x37482cc8, - 0x128b50ec, 0x52a9ff8b, 0x7281d095, 0x92edfebd, - 0xd28361e3, 0xf2a4cc96, 0x9346590c, 0x33194f33, - 0x531d3d89, 0x9350433c, 0xb34464ac, 0xd3462140, - 0x139a61a4, 0x93d87fd7, 0x54000000, 0x54fff5a0, - 0x54002aa0, 0x54000001, 0x54fff541, 0x54002a41, - 0x54000002, 0x54fff4e2, 0x540029e2, 0x54000002, - 0x54fff482, 0x54002982, 0x54000003, 0x54fff423, - 0x54002923, 0x54000003, 0x54fff3c3, 0x540028c3, - 0x54000004, 0x54fff364, 0x54002864, 0x54000005, - 0x54fff305, 0x54002805, 0x54000006, 0x54fff2a6, - 0x540027a6, 0x54000007, 0x54fff247, 0x54002747, - 0x54000008, 0x54fff1e8, 0x540026e8, 0x54000009, - 0x54fff189, 0x54002689, 0x5400000a, 0x54fff12a, - 0x5400262a, 0x5400000b, 0x54fff0cb, 0x540025cb, - 0x5400000c, 0x54fff06c, 0x5400256c, 0x5400000d, - 0x54fff00d, 0x5400250d, 0x5400000e, 0x54ffefae, - 0x540024ae, 0x5400000f, 0x54ffef4f, 0x5400244f, - 0xd4063721, 0xd4035082, 0xd400bfe3, 0xd4282fc0, - 0xd444c320, 
0xd503201f, 0xd69f03e0, 0xd6bf03e0, - 0xd5033fdf, 0xd5033f9f, 0xd5033abf, 0xd61f0040, - 0xd63f00a0, 0xc8147c55, 0xc805fcfd, 0xc85f7e05, - 0xc85fffbb, 0xc89fffa0, 0xc8dfff95, 0x88157cf8, - 0x8815ff9a, 0x885f7cd5, 0x885fffcf, 0x889ffc73, - 0x88dffc56, 0x48127c0f, 0x480bff85, 0x485f7cdd, - 0x485ffcf2, 0x489fff99, 0x48dffe62, 0x080a7c3e, - 0x0814fed5, 0x085f7c59, 0x085ffcb8, 0x089ffc70, - 0x08dfffb6, 0xc87f0a68, 0xc87fcdc7, 0xc82870bb, - 0xc825b8c8, 0x887f12d9, 0x887fb9ed, 0x8834215a, - 0x8837ca52, 0xf806317e, 0xb81b3337, 0x39000dc2, - 0x78005149, 0xf84391f4, 0xb85b220c, 0x385fd356, - 0x785d127e, 0x389f4149, 0x79801e3c, 0x79c014a3, - 0xb89a5231, 0xfc5ef282, 0xbc5f60f6, 0xfc12125e, - 0xbc0152cd, 0xf8190e49, 0xb800befd, 0x381ffd92, - 0x781e9e90, 0xf8409fa3, 0xb8413c79, 0x385fffa1, - 0x785c7fa8, 0x389f3dc5, 0x78801f6a, 0x78c19d4b, - 0xb89a4ec4, 0xfc408eeb, 0xbc436e79, 0xfc152ce1, - 0xbc036f28, 0xf8025565, 0xb80135f8, 0x381ff74f, - 0x781fa652, 0xf851a447, 0xb85e557b, 0x385e7472, - 0x785e070a, 0x38804556, 0x78819591, 0x78dc24e8, - 0xb89cd6d7, 0xfc430738, 0xbc5f6595, 0xfc1225b2, - 0xbc1d7430, 0xf82fcac2, 0xb83d6a02, 0x382e5a54, - 0x7834fa66, 0xf86ecbae, 0xb86cda90, 0x3860d989, - 0x78637a2c, 0x38a3fa22, 0x78b15827, 0x78f2d9f9, - 0xb8ac6ab7, 0xfc6879a5, 0xbc767943, 0xfc3bc84e, - 0xbc3968d4, 0xf91fc0fe, 0xb91da50f, 0x391d280b, - 0x791d2e23, 0xf95bc8e2, 0xb95ce525, 0x395ae53c, - 0x795c9282, 0x399d7dd6, 0x799fe008, 0x79de9bc0, - 0xb99aae78, 0xfd597598, 0xbd5d1d08, 0xfd1f3dea, - 0xbd1a227a, 0x5800148a, 0x18000003, 0xf88092e0, - 0xd8ffdf00, 0xf8a84860, 0xf99d7560, 0x1a1c012d, - 0x3a1c027b, 0x5a060253, 0x7a03028e, 0x9a0801d0, - 0xba0803a0, 0xda140308, 0xfa00038c, 0x0b3010d7, - 0x2b37ab39, 0xcb2466da, 0x6b33efb1, 0x8b350fcb, - 0xab208a70, 0xcb39e52b, 0xeb2c9291, 0x3a4bd1a3, - 0x7a4c81a2, 0xba42106c, 0xfa5560e3, 0x3a4e3844, - 0x7a515a26, 0xba4c2940, 0xfa52aaae, 0x1a8cc1b5, - 0x1a8f976a, 0x5a8981a0, 0x5a9a6492, 0x9a8793ac, - 0x9a9474e6, 0xda83d2b6, 0xda9b9593, 0x5ac00200, - 0x5ac006f1, 0x5ac009d1, 0x5ac013d8, 0x5ac016d8, - 0xdac00223, 0xdac005ac, 0xdac00ac9, 0xdac00c00, - 0xdac01205, 0xdac016d9, 0x1ac0089d, 0x1add0fa0, - 0x1ad52225, 0x1ad22529, 0x1ac82b61, 0x1acd2e92, - 0x9acc0b28, 0x9adc0ca7, 0x9adb2225, 0x9ad42757, - 0x9adc291c, 0x9ac42fa3, 0x1b1a55d1, 0x1b0bafc1, - 0x9b067221, 0x9b1ea0de, 0x9b2e20d5, 0x9b38cd4a, - 0x9bae6254, 0x9ba59452, 0x1e2d0a48, 0x1e3c19c2, - 0x1e3c298f, 0x1e213980, 0x1e240baf, 0x1e77082c, - 0x1e72191b, 0x1e6b2a97, 0x1e723988, 0x1e770b1a, - 0x1f0d66f5, 0x1f01b956, 0x1f227a8e, 0x1f365ba7, - 0x1f4f14ad, 0x1f45a98e, 0x1f60066a, 0x1f620054, - 0x1e204139, 0x1e20c094, 0x1e214363, 0x1e21c041, - 0x1e22c01e, 0x1e60408c, 0x1e60c361, 0x1e6142c8, - 0x1e61c16b, 0x1e624396, 0x1e3802dc, 0x9e380374, - 0x1e78000e, 0x9e78017a, 0x1e2202dc, 0x9e220150, - 0x1e6202a8, 0x9e620395, 0x1e260318, 0x9e660268, - 0x1e270188, 0x9e6700e6, 0x1e3023c0, 0x1e6b2320, - 0x1e202168, 0x1e602168, 0x2910323d, 0x297449d6, - 0x6948402b, 0xa9072f40, 0xa9410747, 0x29801f0a, - 0x29e07307, 0x69e272b9, 0xa9bf49d4, 0xa9c529a8, - 0x28b0605a, 0x28e866a2, 0x68ee0ab1, 0xa886296c, - 0xa8fe1a38, 0x282479c3, 0x286e534f, 0xa8386596, - 0xa8755a3b, 0x1e601000, 0x1e603000, 0x1e621000, - 0x1e623000, 0x1e641000, 0x1e643000, 0x1e661000, - 0x1e663000, 0x1e681000, 0x1e683000, 0x1e6a1000, - 0x1e6a3000, 0x1e6c1000, 0x1e6c3000, 0x1e6e1000, - 0x1e6e3000, 0x1e701000, 0x1e703000, 0x1e721000, - 0x1e723000, 0x1e741000, 0x1e743000, 0x1e761000, - 0x1e763000, 0x1e781000, 0x1e783000, 0x1e7a1000, - 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, 0x1e7e1000, - 0x1e7e3000, + 
0x8b0d82fa, 0xcb49970c, 0xab889dfc, 0xeb9ee787, + 0x0b9b3ec9, 0x4b9279a3, 0x2b88474e, 0x6b8c56c0, + 0x8a1a51e0, 0xaa11f4ba, 0xca0281b8, 0xea918c7c, + 0x0a5d4a19, 0x2a4b264d, 0x4a523ca5, 0x6a9b6ae2, + 0x8a70b79b, 0xaaba9728, 0xca6dfe3d, 0xea627f1c, + 0x0aa70f53, 0x2aaa0f06, 0x4a6176a4, 0x6a604eb0, + 0x1105ed91, 0x3100583e, 0x5101f8bd, 0x710f0306, + 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, + 0x120cb166, 0x321764bc, 0x52174681, 0x720c0247, + 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, + 0x14000000, 0x17ffffd7, 0x140001cf, 0x94000000, + 0x97ffffd4, 0x940001cc, 0x3400000a, 0x34fffa2a, + 0x3400392a, 0x35000008, 0x35fff9c8, 0x350038c8, + 0xb400000b, 0xb4fff96b, 0xb400386b, 0xb500001d, + 0xb5fff91d, 0xb500381d, 0x10000013, 0x10fff8b3, + 0x100037b3, 0x90000013, 0x36300016, 0x3637f836, + 0x36303736, 0x3758000c, 0x375ff7cc, 0x375836cc, + 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, + 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, + 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, + 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, + 0x540034a0, 0x54000001, 0x54fff541, 0x54003441, + 0x54000002, 0x54fff4e2, 0x540033e2, 0x54000002, + 0x54fff482, 0x54003382, 0x54000003, 0x54fff423, + 0x54003323, 0x54000003, 0x54fff3c3, 0x540032c3, + 0x54000004, 0x54fff364, 0x54003264, 0x54000005, + 0x54fff305, 0x54003205, 0x54000006, 0x54fff2a6, + 0x540031a6, 0x54000007, 0x54fff247, 0x54003147, + 0x54000008, 0x54fff1e8, 0x540030e8, 0x54000009, + 0x54fff189, 0x54003089, 0x5400000a, 0x54fff12a, + 0x5400302a, 0x5400000b, 0x54fff0cb, 0x54002fcb, + 0x5400000c, 0x54fff06c, 0x54002f6c, 0x5400000d, + 0x54fff00d, 0x54002f0d, 0x5400000e, 0x54ffefae, + 0x54002eae, 0x5400000f, 0x54ffef4f, 0x54002e4f, + 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, + 0xd44cad80, 0xd503201f, 0xd69f03e0, 0xd6bf03e0, + 0xd5033fdf, 0xd5033e9f, 0xd50332bf, 0xd61f0200, + 0xd63f0280, 0xc80a7d1b, 0xc800fea1, 0xc85f7fb1, + 0xc85fff9d, 0xc89ffee1, 0xc8dffe95, 0x88167e7b, + 0x880bfcd0, 0x885f7c12, 0x885ffd44, 0x889ffed8, + 0x88dffe6a, 0x48017fc5, 0x4808fe2c, 0x485f7dc9, + 0x485ffc27, 0x489ffe05, 0x48dffd82, 0x080a7c6c, + 0x081cff4e, 0x085f7d5e, 0x085ffeae, 0x089ffd2d, + 0x08dfff76, 0xc87f4d7c, 0xc87fcc5e, 0xc8220417, + 0xc82cb5f0, 0x887f55b2, 0x887ff90b, 0x88382c2d, + 0x883aedb5, 0xf819928b, 0xb803e21c, 0x381f713b, + 0x781ce322, 0xf850f044, 0xb85e129e, 0x385e92f2, + 0x785ff35d, 0x39801921, 0x7881318b, 0x78dce02b, + 0xb8829313, 0xfc45f318, 0xbc5d50af, 0xfc001375, + 0xbc1951b7, 0xf8008c0b, 0xb801dc03, 0x38009dcb, + 0x781fdf1d, 0xf8570e2d, 0xb85faecc, 0x385f6d8d, + 0x785ebea0, 0x38804cf7, 0x789cbce3, 0x78df9cbc, + 0xb89eed38, 0xfc40cd6e, 0xbc5bdd93, 0xfc103c14, + 0xbc040c08, 0xf81a2784, 0xb81ca4ec, 0x381e855b, + 0x7801b506, 0xf853654e, 0xb85d74b0, 0x384095c2, + 0x785ec5bc, 0x389e15a9, 0x789dc703, 0x78c06474, + 0xb89ff667, 0xfc57e51e, 0xbc4155f9, 0xfc05a6ee, + 0xbc1df408, 0xf835da4a, 0xb836d9a4, 0x3833580d, + 0x7826cb6c, 0xf8706900, 0xb87ae880, 0x3865db2e, + 0x78724889, 0x38a7789b, 0x78beca2f, 0x78f6c810, + 0xb8bef956, 0xfc6afabd, 0xbc734963, 0xfc3d5b8d, + 0xbc25fbb7, 0xf9189d05, 0xb91ecb1d, 0x39187a33, + 0x791f226d, 0xf95aa2f3, 0xb9587bb7, 0x395f7176, + 0x795d9143, 0x399e7e08, 0x799a2697, 0x79df3422, + 0xb99c2624, 0xfd5c2374, 0xbd5fa1d9, 0xfd1d595a, + 0xbd1b1869, 0x58001e9b, 0x1800000b, 0xf8945060, + 0xd8000000, 0xf8ae6ba0, 0xf99a0080, 0x1a070035, + 0x3a0700a8, 0x5a0e0367, 0x7a11009b, 0x9a000380, + 0xba1e030c, 0xda0f0320, 0xfa030301, 0x0b340b12, + 0x2b2a278d, 0xcb22aa0f, 0x6b2d29bd, 0x8b2cce8c, + 0xab2b877e, 0xcb21c8ee, 0xeb3ba47d, 0x3a4d400e, + 
0x7a5232c6, 0xba5e624e, 0xfa53814c, 0x3a52d8c2, + 0x7a4d8924, 0xba4b3aab, 0xfa4d7882, 0x1a96804c, + 0x1a912618, 0x5a90b0e6, 0x5a96976b, 0x9a9db06a, + 0x9a9b374c, 0xda95c14f, 0xda89c6fe, 0x5ac0015e, + 0x5ac005fd, 0x5ac00bdd, 0x5ac012b9, 0x5ac01404, + 0xdac002b2, 0xdac0061d, 0xdac00a95, 0xdac00e66, + 0xdac0107e, 0xdac01675, 0x1ac00b0b, 0x1ace0f3b, + 0x1ad221c3, 0x1ad825e7, 0x1ad92a3c, 0x1adc2f42, + 0x9ada0b25, 0x9ad20e1b, 0x9acc22a6, 0x9acc2480, + 0x9adc2a3b, 0x9ad22c5c, 0x1b0e39ea, 0x1b0fcf23, + 0x9b1010ae, 0x9b048b3a, 0x9b3d4582, 0x9b2390e8, + 0x9bba6499, 0x9ba0ea24, 0x1e2f0af1, 0x1e311b95, + 0x1e23295b, 0x1e3938e0, 0x1e2f08c9, 0x1e6a09fd, + 0x1e671a22, 0x1e77296b, 0x1e773ba7, 0x1e6b0b6e, + 0x1f18308b, 0x1f14adcf, 0x1f2b31bc, 0x1f3a3bd7, + 0x1f4a1da9, 0x1f4f8fa5, 0x1f6f798b, 0x1f73523e, + 0x1e2040fb, 0x1e20c2a9, 0x1e214122, 0x1e21c0fb, + 0x1e22c3dd, 0x1e604031, 0x1e60c0c2, 0x1e61406a, + 0x1e61c178, 0x1e624027, 0x1e38000b, 0x9e380243, + 0x1e7800dc, 0x9e7800d6, 0x1e220360, 0x9e22005a, + 0x1e6200e5, 0x9e62017c, 0x1e2601b9, 0x9e6602eb, + 0x1e270113, 0x9e6702b2, 0x1e342320, 0x1e722260, + 0x1e202048, 0x1e6023a8, 0x29025668, 0x29403e86, + 0x6966387b, 0xa93b316a, 0xa97e38e7, 0x298e5980, + 0x29c61d0e, 0x69c00930, 0xa9bc7434, 0xa9c530b5, + 0x28b26378, 0x28c25a5c, 0x68f419b1, 0xa8b668bc, + 0xa8f15746, 0x280453cd, 0x284c2cb1, 0xa83a534d, + 0xa87b32fd, 0x05a08020, 0x04b0e3e0, 0x0470e7e1, + 0x042f9c20, 0x043f9c35, 0x047f9c20, 0x04ff9c20, + 0x04299420, 0x04319160, 0x0461943e, 0x04a19020, + 0x042053ff, 0x047f5401, 0x25208028, 0x2538cfe0, + 0x2578d001, 0x25b8efe2, 0x25f8f007, 0xa400a3e0, + 0xa4a8a7ea, 0xa547a814, 0xa4084ffe, 0xa55c53e0, + 0xa5e1540b, 0xe400fbf6, 0xe408ffff, 0xe547e400, + 0xe4014be0, 0xe4a84fe0, 0xe5f25000, 0x858043e0, + 0x85a043ff, 0xe59f5d08, 0x1e601000, 0x1e603000, + 0x1e621000, 0x1e623000, 0x1e641000, 0x1e643000, + 0x1e661000, 0x1e663000, 0x1e681000, 0x1e683000, + 0x1e6a1000, 0x1e6a3000, 0x1e6c1000, 0x1e6c3000, + 0x1e6e1000, 0x1e6e3000, 0x1e701000, 0x1e703000, + 0x1e721000, 0x1e723000, 0x1e741000, 0x1e743000, + 0x1e761000, 0x1e763000, 0x1e781000, 0x1e783000, + 0x1e7a1000, 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, + 0x1e7e1000, 0x1e7e3000, 0x04bb020e, 0x04ba04c0, + 0x6586019b, 0x6593089e, 0x65c2060b, 0x04d6a18f, + 0x040016e9, 0x0490835e, 0x045aaa44, 0x04938579, + 0x0411990a, 0x04101624, 0x0497ad3e, 0x04deae80, + 0x04481c77, 0x044a0960, 0x04c118ab, 0x049caa30, + 0x6580834f, 0x658d9e6a, 0x65c68238, 0x65c791fa, + 0x65c28a38, 0x049db7be, 0x6582b552, 0x65c0abde, + 0x6581bbc6, 0x65cdb854, 0x658197a9, 0x65f60872, + 0x65ec29af, 0x65be43cc, 0x65e06ea7, 0x04544b53, + 0x04d57c30, 0x04323095, 0x04a7324c, 0x046d31f9, + 0x04da30eb, 0x04d8252b, 0x04d93c1c, 0x044820f0, + 0x040a2fac, 0x65873975, 0x65c62886, 0x65d820e7, + 0x04013fac, }; // END Generated code -- do not edit diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 80ddb9b31..f554b5e15 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -139,6 +139,9 @@ REGISTER_DECLARATION(Register, rdispatch, r21); // Java stack pointer REGISTER_DECLARATION(Register, esp, r20); +// Preserved predicate register with all elements set TRUE. 
+REGISTER_DECLARATION(PRegister, ptrue, p7); + #define assert_cond(ARG1) assert(ARG1, #ARG1) namespace asm_util { @@ -273,6 +276,14 @@ public: f(r->encoding_nocheck(), lsb + 4, lsb); } + void prf(PRegister r, int lsb) { + f(r->encoding_nocheck(), lsb + 3, lsb); + } + + void pgrf(PRegister r, int lsb) { + f(r->encoding_nocheck(), lsb + 2, lsb); + } + unsigned get(int msb = 31, int lsb = 0) { int nbits = msb - lsb + 1; unsigned mask = ((1U << nbits) - 1) << lsb; @@ -554,6 +565,18 @@ class Address { void lea(MacroAssembler *, Register) const; static bool offset_ok_for_immed(long offset, uint shift); + + static bool offset_ok_for_sve_immed(long offset, int shift, int vl /* sve vector length */) { + if (offset % vl == 0) { + // Convert address offset into sve imm offset (MUL VL). + int sve_offset = offset / vl; + if (((-(1 << (shift - 1))) <= sve_offset) && (sve_offset < (1 << (shift - 1)))) { + // sve_offset can be encoded + return true; + } + } + return false; + } }; // Convience classes @@ -596,7 +619,9 @@ class InternalAddress: public Address { InternalAddress(address target) : Address(target, relocInfo::internal_word_type) {} }; -const int FPUStateSizeInWords = 32 * 2; +const int FPUStateSizeInWords = FloatRegisterImpl::number_of_registers * + FloatRegisterImpl::save_slots_per_register; + typedef enum { PLDL1KEEP = 0b00000, PLDL1STRM, PLDL2KEEP, PLDL2STRM, PLDL3KEEP, PLDL3STRM, PSTL1KEEP = 0b10000, PSTL1STRM, PSTL2KEEP, PSTL2STRM, PSTL3KEEP, PSTL3STRM, @@ -667,6 +692,12 @@ public: void rf(FloatRegister reg, int lsb) { current->rf(reg, lsb); } + void prf(PRegister reg, int lsb) { + current->prf(reg, lsb); + } + void pgrf(PRegister reg, int lsb) { + current->pgrf(reg, lsb); + } void fixed(unsigned value, unsigned mask) { current->fixed(value, mask); } @@ -2228,21 +2259,27 @@ public: #undef INSN -#define INSN(NAME, opc, opc2) \ +#define INSN(NAME, opc, opc2, accepted) \ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) { \ + guarantee(T != T1Q && T != T1D, "incorrect arrangement"); \ + if (accepted < 3) guarantee(T != T2D, "incorrect arrangement"); \ + if (accepted < 2) guarantee(T != T2S, "incorrect arrangement"); \ + if (accepted < 1) guarantee(T == T8B || T == T16B, "incorrect arrangement"); \ starti; \ f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24); \ f((int)T >> 1, 23, 22), f(opc2, 21, 10); \ rf(Vn, 5), rf(Vd, 0); \ } - INSN(absr, 0, 0b100000101110); - INSN(negr, 1, 0b100000101110); - INSN(notr, 1, 0b100000010110); - INSN(addv, 0, 0b110001101110); - INSN(cls, 0, 0b100000010010); - INSN(clz, 1, 0b100000010010); - INSN(cnt, 0, 0b100000010110); + INSN(absr, 0, 0b100000101110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(negr, 1, 0b100000101110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(notr, 1, 0b100000010110, 0); // accepted arrangements: T8B, T16B + INSN(addv, 0, 0b110001101110, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S + INSN(cls, 0, 0b100000010010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S + INSN(clz, 1, 0b100000010010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S + INSN(cnt, 0, 0b100000010110, 0); // accepted arrangements: T8B, T16B + INSN(uaddlp, 1, 0b100000001010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S + INSN(uaddlv, 1, 0b110000001110, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S #undef INSN @@ -2376,13 +2413,18 @@ public: f(sidx<<(int)T, 14, 11), f(1, 10), rf(Vn, 5), rf(Vd, 0); } - void umov(Register Rd, FloatRegister Vn, 
SIMD_RegVariant T, int idx) { - starti; - f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21); - f(((idx<<1)|1)<<(int)T, 20, 16), f(0b001111, 15, 10); - rf(Vn, 5), rf(Rd, 0); +#define INSN(NAME, op) \ + void NAME(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) { \ + starti; \ + f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21); \ + f(((idx<<1)|1)<<(int)T, 20, 16), f(op, 15, 10); \ + rf(Vn, 5), rf(Rd, 0); \ } + INSN(umov, 0b001111); + INSN(smov, 0b001011); +#undef INSN + #define INSN(NAME, opc, opc2, isSHR) \ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, int shift){ \ starti; \ @@ -2582,13 +2624,299 @@ public: #undef INSN void ext(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index) - { +{ + starti; + assert(T == T8B || T == T16B, "invalid arrangement"); + assert((T == T8B && index <= 0b0111) || (T == T16B && index <= 0b1111), "Invalid index value"); + f(0, 31), f((int)T & 1, 30), f(0b101110000, 29, 21); + rf(Vm, 16), f(0, 15), f(index, 14, 11); + f(0, 10), rf(Vn, 5), rf(Vd, 0); +} + +// SVE arithmetics - unpredicated +#define INSN(NAME, opcode) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T != Q, "invalid register variant"); \ + f(0b00000100, 31, 24), f(T, 23, 22), f(1, 21), \ + rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + INSN(sve_add, 0b000); + INSN(sve_sub, 0b001); +#undef INSN + +// SVE floating-point arithmetic - unpredicated +#define INSN(NAME, opcode) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T == S || T == D, "invalid register variant"); \ + f(0b01100101, 31, 24), f(T, 23, 22), f(0, 21), \ + rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + + INSN(sve_fadd, 0b000); + INSN(sve_fmul, 0b010); + INSN(sve_fsub, 0b001); +#undef INSN + +private: + void sve_predicate_reg_insn(unsigned op24, unsigned op13, + FloatRegister Zd_or_Vd, SIMD_RegVariant T, + PRegister Pg, FloatRegister Zn_or_Vn) { + starti; + f(op24, 31, 24), f(T, 23, 22), f(op13, 21, 13); + pgrf(Pg, 10), rf(Zn_or_Vn, 5), rf(Zd_or_Vd, 0); + } + +public: + +// SVE integer arithmetics - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zdn_or_Zd_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Znm_or_Vn) { \ + assert(T != Q, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zdn_or_Zd_or_Vd, T, Pg, Znm_or_Vn); \ + } + + INSN(sve_abs, 0b00000100, 0b010110101); // vector abs, unary + INSN(sve_add, 0b00000100, 0b000000000); // vector add + INSN(sve_andv, 0b00000100, 0b011010001); // bitwise and reduction to scalar + INSN(sve_asr, 0b00000100, 0b010000100); // vector arithmetic shift right + INSN(sve_cnt, 0b00000100, 0b011010101) // count non-zero bits + INSN(sve_cpy, 0b00000101, 0b100000100); // copy scalar to each active vector element + INSN(sve_eorv, 0b00000100, 0b011001001); // bitwise xor reduction to scalar + INSN(sve_lsl, 0b00000100, 0b010011100); // vector logical shift left + INSN(sve_lsr, 0b00000100, 0b010001100); // vector logical shift right + INSN(sve_mul, 0b00000100, 0b010000000); // vector mul + INSN(sve_neg, 0b00000100, 0b010111101); // vector neg, unary + INSN(sve_not, 0b00000100, 0b011110101); // bitwise invert vector, unary + INSN(sve_orv, 0b00000100, 0b011000001); // bitwise or reduction to scalar + INSN(sve_smax, 0b00000100, 0b001000000); // signed maximum vectors + INSN(sve_smaxv, 0b00000100, 0b001000001); // signed maximum 
reduction to scalar + INSN(sve_smin, 0b00000100, 0b001010000); // signed minimum vectors + INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar + INSN(sve_sub, 0b00000100, 0b000001000); // vector sub + INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar +#undef INSN + +// SVE floating-point arithmetics - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zd_or_Zdn_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn_or_Zm) { \ + assert(T == S || T == D, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zd_or_Zdn_or_Vd, T, Pg, Zn_or_Zm); \ + } + + INSN(sve_fabs, 0b00000100, 0b011100101); + INSN(sve_fadd, 0b01100101, 0b000000100); + INSN(sve_fadda, 0b01100101, 0b011000001); // add strictly-ordered reduction to scalar Vd + INSN(sve_fdiv, 0b01100101, 0b001101100); + INSN(sve_fmax, 0b01100101, 0b000110100); // floating-point maximum + INSN(sve_fmaxv, 0b01100101, 0b000110001); // floating-point maximum recursive reduction to scalar + INSN(sve_fmin, 0b01100101, 0b000111100); // floating-point minimum + INSN(sve_fminv, 0b01100101, 0b000111001); // floating-point minimum recursive reduction to scalar + INSN(sve_fmul, 0b01100101, 0b000010100); + INSN(sve_fneg, 0b00000100, 0b011101101); + INSN(sve_frintm, 0b01100101, 0b000010101); // floating-point round to integral value, toward minus infinity + INSN(sve_frintn, 0b01100101, 0b000000101); // floating-point round to integral value, nearest with ties to even + INSN(sve_frintp, 0b01100101, 0b000001101); // floating-point round to integral value, toward plus infinity + INSN(sve_fsqrt, 0b01100101, 0b001101101); + INSN(sve_fsub, 0b01100101, 0b000001100); +#undef INSN + + // SVE multiple-add/sub - predicated +#define INSN(NAME, op0, op1, op2) \ + void NAME(FloatRegister Zda, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T != Q, "invalid size"); \ + f(op0, 31, 24), f(T, 23, 22), f(op1, 21), rf(Zm, 16); \ + f(op2, 15, 13), pgrf(Pg, 10), rf(Zn, 5), rf(Zda, 0); \ + } + + INSN(sve_fmla, 0b01100101, 1, 0b000); // floating-point fused multiply-add: Zda = Zda + Zn * Zm + INSN(sve_fmls, 0b01100101, 1, 0b001); // floating-point fused multiply-subtract: Zda = Zda + -Zn * Zm + INSN(sve_fnmla, 0b01100101, 1, 0b010); // floating-point negated fused multiply-add: Zda = -Zda + -Zn * Zm + INSN(sve_fnmls, 0b01100101, 1, 0b011); // floating-point negated fused multiply-subtract: Zda = -Zda + Zn * Zm + INSN(sve_mla, 0b00000100, 0, 0b010); // multiply-add: Zda = Zda + Zn*Zm + INSN(sve_mls, 0b00000100, 0, 0b011); // multiply-subtract: Zda = Zda + -Zn*Zm +#undef INSN + +// SVE bitwise logical - unpredicated +#define INSN(NAME, opc) \ + void NAME(FloatRegister Zd, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + f(0b00000100, 31, 24), f(opc, 23, 22), f(1, 21), \ + rf(Zm, 16), f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0); \ + } + INSN(sve_and, 0b00); + INSN(sve_eor, 0b10); + INSN(sve_orr, 0b01); +#undef INSN + +// SVE shift immediate - unpredicated +#define INSN(NAME, opc, isSHR) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, int shift) { \ + starti; \ + /* The encodings for the tszh:tszl:imm3 fields (bits 23:22 20:19 18:16) \ + * for shift right is calculated as: \ + * 0001 xxx B, shift = 16 - UInt(tszh:tszl:imm3) \ + * 001x xxx H, shift = 32 - UInt(tszh:tszl:imm3) \ + * 01xx xxx S, shift = 64 - UInt(tszh:tszl:imm3) \ + * 1xxx xxx D, shift = 128 - UInt(tszh:tszl:imm3) \ + * for shift left is calculated as: \ + * 0001 xxx 
B, shift = UInt(tszh:tszl:imm3) - 8 \ + * 001x xxx H, shift = UInt(tszh:tszl:imm3) - 16 \ + * 01xx xxx S, shift = UInt(tszh:tszl:imm3) - 32 \ + * 1xxx xxx D, shift = UInt(tszh:tszl:imm3) - 64 \ + */ \ + assert(T != Q, "Invalid register variant"); \ + if (isSHR) { \ + assert(((1 << (T + 3)) >= shift) && (shift > 0) , "Invalid shift value"); \ + } else { \ + assert(((1 << (T + 3)) > shift) && (shift >= 0) , "Invalid shift value"); \ + } \ + int cVal = (1 << ((T + 3) + (isSHR ? 1 : 0))); \ + int encodedShift = isSHR ? cVal - shift : cVal + shift; \ + int tszh = encodedShift >> 5; \ + int tszl_imm = encodedShift & 0x1f; \ + f(0b00000100, 31, 24); \ + f(tszh, 23, 22), f(1,21), f(tszl_imm, 20, 16); \ + f(0b100, 15, 13), f(opc, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + + INSN(sve_asr, 0b100, /* isSHR = */ true); + INSN(sve_lsl, 0b111, /* isSHR = */ false); + INSN(sve_lsr, 0b101, /* isSHR = */ true); +#undef INSN + +private: + + // Scalar base + immediate index + void sve_ld_st1(FloatRegister Zt, Register Xn, int imm, PRegister Pg, + SIMD_RegVariant T, int op1, int type, int op2) { + starti; + assert_cond(T >= type); + f(op1, 31, 25), f(type, 24, 23), f(T, 22, 21); + f(0, 20), sf(imm, 19, 16), f(op2, 15, 13); + pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); + } + + // Scalar base + scalar index + void sve_ld_st1(FloatRegister Zt, Register Xn, Register Xm, PRegister Pg, + SIMD_RegVariant T, int op1, int type, int op2) { + starti; + assert_cond(T >= type); + f(op1, 31, 25), f(type, 24, 23), f(T, 22, 21); + rf(Xm, 16), f(op2, 15, 13); + pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); + } + + void sve_ld_st1(FloatRegister Zt, PRegister Pg, + SIMD_RegVariant T, const Address &a, + int op1, int type, int imm_op2, int scalar_op2) { + switch (a.getMode()) { + case Address::base_plus_offset: + sve_ld_st1(Zt, a.base(), a.offset(), Pg, T, op1, type, imm_op2); + break; + case Address::base_plus_offset_reg: + sve_ld_st1(Zt, a.base(), a.index(), Pg, T, op1, type, scalar_op2); + break; + default: + ShouldNotReachHere(); + } + } + +public: + +// SVE load/store - predicated +#define INSN(NAME, op1, type, imm_op2, scalar_op2) \ + void NAME(FloatRegister Zt, SIMD_RegVariant T, PRegister Pg, const Address &a) { \ + assert(T != Q, "invalid register variant"); \ + sve_ld_st1(Zt, Pg, T, a, op1, type, imm_op2, scalar_op2); \ + } + + INSN(sve_ld1b, 0b1010010, 0b00, 0b101, 0b010); + INSN(sve_st1b, 0b1110010, 0b00, 0b111, 0b010); + INSN(sve_ld1h, 0b1010010, 0b01, 0b101, 0b010); + INSN(sve_st1h, 0b1110010, 0b01, 0b111, 0b010); + INSN(sve_ld1w, 0b1010010, 0b10, 0b101, 0b010); + INSN(sve_st1w, 0b1110010, 0b10, 0b111, 0b010); + INSN(sve_ld1d, 0b1010010, 0b11, 0b101, 0b010); + INSN(sve_st1d, 0b1110010, 0b11, 0b111, 0b010); +#undef INSN + +// SVE load/store - unpredicated +#define INSN(NAME, op1) \ + void NAME(FloatRegister Zt, const Address &a) { \ + starti; \ + assert(a.index() == noreg, "invalid address variant"); \ + f(op1, 31, 29), f(0b0010110, 28, 22), sf(a.offset() >> 3, 21, 16), \ + f(0b010, 15, 13), f(a.offset() & 0x7, 12, 10), srf(a.base(), 5), rf(Zt, 0); \ + } + + INSN(sve_ldr, 0b100); // LDR (vector) + INSN(sve_str, 0b111); // STR (vector) +#undef INSN + +#define INSN(NAME, op) \ + void NAME(Register Xd, Register Xn, int imm6) { \ + starti; \ + f(0b000001000, 31, 23), f(op, 22, 21); \ + srf(Xn, 16), f(0b01010, 15, 11), sf(imm6, 10, 5), srf(Xd, 0); \ + } + + INSN(sve_addvl, 0b01); + INSN(sve_addpl, 0b11); +#undef INSN + +// SVE inc/dec register by element count +#define INSN(NAME, op) \ + void NAME(Register Xdn, SIMD_RegVariant T, unsigned 
imm4 = 1, int pattern = 0b11111) { \ + starti; \ + assert(T != Q, "invalid size"); \ + f(0b00000100,31, 24), f(T, 23, 22), f(0b11, 21, 20); \ + f(imm4 - 1, 19, 16), f(0b11100, 15, 11), f(op, 10), f(pattern, 9, 5), rf(Xdn, 0); \ + } + + INSN(sve_inc, 0); + INSN(sve_dec, 1); +#undef INSN + + // SVE predicate count + void sve_cntp(Register Xd, SIMD_RegVariant T, PRegister Pg, PRegister Pn) { + starti; + assert(T != Q, "invalid size"); + f(0b00100101, 31, 24), f(T, 23, 22), f(0b10000010, 21, 14); + prf(Pg, 10), f(0, 9), prf(Pn, 5), rf(Xd, 0); + } + + // SVE dup scalar + void sve_dup(FloatRegister Zd, SIMD_RegVariant T, Register Rn) { + starti; + assert(T != Q, "invalid size"); + f(0b00000101, 31, 24), f(T, 23, 22), f(0b100000001110, 21, 10); + srf(Rn, 5), rf(Zd, 0); + } + + // SVE dup imm + void sve_dup(FloatRegister Zd, SIMD_RegVariant T, int imm8) { + starti; + assert(T != Q, "invalid size"); + int sh = 0; + if (imm8 <= 127 && imm8 >= -128) { + sh = 0; + } else if (T != B && imm8 <= 32512 && imm8 >= -32768 && (imm8 & 0xff) == 0) { + sh = 1; + imm8 = (imm8 >> 8); + } else { + guarantee(false, "invalid immediate"); + } + f(0b00100101, 31, 24), f(T, 23, 22), f(0b11100011, 21, 14); + f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0); + } + + void sve_ptrue(PRegister pd, SIMD_RegVariant esize, int pattern = 0b11111) { starti; - assert(T == T8B || T == T16B, "invalid arrangement"); - assert((T == T8B && index <= 0b0111) || (T == T16B && index <= 0b1111), "Invalid index value"); - f(0, 31), f((int)T & 1, 30), f(0b101110000, 29, 21); - rf(Vm, 16), f(0, 15), f(index, 14, 11); - f(0, 10), rf(Vn, 5), rf(Vd, 0); + f(0b00100101, 31, 24), f(esize, 23, 22), f(0b011000111000, 21, 10); + f(pattern, 9, 5), f(0b0, 4), prf(pd, 0); } Assembler(CodeBuffer* code) : AbstractAssembler(code) { diff --git a/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp index 6ac54f257..a258528ea 100644 --- a/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp @@ -456,8 +456,12 @@ void ZBarrierSetAssembler::generate_c2_load_barrier_stub(MacroAssembler* masm, Z ZSetupArguments setup_arguments(masm, stub); __ mov(rscratch1, stub->slow_path()); __ blr(rscratch1); + if (UseSVE > 0) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. 
+      __ reinitialize_ptrue();
+    }
   }
-
   // Stub exit
   __ b(*stub->continuation());
 }
diff --git a/src/hotspot/cpu/aarch64/globals_aarch64.hpp b/src/hotspot/cpu/aarch64/globals_aarch64.hpp
index 071845e5b..f26ea2a8b 100644
--- a/src/hotspot/cpu/aarch64/globals_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/globals_aarch64.hpp
@@ -112,6 +112,9 @@ define_pd_global(intx, InlineSmallCode, 1000);
           "Avoid generating unaligned memory accesses") \
   product(bool, UseLSE, false, \
           "Use LSE instructions") \
+  product(uint, UseSVE, 0, \
+          "Highest supported SVE instruction set version") \
+          range(0, 2) \
   product(bool, UseBlockZeroing, true, \
           "Use DC ZVA for block zeroing") \
   product(intx, BlockZeroingLowLimit, 256, \
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
index 241197075..431c5f005 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -50,6 +50,9 @@
 #include "runtime/jniHandles.inline.hpp"
 #include "runtime/sharedRuntime.hpp"
 #include "runtime/thread.hpp"
+#ifdef COMPILER2
+#include "opto/matcher.hpp"
+#endif
 
 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
@@ -2098,8 +2098,17 @@ int MacroAssembler::pop(unsigned int bitset, Register stack) {
 }
 
 // Push lots of registers in the bit set supplied. Don't push sp.
-// Return the number of words pushed
+// Return the number of dwords pushed
 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
+  int words_pushed = 0;
+  bool use_sve = false;
+  int sve_vector_size_in_bytes = 0;
+
+#ifdef COMPILER2
+  use_sve = Matcher::supports_scalable_vector();
+  sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
+#endif
+
   // Scan bitset to accumulate register pairs
   unsigned char regs[32];
   int count = 0;
@@ -2114,8 +2123,18 @@ int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
     return 0;
   }
 
+  // SVE
+  if (use_sve && sve_vector_size_in_bytes > 16) {
+    sub(stack, stack, sve_vector_size_in_bytes * count);
+    for (int i = 0; i < count; i++) {
+      sve_str(as_FloatRegister(regs[i]), Address(stack, i));
+    }
+    return count * sve_vector_size_in_bytes / 8;
+  }
+
   add(stack, stack, -count * wordSize * 2);
 
+  // NEON
   if (count & 1) {
     strq(as_FloatRegister(regs[0]), Address(stack));
     i += 1;
@@ -2128,7 +2147,16 @@ int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
   return count;
 }
 
+// Return the number of dwords popped
 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
+  int words_pushed = 0;
+  bool use_sve = false;
+  int sve_vector_size_in_bytes = 0;
+
+#ifdef COMPILER2
+  use_sve = Matcher::supports_scalable_vector();
+  sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
+#endif
   // Scan bitset to accumulate register pairs
   unsigned char regs[32];
   int count = 0;
@@ -2143,6 +2171,16 @@ int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
     return 0;
   }
 
+  // SVE
+  if (use_sve && sve_vector_size_in_bytes > 16) {
+    for (int i = count - 1; i >= 0; i--) {
+      sve_ldr(as_FloatRegister(regs[i]), Address(stack, i));
+    }
+    add(stack, stack, sve_vector_size_in_bytes * count);
+    return count * sve_vector_size_in_bytes / 8;
+  }
+
+  // NEON
   if (count & 1) {
     ldrq(as_FloatRegister(regs[0]), Address(stack));
     i += 1;
@@ -2616,23 +2654,39 @@ void MacroAssembler::pop_call_clobbered_registers() {
   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
 }
 
-void MacroAssembler::push_CPU_state(bool save_vectors) {
-  int step = (save_vectors ? 8 : 4) * wordSize;
8 : 4) * wordSize; +void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve, + int sve_vector_size_in_bytes) { push(0x3fffffff, sp); // integer registers except lr & sp - mov(rscratch1, -step); - sub(sp, sp, step); - for (int i = 28; i >= 4; i -= 4) { - st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), - as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); + if (save_vectors && use_sve && sve_vector_size_in_bytes > 16) { + sub(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers); + for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) { + sve_str(as_FloatRegister(i), Address(sp, i)); + } + } else { + int step = (save_vectors ? 8 : 4) * wordSize; + mov(rscratch1, -step); + sub(sp, sp, step); + for (int i = 28; i >= 4; i -= 4) { + st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), + as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); + } + st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); } - st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); } -void MacroAssembler::pop_CPU_state(bool restore_vectors) { - int step = (restore_vectors ? 8 : 4) * wordSize; - for (int i = 0; i <= 28; i += 4) - ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), - as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); +void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve, + int sve_vector_size_in_bytes) { + if (restore_vectors && use_sve && sve_vector_size_in_bytes > 16) { + for (int i = FloatRegisterImpl::number_of_registers - 1; i >= 0; i--) { + sve_ldr(as_FloatRegister(i), Address(sp, i)); + } + add(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers); + } else { + int step = (restore_vectors ? 8 : 4) * wordSize; + for (int i = 0; i <= 28; i += 4) + ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), + as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); + } pop(0x3fffffff, sp); // integer registers except lr & sp } @@ -2681,6 +2735,21 @@ Address MacroAssembler::spill_address(int size, int offset, Register tmp) return Address(base, offset); } +Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) { + assert(offset >= 0, "spill to negative address?"); + + Register base = sp; + + // An immediate offset in the range 0 to 255 which is multiplied + // by the current vector or predicate register size in bytes. + if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) { + return Address(base, offset / sve_reg_size_in_bytes); + } + + add(tmp, base, offset); + return Address(tmp); +} + // Checks whether offset is aligned. // Returns true if it is, else false. bool MacroAssembler::merge_alignment_check(Register base, @@ -5843,3 +5912,24 @@ void MacroAssembler::get_thread(Register dst) { pop(saved_regs, sp); } + +void MacroAssembler::verify_sve_vector_length() { + Label verify_ok; + assert(UseSVE > 0, "should only be used for SVE"); + movw(rscratch1, zr); + sve_inc(rscratch1, B); + subsw(zr, rscratch1, VM_Version::get_initial_sve_vector_length()); + br(EQ, verify_ok); + stop("Error: SVE vector length has changed since jvm startup"); + bind(verify_ok); +} + +void MacroAssembler::verify_ptrue() { + Label verify_ok; + assert(UseSVE > 0, "should only be used for SVE"); + sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count. 
+ sve_dec(rscratch1, B); + cbz(rscratch1, verify_ok); + stop("Error: the preserved predicate register (p7) elements are not all true"); + bind(verify_ok); +} diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index 014a4d3c6..9fb98c010 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -862,8 +862,10 @@ public: DEBUG_ONLY(void verify_heapbase(const char* msg);) - void push_CPU_state(bool save_vectors = false); - void pop_CPU_state(bool restore_vectors = false) ; + void push_CPU_state(bool save_vectors = false, bool use_sve = false, + int sve_vector_size_in_bytes = 0); + void pop_CPU_state(bool restore_vectors = false, bool use_sve = false, + int sve_vector_size_in_bytes = 0); // Round up to a power of two void round_to(Register reg, int modulus); @@ -938,6 +940,11 @@ public: Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0); + void verify_sve_vector_length(); + void reinitialize_ptrue() { + sve_ptrue(ptrue, B); + } + void verify_ptrue(); // Debugging @@ -1307,6 +1314,7 @@ private: // Returns an address on the stack which is reachable with a ldr/str of size // Uses rscratch2 if the address is not directly reachable Address spill_address(int size, int offset, Register tmp=rscratch2); + Address sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp=rscratch2); bool merge_alignment_check(Register base, size_t size, long cur_offset, long prev_offset) const; @@ -1330,6 +1338,9 @@ public: void spill(FloatRegister Vx, SIMD_RegVariant T, int offset) { str(Vx, T, spill_address(1 << (int)T, offset)); } + void spill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) { + sve_str(Zx, sve_spill_address(vector_reg_size_in_bytes, offset)); + } void unspill(Register Rx, bool is64, int offset) { if (is64) { ldr(Rx, spill_address(8, offset)); @@ -1340,6 +1351,9 @@ public: void unspill(FloatRegister Vx, SIMD_RegVariant T, int offset) { ldr(Vx, T, spill_address(1 << (int)T, offset)); } + void unspill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) { + sve_ldr(Zx, sve_spill_address(vector_reg_size_in_bytes, offset)); + } void spill_copy128(int src_offset, int dst_offset, Register tmp1=rscratch1, Register tmp2=rscratch2) { if (src_offset < 512 && (src_offset & 7) == 0 && @@ -1353,6 +1367,15 @@ public: spill(tmp1, true, dst_offset+8); } } + void spill_copy_sve_vector_stack_to_stack(int src_offset, int dst_offset, + int sve_vec_reg_size_in_bytes) { + assert(sve_vec_reg_size_in_bytes % 16 == 0, "unexpected sve vector reg size"); + for (int i = 0; i < sve_vec_reg_size_in_bytes / 16; i++) { + spill_copy128(src_offset, dst_offset); + src_offset += 16; + dst_offset += 16; + } + } }; #ifdef ASSERT diff --git a/src/hotspot/cpu/aarch64/register_aarch64.cpp b/src/hotspot/cpu/aarch64/register_aarch64.cpp index 30924e8a5..3db8e8337 100644 --- a/src/hotspot/cpu/aarch64/register_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/register_aarch64.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -26,10 +26,15 @@ #include "precompiled.hpp" #include "register_aarch64.hpp" -const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers << 1; +const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers * + RegisterImpl::max_slots_per_register; const int ConcreteRegisterImpl::max_fpr - = ConcreteRegisterImpl::max_gpr + (FloatRegisterImpl::number_of_registers << 1); + = ConcreteRegisterImpl::max_gpr + + FloatRegisterImpl::number_of_registers * FloatRegisterImpl::max_slots_per_register; + +const int ConcreteRegisterImpl::max_pr + = ConcreteRegisterImpl::max_fpr + PRegisterImpl::number_of_registers; const char* RegisterImpl::name() const { const char* names[number_of_registers] = { @@ -52,3 +57,10 @@ const char* FloatRegisterImpl::name() const { }; return is_valid() ? names[encoding()] : "noreg"; } + +const char* PRegisterImpl::name() const { + const char* names[number_of_registers] = { + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7" + }; + return is_valid() ? names[encoding()] : "noreg"; +} diff --git a/src/hotspot/cpu/aarch64/register_aarch64.hpp b/src/hotspot/cpu/aarch64/register_aarch64.hpp index 5f7662c89..c211b39ee 100644 --- a/src/hotspot/cpu/aarch64/register_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/register_aarch64.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -44,7 +44,8 @@ class RegisterImpl: public AbstractRegisterImpl { enum { number_of_registers = 32, number_of_byte_registers = 32, - number_of_registers_for_jvmci = 34 // Including SP and ZR. + number_of_registers_for_jvmci = 34, // Including SP and ZR. + max_slots_per_register = 2 }; // derived registers, offsets, and addresses @@ -127,7 +128,11 @@ inline FloatRegister as_FloatRegister(int encoding) { class FloatRegisterImpl: public AbstractRegisterImpl { public: enum { - number_of_registers = 32 + number_of_registers = 32, + max_slots_per_register = 8, + save_slots_per_register = 2, + slots_per_neon_register = 4, + extra_save_slots_per_neon_register = slots_per_neon_register - save_slots_per_register }; // construction @@ -183,6 +188,80 @@ CONSTANT_REGISTER_DECLARATION(FloatRegister, v29 , (29)); CONSTANT_REGISTER_DECLARATION(FloatRegister, v30 , (30)); CONSTANT_REGISTER_DECLARATION(FloatRegister, v31 , (31)); +// SVE vector registers, shared with the SIMD&FP v0-v31. Vn maps to Zn[127:0]. 
+CONSTANT_REGISTER_DECLARATION(FloatRegister, z0 , ( 0)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z1 , ( 1)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z2 , ( 2)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z3 , ( 3)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z4 , ( 4)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z5 , ( 5)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z6 , ( 6)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z7 , ( 7)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z8 , ( 8)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z9 , ( 9)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z10 , (10)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z11 , (11)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z12 , (12)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z13 , (13)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z14 , (14)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z15 , (15)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z16 , (16)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z17 , (17)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z18 , (18)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z19 , (19)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z20 , (20)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z21 , (21)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z22 , (22)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z23 , (23)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z24 , (24)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z25 , (25)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z26 , (26)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z27 , (27)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z28 , (28)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z29 , (29)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z30 , (30)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z31 , (31)); + + +class PRegisterImpl; +typedef PRegisterImpl* PRegister; +inline PRegister as_PRegister(int encoding) { + return (PRegister)(intptr_t)encoding; +} + +// The implementation of predicate registers for the architecture +class PRegisterImpl: public AbstractRegisterImpl { + public: + enum { + number_of_registers = 8, + max_slots_per_register = 1 + }; + + // construction + inline friend PRegister as_PRegister(int encoding); + + VMReg as_VMReg(); + + // derived registers, offsets, and addresses + PRegister successor() const { return as_PRegister(encoding() + 1); } + + // accessors + int encoding() const { assert(is_valid(), "invalid register"); return (intptr_t)this; } + int encoding_nocheck() const { return (intptr_t)this; } + bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; } + const char* name() const; +}; + +// The predicate registers of SVE. +CONSTANT_REGISTER_DECLARATION(PRegister, p0, ( 0)); +CONSTANT_REGISTER_DECLARATION(PRegister, p1, ( 1)); +CONSTANT_REGISTER_DECLARATION(PRegister, p2, ( 2)); +CONSTANT_REGISTER_DECLARATION(PRegister, p3, ( 3)); +CONSTANT_REGISTER_DECLARATION(PRegister, p4, ( 4)); +CONSTANT_REGISTER_DECLARATION(PRegister, p5, ( 5)); +CONSTANT_REGISTER_DECLARATION(PRegister, p6, ( 6)); +CONSTANT_REGISTER_DECLARATION(PRegister, p7, ( 7)); + // Need to know the total number of registers of all sorts for SharedInfo. // Define a class that exports it. class ConcreteRegisterImpl : public AbstractRegisterImpl { @@ -193,14 +272,16 @@ class ConcreteRegisterImpl : public AbstractRegisterImpl { // There is no requirement that any ordering here matches any ordering c2 gives // it's optoregs. 
- number_of_registers = (2 * RegisterImpl::number_of_registers + - 4 * FloatRegisterImpl::number_of_registers + + number_of_registers = (RegisterImpl::max_slots_per_register * RegisterImpl::number_of_registers + + FloatRegisterImpl::max_slots_per_register * FloatRegisterImpl::number_of_registers + + PRegisterImpl::max_slots_per_register * PRegisterImpl::number_of_registers + 1) // flags }; // added to make it compile static const int max_gpr; static const int max_fpr; + static const int max_pr; }; // A set of registers diff --git a/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp b/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp index c18109087..e337f582a 100644 --- a/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2020, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -154,3 +154,47 @@ REGISTER_DEFINITION(Register, rthread); REGISTER_DEFINITION(Register, rheapbase); REGISTER_DEFINITION(Register, r31_sp); + +REGISTER_DEFINITION(FloatRegister, z0); +REGISTER_DEFINITION(FloatRegister, z1); +REGISTER_DEFINITION(FloatRegister, z2); +REGISTER_DEFINITION(FloatRegister, z3); +REGISTER_DEFINITION(FloatRegister, z4); +REGISTER_DEFINITION(FloatRegister, z5); +REGISTER_DEFINITION(FloatRegister, z6); +REGISTER_DEFINITION(FloatRegister, z7); +REGISTER_DEFINITION(FloatRegister, z8); +REGISTER_DEFINITION(FloatRegister, z9); +REGISTER_DEFINITION(FloatRegister, z10); +REGISTER_DEFINITION(FloatRegister, z11); +REGISTER_DEFINITION(FloatRegister, z12); +REGISTER_DEFINITION(FloatRegister, z13); +REGISTER_DEFINITION(FloatRegister, z14); +REGISTER_DEFINITION(FloatRegister, z15); +REGISTER_DEFINITION(FloatRegister, z16); +REGISTER_DEFINITION(FloatRegister, z17); +REGISTER_DEFINITION(FloatRegister, z18); +REGISTER_DEFINITION(FloatRegister, z19); +REGISTER_DEFINITION(FloatRegister, z20); +REGISTER_DEFINITION(FloatRegister, z21); +REGISTER_DEFINITION(FloatRegister, z22); +REGISTER_DEFINITION(FloatRegister, z23); +REGISTER_DEFINITION(FloatRegister, z24); +REGISTER_DEFINITION(FloatRegister, z25); +REGISTER_DEFINITION(FloatRegister, z26); +REGISTER_DEFINITION(FloatRegister, z27); +REGISTER_DEFINITION(FloatRegister, z28); +REGISTER_DEFINITION(FloatRegister, z29); +REGISTER_DEFINITION(FloatRegister, z30); +REGISTER_DEFINITION(FloatRegister, z31); + +REGISTER_DEFINITION(PRegister, p0); +REGISTER_DEFINITION(PRegister, p1); +REGISTER_DEFINITION(PRegister, p2); +REGISTER_DEFINITION(PRegister, p3); +REGISTER_DEFINITION(PRegister, p4); +REGISTER_DEFINITION(PRegister, p5); +REGISTER_DEFINITION(PRegister, p6); +REGISTER_DEFINITION(PRegister, p7); + +REGISTER_DEFINITION(PRegister, ptrue); diff --git a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp index da2bc6b05..05cc32e7e 100644 --- a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp @@ -98,42 +98,60 @@ class RegisterSaver { // Capture info about frame layout enum layout { fpu_state_off = 0, - fpu_state_end = fpu_state_off+FPUStateSizeInWords-1, + fpu_state_end = fpu_state_off + FPUStateSizeInWords - 1, // The frame sender code expects that rfp will be in // the "natural" place and will override any oopMap // setting for it. 
We must therefore force the layout // so that it agrees with the frame sender code. - r0_off = fpu_state_off+FPUStateSizeInWords, - rfp_off = r0_off + 30 * 2, - return_off = rfp_off + 2, // slot for return address - reg_save_size = return_off + 2}; + r0_off = fpu_state_off + FPUStateSizeInWords, + rfp_off = r0_off + (RegisterImpl::number_of_registers - 2) * RegisterImpl::max_slots_per_register, + return_off = rfp_off + RegisterImpl::max_slots_per_register, // slot for return address + reg_save_size = return_off + RegisterImpl::max_slots_per_register}; }; OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) { + bool use_sve = false; + int sve_vector_size_in_bytes = 0; + int sve_vector_size_in_slots = 0; + +#ifdef COMPILER2 + use_sve = Matcher::supports_scalable_vector(); + sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); + sve_vector_size_in_slots = Matcher::scalable_vector_reg_size(T_FLOAT); +#endif + #if COMPILER2_OR_JVMCI if (save_vectors) { + int vect_words = 0; + int extra_save_slots_per_register = 0; // Save upper half of vector registers - int vect_words = 32 * 8 / wordSize; + if (use_sve) { + extra_save_slots_per_register = sve_vector_size_in_slots - FloatRegisterImpl::save_slots_per_register; + } else { + extra_save_slots_per_register = FloatRegisterImpl::extra_save_slots_per_neon_register; + } + vect_words = FloatRegisterImpl::number_of_registers * extra_save_slots_per_register / + VMRegImpl::slots_per_word; additional_frame_words += vect_words; } #else assert(!save_vectors, "vectors are generated only by C2 and JVMCI"); #endif - int frame_size_in_bytes = align_up(additional_frame_words*wordSize + - reg_save_size*BytesPerInt, 16); + int frame_size_in_bytes = align_up(additional_frame_words * wordSize + + reg_save_size * BytesPerInt, 16); // OopMap frame size is in compiler stack slots (jint's) not bytes or words int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; // The caller will allocate additional_frame_words - int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt; + int additional_frame_slots = additional_frame_words * wordSize / BytesPerInt; // CodeBlob frame size is in words. int frame_size_in_words = frame_size_in_bytes / wordSize; *total_frame_words = frame_size_in_words; // Save Integer and Float registers. __ enter(); - __ push_CPU_state(save_vectors); + __ push_CPU_state(save_vectors, use_sve, sve_vector_size_in_bytes); // Set an oopmap for the call site. This oopmap will map all // oop-registers and debug-info registers as callee-saved. This @@ -146,10 +164,10 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ for (int i = 0; i < RegisterImpl::number_of_registers; i++) { Register r = as_Register(i); if (r < rheapbase && r != rscratch1 && r != rscratch2) { - int sp_offset = 2 * (i + 32); // SP offsets are in 4-byte words, - // register slots are 8 bytes - // wide, 32 floating-point - // registers + // SP offsets are in 4-byte words. + // Register slots are 8 bytes wide, 32 floating-point registers. 
+ int sp_offset = RegisterImpl::max_slots_per_register * i + + FloatRegisterImpl::save_slots_per_register * FloatRegisterImpl::number_of_registers; oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset + additional_frame_slots), r->as_VMReg()); } @@ -157,7 +175,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) { FloatRegister r = as_FloatRegister(i); - int sp_offset = save_vectors ? (4 * i) : (2 * i); + int sp_offset = 0; + if (save_vectors) { + sp_offset = use_sve ? (sve_vector_size_in_slots * i) : + (FloatRegisterImpl::slots_per_neon_register * i); + } else { + sp_offset = FloatRegisterImpl::save_slots_per_register * i; + } oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset), r->as_VMReg()); } @@ -166,10 +190,15 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ } void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { -#ifndef COMPILER2 +#ifdef COMPILER2 + __ pop_CPU_state(restore_vectors, Matcher::supports_scalable_vector(), + Matcher::scalable_vector_reg_size(T_BYTE)); +#else +#if !INCLUDE_JVMCI assert(!restore_vectors, "vectors are generated only by C2 and JVMCI"); #endif __ pop_CPU_state(restore_vectors); +#endif __ leave(); } @@ -1855,6 +1884,11 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset())); } + if (UseSVE > 0) { + // Make sure that jni code does not change SVE vector length. + __ verify_sve_vector_length(); + } + // check for safepoint operation in progress and/or pending suspend requests Label safepoint_in_progress, safepoint_in_progress_done; { @@ -2785,6 +2819,12 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_t __ maybe_isb(); __ membar(Assembler::LoadLoad | Assembler::LoadStore); + if (UseSVE > 0 && save_vectors) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); __ cbz(rscratch1, noException); diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 0310463ac..979ff51f8 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -486,6 +486,11 @@ class StubGenerator: public StubCodeGenerator { __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); + if (UseSVE > 0 ) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } // we should not really care that lr is no longer the callee // address. we saved the value the handler needs in r19 so we can // just copy it to r3. however, the C2 handler will push its own @@ -4804,6 +4809,12 @@ class StubGenerator: public StubCodeGenerator { __ reset_last_Java_frame(true); __ maybe_isb(); + if (UseSVE > 0) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. 
+ __ reinitialize_ptrue(); + } + __ leave(); // check for pending exceptions diff --git a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp index 6e4eb1a7a..1bb12d24f 100644 --- a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp @@ -1377,6 +1377,11 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) { __ push(dtos); __ push(ltos); + if (UseSVE > 0) { + // Make sure that jni code does not change SVE vector length. + __ verify_sve_vector_length(); + } + // change thread state __ mov(rscratch1, _thread_in_native_trans); __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset())); diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp index 04ae1167d..8f2c95e8b 100644 --- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp @@ -29,13 +29,15 @@ #include "memory/resourceArea.hpp" #include "runtime/java.hpp" #include "runtime/stubCodeGenerator.hpp" +#include "utilities/formatBuffer.hpp" #include "utilities/macros.hpp" #include "vm_version_aarch64.hpp" #include OS_HEADER_INLINE(os) -#include <sys/auxv.h> #include <asm/hwcap.h> +#include <sys/auxv.h> +#include <sys/prctl.h> #ifndef HWCAP_AES #define HWCAP_AES (1<<3) #endif @@ -61,12 +63,27 @@ #define HWCAP_ATOMICS (1<<8) #endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif + +#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1 << 1) +#endif + +#ifndef PR_SVE_GET_VL +// For old toolchains which do not have SVE related macros defined. +#define PR_SVE_SET_VL 50 +#define PR_SVE_GET_VL 51 +#endif + int VM_Version::_cpu; int VM_Version::_model; int VM_Version::_model2; int VM_Version::_variant; int VM_Version::_revision; int VM_Version::_stepping; +int VM_Version::_initial_sve_vector_length; VM_Version::PsrInfo VM_Version::_psr_info = { 0, }; static BufferBlob* stub_blob; @@ -160,6 +177,7 @@ void VM_Version::get_processor_features() { } unsigned long auxv = getauxval(AT_HWCAP); + unsigned long auxv2 = getauxval(AT_HWCAP2); char buf[512]; @@ -250,6 +268,8 @@ void VM_Version::get_processor_features() { if (auxv & HWCAP_SHA1) strcat(buf, ", sha1"); if (auxv & HWCAP_SHA2) strcat(buf, ", sha256"); if (auxv & HWCAP_ATOMICS) strcat(buf, ", lse"); + if (auxv & HWCAP_SVE) strcat(buf, ", sve"); + if (auxv2 & HWCAP2_SVE2) strcat(buf, ", sve2"); _features_string = os::strdup(buf); @@ -379,6 +399,18 @@ void VM_Version::get_processor_features() { FLAG_SET_DEFAULT(UseBlockZeroing, false); } + if (auxv & HWCAP_SVE) { + if (FLAG_IS_DEFAULT(UseSVE)) { + FLAG_SET_DEFAULT(UseSVE, (auxv2 & HWCAP2_SVE2) ? 2 : 1); + } + if (UseSVE > 0) { + _initial_sve_vector_length = prctl(PR_SVE_GET_VL); + } + } else if (UseSVE > 0) { + warning("UseSVE specified, but not supported on current CPU. Disabling SVE."); + FLAG_SET_DEFAULT(UseSVE, 0); + } + // This machine allows unaligned memory accesses if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) { FLAG_SET_DEFAULT(UseUnalignedAccesses, true); @@ -411,6 +443,50 @@ void VM_Version::get_processor_features() { UseMontgomerySquareIntrinsic = true; } + if (UseSVE > 0) { + if (FLAG_IS_DEFAULT(MaxVectorSize)) { + MaxVectorSize = _initial_sve_vector_length; + } else if (MaxVectorSize < 16) { + warning("SVE does not support vector length less than 16 bytes. 
Disabling SVE."); + UseSVE = 0; + } else if ((MaxVectorSize % 16) == 0 && is_power_of_2(MaxVectorSize)) { + int new_vl = prctl(PR_SVE_SET_VL, MaxVectorSize); + _initial_sve_vector_length = new_vl; + // If MaxVectorSize is larger than system largest supported SVE vector length, above prctl() + // call will set task vector length to the system largest supported value. So, we also update + // MaxVectorSize to that largest supported value. + if (new_vl < 0) { + vm_exit_during_initialization( + err_msg("Current system does not support SVE vector length for MaxVectorSize: %d", + (int)MaxVectorSize)); + } else if (new_vl != MaxVectorSize) { + warning("Current system only supports max SVE vector length %d. Set MaxVectorSize to %d", + new_vl, new_vl); + } + MaxVectorSize = new_vl; + } else { + vm_exit_during_initialization(err_msg("Unsupported MaxVectorSize: %d", (int)MaxVectorSize)); + } + } + + if (UseSVE == 0) { // NEON + int min_vector_size = 8; + int max_vector_size = 16; + if (!FLAG_IS_DEFAULT(MaxVectorSize)) { + if (!is_power_of_2(MaxVectorSize)) { + vm_exit_during_initialization(err_msg("Unsupported MaxVectorSize: %d", (int)MaxVectorSize)); + } else if (MaxVectorSize < min_vector_size) { + warning("MaxVectorSize must be at least %i on this platform", min_vector_size); + FLAG_SET_DEFAULT(MaxVectorSize, min_vector_size); + } else if (MaxVectorSize > max_vector_size) { + warning("MaxVectorSize must be at most %i on this platform", max_vector_size); + FLAG_SET_DEFAULT(MaxVectorSize, max_vector_size); + } + } else { + FLAG_SET_DEFAULT(MaxVectorSize, 16); + } + } + #ifdef COMPILER2 if (FLAG_IS_DEFAULT(OptoScheduling)) { OptoScheduling = true; diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp index 0a17f3e73..23c3c1338 100644 --- a/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp @@ -40,6 +40,7 @@ protected: static int _variant; static int _revision; static int _stepping; + static int _initial_sve_vector_length; struct PsrInfo { uint32_t dczid_el0; @@ -101,6 +102,7 @@ public: static int cpu_model2() { return _model2; } static int cpu_variant() { return _variant; } static int cpu_revision() { return _revision; } + static int get_initial_sve_vector_length() { return _initial_sve_vector_length; }; static ByteSize dczid_el0_offset() { return byte_offset_of(PsrInfo, dczid_el0); } static ByteSize ctr_el0_offset() { return byte_offset_of(PsrInfo, ctr_el0); } static bool is_zva_enabled() { diff --git a/src/hotspot/cpu/aarch64/vmreg_aarch64.cpp b/src/hotspot/cpu/aarch64/vmreg_aarch64.cpp index 9fd20be0f..35d0adf5b 100644 --- a/src/hotspot/cpu/aarch64/vmreg_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/vmreg_aarch64.cpp @@ -33,15 +33,17 @@ void VMRegImpl::set_regName() { Register reg = ::as_Register(0); int i; for (i = 0; i < ConcreteRegisterImpl::max_gpr ; ) { - regName[i++] = reg->name(); - regName[i++] = reg->name(); + for (int j = 0 ; j < RegisterImpl::max_slots_per_register ; j++) { + regName[i++] = reg->name(); + } reg = reg->successor(); } FloatRegister freg = ::as_FloatRegister(0); for ( ; i < ConcreteRegisterImpl::max_fpr ; ) { - regName[i++] = freg->name(); - regName[i++] = freg->name(); + for (int j = 0 ; j < FloatRegisterImpl::max_slots_per_register ; j++) { + regName[i++] = freg->name(); + } freg = freg->successor(); } diff --git a/src/hotspot/cpu/aarch64/vmreg_aarch64.hpp b/src/hotspot/cpu/aarch64/vmreg_aarch64.hpp index 0b1d000bb..c249c26a8 100644 --- 
a/src/hotspot/cpu/aarch64/vmreg_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/vmreg_aarch64.hpp @@ -38,13 +38,14 @@ inline Register as_Register() { assert( is_Register(), "must be"); // Yuk - return ::as_Register(value() >> 1); + return ::as_Register(value() / RegisterImpl::max_slots_per_register); } inline FloatRegister as_FloatRegister() { assert( is_FloatRegister() && is_even(value()), "must be" ); // Yuk - return ::as_FloatRegister((value() - ConcreteRegisterImpl::max_gpr) >> 1); + return ::as_FloatRegister((value() - ConcreteRegisterImpl::max_gpr) / + FloatRegisterImpl::max_slots_per_register); } inline bool is_concrete() { diff --git a/src/hotspot/cpu/aarch64/vmreg_aarch64.inline.hpp b/src/hotspot/cpu/aarch64/vmreg_aarch64.inline.hpp index 145f9797f..dde7a7a91 100644 --- a/src/hotspot/cpu/aarch64/vmreg_aarch64.inline.hpp +++ b/src/hotspot/cpu/aarch64/vmreg_aarch64.inline.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2006, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,11 +28,16 @@ inline VMReg RegisterImpl::as_VMReg() { if( this==noreg ) return VMRegImpl::Bad(); - return VMRegImpl::as_VMReg(encoding() << 1 ); + return VMRegImpl::as_VMReg(encoding() * RegisterImpl::max_slots_per_register); } inline VMReg FloatRegisterImpl::as_VMReg() { - return VMRegImpl::as_VMReg((encoding() << 1) + ConcreteRegisterImpl::max_gpr); + return VMRegImpl::as_VMReg((encoding() * FloatRegisterImpl::max_slots_per_register) + + ConcreteRegisterImpl::max_gpr); +} + +inline VMReg PRegisterImpl::as_VMReg() { + return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_fpr); } #endif // CPU_AARCH64_VM_VMREG_AARCH64_INLINE_HPP diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index 18e81bdc5..87e5f331b 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -1093,7 +1093,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for @@ -1121,6 +1121,14 @@ const int Matcher::vector_width_in_bytes(BasicType bt) { return MaxVectorSize; } +const bool Matcher::supports_scalable_vector() { + return false; +} + +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return -1; +} + // Vector ideal reg corresponding to specified size in bytes const uint Matcher::vector_ideal_reg(int size) { assert(MaxVectorSize >= size, ""); diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index 07bda6d71..4cbe2cf5c 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2242,7 +2242,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. 
} -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for @@ -2310,6 +2310,14 @@ const int Matcher::min_vector_size(const BasicType bt) { return max_vector_size(bt); // Same as max. } +const bool Matcher::supports_scalable_vector() { + return false; +} + +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return -1; +} + // PPC implementation uses VSX load/store instructions (if // SuperwordUseVSX) which support 4 byte but not arbitrary alignment const bool Matcher::misaligned_vectors_ok() { diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 96c231b0a..782c1c7c4 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1522,7 +1522,7 @@ const bool Matcher::match_rule_supported(int opcode) { // BUT: make sure match rule is not disabled by a false predicate! } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // Identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen. @@ -1573,6 +1573,14 @@ const int Matcher::min_vector_size(const BasicType bt) { return max_vector_size(bt); // Same as max. } +const bool Matcher::supports_scalable_vector() { + return false; +} + +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return -1; +} + const uint Matcher::vector_shift_count_ideal_reg(int size) { fatal("vector shift is not supported"); return Node::NotAMachineReg; diff --git a/src/hotspot/cpu/sparc/sparc.ad b/src/hotspot/cpu/sparc/sparc.ad index a09c795c9..3b1b1046e 100644 --- a/src/hotspot/cpu/sparc/sparc.ad +++ b/src/hotspot/cpu/sparc/sparc.ad @@ -1710,7 +1710,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 8fb9a3e34..dc5f1ecf9 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -1341,7 +1341,7 @@ const bool Matcher::match_rule_supported(int opcode) { return ret_value; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // identify extra cases that we might want to provide match rules for // e.g. 
Op_ vector nodes and other intrinsics while guarding with vlen bool ret_value = match_rule_supported(opcode); @@ -1468,6 +1468,14 @@ const int Matcher::min_vector_size(const BasicType bt) { return MIN2(size,max_size); } +const bool Matcher::supports_scalable_vector() { + return false; +} + +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return -1; +} + // Vector ideal reg corresponding to specified size in bytes const uint Matcher::vector_ideal_reg(int size) { assert(MaxVectorSize >= size, ""); diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad index c2d1aca0c..0db8e6a14 100644 --- a/src/hotspot/cpu/x86/x86_64.ad +++ b/src/hotspot/cpu/x86/x86_64.ad @@ -2887,7 +2887,7 @@ frame RAX_H_num // Op_RegL }; // Excluded flags and vector registers. - assert(ARRAY_SIZE(hi) == _last_machine_leaf - 6, "missing type"); + assert(ARRAY_SIZE(hi) == _last_machine_leaf - 8, "missing type"); return OptoRegPair(hi[ideal_reg], lo[ideal_reg]); %} %} diff --git a/src/hotspot/share/adlc/archDesc.cpp b/src/hotspot/share/adlc/archDesc.cpp index ba61aa4c0..9e41b2dc6 100644 --- a/src/hotspot/share/adlc/archDesc.cpp +++ b/src/hotspot/share/adlc/archDesc.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -929,6 +929,7 @@ const char *ArchDesc::getIdealType(const char *idealOp) { // Match Vector types. if (strncmp(idealOp, "Vec",3)==0) { switch(last_char) { + case 'A': return "TypeVect::VECTA"; case 'S': return "TypeVect::VECTS"; case 'D': return "TypeVect::VECTD"; case 'X': return "TypeVect::VECTX"; @@ -939,6 +940,10 @@ const char *ArchDesc::getIdealType(const char *idealOp) { } } + if (strncmp(idealOp, "RegVMask", 8) == 0) { + return "Type::BOTTOM"; + } + // !!!!! switch(last_char) { case 'I': return "TypeInt::INT"; diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index 5ba1fdc57..45826d3b2 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -3946,6 +3946,8 @@ bool MatchRule::is_base_register(FormDict &globals) const { strcmp(opType,"RegL")==0 || strcmp(opType,"RegF")==0 || strcmp(opType,"RegD")==0 || + strcmp(opType,"RegVMask")==0 || + strcmp(opType,"VecA")==0 || strcmp(opType,"VecS")==0 || strcmp(opType,"VecD")==0 || strcmp(opType,"VecX")==0 || diff --git a/src/hotspot/share/opto/chaitin.cpp b/src/hotspot/share/opto/chaitin.cpp index 914dc43f6..710af9de8 100644 --- a/src/hotspot/share/opto/chaitin.cpp +++ b/src/hotspot/share/opto/chaitin.cpp @@ -77,6 +77,7 @@ void LRG::dump() const { if( _is_oop ) tty->print("Oop "); if( _is_float ) tty->print("Float "); if( _is_vector ) tty->print("Vector "); + if( _is_scalable ) tty->print("Scalable "); if( _was_spilled1 ) tty->print("Spilled "); if( _was_spilled2 ) tty->print("Spilled2 "); if( _direct_conflict ) tty->print("Direct_conflict "); @@ -646,7 +647,15 @@ void PhaseChaitin::Register_Allocate() { // Live ranges record the highest register in their mask. // We want the low register for the AD file writer's convenience. 
OptoReg::Name hi = lrg.reg(); // Get hi register - OptoReg::Name lo = OptoReg::add(hi, (1-lrg.num_regs())); // Find lo + int num_regs = lrg.num_regs(); + if (lrg.is_scalable() && OptoReg::is_stack(hi)) { + // For scalable vector registers, when they are allocated in physical + // registers, num_regs is RegMask::SlotsPerVecA for reg mask of scalable + // vector. If they are allocated on stack, we need to get the actual + // num_regs, which reflects the physical length of scalable registers. + num_regs = lrg.scalable_reg_slots(); + } + OptoReg::Name lo = OptoReg::add(hi, (1-num_regs)); // Find lo // We have to use pair [lo,lo+1] even for wide vectors because // the rest of code generation works only with pairs. It is safe // since for registers encoding only 'lo' is used. @@ -801,8 +810,19 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) { // Check for vector live range (only if vector register is used). // On SPARC vector uses RegD which could be misaligned so it is not // processes as vector in RA. - if (RegMask::is_vector(ireg)) + if (RegMask::is_vector(ireg)) { lrg._is_vector = 1; + if (ireg == Op_VecA) { + assert(Matcher::supports_scalable_vector(), "scalable vector should be supported"); + lrg._is_scalable = 1; + // For scalable vector, when it is allocated in physical register, + // num_regs is RegMask::SlotsPerVecA for reg mask, + // which may not be the actual physical register size. + // If it is allocated in stack, we need to get the actual + // physical length of scalable vector register. + lrg.set_scalable_reg_slots(Matcher::scalable_vector_reg_size(T_FLOAT)); + } + } assert(n_type->isa_vect() == NULL || lrg._is_vector || ireg == Op_RegD || ireg == Op_RegL, "vector must be in vector registers"); @@ -912,6 +932,13 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) { lrg.set_reg_pressure(1); #endif break; + case Op_VecA: + assert(Matcher::supports_scalable_vector(), "does not support scalable vector"); + assert(RegMask::num_registers(Op_VecA) == RegMask::SlotsPerVecA, "sanity"); + assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecA), "vector should be aligned"); + lrg.set_num_regs(RegMask::SlotsPerVecA); + lrg.set_reg_pressure(1); + break; case Op_VecS: assert(Matcher::vector_size_supported(T_BYTE,4), "sanity"); assert(RegMask::num_registers(Op_VecS) == RegMask::SlotsPerVecS, "sanity"); @@ -1358,6 +1385,46 @@ static bool is_legal_reg(LRG &lrg, OptoReg::Name reg, int chunk) { return false; } +static OptoReg::Name find_first_set(LRG &lrg, RegMask mask, int chunk) { + int num_regs = lrg.num_regs(); + OptoReg::Name assigned = mask.find_first_set(lrg, num_regs); + + if (lrg.is_scalable()) { + // a physical register is found + if (chunk == 0 && OptoReg::is_reg(assigned)) { + return assigned; + } + + // find available stack slots for scalable register + if (lrg._is_vector) { + num_regs = lrg.scalable_reg_slots(); + // if actual scalable vector register is exactly SlotsPerVecA * 32 bits + if (num_regs == RegMask::SlotsPerVecA) { + return assigned; + } + + // mask has been cleared out by clear_to_sets(SlotsPerVecA) before choose_color, but it + // does not work for scalable size. We have to find adjacent scalable_reg_slots() bits + // instead of SlotsPerVecA bits. + assigned = mask.find_first_set(lrg, num_regs); // find highest valid reg + while (OptoReg::is_valid(assigned) && RegMask::can_represent(assigned)) { + // Verify the found reg has scalable_reg_slots() bits set. 
+ if (mask.is_valid_reg(assigned, num_regs)) { + return assigned; + } else { + // Remove more for each iteration + mask.Remove(assigned - num_regs + 1); // Unmask the lowest reg + mask.clear_to_sets(RegMask::SlotsPerVecA); // Align by SlotsPerVecA bits + assigned = mask.find_first_set(lrg, num_regs); + } + } + return OptoReg::Bad; // will cause chunk change, and retry next chunk + } + } + + return assigned; +} + // Choose a color using the biasing heuristic OptoReg::Name PhaseChaitin::bias_color( LRG &lrg, int chunk ) { @@ -1391,7 +1458,7 @@ OptoReg::Name PhaseChaitin::bias_color( LRG &lrg, int chunk ) { RegMask tempmask = lrg.mask(); tempmask.AND(lrgs(copy_lrg).mask()); tempmask.clear_to_sets(lrg.num_regs()); - OptoReg::Name reg = tempmask.find_first_set(lrg.num_regs()); + OptoReg::Name reg = find_first_set(lrg, tempmask, chunk); if (OptoReg::is_valid(reg)) return reg; } @@ -1400,7 +1467,7 @@ OptoReg::Name PhaseChaitin::bias_color( LRG &lrg, int chunk ) { // If no bias info exists, just go with the register selection ordering if (lrg._is_vector || lrg.num_regs() == 2) { // Find an aligned set - return OptoReg::add(lrg.mask().find_first_set(lrg.num_regs()),chunk); + return OptoReg::add(find_first_set(lrg, lrg.mask(), chunk), chunk); } // CNC - Fun hack. Alternate 1st and 2nd selection. Enables post-allocate @@ -1455,7 +1522,6 @@ uint PhaseChaitin::Select( ) { LRG *lrg = &lrgs(lidx); _simplified = lrg->_next; - #ifndef PRODUCT if (trace_spilling()) { ttyLocker ttyl; @@ -1539,7 +1605,6 @@ uint PhaseChaitin::Select( ) { // Bump register mask up to next stack chunk chunk += RegMask::CHUNK_SIZE; lrg->Set_All(); - goto retry_next_chunk; } @@ -1564,12 +1629,21 @@ uint PhaseChaitin::Select( ) { int n_regs = lrg->num_regs(); assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity"); if (n_regs == 1 || !lrg->_fat_proj) { - assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecZ, "sanity"); + if (Matcher::supports_scalable_vector()) { + assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecA, "sanity"); + } else { + assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecZ, "sanity"); + } lrg->Clear(); // Clear the mask lrg->Insert(reg); // Set regmask to match selected reg // For vectors and pairs, also insert the low bit of the pair - for (int i = 1; i < n_regs; i++) + // We always choose the high bit, then mask the low bits by register size + if (lrg->is_scalable() && OptoReg::is_stack(lrg->reg())) { // stack + n_regs = lrg->scalable_reg_slots(); + } + for (int i = 1; i < n_regs; i++) { lrg->Insert(OptoReg::add(reg,-i)); + } lrg->set_mask_size(n_regs); } else { // Else fatproj // mask must be equal to fatproj bits, by definition diff --git a/src/hotspot/share/opto/chaitin.hpp b/src/hotspot/share/opto/chaitin.hpp index 776e3cf63..674791c64 100644 --- a/src/hotspot/share/opto/chaitin.hpp +++ b/src/hotspot/share/opto/chaitin.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -115,7 +115,9 @@ public: _msize_valid=1; if (_is_vector) { assert(!_fat_proj, "sanity"); - _mask.verify_sets(_num_regs); + if (!(_is_scalable && OptoReg::is_stack(_reg))) { + _mask.verify_sets(_num_regs); + } } else if (_num_regs == 2 && !_fat_proj) { _mask.verify_pairs(); } @@ -139,14 +141,37 @@ public: void clear_to_pairs() { _mask.clear_to_pairs(); debug_only(_msize_valid=0;) } void clear_to_sets() { _mask.clear_to_sets(_num_regs); debug_only(_msize_valid=0;) } - // Number of registers this live range uses when it colors private: + // Number of registers this live range uses when it colors uint16_t _num_regs; // 2 for Longs and Doubles, 1 for all else // except _num_regs is kill count for fat_proj + + // For scalable register, num_regs may not be the actual physical register size. + // We need to get the actual physical length of scalable register when scalable + // register is spilled. The size of one slot is 32-bit. + uint _scalable_reg_slots; // Actual scalable register length of slots. + // Meaningful only when _is_scalable is true. public: int num_regs() const { return _num_regs; } void set_num_regs( int reg ) { assert( _num_regs == reg || !_num_regs, "" ); _num_regs = reg; } + uint scalable_reg_slots() { return _scalable_reg_slots; } + void set_scalable_reg_slots(uint slots) { + assert(_is_scalable, "scalable register"); + assert(slots > 0, "slots of scalable register is not valid"); + _scalable_reg_slots = slots; + } + + bool is_scalable() { +#ifdef ASSERT + if (_is_scalable) { + // Should only be a vector for now, but it could also be a RegVMask in future. + assert(_is_vector && (_num_regs == RegMask::SlotsPerVecA), "unexpected scalable reg"); + } +#endif + return _is_scalable; + } + private: // Number of physical registers this live range uses when it colors // Architecture and register-set dependent @@ -172,6 +197,8 @@ public: uint _is_oop:1, // Live-range holds an oop _is_float:1, // True if in float registers _is_vector:1, // True if in vector registers + _is_scalable:1, // True if register size is scalable + // e.g. Arm SVE vector/predicate registers. 
_was_spilled1:1, // True if prior spilling on def _was_spilled2:1, // True if twice prior spilling on def _is_bound:1, // live range starts life with no diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 05fdab21e..14e5425b8 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -84,6 +84,7 @@ Matcher::Matcher() idealreg2spillmask [Op_RegF] = NULL; idealreg2spillmask [Op_RegD] = NULL; idealreg2spillmask [Op_RegP] = NULL; + idealreg2spillmask [Op_VecA] = NULL; idealreg2spillmask [Op_VecS] = NULL; idealreg2spillmask [Op_VecD] = NULL; idealreg2spillmask [Op_VecX] = NULL; @@ -97,6 +98,7 @@ Matcher::Matcher() idealreg2debugmask [Op_RegF] = NULL; idealreg2debugmask [Op_RegD] = NULL; idealreg2debugmask [Op_RegP] = NULL; + idealreg2debugmask [Op_VecA] = NULL; idealreg2debugmask [Op_VecS] = NULL; idealreg2debugmask [Op_VecD] = NULL; idealreg2debugmask [Op_VecX] = NULL; @@ -110,6 +112,7 @@ Matcher::Matcher() idealreg2mhdebugmask[Op_RegF] = NULL; idealreg2mhdebugmask[Op_RegD] = NULL; idealreg2mhdebugmask[Op_RegP] = NULL; + idealreg2mhdebugmask[Op_VecA] = NULL; idealreg2mhdebugmask[Op_VecS] = NULL; idealreg2mhdebugmask[Op_VecD] = NULL; idealreg2mhdebugmask[Op_VecX] = NULL; @@ -417,6 +420,8 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) { return rms; } +#define NOF_STACK_MASKS (3*6+6) + //---------------------------init_first_stack_mask----------------------------- // Create the initial stack mask used by values spilling to the stack. // Disallow any debug info in outgoing argument areas by setting the @@ -424,7 +429,12 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) { void Matcher::init_first_stack_mask() { // Allocate storage for spill masks as masks for the appropriate load type. - RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+5)); + RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * NOF_STACK_MASKS); + + // Initialize empty placeholder masks into the newly allocated arena + for (int i = 0; i < NOF_STACK_MASKS; i++) { + new (rms + i) RegMask(); + } idealreg2spillmask [Op_RegN] = &rms[0]; idealreg2spillmask [Op_RegI] = &rms[1]; @@ -447,11 +457,12 @@ void Matcher::init_first_stack_mask() { idealreg2mhdebugmask[Op_RegD] = &rms[16]; idealreg2mhdebugmask[Op_RegP] = &rms[17]; - idealreg2spillmask [Op_VecS] = &rms[18]; - idealreg2spillmask [Op_VecD] = &rms[19]; - idealreg2spillmask [Op_VecX] = &rms[20]; - idealreg2spillmask [Op_VecY] = &rms[21]; - idealreg2spillmask [Op_VecZ] = &rms[22]; + idealreg2spillmask [Op_VecA] = &rms[18]; + idealreg2spillmask [Op_VecS] = &rms[19]; + idealreg2spillmask [Op_VecD] = &rms[20]; + idealreg2spillmask [Op_VecX] = &rms[21]; + idealreg2spillmask [Op_VecY] = &rms[22]; + idealreg2spillmask [Op_VecZ] = &rms[23]; OptoReg::Name i; @@ -478,6 +489,7 @@ void Matcher::init_first_stack_mask() { // Keep spill masks aligned. aligned_stack_mask.clear_to_pairs(); assert(aligned_stack_mask.is_AllStack(), "should be infinite stack"); + RegMask scalable_stack_mask = aligned_stack_mask; *idealreg2spillmask[Op_RegP] = *idealreg2regmask[Op_RegP]; #ifdef _LP64 @@ -548,28 +560,48 @@ void Matcher::init_first_stack_mask() { *idealreg2spillmask[Op_VecZ] = *idealreg2regmask[Op_VecZ]; idealreg2spillmask[Op_VecZ]->OR(aligned_stack_mask); } - if (UseFPUForSpilling) { - // This mask logic assumes that the spill operations are - // symmetric and that the registers involved are the same size. 
- // On sparc for instance we may have to use 64 bit moves will - // kill 2 registers when used with F0-F31. - idealreg2spillmask[Op_RegI]->OR(*idealreg2regmask[Op_RegF]); - idealreg2spillmask[Op_RegF]->OR(*idealreg2regmask[Op_RegI]); + + if (Matcher::supports_scalable_vector()) { + int k = 1; + OptoReg::Name in = OptoReg::add(_in_arg_limit, -1); + // Exclude last input arg stack slots to avoid spilling vector register there, + // otherwise vector spills could stomp over stack slots in caller frame. + for (; (in >= init_in) && (k < scalable_vector_reg_size(T_FLOAT)); k++) { + scalable_stack_mask.Remove(in); + in = OptoReg::add(in, -1); + } + + // For VecA + scalable_stack_mask.clear_to_sets(RegMask::SlotsPerVecA); + assert(scalable_stack_mask.is_AllStack(), "should be infinite stack"); + *idealreg2spillmask[Op_VecA] = *idealreg2regmask[Op_VecA]; + idealreg2spillmask[Op_VecA]->OR(scalable_stack_mask); + } else { + *idealreg2spillmask[Op_VecA] = RegMask::Empty; + } + + if (UseFPUForSpilling) { + // This mask logic assumes that the spill operations are + // symmetric and that the registers involved are the same size. + // On sparc for instance we may have to use 64 bit moves will + // kill 2 registers when used with F0-F31. + idealreg2spillmask[Op_RegI]->OR(*idealreg2regmask[Op_RegF]); + idealreg2spillmask[Op_RegF]->OR(*idealreg2regmask[Op_RegI]); #ifdef _LP64 - idealreg2spillmask[Op_RegN]->OR(*idealreg2regmask[Op_RegF]); - idealreg2spillmask[Op_RegL]->OR(*idealreg2regmask[Op_RegD]); - idealreg2spillmask[Op_RegD]->OR(*idealreg2regmask[Op_RegL]); - idealreg2spillmask[Op_RegP]->OR(*idealreg2regmask[Op_RegD]); + idealreg2spillmask[Op_RegN]->OR(*idealreg2regmask[Op_RegF]); + idealreg2spillmask[Op_RegL]->OR(*idealreg2regmask[Op_RegD]); + idealreg2spillmask[Op_RegD]->OR(*idealreg2regmask[Op_RegL]); + idealreg2spillmask[Op_RegP]->OR(*idealreg2regmask[Op_RegD]); #else - idealreg2spillmask[Op_RegP]->OR(*idealreg2regmask[Op_RegF]); + idealreg2spillmask[Op_RegP]->OR(*idealreg2regmask[Op_RegF]); #ifdef ARM - // ARM has support for moving 64bit values between a pair of - // integer registers and a double register - idealreg2spillmask[Op_RegL]->OR(*idealreg2regmask[Op_RegD]); - idealreg2spillmask[Op_RegD]->OR(*idealreg2regmask[Op_RegL]); + // ARM has support for moving 64bit values between a pair of + // integer registers and a double register + idealreg2spillmask[Op_RegL]->OR(*idealreg2regmask[Op_RegD]); + idealreg2spillmask[Op_RegD]->OR(*idealreg2regmask[Op_RegL]); #endif #endif - } + } // Make up debug masks. Any spill slot plus callee-save registers. // Caller-save registers are assumed to be trashable by the various @@ -872,6 +904,10 @@ void Matcher::init_spill_mask( Node *ret ) { idealreg2regmask[Op_RegP] = &spillP->out_RegMask(); // Vector regmasks. 
+ if (Matcher::supports_scalable_vector()) { + MachNode *spillVectA = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTA)); + idealreg2regmask[Op_VecA] = &spillVectA->out_RegMask(); + } if (Matcher::vector_size_supported(T_BYTE,4)) { TypeVect::VECTS = TypeVect::make(T_BYTE, 4); MachNode *spillVectS = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTS)); @@ -1573,7 +1609,6 @@ Node* Matcher::Label_Root(const Node* n, State* svec, Node* control, Node*& mem) } } - // Call DFA to match this node, and return svec->DFA( n->Opcode(), n ); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 244e3d1f8..9a8307102 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -310,7 +310,7 @@ public: // identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen - static const bool match_rule_supported_vector(int opcode, int vlen); + static const bool match_rule_supported_vector(int opcode, int vlen, BasicType bt); // Some microarchitectures have mask registers used on vectors static const bool has_predicated_vectors(void); @@ -333,6 +333,10 @@ public: Matcher::min_vector_size(bt) <= size); } + static const bool supports_scalable_vector(); + // Actual max scalable vector register length. + static const int scalable_vector_reg_size(const BasicType bt); + // Vector ideal reg static const uint vector_ideal_reg(int len); static const uint vector_shift_count_ideal_reg(int len); diff --git a/src/hotspot/share/opto/opcodes.cpp b/src/hotspot/share/opto/opcodes.cpp index e31e8d847..1a826d8ba 100644 --- a/src/hotspot/share/opto/opcodes.cpp +++ b/src/hotspot/share/opto/opcodes.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -38,12 +38,14 @@ const char *NodeClassNames[] = { "RegF", "RegD", "RegL", - "RegFlags", + "VecA", "VecS", "VecD", "VecX", "VecY", "VecZ", + "RegVMask", + "RegFlags", "_last_machine_leaf", #include "classes.hpp" "_last_class_name", diff --git a/src/hotspot/share/opto/opcodes.hpp b/src/hotspot/share/opto/opcodes.hpp index ae3d61ce0..ec96ba055 100644 --- a/src/hotspot/share/opto/opcodes.hpp +++ b/src/hotspot/share/opto/opcodes.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -37,11 +37,13 @@ enum Opcodes { macro(RegF) // Machine float register macro(RegD) // Machine double register macro(RegL) // Machine long register + macro(VecA) // Machine vectora register macro(VecS) // Machine vectors register macro(VecD) // Machine vectord register macro(VecX) // Machine vectorx register macro(VecY) // Machine vectory register macro(VecZ) // Machine vectorz register + macro(RegVMask) // Vector mask/predicate register macro(RegFlags) // Machine flags register _last_machine_leaf, // Split between regular opcodes and machine #include "classes.hpp" diff --git a/src/hotspot/share/opto/postaloc.cpp b/src/hotspot/share/opto/postaloc.cpp index d572ac9fe..3514b37bc 100644 --- a/src/hotspot/share/opto/postaloc.cpp +++ b/src/hotspot/share/opto/postaloc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -266,9 +266,9 @@ int PhaseChaitin::elide_copy( Node *n, int k, Block *current_block, Node_List &v Node *val = skip_copies(n->in(k)); if (val == x) return blk_adjust; // No progress? - int n_regs = RegMask::num_registers(val->ideal_reg()); uint val_idx = _lrg_map.live_range_id(val); OptoReg::Name val_reg = lrgs(val_idx).reg(); + int n_regs = RegMask::num_registers(val->ideal_reg(), lrgs(val_idx)); // See if it happens to already be in the correct register! // (either Phi's direct register, or the common case of the name @@ -305,8 +305,26 @@ int PhaseChaitin::elide_copy( Node *n, int k, Block *current_block, Node_List &v } Node *vv = value[reg]; + // For a scalable register, the number of registers may be inconsistent between + // "val_reg" and "reg". For example, when "val" resides in a register + // but "reg" is located on the stack. + if (lrgs(val_idx).is_scalable()) { + assert(val->ideal_reg() == Op_VecA, "scalable vector register"); + if (OptoReg::is_stack(reg)) { + n_regs = lrgs(val_idx).scalable_reg_slots(); + } else { + n_regs = RegMask::SlotsPerVecA; + } + } if (n_regs > 1) { // Doubles and vectors check for aligned-adjacent set - uint last = (n_regs-1); // Looking for the last part of a set + uint last; + if (lrgs(val_idx).is_scalable()) { + assert(val->ideal_reg() == Op_VecA, "scalable vector register"); + // For a scalable vector register, the regmask is always SlotsPerVecA bits aligned + last = RegMask::SlotsPerVecA - 1; + } else { + last = (n_regs-1); // Looking for the last part of a set + } if ((reg&last) != last) continue; // Wrong part of a set if (!register_contains_value(vv, reg, n_regs, value)) continue; // Different value } @@ -591,7 +609,7 @@ void PhaseChaitin::post_allocate_copy_removal() { uint k; Node *phi = block->get_node(j); uint pidx = _lrg_map.live_range_id(phi); - OptoReg::Name preg = lrgs(_lrg_map.live_range_id(phi)).reg(); + OptoReg::Name preg = lrgs(pidx).reg(); // Remove copies remaining on edges. Check for junk phi.
Node *u = NULL; @@ -619,7 +637,7 @@ void PhaseChaitin::post_allocate_copy_removal() { if( pidx ) { value.map(preg,phi); regnd.map(preg,phi); - int n_regs = RegMask::num_registers(phi->ideal_reg()); + int n_regs = RegMask::num_registers(phi->ideal_reg(), lrgs(pidx)); for (int l = 1; l < n_regs; l++) { OptoReg::Name preg_lo = OptoReg::add(preg,-l); value.map(preg_lo,phi); @@ -663,7 +681,7 @@ void PhaseChaitin::post_allocate_copy_removal() { regnd.map(ureg, def); // Record other half of doubles uint def_ideal_reg = def->ideal_reg(); - int n_regs = RegMask::num_registers(def_ideal_reg); + int n_regs = RegMask::num_registers(def_ideal_reg, lrgs(_lrg_map.live_range_id(def))); for (int l = 1; l < n_regs; l++) { OptoReg::Name ureg_lo = OptoReg::add(ureg,-l); if (!value[ureg_lo] && @@ -707,7 +725,7 @@ void PhaseChaitin::post_allocate_copy_removal() { } uint n_ideal_reg = n->ideal_reg(); - int n_regs = RegMask::num_registers(n_ideal_reg); + int n_regs = RegMask::num_registers(n_ideal_reg, lrgs(lidx)); if (n_regs == 1) { // If Node 'n' does not change the value mapped by the register, // then 'n' is a useless copy. Do not update the register->node diff --git a/src/hotspot/share/opto/regmask.cpp b/src/hotspot/share/opto/regmask.cpp index 2e04c42eb..dd9b5476b 100644 --- a/src/hotspot/share/opto/regmask.cpp +++ b/src/hotspot/share/opto/regmask.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,6 +24,7 @@ #include "precompiled.hpp" #include "opto/ad.hpp" +#include "opto/chaitin.hpp" #include "opto/compile.hpp" #include "opto/matcher.hpp" #include "opto/node.hpp" @@ -116,30 +117,47 @@ const RegMask RegMask::Empty( //============================================================================= bool RegMask::is_vector(uint ireg) { - return (ireg == Op_VecS || ireg == Op_VecD || + return (ireg == Op_VecA || ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ); } int RegMask::num_registers(uint ireg) { switch(ireg) { case Op_VecZ: - return 16; + return SlotsPerVecZ; case Op_VecY: - return 8; + return SlotsPerVecY; case Op_VecX: - return 4; + return SlotsPerVecX; case Op_VecD: + return SlotsPerVecD; case Op_RegD: case Op_RegL: #ifdef _LP64 case Op_RegP: #endif return 2; + case Op_VecA: + assert(Matcher::supports_scalable_vector(), "does not support scalable vector"); + return SlotsPerVecA; } // Op_VecS and the rest ideal registers. return 1; } +int RegMask::num_registers(uint ireg, LRG &lrg) { + int n_regs = num_registers(ireg); + + // 'assigned' is the OptoReg selected by the register allocator + OptoReg::Name assigned = lrg.reg(); + assert(OptoReg::is_valid(assigned), "should be valid opto register"); + + if (lrg.is_scalable() && OptoReg::is_stack(assigned)) { + n_regs = lrg.scalable_reg_slots(); + } + return n_regs; +} + //------------------------------find_first_pair-------------------------------- // Find the lowest-numbered register pair in the mask. Return the // HIGHEST register number in the pair, or BAD if no pairs. @@ -238,14 +256,30 @@ int RegMask::is_bound_pair() const { return true; } +// Check whether the given reg number with size is valid +// for the current regmask, where reg is the highest number.
+bool RegMask::is_valid_reg(OptoReg::Name reg, const int size) const { + for (int i = 0; i < size; i++) { + if (!Member(reg - i)) { + return false; + } + } + return true; +} + // only indicies of power 2 are accessed, so index 3 is only filled in for storage. static int low_bits[5] = { 0x55555555, 0x11111111, 0x01010101, 0x00000000, 0x00010001 }; //------------------------------find_first_set--------------------------------- // Find the lowest-numbered register set in the mask. Return the // HIGHEST register number in the set, or BAD if no sets. // Works also for size 1. -OptoReg::Name RegMask::find_first_set(const int size) const { - verify_sets(size); +OptoReg::Name RegMask::find_first_set(LRG &lrg, const int size) const { + if (lrg.is_scalable()) { + // For a scalable vector register, the regmask is SlotsPerVecA bits aligned. + assert(is_aligned_sets(SlotsPerVecA), "mask is not aligned, adjacent sets"); + } else { + assert(is_aligned_sets(size), "mask is not aligned, adjacent sets"); + } for (int i = 0; i < RM_SIZE; i++) { if (_A[i]) { // Found some bits int bit = _A[i] & -_A[i]; // Extract low bit @@ -325,12 +359,16 @@ bool RegMask::is_aligned_sets(const int size) const { while (bits) { // Check bits for pairing int bit = bits & -bits; // Extract low bit // Low bit is not odd means its mis-aligned. - if ((bit & low_bits_mask) == 0) return false; + if ((bit & low_bits_mask) == 0) { + return false; + } // Do extra work since (bit << size) may overflow. int hi_bit = bit << (size-1); // high bit int set = hi_bit + ((hi_bit-1) & ~(bit-1)); // Check for aligned adjacent bits in this set - if ((bits & set) != set) return false; + if ((bits & set) != set) { + return false; + } bits -= set; // Remove this set } } diff --git a/src/hotspot/share/opto/regmask.hpp b/src/hotspot/share/opto/regmask.hpp index c64d08795..b733b87ad 100644 --- a/src/hotspot/share/opto/regmask.hpp +++ b/src/hotspot/share/opto/regmask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,6 +28,8 @@ #include "code/vmreg.hpp" #include "opto/optoreg.hpp" +class LRG; + // Some fun naming (textual) substitutions: // // RegMask::get_low_elem() ==> RegMask::find_first_elem() @@ -95,11 +97,13 @@ public: // requirement is internal to the allocator, and independent of any // particular platform. enum { SlotsPerLong = 2, + SlotsPerVecA = 8, SlotsPerVecS = 1, SlotsPerVecD = 2, SlotsPerVecX = 4, SlotsPerVecY = 8, - SlotsPerVecZ = 16 }; + SlotsPerVecZ = 16, + }; // A constructor only used by the ADLC output. All mask fields are filled // in directly. Calls to this look something like RM(1,2,3,4); @@ -204,10 +208,14 @@ public: return false; } + // Check whether the given reg number with size is valid + // for the current regmask, where reg is the highest number. + bool is_valid_reg(OptoReg::Name reg, const int size) const; + // Find the lowest-numbered register set in the mask. Return the // HIGHEST register number in the set, or BAD if no sets. // Assert that the mask contains only bit sets. - OptoReg::Name find_first_set(const int size) const; + OptoReg::Name find_first_set(LRG &lrg, const int size) const; // Clear out partial bits; leave only aligned adjacent bit sets of size.
void clear_to_sets(const int size); @@ -226,6 +234,7 @@ public: static bool is_vector(uint ireg); static int num_registers(uint ireg); + static int num_registers(uint ireg, LRG &lrg); // Fast overlap test. Non-zero if any registers in common. int overlap( const RegMask &rm ) const { diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index e7714ba3e..a6a62ea4a 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -93,8 +93,11 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) : //------------------------------transform_loop--------------------------- void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { assert(UseSuperWord, "should be"); - // Do vectors exist on this architecture? - if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return; + // SuperWord only works with power of two vector sizes. + int vector_width = Matcher::vector_width_in_bytes(T_BYTE); + if (vector_width < 2 || !is_power_of_2(vector_width)) { + return; + } assert(lpt->_head->is_CountedLoop(), "must be"); CountedLoopNode *cl = lpt->_head->as_CountedLoop(); diff --git a/src/hotspot/share/opto/type.cpp b/src/hotspot/share/opto/type.cpp index 8898a3f00..37ec81995 100644 --- a/src/hotspot/share/opto/type.cpp +++ b/src/hotspot/share/opto/type.cpp @@ -79,6 +79,7 @@ const Type::TypeInfo Type::_type_info[Type::lastype] = { { Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY { Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ #else // all other + { Bad, T_ILLEGAL, "vectora:", false, Op_VecA, relocInfo::none }, // VectorA. { Bad, T_ILLEGAL, "vectors:", false, Op_VecS, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_VecD, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX @@ -649,6 +650,10 @@ void Type::Initialize_shared(Compile* current) { // get_zero_type() should not happen for T_CONFLICT _zero_type[T_CONFLICT]= NULL; + if (Matcher::supports_scalable_vector()) { + TypeVect::VECTA = TypeVect::make(T_BYTE, Matcher::scalable_vector_reg_size(T_BYTE)); + } + // Vector predefined types, it needs initialized _const_basic_type[]. if (Matcher::vector_size_supported(T_BYTE,4)) { TypeVect::VECTS = TypeVect::make(T_BYTE,4); @@ -665,6 +670,8 @@ void Type::Initialize_shared(Compile* current) { if (Matcher::vector_size_supported(T_FLOAT,16)) { TypeVect::VECTZ = TypeVect::make(T_FLOAT,16); } + + mreg2type[Op_VecA] = TypeVect::VECTA; mreg2type[Op_VecS] = TypeVect::VECTS; mreg2type[Op_VecD] = TypeVect::VECTD; mreg2type[Op_VecX] = TypeVect::VECTX; @@ -984,6 +991,7 @@ const Type::TYPES Type::dual_type[Type::lastype] = { Bad, // Tuple - handled in v-call Bad, // Array - handled in v-call + Bad, // VectorA - handled in v-call Bad, // VectorS - handled in v-call Bad, // VectorD - handled in v-call Bad, // VectorX - handled in v-call @@ -1880,7 +1888,6 @@ const TypeTuple *TypeTuple::LONG_PAIR; const TypeTuple *TypeTuple::INT_CC_PAIR; const TypeTuple *TypeTuple::LONG_CC_PAIR; - //------------------------------make------------------------------------------- // Make a TypeTuple from the range of a method signature const TypeTuple *TypeTuple::make_range(ciSignature* sig) { @@ -2252,6 +2259,7 @@ bool TypeAry::ary_must_be_exact() const { //==============================TypeVect======================================= // Convenience common pre-built types. 
+const TypeVect *TypeVect::VECTA = NULL; // vector length agnostic const TypeVect *TypeVect::VECTS = NULL; // 32-bit vectors const TypeVect *TypeVect::VECTD = NULL; // 64-bit vectors const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors @@ -2262,10 +2270,11 @@ const TypeVect *TypeVect::VECTZ = NULL; // 512-bit vectors const TypeVect* TypeVect::make(const Type *elem, uint length) { BasicType elem_bt = elem->array_element_basic_type(); assert(is_java_primitive(elem_bt), "only primitive types in vector"); - assert(length > 1 && is_power_of_2(length), "vector length is power of 2"); assert(Matcher::vector_size_supported(elem_bt, length), "length in range"); int size = length * type2aelembytes(elem_bt); switch (Matcher::vector_ideal_reg(size)) { + case Op_VecA: + return (TypeVect*)(new TypeVectA(elem, length))->hashcons(); case Op_VecS: return (TypeVect*)(new TypeVectS(elem, length))->hashcons(); case Op_RegL: @@ -2297,7 +2306,7 @@ const Type *TypeVect::xmeet( const Type *t ) const { default: // All else is a mistake typerr(t); - + case VectorA: case VectorS: case VectorD: case VectorX: @@ -2352,6 +2361,8 @@ bool TypeVect::empty(void) const { #ifndef PRODUCT void TypeVect::dump2(Dict &d, uint depth, outputStream *st) const { switch (base()) { + case VectorA: + st->print("vectora["); break; case VectorS: st->print("vectors["); break; case VectorD: diff --git a/src/hotspot/share/opto/type.hpp b/src/hotspot/share/opto/type.hpp index 6c8194670..ca92fe3ab 100644 --- a/src/hotspot/share/opto/type.hpp +++ b/src/hotspot/share/opto/type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -53,6 +53,7 @@ class TypeNarrowKlass; class TypeAry; class TypeTuple; class TypeVect; +class TypeVectA; class TypeVectS; class TypeVectD; class TypeVectX; @@ -87,6 +88,7 @@ public: Tuple, // Method signature or object layout Array, // Array types + VectorA, // (Scalable) Vector types for vector length agnostic VectorS, // 32bit Vector types VectorD, // 64bit Vector types VectorX, // 128bit Vector types @@ -754,6 +756,7 @@ public: virtual const Type *xmeet( const Type *t) const; virtual const Type *xdual() const; // Compute dual right now. + static const TypeVect *VECTA; static const TypeVect *VECTS; static const TypeVect *VECTD; static const TypeVect *VECTX; @@ -765,6 +768,11 @@ public: #endif }; +class TypeVectA : public TypeVect { + friend class TypeVect; + TypeVectA(const Type* elem, uint length) : TypeVect(VectorA, elem, length) {} +}; + class TypeVectS : public TypeVect { friend class TypeVect; TypeVectS(const Type* elem, uint length) : TypeVect(VectorS, elem, length) {} @@ -1611,12 +1619,12 @@ inline const TypeAry *Type::is_ary() const { } inline const TypeVect *Type::is_vect() const { - assert( _base >= VectorS && _base <= VectorZ, "Not a Vector" ); + assert( _base >= VectorA && _base <= VectorZ, "Not a Vector" ); return (TypeVect*)this; } inline const TypeVect *Type::isa_vect() const { - return (_base >= VectorS && _base <= VectorZ) ? (TypeVect*)this : NULL; + return (_base >= VectorA && _base <= VectorZ) ? 
(TypeVect*)this : NULL; } inline const TypePtr *Type::is_ptr() const { diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index fae147fa8..3a0a42513 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -221,7 +221,7 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) { (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = VectorNode::opcode(opc, bt); - return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen); + return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen, bt); } return false; } @@ -608,7 +608,7 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) { (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = ReductionNode::opcode(opc, bt); - return vopc != opc && Matcher::match_rule_supported(vopc); + return vopc != opc && Matcher::match_rule_supported_vector(vopc, vlen, bt); } return false; } diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestSVEWithJNI.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestSVEWithJNI.java new file mode 100644 index 000000000..dc15ca800 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestSVEWithJNI.java @@ -0,0 +1,128 @@ +/* +* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +* Copyright (c) 2020, Arm Limited. All rights reserved. +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +/** + * @test + * + * @requires os.arch == "aarch64" & vm.compiler2.enabled + * @summary Verify VM SVE checking behavior + * @library /test/lib + * @run main/othervm/native compiler.c2.aarch64.TestSVEWithJNI + * + */ + +package compiler.c2.aarch64; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import jdk.test.lib.process.ProcessTools; +import jdk.test.lib.process.OutputAnalyzer; + +public class TestSVEWithJNI { + static { + System.loadLibrary("TestSVEWithJNI"); + } + + static final int EXIT_CODE = 99; + // Returns a nonnegative on success, or a negative value on error. + public static native int setVectorLength(int arg); + // Returns a nonnegative value on success, or a negative value on error. 
+ public static native int getVectorLength(); + + public static final String MSG = "Current Vector Size: "; + public static void testNormal() { + int vlen = getVectorLength(); + System.out.println(MSG + vlen); + // Should be fine if the vector length has not changed. + if (setVectorLength(vlen) < 0) { + throw new Error("Error in setting vector length."); + } + } + + public static void testAbort() { + int vlen = getVectorLength(); + if (vlen <= 16) { + throw new Error("Error: unsupported vector length."); + } + if (setVectorLength(16) < 0) { + throw new Error("Error: setting vector length failed."); + } + } + + public static ProcessBuilder createProcessBuilder(String [] args, String mode) { + List<String> vmopts = new ArrayList<>(); + String testjdkPath = System.getProperty("test.jdk"); + Collections.addAll(vmopts, "-Dtest.jdk=" + testjdkPath); + Collections.addAll(vmopts, args); + Collections.addAll(vmopts, TestSVEWithJNI.class.getName(), mode); + return ProcessTools.createJavaProcessBuilder(vmopts.toArray(new String[vmopts.size()])); + } + + public static void main(String [] args) throws Exception { + if (args.length == 0) { + int vlen = getVectorLength(); + if (vlen < 0) { + return; + } + String [][] testOpts = { + {"-Xint", "-XX:UseSVE=1"}, + {"-Xcomp", "-XX:UseSVE=1"}, + }; + ProcessBuilder pb; + OutputAnalyzer output; + for (String [] opts : testOpts) { + pb = createProcessBuilder(opts, "normal"); + output = new OutputAnalyzer(pb.start()); + output.shouldHaveExitValue(EXIT_CODE); + + pb = createProcessBuilder(opts, "abort"); + output = new OutputAnalyzer(pb.start()); + output.shouldNotHaveExitValue(EXIT_CODE); + output.shouldMatch("(error|Error|ERROR)"); + } + + // Verify MaxVectorSize + + // Any SVE architecture should support 128-bit vector size. + pb = createProcessBuilder(new String []{"-XX:UseSVE=1", "-XX:MaxVectorSize=16"}, "normal"); + output = new OutputAnalyzer(pb.start()); + output.shouldHaveExitValue(EXIT_CODE); + output.shouldContain(MSG + 16); + + // An unsupported large vector size value. + pb = createProcessBuilder(new String []{"-XX:UseSVE=1", "-XX:MaxVectorSize=512"}, "normal"); + output = new OutputAnalyzer(pb.start()); + output.shouldHaveExitValue(EXIT_CODE); + output.shouldContain("warning"); + } else if (args[0].equals("normal")) { + testNormal(); + System.exit(EXIT_CODE); + } else if (args[0].equals("abort")) { + testAbort(); + System.exit(EXIT_CODE); + } + } +} diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/libTestSVEWithJNI.c b/test/hotspot/jtreg/compiler/c2/aarch64/libTestSVEWithJNI.c new file mode 100644 index 000000000..0cb3ab0b5 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/libTestSVEWithJNI.c @@ -0,0 +1,68 @@ +/* +* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +* Copyright (c) 2020, Arm Limited. All rights reserved. +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code).
+* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +#ifdef __aarch64__ + +#include <jni.h> +#include <sys/prctl.h> + +#ifndef PR_SVE_GET_VL +// For old toolchains which do not have SVE related macros defined. +#define PR_SVE_SET_VL 50 +#define PR_SVE_GET_VL 51 +#endif + +int get_current_thread_vl() { + return prctl(PR_SVE_GET_VL); +} + +int set_current_thread_vl(unsigned long arg) { + return prctl(PR_SVE_SET_VL, arg); +} + +#ifdef __cplusplus +extern "C" { +#endif + +JNIEXPORT jint JNICALL Java_compiler_c2_aarch64_TestSVEWithJNI_setVectorLength +(JNIEnv * env, jclass clz, jint length) { + return set_current_thread_vl(length); +} + +JNIEXPORT jint JNICALL Java_compiler_c2_aarch64_TestSVEWithJNI_getVectorLength +(JNIEnv *env, jclass clz) { + return get_current_thread_vl(); +} + + +#ifdef __cplusplus +} +#endif + +#endif -- 2.19.1
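
Editor's note (illustration only, not part of the patch): the shared-code changes above hinge on the two new Matcher hooks declared in matcher.hpp, supports_scalable_vector() and scalable_vector_reg_size(bt), together with the fixed RegMask::SlotsPerVecA width added to regmask.hpp. The standalone C++ sketch below models the resulting slot accounting under the assumption that scalable_vector_reg_size(T_FLOAT) is simply the hardware vector length expressed in 32-bit slots; the vector lengths it iterates over are example values chosen for the illustration, not values queried from a running VM.

// Standalone sketch, not HotSpot code: models the VecA slot accounting used
// by the changes above. SlotsPerVecA and the 32-bit slot size mirror
// regmask.hpp; the vector lengths below are assumed example values.
#include <cstdio>

static const int SlotsPerVecA = 8;  // RegMask bits reserved per VecA live range
static const int BytesPerSlot = 4;  // one spill slot holds 32 bits

// Rough stand-in for Matcher::scalable_vector_reg_size(T_FLOAT): the number
// of 32-bit slots one scalable vector covers at the current hardware length.
static int scalable_reg_slots(int vector_bytes) {
  return vector_bytes / BytesPerSlot;
}

int main() {
  const int vector_bytes[] = {16, 32, 64};  // 128-, 256-, 512-bit SVE
  for (int i = 0; i < 3; i++) {
    int spill_slots = scalable_reg_slots(vector_bytes[i]);
    // In the register mask a VecA live range always occupies SlotsPerVecA
    // aligned bits; only a spill to the stack uses the actual size, which is
    // what RegMask::num_registers(ireg, lrg) returns for a stack-assigned LRG.
    // The matcher.cpp hunk above likewise removes the last (spill_slots - 1)
    // input-arg stack slots so a spill cannot run past _in_arg_limit into the
    // caller's frame.
    printf("SVE %3d-bit: mask bits %d, stack spill slots %d, arg slots excluded %d\n",
           vector_bytes[i] * 8, SlotsPerVecA, spill_slots, spill_slots - 1);
  }
  return 0;
}

The design point this illustrates: a VecA live range always reserves a fixed, power-of-two number of mask bits (SlotsPerVecA) so the allocator's alignment checks keep working unchanged, while the actual hardware vector size only matters once the value is spilled to the stack, which is why RegMask::num_registers gains the LRG-aware overload and why the scalable stack mask has its trailing input-arg slots trimmed.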