diff --git a/make/hotspot/gensrc/GensrcAdlc.gmk b/make/hotspot/gensrc/GensrcAdlc.gmk index a39640526..2479853fa 100644 --- a/make/hotspot/gensrc/GensrcAdlc.gmk +++ b/make/hotspot/gensrc/GensrcAdlc.gmk @@ -146,6 +146,12 @@ ifeq ($(call check-jvm-feature, compiler2), true) ))) endif + ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64) + AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \ + $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_sve.ad \ + ))) + endif + ifeq ($(call check-jvm-feature, shenandoahgc), true) AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \ $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/gc/shenandoah/shenandoah_$(HOTSPOT_TARGET_CPU).ad \ diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index b64919a62..fa434df7d 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -69,7 +69,7 @@ register %{ // // r0-r7,r10-r26 volatile (caller save) // r27-r32 system (no save, no allocate) -// r8-r9 invisible to the allocator (so we can use them as scratch regs) +// r8-r9 non-allocatable (so we can use them as scratch regs) // // as regards Java usage. we don't use any callee save registers // because this makes it difficult to de-optimise a frame (see comment @@ -94,6 +94,10 @@ reg_def R6 ( SOC, SOC, Op_RegI, 6, r6->as_VMReg() ); reg_def R6_H ( SOC, SOC, Op_RegI, 6, r6->as_VMReg()->next() ); reg_def R7 ( SOC, SOC, Op_RegI, 7, r7->as_VMReg() ); reg_def R7_H ( SOC, SOC, Op_RegI, 7, r7->as_VMReg()->next() ); +reg_def R8 ( NS, SOC, Op_RegI, 8, r8->as_VMReg() ); // rscratch1, non-allocatable +reg_def R8_H ( NS, SOC, Op_RegI, 8, r8->as_VMReg()->next() ); +reg_def R9 ( NS, SOC, Op_RegI, 9, r9->as_VMReg() ); // rscratch2, non-allocatable +reg_def R9_H ( NS, SOC, Op_RegI, 9, r9->as_VMReg()->next() ); reg_def R10 ( SOC, SOC, Op_RegI, 10, r10->as_VMReg() ); reg_def R10_H ( SOC, SOC, Op_RegI, 10, r10->as_VMReg()->next()); reg_def R11 ( SOC, SOC, Op_RegI, 11, r11->as_VMReg() ); @@ -140,7 +144,7 @@ reg_def R31 ( NS, NS, Op_RegI, 31, r31_sp->as_VMReg() ); // sp reg_def R31_H ( NS, NS, Op_RegI, 31, r31_sp->as_VMReg()->next()); // ---------------------------- -// Float/Double Registers +// Float/Double/Vector Registers // ---------------------------- // Double Registers @@ -161,165 +165,316 @@ reg_def R31_H ( NS, NS, Op_RegI, 31, r31_sp->as_VMReg()->next()); // the platform ABI treats v8-v15 as callee save). 
float registers // v16-v31 are SOC as per the platform spec - reg_def V0 ( SOC, SOC, Op_RegF, 0, v0->as_VMReg() ); - reg_def V0_H ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next() ); - reg_def V0_J ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(2) ); - reg_def V0_K ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(3) ); - - reg_def V1 ( SOC, SOC, Op_RegF, 1, v1->as_VMReg() ); - reg_def V1_H ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next() ); - reg_def V1_J ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(2) ); - reg_def V1_K ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(3) ); - - reg_def V2 ( SOC, SOC, Op_RegF, 2, v2->as_VMReg() ); - reg_def V2_H ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next() ); - reg_def V2_J ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(2) ); - reg_def V2_K ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(3) ); - - reg_def V3 ( SOC, SOC, Op_RegF, 3, v3->as_VMReg() ); - reg_def V3_H ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next() ); - reg_def V3_J ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(2) ); - reg_def V3_K ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(3) ); - - reg_def V4 ( SOC, SOC, Op_RegF, 4, v4->as_VMReg() ); - reg_def V4_H ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next() ); - reg_def V4_J ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(2) ); - reg_def V4_K ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(3) ); - - reg_def V5 ( SOC, SOC, Op_RegF, 5, v5->as_VMReg() ); - reg_def V5_H ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next() ); - reg_def V5_J ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(2) ); - reg_def V5_K ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(3) ); - - reg_def V6 ( SOC, SOC, Op_RegF, 6, v6->as_VMReg() ); - reg_def V6_H ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next() ); - reg_def V6_J ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(2) ); - reg_def V6_K ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(3) ); - - reg_def V7 ( SOC, SOC, Op_RegF, 7, v7->as_VMReg() ); - reg_def V7_H ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next() ); - reg_def V7_J ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(2) ); - reg_def V7_K ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(3) ); - - reg_def V8 ( SOC, SOE, Op_RegF, 8, v8->as_VMReg() ); - reg_def V8_H ( SOC, SOE, Op_RegF, 8, v8->as_VMReg()->next() ); - reg_def V8_J ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(2) ); - reg_def V8_K ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(3) ); - - reg_def V9 ( SOC, SOE, Op_RegF, 9, v9->as_VMReg() ); - reg_def V9_H ( SOC, SOE, Op_RegF, 9, v9->as_VMReg()->next() ); - reg_def V9_J ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(2) ); - reg_def V9_K ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(3) ); - - reg_def V10 ( SOC, SOE, Op_RegF, 10, v10->as_VMReg() ); - reg_def V10_H( SOC, SOE, Op_RegF, 10, v10->as_VMReg()->next() ); - reg_def V10_J( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(2)); - reg_def V10_K( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(3)); - - reg_def V11 ( SOC, SOE, Op_RegF, 11, v11->as_VMReg() ); - reg_def V11_H( SOC, SOE, Op_RegF, 11, v11->as_VMReg()->next() ); - reg_def V11_J( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(2)); - reg_def V11_K( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(3)); - - reg_def V12 ( SOC, SOE, Op_RegF, 12, v12->as_VMReg() ); - reg_def V12_H( SOC, SOE, Op_RegF, 12, v12->as_VMReg()->next() ); - reg_def V12_J( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(2)); - reg_def V12_K( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(3)); - - reg_def V13 ( SOC, SOE, Op_RegF, 13, v13->as_VMReg() ); - reg_def V13_H( SOC, SOE, Op_RegF, 13, v13->as_VMReg()->next() ); - reg_def V13_J( SOC, SOC, Op_RegF, 13, 
v13->as_VMReg()->next(2)); - reg_def V13_K( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(3)); - - reg_def V14 ( SOC, SOE, Op_RegF, 14, v14->as_VMReg() ); - reg_def V14_H( SOC, SOE, Op_RegF, 14, v14->as_VMReg()->next() ); - reg_def V14_J( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(2)); - reg_def V14_K( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(3)); - - reg_def V15 ( SOC, SOE, Op_RegF, 15, v15->as_VMReg() ); - reg_def V15_H( SOC, SOE, Op_RegF, 15, v15->as_VMReg()->next() ); - reg_def V15_J( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(2)); - reg_def V15_K( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(3)); - - reg_def V16 ( SOC, SOC, Op_RegF, 16, v16->as_VMReg() ); - reg_def V16_H( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next() ); - reg_def V16_J( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(2)); - reg_def V16_K( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(3)); - - reg_def V17 ( SOC, SOC, Op_RegF, 17, v17->as_VMReg() ); - reg_def V17_H( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next() ); - reg_def V17_J( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(2)); - reg_def V17_K( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(3)); - - reg_def V18 ( SOC, SOC, Op_RegF, 18, v18->as_VMReg() ); - reg_def V18_H( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next() ); - reg_def V18_J( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(2)); - reg_def V18_K( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(3)); - - reg_def V19 ( SOC, SOC, Op_RegF, 19, v19->as_VMReg() ); - reg_def V19_H( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next() ); - reg_def V19_J( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(2)); - reg_def V19_K( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(3)); - - reg_def V20 ( SOC, SOC, Op_RegF, 20, v20->as_VMReg() ); - reg_def V20_H( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next() ); - reg_def V20_J( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(2)); - reg_def V20_K( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(3)); - - reg_def V21 ( SOC, SOC, Op_RegF, 21, v21->as_VMReg() ); - reg_def V21_H( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next() ); - reg_def V21_J( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(2)); - reg_def V21_K( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(3)); - - reg_def V22 ( SOC, SOC, Op_RegF, 22, v22->as_VMReg() ); - reg_def V22_H( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next() ); - reg_def V22_J( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(2)); - reg_def V22_K( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(3)); - - reg_def V23 ( SOC, SOC, Op_RegF, 23, v23->as_VMReg() ); - reg_def V23_H( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next() ); - reg_def V23_J( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(2)); - reg_def V23_K( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(3)); - - reg_def V24 ( SOC, SOC, Op_RegF, 24, v24->as_VMReg() ); - reg_def V24_H( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next() ); - reg_def V24_J( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(2)); - reg_def V24_K( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(3)); - - reg_def V25 ( SOC, SOC, Op_RegF, 25, v25->as_VMReg() ); - reg_def V25_H( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next() ); - reg_def V25_J( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(2)); - reg_def V25_K( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(3)); - - reg_def V26 ( SOC, SOC, Op_RegF, 26, v26->as_VMReg() ); - reg_def V26_H( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next() ); - reg_def V26_J( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(2)); - reg_def V26_K( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(3)); - - reg_def V27 ( SOC, SOC, Op_RegF, 27, v27->as_VMReg() ); - 
reg_def V27_H( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next() ); - reg_def V27_J( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(2)); - reg_def V27_K( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(3)); - - reg_def V28 ( SOC, SOC, Op_RegF, 28, v28->as_VMReg() ); - reg_def V28_H( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next() ); - reg_def V28_J( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(2)); - reg_def V28_K( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(3)); - - reg_def V29 ( SOC, SOC, Op_RegF, 29, v29->as_VMReg() ); - reg_def V29_H( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next() ); - reg_def V29_J( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(2)); - reg_def V29_K( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(3)); - - reg_def V30 ( SOC, SOC, Op_RegF, 30, v30->as_VMReg() ); - reg_def V30_H( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next() ); - reg_def V30_J( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(2)); - reg_def V30_K( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(3)); - - reg_def V31 ( SOC, SOC, Op_RegF, 31, v31->as_VMReg() ); - reg_def V31_H( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next() ); - reg_def V31_J( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(2)); - reg_def V31_K( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(3)); +// For SVE vector registers, we simply extend vector register size to 8 +// 'logical' slots. This is nominally 256 bits but it actually covers +// all possible 'physical' SVE vector register lengths from 128 ~ 2048 +// bits. The 'physical' SVE vector register length is detected during +// startup, so the register allocator is able to identify the correct +// number of bytes needed for an SVE spill/unspill. +// Note that a vector register with 4 slots denotes a 128-bit NEON +// register allowing it to be distinguished from the corresponding SVE +// vector register when the SVE vector length is 128 bits. 
+ + reg_def V0 ( SOC, SOC, Op_RegF, 0, v0->as_VMReg() ); + reg_def V0_H ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next() ); + reg_def V0_J ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(2) ); + reg_def V0_K ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(3) ); + reg_def V0_L ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(4) ); + reg_def V0_M ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(5) ); + reg_def V0_N ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(6) ); + reg_def V0_O ( SOC, SOC, Op_RegF, 0, v0->as_VMReg()->next(7) ); + + reg_def V1 ( SOC, SOC, Op_RegF, 1, v1->as_VMReg() ); + reg_def V1_H ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next() ); + reg_def V1_J ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(2) ); + reg_def V1_K ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(3) ); + reg_def V1_L ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(4) ); + reg_def V1_M ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(5) ); + reg_def V1_N ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(6) ); + reg_def V1_O ( SOC, SOC, Op_RegF, 1, v1->as_VMReg()->next(7) ); + + reg_def V2 ( SOC, SOC, Op_RegF, 2, v2->as_VMReg() ); + reg_def V2_H ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next() ); + reg_def V2_J ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(2) ); + reg_def V2_K ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(3) ); + reg_def V2_L ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(4) ); + reg_def V2_M ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(5) ); + reg_def V2_N ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(6) ); + reg_def V2_O ( SOC, SOC, Op_RegF, 2, v2->as_VMReg()->next(7) ); + + reg_def V3 ( SOC, SOC, Op_RegF, 3, v3->as_VMReg() ); + reg_def V3_H ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next() ); + reg_def V3_J ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(2) ); + reg_def V3_K ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(3) ); + reg_def V3_L ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(4) ); + reg_def V3_M ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(5) ); + reg_def V3_N ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(6) ); + reg_def V3_O ( SOC, SOC, Op_RegF, 3, v3->as_VMReg()->next(7) ); + + reg_def V4 ( SOC, SOC, Op_RegF, 4, v4->as_VMReg() ); + reg_def V4_H ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next() ); + reg_def V4_J ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(2) ); + reg_def V4_K ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(3) ); + reg_def V4_L ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(4) ); + reg_def V4_M ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(5) ); + reg_def V4_N ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(6) ); + reg_def V4_O ( SOC, SOC, Op_RegF, 4, v4->as_VMReg()->next(7) ); + + reg_def V5 ( SOC, SOC, Op_RegF, 5, v5->as_VMReg() ); + reg_def V5_H ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next() ); + reg_def V5_J ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(2) ); + reg_def V5_K ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(3) ); + reg_def V5_L ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(4) ); + reg_def V5_M ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(5) ); + reg_def V5_N ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(6) ); + reg_def V5_O ( SOC, SOC, Op_RegF, 5, v5->as_VMReg()->next(7) ); + + reg_def V6 ( SOC, SOC, Op_RegF, 6, v6->as_VMReg() ); + reg_def V6_H ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next() ); + reg_def V6_J ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(2) ); + reg_def V6_K ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(3) ); + reg_def V6_L ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(4) ); + reg_def V6_M ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(5) ); + reg_def V6_N ( SOC, SOC, Op_RegF, 6, 
v6->as_VMReg()->next(6) ); + reg_def V6_O ( SOC, SOC, Op_RegF, 6, v6->as_VMReg()->next(7) ); + + reg_def V7 ( SOC, SOC, Op_RegF, 7, v7->as_VMReg() ); + reg_def V7_H ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next() ); + reg_def V7_J ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(2) ); + reg_def V7_K ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(3) ); + reg_def V7_L ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(4) ); + reg_def V7_M ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(5) ); + reg_def V7_N ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(6) ); + reg_def V7_O ( SOC, SOC, Op_RegF, 7, v7->as_VMReg()->next(7) ); + + reg_def V8 ( SOC, SOE, Op_RegF, 8, v8->as_VMReg() ); + reg_def V8_H ( SOC, SOE, Op_RegF, 8, v8->as_VMReg()->next() ); + reg_def V8_J ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(2) ); + reg_def V8_K ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(3) ); + reg_def V8_L ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(4) ); + reg_def V8_M ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(5) ); + reg_def V8_N ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(6) ); + reg_def V8_O ( SOC, SOC, Op_RegF, 8, v8->as_VMReg()->next(7) ); + + reg_def V9 ( SOC, SOE, Op_RegF, 9, v9->as_VMReg() ); + reg_def V9_H ( SOC, SOE, Op_RegF, 9, v9->as_VMReg()->next() ); + reg_def V9_J ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(2) ); + reg_def V9_K ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(3) ); + reg_def V9_L ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(4) ); + reg_def V9_M ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(5) ); + reg_def V9_N ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(6) ); + reg_def V9_O ( SOC, SOC, Op_RegF, 9, v9->as_VMReg()->next(7) ); + + reg_def V10 ( SOC, SOE, Op_RegF, 10, v10->as_VMReg() ); + reg_def V10_H ( SOC, SOE, Op_RegF, 10, v10->as_VMReg()->next() ); + reg_def V10_J ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(2) ); + reg_def V10_K ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(3) ); + reg_def V10_L ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(4) ); + reg_def V10_M ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(5) ); + reg_def V10_N ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(6) ); + reg_def V10_O ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(7) ); + + reg_def V11 ( SOC, SOE, Op_RegF, 11, v11->as_VMReg() ); + reg_def V11_H ( SOC, SOE, Op_RegF, 11, v11->as_VMReg()->next() ); + reg_def V11_J ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(2) ); + reg_def V11_K ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(3) ); + reg_def V11_L ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(4) ); + reg_def V11_M ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(5) ); + reg_def V11_N ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(6) ); + reg_def V11_O ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(7) ); + + reg_def V12 ( SOC, SOE, Op_RegF, 12, v12->as_VMReg() ); + reg_def V12_H ( SOC, SOE, Op_RegF, 12, v12->as_VMReg()->next() ); + reg_def V12_J ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(2) ); + reg_def V12_K ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(3) ); + reg_def V12_L ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(4) ); + reg_def V12_M ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(5) ); + reg_def V12_N ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(6) ); + reg_def V12_O ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(7) ); + + reg_def V13 ( SOC, SOE, Op_RegF, 13, v13->as_VMReg() ); + reg_def V13_H ( SOC, SOE, Op_RegF, 13, v13->as_VMReg()->next() ); + reg_def V13_J ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(2) ); + reg_def V13_K ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(3) ); + reg_def V13_L 
( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(4) ); + reg_def V13_M ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(5) ); + reg_def V13_N ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(6) ); + reg_def V13_O ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(7) ); + + reg_def V14 ( SOC, SOE, Op_RegF, 14, v14->as_VMReg() ); + reg_def V14_H ( SOC, SOE, Op_RegF, 14, v14->as_VMReg()->next() ); + reg_def V14_J ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(2) ); + reg_def V14_K ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(3) ); + reg_def V14_L ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(4) ); + reg_def V14_M ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(5) ); + reg_def V14_N ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(6) ); + reg_def V14_O ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(7) ); + + reg_def V15 ( SOC, SOE, Op_RegF, 15, v15->as_VMReg() ); + reg_def V15_H ( SOC, SOE, Op_RegF, 15, v15->as_VMReg()->next() ); + reg_def V15_J ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(2) ); + reg_def V15_K ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(3) ); + reg_def V15_L ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(4) ); + reg_def V15_M ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(5) ); + reg_def V15_N ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(6) ); + reg_def V15_O ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(7) ); + + reg_def V16 ( SOC, SOC, Op_RegF, 16, v16->as_VMReg() ); + reg_def V16_H ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next() ); + reg_def V16_J ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(2) ); + reg_def V16_K ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(3) ); + reg_def V16_L ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(4) ); + reg_def V16_M ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(5) ); + reg_def V16_N ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(6) ); + reg_def V16_O ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(7) ); + + reg_def V17 ( SOC, SOC, Op_RegF, 17, v17->as_VMReg() ); + reg_def V17_H ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next() ); + reg_def V17_J ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(2) ); + reg_def V17_K ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(3) ); + reg_def V17_L ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(4) ); + reg_def V17_M ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(5) ); + reg_def V17_N ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(6) ); + reg_def V17_O ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(7) ); + + reg_def V18 ( SOC, SOC, Op_RegF, 18, v18->as_VMReg() ); + reg_def V18_H ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next() ); + reg_def V18_J ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(2) ); + reg_def V18_K ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(3) ); + reg_def V18_L ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(4) ); + reg_def V18_M ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(5) ); + reg_def V18_N ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(6) ); + reg_def V18_O ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(7) ); + + reg_def V19 ( SOC, SOC, Op_RegF, 19, v19->as_VMReg() ); + reg_def V19_H ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next() ); + reg_def V19_J ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(2) ); + reg_def V19_K ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(3) ); + reg_def V19_L ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(4) ); + reg_def V19_M ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(5) ); + reg_def V19_N ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(6) ); + reg_def V19_O ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(7) ); + + reg_def V20 ( SOC, SOC, Op_RegF, 20, 
v20->as_VMReg() ); + reg_def V20_H ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next() ); + reg_def V20_J ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(2) ); + reg_def V20_K ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(3) ); + reg_def V20_L ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(4) ); + reg_def V20_M ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(5) ); + reg_def V20_N ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(6) ); + reg_def V20_O ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(7) ); + + reg_def V21 ( SOC, SOC, Op_RegF, 21, v21->as_VMReg() ); + reg_def V21_H ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next() ); + reg_def V21_J ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(2) ); + reg_def V21_K ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(3) ); + reg_def V21_L ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(4) ); + reg_def V21_M ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(5) ); + reg_def V21_N ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(6) ); + reg_def V21_O ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(7) ); + + reg_def V22 ( SOC, SOC, Op_RegF, 22, v22->as_VMReg() ); + reg_def V22_H ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next() ); + reg_def V22_J ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(2) ); + reg_def V22_K ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(3) ); + reg_def V22_L ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(4) ); + reg_def V22_M ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(5) ); + reg_def V22_N ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(6) ); + reg_def V22_O ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(7) ); + + reg_def V23 ( SOC, SOC, Op_RegF, 23, v23->as_VMReg() ); + reg_def V23_H ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next() ); + reg_def V23_J ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(2) ); + reg_def V23_K ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(3) ); + reg_def V23_L ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(4) ); + reg_def V23_M ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(5) ); + reg_def V23_N ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(6) ); + reg_def V23_O ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(7) ); + + reg_def V24 ( SOC, SOC, Op_RegF, 24, v24->as_VMReg() ); + reg_def V24_H ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next() ); + reg_def V24_J ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(2) ); + reg_def V24_K ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(3) ); + reg_def V24_L ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(4) ); + reg_def V24_M ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(5) ); + reg_def V24_N ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(6) ); + reg_def V24_O ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(7) ); + + reg_def V25 ( SOC, SOC, Op_RegF, 25, v25->as_VMReg() ); + reg_def V25_H ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next() ); + reg_def V25_J ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(2) ); + reg_def V25_K ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(3) ); + reg_def V25_L ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(4) ); + reg_def V25_M ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(5) ); + reg_def V25_N ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(6) ); + reg_def V25_O ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(7) ); + + reg_def V26 ( SOC, SOC, Op_RegF, 26, v26->as_VMReg() ); + reg_def V26_H ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next() ); + reg_def V26_J ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(2) ); + reg_def V26_K ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(3) ); + reg_def V26_L ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(4) ); + 
reg_def V26_M ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(5) ); + reg_def V26_N ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(6) ); + reg_def V26_O ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(7) ); + + reg_def V27 ( SOC, SOC, Op_RegF, 27, v27->as_VMReg() ); + reg_def V27_H ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next() ); + reg_def V27_J ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(2) ); + reg_def V27_K ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(3) ); + reg_def V27_L ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(4) ); + reg_def V27_M ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(5) ); + reg_def V27_N ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(6) ); + reg_def V27_O ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(7) ); + + reg_def V28 ( SOC, SOC, Op_RegF, 28, v28->as_VMReg() ); + reg_def V28_H ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next() ); + reg_def V28_J ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(2) ); + reg_def V28_K ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(3) ); + reg_def V28_L ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(4) ); + reg_def V28_M ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(5) ); + reg_def V28_N ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(6) ); + reg_def V28_O ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(7) ); + + reg_def V29 ( SOC, SOC, Op_RegF, 29, v29->as_VMReg() ); + reg_def V29_H ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next() ); + reg_def V29_J ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(2) ); + reg_def V29_K ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(3) ); + reg_def V29_L ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(4) ); + reg_def V29_M ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(5) ); + reg_def V29_N ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(6) ); + reg_def V29_O ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(7) ); + + reg_def V30 ( SOC, SOC, Op_RegF, 30, v30->as_VMReg() ); + reg_def V30_H ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next() ); + reg_def V30_J ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(2) ); + reg_def V30_K ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(3) ); + reg_def V30_L ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(4) ); + reg_def V30_M ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(5) ); + reg_def V30_N ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(6) ); + reg_def V30_O ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(7) ); + + reg_def V31 ( SOC, SOC, Op_RegF, 31, v31->as_VMReg() ); + reg_def V31_H ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next() ); + reg_def V31_J ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(2) ); + reg_def V31_K ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(3) ); + reg_def V31_L ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(4) ); + reg_def V31_M ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(5) ); + reg_def V31_N ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(6) ); + reg_def V31_O ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(7) ); + + +// ---------------------------- +// SVE Predicate Registers +// ---------------------------- + reg_def P0 (SOC, SOC, Op_RegVMask, 0, p0->as_VMReg()); + reg_def P1 (SOC, SOC, Op_RegVMask, 1, p1->as_VMReg()); + reg_def P2 (SOC, SOC, Op_RegVMask, 2, p2->as_VMReg()); + reg_def P3 (SOC, SOC, Op_RegVMask, 3, p3->as_VMReg()); + reg_def P4 (SOC, SOC, Op_RegVMask, 4, p4->as_VMReg()); + reg_def P5 (SOC, SOC, Op_RegVMask, 5, p5->as_VMReg()); + reg_def P6 (SOC, SOC, Op_RegVMask, 6, p6->as_VMReg()); + reg_def P7 (SOC, SOC, Op_RegVMask, 7, p7->as_VMReg()); // ---------------------------- // Special Registers @@ -381,50 +536,64 @@ 
alloc_class chunk0( R29, R29_H, // fp R30, R30_H, // lr R31, R31_H, // sp + R8, R8_H, // rscratch1 + R9, R9_H, // rscratch2 ); alloc_class chunk1( // no save - V16, V16_H, V16_J, V16_K, - V17, V17_H, V17_J, V17_K, - V18, V18_H, V18_J, V18_K, - V19, V19_H, V19_J, V19_K, - V20, V20_H, V20_J, V20_K, - V21, V21_H, V21_J, V21_K, - V22, V22_H, V22_J, V22_K, - V23, V23_H, V23_J, V23_K, - V24, V24_H, V24_J, V24_K, - V25, V25_H, V25_J, V25_K, - V26, V26_H, V26_J, V26_K, - V27, V27_H, V27_J, V27_K, - V28, V28_H, V28_J, V28_K, - V29, V29_H, V29_J, V29_K, - V30, V30_H, V30_J, V30_K, - V31, V31_H, V31_J, V31_K, + V16, V16_H, V16_J, V16_K, V16_L, V16_M, V16_N, V16_O, + V17, V17_H, V17_J, V17_K, V17_L, V17_M, V17_N, V17_O, + V18, V18_H, V18_J, V18_K, V18_L, V18_M, V18_N, V18_O, + V19, V19_H, V19_J, V19_K, V19_L, V19_M, V19_N, V19_O, + V20, V20_H, V20_J, V20_K, V20_L, V20_M, V20_N, V20_O, + V21, V21_H, V21_J, V21_K, V21_L, V21_M, V21_N, V21_O, + V22, V22_H, V22_J, V22_K, V22_L, V22_M, V22_N, V22_O, + V23, V23_H, V23_J, V23_K, V23_L, V23_M, V23_N, V23_O, + V24, V24_H, V24_J, V24_K, V24_L, V24_M, V24_N, V24_O, + V25, V25_H, V25_J, V25_K, V25_L, V25_M, V25_N, V25_O, + V26, V26_H, V26_J, V26_K, V26_L, V26_M, V26_N, V26_O, + V27, V27_H, V27_J, V27_K, V27_L, V27_M, V27_N, V27_O, + V28, V28_H, V28_J, V28_K, V28_L, V28_M, V28_N, V28_O, + V29, V29_H, V29_J, V29_K, V29_L, V29_M, V29_N, V29_O, + V30, V30_H, V30_J, V30_K, V30_L, V30_M, V30_N, V30_O, + V31, V31_H, V31_J, V31_K, V31_L, V31_M, V31_N, V31_O, // arg registers - V0, V0_H, V0_J, V0_K, - V1, V1_H, V1_J, V1_K, - V2, V2_H, V2_J, V2_K, - V3, V3_H, V3_J, V3_K, - V4, V4_H, V4_J, V4_K, - V5, V5_H, V5_J, V5_K, - V6, V6_H, V6_J, V6_K, - V7, V7_H, V7_J, V7_K, + V0, V0_H, V0_J, V0_K, V0_L, V0_M, V0_N, V0_O, + V1, V1_H, V1_J, V1_K, V1_L, V1_M, V1_N, V1_O, + V2, V2_H, V2_J, V2_K, V2_L, V2_M, V2_N, V2_O, + V3, V3_H, V3_J, V3_K, V3_L, V3_M, V3_N, V3_O, + V4, V4_H, V4_J, V4_K, V4_L, V4_M, V4_N, V4_O, + V5, V5_H, V5_J, V5_K, V5_L, V5_M, V5_N, V5_O, + V6, V6_H, V6_J, V6_K, V6_L, V6_M, V6_N, V6_O, + V7, V7_H, V7_J, V7_K, V7_L, V7_M, V7_N, V7_O, // non-volatiles - V8, V8_H, V8_J, V8_K, - V9, V9_H, V9_J, V9_K, - V10, V10_H, V10_J, V10_K, - V11, V11_H, V11_J, V11_K, - V12, V12_H, V12_J, V12_K, - V13, V13_H, V13_J, V13_K, - V14, V14_H, V14_J, V14_K, - V15, V15_H, V15_J, V15_K, + V8, V8_H, V8_J, V8_K, V8_L, V8_M, V8_N, V8_O, + V9, V9_H, V9_J, V9_K, V9_L, V9_M, V9_N, V9_O, + V10, V10_H, V10_J, V10_K, V10_L, V10_M, V10_N, V10_O, + V11, V11_H, V11_J, V11_K, V11_L, V11_M, V11_N, V11_O, + V12, V12_H, V12_J, V12_K, V12_L, V12_M, V12_N, V12_O, + V13, V13_H, V13_J, V13_K, V13_L, V13_M, V13_N, V13_O, + V14, V14_H, V14_J, V14_K, V14_L, V14_M, V14_N, V14_O, + V15, V15_H, V15_J, V15_K, V15_L, V15_M, V15_N, V15_O, ); -alloc_class chunk2(RFLAGS); +alloc_class chunk2 ( + P0, + P1, + P2, + P3, + P4, + P5, + P6, + P7, + // Only use P0~P7 here for performance +); + +alloc_class chunk3(RFLAGS); //----------Architecture Description Register Classes-------------------------- // Several register classes are automatically defined based upon information in @@ -865,6 +1034,42 @@ reg_class double_reg( V31, V31_H ); +// Class for all SVE vector registers. 
+reg_class vectora_reg ( + V0, V0_H, V0_J, V0_K, V0_L, V0_M, V0_N, V0_O, + V1, V1_H, V1_J, V1_K, V1_L, V1_M, V1_N, V1_O, + V2, V2_H, V2_J, V2_K, V2_L, V2_M, V2_N, V2_O, + V3, V3_H, V3_J, V3_K, V3_L, V3_M, V3_N, V3_O, + V4, V4_H, V4_J, V4_K, V4_L, V4_M, V4_N, V4_O, + V5, V5_H, V5_J, V5_K, V5_L, V5_M, V5_N, V5_O, + V6, V6_H, V6_J, V6_K, V6_L, V6_M, V6_N, V6_O, + V7, V7_H, V7_J, V7_K, V7_L, V7_M, V7_N, V7_O, + V8, V8_H, V8_J, V8_K, V8_L, V8_M, V8_N, V8_O, + V9, V9_H, V9_J, V9_K, V9_L, V9_M, V9_N, V9_O, + V10, V10_H, V10_J, V10_K, V10_L, V10_M, V10_N, V10_O, + V11, V11_H, V11_J, V11_K, V11_L, V11_M, V11_N, V11_O, + V12, V12_H, V12_J, V12_K, V12_L, V12_M, V12_N, V12_O, + V13, V13_H, V13_J, V13_K, V13_L, V13_M, V13_N, V13_O, + V14, V14_H, V14_J, V14_K, V14_L, V14_M, V14_N, V14_O, + V15, V15_H, V15_J, V15_K, V15_L, V15_M, V15_N, V15_O, + V16, V16_H, V16_J, V16_K, V16_L, V16_M, V16_N, V16_O, + V17, V17_H, V17_J, V17_K, V17_L, V17_M, V17_N, V17_O, + V18, V18_H, V18_J, V18_K, V18_L, V18_M, V18_N, V18_O, + V19, V19_H, V19_J, V19_K, V19_L, V19_M, V19_N, V19_O, + V20, V20_H, V20_J, V20_K, V20_L, V20_M, V20_N, V20_O, + V21, V21_H, V21_J, V21_K, V21_L, V21_M, V21_N, V21_O, + V22, V22_H, V22_J, V22_K, V22_L, V22_M, V22_N, V22_O, + V23, V23_H, V23_J, V23_K, V23_L, V23_M, V23_N, V23_O, + V24, V24_H, V24_J, V24_K, V24_L, V24_M, V24_N, V24_O, + V25, V25_H, V25_J, V25_K, V25_L, V25_M, V25_N, V25_O, + V26, V26_H, V26_J, V26_K, V26_L, V26_M, V26_N, V26_O, + V27, V27_H, V27_J, V27_K, V27_L, V27_M, V27_N, V27_O, + V28, V28_H, V28_J, V28_K, V28_L, V28_M, V28_N, V28_O, + V29, V29_H, V29_J, V29_K, V29_L, V29_M, V29_N, V29_O, + V30, V30_H, V30_J, V30_K, V30_L, V30_M, V30_N, V30_O, + V31, V31_H, V31_J, V31_K, V31_L, V31_M, V31_N, V31_O, +); + // Class for all 64bit vector registers reg_class vectord_reg( V0, V0_H, @@ -1097,6 +1302,31 @@ reg_class v31_reg( V31, V31_H ); +// Class for all SVE predicate registers. +reg_class pr_reg ( + P0, + P1, + P2, + P3, + P4, + P5, + P6, + // P7, non-allocatable, preserved with all elements preset to TRUE. +); + +// Class for SVE governing predicate registers, which are used +// to determine the active elements of a predicated instruction. +reg_class gov_pr ( + P0, + P1, + P2, + P3, + P4, + P5, + P6, + // P7, non-allocatable, preserved with all elements preset to TRUE. +); + // Singleton class for condition codes reg_class int_flags(RFLAGS); @@ -1761,6 +1991,10 @@ void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { // branch if we need to invalidate the method later __ nop(); + if (UseSVE > 0 && C->max_vector_size() >= 16) { + __ reinitialize_ptrue(); + } + int bangsize = C->bang_size_in_bytes(); if (C->need_stack_bang(bangsize) && UseStackBanging) __ generate_stack_overflow_check(bangsize); @@ -1862,7 +2096,7 @@ int MachEpilogNode::safepoint_offset() const { // Figure out which register class each belongs in: rc_int, rc_float or // rc_stack. 
-enum RC { rc_bad, rc_int, rc_float, rc_stack }; +enum RC { rc_bad, rc_int, rc_float, rc_predicate, rc_stack }; static enum RC rc_class(OptoReg::Name reg) { @@ -1870,20 +2104,25 @@ static enum RC rc_class(OptoReg::Name reg) { return rc_bad; } - // we have 30 int registers * 2 halves - // (rscratch1 and rscratch2 are omitted) - int slots_of_int_registers = RegisterImpl::max_slots_per_register * (RegisterImpl::number_of_registers - 2); + // we have 32 int registers * 2 halves + int slots_of_int_registers = RegisterImpl::max_slots_per_register * RegisterImpl::number_of_registers; if (reg < slots_of_int_registers) { return rc_int; } - // we have 32 float register * 4 halves - if (reg < slots_of_int_registers + FloatRegisterImpl::max_slots_per_register * FloatRegisterImpl::number_of_registers) { + // we have 32 float register * 8 halves + int slots_of_float_registers = FloatRegisterImpl::max_slots_per_register * FloatRegisterImpl::number_of_registers; + if (reg < slots_of_int_registers + slots_of_float_registers) { return rc_float; } - // Between float regs & stack is the flags regs. + int slots_of_predicate_registers = PRegisterImpl::max_slots_per_register * PRegisterImpl::number_of_registers; + if (reg < slots_of_int_registers + slots_of_float_registers + slots_of_predicate_registers) { + return rc_predicate; + } + + // Between predicate regs & stack is the flags. assert(OptoReg::is_stack(reg), "blow up if spilling flags"); return rc_stack; @@ -1922,8 +2161,28 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo if (bottom_type()->isa_vect() != NULL) { uint ireg = ideal_reg(); - assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector"); - if (cbuf) { + if (ireg == Op_VecA && cbuf) { + MacroAssembler _masm(cbuf); + int sve_vector_reg_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); + if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { + // stack->stack + __ spill_copy_sve_vector_stack_to_stack(src_offset, dst_offset, + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) { + __ spill_sve_vector(as_FloatRegister(Matcher::_regEncode[src_lo]), ra_->reg2offset(dst_lo), + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) { + __ unspill_sve_vector(as_FloatRegister(Matcher::_regEncode[dst_lo]), ra_->reg2offset(src_lo), + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) { + __ sve_orr(as_FloatRegister(Matcher::_regEncode[dst_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo])); + } else { + ShouldNotReachHere(); + } + } else if (cbuf) { + assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector"); MacroAssembler _masm(cbuf); assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity"); if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { @@ -1941,12 +2200,12 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo as_FloatRegister(Matcher::_regEncode[src_lo])); } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) { __ spill(as_FloatRegister(Matcher::_regEncode[src_lo]), - ireg == Op_VecD ? __ D : __ Q, - ra_->reg2offset(dst_lo)); + ireg == Op_VecD ? __ D : __ Q, + ra_->reg2offset(dst_lo)); } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) { __ unspill(as_FloatRegister(Matcher::_regEncode[dst_lo]), - ireg == Op_VecD ? __ D : __ Q, - ra_->reg2offset(src_lo)); + ireg == Op_VecD ? 
__ D : __ Q, + ra_->reg2offset(src_lo)); } else { ShouldNotReachHere(); } @@ -2031,9 +2290,24 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo st->print("%s", Matcher::regName[dst_lo]); } if (bottom_type()->isa_vect() != NULL) { - st->print("\t# vector spill size = %d", ideal_reg()==Op_VecD ? 64:128); + int vsize = 0; + switch (ideal_reg()) { + case Op_VecD: + vsize = 64; + break; + case Op_VecX: + vsize = 128; + break; + case Op_VecA: + vsize = Matcher::scalable_vector_reg_size(T_BYTE) * 8; + break; + default: + assert(false, "bad register type for spill"); + ShouldNotReachHere(); + } + st->print("\t# vector spill size = %d", vsize); } else { - st->print("\t# spill size = %d", is64 ? 64:32); + st->print("\t# spill size = %d", is64 ? 64 : 32); } } @@ -2192,19 +2466,32 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { - - // TODO - // identify extra cases that we might want to provide match rules for - // e.g. Op_ vector nodes and other intrinsics while guarding with vlen - bool ret_value = match_rule_supported(opcode); - // Add rules here. - - return ret_value; // Per default match rules are supported. + // Identify extra cases that we might want to provide match rules for vector nodes and + // other intrinsics guarded with vector length (vlen) and element type (bt). + const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { + if (!match_rule_supported(opcode) || !vector_size_supported(bt, vlen)) { + return false; + } + int bit_size = vlen * type2aelembytes(bt) * 8; + if (UseSVE == 0 && bit_size > 128) { + return false; + } + if (UseSVE > 0) { + return op_sve_supported(opcode); + } else { // NEON + // Special cases + switch (opcode) { + case Op_MulVL: + return false; + default: + break; + } + } + return true; // Per default match rules are supported. } const bool Matcher::has_predicated_vectors(void) { - return false; + return UseSVE > 0; } const int Matcher::float_pressure(int default_pressure_threshold) { @@ -2240,7 +2527,8 @@ const bool Matcher::convL2FSupported(void) { // Vector width in bytes. const int Matcher::vector_width_in_bytes(BasicType bt) { - int size = MIN2(16,(int)MaxVectorSize); + // The MaxVectorSize should have been set by detecting SVE max vector register size. + int size = MIN2((UseSVE > 0) ? 256 : 16, (int)MaxVectorSize); // Minimum 2 values in vector if (size < 2*type2aelembytes(bt)) size = 0; // But never < 4 @@ -2253,14 +2541,32 @@ const int Matcher::max_vector_size(const BasicType bt) { return vector_width_in_bytes(bt)/type2aelembytes(bt); } const int Matcher::min_vector_size(const BasicType bt) { -// For the moment limit the vector size to 8 bytes + int max_size = max_vector_size(bt); + if ((UseSVE > 0) && (MaxVectorSize >= 16)) { + // Currently vector length less than SVE vector register size is not supported. + return max_size; + } else { + // For the moment limit the vector size to 8 bytes with NEON. int size = 8 / type2aelembytes(bt); if (size < 2) size = 2; return size; + } +} + +const bool Matcher::supports_scalable_vector() { + return UseSVE > 0; +} + +// Actual max scalable vector register length. +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return Matcher::max_vector_size(bt); } // Vector ideal reg. 
const uint Matcher::vector_ideal_reg(int len) { + if (UseSVE > 0 && 16 <= len && len <= 256) { + return Op_VecA; + } switch(len) { case 8: return Op_VecD; case 16: return Op_VecX; @@ -2270,6 +2576,9 @@ const uint Matcher::vector_ideal_reg(int len) { } const uint Matcher::vector_shift_count_ideal_reg(int size) { + if (UseSVE > 0 && 16 <= size && size <= 256) { + return Op_VecA; + } switch(size) { case 8: return Op_VecD; case 16: return Op_VecX; @@ -3419,6 +3728,11 @@ encode %{ if (call == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; + } else if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + // Only non uncommon_trap calls need to reinitialize ptrue. + if (uncommon_trap_request() == 0) { + __ reinitialize_ptrue(); + } } // Emit stub for static call address stub = CompiledStaticCall::emit_to_interp_stub(cbuf); @@ -3436,6 +3750,8 @@ encode %{ if (call == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; + } else if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ reinitialize_ptrue(); } %} @@ -3472,6 +3788,9 @@ encode %{ __ bind(retaddr); __ add(sp, sp, 2 * wordSize); } + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ reinitialize_ptrue(); + } %} enc_class aarch64_enc_rethrow() %{ @@ -3481,6 +3800,11 @@ encode %{ enc_class aarch64_enc_ret() %{ MacroAssembler _masm(&cbuf); +#ifdef ASSERT + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ verify_ptrue(); + } +#endif __ ret(lr); %} @@ -4222,6 +4546,41 @@ operand immLoffset16() interface(CONST_INTER); %} +// 8 bit signed value. +operand immI8() +%{ + predicate(n->get_int() <= 127 && n->get_int() >= -128); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// 8 bit signed value (simm8), or #simm8 LSL 8. +operand immI8_shift8() +%{ + predicate((n->get_int() <= 127 && n->get_int() >= -128) || + (n->get_int() <= 32512 && n->get_int() >= -32768 && (n->get_int() & 0xff) == 0)); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// 8 bit signed value (simm8), or #simm8 LSL 8. +operand immL8_shift8() +%{ + predicate((n->get_long() <= 127 && n->get_long() >= -128) || + (n->get_long() <= 32512 && n->get_long() >= -32768 && (n->get_long() & 0xff) == 0)); + match(ConL); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + // 32 bit integer valid for add sub immediate operand immIAddSub() %{ @@ -4851,6 +5210,18 @@ operand vRegD() interface(REG_INTER); %} +// Generic vector class. This will be used for +// all vector operands, including NEON and SVE, +// but currently only used for SVE VecA. 
+operand vReg() +%{ + constraint(ALLOC_IN_RC(vectora_reg)); + match(VecA); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + operand vecD() %{ constraint(ALLOC_IN_RC(vectord_reg)); @@ -5159,6 +5530,15 @@ operand vRegD_V31() interface(REG_INTER); %} +operand pRegGov() +%{ + constraint(ALLOC_IN_RC(gov_pr)); + match(RegVMask); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + // Flags register, used as output of signed compare instructions // note that on AArch64 we also use this register as the output for @@ -15745,7 +16125,7 @@ instruct loadV8(vecD dst, vmem8 mem) // Load Vector (128 bits) instruct loadV16(vecX dst, vmem16 mem) %{ - predicate(n->as_LoadVector()->memory_size() == 16); + predicate(UseSVE == 0 && n->as_LoadVector()->memory_size() == 16); match(Set dst (LoadVector mem)); ins_cost(4 * INSN_COST); format %{ "ldrq $dst,$mem\t# vector (128 bits)" %} @@ -15801,7 +16181,7 @@ instruct replicate8B(vecD dst, iRegIorL2I src) instruct replicate16B(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 16); + predicate(UseSVE == 0 && n->as_Vector()->length() == 16); match(Set dst (ReplicateB src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (16B)" %} @@ -15826,7 +16206,7 @@ instruct replicate8B_imm(vecD dst, immI con) instruct replicate16B_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 16); + predicate(UseSVE == 0 && n->as_Vector()->length() == 16); match(Set dst (ReplicateB con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(16B)" %} @@ -15851,7 +16231,7 @@ instruct replicate4S(vecD dst, iRegIorL2I src) instruct replicate8S(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 8); + predicate(UseSVE == 0 && n->as_Vector()->length() == 8); match(Set dst (ReplicateS src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (8S)" %} @@ -15876,7 +16256,7 @@ instruct replicate4S_imm(vecD dst, immI con) instruct replicate8S_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 8); + predicate(UseSVE == 0 && n->as_Vector()->length() == 8); match(Set dst (ReplicateS con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(8H)" %} @@ -15900,7 +16280,7 @@ instruct replicate2I(vecD dst, iRegIorL2I src) instruct replicate4I(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateI src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (4I)" %} @@ -15924,7 +16304,7 @@ instruct replicate2I_imm(vecD dst, immI con) instruct replicate4I_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateI con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(4I)" %} @@ -15936,7 +16316,7 @@ instruct replicate4I_imm(vecX dst, immI con) instruct replicate2L(vecX dst, iRegL src) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateL src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (2L)" %} @@ -15948,7 +16328,7 @@ instruct replicate2L(vecX dst, iRegL src) instruct replicate2L_zero(vecX dst, immI0 zero) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateI zero)); ins_cost(INSN_COST); format %{ "movi $dst, $zero\t# vector(4I)" %} @@ -15975,7 +16355,7 @@ instruct replicate2F(vecD dst, vRegF src) instruct replicate4F(vecX 
dst, vRegF src) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateF src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (4F)" %} @@ -15988,7 +16368,7 @@ instruct replicate4F(vecX dst, vRegF src) instruct replicate2D(vecX dst, vRegD src) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateD src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (2D)" %} diff --git a/src/hotspot/cpu/aarch64/aarch64_sve.ad b/src/hotspot/cpu/aarch64/aarch64_sve.ad new file mode 100644 index 000000000..8d80cb37a --- /dev/null +++ b/src/hotspot/cpu/aarch64/aarch64_sve.ad @@ -0,0 +1,1366 @@ +// +// Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, Arm Limited. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// +// + +// This file is automatically generated by running "m4 aarch64_sve_ad.m4". 
Do not edit ---- + +// AArch64 SVE Architecture Description File + + +// 4 bit signed offset -- for predicated load/store + +operand vmemA_immIOffset4() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_int(), 4, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand vmemA_immLOffset4() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_long(), 4, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(ConL); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + + +operand vmemA_indOffI4(iRegP reg, vmemA_immIOffset4 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xffffffff); + scale(0x0); + disp($off); + %} +%} + +operand vmemA_indOffL4(iRegP reg, vmemA_immLOffset4 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xffffffff); + scale(0x0); + disp($off); + %} +%} + +opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4); + +source_hpp %{ + bool op_sve_supported(int opcode); +%} + +source %{ + + static inline BasicType vector_element_basic_type(const MachNode* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) { + int def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + const TypeVect* vt = def->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + typedef void (MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T, + PRegister Pg, const Address &adr); + + // Predicated load/store, with optional ptrue to all elements of given predicate register. + static void loadStoreA_predicate(MacroAssembler masm, bool is_store, + FloatRegister reg, PRegister pg, BasicType bt, + int opcode, Register base, int index, int size, int disp) { + sve_mem_insn_predicate insn = NULL; + Assembler::SIMD_RegVariant type = Assembler::B; + int esize = type2aelembytes(bt); + if (index == -1) { + assert(size == 0, "unsupported address mode: scale size = %d", size); + switch(esize) { + case 1: + insn = is_store ? &MacroAssembler::sve_st1b : &MacroAssembler::sve_ld1b; + type = Assembler::B; + break; + case 2: + insn = is_store ? &MacroAssembler::sve_st1h : &MacroAssembler::sve_ld1h; + type = Assembler::H; + break; + case 4: + insn = is_store ? &MacroAssembler::sve_st1w : &MacroAssembler::sve_ld1w; + type = Assembler::S; + break; + case 8: + insn = is_store ? 
&MacroAssembler::sve_st1d : &MacroAssembler::sve_ld1d; + type = Assembler::D; + break; + default: + assert(false, "unsupported"); + ShouldNotReachHere(); + } + (masm.*insn)(reg, type, pg, Address(base, disp / Matcher::scalable_vector_reg_size(T_BYTE))); + } else { + assert(false, "unimplemented"); + ShouldNotReachHere(); + } + } + + bool op_sve_supported(int opcode) { + switch (opcode) { + // No multiply reduction instructions + case Op_MulReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVI: + case Op_MulReductionVL: + // Others + case Op_Extract: + case Op_ExtractB: + case Op_ExtractC: + case Op_ExtractD: + case Op_ExtractF: + case Op_ExtractI: + case Op_ExtractL: + case Op_ExtractS: + case Op_ExtractUB: + return false; + default: + return true; + } + } + +%} + +definitions %{ + int_def SVE_COST (200, 200); +%} + + + + +// All SVE instructions + +// vector load/store + +// Use predicated vector load/store +instruct loadV(vReg dst, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_LoadVector()->memory_size() >= 16); + match(Set dst (LoadVector mem)); + ins_cost(SVE_COST); + format %{ "sve_ldr $dst, $mem\t # vector (sve)" %} + ins_encode %{ + FloatRegister dst_reg = as_FloatRegister($dst$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), false, dst_reg, ptrue, + vector_element_basic_type(this), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +instruct storeV(vReg src, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_StoreVector()->memory_size() >= 16); + match(Set mem (StoreVector mem src)); + ins_cost(SVE_COST); + format %{ "sve_str $mem, $src\t # vector (sve)" %} + ins_encode %{ + FloatRegister src_reg = as_FloatRegister($src$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), true, src_reg, ptrue, + vector_element_basic_type(this, $src), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +// vector add + +instruct vaddB(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (AddVB src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddS(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (AddVS src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddI(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (AddVI src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddL(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (AddVL src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); 
+%} + +instruct vaddF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (AddVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fadd $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fadd(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (AddVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fadd $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fadd(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector and + +instruct vand(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (AndV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_and $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_and(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector or + +instruct vor(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (OrV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_orr $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_orr(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector xor + +instruct vxor(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (XorV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_eor $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_eor(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector float div + +instruct vdivF(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (DivVF dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vdivD(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (DivVD dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fmla + +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD dst_src1 (Binary src2 src3))); + 
ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fmls + +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF dst_src1 (Binary (NegVF src2) src3))); + match(Set dst_src1 (FmaVF dst_src1 (Binary src2 (NegVF src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD dst_src1 (Binary (NegVD src2) src3))); + match(Set dst_src1 (FmaVD dst_src1 (Binary src2 (NegVD src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fnmla + +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary (NegVF src2) src3))); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 (NegVF src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary (NegVD src2) src3))); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 (NegVD src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fnmls + +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ 
"sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector mla + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaS(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (AddVS dst_src1 (MulVS src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaI(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (AddVI dst_src1 (MulVI src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaL(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (AddVL dst_src1 (MulVL src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector mls + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsS(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (SubVS dst_src1 (MulVS src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsI(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (SubVI dst_src1 (MulVI src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsL(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (SubVL dst_src1 (MulVL src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + + +// vector mul + +instruct vmulS(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (MulVS dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulI(vReg dst_src1, vReg 
src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (MulVI dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulL(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (MulVL dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (MulVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmul $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmul(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (MulVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmul $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmul(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fneg + +instruct vnegF(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (NegVF src)); + ins_cost(SVE_COST); + format %{ "sve_fneg $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fneg(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vnegD(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (NegVD src)); + ins_cost(SVE_COST); + format %{ "sve_fneg $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fneg(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// popcount vector + +instruct vpopcountI(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (PopCountVI src)); + format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %} + ins_encode %{ + __ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector add reduction + +instruct reduce_addI(iRegINoSp dst, iRegIorL2I src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + (n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT)); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) (S)\n\t" + "umov $dst, $tmp, S, 0\n\t" + "addw $dst, $dst, $src1\t # add reduction S" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ S, 0); + __ addw($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addL(iRegLNoSp dst, iRegL src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + 
(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG)); + match(Set dst (AddReductionVL src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) (D)\n\t" + "umov $dst, $tmp, D, 0\n\t" + "add $dst, $dst, $src1\t # add reduction D" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ D, 0); + __ add($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addF(vRegF src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst (AddReductionVF src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addD(vRegD src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst (AddReductionVD src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector replicate + +instruct replicateB(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (ReplicateB src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateS(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (ReplicateS src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateI(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateI src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateL(vReg dst, iRegL src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateL src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + + +instruct replicateB_imm8(vReg dst, immI8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (ReplicateB con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateS_imm8(vReg dst, immI8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (ReplicateS con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateI_imm8(vReg 
dst, immI8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateI con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateL_imm8(vReg dst, immL8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateL con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + + +instruct replicateF(vReg dst, vRegF src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateF src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateD(vReg dst, vRegD src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateD src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector shift + +instruct vasrB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (RShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (RShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (RShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (RShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (LShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (LShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + 
ins_pipe(pipe_slow); +%} + +instruct vlslI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (LShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (LShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (URShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (URShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (URShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (URShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (RShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) con = 7; + __ sve_asr(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (RShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 16) con = 15; + __ sve_asr(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (RShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, 
$src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_asr(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (RShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_asr(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (URShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (URShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (URShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (URShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (LShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# 
vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsl(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (LShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsl(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (LShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + __ sve_lsl(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (LShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + __ sve_lsl(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntB(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16 && + (n->bottom_type()->is_vect()->element_basic_type() == T_BYTE)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntS(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8 && + (n->bottom_type()->is_vect()->element_basic_type() == T_SHORT || + (n->bottom_type()->is_vect()->element_basic_type() == T_CHAR))); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntI(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4 && + (n->bottom_type()->is_vect()->element_basic_type() == T_INT)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntL(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + (n->bottom_type()->is_vect()->element_basic_type() == T_LONG)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector sqrt + +instruct vsqrtF(vReg dst, vReg src) %{ 
+ predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (SqrtVF src)); + ins_cost(SVE_COST); + format %{ "sve_fsqrt $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fsqrt(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsqrtD(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (SqrtVD src)); + ins_cost(SVE_COST); + format %{ "sve_fsqrt $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fsqrt(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector sub + +instruct vsubB(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (SubVB src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubS(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (SubVS src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubI(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (SubVI src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubL(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (SubVL src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (SubVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fsub $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fsub(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (SubVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fsub $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fsub(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} diff --git a/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 new file mode 100644 index 000000000..0323f2f8c --- /dev/null +++ b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 @@ -0,0 +1,727 @@ +// +// Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, Arm Limited. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// +// + +dnl Generate the warning +// This file is automatically generated by running "m4 aarch64_sve_ad.m4". Do not edit ---- +dnl + +// AArch64 SVE Architecture Description File + +dnl +dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET($1, $2, $3 ) +dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET(imm_type_abbr, imm_type, imm_len) +define(`OPERAND_VMEMORYA_IMMEDIATE_OFFSET', ` +operand vmemA_imm$1Offset$3() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_$2(), $3, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(Con$1); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%}') +dnl +// 4 bit signed offset -- for predicated load/store +OPERAND_VMEMORYA_IMMEDIATE_OFFSET(I, int, 4) +OPERAND_VMEMORYA_IMMEDIATE_OFFSET(L, long, 4) +dnl +dnl OPERAND_VMEMORYA_INDIRECT_OFFSET($1, $2 ) +dnl OPERAND_VMEMORYA_INDIRECT_OFFSET(imm_type_abbr, imm_len) +define(`OPERAND_VMEMORYA_INDIRECT_OFFSET', ` +operand vmemA_indOff$1$2(iRegP reg, vmemA_imm$1Offset$2 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + `index'(0xffffffff); + scale(0x0); + disp($off); + %} +%}') +dnl +OPERAND_VMEMORYA_INDIRECT_OFFSET(I, 4) +OPERAND_VMEMORYA_INDIRECT_OFFSET(L, 4) + +opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4); + +source_hpp %{ + bool op_sve_supported(int opcode); +%} + +source %{ + + static inline BasicType vector_element_basic_type(const MachNode* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) { + int def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + const TypeVect* vt = def->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + typedef void (MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T, + PRegister Pg, const Address &adr); + + // Predicated load/store, with optional ptrue to all elements of given predicate register. + static void loadStoreA_predicate(MacroAssembler masm, bool is_store, + FloatRegister reg, PRegister pg, BasicType bt, + int opcode, Register base, int index, int size, int disp) { + sve_mem_insn_predicate insn; + Assembler::SIMD_RegVariant type; + int esize = type2aelembytes(bt); + if (index == -1) { + assert(size == 0, "unsupported address mode: scale size = %d", size); + switch(esize) { + case 1: + insn = is_store ? &MacroAssembler::sve_st1b : &MacroAssembler::sve_ld1b; + type = Assembler::B; + break; + case 2: + insn = is_store ? 
&MacroAssembler::sve_st1h : &MacroAssembler::sve_ld1h; + type = Assembler::H; + break; + case 4: + insn = is_store ? &MacroAssembler::sve_st1w : &MacroAssembler::sve_ld1w; + type = Assembler::S; + break; + case 8: + insn = is_store ? &MacroAssembler::sve_st1d : &MacroAssembler::sve_ld1d; + type = Assembler::D; + break; + default: + assert(false, "unsupported"); + ShouldNotReachHere(); + } + (masm.*insn)(reg, type, pg, Address(base, disp / Matcher::scalable_vector_reg_size(T_BYTE))); + } else { + assert(false, "unimplemented"); + ShouldNotReachHere(); + } + } + + bool op_sve_supported(int opcode) { + switch (opcode) { + // No multiply reduction instructions + case Op_MulReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVI: + case Op_MulReductionVL: + // Others + case Op_Extract: + case Op_ExtractB: + case Op_ExtractC: + case Op_ExtractD: + case Op_ExtractF: + case Op_ExtractI: + case Op_ExtractL: + case Op_ExtractS: + case Op_ExtractUB: + return false; + default: + return true; + } + } + +%} + +definitions %{ + int_def SVE_COST (200, 200); +%} + + +dnl +dnl ELEMENT_SHORT_CHART($1, $2) +dnl ELEMENT_SHORT_CHART(etype, node) +define(`ELEMENT_SHORT_CHAR',`ifelse(`$1', `T_SHORT', + `($2->bottom_type()->is_vect()->element_basic_type() == T_SHORT || + ($2->bottom_type()->is_vect()->element_basic_type() == T_CHAR))', + `($2->bottom_type()->is_vect()->element_basic_type() == $1)')') +dnl + +// All SVE instructions + +// vector load/store + +// Use predicated vector load/store +instruct loadV(vReg dst, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_LoadVector()->memory_size() >= 16); + match(Set dst (LoadVector mem)); + ins_cost(SVE_COST); + format %{ "sve_ldr $dst, $mem\t # vector (sve)" %} + ins_encode %{ + FloatRegister dst_reg = as_FloatRegister($dst$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), false, dst_reg, ptrue, + vector_element_basic_type(this), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +instruct storeV(vReg src, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_StoreVector()->memory_size() >= 16); + match(Set mem (StoreVector mem src)); + ins_cost(SVE_COST); + format %{ "sve_str $mem, $src\t # vector (sve)" %} + ins_encode %{ + FloatRegister src_reg = as_FloatRegister($src$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), true, src_reg, ptrue, + vector_element_basic_type(this, $src), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +dnl +dnl UNARY_OP_TRUE_PREDICATE_ETYPE($1, $2, $3, $4, $5, %6 ) +dnl UNARY_OP_TRUE_PREDICATE_ETYPE(insn_name, op_name, element_type, size, min_vec_len, insn) +define(`UNARY_OP_TRUE_PREDICATE_ETYPE', ` +instruct $1(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5 && + n->bottom_type()->is_vect()->element_basic_type() == $3); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "$6 $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ $6(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +dnl +dnl BINARY_OP_UNPREDICATED($1, $2 $3, $4 $5 ) +dnl BINARY_OP_UNPREDICATED(insn_name, op_name, size, min_vec_len, insn) +define(`BINARY_OP_UNPREDICATED', ` +instruct $1(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 src1 src2)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src1, $src2\t # vector (sve) ($3)" %} + ins_encode %{ + __ 
$5(as_FloatRegister($dst$$reg), __ $3, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector add +BINARY_OP_UNPREDICATED(vaddB, AddVB, B, 16, sve_add) +BINARY_OP_UNPREDICATED(vaddS, AddVS, H, 8, sve_add) +BINARY_OP_UNPREDICATED(vaddI, AddVI, S, 4, sve_add) +BINARY_OP_UNPREDICATED(vaddL, AddVL, D, 2, sve_add) +BINARY_OP_UNPREDICATED(vaddF, AddVF, S, 4, sve_fadd) +BINARY_OP_UNPREDICATED(vaddD, AddVD, D, 2, sve_fadd) +dnl +dnl BINARY_OP_UNSIZED($1, $2, $3, $4 ) +dnl BINARY_OP_UNSIZED(insn_name, op_name, min_vec_len, insn) +define(`BINARY_OP_UNSIZED', ` +instruct $1(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= $3); + match(Set dst ($2 src1 src2)); + ins_cost(SVE_COST); + format %{ "$4 $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ $4(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector and +BINARY_OP_UNSIZED(vand, AndV, 16, sve_and) + +// vector or +BINARY_OP_UNSIZED(vor, OrV, 16, sve_orr) + +// vector xor +BINARY_OP_UNSIZED(vxor, XorV, 16, sve_eor) +dnl +dnl VDIVF($1, $2 , $3 ) +dnl VDIVF(name_suffix, size, min_vec_len) +define(`VDIVF', ` +instruct vdiv$1(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (DivV$1 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) ($2)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector float div +VDIVF(F, S, 4) +VDIVF(D, D, 2) + +dnl +dnl BINARY_OP_TRUE_PREDICATE_ETYPE($1, $2, $3, $4, $5, $6 ) +dnl BINARY_OP_TRUE_PREDICATE_ETYPE(insn_name, op_name, element_type, size, min_vec_len, insn) +define(`BINARY_OP_TRUE_PREDICATE_ETYPE', ` +instruct $1(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5 && + n->bottom_type()->is_vect()->element_basic_type() == $3); + match(Set dst_src1 ($2 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "$6 $dst_src1, $dst_src1, $src2\t # vector (sve) ($4)" %} + ins_encode %{ + __ $6(as_FloatRegister($dst_src1$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl + +dnl +dnl VFMLA($1 $2 $3 ) +dnl VFMLA(name_suffix, size, min_vec_len) +define(`VFMLA', ` +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmla$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fmla +VFMLA(F, S, 4) +VFMLA(D, D, 2) + +dnl +dnl VFMLS($1 $2 $3 ) +dnl VFMLS(name_suffix, size, min_vec_len) +define(`VFMLS', ` +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmls$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary (NegV$1 src2) src3))); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 (NegV$1 src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode 
%{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fmls +VFMLS(F, S, 4) +VFMLS(D, D, 2) + +dnl +dnl VFNMLA($1 $2 $3 ) +dnl VFNMLA(name_suffix, size, min_vec_len) +define(`VFNMLA', ` +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmla$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary (NegV$1 src2) src3))); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 (NegV$1 src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fnmla +VFNMLA(F, S, 4) +VFNMLA(D, D, 2) + +dnl +dnl VFNMLS($1 $2 $3 ) +dnl VFNMLS(name_suffix, size, min_vec_len) +define(`VFNMLS', ` +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmls$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fnmls +VFNMLS(F, S, 4) +VFNMLS(D, D, 2) + +dnl +dnl VMLA($1 $2 $3 ) +dnl VMLA(name_suffix, size, min_vec_len) +define(`VMLA', ` +// dst_src1 = dst_src1 + src2 * src3 +instruct vmla$1(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (AddV$1 dst_src1 (MulV$1 src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector mla +VMLA(B, B, 16) +VMLA(S, H, 8) +VMLA(I, S, 4) +VMLA(L, D, 2) + +dnl +dnl VMLS($1 $2 $3 ) +dnl VMLS(name_suffix, size, min_vec_len) +define(`VMLS', ` +// dst_src1 = dst_src1 - src2 * src3 +instruct vmls$1(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (SubV$1 dst_src1 (MulV$1 src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector mls +VMLS(B, B, 16) +VMLS(S, H, 8) +VMLS(I, S, 4) +VMLS(L, D, 2) + +dnl +dnl BINARY_OP_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl BINARY_OP_TRUE_PREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`BINARY_OP_TRUE_PREDICATE', ` +instruct $1(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst_src1 ($2 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "$5 $dst_src1, $dst_src1, $src2\t # vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst_src1$$reg), __ $3, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector mul 
+BINARY_OP_TRUE_PREDICATE(vmulS, MulVS, H, 8, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulI, MulVI, S, 4, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulL, MulVL, D, 2, sve_mul) +BINARY_OP_UNPREDICATED(vmulF, MulVF, S, 4, sve_fmul) +BINARY_OP_UNPREDICATED(vmulD, MulVD, D, 2, sve_fmul) + +dnl +dnl UNARY_OP_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl UNARY_OP_TRUE_PREDICATE(insn_name, op_name, size, min_vec_bytes, insn) +define(`UNARY_OP_TRUE_PREDICATE', ` +instruct $1(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= $4); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src\t# vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fneg +UNARY_OP_TRUE_PREDICATE(vnegF, NegVF, S, 16, sve_fneg) +UNARY_OP_TRUE_PREDICATE(vnegD, NegVD, D, 16, sve_fneg) + +// popcount vector + +instruct vpopcountI(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (PopCountVI src)); + format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %} + ins_encode %{ + __ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +dnl +dnl REDUCE_ADD($1, $2, $3, $4, $5, $6, $7 ) +dnl REDUCE_ADD(insn_name, op_name, reg_dst, reg_src, size, elem_type, insn1) +define(`REDUCE_ADD', ` +instruct $1($3 dst, $4 src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + ELEMENT_SHORT_CHAR($6, n->in(2))); + match(Set dst ($2 src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) ($5)\n\t" + "umov $dst, $tmp, $5, 0\n\t" + "$7 $dst, $dst, $src1\t # add reduction $5" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ $5, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ $5, 0); + __ $7($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl REDUCE_ADDF($1, $2, $3, $4 ) +dnl REDUCE_ADDF(insn_name, op_name, reg_dst, size) +define(`REDUCE_ADDF', ` +instruct $1($3 src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst ($2 src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector add reduction +REDUCE_ADD(reduce_addI, AddReductionVI, iRegINoSp, iRegIorL2I, S, T_INT, addw) +REDUCE_ADD(reduce_addL, AddReductionVL, iRegLNoSp, iRegL, D, T_LONG, add) +REDUCE_ADDF(reduce_addF, AddReductionVF, vRegF, S) +REDUCE_ADDF(reduce_addD, AddReductionVD, vRegD, D) + +dnl +dnl REDUCE_FMINMAX($1, $2, $3, $4, $5 ) +dnl REDUCE_FMINMAX(min_max, name_suffix, element_type, size, reg_src_dst) +define(`REDUCE_FMINMAX', ` +instruct reduce_$1$2($5 dst, $5 src1, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == $3 && + n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set dst (translit($1, `m', `M')ReductionV src1 src2)); + ins_cost(INSN_COST); + effect(TEMP_DEF dst); + format %{ "sve_f$1v $dst, $src2 # vector (sve) (S)\n\t" + "f$1s $dst, $dst, $src1\t # $1 reduction $2" %} + ins_encode %{ + __ 
sve_f$1v(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + __ f`$1'translit($4, `SD', `sd')(as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg), as_FloatRegister($src1$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +// vector max reduction +REDUCE_FMINMAX(max, F, T_FLOAT, S, vRegF) +REDUCE_FMINMAX(max, D, T_DOUBLE, D, vRegD) + +// vector min reduction +REDUCE_FMINMAX(min, F, T_FLOAT, S, vRegF) +REDUCE_FMINMAX(min, D, T_DOUBLE, D, vRegD) + +dnl +dnl REPLICATE($1, $2, $3, $4, $5 ) +dnl REPLICATE(insn_name, op_name, reg_src, size, min_vec_len) +define(`REPLICATE', ` +instruct $1(vReg dst, $3 src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $4, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl REPLICATE_IMM8($1, $2, $3, $4, $5 ) +dnl REPLICATE_IMM8(insn_name, op_name, imm_type, size, min_vec_len) +define(`REPLICATE_IMM8', ` +instruct $1(vReg dst, $3 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $4, $con$$constant); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl FREPLICATE($1, $2, $3, $4, $5 ) +dnl FREPLICATE(insn_name, op_name, reg_src, size, min_vec_len) +define(`FREPLICATE', ` +instruct $1(vReg dst, $3 src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector replicate +REPLICATE(replicateB, ReplicateB, iRegIorL2I, B, 16) +REPLICATE(replicateS, ReplicateS, iRegIorL2I, H, 8) +REPLICATE(replicateI, ReplicateI, iRegIorL2I, S, 4) +REPLICATE(replicateL, ReplicateL, iRegL, D, 2) + +REPLICATE_IMM8(replicateB_imm8, ReplicateB, immI8, B, 16) +REPLICATE_IMM8(replicateS_imm8, ReplicateS, immI8_shift8, H, 8) +REPLICATE_IMM8(replicateI_imm8, ReplicateI, immI8_shift8, S, 4) +REPLICATE_IMM8(replicateL_imm8, ReplicateL, immL8_shift8, D, 2) + +FREPLICATE(replicateF, ReplicateF, vRegF, S, 4) +FREPLICATE(replicateD, ReplicateD, vRegD, D, 2) +dnl +dnl VSHIFT_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl VSHIFT_TRUE_PREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`VSHIFT_TRUE_PREDICATE', ` +instruct $1(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 dst shift)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $dst, $shift\t# vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VSHIFT_IMM_UNPREDICATE($1, $2, $3, $4, $5 ) +dnl VSHIFT_IMM_UNPREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`VSHIFT_IMM_UNPREDICATE', ` +instruct $1(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 src shift)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src, $shift\t# vector (sve) ($3)" %} + ins_encode %{ + int con = (int)$shift$$constant;dnl +ifelse(eval(index(`$1', `vasr') == 0 || index(`$1', `vlsr') == 0), 1, ` + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), 
as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + }')dnl +ifelse(eval(index(`$1', `vasr') == 0), 1, `ifelse(eval(index(`$3', `B') == 0), 1, ` + if (con >= 8) con = 7;')ifelse(eval(index(`$3', `H') == 0), 1, ` + if (con >= 16) con = 15;')')dnl +ifelse(eval((index(`$1', `vlsl') == 0 || index(`$1', `vlsr') == 0) && (index(`$3', `B') == 0 || index(`$3', `H') == 0)), 1, ` + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + }') + __ $5(as_FloatRegister($dst$$reg), __ $3, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VSHIFT_COUNT($1, $2, $3, $4 ) +dnl VSHIFT_COUNT(insn_name, size, min_vec_len, type) +define(`VSHIFT_COUNT', ` +instruct $1(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3 && + ELEMENT_SHORT_CHAR($4, n)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) ($2)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $2, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector shift +VSHIFT_TRUE_PREDICATE(vasrB, RShiftVB, B, 16, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrS, RShiftVS, H, 8, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrI, RShiftVI, S, 4, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrL, RShiftVL, D, 2, sve_asr) +VSHIFT_TRUE_PREDICATE(vlslB, LShiftVB, B, 16, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslS, LShiftVS, H, 8, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslI, LShiftVI, S, 4, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslL, LShiftVL, D, 2, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlsrB, URShiftVB, B, 16, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrS, URShiftVS, H, 8, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrI, URShiftVI, S, 4, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrL, URShiftVL, D, 2, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vasrB_imm, RShiftVB, B, 16, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrS_imm, RShiftVS, H, 8, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrI_imm, RShiftVI, S, 4, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrL_imm, RShiftVL, D, 2, sve_asr) +VSHIFT_IMM_UNPREDICATE(vlsrB_imm, URShiftVB, B, 16, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrS_imm, URShiftVS, H, 8, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrI_imm, URShiftVI, S, 4, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrL_imm, URShiftVL, D, 2, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlslB_imm, LShiftVB, B, 16, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslS_imm, LShiftVS, H, 8, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslI_imm, LShiftVI, S, 4, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslL_imm, LShiftVL, D, 2, sve_lsl) +VSHIFT_COUNT(vshiftcntB, B, 16, T_BYTE) +VSHIFT_COUNT(vshiftcntS, H, 8, T_SHORT) +VSHIFT_COUNT(vshiftcntI, S, 4, T_INT) +VSHIFT_COUNT(vshiftcntL, D, 2, T_LONG) + +// vector sqrt +UNARY_OP_TRUE_PREDICATE(vsqrtF, SqrtVF, S, 16, sve_fsqrt) +UNARY_OP_TRUE_PREDICATE(vsqrtD, SqrtVD, D, 16, sve_fsqrt) + +// vector sub +BINARY_OP_UNPREDICATED(vsubB, SubVB, B, 16, sve_sub) +BINARY_OP_UNPREDICATED(vsubS, SubVS, H, 8, sve_sub) +BINARY_OP_UNPREDICATED(vsubI, SubVI, S, 4, sve_sub) +BINARY_OP_UNPREDICATED(vsubL, SubVL, D, 2, sve_sub) +BINARY_OP_UNPREDICATED(vsubF, SubVF, S, 4, sve_fsub) +BINARY_OP_UNPREDICATED(vsubD, SubVD, D, 2, sve_fsub) diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp index 2a17d8e0f..943d2a615 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp @@ -96,662 +96,662 @@ void entry(CodeBuffer *cb) { __ bind(back); // ArithOp - __ add(r15, r12, 
r16, Assembler::LSR, 30); // add x15, x12, x16, LSR #30 - __ sub(r1, r15, r3, Assembler::LSR, 32); // sub x1, x15, x3, LSR #32 - __ adds(r13, r25, r5, Assembler::LSL, 13); // adds x13, x25, x5, LSL #13 - __ subs(r22, r28, r6, Assembler::ASR, 17); // subs x22, x28, x6, ASR #17 - __ addw(r0, r9, r22, Assembler::ASR, 6); // add w0, w9, w22, ASR #6 - __ subw(r19, r3, r25, Assembler::LSL, 21); // sub w19, w3, w25, LSL #21 - __ addsw(r4, r19, r11, Assembler::LSL, 20); // adds w4, w19, w11, LSL #20 - __ subsw(r24, r7, r19, Assembler::ASR, 0); // subs w24, w7, w19, ASR #0 - __ andr(r30, r7, r11, Assembler::LSL, 48); // and x30, x7, x11, LSL #48 - __ orr(r24, r8, r15, Assembler::LSL, 12); // orr x24, x8, x15, LSL #12 - __ eor(r17, r9, r23, Assembler::LSL, 1); // eor x17, x9, x23, LSL #1 - __ ands(r14, r11, r4, Assembler::LSR, 55); // ands x14, x11, x4, LSR #55 - __ andw(r19, r7, r12, Assembler::LSR, 17); // and w19, w7, w12, LSR #17 - __ orrw(r19, r27, r11, Assembler::ASR, 28); // orr w19, w27, w11, ASR #28 - __ eorw(r30, r3, r22, Assembler::LSR, 31); // eor w30, w3, w22, LSR #31 - __ andsw(r19, r26, r28, Assembler::ASR, 0); // ands w19, w26, w28, ASR #0 - __ bic(r29, r6, r26, Assembler::LSL, 51); // bic x29, x6, x26, LSL #51 - __ orn(r26, r27, r17, Assembler::LSL, 35); // orn x26, x27, x17, LSL #35 - __ eon(r21, r4, r14, Assembler::LSL, 5); // eon x21, x4, x14, LSL #5 - __ bics(r2, r15, r0, Assembler::ASR, 5); // bics x2, x15, x0, ASR #5 - __ bicw(r2, r7, r2, Assembler::LSL, 29); // bic w2, w7, w2, LSL #29 - __ ornw(r24, r12, r21, Assembler::LSR, 5); // orn w24, w12, w21, LSR #5 - __ eonw(r30, r15, r19, Assembler::LSL, 2); // eon w30, w15, w19, LSL #2 - __ bicsw(r30, r23, r17, Assembler::ASR, 28); // bics w30, w23, w17, ASR #28 + __ add(r23, r1, r13, Assembler::LSR, 45); // add x23, x1, x13, LSR #45 + __ sub(r8, r30, r12, Assembler::ASR, 56); // sub x8, x30, x12, ASR #56 + __ adds(r27, r23, r14, Assembler::LSL, 54); // adds x27, x23, x14, LSL #54 + __ subs(r21, r15, r20, Assembler::LSR, 38); // subs x21, x15, x20, LSR #38 + __ addw(r25, r17, r4, Assembler::LSL, 3); // add w25, w17, w4, LSL #3 + __ subw(r29, r1, r9, Assembler::ASR, 20); // sub w29, w1, w9, ASR #20 + __ addsw(r10, r26, r9, Assembler::ASR, 9); // adds w10, w26, w9, ASR #9 + __ subsw(r21, r30, r7, Assembler::ASR, 3); // subs w21, w30, w7, ASR #3 + __ andr(r9, r8, r11, Assembler::LSR, 4); // and x9, x8, x11, LSR #4 + __ orr(r3, r18, r1, Assembler::ASR, 1); // orr x3, x18, x1, ASR #1 + __ eor(r10, r20, r2, Assembler::LSL, 27); // eor x10, x20, x2, LSL #27 + __ ands(r12, r9, r11, Assembler::ASR, 31); // ands x12, x9, x11, ASR #31 + __ andw(r20, r9, r30, Assembler::ASR, 26); // and w20, w9, w30, ASR #26 + __ orrw(r21, r10, r26, Assembler::ASR, 17); // orr w21, w10, w26, ASR #17 + __ eorw(r0, r8, r7, Assembler::ASR, 7); // eor w0, w8, w7, ASR #7 + __ andsw(r19, r11, r8, Assembler::LSL, 18); // ands w19, w11, w8, LSL #18 + __ bic(r23, r4, r3, Assembler::LSL, 53); // bic x23, x4, x3, LSL #53 + __ orn(r9, r6, r19, Assembler::LSL, 24); // orn x9, x6, x19, LSL #24 + __ eon(r12, r6, r26, Assembler::LSR, 54); // eon x12, x6, x26, LSR #54 + __ bics(r22, r19, r12, Assembler::LSL, 14); // bics x22, x19, x12, LSL #14 + __ bicw(r29, r13, r22, Assembler::LSL, 11); // bic w29, w13, w22, LSL #11 + __ ornw(r17, r30, r20, Assembler::ASR, 5); // orn w17, w30, w20, ASR #5 + __ eonw(r1, r29, r11, Assembler::LSL, 8); // eon w1, w29, w11, LSL #8 + __ bicsw(r4, r20, r6, Assembler::LSR, 29); // bics w4, w20, w6, LSR #29 // AddSubImmOp - __ addw(r4, r20, 660u); // 
add w4, w20, #660 - __ addsw(r2, r10, 710u); // adds w2, w10, #710 - __ subw(r19, r26, 244u); // sub w19, w26, #244 - __ subsw(r28, r13, 73u); // subs w28, w13, #73 - __ add(r2, r30, 862u); // add x2, x30, #862 - __ adds(r27, r16, 574u); // adds x27, x16, #574 - __ sub(r22, r9, 589u); // sub x22, x9, #589 - __ subs(r4, r1, 698u); // subs x4, x1, #698 + __ addw(r30, r6, 504u); // add w30, w6, #504 + __ addsw(r19, r8, 943u); // adds w19, w8, #943 + __ subw(r29, r10, 365u); // sub w29, w10, #365 + __ subsw(r4, r8, 284u); // subs w4, w8, #284 + __ add(r3, r14, 958u); // add x3, x14, #958 + __ adds(r22, r20, 167u); // adds x22, x20, #167 + __ sub(r27, r15, 725u); // sub x27, x15, #725 + __ subs(r24, r28, 947u); // subs x24, x28, #947 // LogicalImmOp - __ andw(r28, r19, 4294709247ull); // and w28, w19, #0xfffc0fff - __ orrw(r27, r5, 536870910ull); // orr w27, w5, #0x1ffffffe - __ eorw(r30, r20, 4294840319ull); // eor w30, w20, #0xfffe0fff - __ andsw(r22, r26, 4294959615ull); // ands w22, w26, #0xffffe1ff - __ andr(r5, r7, 4194300ull); // and x5, x7, #0x3ffffc - __ orr(r13, r7, 18014398509481728ull); // orr x13, x7, #0x3fffffffffff00 - __ eor(r7, r9, 18442240474082197503ull); // eor x7, x9, #0xfff0000000003fff - __ ands(r3, r0, 18374686479671656447ull); // ands x3, x0, #0xff00000000007fff + __ andw(r25, r25, 2139127680ull); // and w25, w25, #0x7f807f80 + __ orrw(r13, r26, 2097120ull); // orr w13, w26, #0x1fffe0 + __ eorw(r21, r13, 3758096384ull); // eor w21, w13, #0xe0000000 + __ andsw(r2, r3, 1073733632ull); // ands w2, w3, #0x3fffe000 + __ andr(r8, r10, 1125895612137471ull); // and x8, x10, #0x3ffff0003ffff + __ orr(r27, r16, 18444492273897963519ull); // orr x27, x16, #0xfff80000001fffff + __ eor(r27, r3, 4611685469745315712ull); // eor x27, x3, #0x3fffff803fffff80 + __ ands(r4, r23, 18446744056529698815ull); // ands x4, x23, #0xfffffffc00003fff // AbsOp - __ b(__ pc()); // b . - __ b(back); // b back - __ b(forth); // b forth - __ bl(__ pc()); // bl . - __ bl(back); // bl back - __ bl(forth); // bl forth + __ b(__ pc()); // b . + __ b(back); // b back + __ b(forth); // b forth + __ bl(__ pc()); // bl . + __ bl(back); // bl back + __ bl(forth); // bl forth // RegAndAbsOp - __ cbzw(r16, __ pc()); // cbz w16, . - __ cbzw(r16, back); // cbz w16, back - __ cbzw(r16, forth); // cbz w16, forth - __ cbnzw(r19, __ pc()); // cbnz w19, . - __ cbnzw(r19, back); // cbnz w19, back - __ cbnzw(r19, forth); // cbnz w19, forth - __ cbz(r5, __ pc()); // cbz x5, . - __ cbz(r5, back); // cbz x5, back - __ cbz(r5, forth); // cbz x5, forth - __ cbnz(r4, __ pc()); // cbnz x4, . - __ cbnz(r4, back); // cbnz x4, back - __ cbnz(r4, forth); // cbnz x4, forth - __ adr(r27, __ pc()); // adr x27, . - __ adr(r27, back); // adr x27, back - __ adr(r27, forth); // adr x27, forth - __ _adrp(r16, __ pc()); // adrp x16, . + __ cbzw(r3, __ pc()); // cbz w3, . + __ cbzw(r3, back); // cbz w3, back + __ cbzw(r3, forth); // cbz w3, forth + __ cbnzw(r2, __ pc()); // cbnz w2, . + __ cbnzw(r2, back); // cbnz w2, back + __ cbnzw(r2, forth); // cbnz w2, forth + __ cbz(r25, __ pc()); // cbz x25, . + __ cbz(r25, back); // cbz x25, back + __ cbz(r25, forth); // cbz x25, forth + __ cbnz(r18, __ pc()); // cbnz x18, . + __ cbnz(r18, back); // cbnz x18, back + __ cbnz(r18, forth); // cbnz x18, forth + __ adr(r8, __ pc()); // adr x8, . + __ adr(r8, back); // adr x8, back + __ adr(r8, forth); // adr x8, forth + __ _adrp(r15, __ pc()); // adrp x15, . // RegImmAbsOp - __ tbz(r28, 8, __ pc()); // tbz x28, #8, . 
- __ tbz(r28, 8, back); // tbz x28, #8, back - __ tbz(r28, 8, forth); // tbz x28, #8, forth - __ tbnz(r1, 1, __ pc()); // tbnz x1, #1, . - __ tbnz(r1, 1, back); // tbnz x1, #1, back - __ tbnz(r1, 1, forth); // tbnz x1, #1, forth + __ tbz(r18, 14, __ pc()); // tbz x18, #14, . + __ tbz(r18, 14, back); // tbz x18, #14, back + __ tbz(r18, 14, forth); // tbz x18, #14, forth + __ tbnz(r25, 15, __ pc()); // tbnz x25, #15, . + __ tbnz(r25, 15, back); // tbnz x25, #15, back + __ tbnz(r25, 15, forth); // tbnz x25, #15, forth // MoveWideImmOp - __ movnw(r20, 8639, 16); // movn w20, #8639, lsl 16 - __ movzw(r7, 25835, 0); // movz w7, #25835, lsl 0 - __ movkw(r17, 7261, 0); // movk w17, #7261, lsl 0 - __ movn(r14, 2097, 32); // movn x14, #2097, lsl 32 - __ movz(r9, 16082, 0); // movz x9, #16082, lsl 0 - __ movk(r19, 13962, 16); // movk x19, #13962, lsl 16 + __ movnw(r18, 4126, 16); // movn w18, #4126, lsl 16 + __ movzw(r30, 13712, 0); // movz w30, #13712, lsl 0 + __ movkw(r21, 13161, 16); // movk w21, #13161, lsl 16 + __ movn(r18, 28524, 48); // movn x18, #28524, lsl 48 + __ movz(r13, 30710, 48); // movz x13, #30710, lsl 48 + __ movk(r3, 31565, 48); // movk x3, #31565, lsl 48 // BitfieldOp - __ sbfm(r9, r22, 6, 22); // sbfm x9, x22, #6, #22 - __ bfmw(r19, r0, 11, 0); // bfm w19, w0, #11, #0 - __ ubfmw(r10, r19, 11, 19); // ubfm w10, w19, #11, #19 - __ sbfm(r4, r15, 5, 17); // sbfm x4, x15, #5, #17 - __ bfm(r3, r5, 19, 28); // bfm x3, x5, #19, #28 - __ ubfm(r12, r28, 17, 2); // ubfm x12, x28, #17, #2 + __ sbfm(r10, r1, 0, 3); // sbfm x10, x1, #0, #3 + __ bfmw(r12, r22, 5, 24); // bfm w12, w22, #5, #24 + __ ubfmw(r17, r3, 11, 8); // ubfm w17, w3, #11, #8 + __ sbfm(r0, r3, 11, 14); // sbfm x0, x3, #11, #14 + __ bfm(r28, r6, 7, 15); // bfm x28, x6, #7, #15 + __ ubfm(r9, r10, 1, 25); // ubfm x9, x10, #1, #25 // ExtractOp - __ extrw(r15, r0, r22, 3); // extr w15, w0, w22, #3 - __ extr(r6, r14, r14, 55); // extr x6, x14, x14, #55 + __ extrw(r21, r27, r25, 23); // extr w21, w27, w25, #23 + __ extr(r14, r17, r22, 17); // extr x14, x17, x22, #17 // CondBranchOp - __ br(Assembler::EQ, __ pc()); // b.EQ . - __ br(Assembler::EQ, back); // b.EQ back - __ br(Assembler::EQ, forth); // b.EQ forth - __ br(Assembler::NE, __ pc()); // b.NE . - __ br(Assembler::NE, back); // b.NE back - __ br(Assembler::NE, forth); // b.NE forth - __ br(Assembler::HS, __ pc()); // b.HS . - __ br(Assembler::HS, back); // b.HS back - __ br(Assembler::HS, forth); // b.HS forth - __ br(Assembler::CS, __ pc()); // b.CS . - __ br(Assembler::CS, back); // b.CS back - __ br(Assembler::CS, forth); // b.CS forth - __ br(Assembler::LO, __ pc()); // b.LO . - __ br(Assembler::LO, back); // b.LO back - __ br(Assembler::LO, forth); // b.LO forth - __ br(Assembler::CC, __ pc()); // b.CC . - __ br(Assembler::CC, back); // b.CC back - __ br(Assembler::CC, forth); // b.CC forth - __ br(Assembler::MI, __ pc()); // b.MI . - __ br(Assembler::MI, back); // b.MI back - __ br(Assembler::MI, forth); // b.MI forth - __ br(Assembler::PL, __ pc()); // b.PL . - __ br(Assembler::PL, back); // b.PL back - __ br(Assembler::PL, forth); // b.PL forth - __ br(Assembler::VS, __ pc()); // b.VS . - __ br(Assembler::VS, back); // b.VS back - __ br(Assembler::VS, forth); // b.VS forth - __ br(Assembler::VC, __ pc()); // b.VC . - __ br(Assembler::VC, back); // b.VC back - __ br(Assembler::VC, forth); // b.VC forth - __ br(Assembler::HI, __ pc()); // b.HI . 
- __ br(Assembler::HI, back); // b.HI back - __ br(Assembler::HI, forth); // b.HI forth - __ br(Assembler::LS, __ pc()); // b.LS . - __ br(Assembler::LS, back); // b.LS back - __ br(Assembler::LS, forth); // b.LS forth - __ br(Assembler::GE, __ pc()); // b.GE . - __ br(Assembler::GE, back); // b.GE back - __ br(Assembler::GE, forth); // b.GE forth - __ br(Assembler::LT, __ pc()); // b.LT . - __ br(Assembler::LT, back); // b.LT back - __ br(Assembler::LT, forth); // b.LT forth - __ br(Assembler::GT, __ pc()); // b.GT . - __ br(Assembler::GT, back); // b.GT back - __ br(Assembler::GT, forth); // b.GT forth - __ br(Assembler::LE, __ pc()); // b.LE . - __ br(Assembler::LE, back); // b.LE back - __ br(Assembler::LE, forth); // b.LE forth - __ br(Assembler::AL, __ pc()); // b.AL . - __ br(Assembler::AL, back); // b.AL back - __ br(Assembler::AL, forth); // b.AL forth - __ br(Assembler::NV, __ pc()); // b.NV . - __ br(Assembler::NV, back); // b.NV back - __ br(Assembler::NV, forth); // b.NV forth + __ br(Assembler::EQ, __ pc()); // b.EQ . + __ br(Assembler::EQ, back); // b.EQ back + __ br(Assembler::EQ, forth); // b.EQ forth + __ br(Assembler::NE, __ pc()); // b.NE . + __ br(Assembler::NE, back); // b.NE back + __ br(Assembler::NE, forth); // b.NE forth + __ br(Assembler::HS, __ pc()); // b.HS . + __ br(Assembler::HS, back); // b.HS back + __ br(Assembler::HS, forth); // b.HS forth + __ br(Assembler::CS, __ pc()); // b.CS . + __ br(Assembler::CS, back); // b.CS back + __ br(Assembler::CS, forth); // b.CS forth + __ br(Assembler::LO, __ pc()); // b.LO . + __ br(Assembler::LO, back); // b.LO back + __ br(Assembler::LO, forth); // b.LO forth + __ br(Assembler::CC, __ pc()); // b.CC . + __ br(Assembler::CC, back); // b.CC back + __ br(Assembler::CC, forth); // b.CC forth + __ br(Assembler::MI, __ pc()); // b.MI . + __ br(Assembler::MI, back); // b.MI back + __ br(Assembler::MI, forth); // b.MI forth + __ br(Assembler::PL, __ pc()); // b.PL . + __ br(Assembler::PL, back); // b.PL back + __ br(Assembler::PL, forth); // b.PL forth + __ br(Assembler::VS, __ pc()); // b.VS . + __ br(Assembler::VS, back); // b.VS back + __ br(Assembler::VS, forth); // b.VS forth + __ br(Assembler::VC, __ pc()); // b.VC . + __ br(Assembler::VC, back); // b.VC back + __ br(Assembler::VC, forth); // b.VC forth + __ br(Assembler::HI, __ pc()); // b.HI . + __ br(Assembler::HI, back); // b.HI back + __ br(Assembler::HI, forth); // b.HI forth + __ br(Assembler::LS, __ pc()); // b.LS . + __ br(Assembler::LS, back); // b.LS back + __ br(Assembler::LS, forth); // b.LS forth + __ br(Assembler::GE, __ pc()); // b.GE . + __ br(Assembler::GE, back); // b.GE back + __ br(Assembler::GE, forth); // b.GE forth + __ br(Assembler::LT, __ pc()); // b.LT . + __ br(Assembler::LT, back); // b.LT back + __ br(Assembler::LT, forth); // b.LT forth + __ br(Assembler::GT, __ pc()); // b.GT . + __ br(Assembler::GT, back); // b.GT back + __ br(Assembler::GT, forth); // b.GT forth + __ br(Assembler::LE, __ pc()); // b.LE . + __ br(Assembler::LE, back); // b.LE back + __ br(Assembler::LE, forth); // b.LE forth + __ br(Assembler::AL, __ pc()); // b.AL . + __ br(Assembler::AL, back); // b.AL back + __ br(Assembler::AL, forth); // b.AL forth + __ br(Assembler::NV, __ pc()); // b.NV . 
+ __ br(Assembler::NV, back); // b.NV back + __ br(Assembler::NV, forth); // b.NV forth // ImmOp - __ svc(22064); // svc #22064 - __ hvc(533); // hvc #533 - __ smc(9942); // smc #9942 - __ brk(4714); // brk #4714 - __ hlt(4302); // hlt #4302 + __ svc(31973); // svc #31973 + __ hvc(1113); // hvc #1113 + __ smc(24334); // smc #24334 + __ brk(7815); // brk #7815 + __ hlt(28529); // hlt #28529 // Op - __ nop(); // nop - __ eret(); // eret - __ drps(); // drps - __ isb(); // isb + __ nop(); // nop + __ eret(); // eret + __ drps(); // drps + __ isb(); // isb // SystemOp - __ dsb(Assembler::OSH); // dsb OSH - __ dmb(Assembler::NSHLD); // dmb NSHLD + __ dsb(Assembler::NSHLD); // dsb NSHLD + __ dmb(Assembler::NSH); // dmb NSH // OneRegOp - __ br(r20); // br x20 - __ blr(r2); // blr x2 + __ br(r28); // br x28 + __ blr(r17); // blr x17 // LoadStoreExclusiveOp - __ stxr(r18, r23, r0); // stxr w18, x23, [x0] - __ stlxr(r30, r5, r22); // stlxr w30, x5, [x22] - __ ldxr(r5, r8); // ldxr x5, [x8] - __ ldaxr(r20, r16); // ldaxr x20, [x16] - __ stlr(r6, r11); // stlr x6, [x11] - __ ldar(r6, r27); // ldar x6, [x27] + __ stxr(r18, r7, r26); // stxr w18, x7, [x26] + __ stlxr(r25, r12, r6); // stlxr w25, x12, [x6] + __ ldxr(r0, r16); // ldxr x0, [x16] + __ ldaxr(r6, r3); // ldaxr x6, [x3] + __ stlr(r14, r1); // stlr x14, [x1] + __ ldar(r29, r24); // ldar x29, [x24] // LoadStoreExclusiveOp - __ stxrw(r10, r17, r5); // stxr w10, w17, [x5] - __ stlxrw(r22, r9, r12); // stlxr w22, w9, [x12] - __ ldxrw(r27, r8); // ldxr w27, [x8] - __ ldaxrw(r23, r2); // ldaxr w23, [x2] - __ stlrw(r26, r29); // stlr w26, [x29] - __ ldarw(r13, r10); // ldar w13, [x10] + __ stxrw(r28, r15, r23); // stxr w28, w15, [x23] + __ stlxrw(r9, r7, r3); // stlxr w9, w7, [x3] + __ ldxrw(r1, r20); // ldxr w1, [x20] + __ ldaxrw(r20, r15); // ldaxr w20, [x15] + __ stlrw(r21, r9); // stlr w21, [x9] + __ ldarw(r5, r17); // ldar w5, [x17] // LoadStoreExclusiveOp - __ stxrh(r25, r28, r27); // stxrh w25, w28, [x27] - __ stlxrh(r29, r22, r12); // stlxrh w29, w22, [x12] - __ ldxrh(r22, r28); // ldxrh w22, [x28] - __ ldaxrh(r3, r30); // ldaxrh w3, [x30] - __ stlrh(r24, r15); // stlrh w24, [x15] - __ ldarh(r27, r26); // ldarh w27, [x26] + __ stxrh(r13, r20, r30); // stxrh w13, w20, [x30] + __ stlxrh(r10, r12, r18); // stlxrh w10, w12, [x18] + __ ldxrh(r4, r19); // ldxrh w4, [x19] + __ ldaxrh(r22, r10); // ldaxrh w22, [x10] + __ stlrh(r30, r15); // stlrh w30, [x15] + __ ldarh(r4, r24); // ldarh w4, [x24] // LoadStoreExclusiveOp - __ stxrb(r11, r10, r19); // stxrb w11, w10, [x19] - __ stlxrb(r23, r27, r22); // stlxrb w23, w27, [x22] - __ ldxrb(r24, r16); // ldxrb w24, [x16] - __ ldaxrb(r24, r1); // ldaxrb w24, [x1] - __ stlrb(r5, r29); // stlrb w5, [x29] - __ ldarb(r24, r16); // ldarb w24, [x16] + __ stxrb(r10, r20, r12); // stxrb w10, w20, [x12] + __ stlxrb(r20, r29, r11); // stlxrb w20, w29, [x11] + __ ldxrb(r21, r5); // ldxrb w21, [x5] + __ ldaxrb(r4, r9); // ldaxrb w4, [x9] + __ stlrb(r30, r28); // stlrb w30, [x28] + __ ldarb(r19, r24); // ldarb w19, [x24] // LoadStoreExclusiveOp - __ ldxp(r25, r24, r17); // ldxp x25, x24, [x17] - __ ldaxp(r22, r12, r19); // ldaxp x22, x12, [x19] - __ stxp(r0, r26, r21, r25); // stxp w0, x26, x21, [x25] - __ stlxp(r1, r6, r11, r5); // stlxp w1, x6, x11, [x5] + __ ldxp(r11, r16, r18); // ldxp x11, x16, [x18] + __ ldaxp(r8, r7, r15); // ldaxp x8, x7, [x15] + __ stxp(r28, r20, r16, r10); // stxp w28, x20, x16, [x10] + __ stlxp(r7, r9, r21, r3); // stlxp w7, x9, x21, [x3] // LoadStoreExclusiveOp - __ ldxpw(r13, r14, r4); // 
ldxp w13, w14, [x4] - __ ldaxpw(r17, r2, r6); // ldaxp w17, w2, [x6] - __ stxpw(r15, r3, r9, r18); // stxp w15, w3, w9, [x18] - __ stlxpw(r18, r17, r4, r9); // stlxp w18, w17, w4, [x9] + __ ldxpw(r25, r6, r19); // ldxp w25, w6, [x19] + __ ldaxpw(r30, r9, r2); // ldaxp w30, w9, [x2] + __ stxpw(r16, r0, r20, r12); // stxp w16, w0, w20, [x12] + __ stlxpw(r5, r2, r7, r28); // stlxp w5, w2, w7, [x28] -// base_plus_unscaled_offset +// base_plus_unscaled_offset // LoadStoreOp - __ str(r23, Address(r21, -49)); // str x23, [x21, -49] - __ strw(r21, Address(r2, 63)); // str w21, [x2, 63] - __ strb(r27, Address(r28, 11)); // strb w27, [x28, 11] - __ strh(r29, Address(r15, -13)); // strh w29, [x15, -13] - __ ldr(r14, Address(r30, -45)); // ldr x14, [x30, -45] - __ ldrw(r29, Address(r28, 53)); // ldr w29, [x28, 53] - __ ldrb(r20, Address(r26, 7)); // ldrb w20, [x26, 7] - __ ldrh(r25, Address(r2, -50)); // ldrh w25, [x2, -50] - __ ldrsb(r3, Address(r10, -15)); // ldrsb x3, [x10, -15] - __ ldrsh(r14, Address(r15, 19)); // ldrsh x14, [x15, 19] - __ ldrshw(r29, Address(r11, -5)); // ldrsh w29, [x11, -5] - __ ldrsw(r15, Address(r5, -71)); // ldrsw x15, [x5, -71] - __ ldrd(v19, Address(r12, 3)); // ldr d19, [x12, 3] - __ ldrs(v12, Address(r27, 42)); // ldr s12, [x27, 42] - __ strd(v22, Address(r28, 125)); // str d22, [x28, 125] - __ strs(v24, Address(r15, -20)); // str s24, [x15, -20] - -// pre + __ str(r16, Address(r19, -75)); // str x16, [x19, -75] + __ strw(r1, Address(r28, 30)); // str w1, [x28, 30] + __ strb(r28, Address(r13, -26)); // strb w28, [x13, -26] + __ strh(r8, Address(r6, -51)); // strh w8, [x6, -51] + __ ldr(r0, Address(r28, -227)); // ldr x0, [x28, -227] + __ ldrw(r28, Address(r10, -26)); // ldr w28, [x10, -26] + __ ldrb(r4, Address(r11, 12)); // ldrb w4, [x11, 12] + __ ldrh(r1, Address(r17, 5)); // ldrh w1, [x17, 5] + __ ldrsb(r11, Address(r9, 12)); // ldrsb x11, [x9, 12] + __ ldrsh(r8, Address(r8, -17)); // ldrsh x8, [x8, -17] + __ ldrshw(r20, Address(r13, -35)); // ldrsh w20, [x13, -35] + __ ldrsw(r23, Address(r9, 49)); // ldrsw x23, [x9, 49] + __ ldrd(v9, Address(r4, 29)); // ldr d9, [x4, 29] + __ ldrs(v11, Address(r19, 40)); // ldr s11, [x19, 40] + __ strd(v25, Address(r20, -43)); // str d25, [x20, -43] + __ strs(v25, Address(r1, -80)); // str s25, [x1, -80] + +// pre // LoadStoreOp - __ str(r8, Address(__ pre(r28, -24))); // str x8, [x28, -24]! - __ strw(r6, Address(__ pre(r15, 37))); // str w6, [x15, 37]! - __ strb(r7, Address(__ pre(r1, 7))); // strb w7, [x1, 7]! - __ strh(r0, Address(__ pre(r17, 30))); // strh w0, [x17, 30]! - __ ldr(r25, Address(__ pre(r29, 84))); // ldr x25, [x29, 84]! - __ ldrw(r26, Address(__ pre(r20, -52))); // ldr w26, [x20, -52]! - __ ldrb(r26, Address(__ pre(r29, -25))); // ldrb w26, [x29, -25]! - __ ldrh(r4, Address(__ pre(r25, 26))); // ldrh w4, [x25, 26]! - __ ldrsb(r28, Address(__ pre(r8, -21))); // ldrsb x28, [x8, -21]! - __ ldrsh(r17, Address(__ pre(r14, -6))); // ldrsh x17, [x14, -6]! - __ ldrshw(r28, Address(__ pre(r23, 10))); // ldrsh w28, [x23, 10]! - __ ldrsw(r30, Address(__ pre(r27, -64))); // ldrsw x30, [x27, -64]! - __ ldrd(v20, Address(__ pre(r30, -242))); // ldr d20, [x30, -242]! - __ ldrs(v17, Address(__ pre(r27, 20))); // ldr s17, [x27, 20]! - __ strd(v7, Address(__ pre(r3, 17))); // str d7, [x3, 17]! - __ strs(v13, Address(__ pre(r11, -16))); // str s13, [x11, -16]! - -// post + __ str(r20, Address(__ pre(r0, 25))); // str x20, [x0, 25]! + __ strw(r12, Address(__ pre(r12, -49))); // str w12, [x12, -49]! 
+ __ strb(r28, Address(__ pre(r19, -10))); // strb w28, [x19, -10]! + __ strh(r13, Address(__ pre(r28, -63))); // strh w13, [x28, -63]! + __ ldr(r11, Address(__ pre(r23, -46))); // ldr x11, [x23, -46]! + __ ldrw(r27, Address(__ pre(r24, 17))); // ldr w27, [x24, 17]! + __ ldrb(r14, Address(__ pre(r26, -12))); // ldrb w14, [x26, -12]! + __ ldrh(r24, Address(__ pre(r22, -45))); // ldrh w24, [x22, -45]! + __ ldrsb(r25, Address(__ pre(r9, -11))); // ldrsb x25, [x9, -11]! + __ ldrsh(r5, Address(__ pre(r6, 29))); // ldrsh x5, [x6, 29]! + __ ldrshw(r7, Address(__ pre(r23, -1))); // ldrsh w7, [x23, -1]! + __ ldrsw(r26, Address(__ pre(r13, -61))); // ldrsw x26, [x13, -61]! + __ ldrd(v24, Address(__ pre(r24, -245))); // ldr d24, [x24, -245]! + __ ldrs(v20, Address(__ pre(r25, -55))); // ldr s20, [x25, -55]! + __ strd(v9, Address(__ pre(r2, -203))); // str d9, [x2, -203]! + __ strs(v14, Address(__ pre(r1, -59))); // str s14, [x1, -59]! + +// post // LoadStoreOp - __ str(r6, Address(__ post(r9, -61))); // str x6, [x9], -61 - __ strw(r16, Address(__ post(r5, -29))); // str w16, [x5], -29 - __ strb(r29, Address(__ post(r29, 15))); // strb w29, [x29], 15 - __ strh(r4, Address(__ post(r20, 18))); // strh w4, [x20], 18 - __ ldr(r19, Address(__ post(r18, 46))); // ldr x19, [x18], 46 - __ ldrw(r22, Address(__ post(r2, 23))); // ldr w22, [x2], 23 - __ ldrb(r7, Address(__ post(r3, -30))); // ldrb w7, [x3], -30 - __ ldrh(r11, Address(__ post(r12, -29))); // ldrh w11, [x12], -29 - __ ldrsb(r8, Address(__ post(r6, -29))); // ldrsb x8, [x6], -29 - __ ldrsh(r24, Address(__ post(r23, 4))); // ldrsh x24, [x23], 4 - __ ldrshw(r17, Address(__ post(r16, 0))); // ldrsh w17, [x16], 0 - __ ldrsw(r0, Address(__ post(r20, -8))); // ldrsw x0, [x20], -8 - __ ldrd(v20, Address(__ post(r2, -126))); // ldr d20, [x2], -126 - __ ldrs(v19, Address(__ post(r30, -104))); // ldr s19, [x30], -104 - __ strd(v4, Address(__ post(r17, 118))); // str d4, [x17], 118 - __ strs(v21, Address(__ post(r19, -112))); // str s21, [x19], -112 - -// base_plus_reg + __ str(r19, Address(__ post(r1, 109))); // str x19, [x1], 109 + __ strw(r4, Address(__ post(r5, -54))); // str w4, [x5], -54 + __ strb(r29, Address(__ post(r3, 9))); // strb w29, [x3], 9 + __ strh(r0, Address(__ post(r1, -50))); // strh w0, [x1], -50 + __ ldr(r2, Address(__ post(r6, -48))); // ldr x2, [x6], -48 + __ ldrw(r15, Address(__ post(r6, -115))); // ldr w15, [x6], -115 + __ ldrb(r4, Address(__ post(r2, -27))); // ldrb w4, [x2], -27 + __ ldrh(r17, Address(__ post(r26, -21))); // ldrh w17, [x26], -21 + __ ldrsb(r21, Address(__ post(r24, -13))); // ldrsb x21, [x24], -13 + __ ldrsh(r22, Address(__ post(r6, -48))); // ldrsh x22, [x6], -48 + __ ldrshw(r11, Address(__ post(r6, -48))); // ldrsh w11, [x6], -48 + __ ldrsw(r14, Address(__ post(r30, -5))); // ldrsw x14, [x30], -5 + __ ldrd(v2, Address(__ post(r15, -105))); // ldr d2, [x15], -105 + __ ldrs(v25, Address(__ post(r19, -91))); // ldr s25, [x19], -91 + __ strd(v13, Address(__ post(r23, -191))); // str d13, [x23], -191 + __ strs(v22, Address(__ post(r21, 0))); // str s22, [x21], 0 + +// base_plus_reg // LoadStoreOp - __ str(r26, Address(r2, r19, Address::lsl(3))); // str x26, [x2, x19, lsl #3] - __ strw(r9, Address(r0, r15, Address::sxtw(2))); // str w9, [x0, w15, sxtw #2] - __ strb(r26, Address(r12, r1, Address::lsl(0))); // strb w26, [x12, x1, lsl #0] - __ strh(r21, Address(r11, r10, Address::lsl(1))); // strh w21, [x11, x10, lsl #1] - __ ldr(r16, Address(r23, r16, Address::sxtx(0))); // ldr x16, [x23, x16, sxtx #0] - __ ldrw(r10, 
Address(r11, r17, Address::sxtw(2))); // ldr w10, [x11, w17, sxtw #2] - __ ldrb(r13, Address(r23, r11, Address::lsl(0))); // ldrb w13, [x23, x11, lsl #0] - __ ldrh(r27, Address(r4, r21, Address::lsl(0))); // ldrh w27, [x4, x21, lsl #0] - __ ldrsb(r26, Address(r8, r15, Address::sxtw(0))); // ldrsb x26, [x8, w15, sxtw #0] - __ ldrsh(r21, Address(r10, r2, Address::sxtw(0))); // ldrsh x21, [x10, w2, sxtw #0] - __ ldrshw(r8, Address(r30, r14, Address::lsl(0))); // ldrsh w8, [x30, x14, lsl #0] - __ ldrsw(r29, Address(r14, r20, Address::sxtx(2))); // ldrsw x29, [x14, x20, sxtx #2] - __ ldrd(v30, Address(r27, r22, Address::sxtx(0))); // ldr d30, [x27, x22, sxtx #0] - __ ldrs(v13, Address(r9, r22, Address::lsl(0))); // ldr s13, [x9, x22, lsl #0] - __ strd(v8, Address(r25, r17, Address::sxtw(3))); // str d8, [x25, w17, sxtw #3] - __ strs(v1, Address(r24, r5, Address::uxtw(2))); // str s1, [x24, w5, uxtw #2] - -// base_plus_scaled_offset + __ str(r22, Address(r12, r18, Address::sxtw(0))); // str x22, [x12, w18, sxtw #0] + __ strw(r30, Address(r27, r12, Address::uxtw(0))); // str w30, [x27, w12, uxtw #0] + __ strb(r7, Address(r4, r22, Address::lsl(0))); // strb w7, [x4, x22, lsl #0] + __ strh(r19, Address(r23, r29, Address::sxtx(1))); // strh w19, [x23, x29, sxtx #1] + __ ldr(r17, Address(r4, r27, Address::sxtx(3))); // ldr x17, [x4, x27, sxtx #3] + __ ldrw(r1, Address(r13, r17, Address::sxtw(0))); // ldr w1, [x13, w17, sxtw #0] + __ ldrb(r16, Address(r27, r29, Address::sxtx(0))); // ldrb w16, [x27, x29, sxtx #0] + __ ldrh(r25, Address(r9, r4, Address::uxtw(1))); // ldrh w25, [x9, w4, uxtw #1] + __ ldrsb(r4, Address(r12, r22, Address::lsl(0))); // ldrsb x4, [x12, x22, lsl #0] + __ ldrsh(r25, Address(r1, r5, Address::uxtw(1))); // ldrsh x25, [x1, w5, uxtw #1] + __ ldrshw(r9, Address(r16, r28, Address::lsl(0))); // ldrsh w9, [x16, x28, lsl #0] + __ ldrsw(r8, Address(r7, r14, Address::sxtx(0))); // ldrsw x8, [x7, x14, sxtx #0] + __ ldrd(v4, Address(r28, r16, Address::uxtw(3))); // ldr d4, [x28, w16, uxtw #3] + __ ldrs(v16, Address(r2, r27, Address::sxtw(2))); // ldr s16, [x2, w27, sxtw #2] + __ strd(v23, Address(r0, r25, Address::lsl(0))); // str d23, [x0, x25, lsl #0] + __ strs(v6, Address(r16, r7, Address::lsl(2))); // str s6, [x16, x7, lsl #2] + +// base_plus_scaled_offset // LoadStoreOp - __ str(r10, Address(r21, 14496)); // str x10, [x21, 14496] - __ strw(r18, Address(r29, 7228)); // str w18, [x29, 7228] - __ strb(r23, Address(r3, 2018)); // strb w23, [x3, 2018] - __ strh(r28, Address(r11, 3428)); // strh w28, [x11, 3428] - __ ldr(r24, Address(r26, 14376)); // ldr x24, [x26, 14376] - __ ldrw(r21, Address(r2, 6972)); // ldr w21, [x2, 6972] - __ ldrb(r4, Address(r5, 1848)); // ldrb w4, [x5, 1848] - __ ldrh(r14, Address(r14, 3112)); // ldrh w14, [x14, 3112] - __ ldrsb(r4, Address(r27, 1959)); // ldrsb x4, [x27, 1959] - __ ldrsh(r4, Address(r27, 3226)); // ldrsh x4, [x27, 3226] - __ ldrshw(r10, Address(r28, 3286)); // ldrsh w10, [x28, 3286] - __ ldrsw(r10, Address(r17, 7912)); // ldrsw x10, [x17, 7912] - __ ldrd(v13, Address(r28, 13400)); // ldr d13, [x28, 13400] - __ ldrs(v24, Address(r3, 7596)); // ldr s24, [x3, 7596] - __ strd(v2, Address(r12, 15360)); // str d2, [x12, 15360] - __ strs(v17, Address(r1, 6492)); // str s17, [x1, 6492] - -// pcrel + __ str(r2, Address(r12, 15288)); // str x2, [x12, 15288] + __ strw(r8, Address(r5, 6928)); // str w8, [x5, 6928] + __ strb(r1, Address(r17, 2016)); // strb w1, [x17, 2016] + __ strh(r8, Address(r25, 3258)); // strh w8, [x25, 3258] + __ ldr(r28, Address(r3, 
14656)); // ldr x28, [x3, 14656] + __ ldrw(r21, Address(r11, 7004)); // ldr w21, [x11, 7004] + __ ldrb(r15, Address(r5, 1906)); // ldrb w15, [x5, 1906] + __ ldrh(r0, Address(r19, 3668)); // ldrh w0, [x19, 3668] + __ ldrsb(r29, Address(r9, 1909)); // ldrsb x29, [x9, 1909] + __ ldrsh(r23, Address(r28, 3640)); // ldrsh x23, [x28, 3640] + __ ldrshw(r27, Address(r10, 3672)); // ldrsh w27, [x10, 3672] + __ ldrsw(r21, Address(r27, 7736)); // ldrsw x21, [x27, 7736] + __ ldrd(v26, Address(r27, 14584)); // ldr d26, [x27, 14584] + __ ldrs(v2, Address(r4, 7464)); // ldr s2, [x4, 7464] + __ strd(v1, Address(r21, 16224)); // str d1, [x21, 16224] + __ strs(v4, Address(r22, 7552)); // str s4, [x22, 7552] + +// pcrel // LoadStoreOp - __ ldr(r16, __ pc()); // ldr x16, . - __ ldrw(r13, __ pc()); // ldr w13, . + __ ldr(r1, __ pc()); // ldr x1, . + __ ldrw(r27, __ pc()); // ldr w27, . // LoadStoreOp - __ prfm(Address(r18, -127)); // prfm PLDL1KEEP, [x18, -127] + __ prfm(Address(r4, 45)); // prfm PLDL1KEEP, [x4, 45] // LoadStoreOp - __ prfm(back); // prfm PLDL1KEEP, back + __ prfm(__ pc()); // prfm PLDL1KEEP, . // LoadStoreOp - __ prfm(Address(r20, r2, Address::lsl(3))); // prfm PLDL1KEEP, [x20, x2, lsl #3] + __ prfm(Address(r30, r0, Address::sxtw(0))); // prfm PLDL1KEEP, [x30, w0, sxtw #0] // LoadStoreOp - __ prfm(Address(r9, 13808)); // prfm PLDL1KEEP, [x9, 13808] + __ prfm(Address(r24, 16208)); // prfm PLDL1KEEP, [x24, 16208] // AddSubCarryOp - __ adcw(r8, r23, r2); // adc w8, w23, w2 - __ adcsw(r24, r3, r19); // adcs w24, w3, w19 - __ sbcw(r22, r24, r29); // sbc w22, w24, w29 - __ sbcsw(r12, r27, r3); // sbcs w12, w27, w3 - __ adc(r11, r23, r1); // adc x11, x23, x1 - __ adcs(r29, r5, r23); // adcs x29, x5, x23 - __ sbc(r9, r25, r12); // sbc x9, x25, x12 - __ sbcs(r12, r0, r22); // sbcs x12, x0, x22 + __ adcw(r0, r29, r24); // adc w0, w29, w24 + __ adcsw(r22, r28, r18); // adcs w22, w28, w18 + __ sbcw(r23, r16, r30); // sbc w23, w16, w30 + __ sbcsw(r7, r29, r14); // sbcs w7, w29, w14 + __ adc(r22, r12, r14); // adc x22, x12, x14 + __ adcs(r29, r24, r23); // adcs x29, x24, x23 + __ sbc(r17, r28, r22); // sbc x17, x28, x22 + __ sbcs(r24, r20, r19); // sbcs x24, x20, x19 // AddSubExtendedOp - __ addw(r26, r12, r3, ext::uxtw, 1); // add w26, w12, w3, uxtw #1 - __ addsw(r20, r16, r18, ext::sxtb, 2); // adds w20, w16, w18, sxtb #2 - __ sub(r30, r30, r7, ext::uxtw, 2); // sub x30, x30, x7, uxtw #2 - __ subsw(r11, r21, r2, ext::uxth, 3); // subs w11, w21, w2, uxth #3 - __ add(r2, r26, r1, ext::uxtw, 2); // add x2, x26, x1, uxtw #2 - __ adds(r18, r29, r20, ext::sxth, 1); // adds x18, x29, x20, sxth #1 - __ sub(r14, r16, r4, ext::uxtw, 4); // sub x14, x16, x4, uxtw #4 - __ subs(r0, r17, r23, ext::sxtb, 3); // subs x0, x17, x23, sxtb #3 + __ addw(r27, r22, r6, ext::sxtw, 2); // add w27, w22, w6, sxtw #2 + __ addsw(r13, r11, r24, ext::uxtw, 4); // adds w13, w11, w24, uxtw #4 + __ sub(r16, r8, r4, ext::uxth, 3); // sub x16, x8, x4, uxth #3 + __ subsw(r21, r18, r20, ext::sxtx, 2); // subs w21, w18, w20, sxtx #2 + __ add(r14, r17, r29, ext::uxtb, 2); // add x14, x17, x29, uxtb #2 + __ adds(r17, r17, r14, ext::sxth, 4); // adds x17, x17, x14, sxth #4 + __ sub(r22, r3, r26, ext::sxtw, 1); // sub x22, x3, x26, sxtw #1 + __ subs(r13, r13, r21, ext::uxth, 4); // subs x13, x13, x21, uxth #4 // ConditionalCompareOp - __ ccmnw(r20, r22, 3u, Assembler::PL); // ccmn w20, w22, #3, PL - __ ccmpw(r25, r2, 1u, Assembler::EQ); // ccmp w25, w2, #1, EQ - __ ccmn(r18, r24, 7u, Assembler::GT); // ccmn x18, x24, #7, GT - __ ccmp(r8, r13, 6u, 
Assembler::PL); // ccmp x8, x13, #6, PL + __ ccmnw(r17, r26, 15u, Assembler::VC); // ccmn w17, w26, #15, VC + __ ccmpw(r25, r6, 5u, Assembler::LO); // ccmp w25, w6, #5, LO + __ ccmn(r1, r30, 1u, Assembler::LS); // ccmn x1, x30, #1, LS + __ ccmp(r17, r7, 2u, Assembler::GE); // ccmp x17, x7, #2, GE // ConditionalCompareImmedOp - __ ccmnw(r9, 2, 4, Assembler::VS); // ccmn w9, #2, #4, VS - __ ccmpw(r2, 27, 7, Assembler::EQ); // ccmp w2, #27, #7, EQ - __ ccmn(r16, 1, 2, Assembler::CC); // ccmn x16, #1, #2, CC - __ ccmp(r17, 31, 3, Assembler::LT); // ccmp x17, #31, #3, LT + __ ccmnw(r17, 25, 6, Assembler::EQ); // ccmn w17, #25, #6, EQ + __ ccmpw(r2, 5, 5, Assembler::EQ); // ccmp w2, #5, #5, EQ + __ ccmn(r19, 17, 10, Assembler::MI); // ccmn x19, #17, #10, MI + __ ccmp(r14, 8, 3, Assembler::GT); // ccmp x14, #8, #3, GT // ConditionalSelectOp - __ cselw(r23, r27, r23, Assembler::LS); // csel w23, w27, w23, LS - __ csincw(r10, r0, r6, Assembler::VS); // csinc w10, w0, w6, VS - __ csinvw(r11, r0, r9, Assembler::CC); // csinv w11, w0, w9, CC - __ csnegw(r17, r27, r18, Assembler::LO); // csneg w17, w27, w18, LO - __ csel(r12, r16, r11, Assembler::VC); // csel x12, x16, x11, VC - __ csinc(r6, r28, r6, Assembler::HI); // csinc x6, x28, x6, HI - __ csinv(r13, r27, r26, Assembler::VC); // csinv x13, x27, x26, VC - __ csneg(r29, r22, r18, Assembler::PL); // csneg x29, x22, x18, PL + __ cselw(r9, r8, r14, Assembler::LS); // csel w9, w8, w14, LS + __ csincw(r27, r11, r5, Assembler::LE); // csinc w27, w11, w5, LE + __ csinvw(r11, r23, r22, Assembler::LO); // csinv w11, w23, w22, LO + __ csnegw(r19, r28, r27, Assembler::CS); // csneg w19, w28, w27, CS + __ csel(r16, r9, r1, Assembler::PL); // csel x16, x9, x1, PL + __ csinc(r28, r14, r12, Assembler::EQ); // csinc x28, x14, x12, EQ + __ csinv(r22, r4, r14, Assembler::PL); // csinv x22, x4, x14, PL + __ csneg(r26, r11, r27, Assembler::HS); // csneg x26, x11, x27, HS // TwoRegOp - __ rbitw(r12, r19); // rbit w12, w19 - __ rev16w(r23, r18); // rev16 w23, w18 - __ revw(r9, r28); // rev w9, w28 - __ clzw(r2, r19); // clz w2, w19 - __ clsw(r25, r29); // cls w25, w29 - __ rbit(r4, r23); // rbit x4, x23 - __ rev16(r29, r18); // rev16 x29, x18 - __ rev32(r7, r8); // rev32 x7, x8 - __ rev(r13, r17); // rev x13, x17 - __ clz(r17, r0); // clz x17, x0 - __ cls(r18, r26); // cls x18, x26 + __ rbitw(r24, r11); // rbit w24, w11 + __ rev16w(r10, r14); // rev16 w10, w14 + __ revw(r9, r5); // rev w9, w5 + __ clzw(r18, r20); // clz w18, w20 + __ clsw(r25, r8); // cls w25, w8 + __ rbit(r22, r24); // rbit x22, x24 + __ rev16(r28, r27); // rev16 x28, x27 + __ rev32(r8, r29); // rev32 x8, x29 + __ rev(r17, r10); // rev x17, x10 + __ clz(r23, r11); // clz x23, x11 + __ cls(r26, r14); // cls x26, x14 // ThreeRegOp - __ udivw(r11, r12, r16); // udiv w11, w12, w16 - __ sdivw(r4, r9, r7); // sdiv w4, w9, w7 - __ lslvw(r12, r7, r16); // lslv w12, w7, w16 - __ lsrvw(r19, r16, r23); // lsrv w19, w16, w23 - __ asrvw(r7, r4, r6); // asrv w7, w4, w6 - __ rorvw(r21, r20, r23); // rorv w21, w20, w23 - __ udiv(r16, r12, r28); // udiv x16, x12, x28 - __ sdiv(r4, r12, r13); // sdiv x4, x12, x13 - __ lslv(r9, r13, r7); // lslv x9, x13, x7 - __ lsrv(r28, r27, r15); // lsrv x28, x27, x15 - __ asrv(r20, r30, r14); // asrv x20, x30, x14 - __ rorv(r14, r18, r30); // rorv x14, x18, x30 - __ umulh(r3, r11, r7); // umulh x3, x11, x7 - __ smulh(r23, r20, r24); // smulh x23, x20, x24 + __ udivw(r21, r4, r28); // udiv w21, w4, w28 + __ sdivw(r30, r10, r22); // sdiv w30, w10, w22 + __ lslvw(r29, r2, r26); // lslv 
w29, w2, w26 + __ lsrvw(r28, r22, r10); // lsrv w28, w22, w10 + __ asrvw(r11, r24, r12); // asrv w11, w24, w12 + __ rorvw(r21, r30, r16); // rorv w21, w30, w16 + __ udiv(r1, r0, r13); // udiv x1, x0, x13 + __ sdiv(r2, r17, r6); // sdiv x2, x17, x6 + __ lslv(r10, r24, r21); // lslv x10, x24, x21 + __ lsrv(r5, r9, r6); // lsrv x5, x9, x6 + __ asrv(r0, r27, r4); // asrv x0, x27, x4 + __ rorv(r28, r4, r2); // rorv x28, x4, x2 + __ umulh(r1, r30, r7); // umulh x1, x30, x7 + __ smulh(r30, r5, r10); // smulh x30, x5, x10 // FourRegMulOp - __ maddw(r2, r5, r21, r9); // madd w2, w5, w21, w9 - __ msubw(r24, r24, r4, r8); // msub w24, w24, w4, w8 - __ madd(r11, r12, r15, r19); // madd x11, x12, x15, x19 - __ msub(r29, r25, r12, r25); // msub x29, x25, x12, x25 - __ smaddl(r17, r11, r12, r22); // smaddl x17, w11, w12, x22 - __ smsubl(r28, r3, r20, r18); // smsubl x28, w3, w20, x18 - __ umaddl(r7, r4, r28, r26); // umaddl x7, w4, w28, x26 - __ umsubl(r22, r10, r17, r5); // umsubl x22, w10, w17, x5 + __ maddw(r7, r15, r13, r17); // madd w7, w15, w13, w17 + __ msubw(r25, r1, r12, r28); // msub w25, w1, w12, w28 + __ madd(r2, r11, r30, r9); // madd x2, x11, x30, x9 + __ msub(r5, r23, r3, r22); // msub x5, x23, x3, x22 + __ smaddl(r25, r10, r9, r4); // smaddl x25, w10, w9, x4 + __ smsubl(r5, r8, r7, r18); // smsubl x5, w8, w7, x18 + __ umaddl(r24, r5, r26, r25); // umaddl x24, w5, w26, x25 + __ umsubl(r14, r1, r26, r28); // umsubl x14, w1, w26, x28 // ThreeRegFloatOp - __ fmuls(v17, v3, v17); // fmul s17, s3, s17 - __ fdivs(v11, v17, v6); // fdiv s11, s17, s6 - __ fadds(v29, v7, v9); // fadd s29, s7, s9 - __ fsubs(v7, v12, v19); // fsub s7, s12, s19 - __ fmuls(v0, v23, v3); // fmul s0, s23, s3 - __ fmuld(v26, v3, v21); // fmul d26, d3, d21 - __ fdivd(v0, v19, v5); // fdiv d0, d19, d5 - __ faddd(v0, v26, v9); // fadd d0, d26, d9 - __ fsubd(v25, v21, v21); // fsub d25, d21, d21 - __ fmuld(v16, v13, v19); // fmul d16, d13, d19 + __ fmuls(v24, v22, v8); // fmul s24, s22, s8 + __ fdivs(v16, v3, v6); // fdiv s16, s3, s6 + __ fadds(v16, v21, v25); // fadd s16, s21, s25 + __ fsubs(v0, v26, v27); // fsub s0, s26, s27 + __ fmuls(v24, v3, v17); // fmul s24, s3, s17 + __ fmuld(v9, v8, v6); // fmul d9, d8, d6 + __ fdivd(v22, v19, v30); // fdiv d22, d19, d30 + __ faddd(v14, v17, v3); // fadd d14, d17, d3 + __ fsubd(v24, v27, v20); // fsub d24, d27, d20 + __ fmuld(v12, v1, v22); // fmul d12, d1, d22 // FourRegFloatOp - __ fmadds(v29, v18, v0, v16); // fmadd s29, s18, s0, s16 - __ fmsubs(v23, v13, v29, v5); // fmsub s23, s13, s29, s5 - __ fnmadds(v9, v7, v10, v14); // fnmadd s9, s7, s10, s14 - __ fnmadds(v25, v28, v15, v23); // fnmadd s25, s28, s15, s23 - __ fmaddd(v6, v13, v21, v17); // fmadd d6, d13, d21, d17 - __ fmsubd(v3, v21, v2, v7); // fmsub d3, d21, d2, d7 - __ fnmaddd(v10, v25, v5, v17); // fnmadd d10, d25, d5, d17 - __ fnmaddd(v14, v14, v20, v18); // fnmadd d14, d14, d20, d18 + __ fmadds(v16, v8, v11, v29); // fmadd s16, s8, s11, s29 + __ fmsubs(v22, v19, v18, v1); // fmsub s22, s19, s18, s1 + __ fnmadds(v15, v24, v24, v9); // fnmadd s15, s24, s24, s9 + __ fnmadds(v27, v19, v13, v23); // fnmadd s27, s19, s13, s23 + __ fmaddd(v3, v0, v16, v12); // fmadd d3, d0, d16, d12 + __ fmsubd(v19, v3, v18, v10); // fmsub d19, d3, d18, d10 + __ fnmaddd(v1, v2, v11, v20); // fnmadd d1, d2, d11, d20 + __ fnmaddd(v12, v9, v25, v14); // fnmadd d12, d9, d25, d14 // TwoRegFloatOp - __ fmovs(v15, v2); // fmov s15, s2 - __ fabss(v18, v7); // fabs s18, s7 - __ fnegs(v3, v6); // fneg s3, s6 - __ fsqrts(v12, v1); // fsqrt s12, s1 - __ 
fcvts(v9, v0); // fcvt d9, s0 - __ fmovd(v4, v5); // fmov d4, d5 - __ fabsd(v3, v15); // fabs d3, d15 - __ fnegd(v17, v25); // fneg d17, d25 - __ fsqrtd(v12, v24); // fsqrt d12, d24 - __ fcvtd(v21, v5); // fcvt s21, d5 + __ fmovs(v0, v23); // fmov s0, s23 + __ fabss(v23, v6); // fabs s23, s6 + __ fnegs(v4, v4); // fneg s4, s4 + __ fsqrts(v5, v28); // fsqrt s5, s28 + __ fcvts(v21, v15); // fcvt d21, s15 + __ fmovd(v11, v5); // fmov d11, d5 + __ fabsd(v18, v4); // fabs d18, d4 + __ fnegd(v11, v12); // fneg d11, d12 + __ fsqrtd(v15, v8); // fsqrt d15, d8 + __ fcvtd(v8, v2); // fcvt s8, d2 // FloatConvertOp - __ fcvtzsw(r4, v21); // fcvtzs w4, s21 - __ fcvtzs(r27, v3); // fcvtzs x27, s3 - __ fcvtzdw(r29, v8); // fcvtzs w29, d8 - __ fcvtzd(r9, v21); // fcvtzs x9, d21 - __ scvtfws(v20, r29); // scvtf s20, w29 - __ scvtfs(v7, r8); // scvtf s7, x8 - __ scvtfwd(v12, r21); // scvtf d12, w21 - __ scvtfd(v16, r21); // scvtf d16, x21 - __ fmovs(r18, v5); // fmov w18, s5 - __ fmovd(r25, v8); // fmov x25, d8 - __ fmovs(v18, r26); // fmov s18, w26 - __ fmovd(v0, r11); // fmov d0, x11 + __ fcvtzsw(r19, v18); // fcvtzs w19, s18 + __ fcvtzs(r17, v0); // fcvtzs x17, s0 + __ fcvtzdw(r0, v13); // fcvtzs w0, d13 + __ fcvtzd(r22, v9); // fcvtzs x22, d9 + __ scvtfws(v6, r29); // scvtf s6, w29 + __ scvtfs(v12, r14); // scvtf s12, x14 + __ scvtfwd(v16, r22); // scvtf d16, w22 + __ scvtfd(v14, r5); // scvtf d14, x5 + __ fmovs(r7, v0); // fmov w7, s0 + __ fmovd(r28, v6); // fmov x28, d6 + __ fmovs(v2, r26); // fmov s2, w26 + __ fmovd(v4, r0); // fmov d4, x0 // TwoRegFloatOp - __ fcmps(v16, v6); // fcmp s16, s6 - __ fcmpd(v16, v29); // fcmp d16, d29 - __ fcmps(v30, 0.0); // fcmp s30, #0.0 - __ fcmpd(v9, 0.0); // fcmp d9, #0.0 + __ fcmps(v1, v11); // fcmp s1, s11 + __ fcmpd(v6, v21); // fcmp d6, d21 + __ fcmps(v16, 0.0); // fcmp s16, #0.0 + __ fcmpd(v22, 0.0); // fcmp d22, #0.0 // LoadStorePairOp - __ stpw(r27, r4, Address(r12, -16)); // stp w27, w4, [x12, #-16] - __ ldpw(r3, r9, Address(r10, 80)); // ldp w3, w9, [x10, #80] - __ ldpsw(r16, r3, Address(r3, 64)); // ldpsw x16, x3, [x3, #64] - __ stp(r10, r28, Address(r19, -192)); // stp x10, x28, [x19, #-192] - __ ldp(r19, r18, Address(r7, -192)); // ldp x19, x18, [x7, #-192] + __ stpw(r5, r0, Address(r2, 96)); // stp w5, w0, [x2, #96] + __ ldpw(r14, r29, Address(r19, -64)); // ldp w14, w29, [x19, #-64] + __ ldpsw(r15, r3, Address(r3, -160)); // ldpsw x15, x3, [x3, #-160] + __ stp(r7, r13, Address(r27, -224)); // stp x7, x13, [x27, #-224] + __ ldp(r17, r14, Address(r1, 128)); // ldp x17, x14, [x1, #128] // LoadStorePairOp - __ stpw(r10, r16, Address(__ pre(r30, 16))); // stp w10, w16, [x30, #16]! - __ ldpw(r2, r4, Address(__ pre(r18, -240))); // ldp w2, w4, [x18, #-240]! - __ ldpsw(r24, r19, Address(__ pre(r13, 48))); // ldpsw x24, x19, [x13, #48]! - __ stp(r17, r0, Address(__ pre(r24, 0))); // stp x17, x0, [x24, #0]! - __ ldp(r14, r26, Address(__ pre(r3, -192))); // ldp x14, x26, [x3, #-192]! + __ stpw(r21, r22, Address(__ pre(r4, 128))); // stp w21, w22, [x4, #128]! + __ ldpw(r17, r13, Address(__ pre(r2, -96))); // ldp w17, w13, [x2, #-96]! + __ ldpsw(r21, r25, Address(__ pre(r23, -144))); // ldpsw x21, x25, [x23, #-144]! + __ stp(r4, r16, Address(__ pre(r15, -16))); // stp x4, x16, [x15, #-16]! + __ ldp(r29, r21, Address(__ pre(r25, -160))); // ldp x29, x21, [x25, #-160]! 
// LoadStorePairOp - __ stpw(r22, r1, Address(__ post(r0, 80))); // stp w22, w1, [x0], #80 - __ ldpw(r18, r10, Address(__ post(r0, -16))); // ldp w18, w10, [x0], #-16 - __ ldpsw(r24, r24, Address(__ post(r22, -16))); // ldpsw x24, x24, [x22], #-16 - __ stp(r12, r12, Address(__ post(r4, 80))); // stp x12, x12, [x4], #80 - __ ldp(r4, r9, Address(__ post(r19, -240))); // ldp x4, x9, [x19], #-240 + __ stpw(r24, r17, Address(__ post(r26, 80))); // stp w24, w17, [x26], #80 + __ ldpw(r3, r30, Address(__ post(r30, -240))); // ldp w3, w30, [x30], #-240 + __ ldpsw(r3, r19, Address(__ post(r30, -32))); // ldpsw x3, x19, [x30], #-32 + __ stp(r25, r1, Address(__ post(r27, -144))); // stp x25, x1, [x27], #-144 + __ ldp(r26, r20, Address(__ post(r28, -64))); // ldp x26, x20, [x28], #-64 // LoadStorePairOp - __ stnpw(r18, r26, Address(r6, -224)); // stnp w18, w26, [x6, #-224] - __ ldnpw(r21, r20, Address(r1, 112)); // ldnp w21, w20, [x1, #112] - __ stnp(r25, r29, Address(r20, -224)); // stnp x25, x29, [x20, #-224] - __ ldnp(r1, r5, Address(r23, 112)); // ldnp x1, x5, [x23, #112] + __ stnpw(r29, r25, Address(r9, -48)); // stnp w29, w25, [x9, #-48] + __ ldnpw(r25, r14, Address(r19, -128)); // ldnp w25, w14, [x19, #-128] + __ stnp(r25, r22, Address(r3, 32)); // stnp x25, x22, [x3, #32] + __ ldnp(r9, r18, Address(r29, -208)); // ldnp x9, x18, [x29, #-208] // LdStSIMDOp - __ ld1(v4, __ T8B, Address(r20)); // ld1 {v4.8B}, [x20] - __ ld1(v24, v25, __ T16B, Address(__ post(r10, 32))); // ld1 {v24.16B, v25.16B}, [x10], 32 - __ ld1(v24, v25, v26, __ T1D, Address(__ post(r6, r15))); // ld1 {v24.1D, v25.1D, v26.1D}, [x6], x15 - __ ld1(v3, v4, v5, v6, __ T8H, Address(__ post(r4, 64))); // ld1 {v3.8H, v4.8H, v5.8H, v6.8H}, [x4], 64 - __ ld1r(v2, __ T8B, Address(r6)); // ld1r {v2.8B}, [x6] - __ ld1r(v13, __ T4S, Address(__ post(r14, 4))); // ld1r {v13.4S}, [x14], 4 - __ ld1r(v15, __ T1D, Address(__ post(r21, r24))); // ld1r {v15.1D}, [x21], x24 - __ ld2(v9, v10, __ T2D, Address(r21)); // ld2 {v9.2D, v10.2D}, [x21] - __ ld2(v29, v30, __ T4H, Address(__ post(r21, 16))); // ld2 {v29.4H, v30.4H}, [x21], 16 - __ ld2r(v8, v9, __ T16B, Address(r14)); // ld2r {v8.16B, v9.16B}, [x14] - __ ld2r(v7, v8, __ T2S, Address(__ post(r20, 8))); // ld2r {v7.2S, v8.2S}, [x20], 8 - __ ld2r(v28, v29, __ T2D, Address(__ post(r3, r3))); // ld2r {v28.2D, v29.2D}, [x3], x3 - __ ld3(v27, v28, v29, __ T4S, Address(__ post(r11, r29))); // ld3 {v27.4S, v28.4S, v29.4S}, [x11], x29 - __ ld3(v16, v17, v18, __ T2S, Address(r10)); // ld3 {v16.2S, v17.2S, v18.2S}, [x10] - __ ld3r(v21, v22, v23, __ T8H, Address(r12)); // ld3r {v21.8H, v22.8H, v23.8H}, [x12] - __ ld3r(v4, v5, v6, __ T4S, Address(__ post(r29, 12))); // ld3r {v4.4S, v5.4S, v6.4S}, [x29], 12 - __ ld3r(v24, v25, v26, __ T1D, Address(__ post(r9, r19))); // ld3r {v24.1D, v25.1D, v26.1D}, [x9], x19 - __ ld4(v10, v11, v12, v13, __ T8H, Address(__ post(r3, 64))); // ld4 {v10.8H, v11.8H, v12.8H, v13.8H}, [x3], 64 - __ ld4(v27, v28, v29, v30, __ T8B, Address(__ post(r28, r9))); // ld4 {v27.8B, v28.8B, v29.8B, v30.8B}, [x28], x9 - __ ld4r(v21, v22, v23, v24, __ T8B, Address(r30)); // ld4r {v21.8B, v22.8B, v23.8B, v24.8B}, [x30] - __ ld4r(v23, v24, v25, v26, __ T4H, Address(__ post(r14, 8))); // ld4r {v23.4H, v24.4H, v25.4H, v26.4H}, [x14], 8 - __ ld4r(v4, v5, v6, v7, __ T2S, Address(__ post(r13, r20))); // ld4r {v4.2S, v5.2S, v6.2S, v7.2S}, [x13], x20 + __ ld1(v21, __ T8B, Address(r19)); // ld1 {v21.8B}, [x19] + __ ld1(v27, v28, __ T16B, Address(__ post(r20, 32))); // ld1 {v27.16B, v28.16B}, [x20], 32 
+ __ ld1(v5, v6, v7, __ T1D, Address(__ post(r22, r6))); // ld1 {v5.1D, v6.1D, v7.1D}, [x22], x6 + __ ld1(v22, v23, v24, v25, __ T8H, Address(__ post(r12, 64))); // ld1 {v22.8H, v23.8H, v24.8H, v25.8H}, [x12], 64 + __ ld1r(v17, __ T8B, Address(r9)); // ld1r {v17.8B}, [x9] + __ ld1r(v5, __ T4S, Address(__ post(r21, 4))); // ld1r {v5.4S}, [x21], 4 + __ ld1r(v10, __ T1D, Address(__ post(r28, r18))); // ld1r {v10.1D}, [x28], x18 + __ ld2(v26, v27, __ T2D, Address(r15)); // ld2 {v26.2D, v27.2D}, [x15] + __ ld2(v16, v17, __ T4H, Address(__ post(r26, 16))); // ld2 {v16.4H, v17.4H}, [x26], 16 + __ ld2r(v14, v15, __ T16B, Address(r2)); // ld2r {v14.16B, v15.16B}, [x2] + __ ld2r(v18, v19, __ T2S, Address(__ post(r28, 8))); // ld2r {v18.2S, v19.2S}, [x28], 8 + __ ld2r(v19, v20, __ T2D, Address(__ post(r0, r22))); // ld2r {v19.2D, v20.2D}, [x0], x22 + __ ld3(v16, v17, v18, __ T4S, Address(__ post(r2, r18))); // ld3 {v16.4S, v17.4S, v18.4S}, [x2], x18 + __ ld3(v24, v25, v26, __ T2S, Address(r0)); // ld3 {v24.2S, v25.2S, v26.2S}, [x0] + __ ld3r(v4, v5, v6, __ T8H, Address(r16)); // ld3r {v4.8H, v5.8H, v6.8H}, [x16] + __ ld3r(v5, v6, v7, __ T4S, Address(__ post(r1, 12))); // ld3r {v5.4S, v6.4S, v7.4S}, [x1], 12 + __ ld3r(v7, v8, v9, __ T1D, Address(__ post(r10, r16))); // ld3r {v7.1D, v8.1D, v9.1D}, [x10], x16 + __ ld4(v22, v23, v24, v25, __ T8H, Address(__ post(r20, 64))); // ld4 {v22.8H, v23.8H, v24.8H, v25.8H}, [x20], 64 + __ ld4(v15, v16, v17, v18, __ T8B, Address(__ post(r4, r25))); // ld4 {v15.8B, v16.8B, v17.8B, v18.8B}, [x4], x25 + __ ld4r(v0, v1, v2, v3, __ T8B, Address(r5)); // ld4r {v0.8B, v1.8B, v2.8B, v3.8B}, [x5] + __ ld4r(v0, v1, v2, v3, __ T4H, Address(__ post(r1, 8))); // ld4r {v0.4H, v1.4H, v2.4H, v3.4H}, [x1], 8 + __ ld4r(v30, v31, v0, v1, __ T2S, Address(__ post(r28, r14))); // ld4r {v30.2S, v31.2S, v0.2S, v1.2S}, [x28], x14 // SpecialCases - __ ccmn(zr, zr, 3u, Assembler::LE); // ccmn xzr, xzr, #3, LE - __ ccmnw(zr, zr, 5u, Assembler::EQ); // ccmn wzr, wzr, #5, EQ - __ ccmp(zr, 1, 4u, Assembler::NE); // ccmp xzr, 1, #4, NE - __ ccmpw(zr, 2, 2, Assembler::GT); // ccmp wzr, 2, #2, GT - __ extr(zr, zr, zr, 0); // extr xzr, xzr, xzr, 0 - __ stlxp(r0, zr, zr, sp); // stlxp w0, xzr, xzr, [sp] - __ stlxpw(r2, zr, zr, r3); // stlxp w2, wzr, wzr, [x3] - __ stxp(r4, zr, zr, r5); // stxp w4, xzr, xzr, [x5] - __ stxpw(r6, zr, zr, sp); // stxp w6, wzr, wzr, [sp] - __ dup(v0, __ T16B, zr); // dup v0.16b, wzr - __ mov(v1, __ T1D, 0, zr); // mov v1.d[0], xzr - __ mov(v1, __ T2S, 1, zr); // mov v1.s[1], wzr - __ mov(v1, __ T4H, 2, zr); // mov v1.h[2], wzr - __ mov(v1, __ T8B, 3, zr); // mov v1.b[3], wzr - __ ld1(v31, v0, __ T2D, Address(__ post(r1, r0))); // ld1 {v31.2d, v0.2d}, [x1], x0 + __ ccmn(zr, zr, 3u, Assembler::LE); // ccmn xzr, xzr, #3, LE + __ ccmnw(zr, zr, 5u, Assembler::EQ); // ccmn wzr, wzr, #5, EQ + __ ccmp(zr, 1, 4u, Assembler::NE); // ccmp xzr, 1, #4, NE + __ ccmpw(zr, 2, 2, Assembler::GT); // ccmp wzr, 2, #2, GT + __ extr(zr, zr, zr, 0); // extr xzr, xzr, xzr, 0 + __ stlxp(r0, zr, zr, sp); // stlxp w0, xzr, xzr, [sp] + __ stlxpw(r2, zr, zr, r3); // stlxp w2, wzr, wzr, [x3] + __ stxp(r4, zr, zr, r5); // stxp w4, xzr, xzr, [x5] + __ stxpw(r6, zr, zr, sp); // stxp w6, wzr, wzr, [sp] + __ dup(v0, __ T16B, zr); // dup v0.16b, wzr + __ mov(v1, __ T1D, 0, zr); // mov v1.d[0], xzr + __ mov(v1, __ T2S, 1, zr); // mov v1.s[1], wzr + __ mov(v1, __ T4H, 2, zr); // mov v1.h[2], wzr + __ mov(v1, __ T8B, 3, zr); // mov v1.b[3], wzr + __ ld1(v31, v0, __ T2D, Address(__ post(r1, r0))); // ld1 
{v31.2d, v0.2d}, [x1], x0 // FloatImmediateOp - __ fmovd(v0, 2.0); // fmov d0, #2.0 - __ fmovd(v0, 2.125); // fmov d0, #2.125 - __ fmovd(v0, 4.0); // fmov d0, #4.0 - __ fmovd(v0, 4.25); // fmov d0, #4.25 - __ fmovd(v0, 8.0); // fmov d0, #8.0 - __ fmovd(v0, 8.5); // fmov d0, #8.5 - __ fmovd(v0, 16.0); // fmov d0, #16.0 - __ fmovd(v0, 17.0); // fmov d0, #17.0 - __ fmovd(v0, 0.125); // fmov d0, #0.125 - __ fmovd(v0, 0.1328125); // fmov d0, #0.1328125 - __ fmovd(v0, 0.25); // fmov d0, #0.25 - __ fmovd(v0, 0.265625); // fmov d0, #0.265625 - __ fmovd(v0, 0.5); // fmov d0, #0.5 - __ fmovd(v0, 0.53125); // fmov d0, #0.53125 - __ fmovd(v0, 1.0); // fmov d0, #1.0 - __ fmovd(v0, 1.0625); // fmov d0, #1.0625 - __ fmovd(v0, -2.0); // fmov d0, #-2.0 - __ fmovd(v0, -2.125); // fmov d0, #-2.125 - __ fmovd(v0, -4.0); // fmov d0, #-4.0 - __ fmovd(v0, -4.25); // fmov d0, #-4.25 - __ fmovd(v0, -8.0); // fmov d0, #-8.0 - __ fmovd(v0, -8.5); // fmov d0, #-8.5 - __ fmovd(v0, -16.0); // fmov d0, #-16.0 - __ fmovd(v0, -17.0); // fmov d0, #-17.0 - __ fmovd(v0, -0.125); // fmov d0, #-0.125 - __ fmovd(v0, -0.1328125); // fmov d0, #-0.1328125 - __ fmovd(v0, -0.25); // fmov d0, #-0.25 - __ fmovd(v0, -0.265625); // fmov d0, #-0.265625 - __ fmovd(v0, -0.5); // fmov d0, #-0.5 - __ fmovd(v0, -0.53125); // fmov d0, #-0.53125 - __ fmovd(v0, -1.0); // fmov d0, #-1.0 - __ fmovd(v0, -1.0625); // fmov d0, #-1.0625 + __ fmovd(v0, 2.0); // fmov d0, #2.0 + __ fmovd(v0, 2.125); // fmov d0, #2.125 + __ fmovd(v0, 4.0); // fmov d0, #4.0 + __ fmovd(v0, 4.25); // fmov d0, #4.25 + __ fmovd(v0, 8.0); // fmov d0, #8.0 + __ fmovd(v0, 8.5); // fmov d0, #8.5 + __ fmovd(v0, 16.0); // fmov d0, #16.0 + __ fmovd(v0, 17.0); // fmov d0, #17.0 + __ fmovd(v0, 0.125); // fmov d0, #0.125 + __ fmovd(v0, 0.1328125); // fmov d0, #0.1328125 + __ fmovd(v0, 0.25); // fmov d0, #0.25 + __ fmovd(v0, 0.265625); // fmov d0, #0.265625 + __ fmovd(v0, 0.5); // fmov d0, #0.5 + __ fmovd(v0, 0.53125); // fmov d0, #0.53125 + __ fmovd(v0, 1.0); // fmov d0, #1.0 + __ fmovd(v0, 1.0625); // fmov d0, #1.0625 + __ fmovd(v0, -2.0); // fmov d0, #-2.0 + __ fmovd(v0, -2.125); // fmov d0, #-2.125 + __ fmovd(v0, -4.0); // fmov d0, #-4.0 + __ fmovd(v0, -4.25); // fmov d0, #-4.25 + __ fmovd(v0, -8.0); // fmov d0, #-8.0 + __ fmovd(v0, -8.5); // fmov d0, #-8.5 + __ fmovd(v0, -16.0); // fmov d0, #-16.0 + __ fmovd(v0, -17.0); // fmov d0, #-17.0 + __ fmovd(v0, -0.125); // fmov d0, #-0.125 + __ fmovd(v0, -0.1328125); // fmov d0, #-0.1328125 + __ fmovd(v0, -0.25); // fmov d0, #-0.25 + __ fmovd(v0, -0.265625); // fmov d0, #-0.265625 + __ fmovd(v0, -0.5); // fmov d0, #-0.5 + __ fmovd(v0, -0.53125); // fmov d0, #-0.53125 + __ fmovd(v0, -1.0); // fmov d0, #-1.0 + __ fmovd(v0, -1.0625); // fmov d0, #-1.0625 // LSEOp - __ swp(Assembler::xword, r21, r5, r24); // swp x21, x5, [x24] - __ ldadd(Assembler::xword, r13, r13, r15); // ldadd x13, x13, [x15] - __ ldbic(Assembler::xword, r22, r19, r26); // ldclr x22, x19, [x26] - __ ldeor(Assembler::xword, r25, r10, r26); // ldeor x25, x10, [x26] - __ ldorr(Assembler::xword, r5, r27, r15); // ldset x5, x27, [x15] - __ ldsmin(Assembler::xword, r19, r5, r11); // ldsmin x19, x5, [x11] - __ ldsmax(Assembler::xword, r26, r0, r4); // ldsmax x26, x0, [x4] - __ ldumin(Assembler::xword, r22, r23, r30); // ldumin x22, x23, [x30] - __ ldumax(Assembler::xword, r18, r28, r8); // ldumax x18, x28, [x8] + __ swp(Assembler::xword, r26, r9, r17); // swp x26, x9, [x17] + __ ldadd(Assembler::xword, r28, r23, r2); // ldadd x28, x23, [x2] + __ ldbic(Assembler::xword, r22, r2, 
r3); // ldclr x22, x2, [x3] + __ ldeor(Assembler::xword, r11, r25, r30); // ldeor x11, x25, [x30] + __ ldorr(Assembler::xword, r22, r28, r4); // ldset x22, x28, [x4] + __ ldsmin(Assembler::xword, r6, r11, r24); // ldsmin x6, x11, [x24] + __ ldsmax(Assembler::xword, r12, zr, sp); // ldsmax x12, xzr, [sp] + __ ldumin(Assembler::xword, r23, r30, r9); // ldumin x23, x30, [x9] + __ ldumax(Assembler::xword, r6, r1, r20); // ldumax x6, x1, [x20] // LSEOp - __ swpa(Assembler::xword, r13, r29, r27); // swpa x13, x29, [x27] - __ ldadda(Assembler::xword, r11, r5, r13); // ldadda x11, x5, [x13] - __ ldbica(Assembler::xword, r1, r24, r21); // ldclra x1, x24, [x21] - __ ldeora(Assembler::xword, r27, r17, r24); // ldeora x27, x17, [x24] - __ ldorra(Assembler::xword, r18, r30, r5); // ldseta x18, x30, [x5] - __ ldsmina(Assembler::xword, r7, r22, r25); // ldsmina x7, x22, [x25] - __ ldsmaxa(Assembler::xword, r4, r26, r19); // ldsmaxa x4, x26, [x19] - __ ldumina(Assembler::xword, r6, r30, r3); // ldumina x6, x30, [x3] - __ ldumaxa(Assembler::xword, r24, r23, r5); // ldumaxa x24, x23, [x5] + __ swpa(Assembler::xword, r17, r2, r22); // swpa x17, x2, [x22] + __ ldadda(Assembler::xword, r14, r27, r10); // ldadda x14, x27, [x10] + __ ldbica(Assembler::xword, r6, r30, r19); // ldclra x6, x30, [x19] + __ ldeora(Assembler::xword, r0, r25, r11); // ldeora x0, x25, [x11] + __ ldorra(Assembler::xword, r23, r0, r30); // ldseta x23, x0, [x30] + __ ldsmina(Assembler::xword, r21, r3, r10); // ldsmina x21, x3, [x10] + __ ldsmaxa(Assembler::xword, r15, r22, r0); // ldsmaxa x15, x22, [x0] + __ ldumina(Assembler::xword, r17, r0, r20); // ldumina x17, x0, [x20] + __ ldumaxa(Assembler::xword, r16, r13, r1); // ldumaxa x16, x13, [x1] // LSEOp - __ swpal(Assembler::xword, r24, r18, r28); // swpal x24, x18, [x28] - __ ldaddal(Assembler::xword, r19, zr, r7); // ldaddal x19, xzr, [x7] - __ ldbical(Assembler::xword, r13, r6, r28); // ldclral x13, x6, [x28] - __ ldeoral(Assembler::xword, r8, r15, r21); // ldeoral x8, x15, [x21] - __ ldorral(Assembler::xword, r2, r13, r1); // ldsetal x2, x13, [x1] - __ ldsminal(Assembler::xword, r17, r29, r25); // ldsminal x17, x29, [x25] - __ ldsmaxal(Assembler::xword, r25, r18, r14); // ldsmaxal x25, x18, [x14] - __ lduminal(Assembler::xword, zr, r6, r27); // lduminal xzr, x6, [x27] - __ ldumaxal(Assembler::xword, r16, r5, r15); // ldumaxal x16, x5, [x15] + __ swpal(Assembler::xword, r27, r15, r23); // swpal x27, x15, [x23] + __ ldaddal(Assembler::xword, r19, r30, r1); // ldaddal x19, x30, [x1] + __ ldbical(Assembler::xword, r15, r28, r23); // ldclral x15, x28, [x23] + __ ldeoral(Assembler::xword, r7, r15, r19); // ldeoral x7, x15, [x19] + __ ldorral(Assembler::xword, r11, r12, r10); // ldsetal x11, x12, [x10] + __ ldsminal(Assembler::xword, r6, r7, r12); // ldsminal x6, x7, [x12] + __ ldsmaxal(Assembler::xword, r28, r5, r13); // ldsmaxal x28, x5, [x13] + __ lduminal(Assembler::xword, r9, r20, r17); // lduminal x9, x20, [x17] + __ ldumaxal(Assembler::xword, r21, r25, r11); // ldumaxal x21, x25, [x11] // LSEOp - __ swpl(Assembler::xword, r11, r18, r3); // swpl x11, x18, [x3] - __ ldaddl(Assembler::xword, r26, r20, r2); // ldaddl x26, x20, [x2] - __ ldbicl(Assembler::xword, r11, r4, r11); // ldclrl x11, x4, [x11] - __ ldeorl(Assembler::xword, r30, r19, r23); // ldeorl x30, x19, [x23] - __ ldorrl(Assembler::xword, r3, r15, r14); // ldsetl x3, x15, [x14] - __ ldsminl(Assembler::xword, r30, r22, r20); // ldsminl x30, x22, [x20] - __ ldsmaxl(Assembler::xword, r7, r5, r24); // ldsmaxl x7, x5, [x24] - __ 
lduminl(Assembler::xword, r23, r16, r15); // lduminl x23, x16, [x15] - __ ldumaxl(Assembler::xword, r11, r19, r0); // ldumaxl x11, x19, [x0] + __ swpl(Assembler::xword, r19, r24, r24); // swpl x19, x24, [x24] + __ ldaddl(Assembler::xword, r8, r26, r30); // ldaddl x8, x26, [x30] + __ ldbicl(Assembler::xword, r17, r18, r8); // ldclrl x17, x18, [x8] + __ ldeorl(Assembler::xword, r2, r3, r3); // ldeorl x2, x3, [x3] + __ ldorrl(Assembler::xword, r26, r7, r16); // ldsetl x26, x7, [x16] + __ ldsminl(Assembler::xword, r27, r6, r5); // ldsminl x27, x6, [x5] + __ ldsmaxl(Assembler::xword, r22, r0, r20); // ldsmaxl x22, x0, [x20] + __ lduminl(Assembler::xword, r11, r26, r2); // lduminl x11, x26, [x2] + __ ldumaxl(Assembler::xword, r30, r29, r4); // ldumaxl x30, x29, [x4] // LSEOp - __ swp(Assembler::word, r28, r28, r1); // swp w28, w28, [x1] - __ ldadd(Assembler::word, r11, r21, r12); // ldadd w11, w21, [x12] - __ ldbic(Assembler::word, r29, r0, r18); // ldclr w29, w0, [x18] - __ ldeor(Assembler::word, r5, r0, r25); // ldeor w5, w0, [x25] - __ ldorr(Assembler::word, r14, r0, r26); // ldset w14, w0, [x26] - __ ldsmin(Assembler::word, r28, r18, r29); // ldsmin w28, w18, [x29] - __ ldsmax(Assembler::word, r15, r1, r29); // ldsmax w15, w1, [x29] - __ ldumin(Assembler::word, r8, r26, r28); // ldumin w8, w26, [x28] - __ ldumax(Assembler::word, r17, r14, r4); // ldumax w17, w14, [x4] + __ swp(Assembler::word, r4, r5, r7); // swp w4, w5, [x7] + __ ldadd(Assembler::word, r10, r26, r2); // ldadd w10, w26, [x2] + __ ldbic(Assembler::word, r27, r16, r27); // ldclr w27, w16, [x27] + __ ldeor(Assembler::word, zr, r23, r10); // ldeor wzr, w23, [x10] + __ ldorr(Assembler::word, r4, r2, r13); // ldset w4, w2, [x13] + __ ldsmin(Assembler::word, r3, r15, r3); // ldsmin w3, w15, [x3] + __ ldsmax(Assembler::word, r3, r10, r6); // ldsmax w3, w10, [x6] + __ ldumin(Assembler::word, r8, r11, r10); // ldumin w8, w11, [x10] + __ ldumax(Assembler::word, r29, r30, r13); // ldumax w29, w30, [x13] // LSEOp - __ swpa(Assembler::word, r24, r25, r1); // swpa w24, w25, [x1] - __ ldadda(Assembler::word, r10, r17, r17); // ldadda w10, w17, [x17] - __ ldbica(Assembler::word, r29, r20, r21); // ldclra w29, w20, [x21] - __ ldeora(Assembler::word, r29, r9, r12); // ldeora w29, w9, [x12] - __ ldorra(Assembler::word, r11, r6, r5); // ldseta w11, w6, [x5] - __ ldsmina(Assembler::word, r21, r7, r21); // ldsmina w21, w7, [x21] - __ ldsmaxa(Assembler::word, r10, r23, r12); // ldsmaxa w10, w23, [x12] - __ ldumina(Assembler::word, r21, r5, r10); // ldumina w21, w5, [x10] - __ ldumaxa(Assembler::word, r30, r20, r18); // ldumaxa w30, w20, [x18] + __ swpa(Assembler::word, r11, r17, r20); // swpa w11, w17, [x20] + __ ldadda(Assembler::word, r26, r16, r6); // ldadda w26, w16, [x6] + __ ldbica(Assembler::word, r21, r10, r1); // ldclra w21, w10, [x1] + __ ldeora(Assembler::word, r29, r12, r23); // ldeora w29, w12, [x23] + __ ldorra(Assembler::word, r29, r8, r8); // ldseta w29, w8, [x8] + __ ldsmina(Assembler::word, r11, r10, r14); // ldsmina w11, w10, [x14] + __ ldsmaxa(Assembler::word, r4, r13, r22); // ldsmaxa w4, w13, [x22] + __ ldumina(Assembler::word, r7, r13, r7); // ldumina w7, w13, [x7] + __ ldumaxa(Assembler::word, r14, r0, sp); // ldumaxa w14, w0, [sp] // LSEOp - __ swpal(Assembler::word, r13, r23, r5); // swpal w13, w23, [x5] - __ ldaddal(Assembler::word, r15, r24, r5); // ldaddal w15, w24, [x5] - __ ldbical(Assembler::word, r9, r10, r25); // ldclral w9, w10, [x25] - __ ldeoral(Assembler::word, r20, r17, r17); // ldeoral w20, w17, [x17] - __ 
ldorral(Assembler::word, r12, r18, r30); // ldsetal w12, w18, [x30] - __ ldsminal(Assembler::word, r3, r3, r25); // ldsminal w3, w3, [x25] - __ ldsmaxal(Assembler::word, r26, r25, r10); // ldsmaxal w26, w25, [x10] - __ lduminal(Assembler::word, r2, r11, sp); // lduminal w2, w11, [sp] - __ ldumaxal(Assembler::word, r7, r2, r5); // ldumaxal w7, w2, [x5] + __ swpal(Assembler::word, r17, r2, r28); // swpal w17, w2, [x28] + __ ldaddal(Assembler::word, r19, r11, r10); // ldaddal w19, w11, [x10] + __ ldbical(Assembler::word, r12, r19, r20); // ldclral w12, w19, [x20] + __ ldeoral(Assembler::word, r0, r8, r8); // ldeoral w0, w8, [x8] + __ ldorral(Assembler::word, r17, r3, r24); // ldsetal w17, w3, [x24] + __ ldsminal(Assembler::word, r25, r5, r7); // ldsminal w25, w5, [x7] + __ ldsmaxal(Assembler::word, r16, r30, r9); // ldsmaxal w16, w30, [x9] + __ lduminal(Assembler::word, r10, zr, r14); // lduminal w10, wzr, [x14] + __ ldumaxal(Assembler::word, r17, r19, r11); // ldumaxal w17, w19, [x11] // LSEOp - __ swpl(Assembler::word, r0, r7, r20); // swpl w0, w7, [x20] - __ ldaddl(Assembler::word, r5, zr, r2); // ldaddl w5, wzr, [x2] - __ ldbicl(Assembler::word, r27, r25, r27); // ldclrl w27, w25, [x27] - __ ldeorl(Assembler::word, r30, r24, r26); // ldeorl w30, w24, [x26] - __ ldorrl(Assembler::word, r15, r2, r22); // ldsetl w15, w2, [x22] - __ ldsminl(Assembler::word, r0, r3, sp); // ldsminl w0, w3, [sp] - __ ldsmaxl(Assembler::word, r15, r20, r10); // ldsmaxl w15, w20, [x10] - __ lduminl(Assembler::word, r22, r21, r14); // lduminl w22, w21, [x14] - __ ldumaxl(Assembler::word, r6, r30, r2); // ldumaxl w6, w30, [x2] + __ swpl(Assembler::word, r20, r1, r13); // swpl w20, w1, [x13] + __ ldaddl(Assembler::word, r26, r11, r20); // ldaddl w26, w11, [x20] + __ ldbicl(Assembler::word, r18, r24, r30); // ldclrl w18, w24, [x30] + __ ldeorl(Assembler::word, r12, r25, r20); // ldeorl w12, w25, [x20] + __ ldorrl(Assembler::word, r14, r29, r5); // ldsetl w14, w29, [x5] + __ ldsminl(Assembler::word, r2, r26, r27); // ldsminl w2, w26, [x27] + __ ldsmaxl(Assembler::word, r25, r27, r11); // ldsmaxl w25, w27, [x11] + __ lduminl(Assembler::word, r4, r29, r7); // lduminl w4, w29, [x7] + __ ldumaxl(Assembler::word, r16, r29, r10); // ldumaxl w16, w29, [x10] __ bind(forth); @@ -762,567 +762,567 @@ aarch64ops.o: file format elf64-littleaarch64 Disassembly of section .text: 0000000000000000 : - 0: 8b50798f add x15, x12, x16, lsr #30 - 4: cb4381e1 sub x1, x15, x3, lsr #32 - 8: ab05372d adds x13, x25, x5, lsl #13 - c: eb864796 subs x22, x28, x6, asr #17 - 10: 0b961920 add w0, w9, w22, asr #6 - 14: 4b195473 sub w19, w3, w25, lsl #21 - 18: 2b0b5264 adds w4, w19, w11, lsl #20 - 1c: 6b9300f8 subs w24, w7, w19, asr #0 - 20: 8a0bc0fe and x30, x7, x11, lsl #48 - 24: aa0f3118 orr x24, x8, x15, lsl #12 - 28: ca170531 eor x17, x9, x23, lsl #1 - 2c: ea44dd6e ands x14, x11, x4, lsr #55 - 30: 0a4c44f3 and w19, w7, w12, lsr #17 - 34: 2a8b7373 orr w19, w27, w11, asr #28 - 38: 4a567c7e eor w30, w3, w22, lsr #31 - 3c: 6a9c0353 ands w19, w26, w28, asr #0 - 40: 8a3accdd bic x29, x6, x26, lsl #51 - 44: aa318f7a orn x26, x27, x17, lsl #35 - 48: ca2e1495 eon x21, x4, x14, lsl #5 - 4c: eaa015e2 bics x2, x15, x0, asr #5 - 50: 0a2274e2 bic w2, w7, w2, lsl #29 - 54: 2a751598 orn w24, w12, w21, lsr #5 - 58: 4a3309fe eon w30, w15, w19, lsl #2 - 5c: 6ab172fe bics w30, w23, w17, asr #28 - 60: 110a5284 add w4, w20, #0x294 - 64: 310b1942 adds w2, w10, #0x2c6 - 68: 5103d353 sub w19, w26, #0xf4 - 6c: 710125bc subs w28, w13, #0x49 - 70: 910d7bc2 add x2, x30, 
#0x35e - 74: b108fa1b adds x27, x16, #0x23e - 78: d1093536 sub x22, x9, #0x24d - 7c: f10ae824 subs x4, x1, #0x2ba - 80: 120e667c and w28, w19, #0xfffc0fff - 84: 321f6cbb orr w27, w5, #0x1ffffffe - 88: 520f6a9e eor w30, w20, #0xfffe0fff - 8c: 72136f56 ands w22, w26, #0xffffe1ff - 90: 927e4ce5 and x5, x7, #0x3ffffc - 94: b278b4ed orr x13, x7, #0x3fffffffffff00 - 98: d24c6527 eor x7, x9, #0xfff0000000003fff - 9c: f2485803 ands x3, x0, #0xff00000000007fff - a0: 14000000 b a0 - a4: 17ffffd7 b 0 - a8: 140001ee b 860 - ac: 94000000 bl ac - b0: 97ffffd4 bl 0 - b4: 940001eb bl 860 - b8: 34000010 cbz w16, b8 - bc: 34fffa30 cbz w16, 0 - c0: 34003d10 cbz w16, 860 - c4: 35000013 cbnz w19, c4 - c8: 35fff9d3 cbnz w19, 0 - cc: 35003cb3 cbnz w19, 860 - d0: b4000005 cbz x5, d0 - d4: b4fff965 cbz x5, 0 - d8: b4003c45 cbz x5, 860 - dc: b5000004 cbnz x4, dc - e0: b5fff904 cbnz x4, 0 - e4: b5003be4 cbnz x4, 860 - e8: 1000001b adr x27, e8 - ec: 10fff8bb adr x27, 0 - f0: 10003b9b adr x27, 860 - f4: 90000010 adrp x16, 0 - f8: 3640001c tbz w28, #8, f8 - fc: 3647f83c tbz w28, #8, 0 - 100: 36403b1c tbz w28, #8, 860 - 104: 37080001 tbnz w1, #1, 104 - 108: 370ff7c1 tbnz w1, #1, 0 - 10c: 37083aa1 tbnz w1, #1, 860 - 110: 12a437f4 mov w20, #0xde40ffff // #-566165505 - 114: 528c9d67 mov w7, #0x64eb // #25835 - 118: 72838bb1 movk w17, #0x1c5d - 11c: 92c1062e mov x14, #0xfffff7ceffffffff // #-9006546419713 - 120: d287da49 mov x9, #0x3ed2 // #16082 - 124: f2a6d153 movk x19, #0x368a, lsl #16 - 128: 93465ac9 sbfx x9, x22, #6, #17 - 12c: 330b0013 bfi w19, w0, #21, #1 - 130: 530b4e6a ubfx w10, w19, #11, #9 - 134: 934545e4 sbfx x4, x15, #5, #13 - 138: b35370a3 bfxil x3, x5, #19, #10 - 13c: d3510b8c ubfiz x12, x28, #47, #3 - 140: 13960c0f extr w15, w0, w22, #3 - 144: 93ceddc6 ror x6, x14, #55 - 148: 54000000 b.eq 148 // b.none - 14c: 54fff5a0 b.eq 0 // b.none - 150: 54003880 b.eq 860 // b.none - 154: 54000001 b.ne 154 // b.any - 158: 54fff541 b.ne 0 // b.any - 15c: 54003821 b.ne 860 // b.any - 160: 54000002 b.cs 160 // b.hs, b.nlast - 164: 54fff4e2 b.cs 0 // b.hs, b.nlast - 168: 540037c2 b.cs 860 // b.hs, b.nlast - 16c: 54000002 b.cs 16c // b.hs, b.nlast - 170: 54fff482 b.cs 0 // b.hs, b.nlast - 174: 54003762 b.cs 860 // b.hs, b.nlast - 178: 54000003 b.cc 178 // b.lo, b.ul, b.last - 17c: 54fff423 b.cc 0 // b.lo, b.ul, b.last - 180: 54003703 b.cc 860 // b.lo, b.ul, b.last - 184: 54000003 b.cc 184 // b.lo, b.ul, b.last - 188: 54fff3c3 b.cc 0 // b.lo, b.ul, b.last - 18c: 540036a3 b.cc 860 // b.lo, b.ul, b.last - 190: 54000004 b.mi 190 // b.first - 194: 54fff364 b.mi 0 // b.first - 198: 54003644 b.mi 860 // b.first - 19c: 54000005 b.pl 19c // b.nfrst - 1a0: 54fff305 b.pl 0 // b.nfrst - 1a4: 540035e5 b.pl 860 // b.nfrst - 1a8: 54000006 b.vs 1a8 - 1ac: 54fff2a6 b.vs 0 - 1b0: 54003586 b.vs 860 - 1b4: 54000007 b.vc 1b4 - 1b8: 54fff247 b.vc 0 - 1bc: 54003527 b.vc 860 - 1c0: 54000008 b.hi 1c0 // b.pmore - 1c4: 54fff1e8 b.hi 0 // b.pmore - 1c8: 540034c8 b.hi 860 // b.pmore - 1cc: 54000009 b.ls 1cc // b.plast - 1d0: 54fff189 b.ls 0 // b.plast - 1d4: 54003469 b.ls 860 // b.plast - 1d8: 5400000a b.ge 1d8 // b.tcont - 1dc: 54fff12a b.ge 0 // b.tcont - 1e0: 5400340a b.ge 860 // b.tcont - 1e4: 5400000b b.lt 1e4 // b.tstop - 1e8: 54fff0cb b.lt 0 // b.tstop - 1ec: 540033ab b.lt 860 // b.tstop - 1f0: 5400000c b.gt 1f0 - 1f4: 54fff06c b.gt 0 - 1f8: 5400334c b.gt 860 - 1fc: 5400000d b.le 1fc - 200: 54fff00d b.le 0 - 204: 540032ed b.le 860 - 208: 5400000e b.al 208 - 20c: 54ffefae b.al 0 - 210: 5400328e b.al 860 - 214: 5400000f b.nv 214 - 218: 54ffef4f 
b.nv 0 - 21c: 5400322f b.nv 860 - 220: d40ac601 svc #0x5630 - 224: d40042a2 hvc #0x215 - 228: d404dac3 smc #0x26d6 - 22c: d4224d40 brk #0x126a - 230: d44219c0 hlt #0x10ce - 234: d503201f nop - 238: d69f03e0 eret - 23c: d6bf03e0 drps - 240: d5033fdf isb - 244: d503339f dsb osh - 248: d50335bf dmb nshld - 24c: d61f0280 br x20 - 250: d63f0040 blr x2 - 254: c8127c17 stxr w18, x23, [x0] - 258: c81efec5 stlxr w30, x5, [x22] - 25c: c85f7d05 ldxr x5, [x8] - 260: c85ffe14 ldaxr x20, [x16] - 264: c89ffd66 stlr x6, [x11] - 268: c8dfff66 ldar x6, [x27] - 26c: 880a7cb1 stxr w10, w17, [x5] - 270: 8816fd89 stlxr w22, w9, [x12] - 274: 885f7d1b ldxr w27, [x8] - 278: 885ffc57 ldaxr w23, [x2] - 27c: 889fffba stlr w26, [x29] - 280: 88dffd4d ldar w13, [x10] - 284: 48197f7c stxrh w25, w28, [x27] - 288: 481dfd96 stlxrh w29, w22, [x12] - 28c: 485f7f96 ldxrh w22, [x28] - 290: 485fffc3 ldaxrh w3, [x30] - 294: 489ffdf8 stlrh w24, [x15] - 298: 48dfff5b ldarh w27, [x26] - 29c: 080b7e6a stxrb w11, w10, [x19] - 2a0: 0817fedb stlxrb w23, w27, [x22] - 2a4: 085f7e18 ldxrb w24, [x16] - 2a8: 085ffc38 ldaxrb w24, [x1] - 2ac: 089fffa5 stlrb w5, [x29] - 2b0: 08dffe18 ldarb w24, [x16] - 2b4: c87f6239 ldxp x25, x24, [x17] - 2b8: c87fb276 ldaxp x22, x12, [x19] - 2bc: c820573a stxp w0, x26, x21, [x25] - 2c0: c821aca6 stlxp w1, x6, x11, [x5] - 2c4: 887f388d ldxp w13, w14, [x4] - 2c8: 887f88d1 ldaxp w17, w2, [x6] - 2cc: 882f2643 stxp w15, w3, w9, [x18] - 2d0: 88329131 stlxp w18, w17, w4, [x9] - 2d4: f81cf2b7 stur x23, [x21, #-49] - 2d8: b803f055 stur w21, [x2, #63] - 2dc: 39002f9b strb w27, [x28, #11] - 2e0: 781f31fd sturh w29, [x15, #-13] - 2e4: f85d33ce ldur x14, [x30, #-45] - 2e8: b843539d ldur w29, [x28, #53] - 2ec: 39401f54 ldrb w20, [x26, #7] - 2f0: 785ce059 ldurh w25, [x2, #-50] - 2f4: 389f1143 ldursb x3, [x10, #-15] - 2f8: 788131ee ldursh x14, [x15, #19] - 2fc: 78dfb17d ldursh w29, [x11, #-5] - 300: b89b90af ldursw x15, [x5, #-71] - 304: fc403193 ldur d19, [x12, #3] - 308: bc42a36c ldur s12, [x27, #42] - 30c: fc07d396 stur d22, [x28, #125] - 310: bc1ec1f8 stur s24, [x15, #-20] - 314: f81e8f88 str x8, [x28, #-24]! - 318: b8025de6 str w6, [x15, #37]! - 31c: 38007c27 strb w7, [x1, #7]! - 320: 7801ee20 strh w0, [x17, #30]! - 324: f8454fb9 ldr x25, [x29, #84]! - 328: b85cce9a ldr w26, [x20, #-52]! - 32c: 385e7fba ldrb w26, [x29, #-25]! - 330: 7841af24 ldrh w4, [x25, #26]! - 334: 389ebd1c ldrsb x28, [x8, #-21]! - 338: 789fadd1 ldrsh x17, [x14, #-6]! - 33c: 78c0aefc ldrsh w28, [x23, #10]! - 340: b89c0f7e ldrsw x30, [x27, #-64]! - 344: fc50efd4 ldr d20, [x30, #-242]! - 348: bc414f71 ldr s17, [x27, #20]! - 34c: fc011c67 str d7, [x3, #17]! - 350: bc1f0d6d str s13, [x11, #-16]! 
- 354: f81c3526 str x6, [x9], #-61 - 358: b81e34b0 str w16, [x5], #-29 - 35c: 3800f7bd strb w29, [x29], #15 - 360: 78012684 strh w4, [x20], #18 - 364: f842e653 ldr x19, [x18], #46 - 368: b8417456 ldr w22, [x2], #23 - 36c: 385e2467 ldrb w7, [x3], #-30 - 370: 785e358b ldrh w11, [x12], #-29 - 374: 389e34c8 ldrsb x8, [x6], #-29 - 378: 788046f8 ldrsh x24, [x23], #4 - 37c: 78c00611 ldrsh w17, [x16], #0 - 380: b89f8680 ldrsw x0, [x20], #-8 - 384: fc582454 ldr d20, [x2], #-126 - 388: bc5987d3 ldr s19, [x30], #-104 - 38c: fc076624 str d4, [x17], #118 - 390: bc190675 str s21, [x19], #-112 - 394: f833785a str x26, [x2, x19, lsl #3] - 398: b82fd809 str w9, [x0, w15, sxtw #2] - 39c: 3821799a strb w26, [x12, x1, lsl #0] - 3a0: 782a7975 strh w21, [x11, x10, lsl #1] - 3a4: f870eaf0 ldr x16, [x23, x16, sxtx] - 3a8: b871d96a ldr w10, [x11, w17, sxtw #2] - 3ac: 386b7aed ldrb w13, [x23, x11, lsl #0] - 3b0: 7875689b ldrh w27, [x4, x21] - 3b4: 38afd91a ldrsb x26, [x8, w15, sxtw #0] - 3b8: 78a2c955 ldrsh x21, [x10, w2, sxtw] - 3bc: 78ee6bc8 ldrsh w8, [x30, x14] - 3c0: b8b4f9dd ldrsw x29, [x14, x20, sxtx #2] - 3c4: fc76eb7e ldr d30, [x27, x22, sxtx] - 3c8: bc76692d ldr s13, [x9, x22] - 3cc: fc31db28 str d8, [x25, w17, sxtw #3] - 3d0: bc255b01 str s1, [x24, w5, uxtw #2] - 3d4: f91c52aa str x10, [x21, #14496] - 3d8: b91c3fb2 str w18, [x29, #7228] - 3dc: 391f8877 strb w23, [x3, #2018] - 3e0: 791ac97c strh w28, [x11, #3428] - 3e4: f95c1758 ldr x24, [x26, #14376] - 3e8: b95b3c55 ldr w21, [x2, #6972] - 3ec: 395ce0a4 ldrb w4, [x5, #1848] - 3f0: 795851ce ldrh w14, [x14, #3112] - 3f4: 399e9f64 ldrsb x4, [x27, #1959] - 3f8: 79993764 ldrsh x4, [x27, #3226] - 3fc: 79d9af8a ldrsh w10, [x28, #3286] - 400: b99eea2a ldrsw x10, [x17, #7912] - 404: fd5a2f8d ldr d13, [x28, #13400] - 408: bd5dac78 ldr s24, [x3, #7596] - 40c: fd1e0182 str d2, [x12, #15360] - 410: bd195c31 str s17, [x1, #6492] - 414: 58000010 ldr x16, 414 - 418: 1800000d ldr w13, 418 - 41c: f8981240 prfum pldl1keep, [x18, #-127] - 420: d8ffdf00 prfm pldl1keep, 0 - 424: f8a27a80 prfm pldl1keep, [x20, x2, lsl #3] - 428: f99af920 prfm pldl1keep, [x9, #13808] - 42c: 1a0202e8 adc w8, w23, w2 - 430: 3a130078 adcs w24, w3, w19 - 434: 5a1d0316 sbc w22, w24, w29 - 438: 7a03036c sbcs w12, w27, w3 - 43c: 9a0102eb adc x11, x23, x1 - 440: ba1700bd adcs x29, x5, x23 - 444: da0c0329 sbc x9, x25, x12 - 448: fa16000c sbcs x12, x0, x22 - 44c: 0b23459a add w26, w12, w3, uxtw #1 - 450: 2b328a14 adds w20, w16, w18, sxtb #2 - 454: cb274bde sub x30, x30, w7, uxtw #2 - 458: 6b222eab subs w11, w21, w2, uxth #3 - 45c: 8b214b42 add x2, x26, w1, uxtw #2 - 460: ab34a7b2 adds x18, x29, w20, sxth #1 - 464: cb24520e sub x14, x16, w4, uxtw #4 - 468: eb378e20 subs x0, x17, w23, sxtb #3 - 46c: 3a565283 ccmn w20, w22, #0x3, pl // pl = nfrst - 470: 7a420321 ccmp w25, w2, #0x1, eq // eq = none - 474: ba58c247 ccmn x18, x24, #0x7, gt - 478: fa4d5106 ccmp x8, x13, #0x6, pl // pl = nfrst - 47c: 3a426924 ccmn w9, #0x2, #0x4, vs - 480: 7a5b0847 ccmp w2, #0x1b, #0x7, eq // eq = none - 484: ba413a02 ccmn x16, #0x1, #0x2, cc // cc = lo, ul, last - 488: fa5fba23 ccmp x17, #0x1f, #0x3, lt // lt = tstop - 48c: 1a979377 csel w23, w27, w23, ls // ls = plast - 490: 1a86640a csinc w10, w0, w6, vs - 494: 5a89300b csinv w11, w0, w9, cc // cc = lo, ul, last - 498: 5a923771 csneg w17, w27, w18, cc // cc = lo, ul, last - 49c: 9a8b720c csel x12, x16, x11, vc - 4a0: 9a868786 csinc x6, x28, x6, hi // hi = pmore - 4a4: da9a736d csinv x13, x27, x26, vc - 4a8: da9256dd csneg x29, x22, x18, pl // pl = nfrst - 4ac: 5ac0026c rbit 
w12, w19 - 4b0: 5ac00657 rev16 w23, w18 - 4b4: 5ac00b89 rev w9, w28 - 4b8: 5ac01262 clz w2, w19 - 4bc: 5ac017b9 cls w25, w29 - 4c0: dac002e4 rbit x4, x23 - 4c4: dac0065d rev16 x29, x18 - 4c8: dac00907 rev32 x7, x8 - 4cc: dac00e2d rev x13, x17 - 4d0: dac01011 clz x17, x0 - 4d4: dac01752 cls x18, x26 - 4d8: 1ad0098b udiv w11, w12, w16 - 4dc: 1ac70d24 sdiv w4, w9, w7 - 4e0: 1ad020ec lsl w12, w7, w16 - 4e4: 1ad72613 lsr w19, w16, w23 - 4e8: 1ac62887 asr w7, w4, w6 - 4ec: 1ad72e95 ror w21, w20, w23 - 4f0: 9adc0990 udiv x16, x12, x28 - 4f4: 9acd0d84 sdiv x4, x12, x13 - 4f8: 9ac721a9 lsl x9, x13, x7 - 4fc: 9acf277c lsr x28, x27, x15 - 500: 9ace2bd4 asr x20, x30, x14 - 504: 9ade2e4e ror x14, x18, x30 - 508: 9bc77d63 umulh x3, x11, x7 - 50c: 9b587e97 smulh x23, x20, x24 - 510: 1b1524a2 madd w2, w5, w21, w9 - 514: 1b04a318 msub w24, w24, w4, w8 - 518: 9b0f4d8b madd x11, x12, x15, x19 - 51c: 9b0ce73d msub x29, x25, x12, x25 - 520: 9b2c5971 smaddl x17, w11, w12, x22 - 524: 9b34c87c smsubl x28, w3, w20, x18 - 528: 9bbc6887 umaddl x7, w4, w28, x26 - 52c: 9bb19556 umsubl x22, w10, w17, x5 - 530: 1e310871 fmul s17, s3, s17 - 534: 1e261a2b fdiv s11, s17, s6 - 538: 1e2928fd fadd s29, s7, s9 - 53c: 1e333987 fsub s7, s12, s19 - 540: 1e230ae0 fmul s0, s23, s3 - 544: 1e75087a fmul d26, d3, d21 - 548: 1e651a60 fdiv d0, d19, d5 - 54c: 1e692b40 fadd d0, d26, d9 - 550: 1e753ab9 fsub d25, d21, d21 - 554: 1e7309b0 fmul d16, d13, d19 - 558: 1f00425d fmadd s29, s18, s0, s16 - 55c: 1f1d95b7 fmsub s23, s13, s29, s5 - 560: 1f2a38e9 fnmadd s9, s7, s10, s14 - 564: 1f2f5f99 fnmadd s25, s28, s15, s23 - 568: 1f5545a6 fmadd d6, d13, d21, d17 - 56c: 1f429ea3 fmsub d3, d21, d2, d7 - 570: 1f65472a fnmadd d10, d25, d5, d17 - 574: 1f7449ce fnmadd d14, d14, d20, d18 - 578: 1e20404f fmov s15, s2 - 57c: 1e20c0f2 fabs s18, s7 - 580: 1e2140c3 fneg s3, s6 - 584: 1e21c02c fsqrt s12, s1 - 588: 1e22c009 fcvt d9, s0 - 58c: 1e6040a4 fmov d4, d5 - 590: 1e60c1e3 fabs d3, d15 - 594: 1e614331 fneg d17, d25 - 598: 1e61c30c fsqrt d12, d24 - 59c: 1e6240b5 fcvt s21, d5 - 5a0: 1e3802a4 fcvtzs w4, s21 - 5a4: 9e38007b fcvtzs x27, s3 - 5a8: 1e78011d fcvtzs w29, d8 - 5ac: 9e7802a9 fcvtzs x9, d21 - 5b0: 1e2203b4 scvtf s20, w29 - 5b4: 9e220107 scvtf s7, x8 - 5b8: 1e6202ac scvtf d12, w21 - 5bc: 9e6202b0 scvtf d16, x21 - 5c0: 1e2600b2 fmov w18, s5 - 5c4: 9e660119 fmov x25, d8 - 5c8: 1e270352 fmov s18, w26 - 5cc: 9e670160 fmov d0, x11 - 5d0: 1e262200 fcmp s16, s6 - 5d4: 1e7d2200 fcmp d16, d29 - 5d8: 1e2023c8 fcmp s30, #0.0 - 5dc: 1e602128 fcmp d9, #0.0 - 5e0: 293e119b stp w27, w4, [x12, #-16] - 5e4: 294a2543 ldp w3, w9, [x10, #80] - 5e8: 69480c70 ldpsw x16, x3, [x3, #64] - 5ec: a934726a stp x10, x28, [x19, #-192] - 5f0: a97448f3 ldp x19, x18, [x7, #-192] - 5f4: 298243ca stp w10, w16, [x30, #16]! - 5f8: 29e21242 ldp w2, w4, [x18, #-240]! - 5fc: 69c64db8 ldpsw x24, x19, [x13, #48]! - 600: a9800311 stp x17, x0, [x24, #0]! - 604: a9f4686e ldp x14, x26, [x3, #-192]! 
- 608: 288a0416 stp w22, w1, [x0], #80 - 60c: 28fe2812 ldp w18, w10, [x0], #-16 - 610: 68fe62d8 .inst 0x68fe62d8 ; undefined - 614: a885308c stp x12, x12, [x4], #80 - 618: a8f12664 ldp x4, x9, [x19], #-240 - 61c: 282468d2 stnp w18, w26, [x6, #-224] - 620: 284e5035 ldnp w21, w20, [x1, #112] - 624: a8327699 stnp x25, x29, [x20, #-224] - 628: a84716e1 ldnp x1, x5, [x23, #112] - 62c: 0c407284 ld1 {v4.8b}, [x20] - 630: 4cdfa158 ld1 {v24.16b, v25.16b}, [x10], #32 - 634: 0ccf6cd8 ld1 {v24.1d-v26.1d}, [x6], x15 - 638: 4cdf2483 ld1 {v3.8h-v6.8h}, [x4], #64 - 63c: 0d40c0c2 ld1r {v2.8b}, [x6] - 640: 4ddfc9cd ld1r {v13.4s}, [x14], #4 - 644: 0dd8ceaf ld1r {v15.1d}, [x21], x24 - 648: 4c408ea9 ld2 {v9.2d, v10.2d}, [x21] - 64c: 0cdf86bd ld2 {v29.4h, v30.4h}, [x21], #16 - 650: 4d60c1c8 ld2r {v8.16b, v9.16b}, [x14] - 654: 0dffca87 ld2r {v7.2s, v8.2s}, [x20], #8 - 658: 4de3cc7c ld2r {v28.2d, v29.2d}, [x3], x3 - 65c: 4cdd497b ld3 {v27.4s-v29.4s}, [x11], x29 - 660: 0c404950 ld3 {v16.2s-v18.2s}, [x10] - 664: 4d40e595 ld3r {v21.8h-v23.8h}, [x12] - 668: 4ddfeba4 ld3r {v4.4s-v6.4s}, [x29], #12 - 66c: 0dd3ed38 ld3r {v24.1d-v26.1d}, [x9], x19 - 670: 4cdf046a ld4 {v10.8h-v13.8h}, [x3], #64 - 674: 0cc9039b ld4 {v27.8b-v30.8b}, [x28], x9 - 678: 0d60e3d5 ld4r {v21.8b-v24.8b}, [x30] - 67c: 0dffe5d7 ld4r {v23.4h-v26.4h}, [x14], #8 - 680: 0df4e9a4 ld4r {v4.2s-v7.2s}, [x13], x20 - 684: ba5fd3e3 ccmn xzr, xzr, #0x3, le - 688: 3a5f03e5 ccmn wzr, wzr, #0x5, eq // eq = none - 68c: fa411be4 ccmp xzr, #0x1, #0x4, ne // ne = any - 690: 7a42cbe2 ccmp wzr, #0x2, #0x2, gt - 694: 93df03ff ror xzr, xzr, #0 - 698: c820ffff stlxp w0, xzr, xzr, [sp] - 69c: 8822fc7f stlxp w2, wzr, wzr, [x3] - 6a0: c8247cbf stxp w4, xzr, xzr, [x5] - 6a4: 88267fff stxp w6, wzr, wzr, [sp] - 6a8: 4e010fe0 dup v0.16b, wzr - 6ac: 4e081fe1 mov v1.d[0], xzr - 6b0: 4e0c1fe1 mov v1.s[1], wzr - 6b4: 4e0a1fe1 mov v1.h[2], wzr - 6b8: 4e071fe1 mov v1.b[3], wzr - 6bc: 4cc0ac3f ld1 {v31.2d, v0.2d}, [x1], x0 - 6c0: 1e601000 fmov d0, #2.000000000000000000e+00 - 6c4: 1e603000 fmov d0, #2.125000000000000000e+00 - 6c8: 1e621000 fmov d0, #4.000000000000000000e+00 - 6cc: 1e623000 fmov d0, #4.250000000000000000e+00 - 6d0: 1e641000 fmov d0, #8.000000000000000000e+00 - 6d4: 1e643000 fmov d0, #8.500000000000000000e+00 - 6d8: 1e661000 fmov d0, #1.600000000000000000e+01 - 6dc: 1e663000 fmov d0, #1.700000000000000000e+01 - 6e0: 1e681000 fmov d0, #1.250000000000000000e-01 - 6e4: 1e683000 fmov d0, #1.328125000000000000e-01 - 6e8: 1e6a1000 fmov d0, #2.500000000000000000e-01 - 6ec: 1e6a3000 fmov d0, #2.656250000000000000e-01 - 6f0: 1e6c1000 fmov d0, #5.000000000000000000e-01 - 6f4: 1e6c3000 fmov d0, #5.312500000000000000e-01 - 6f8: 1e6e1000 fmov d0, #1.000000000000000000e+00 - 6fc: 1e6e3000 fmov d0, #1.062500000000000000e+00 - 700: 1e701000 fmov d0, #-2.000000000000000000e+00 - 704: 1e703000 fmov d0, #-2.125000000000000000e+00 - 708: 1e721000 fmov d0, #-4.000000000000000000e+00 - 70c: 1e723000 fmov d0, #-4.250000000000000000e+00 - 710: 1e741000 fmov d0, #-8.000000000000000000e+00 - 714: 1e743000 fmov d0, #-8.500000000000000000e+00 - 718: 1e761000 fmov d0, #-1.600000000000000000e+01 - 71c: 1e763000 fmov d0, #-1.700000000000000000e+01 - 720: 1e781000 fmov d0, #-1.250000000000000000e-01 - 724: 1e783000 fmov d0, #-1.328125000000000000e-01 - 728: 1e7a1000 fmov d0, #-2.500000000000000000e-01 - 72c: 1e7a3000 fmov d0, #-2.656250000000000000e-01 - 730: 1e7c1000 fmov d0, #-5.000000000000000000e-01 - 734: 1e7c3000 fmov d0, #-5.312500000000000000e-01 - 738: 1e7e1000 fmov d0, #-1.000000000000000000e+00 
- 73c: 1e7e3000 fmov d0, #-1.062500000000000000e+00 - 740: f8358305 swp x21, x5, [x24] - 744: f82d01ed ldadd x13, x13, [x15] - 748: f8361353 ldclr x22, x19, [x26] - 74c: f839234a ldeor x25, x10, [x26] - 750: f82531fb ldset x5, x27, [x15] - 754: f8335165 ldsmin x19, x5, [x11] - 758: f83a4080 ldsmax x26, x0, [x4] - 75c: f83673d7 ldumin x22, x23, [x30] - 760: f832611c ldumax x18, x28, [x8] - 764: f8ad837d swpa x13, x29, [x27] - 768: f8ab01a5 ldadda x11, x5, [x13] - 76c: f8a112b8 ldclra x1, x24, [x21] - 770: f8bb2311 ldeora x27, x17, [x24] - 774: f8b230be ldseta x18, x30, [x5] - 778: f8a75336 ldsmina x7, x22, [x25] - 77c: f8a4427a ldsmaxa x4, x26, [x19] - 780: f8a6707e ldumina x6, x30, [x3] - 784: f8b860b7 ldumaxa x24, x23, [x5] - 788: f8f88392 swpal x24, x18, [x28] - 78c: f8f300ff ldaddal x19, xzr, [x7] - 790: f8ed1386 ldclral x13, x6, [x28] - 794: f8e822af ldeoral x8, x15, [x21] - 798: f8e2302d ldsetal x2, x13, [x1] - 79c: f8f1533d ldsminal x17, x29, [x25] - 7a0: f8f941d2 ldsmaxal x25, x18, [x14] - 7a4: f8ff7366 lduminal xzr, x6, [x27] - 7a8: f8f061e5 ldumaxal x16, x5, [x15] - 7ac: f86b8072 swpl x11, x18, [x3] - 7b0: f87a0054 ldaddl x26, x20, [x2] - 7b4: f86b1164 ldclrl x11, x4, [x11] - 7b8: f87e22f3 ldeorl x30, x19, [x23] - 7bc: f86331cf ldsetl x3, x15, [x14] - 7c0: f87e5296 ldsminl x30, x22, [x20] - 7c4: f8674305 ldsmaxl x7, x5, [x24] - 7c8: f87771f0 lduminl x23, x16, [x15] - 7cc: f86b6013 ldumaxl x11, x19, [x0] - 7d0: b83c803c swp w28, w28, [x1] - 7d4: b82b0195 ldadd w11, w21, [x12] - 7d8: b83d1240 ldclr w29, w0, [x18] - 7dc: b8252320 ldeor w5, w0, [x25] - 7e0: b82e3340 ldset w14, w0, [x26] - 7e4: b83c53b2 ldsmin w28, w18, [x29] - 7e8: b82f43a1 ldsmax w15, w1, [x29] - 7ec: b828739a ldumin w8, w26, [x28] - 7f0: b831608e ldumax w17, w14, [x4] - 7f4: b8b88039 swpa w24, w25, [x1] - 7f8: b8aa0231 ldadda w10, w17, [x17] - 7fc: b8bd12b4 ldclra w29, w20, [x21] - 800: b8bd2189 ldeora w29, w9, [x12] - 804: b8ab30a6 ldseta w11, w6, [x5] - 808: b8b552a7 ldsmina w21, w7, [x21] - 80c: b8aa4197 ldsmaxa w10, w23, [x12] - 810: b8b57145 ldumina w21, w5, [x10] - 814: b8be6254 ldumaxa w30, w20, [x18] - 818: b8ed80b7 swpal w13, w23, [x5] - 81c: b8ef00b8 ldaddal w15, w24, [x5] - 820: b8e9132a ldclral w9, w10, [x25] - 824: b8f42231 ldeoral w20, w17, [x17] - 828: b8ec33d2 ldsetal w12, w18, [x30] - 82c: b8e35323 ldsminal w3, w3, [x25] - 830: b8fa4159 ldsmaxal w26, w25, [x10] - 834: b8e273eb lduminal w2, w11, [sp] - 838: b8e760a2 ldumaxal w7, w2, [x5] - 83c: b8608287 swpl w0, w7, [x20] - 840: b865005f staddl w5, [x2] - 844: b87b1379 ldclrl w27, w25, [x27] - 848: b87e2358 ldeorl w30, w24, [x26] - 84c: b86f32c2 ldsetl w15, w2, [x22] - 850: b86053e3 ldsminl w0, w3, [sp] - 854: b86f4154 ldsmaxl w15, w20, [x10] - 858: b87671d5 lduminl w22, w21, [x14] - 85c: b866605e ldumaxl w6, w30, [x2] + 0: 8b4db437 add x23, x1, x13, lsr #45 + 4: cb8ce3c8 sub x8, x30, x12, asr #56 + 8: ab0edafb adds x27, x23, x14, lsl #54 + c: eb5499f5 subs x21, x15, x20, lsr #38 + 10: 0b040e39 add w25, w17, w4, lsl #3 + 14: 4b89503d sub w29, w1, w9, asr #20 + 18: 2b89274a adds w10, w26, w9, asr #9 + 1c: 6b870fd5 subs w21, w30, w7, asr #3 + 20: 8a4b1109 and x9, x8, x11, lsr #4 + 24: aa810643 orr x3, x18, x1, asr #1 + 28: ca026e8a eor x10, x20, x2, lsl #27 + 2c: ea8b7d2c ands x12, x9, x11, asr #31 + 30: 0a9e6934 and w20, w9, w30, asr #26 + 34: 2a9a4555 orr w21, w10, w26, asr #17 + 38: 4a871d00 eor w0, w8, w7, asr #7 + 3c: 6a084973 ands w19, w11, w8, lsl #18 + 40: 8a23d497 bic x23, x4, x3, lsl #53 + 44: aa3360c9 orn x9, x6, x19, lsl #24 + 48: ca7ad8cc 
eon x12, x6, x26, lsr #54 + 4c: ea2c3a76 bics x22, x19, x12, lsl #14 + 50: 0a362dbd bic w29, w13, w22, lsl #11 + 54: 2ab417d1 orn w17, w30, w20, asr #5 + 58: 4a2b23a1 eon w1, w29, w11, lsl #8 + 5c: 6a667684 bics w4, w20, w6, lsr #29 + 60: 1107e0de add w30, w6, #0x1f8 + 64: 310ebd13 adds w19, w8, #0x3af + 68: 5105b55d sub w29, w10, #0x16d + 6c: 71047104 subs w4, w8, #0x11c + 70: 910ef9c3 add x3, x14, #0x3be + 74: b1029e96 adds x22, x20, #0xa7 + 78: d10b55fb sub x27, x15, #0x2d5 + 7c: f10ecf98 subs x24, x28, #0x3b3 + 80: 12099f39 and w25, w25, #0x7f807f80 + 84: 321b3f4d orr w13, w26, #0x1fffe0 + 88: 520309b5 eor w21, w13, #0xe0000000 + 8c: 72134062 ands w2, w3, #0x3fffe000 + 90: 92004548 and x8, x10, #0x3ffff0003ffff + 94: b24d861b orr x27, x16, #0xfff80000001fffff + 98: d219587b eor x27, x3, #0x3fffff803fffff80 + 9c: f25eaee4 ands x4, x23, #0xfffffffc00003fff + a0: 14000000 b a0 + a4: 17ffffd7 b 0 + a8: 140001ee b 860 + ac: 94000000 bl ac + b0: 97ffffd4 bl 0 + b4: 940001eb bl 860 + b8: 34000003 cbz w3, b8 + bc: 34fffa23 cbz w3, 0 + c0: 34003d03 cbz w3, 860 + c4: 35000002 cbnz w2, c4 + c8: 35fff9c2 cbnz w2, 0 + cc: 35003ca2 cbnz w2, 860 + d0: b4000019 cbz x25, d0 + d4: b4fff979 cbz x25, 0 + d8: b4003c59 cbz x25, 860 + dc: b5000012 cbnz x18, dc + e0: b5fff912 cbnz x18, 0 + e4: b5003bf2 cbnz x18, 860 + e8: 10000008 adr x8, e8 + ec: 10fff8a8 adr x8, 0 + f0: 10003b88 adr x8, 860 + f4: 9000000f adrp x15, 0 + f8: 36700012 tbz w18, #14, f8 + fc: 3677f832 tbz w18, #14, 0 + 100: 36703b12 tbz w18, #14, 860 + 104: 37780019 tbnz w25, #15, 104 + 108: 377ff7d9 tbnz w25, #15, 0 + 10c: 37783ab9 tbnz w25, #15, 860 + 110: 12a203d2 mov w18, #0xefe1ffff // #-270401537 + 114: 5286b21e mov w30, #0x3590 // #13712 + 118: 72a66d35 movk w21, #0x3369, lsl #16 + 11c: 92eded92 mov x18, #0x9093ffffffffffff // #-8028792235694751745 + 120: d2eefecd mov x13, #0x77f6000000000000 // #8644096534784245760 + 124: f2ef69a3 movk x3, #0x7b4d, lsl #48 + 128: 93400c2a sbfx x10, x1, #0, #4 + 12c: 330562cc bfxil w12, w22, #5, #20 + 130: 530b2071 ubfiz w17, w3, #21, #9 + 134: 934b3860 sbfx x0, x3, #11, #4 + 138: b3473cdc bfxil x28, x6, #7, #9 + 13c: d3416549 ubfx x9, x10, #1, #25 + 140: 13995f75 extr w21, w27, w25, #23 + 144: 93d6462e extr x14, x17, x22, #17 + 148: 54000000 b.eq 148 // b.none + 14c: 54fff5a0 b.eq 0 // b.none + 150: 54003880 b.eq 860 // b.none + 154: 54000001 b.ne 154 // b.any + 158: 54fff541 b.ne 0 // b.any + 15c: 54003821 b.ne 860 // b.any + 160: 54000002 b.cs 160 // b.hs, b.nlast + 164: 54fff4e2 b.cs 0 // b.hs, b.nlast + 168: 540037c2 b.cs 860 // b.hs, b.nlast + 16c: 54000002 b.cs 16c // b.hs, b.nlast + 170: 54fff482 b.cs 0 // b.hs, b.nlast + 174: 54003762 b.cs 860 // b.hs, b.nlast + 178: 54000003 b.cc 178 // b.lo, b.ul, b.last + 17c: 54fff423 b.cc 0 // b.lo, b.ul, b.last + 180: 54003703 b.cc 860 // b.lo, b.ul, b.last + 184: 54000003 b.cc 184 // b.lo, b.ul, b.last + 188: 54fff3c3 b.cc 0 // b.lo, b.ul, b.last + 18c: 540036a3 b.cc 860 // b.lo, b.ul, b.last + 190: 54000004 b.mi 190 // b.first + 194: 54fff364 b.mi 0 // b.first + 198: 54003644 b.mi 860 // b.first + 19c: 54000005 b.pl 19c // b.nfrst + 1a0: 54fff305 b.pl 0 // b.nfrst + 1a4: 540035e5 b.pl 860 // b.nfrst + 1a8: 54000006 b.vs 1a8 + 1ac: 54fff2a6 b.vs 0 + 1b0: 54003586 b.vs 860 + 1b4: 54000007 b.vc 1b4 + 1b8: 54fff247 b.vc 0 + 1bc: 54003527 b.vc 860 + 1c0: 54000008 b.hi 1c0 // b.pmore + 1c4: 54fff1e8 b.hi 0 // b.pmore + 1c8: 540034c8 b.hi 860 // b.pmore + 1cc: 54000009 b.ls 1cc // b.plast + 1d0: 54fff189 b.ls 0 // b.plast + 1d4: 54003469 b.ls 860 // b.plast + 
1d8: 5400000a b.ge 1d8 // b.tcont + 1dc: 54fff12a b.ge 0 // b.tcont + 1e0: 5400340a b.ge 860 // b.tcont + 1e4: 5400000b b.lt 1e4 // b.tstop + 1e8: 54fff0cb b.lt 0 // b.tstop + 1ec: 540033ab b.lt 860 // b.tstop + 1f0: 5400000c b.gt 1f0 + 1f4: 54fff06c b.gt 0 + 1f8: 5400334c b.gt 860 + 1fc: 5400000d b.le 1fc + 200: 54fff00d b.le 0 + 204: 540032ed b.le 860 + 208: 5400000e b.al 208 + 20c: 54ffefae b.al 0 + 210: 5400328e b.al 860 + 214: 5400000f b.nv 214 + 218: 54ffef4f b.nv 0 + 21c: 5400322f b.nv 860 + 220: d40f9ca1 svc #0x7ce5 + 224: d4008b22 hvc #0x459 + 228: d40be1c3 smc #0x5f0e + 22c: d423d0e0 brk #0x1e87 + 230: d44dee20 hlt #0x6f71 + 234: d503201f nop + 238: d69f03e0 eret + 23c: d6bf03e0 drps + 240: d5033fdf isb + 244: d503359f dsb nshld + 248: d50337bf dmb nsh + 24c: d61f0380 br x28 + 250: d63f0220 blr x17 + 254: c8127f47 stxr w18, x7, [x26] + 258: c819fccc stlxr w25, x12, [x6] + 25c: c85f7e00 ldxr x0, [x16] + 260: c85ffc66 ldaxr x6, [x3] + 264: c89ffc2e stlr x14, [x1] + 268: c8dfff1d ldar x29, [x24] + 26c: 881c7eef stxr w28, w15, [x23] + 270: 8809fc67 stlxr w9, w7, [x3] + 274: 885f7e81 ldxr w1, [x20] + 278: 885ffdf4 ldaxr w20, [x15] + 27c: 889ffd35 stlr w21, [x9] + 280: 88dffe25 ldar w5, [x17] + 284: 480d7fd4 stxrh w13, w20, [x30] + 288: 480afe4c stlxrh w10, w12, [x18] + 28c: 485f7e64 ldxrh w4, [x19] + 290: 485ffd56 ldaxrh w22, [x10] + 294: 489ffdfe stlrh w30, [x15] + 298: 48dfff04 ldarh w4, [x24] + 29c: 080a7d94 stxrb w10, w20, [x12] + 2a0: 0814fd7d stlxrb w20, w29, [x11] + 2a4: 085f7cb5 ldxrb w21, [x5] + 2a8: 085ffd24 ldaxrb w4, [x9] + 2ac: 089fff9e stlrb w30, [x28] + 2b0: 08dfff13 ldarb w19, [x24] + 2b4: c87f424b ldxp x11, x16, [x18] + 2b8: c87f9de8 ldaxp x8, x7, [x15] + 2bc: c83c4154 stxp w28, x20, x16, [x10] + 2c0: c827d469 stlxp w7, x9, x21, [x3] + 2c4: 887f1a79 ldxp w25, w6, [x19] + 2c8: 887fa45e ldaxp w30, w9, [x2] + 2cc: 88305180 stxp w16, w0, w20, [x12] + 2d0: 88259f82 stlxp w5, w2, w7, [x28] + 2d4: f81b5270 stur x16, [x19, #-75] + 2d8: b801e381 stur w1, [x28, #30] + 2dc: 381e61bc sturb w28, [x13, #-26] + 2e0: 781cd0c8 sturh w8, [x6, #-51] + 2e4: f851d380 ldur x0, [x28, #-227] + 2e8: b85e615c ldur w28, [x10, #-26] + 2ec: 39403164 ldrb w4, [x11, #12] + 2f0: 78405221 ldurh w1, [x17, #5] + 2f4: 3980312b ldrsb x11, [x9, #12] + 2f8: 789ef108 ldursh x8, [x8, #-17] + 2fc: 78ddd1b4 ldursh w20, [x13, #-35] + 300: b8831137 ldursw x23, [x9, #49] + 304: fc41d089 ldur d9, [x4, #29] + 308: bd402a6b ldr s11, [x19, #40] + 30c: fc1d5299 stur d25, [x20, #-43] + 310: bc1b0039 stur s25, [x1, #-80] + 314: f8019c14 str x20, [x0, #25]! + 318: b81cfd8c str w12, [x12, #-49]! + 31c: 381f6e7c strb w28, [x19, #-10]! + 320: 781c1f8d strh w13, [x28, #-63]! + 324: f85d2eeb ldr x11, [x23, #-46]! + 328: b8411f1b ldr w27, [x24, #17]! + 32c: 385f4f4e ldrb w14, [x26, #-12]! + 330: 785d3ed8 ldrh w24, [x22, #-45]! + 334: 389f5d39 ldrsb x25, [x9, #-11]! + 338: 7881dcc5 ldrsh x5, [x6, #29]! + 33c: 78dffee7 ldrsh w7, [x23, #-1]! + 340: b89c3dba ldrsw x26, [x13, #-61]! + 344: fc50bf18 ldr d24, [x24, #-245]! + 348: bc5c9f34 ldr s20, [x25, #-55]! + 34c: fc135c49 str d9, [x2, #-203]! + 350: bc1c5c2e str s14, [x1, #-59]! 
+ 354: f806d433 str x19, [x1], #109 + 358: b81ca4a4 str w4, [x5], #-54 + 35c: 3800947d strb w29, [x3], #9 + 360: 781ce420 strh w0, [x1], #-50 + 364: f85d04c2 ldr x2, [x6], #-48 + 368: b858d4cf ldr w15, [x6], #-115 + 36c: 385e5444 ldrb w4, [x2], #-27 + 370: 785eb751 ldrh w17, [x26], #-21 + 374: 389f3715 ldrsb x21, [x24], #-13 + 378: 789d04d6 ldrsh x22, [x6], #-48 + 37c: 78dd04cb ldrsh w11, [x6], #-48 + 380: b89fb7ce ldrsw x14, [x30], #-5 + 384: fc5975e2 ldr d2, [x15], #-105 + 388: bc5a5679 ldr s25, [x19], #-91 + 38c: fc1416ed str d13, [x23], #-191 + 390: bc0006b6 str s22, [x21], #0 + 394: f832c996 str x22, [x12, w18, sxtw] + 398: b82c4b7e str w30, [x27, w12, uxtw] + 39c: 38367887 strb w7, [x4, x22, lsl #0] + 3a0: 783dfaf3 strh w19, [x23, x29, sxtx #1] + 3a4: f87bf891 ldr x17, [x4, x27, sxtx #3] + 3a8: b871c9a1 ldr w1, [x13, w17, sxtw] + 3ac: 387dfb70 ldrb w16, [x27, x29, sxtx #0] + 3b0: 78645939 ldrh w25, [x9, w4, uxtw #1] + 3b4: 38b67984 ldrsb x4, [x12, x22, lsl #0] + 3b8: 78a55839 ldrsh x25, [x1, w5, uxtw #1] + 3bc: 78fc6a09 ldrsh w9, [x16, x28] + 3c0: b8aee8e8 ldrsw x8, [x7, x14, sxtx] + 3c4: fc705b84 ldr d4, [x28, w16, uxtw #3] + 3c8: bc7bd850 ldr s16, [x2, w27, sxtw #2] + 3cc: fc396817 str d23, [x0, x25] + 3d0: bc277a06 str s6, [x16, x7, lsl #2] + 3d4: f91ddd82 str x2, [x12, #15288] + 3d8: b91b10a8 str w8, [x5, #6928] + 3dc: 391f8221 strb w1, [x17, #2016] + 3e0: 79197728 strh w8, [x25, #3258] + 3e4: f95ca07c ldr x28, [x3, #14656] + 3e8: b95b5d75 ldr w21, [x11, #7004] + 3ec: 395dc8af ldrb w15, [x5, #1906] + 3f0: 795caa60 ldrh w0, [x19, #3668] + 3f4: 399dd53d ldrsb x29, [x9, #1909] + 3f8: 799c7397 ldrsh x23, [x28, #3640] + 3fc: 79dcb15b ldrsh w27, [x10, #3672] + 400: b99e3b75 ldrsw x21, [x27, #7736] + 404: fd5c7f7a ldr d26, [x27, #14584] + 408: bd5d2882 ldr s2, [x4, #7464] + 40c: fd1fb2a1 str d1, [x21, #16224] + 410: bd1d82c4 str s4, [x22, #7552] + 414: 58000001 ldr x1, 414 + 418: 1800001b ldr w27, 418 + 41c: f882d080 prfum pldl1keep, [x4, #45] + 420: d8000000 prfm pldl1keep, 420 + 424: f8a0cbc0 prfm pldl1keep, [x30, w0, sxtw] + 428: f99fab00 prfm pldl1keep, [x24, #16208] + 42c: 1a1803a0 adc w0, w29, w24 + 430: 3a120396 adcs w22, w28, w18 + 434: 5a1e0217 sbc w23, w16, w30 + 438: 7a0e03a7 sbcs w7, w29, w14 + 43c: 9a0e0196 adc x22, x12, x14 + 440: ba17031d adcs x29, x24, x23 + 444: da160391 sbc x17, x28, x22 + 448: fa130298 sbcs x24, x20, x19 + 44c: 0b26cadb add w27, w22, w6, sxtw #2 + 450: 2b38516d adds w13, w11, w24, uxtw #4 + 454: cb242d10 sub x16, x8, w4, uxth #3 + 458: 6b34ea55 subs w21, w18, w20, sxtx #2 + 45c: 8b3d0a2e add x14, x17, w29, uxtb #2 + 460: ab2eb231 adds x17, x17, w14, sxth #4 + 464: cb3ac476 sub x22, x3, w26, sxtw #1 + 468: eb3531ad subs x13, x13, w21, uxth #4 + 46c: 3a5a722f ccmn w17, w26, #0xf, vc + 470: 7a463325 ccmp w25, w6, #0x5, cc // cc = lo, ul, last + 474: ba5e9021 ccmn x1, x30, #0x1, ls // ls = plast + 478: fa47a222 ccmp x17, x7, #0x2, ge // ge = tcont + 47c: 3a590a26 ccmn w17, #0x19, #0x6, eq // eq = none + 480: 7a450845 ccmp w2, #0x5, #0x5, eq // eq = none + 484: ba514a6a ccmn x19, #0x11, #0xa, mi // mi = first + 488: fa48c9c3 ccmp x14, #0x8, #0x3, gt + 48c: 1a8e9109 csel w9, w8, w14, ls // ls = plast + 490: 1a85d57b csinc w27, w11, w5, le + 494: 5a9632eb csinv w11, w23, w22, cc // cc = lo, ul, last + 498: 5a9b2793 csneg w19, w28, w27, cs // cs = hs, nlast + 49c: 9a815130 csel x16, x9, x1, pl // pl = nfrst + 4a0: 9a8c05dc csinc x28, x14, x12, eq // eq = none + 4a4: da8e5096 csinv x22, x4, x14, pl // pl = nfrst + 4a8: da9b257a csneg x26, x11, x27, cs // cs = 
hs, nlast + 4ac: 5ac00178 rbit w24, w11 + 4b0: 5ac005ca rev16 w10, w14 + 4b4: 5ac008a9 rev w9, w5 + 4b8: 5ac01292 clz w18, w20 + 4bc: 5ac01519 cls w25, w8 + 4c0: dac00316 rbit x22, x24 + 4c4: dac0077c rev16 x28, x27 + 4c8: dac00ba8 rev32 x8, x29 + 4cc: dac00d51 rev x17, x10 + 4d0: dac01177 clz x23, x11 + 4d4: dac015da cls x26, x14 + 4d8: 1adc0895 udiv w21, w4, w28 + 4dc: 1ad60d5e sdiv w30, w10, w22 + 4e0: 1ada205d lsl w29, w2, w26 + 4e4: 1aca26dc lsr w28, w22, w10 + 4e8: 1acc2b0b asr w11, w24, w12 + 4ec: 1ad02fd5 ror w21, w30, w16 + 4f0: 9acd0801 udiv x1, x0, x13 + 4f4: 9ac60e22 sdiv x2, x17, x6 + 4f8: 9ad5230a lsl x10, x24, x21 + 4fc: 9ac62525 lsr x5, x9, x6 + 500: 9ac42b60 asr x0, x27, x4 + 504: 9ac22c9c ror x28, x4, x2 + 508: 9bc77fc1 umulh x1, x30, x7 + 50c: 9b4a7cbe smulh x30, x5, x10 + 510: 1b0d45e7 madd w7, w15, w13, w17 + 514: 1b0cf039 msub w25, w1, w12, w28 + 518: 9b1e2562 madd x2, x11, x30, x9 + 51c: 9b03dae5 msub x5, x23, x3, x22 + 520: 9b291159 smaddl x25, w10, w9, x4 + 524: 9b27c905 smsubl x5, w8, w7, x18 + 528: 9bba64b8 umaddl x24, w5, w26, x25 + 52c: 9bbaf02e umsubl x14, w1, w26, x28 + 530: 1e280ad8 fmul s24, s22, s8 + 534: 1e261870 fdiv s16, s3, s6 + 538: 1e392ab0 fadd s16, s21, s25 + 53c: 1e3b3b40 fsub s0, s26, s27 + 540: 1e310878 fmul s24, s3, s17 + 544: 1e660909 fmul d9, d8, d6 + 548: 1e7e1a76 fdiv d22, d19, d30 + 54c: 1e632a2e fadd d14, d17, d3 + 550: 1e743b78 fsub d24, d27, d20 + 554: 1e76082c fmul d12, d1, d22 + 558: 1f0b7510 fmadd s16, s8, s11, s29 + 55c: 1f128676 fmsub s22, s19, s18, s1 + 560: 1f38270f fnmadd s15, s24, s24, s9 + 564: 1f2d5e7b fnmadd s27, s19, s13, s23 + 568: 1f503003 fmadd d3, d0, d16, d12 + 56c: 1f52a873 fmsub d19, d3, d18, d10 + 570: 1f6b5041 fnmadd d1, d2, d11, d20 + 574: 1f79392c fnmadd d12, d9, d25, d14 + 578: 1e2042e0 fmov s0, s23 + 57c: 1e20c0d7 fabs s23, s6 + 580: 1e214084 fneg s4, s4 + 584: 1e21c385 fsqrt s5, s28 + 588: 1e22c1f5 fcvt d21, s15 + 58c: 1e6040ab fmov d11, d5 + 590: 1e60c092 fabs d18, d4 + 594: 1e61418b fneg d11, d12 + 598: 1e61c10f fsqrt d15, d8 + 59c: 1e624048 fcvt s8, d2 + 5a0: 1e380253 fcvtzs w19, s18 + 5a4: 9e380011 fcvtzs x17, s0 + 5a8: 1e7801a0 fcvtzs w0, d13 + 5ac: 9e780136 fcvtzs x22, d9 + 5b0: 1e2203a6 scvtf s6, w29 + 5b4: 9e2201cc scvtf s12, x14 + 5b8: 1e6202d0 scvtf d16, w22 + 5bc: 9e6200ae scvtf d14, x5 + 5c0: 1e260007 fmov w7, s0 + 5c4: 9e6600dc fmov x28, d6 + 5c8: 1e270342 fmov s2, w26 + 5cc: 9e670004 fmov d4, x0 + 5d0: 1e2b2020 fcmp s1, s11 + 5d4: 1e7520c0 fcmp d6, d21 + 5d8: 1e202208 fcmp s16, #0.0 + 5dc: 1e6022c8 fcmp d22, #0.0 + 5e0: 290c0045 stp w5, w0, [x2, #96] + 5e4: 2978766e ldp w14, w29, [x19, #-64] + 5e8: 696c0c6f ldpsw x15, x3, [x3, #-160] + 5ec: a9323767 stp x7, x13, [x27, #-224] + 5f0: a9483831 ldp x17, x14, [x1, #128] + 5f4: 29905895 stp w21, w22, [x4, #128]! + 5f8: 29f43451 ldp w17, w13, [x2, #-96]! + 5fc: 69ee66f5 ldpsw x21, x25, [x23, #-144]! + 600: a9bf41e4 stp x4, x16, [x15, #-16]! + 604: a9f6573d ldp x29, x21, [x25, #-160]! 
+ 608: 288a4758 stp w24, w17, [x26], #80 + 60c: 28e27bc3 ldp w3, w30, [x30], #-240 + 610: 68fc4fc3 ldpsw x3, x19, [x30], #-32 + 614: a8b70779 stp x25, x1, [x27], #-144 + 618: a8fc539a ldp x26, x20, [x28], #-64 + 61c: 283a653d stnp w29, w25, [x9, #-48] + 620: 28703a79 ldnp w25, w14, [x19, #-128] + 624: a8025879 stnp x25, x22, [x3, #32] + 628: a8734ba9 ldnp x9, x18, [x29, #-208] + 62c: 0c407275 ld1 {v21.8b}, [x19] + 630: 4cdfa29b ld1 {v27.16b, v28.16b}, [x20], #32 + 634: 0cc66ec5 ld1 {v5.1d-v7.1d}, [x22], x6 + 638: 4cdf2596 ld1 {v22.8h-v25.8h}, [x12], #64 + 63c: 0d40c131 ld1r {v17.8b}, [x9] + 640: 4ddfcaa5 ld1r {v5.4s}, [x21], #4 + 644: 0dd2cf8a ld1r {v10.1d}, [x28], x18 + 648: 4c408dfa ld2 {v26.2d, v27.2d}, [x15] + 64c: 0cdf8750 ld2 {v16.4h, v17.4h}, [x26], #16 + 650: 4d60c04e ld2r {v14.16b, v15.16b}, [x2] + 654: 0dffcb92 ld2r {v18.2s, v19.2s}, [x28], #8 + 658: 4df6cc13 ld2r {v19.2d, v20.2d}, [x0], x22 + 65c: 4cd24850 ld3 {v16.4s-v18.4s}, [x2], x18 + 660: 0c404818 ld3 {v24.2s-v26.2s}, [x0] + 664: 4d40e604 ld3r {v4.8h-v6.8h}, [x16] + 668: 4ddfe825 ld3r {v5.4s-v7.4s}, [x1], #12 + 66c: 0dd0ed47 ld3r {v7.1d-v9.1d}, [x10], x16 + 670: 4cdf0696 ld4 {v22.8h-v25.8h}, [x20], #64 + 674: 0cd9008f ld4 {v15.8b-v18.8b}, [x4], x25 + 678: 0d60e0a0 ld4r {v0.8b-v3.8b}, [x5] + 67c: 0dffe420 ld4r {v0.4h-v3.4h}, [x1], #8 + 680: 0deeeb9e ld4r {v30.2s, v31.2s, v0.2s, v1.2s}, [x28], x14 + 684: ba5fd3e3 ccmn xzr, xzr, #0x3, le + 688: 3a5f03e5 ccmn wzr, wzr, #0x5, eq // eq = none + 68c: fa411be4 ccmp xzr, #0x1, #0x4, ne // ne = any + 690: 7a42cbe2 ccmp wzr, #0x2, #0x2, gt + 694: 93df03ff ror xzr, xzr, #0 + 698: c820ffff stlxp w0, xzr, xzr, [sp] + 69c: 8822fc7f stlxp w2, wzr, wzr, [x3] + 6a0: c8247cbf stxp w4, xzr, xzr, [x5] + 6a4: 88267fff stxp w6, wzr, wzr, [sp] + 6a8: 4e010fe0 dup v0.16b, wzr + 6ac: 4e081fe1 mov v1.d[0], xzr + 6b0: 4e0c1fe1 mov v1.s[1], wzr + 6b4: 4e0a1fe1 mov v1.h[2], wzr + 6b8: 4e071fe1 mov v1.b[3], wzr + 6bc: 4cc0ac3f ld1 {v31.2d, v0.2d}, [x1], x0 + 6c0: 1e601000 fmov d0, #2.000000000000000000e+00 + 6c4: 1e603000 fmov d0, #2.125000000000000000e+00 + 6c8: 1e621000 fmov d0, #4.000000000000000000e+00 + 6cc: 1e623000 fmov d0, #4.250000000000000000e+00 + 6d0: 1e641000 fmov d0, #8.000000000000000000e+00 + 6d4: 1e643000 fmov d0, #8.500000000000000000e+00 + 6d8: 1e661000 fmov d0, #1.600000000000000000e+01 + 6dc: 1e663000 fmov d0, #1.700000000000000000e+01 + 6e0: 1e681000 fmov d0, #1.250000000000000000e-01 + 6e4: 1e683000 fmov d0, #1.328125000000000000e-01 + 6e8: 1e6a1000 fmov d0, #2.500000000000000000e-01 + 6ec: 1e6a3000 fmov d0, #2.656250000000000000e-01 + 6f0: 1e6c1000 fmov d0, #5.000000000000000000e-01 + 6f4: 1e6c3000 fmov d0, #5.312500000000000000e-01 + 6f8: 1e6e1000 fmov d0, #1.000000000000000000e+00 + 6fc: 1e6e3000 fmov d0, #1.062500000000000000e+00 + 700: 1e701000 fmov d0, #-2.000000000000000000e+00 + 704: 1e703000 fmov d0, #-2.125000000000000000e+00 + 708: 1e721000 fmov d0, #-4.000000000000000000e+00 + 70c: 1e723000 fmov d0, #-4.250000000000000000e+00 + 710: 1e741000 fmov d0, #-8.000000000000000000e+00 + 714: 1e743000 fmov d0, #-8.500000000000000000e+00 + 718: 1e761000 fmov d0, #-1.600000000000000000e+01 + 71c: 1e763000 fmov d0, #-1.700000000000000000e+01 + 720: 1e781000 fmov d0, #-1.250000000000000000e-01 + 724: 1e783000 fmov d0, #-1.328125000000000000e-01 + 728: 1e7a1000 fmov d0, #-2.500000000000000000e-01 + 72c: 1e7a3000 fmov d0, #-2.656250000000000000e-01 + 730: 1e7c1000 fmov d0, #-5.000000000000000000e-01 + 734: 1e7c3000 fmov d0, #-5.312500000000000000e-01 + 738: 1e7e1000 fmov d0, 
#-1.000000000000000000e+00 + 73c: 1e7e3000 fmov d0, #-1.062500000000000000e+00 + 740: f83a8229 swp x26, x9, [x17] + 744: f83c0057 ldadd x28, x23, [x2] + 748: f8361062 ldclr x22, x2, [x3] + 74c: f82b23d9 ldeor x11, x25, [x30] + 750: f836309c ldset x22, x28, [x4] + 754: f826530b ldsmin x6, x11, [x24] + 758: f82c43ff stsmax x12, [sp] + 75c: f837713e ldumin x23, x30, [x9] + 760: f8266281 ldumax x6, x1, [x20] + 764: f8b182c2 swpa x17, x2, [x22] + 768: f8ae015b ldadda x14, x27, [x10] + 76c: f8a6127e ldclra x6, x30, [x19] + 770: f8a02179 ldeora x0, x25, [x11] + 774: f8b733c0 ldseta x23, x0, [x30] + 778: f8b55143 ldsmina x21, x3, [x10] + 77c: f8af4016 ldsmaxa x15, x22, [x0] + 780: f8b17280 ldumina x17, x0, [x20] + 784: f8b0602d ldumaxa x16, x13, [x1] + 788: f8fb82ef swpal x27, x15, [x23] + 78c: f8f3003e ldaddal x19, x30, [x1] + 790: f8ef12fc ldclral x15, x28, [x23] + 794: f8e7226f ldeoral x7, x15, [x19] + 798: f8eb314c ldsetal x11, x12, [x10] + 79c: f8e65187 ldsminal x6, x7, [x12] + 7a0: f8fc41a5 ldsmaxal x28, x5, [x13] + 7a4: f8e97234 lduminal x9, x20, [x17] + 7a8: f8f56179 ldumaxal x21, x25, [x11] + 7ac: f8738318 swpl x19, x24, [x24] + 7b0: f86803da ldaddl x8, x26, [x30] + 7b4: f8711112 ldclrl x17, x18, [x8] + 7b8: f8622063 ldeorl x2, x3, [x3] + 7bc: f87a3207 ldsetl x26, x7, [x16] + 7c0: f87b50a6 ldsminl x27, x6, [x5] + 7c4: f8764280 ldsmaxl x22, x0, [x20] + 7c8: f86b705a lduminl x11, x26, [x2] + 7cc: f87e609d ldumaxl x30, x29, [x4] + 7d0: b82480e5 swp w4, w5, [x7] + 7d4: b82a005a ldadd w10, w26, [x2] + 7d8: b83b1370 ldclr w27, w16, [x27] + 7dc: b83f2157 ldeor wzr, w23, [x10] + 7e0: b82431a2 ldset w4, w2, [x13] + 7e4: b823506f ldsmin w3, w15, [x3] + 7e8: b82340ca ldsmax w3, w10, [x6] + 7ec: b828714b ldumin w8, w11, [x10] + 7f0: b83d61be ldumax w29, w30, [x13] + 7f4: b8ab8291 swpa w11, w17, [x20] + 7f8: b8ba00d0 ldadda w26, w16, [x6] + 7fc: b8b5102a ldclra w21, w10, [x1] + 800: b8bd22ec ldeora w29, w12, [x23] + 804: b8bd3108 ldseta w29, w8, [x8] + 808: b8ab51ca ldsmina w11, w10, [x14] + 80c: b8a442cd ldsmaxa w4, w13, [x22] + 810: b8a770ed ldumina w7, w13, [x7] + 814: b8ae63e0 ldumaxa w14, w0, [sp] + 818: b8f18382 swpal w17, w2, [x28] + 81c: b8f3014b ldaddal w19, w11, [x10] + 820: b8ec1293 ldclral w12, w19, [x20] + 824: b8e02108 ldeoral w0, w8, [x8] + 828: b8f13303 ldsetal w17, w3, [x24] + 82c: b8f950e5 ldsminal w25, w5, [x7] + 830: b8f0413e ldsmaxal w16, w30, [x9] + 834: b8ea71df lduminal w10, wzr, [x14] + 838: b8f16173 ldumaxal w17, w19, [x11] + 83c: b87481a1 swpl w20, w1, [x13] + 840: b87a028b ldaddl w26, w11, [x20] + 844: b87213d8 ldclrl w18, w24, [x30] + 848: b86c2299 ldeorl w12, w25, [x20] + 84c: b86e30bd ldsetl w14, w29, [x5] + 850: b862537a ldsminl w2, w26, [x27] + 854: b879417b ldsmaxl w25, w27, [x11] + 858: b86470fd lduminl w4, w29, [x7] + 85c: b870615d ldumaxl w16, w29, [x10] */ static const unsigned int insns[] = { - 0x8b50798f, 0xcb4381e1, 0xab05372d, 0xeb864796, - 0x0b961920, 0x4b195473, 0x2b0b5264, 0x6b9300f8, - 0x8a0bc0fe, 0xaa0f3118, 0xca170531, 0xea44dd6e, - 0x0a4c44f3, 0x2a8b7373, 0x4a567c7e, 0x6a9c0353, - 0x8a3accdd, 0xaa318f7a, 0xca2e1495, 0xeaa015e2, - 0x0a2274e2, 0x2a751598, 0x4a3309fe, 0x6ab172fe, - 0x110a5284, 0x310b1942, 0x5103d353, 0x710125bc, - 0x910d7bc2, 0xb108fa1b, 0xd1093536, 0xf10ae824, - 0x120e667c, 0x321f6cbb, 0x520f6a9e, 0x72136f56, - 0x927e4ce5, 0xb278b4ed, 0xd24c6527, 0xf2485803, + 0x8b4db437, 0xcb8ce3c8, 0xab0edafb, 0xeb5499f5, + 0x0b040e39, 0x4b89503d, 0x2b89274a, 0x6b870fd5, + 0x8a4b1109, 0xaa810643, 0xca026e8a, 0xea8b7d2c, + 0x0a9e6934, 0x2a9a4555, 
0x4a871d00, 0x6a084973, + 0x8a23d497, 0xaa3360c9, 0xca7ad8cc, 0xea2c3a76, + 0x0a362dbd, 0x2ab417d1, 0x4a2b23a1, 0x6a667684, + 0x1107e0de, 0x310ebd13, 0x5105b55d, 0x71047104, + 0x910ef9c3, 0xb1029e96, 0xd10b55fb, 0xf10ecf98, + 0x12099f39, 0x321b3f4d, 0x520309b5, 0x72134062, + 0x92004548, 0xb24d861b, 0xd219587b, 0xf25eaee4, 0x14000000, 0x17ffffd7, 0x140001ee, 0x94000000, - 0x97ffffd4, 0x940001eb, 0x34000010, 0x34fffa30, - 0x34003d10, 0x35000013, 0x35fff9d3, 0x35003cb3, - 0xb4000005, 0xb4fff965, 0xb4003c45, 0xb5000004, - 0xb5fff904, 0xb5003be4, 0x1000001b, 0x10fff8bb, - 0x10003b9b, 0x90000010, 0x3640001c, 0x3647f83c, - 0x36403b1c, 0x37080001, 0x370ff7c1, 0x37083aa1, - 0x12a437f4, 0x528c9d67, 0x72838bb1, 0x92c1062e, - 0xd287da49, 0xf2a6d153, 0x93465ac9, 0x330b0013, - 0x530b4e6a, 0x934545e4, 0xb35370a3, 0xd3510b8c, - 0x13960c0f, 0x93ceddc6, 0x54000000, 0x54fff5a0, + 0x97ffffd4, 0x940001eb, 0x34000003, 0x34fffa23, + 0x34003d03, 0x35000002, 0x35fff9c2, 0x35003ca2, + 0xb4000019, 0xb4fff979, 0xb4003c59, 0xb5000012, + 0xb5fff912, 0xb5003bf2, 0x10000008, 0x10fff8a8, + 0x10003b88, 0x9000000f, 0x36700012, 0x3677f832, + 0x36703b12, 0x37780019, 0x377ff7d9, 0x37783ab9, + 0x12a203d2, 0x5286b21e, 0x72a66d35, 0x92eded92, + 0xd2eefecd, 0xf2ef69a3, 0x93400c2a, 0x330562cc, + 0x530b2071, 0x934b3860, 0xb3473cdc, 0xd3416549, + 0x13995f75, 0x93d6462e, 0x54000000, 0x54fff5a0, 0x54003880, 0x54000001, 0x54fff541, 0x54003821, 0x54000002, 0x54fff4e2, 0x540037c2, 0x54000002, 0x54fff482, 0x54003762, 0x54000003, 0x54fff423, @@ -1336,77 +1336,77 @@ Disassembly of section .text: 0x5400000c, 0x54fff06c, 0x5400334c, 0x5400000d, 0x54fff00d, 0x540032ed, 0x5400000e, 0x54ffefae, 0x5400328e, 0x5400000f, 0x54ffef4f, 0x5400322f, - 0xd40ac601, 0xd40042a2, 0xd404dac3, 0xd4224d40, - 0xd44219c0, 0xd503201f, 0xd69f03e0, 0xd6bf03e0, - 0xd5033fdf, 0xd503339f, 0xd50335bf, 0xd61f0280, - 0xd63f0040, 0xc8127c17, 0xc81efec5, 0xc85f7d05, - 0xc85ffe14, 0xc89ffd66, 0xc8dfff66, 0x880a7cb1, - 0x8816fd89, 0x885f7d1b, 0x885ffc57, 0x889fffba, - 0x88dffd4d, 0x48197f7c, 0x481dfd96, 0x485f7f96, - 0x485fffc3, 0x489ffdf8, 0x48dfff5b, 0x080b7e6a, - 0x0817fedb, 0x085f7e18, 0x085ffc38, 0x089fffa5, - 0x08dffe18, 0xc87f6239, 0xc87fb276, 0xc820573a, - 0xc821aca6, 0x887f388d, 0x887f88d1, 0x882f2643, - 0x88329131, 0xf81cf2b7, 0xb803f055, 0x39002f9b, - 0x781f31fd, 0xf85d33ce, 0xb843539d, 0x39401f54, - 0x785ce059, 0x389f1143, 0x788131ee, 0x78dfb17d, - 0xb89b90af, 0xfc403193, 0xbc42a36c, 0xfc07d396, - 0xbc1ec1f8, 0xf81e8f88, 0xb8025de6, 0x38007c27, - 0x7801ee20, 0xf8454fb9, 0xb85cce9a, 0x385e7fba, - 0x7841af24, 0x389ebd1c, 0x789fadd1, 0x78c0aefc, - 0xb89c0f7e, 0xfc50efd4, 0xbc414f71, 0xfc011c67, - 0xbc1f0d6d, 0xf81c3526, 0xb81e34b0, 0x3800f7bd, - 0x78012684, 0xf842e653, 0xb8417456, 0x385e2467, - 0x785e358b, 0x389e34c8, 0x788046f8, 0x78c00611, - 0xb89f8680, 0xfc582454, 0xbc5987d3, 0xfc076624, - 0xbc190675, 0xf833785a, 0xb82fd809, 0x3821799a, - 0x782a7975, 0xf870eaf0, 0xb871d96a, 0x386b7aed, - 0x7875689b, 0x38afd91a, 0x78a2c955, 0x78ee6bc8, - 0xb8b4f9dd, 0xfc76eb7e, 0xbc76692d, 0xfc31db28, - 0xbc255b01, 0xf91c52aa, 0xb91c3fb2, 0x391f8877, - 0x791ac97c, 0xf95c1758, 0xb95b3c55, 0x395ce0a4, - 0x795851ce, 0x399e9f64, 0x79993764, 0x79d9af8a, - 0xb99eea2a, 0xfd5a2f8d, 0xbd5dac78, 0xfd1e0182, - 0xbd195c31, 0x58000010, 0x1800000d, 0xf8981240, - 0xd8ffdf00, 0xf8a27a80, 0xf99af920, 0x1a0202e8, - 0x3a130078, 0x5a1d0316, 0x7a03036c, 0x9a0102eb, - 0xba1700bd, 0xda0c0329, 0xfa16000c, 0x0b23459a, - 0x2b328a14, 0xcb274bde, 0x6b222eab, 0x8b214b42, - 0xab34a7b2, 0xcb24520e, 0xeb378e20, 
0x3a565283, - 0x7a420321, 0xba58c247, 0xfa4d5106, 0x3a426924, - 0x7a5b0847, 0xba413a02, 0xfa5fba23, 0x1a979377, - 0x1a86640a, 0x5a89300b, 0x5a923771, 0x9a8b720c, - 0x9a868786, 0xda9a736d, 0xda9256dd, 0x5ac0026c, - 0x5ac00657, 0x5ac00b89, 0x5ac01262, 0x5ac017b9, - 0xdac002e4, 0xdac0065d, 0xdac00907, 0xdac00e2d, - 0xdac01011, 0xdac01752, 0x1ad0098b, 0x1ac70d24, - 0x1ad020ec, 0x1ad72613, 0x1ac62887, 0x1ad72e95, - 0x9adc0990, 0x9acd0d84, 0x9ac721a9, 0x9acf277c, - 0x9ace2bd4, 0x9ade2e4e, 0x9bc77d63, 0x9b587e97, - 0x1b1524a2, 0x1b04a318, 0x9b0f4d8b, 0x9b0ce73d, - 0x9b2c5971, 0x9b34c87c, 0x9bbc6887, 0x9bb19556, - 0x1e310871, 0x1e261a2b, 0x1e2928fd, 0x1e333987, - 0x1e230ae0, 0x1e75087a, 0x1e651a60, 0x1e692b40, - 0x1e753ab9, 0x1e7309b0, 0x1f00425d, 0x1f1d95b7, - 0x1f2a38e9, 0x1f2f5f99, 0x1f5545a6, 0x1f429ea3, - 0x1f65472a, 0x1f7449ce, 0x1e20404f, 0x1e20c0f2, - 0x1e2140c3, 0x1e21c02c, 0x1e22c009, 0x1e6040a4, - 0x1e60c1e3, 0x1e614331, 0x1e61c30c, 0x1e6240b5, - 0x1e3802a4, 0x9e38007b, 0x1e78011d, 0x9e7802a9, - 0x1e2203b4, 0x9e220107, 0x1e6202ac, 0x9e6202b0, - 0x1e2600b2, 0x9e660119, 0x1e270352, 0x9e670160, - 0x1e262200, 0x1e7d2200, 0x1e2023c8, 0x1e602128, - 0x293e119b, 0x294a2543, 0x69480c70, 0xa934726a, - 0xa97448f3, 0x298243ca, 0x29e21242, 0x69c64db8, - 0xa9800311, 0xa9f4686e, 0x288a0416, 0x28fe2812, - 0x68fe62d8, 0xa885308c, 0xa8f12664, 0x282468d2, - 0x284e5035, 0xa8327699, 0xa84716e1, 0x0c407284, - 0x4cdfa158, 0x0ccf6cd8, 0x4cdf2483, 0x0d40c0c2, - 0x4ddfc9cd, 0x0dd8ceaf, 0x4c408ea9, 0x0cdf86bd, - 0x4d60c1c8, 0x0dffca87, 0x4de3cc7c, 0x4cdd497b, - 0x0c404950, 0x4d40e595, 0x4ddfeba4, 0x0dd3ed38, - 0x4cdf046a, 0x0cc9039b, 0x0d60e3d5, 0x0dffe5d7, - 0x0df4e9a4, 0xba5fd3e3, 0x3a5f03e5, 0xfa411be4, + 0xd40f9ca1, 0xd4008b22, 0xd40be1c3, 0xd423d0e0, + 0xd44dee20, 0xd503201f, 0xd69f03e0, 0xd6bf03e0, + 0xd5033fdf, 0xd503359f, 0xd50337bf, 0xd61f0380, + 0xd63f0220, 0xc8127f47, 0xc819fccc, 0xc85f7e00, + 0xc85ffc66, 0xc89ffc2e, 0xc8dfff1d, 0x881c7eef, + 0x8809fc67, 0x885f7e81, 0x885ffdf4, 0x889ffd35, + 0x88dffe25, 0x480d7fd4, 0x480afe4c, 0x485f7e64, + 0x485ffd56, 0x489ffdfe, 0x48dfff04, 0x080a7d94, + 0x0814fd7d, 0x085f7cb5, 0x085ffd24, 0x089fff9e, + 0x08dfff13, 0xc87f424b, 0xc87f9de8, 0xc83c4154, + 0xc827d469, 0x887f1a79, 0x887fa45e, 0x88305180, + 0x88259f82, 0xf81b5270, 0xb801e381, 0x381e61bc, + 0x781cd0c8, 0xf851d380, 0xb85e615c, 0x39403164, + 0x78405221, 0x3980312b, 0x789ef108, 0x78ddd1b4, + 0xb8831137, 0xfc41d089, 0xbd402a6b, 0xfc1d5299, + 0xbc1b0039, 0xf8019c14, 0xb81cfd8c, 0x381f6e7c, + 0x781c1f8d, 0xf85d2eeb, 0xb8411f1b, 0x385f4f4e, + 0x785d3ed8, 0x389f5d39, 0x7881dcc5, 0x78dffee7, + 0xb89c3dba, 0xfc50bf18, 0xbc5c9f34, 0xfc135c49, + 0xbc1c5c2e, 0xf806d433, 0xb81ca4a4, 0x3800947d, + 0x781ce420, 0xf85d04c2, 0xb858d4cf, 0x385e5444, + 0x785eb751, 0x389f3715, 0x789d04d6, 0x78dd04cb, + 0xb89fb7ce, 0xfc5975e2, 0xbc5a5679, 0xfc1416ed, + 0xbc0006b6, 0xf832c996, 0xb82c4b7e, 0x38367887, + 0x783dfaf3, 0xf87bf891, 0xb871c9a1, 0x387dfb70, + 0x78645939, 0x38b67984, 0x78a55839, 0x78fc6a09, + 0xb8aee8e8, 0xfc705b84, 0xbc7bd850, 0xfc396817, + 0xbc277a06, 0xf91ddd82, 0xb91b10a8, 0x391f8221, + 0x79197728, 0xf95ca07c, 0xb95b5d75, 0x395dc8af, + 0x795caa60, 0x399dd53d, 0x799c7397, 0x79dcb15b, + 0xb99e3b75, 0xfd5c7f7a, 0xbd5d2882, 0xfd1fb2a1, + 0xbd1d82c4, 0x58000001, 0x1800001b, 0xf882d080, + 0xd8000000, 0xf8a0cbc0, 0xf99fab00, 0x1a1803a0, + 0x3a120396, 0x5a1e0217, 0x7a0e03a7, 0x9a0e0196, + 0xba17031d, 0xda160391, 0xfa130298, 0x0b26cadb, + 0x2b38516d, 0xcb242d10, 0x6b34ea55, 0x8b3d0a2e, + 0xab2eb231, 0xcb3ac476, 0xeb3531ad, 
0x3a5a722f, + 0x7a463325, 0xba5e9021, 0xfa47a222, 0x3a590a26, + 0x7a450845, 0xba514a6a, 0xfa48c9c3, 0x1a8e9109, + 0x1a85d57b, 0x5a9632eb, 0x5a9b2793, 0x9a815130, + 0x9a8c05dc, 0xda8e5096, 0xda9b257a, 0x5ac00178, + 0x5ac005ca, 0x5ac008a9, 0x5ac01292, 0x5ac01519, + 0xdac00316, 0xdac0077c, 0xdac00ba8, 0xdac00d51, + 0xdac01177, 0xdac015da, 0x1adc0895, 0x1ad60d5e, + 0x1ada205d, 0x1aca26dc, 0x1acc2b0b, 0x1ad02fd5, + 0x9acd0801, 0x9ac60e22, 0x9ad5230a, 0x9ac62525, + 0x9ac42b60, 0x9ac22c9c, 0x9bc77fc1, 0x9b4a7cbe, + 0x1b0d45e7, 0x1b0cf039, 0x9b1e2562, 0x9b03dae5, + 0x9b291159, 0x9b27c905, 0x9bba64b8, 0x9bbaf02e, + 0x1e280ad8, 0x1e261870, 0x1e392ab0, 0x1e3b3b40, + 0x1e310878, 0x1e660909, 0x1e7e1a76, 0x1e632a2e, + 0x1e743b78, 0x1e76082c, 0x1f0b7510, 0x1f128676, + 0x1f38270f, 0x1f2d5e7b, 0x1f503003, 0x1f52a873, + 0x1f6b5041, 0x1f79392c, 0x1e2042e0, 0x1e20c0d7, + 0x1e214084, 0x1e21c385, 0x1e22c1f5, 0x1e6040ab, + 0x1e60c092, 0x1e61418b, 0x1e61c10f, 0x1e624048, + 0x1e380253, 0x9e380011, 0x1e7801a0, 0x9e780136, + 0x1e2203a6, 0x9e2201cc, 0x1e6202d0, 0x9e6200ae, + 0x1e260007, 0x9e6600dc, 0x1e270342, 0x9e670004, + 0x1e2b2020, 0x1e7520c0, 0x1e202208, 0x1e6022c8, + 0x290c0045, 0x2978766e, 0x696c0c6f, 0xa9323767, + 0xa9483831, 0x29905895, 0x29f43451, 0x69ee66f5, + 0xa9bf41e4, 0xa9f6573d, 0x288a4758, 0x28e27bc3, + 0x68fc4fc3, 0xa8b70779, 0xa8fc539a, 0x283a653d, + 0x28703a79, 0xa8025879, 0xa8734ba9, 0x0c407275, + 0x4cdfa29b, 0x0cc66ec5, 0x4cdf2596, 0x0d40c131, + 0x4ddfcaa5, 0x0dd2cf8a, 0x4c408dfa, 0x0cdf8750, + 0x4d60c04e, 0x0dffcb92, 0x4df6cc13, 0x4cd24850, + 0x0c404818, 0x4d40e604, 0x4ddfe825, 0x0dd0ed47, + 0x4cdf0696, 0x0cd9008f, 0x0d60e0a0, 0x0dffe420, + 0x0deeeb9e, 0xba5fd3e3, 0x3a5f03e5, 0xfa411be4, 0x7a42cbe2, 0x93df03ff, 0xc820ffff, 0x8822fc7f, 0xc8247cbf, 0x88267fff, 0x4e010fe0, 0x4e081fe1, 0x4e0c1fe1, 0x4e0a1fe1, 0x4e071fe1, 0x4cc0ac3f, @@ -1418,24 +1418,24 @@ Disassembly of section .text: 0x1e741000, 0x1e743000, 0x1e761000, 0x1e763000, 0x1e781000, 0x1e783000, 0x1e7a1000, 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, 0x1e7e1000, 0x1e7e3000, - 0xf8358305, 0xf82d01ed, 0xf8361353, 0xf839234a, - 0xf82531fb, 0xf8335165, 0xf83a4080, 0xf83673d7, - 0xf832611c, 0xf8ad837d, 0xf8ab01a5, 0xf8a112b8, - 0xf8bb2311, 0xf8b230be, 0xf8a75336, 0xf8a4427a, - 0xf8a6707e, 0xf8b860b7, 0xf8f88392, 0xf8f300ff, - 0xf8ed1386, 0xf8e822af, 0xf8e2302d, 0xf8f1533d, - 0xf8f941d2, 0xf8ff7366, 0xf8f061e5, 0xf86b8072, - 0xf87a0054, 0xf86b1164, 0xf87e22f3, 0xf86331cf, - 0xf87e5296, 0xf8674305, 0xf87771f0, 0xf86b6013, - 0xb83c803c, 0xb82b0195, 0xb83d1240, 0xb8252320, - 0xb82e3340, 0xb83c53b2, 0xb82f43a1, 0xb828739a, - 0xb831608e, 0xb8b88039, 0xb8aa0231, 0xb8bd12b4, - 0xb8bd2189, 0xb8ab30a6, 0xb8b552a7, 0xb8aa4197, - 0xb8b57145, 0xb8be6254, 0xb8ed80b7, 0xb8ef00b8, - 0xb8e9132a, 0xb8f42231, 0xb8ec33d2, 0xb8e35323, - 0xb8fa4159, 0xb8e273eb, 0xb8e760a2, 0xb8608287, - 0xb865005f, 0xb87b1379, 0xb87e2358, 0xb86f32c2, - 0xb86053e3, 0xb86f4154, 0xb87671d5, 0xb866605e, + 0xf83a8229, 0xf83c0057, 0xf8361062, 0xf82b23d9, + 0xf836309c, 0xf826530b, 0xf82c43ff, 0xf837713e, + 0xf8266281, 0xf8b182c2, 0xf8ae015b, 0xf8a6127e, + 0xf8a02179, 0xf8b733c0, 0xf8b55143, 0xf8af4016, + 0xf8b17280, 0xf8b0602d, 0xf8fb82ef, 0xf8f3003e, + 0xf8ef12fc, 0xf8e7226f, 0xf8eb314c, 0xf8e65187, + 0xf8fc41a5, 0xf8e97234, 0xf8f56179, 0xf8738318, + 0xf86803da, 0xf8711112, 0xf8622063, 0xf87a3207, + 0xf87b50a6, 0xf8764280, 0xf86b705a, 0xf87e609d, + 0xb82480e5, 0xb82a005a, 0xb83b1370, 0xb83f2157, + 0xb82431a2, 0xb823506f, 0xb82340ca, 0xb828714b, + 0xb83d61be, 0xb8ab8291, 0xb8ba00d0, 0xb8b5102a, 
+ 0xb8bd22ec, 0xb8bd3108, 0xb8ab51ca, 0xb8a442cd, + 0xb8a770ed, 0xb8ae63e0, 0xb8f18382, 0xb8f3014b, + 0xb8ec1293, 0xb8e02108, 0xb8f13303, 0xb8f950e5, + 0xb8f0413e, 0xb8ea71df, 0xb8f16173, 0xb87481a1, + 0xb87a028b, 0xb87213d8, 0xb86c2299, 0xb86e30bd, + 0xb862537a, 0xb879417b, 0xb86470fd, 0xb870615d, }; // END Generated code -- do not edit diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 0824ca393..dc2d5e2c9 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -139,6 +139,9 @@ REGISTER_DECLARATION(Register, rdispatch, r21); // Java stack pointer REGISTER_DECLARATION(Register, esp, r20); +// Preserved predicate register with all elements set TRUE. +REGISTER_DECLARATION(PRegister, ptrue, p7); + #define assert_cond(ARG1) assert(ARG1, #ARG1) namespace asm_util { @@ -273,6 +276,14 @@ public: f(r->encoding_nocheck(), lsb + 4, lsb); } + void prf(PRegister r, int lsb) { + f(r->encoding_nocheck(), lsb + 3, lsb); + } + + void pgrf(PRegister r, int lsb) { + f(r->encoding_nocheck(), lsb + 2, lsb); + } + unsigned get(int msb = 31, int lsb = 0) { int nbits = msb - lsb + 1; unsigned mask = checked_cast(right_n_bits(nbits)) << lsb; @@ -554,6 +565,18 @@ class Address { void lea(MacroAssembler *, Register) const; static bool offset_ok_for_immed(int64_t offset, uint shift = 0); + + static bool offset_ok_for_sve_immed(long offset, int shift, int vl /* sve vector length */) { + if (offset % vl == 0) { + // Convert address offset into sve imm offset (MUL VL). + int sve_offset = offset / vl; + if (((-(1 << (shift - 1))) <= sve_offset) && (sve_offset < (1 << (shift - 1)))) { + // sve_offset can be encoded + return true; + } + } + return false; + } }; // Convience classes @@ -669,6 +692,12 @@ public: void rf(FloatRegister reg, int lsb) { current->rf(reg, lsb); } + void prf(PRegister reg, int lsb) { + current->prf(reg, lsb); + } + void pgrf(PRegister reg, int lsb) { + current->pgrf(reg, lsb); + } void fixed(unsigned value, unsigned mask) { current->fixed(value, mask); } @@ -2431,13 +2460,18 @@ public: f(sidx<<(int)T, 14, 11), f(1, 10), rf(Vn, 5), rf(Vd, 0); } - void umov(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) { - starti; - f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21); - f(((idx<<1)|1)<<(int)T, 20, 16), f(0b001111, 15, 10); - rf(Vn, 5), rf(Rd, 0); +#define INSN(NAME, op) \ + void NAME(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) { \ + starti; \ + f(0, 31), f(T==D ? 
1:0, 30), f(0b001110000, 29, 21); \ + f(((idx<<1)|1)<<(int)T, 20, 16), f(op, 15, 10); \ + rf(Vn, 5), rf(Rd, 0); \ } + INSN(umov, 0b001111); + INSN(smov, 0b001011); +#undef INSN + #define INSN(NAME, opc, opc2, isSHR) \ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, int shift){ \ starti; \ @@ -2670,13 +2704,299 @@ public: #undef INSN void ext(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index) - { +{ + starti; + assert(T == T8B || T == T16B, "invalid arrangement"); + assert((T == T8B && index <= 0b0111) || (T == T16B && index <= 0b1111), "Invalid index value"); + f(0, 31), f((int)T & 1, 30), f(0b101110000, 29, 21); + rf(Vm, 16), f(0, 15), f(index, 14, 11); + f(0, 10), rf(Vn, 5), rf(Vd, 0); +} + +// SVE arithmetics - unpredicated +#define INSN(NAME, opcode) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T != Q, "invalid register variant"); \ + f(0b00000100, 31, 24), f(T, 23, 22), f(1, 21), \ + rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + INSN(sve_add, 0b000); + INSN(sve_sub, 0b001); +#undef INSN + +// SVE floating-point arithmetic - unpredicated +#define INSN(NAME, opcode) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T == S || T == D, "invalid register variant"); \ + f(0b01100101, 31, 24), f(T, 23, 22), f(0, 21), \ + rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + + INSN(sve_fadd, 0b000); + INSN(sve_fmul, 0b010); + INSN(sve_fsub, 0b001); +#undef INSN + +private: + void sve_predicate_reg_insn(unsigned op24, unsigned op13, + FloatRegister Zd_or_Vd, SIMD_RegVariant T, + PRegister Pg, FloatRegister Zn_or_Vn) { + starti; + f(op24, 31, 24), f(T, 23, 22), f(op13, 21, 13); + pgrf(Pg, 10), rf(Zn_or_Vn, 5), rf(Zd_or_Vd, 0); + } + +public: + +// SVE integer arithmetics - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zdn_or_Zd_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Znm_or_Vn) { \ + assert(T != Q, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zdn_or_Zd_or_Vd, T, Pg, Znm_or_Vn); \ + } + + INSN(sve_abs, 0b00000100, 0b010110101); // vector abs, unary + INSN(sve_add, 0b00000100, 0b000000000); // vector add + INSN(sve_andv, 0b00000100, 0b011010001); // bitwise and reduction to scalar + INSN(sve_asr, 0b00000100, 0b010000100); // vector arithmetic shift right + INSN(sve_cnt, 0b00000100, 0b011010101) // count non-zero bits + INSN(sve_cpy, 0b00000101, 0b100000100); // copy scalar to each active vector element + INSN(sve_eorv, 0b00000100, 0b011001001); // bitwise xor reduction to scalar + INSN(sve_lsl, 0b00000100, 0b010011100); // vector logical shift left + INSN(sve_lsr, 0b00000100, 0b010001100); // vector logical shift right + INSN(sve_mul, 0b00000100, 0b010000000); // vector mul + INSN(sve_neg, 0b00000100, 0b010111101); // vector neg, unary + INSN(sve_not, 0b00000100, 0b011110101); // bitwise invert vector, unary + INSN(sve_orv, 0b00000100, 0b011000001); // bitwise or reduction to scalar + INSN(sve_smax, 0b00000100, 0b001000000); // signed maximum vectors + INSN(sve_smaxv, 0b00000100, 0b001000001); // signed maximum reduction to scalar + INSN(sve_smin, 0b00000100, 0b001010000); // signed minimum vectors + INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar + INSN(sve_sub, 0b00000100, 0b000001000); // vector sub + INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar 
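+  // Note: each INSN above expands, via sve_predicate_reg_insn, into a predicated
+  // assembler method of the form:
+  //   void sve_add(FloatRegister Zdn, SIMD_RegVariant T, PRegister Pg, FloatRegister Znm);
+  // Illustrative use only (register choices are hypothetical): sve_add(v16, S, p7, v17)
+  // would emit a destructive SVE ADD, i.e. z16.s += z17.s for the lanes active in p7.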
+#undef INSN + +// SVE floating-point arithmetics - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zd_or_Zdn_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn_or_Zm) { \ + assert(T == S || T == D, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zd_or_Zdn_or_Vd, T, Pg, Zn_or_Zm); \ + } + + INSN(sve_fabs, 0b00000100, 0b011100101); + INSN(sve_fadd, 0b01100101, 0b000000100); + INSN(sve_fadda, 0b01100101, 0b011000001); // add strictly-ordered reduction to scalar Vd + INSN(sve_fdiv, 0b01100101, 0b001101100); + INSN(sve_fmax, 0b01100101, 0b000110100); // floating-point maximum + INSN(sve_fmaxv, 0b01100101, 0b000110001); // floating-point maximum recursive reduction to scalar + INSN(sve_fmin, 0b01100101, 0b000111100); // floating-point minimum + INSN(sve_fminv, 0b01100101, 0b000111001); // floating-point minimum recursive reduction to scalar + INSN(sve_fmul, 0b01100101, 0b000010100); + INSN(sve_fneg, 0b00000100, 0b011101101); + INSN(sve_frintm, 0b01100101, 0b000010101); // floating-point round to integral value, toward minus infinity + INSN(sve_frintn, 0b01100101, 0b000000101); // floating-point round to integral value, nearest with ties to even + INSN(sve_frintp, 0b01100101, 0b000001101); // floating-point round to integral value, toward plus infinity + INSN(sve_fsqrt, 0b01100101, 0b001101101); + INSN(sve_fsub, 0b01100101, 0b000001100); +#undef INSN + + // SVE multiple-add/sub - predicated +#define INSN(NAME, op0, op1, op2) \ + void NAME(FloatRegister Zda, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T != Q, "invalid size"); \ + f(op0, 31, 24), f(T, 23, 22), f(op1, 21), rf(Zm, 16); \ + f(op2, 15, 13), pgrf(Pg, 10), rf(Zn, 5), rf(Zda, 0); \ + } + + INSN(sve_fmla, 0b01100101, 1, 0b000); // floating-point fused multiply-add: Zda = Zda + Zn * Zm + INSN(sve_fmls, 0b01100101, 1, 0b001); // floating-point fused multiply-subtract: Zda = Zda + -Zn * Zm + INSN(sve_fnmla, 0b01100101, 1, 0b010); // floating-point negated fused multiply-add: Zda = -Zda + -Zn * Zm + INSN(sve_fnmls, 0b01100101, 1, 0b011); // floating-point negated fused multiply-subtract: Zda = -Zda + Zn * Zm + INSN(sve_mla, 0b00000100, 0, 0b010); // multiply-add: Zda = Zda + Zn*Zm + INSN(sve_mls, 0b00000100, 0, 0b011); // multiply-subtract: Zda = Zda + -Zn*Zm +#undef INSN + +// SVE bitwise logical - unpredicated +#define INSN(NAME, opc) \ + void NAME(FloatRegister Zd, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + f(0b00000100, 31, 24), f(opc, 23, 22), f(1, 21), \ + rf(Zm, 16), f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0); \ + } + INSN(sve_and, 0b00); + INSN(sve_eor, 0b10); + INSN(sve_orr, 0b01); +#undef INSN + +// SVE shift immediate - unpredicated +#define INSN(NAME, opc, isSHR) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, int shift) { \ + starti; \ + /* The encodings for the tszh:tszl:imm3 fields (bits 23:22 20:19 18:16) \ + * for shift right is calculated as: \ + * 0001 xxx B, shift = 16 - UInt(tszh:tszl:imm3) \ + * 001x xxx H, shift = 32 - UInt(tszh:tszl:imm3) \ + * 01xx xxx S, shift = 64 - UInt(tszh:tszl:imm3) \ + * 1xxx xxx D, shift = 128 - UInt(tszh:tszl:imm3) \ + * for shift left is calculated as: \ + * 0001 xxx B, shift = UInt(tszh:tszl:imm3) - 8 \ + * 001x xxx H, shift = UInt(tszh:tszl:imm3) - 16 \ + * 01xx xxx S, shift = UInt(tszh:tszl:imm3) - 32 \ + * 1xxx xxx D, shift = UInt(tszh:tszl:imm3) - 64 \ + */ \ + assert(T != Q, "Invalid register variant"); \ + if (isSHR) { \ + assert(((1 << (T + 3)) >= shift) && 
(shift > 0) , "Invalid shift value"); \ + } else { \ + assert(((1 << (T + 3)) > shift) && (shift >= 0) , "Invalid shift value"); \ + } \ + int cVal = (1 << ((T + 3) + (isSHR ? 1 : 0))); \ + int encodedShift = isSHR ? cVal - shift : cVal + shift; \ + int tszh = encodedShift >> 5; \ + int tszl_imm = encodedShift & 0x1f; \ + f(0b00000100, 31, 24); \ + f(tszh, 23, 22), f(1,21), f(tszl_imm, 20, 16); \ + f(0b100, 15, 13), f(opc, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + + INSN(sve_asr, 0b100, /* isSHR = */ true); + INSN(sve_lsl, 0b111, /* isSHR = */ false); + INSN(sve_lsr, 0b101, /* isSHR = */ true); +#undef INSN + +private: + + // Scalar base + immediate index + void sve_ld_st1(FloatRegister Zt, Register Xn, int imm, PRegister Pg, + SIMD_RegVariant T, int op1, int type, int op2) { + starti; + assert_cond(T >= type); + f(op1, 31, 25), f(type, 24, 23), f(T, 22, 21); + f(0, 20), sf(imm, 19, 16), f(op2, 15, 13); + pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); + } + + // Scalar base + scalar index + void sve_ld_st1(FloatRegister Zt, Register Xn, Register Xm, PRegister Pg, + SIMD_RegVariant T, int op1, int type, int op2) { + starti; + assert_cond(T >= type); + f(op1, 31, 25), f(type, 24, 23), f(T, 22, 21); + rf(Xm, 16), f(op2, 15, 13); + pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); + } + + void sve_ld_st1(FloatRegister Zt, PRegister Pg, + SIMD_RegVariant T, const Address &a, + int op1, int type, int imm_op2, int scalar_op2) { + switch (a.getMode()) { + case Address::base_plus_offset: + sve_ld_st1(Zt, a.base(), a.offset(), Pg, T, op1, type, imm_op2); + break; + case Address::base_plus_offset_reg: + sve_ld_st1(Zt, a.base(), a.index(), Pg, T, op1, type, scalar_op2); + break; + default: + ShouldNotReachHere(); + } + } + +public: + +// SVE load/store - predicated +#define INSN(NAME, op1, type, imm_op2, scalar_op2) \ + void NAME(FloatRegister Zt, SIMD_RegVariant T, PRegister Pg, const Address &a) { \ + assert(T != Q, "invalid register variant"); \ + sve_ld_st1(Zt, Pg, T, a, op1, type, imm_op2, scalar_op2); \ + } + + INSN(sve_ld1b, 0b1010010, 0b00, 0b101, 0b010); + INSN(sve_st1b, 0b1110010, 0b00, 0b111, 0b010); + INSN(sve_ld1h, 0b1010010, 0b01, 0b101, 0b010); + INSN(sve_st1h, 0b1110010, 0b01, 0b111, 0b010); + INSN(sve_ld1w, 0b1010010, 0b10, 0b101, 0b010); + INSN(sve_st1w, 0b1110010, 0b10, 0b111, 0b010); + INSN(sve_ld1d, 0b1010010, 0b11, 0b101, 0b010); + INSN(sve_st1d, 0b1110010, 0b11, 0b111, 0b010); +#undef INSN + +// SVE load/store - unpredicated +#define INSN(NAME, op1) \ + void NAME(FloatRegister Zt, const Address &a) { \ + starti; \ + assert(a.index() == noreg, "invalid address variant"); \ + f(op1, 31, 29), f(0b0010110, 28, 22), sf(a.offset() >> 3, 21, 16), \ + f(0b010, 15, 13), f(a.offset() & 0x7, 12, 10), srf(a.base(), 5), rf(Zt, 0); \ + } + + INSN(sve_ldr, 0b100); // LDR (vector) + INSN(sve_str, 0b111); // STR (vector) +#undef INSN + +#define INSN(NAME, op) \ + void NAME(Register Xd, Register Xn, int imm6) { \ + starti; \ + f(0b000001000, 31, 23), f(op, 22, 21); \ + srf(Xn, 16), f(0b01010, 15, 11), sf(imm6, 10, 5), srf(Xd, 0); \ + } + + INSN(sve_addvl, 0b01); + INSN(sve_addpl, 0b11); +#undef INSN + +// SVE inc/dec register by element count +#define INSN(NAME, op) \ + void NAME(Register Xdn, SIMD_RegVariant T, unsigned imm4 = 1, int pattern = 0b11111) { \ + starti; \ + assert(T != Q, "invalid size"); \ + f(0b00000100,31, 24), f(T, 23, 22), f(0b11, 21, 20); \ + f(imm4 - 1, 19, 16), f(0b11100, 15, 11), f(op, 10), f(pattern, 9, 5), rf(Xdn, 0); \ + } + + INSN(sve_inc, 0); + INSN(sve_dec, 1); +#undef INSN + +// SVE predicate 
count + void sve_cntp(Register Xd, SIMD_RegVariant T, PRegister Pg, PRegister Pn) { + starti; + assert(T != Q, "invalid size"); + f(0b00100101, 31, 24), f(T, 23, 22), f(0b10000010, 21, 14); + prf(Pg, 10), f(0, 9), prf(Pn, 5), rf(Xd, 0); + } + + // SVE dup scalar + void sve_dup(FloatRegister Zd, SIMD_RegVariant T, Register Rn) { + starti; + assert(T != Q, "invalid size"); + f(0b00000101, 31, 24), f(T, 23, 22), f(0b100000001110, 21, 10); + srf(Rn, 5), rf(Zd, 0); + } + + // SVE dup imm + void sve_dup(FloatRegister Zd, SIMD_RegVariant T, int imm8) { + starti; + assert(T != Q, "invalid size"); + int sh = 0; + if (imm8 <= 127 && imm8 >= -128) { + sh = 0; + } else if (T != B && imm8 <= 32512 && imm8 >= -32768 && (imm8 & 0xff) == 0) { + sh = 1; + imm8 = (imm8 >> 8); + } else { + guarantee(false, "invalid immediate"); + } + f(0b00100101, 31, 24), f(T, 23, 22), f(0b11100011, 21, 14); + f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0); + } + + void sve_ptrue(PRegister pd, SIMD_RegVariant esize, int pattern = 0b11111) { starti; - assert(T == T8B || T == T16B, "invalid arrangement"); - assert((T == T8B && index <= 0b0111) || (T == T16B && index <= 0b1111), "Invalid index value"); - f(0, 31), f((int)T & 1, 30), f(0b101110000, 29, 21); - rf(Vm, 16), f(0, 15), f(index, 14, 11); - f(0, 10), rf(Vn, 5), rf(Vd, 0); + f(0b00100101, 31, 24), f(esize, 23, 22), f(0b011000111000, 21, 10); + f(pattern, 9, 5), f(0b0, 4), prf(pd, 0); } Assembler(CodeBuffer* code) : AbstractAssembler(code) { diff --git a/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp index 6ac54f257..a258528ea 100644 --- a/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp @@ -456,8 +456,12 @@ void ZBarrierSetAssembler::generate_c2_load_barrier_stub(MacroAssembler* masm, Z ZSetupArguments setup_arguments(masm, stub); __ mov(rscratch1, stub->slow_path()); __ blr(rscratch1); + if (UseSVE > 0) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } } - // Stub exit __ b(*stub->continuation()); } diff --git a/src/hotspot/cpu/aarch64/globals_aarch64.hpp b/src/hotspot/cpu/aarch64/globals_aarch64.hpp index 071845e5b..f26ea2a8b 100644 --- a/src/hotspot/cpu/aarch64/globals_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/globals_aarch64.hpp @@ -112,6 +112,9 @@ define_pd_global(intx, InlineSmallCode, 1000); "Avoid generating unaligned memory accesses") \ product(bool, UseLSE, false, \ "Use LSE instructions") \ + product(uint, UseSVE, 0, \ + "Highest supported SVE instruction set version") \ + range(0, 2) \ product(bool, UseBlockZeroing, true, \ "Use DC ZVA for block zeroing") \ product(intx, BlockZeroingLowLimit, 256, \ diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index aecab30c1..b6b070e62 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -53,6 +53,7 @@ #include "opto/compile.hpp" #include "opto/intrinsicnode.hpp" #include "opto/node.hpp" +#include "opto/matcher.hpp" #endif #ifdef PRODUCT @@ -2110,8 +2110,17 @@ int MacroAssembler::pop(unsigned int bitset, Register stack) { } // Push lots of registers in the bit set supplied. Don't push sp. 
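// Note on the flag and predicate plumbing above (descriptive, not new patch content):
// UseSVE selects the highest instruction set level the backend may use (0 = NEON only,
// 1 = SVE, 2 = SVE2), e.g. running with -XX:UseSVE=1 on an SVE-capable CPU, and p7 is
// reserved as the always-true governing predicate declared as "ptrue" earlier in this
// change. Because an external runtime call may clobber p7, barrier stubs such as the
// ZGC one above call reinitialize_ptrue(), which simply re-emits sve_ptrue(ptrue, B)
// (see its MacroAssembler definition later in this patch) before returning to SVE code.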
-// Return the number of words pushed +// Return the number of dwords pushed int MacroAssembler::push_fp(unsigned int bitset, Register stack) { + int words_pushed = 0; + bool use_sve = false; + int sve_vector_size_in_bytes = 0; + +#ifdef COMPILER2 + use_sve = Matcher::supports_scalable_vector(); + sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); +#endif + // Scan bitset to accumulate register pairs unsigned char regs[32]; int count = 0; @@ -2126,8 +2135,18 @@ int MacroAssembler::push_fp(unsigned int bitset, Register stack) { return 0; } + // SVE + if (use_sve && sve_vector_size_in_bytes > 16) { + sub(stack, stack, sve_vector_size_in_bytes * count); + for (int i = 0; i < count; i++) { + sve_str(as_FloatRegister(regs[i]), Address(stack, i)); + } + return count * sve_vector_size_in_bytes / 8; + } + add(stack, stack, -count * wordSize * 2); + // NEON if (count & 1) { strq(as_FloatRegister(regs[0]), Address(stack)); i += 1; @@ -2140,7 +2159,16 @@ int MacroAssembler::push_fp(unsigned int bitset, Register stack) { return count; } +// Return the number of dwords poped int MacroAssembler::pop_fp(unsigned int bitset, Register stack) { + int words_pushed = 0; + bool use_sve = false; + int sve_vector_size_in_bytes = 0; + +#ifdef COMPILER2 + use_sve = Matcher::supports_scalable_vector(); + sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); +#endif // Scan bitset to accumulate register pairs unsigned char regs[32]; int count = 0; @@ -2155,6 +2183,16 @@ int MacroAssembler::pop_fp(unsigned int bitset, Register stack) { return 0; } + // SVE + if (use_sve && sve_vector_size_in_bytes > 16) { + for (int i = count - 1; i >= 0; i--) { + sve_ldr(as_FloatRegister(regs[i]), Address(stack, i)); + } + add(stack, stack, sve_vector_size_in_bytes * count); + return count * sve_vector_size_in_bytes / 8; + } + + // NEON if (count & 1) { ldrq(as_FloatRegister(regs[0]), Address(stack)); i += 1; @@ -2630,23 +2668,39 @@ void MacroAssembler::pop_call_clobbered_registers() { pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp); } -void MacroAssembler::push_CPU_state(bool save_vectors) { - int step = (save_vectors ? 8 : 4) * wordSize; +void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve, + int sve_vector_size_in_bytes) { push(0x3fffffff, sp); // integer registers except lr & sp - mov(rscratch1, -step); - sub(sp, sp, step); - for (int i = 28; i >= 4; i -= 4) { - st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), - as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); + if (save_vectors && use_sve && sve_vector_size_in_bytes > 16) { + sub(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers); + for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) { + sve_str(as_FloatRegister(i), Address(sp, i)); + } + } else { + int step = (save_vectors ? 8 : 4) * wordSize; + mov(rscratch1, -step); + sub(sp, sp, step); + for (int i = 28; i >= 4; i -= 4) { + st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), + as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); + } + st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); } - st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); } -void MacroAssembler::pop_CPU_state(bool restore_vectors) { - int step = (restore_vectors ? 8 : 4) * wordSize; - for (int i = 0; i <= 28; i += 4) - ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), - as_FloatRegister(i+3), restore_vectors ? 
T2D : T1D, Address(post(sp, step))); +void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve, + int sve_vector_size_in_bytes) { + if (restore_vectors && use_sve && sve_vector_size_in_bytes > 16) { + for (int i = FloatRegisterImpl::number_of_registers - 1; i >= 0; i--) { + sve_ldr(as_FloatRegister(i), Address(sp, i)); + } + add(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers); + } else { + int step = (restore_vectors ? 8 : 4) * wordSize; + for (int i = 0; i <= 28; i += 4) + ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), + as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); + } pop(0x3fffffff, sp); // integer registers except lr & sp } @@ -2695,6 +2749,21 @@ Address MacroAssembler::spill_address(int size, int offset, Register tmp) return Address(base, offset); } +Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) { + assert(offset >= 0, "spill to negative address?"); + + Register base = sp; + + // An immediate offset in the range 0 to 255 which is multiplied + // by the current vector or predicate register size in bytes. + if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) { + return Address(base, offset / sve_reg_size_in_bytes); + } + + add(tmp, base, offset); + return Address(tmp); +} + // Checks whether offset is aligned. // Returns true if it is, else false. bool MacroAssembler::merge_alignment_check(Register base, @@ -5879,3 +5948,24 @@ void MacroAssembler::get_thread(Register dst) { pop(saved_regs, sp); } + +void MacroAssembler::verify_sve_vector_length() { + Label verify_ok; + assert(UseSVE > 0, "should only be used for SVE"); + movw(rscratch1, zr); + sve_inc(rscratch1, B); + subsw(zr, rscratch1, VM_Version::get_initial_sve_vector_length()); + br(EQ, verify_ok); + stop("Error: SVE vector length has changed since jvm startup"); + bind(verify_ok); +} + +void MacroAssembler::verify_ptrue() { + Label verify_ok; + assert(UseSVE > 0, "should only be used for SVE"); + sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count. 
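// How the check below works (worked example, vector length assumed): sve_cntp leaves
// the number of active byte lanes of p7 in rscratch1; for an intact all-true predicate
// that equals the SVE vector length in bytes (e.g. 32 on a 256-bit machine). The
// following sve_dec subtracts exactly one vector length worth of byte lanes, so
// rscratch1 ends up zero only if every element of p7 is still true.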
+ sve_dec(rscratch1, B); + cbz(rscratch1, verify_ok); + stop("Error: the preserved predicate register (p7) elements are not all true"); + bind(verify_ok); +} diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index 85fdc0c88..dccd24911 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -862,8 +862,10 @@ public: DEBUG_ONLY(void verify_heapbase(const char* msg);) - void push_CPU_state(bool save_vectors = false); - void pop_CPU_state(bool restore_vectors = false) ; + void push_CPU_state(bool save_vectors = false, bool use_sve = false, + int sve_vector_size_in_bytes = 0); + void pop_CPU_state(bool restore_vectors = false, bool use_sve = false, + int sve_vector_size_in_bytes = 0); // Round up to a power of two void round_to(Register reg, int modulus); @@ -938,6 +940,11 @@ public: Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0); + void verify_sve_vector_length(); + void reinitialize_ptrue() { + sve_ptrue(ptrue, B); + } + void verify_ptrue(); // Debugging @@ -1319,6 +1326,7 @@ private: // Returns an address on the stack which is reachable with a ldr/str of size // Uses rscratch2 if the address is not directly reachable Address spill_address(int size, int offset, Register tmp=rscratch2); + Address sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp=rscratch2); bool merge_alignment_check(Register base, size_t size, int64_t cur_offset, int64_t prev_offset) const; @@ -1342,6 +1350,9 @@ public: void spill(FloatRegister Vx, SIMD_RegVariant T, int offset) { str(Vx, T, spill_address(1 << (int)T, offset)); } + void spill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) { + sve_str(Zx, sve_spill_address(vector_reg_size_in_bytes, offset)); + } void unspill(Register Rx, bool is64, int offset) { if (is64) { ldr(Rx, spill_address(8, offset)); @@ -1352,6 +1363,9 @@ public: void unspill(FloatRegister Vx, SIMD_RegVariant T, int offset) { ldr(Vx, T, spill_address(1 << (int)T, offset)); } + void unspill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) { + sve_ldr(Zx, sve_spill_address(vector_reg_size_in_bytes, offset)); + } void spill_copy128(int src_offset, int dst_offset, Register tmp1=rscratch1, Register tmp2=rscratch2) { if (src_offset < 512 && (src_offset & 7) == 0 && @@ -1365,6 +1379,15 @@ public: spill(tmp1, true, dst_offset+8); } } + void spill_copy_sve_vector_stack_to_stack(int src_offset, int dst_offset, + int sve_vec_reg_size_in_bytes) { + assert(sve_vec_reg_size_in_bytes % 16 == 0, "unexpected sve vector reg size"); + for (int i = 0; i < sve_vec_reg_size_in_bytes / 16; i++) { + spill_copy128(src_offset, dst_offset); + src_offset += 16; + dst_offset += 16; + } + } }; #ifdef ASSERT diff --git a/src/hotspot/cpu/aarch64/register_aarch64.cpp b/src/hotspot/cpu/aarch64/register_aarch64.cpp index 36cbe3fee..3db8e8337 100644 --- a/src/hotspot/cpu/aarch64/register_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/register_aarch64.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -33,6 +33,9 @@ const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr + FloatRegisterImpl::number_of_registers * FloatRegisterImpl::max_slots_per_register; +const int ConcreteRegisterImpl::max_pr + = ConcreteRegisterImpl::max_fpr + PRegisterImpl::number_of_registers; + const char* RegisterImpl::name() const { const char* names[number_of_registers] = { "c_rarg0", "c_rarg1", "c_rarg2", "c_rarg3", "c_rarg4", "c_rarg5", "c_rarg6", "c_rarg7", @@ -54,3 +57,10 @@ const char* FloatRegisterImpl::name() const { }; return is_valid() ? names[encoding()] : "noreg"; } + +const char* PRegisterImpl::name() const { + const char* names[number_of_registers] = { + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7" + }; + return is_valid() ? names[encoding()] : "noreg"; +} diff --git a/src/hotspot/cpu/aarch64/register_aarch64.hpp b/src/hotspot/cpu/aarch64/register_aarch64.hpp @@ -129,9 +129,10 @@ class FloatRegisterImpl: public AbstractRegisterImpl { public: enum { number_of_registers = 32, - max_slots_per_register = 4, + max_slots_per_register = 8, save_slots_per_register = 2, - extra_save_slots_per_register = max_slots_per_register - save_slots_per_register + slots_per_neon_register = 4, + extra_save_slots_per_neon_register = slots_per_neon_register - save_slots_per_register }; // construction @@ -187,6 +188,79 @@ CONSTANT_REGISTER_DECLARATION(FloatRegister, v29 , (29)); CONSTANT_REGISTER_DECLARATION(FloatRegister, v30 , (30)); CONSTANT_REGISTER_DECLARATION(FloatRegister, v31 , (31)); +// SVE vector registers, shared with the SIMD&FP v0-v31. Vn maps to Zn[127:0]. +CONSTANT_REGISTER_DECLARATION(FloatRegister, z0 , ( 0)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z1 , ( 1)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z2 , ( 2)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z3 , ( 3)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z4 , ( 4)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z5 , ( 5)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z6 , ( 6)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z7 , ( 7)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z8 , ( 8)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z9 , ( 9)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z10 , (10)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z11 , (11)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z12 , (12)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z13 , (13)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z14 , (14)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z15 , (15)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z16 , (16)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z17 , (17)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z18 , (18)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z19 , (19)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z20 , (20)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z21 , (21)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z22 , (22)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z23 , (23)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z24 , (24)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z25 , (25)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z26 , (26)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z27 , (27)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z28 , (28)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z29 , (29)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z30 , (30)); +CONSTANT_REGISTER_DECLARATION(FloatRegister, z31 , (31)); + +class 
PRegisterImpl; +typedef PRegisterImpl* PRegister; +inline PRegister as_PRegister(int encoding) { + return (PRegister)(intptr_t)encoding; +} + +// The implementation of predicate registers for the architecture +class PRegisterImpl: public AbstractRegisterImpl { + public: + enum { + number_of_registers = 8, + max_slots_per_register = 1 + }; + + // construction + inline friend PRegister as_PRegister(int encoding); + + VMReg as_VMReg(); + + // derived registers, offsets, and addresses + PRegister successor() const { return as_PRegister(encoding() + 1); } + + // accessors + int encoding() const { assert(is_valid(), "invalid register"); return (intptr_t)this; } + int encoding_nocheck() const { return (intptr_t)this; } + bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; } + const char* name() const; +}; + +// The predicate registers of SVE. +CONSTANT_REGISTER_DECLARATION(PRegister, p0, ( 0)); +CONSTANT_REGISTER_DECLARATION(PRegister, p1, ( 1)); +CONSTANT_REGISTER_DECLARATION(PRegister, p2, ( 2)); +CONSTANT_REGISTER_DECLARATION(PRegister, p3, ( 3)); +CONSTANT_REGISTER_DECLARATION(PRegister, p4, ( 4)); +CONSTANT_REGISTER_DECLARATION(PRegister, p5, ( 5)); +CONSTANT_REGISTER_DECLARATION(PRegister, p6, ( 6)); +CONSTANT_REGISTER_DECLARATION(PRegister, p7, ( 7)); + // Need to know the total number of registers of all sorts for SharedInfo. // Define a class that exports it. class ConcreteRegisterImpl : public AbstractRegisterImpl { @@ -199,12 +273,14 @@ class ConcreteRegisterImpl : public AbstractRegisterImpl { number_of_registers = (RegisterImpl::max_slots_per_register * RegisterImpl::number_of_registers + FloatRegisterImpl::max_slots_per_register * FloatRegisterImpl::number_of_registers + + PRegisterImpl::max_slots_per_register * PRegisterImpl::number_of_registers + 1) // flags }; // added to make it compile static const int max_gpr; static const int max_fpr; + static const int max_pr; }; // A set of registers diff --git a/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp b/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp index c18109087..e337f582a 100644 --- a/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2020, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* @@ -154,3 +154,47 @@ REGISTER_DEFINITION(Register, rthread); REGISTER_DEFINITION(Register, rheapbase); REGISTER_DEFINITION(Register, r31_sp); + +REGISTER_DEFINITION(FloatRegister, z0); +REGISTER_DEFINITION(FloatRegister, z1); +REGISTER_DEFINITION(FloatRegister, z2); +REGISTER_DEFINITION(FloatRegister, z3); +REGISTER_DEFINITION(FloatRegister, z4); +REGISTER_DEFINITION(FloatRegister, z5); +REGISTER_DEFINITION(FloatRegister, z6); +REGISTER_DEFINITION(FloatRegister, z7); +REGISTER_DEFINITION(FloatRegister, z8); +REGISTER_DEFINITION(FloatRegister, z9); +REGISTER_DEFINITION(FloatRegister, z10); +REGISTER_DEFINITION(FloatRegister, z11); +REGISTER_DEFINITION(FloatRegister, z12); +REGISTER_DEFINITION(FloatRegister, z13); +REGISTER_DEFINITION(FloatRegister, z14); +REGISTER_DEFINITION(FloatRegister, z15); +REGISTER_DEFINITION(FloatRegister, z16); +REGISTER_DEFINITION(FloatRegister, z17); +REGISTER_DEFINITION(FloatRegister, z18); +REGISTER_DEFINITION(FloatRegister, z19); +REGISTER_DEFINITION(FloatRegister, z20); +REGISTER_DEFINITION(FloatRegister, z21); +REGISTER_DEFINITION(FloatRegister, z22); +REGISTER_DEFINITION(FloatRegister, z23); +REGISTER_DEFINITION(FloatRegister, z24); +REGISTER_DEFINITION(FloatRegister, z25); +REGISTER_DEFINITION(FloatRegister, z26); +REGISTER_DEFINITION(FloatRegister, z27); +REGISTER_DEFINITION(FloatRegister, z28); +REGISTER_DEFINITION(FloatRegister, z29); +REGISTER_DEFINITION(FloatRegister, z30); +REGISTER_DEFINITION(FloatRegister, z31); + +REGISTER_DEFINITION(PRegister, p0); +REGISTER_DEFINITION(PRegister, p1); +REGISTER_DEFINITION(PRegister, p2); +REGISTER_DEFINITION(PRegister, p3); +REGISTER_DEFINITION(PRegister, p4); +REGISTER_DEFINITION(PRegister, p5); +REGISTER_DEFINITION(PRegister, p6); +REGISTER_DEFINITION(PRegister, p7); + +REGISTER_DEFINITION(PRegister, ptrue); diff --git a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp index 3d3cc3a1e..6242cce08 100644 --- a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp @@ -111,11 +111,28 @@ class RegisterSaver { }; OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) { + bool use_sve = false; + int sve_vector_size_in_bytes = 0; + int sve_vector_size_in_slots = 0; + +#ifdef COMPILER2 + use_sve = Matcher::supports_scalable_vector(); + sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); + sve_vector_size_in_slots = Matcher::scalable_vector_reg_size(T_FLOAT); +#endif + #if COMPILER2_OR_JVMCI if (save_vectors) { + int vect_words = 0; + int extra_save_slots_per_register = 0; // Save upper half of vector registers - int vect_words = FloatRegisterImpl::number_of_registers * FloatRegisterImpl::extra_save_slots_per_register / - VMRegImpl::slots_per_word; + if (use_sve) { + extra_save_slots_per_register = sve_vector_size_in_slots - FloatRegisterImpl::save_slots_per_register; + } else { + extra_save_slots_per_register = FloatRegisterImpl::extra_save_slots_per_neon_register; + } + vect_words = FloatRegisterImpl::number_of_registers * extra_save_slots_per_register / + VMRegImpl::slots_per_word; additional_frame_words += vect_words; } #else @@ -134,7 +151,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ // Save Integer and Float registers. __ enter(); - __ push_CPU_state(save_vectors); + __ push_CPU_state(save_vectors, use_sve, sve_vector_size_in_bytes); // Set an oopmap for the call site. 
This oopmap will map all // oop-registers and debug-info registers as callee-saved. This @@ -158,8 +175,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) { FloatRegister r = as_FloatRegister(i); - int sp_offset = save_vectors ? (FloatRegisterImpl::max_slots_per_register * i) : - (FloatRegisterImpl::save_slots_per_register * i); + int sp_offset = 0; + if (save_vectors) { + sp_offset = use_sve ? (sve_vector_size_in_slots * i) : + (FloatRegisterImpl::slots_per_neon_register * i); + } else { + sp_offset = FloatRegisterImpl::save_slots_per_register * i; + } oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset), r->as_VMReg()); } @@ -168,10 +190,15 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ } void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { -#ifndef COMPILER2 +#ifdef COMPILER2 + __ pop_CPU_state(restore_vectors, Matcher::supports_scalable_vector(), + Matcher::scalable_vector_reg_size(T_BYTE)); +#else +#if !INCLUDE_JVMCI assert(!restore_vectors, "vectors are generated only by C2 and JVMCI"); #endif __ pop_CPU_state(restore_vectors); +#endif __ leave(); } @@ -1829,6 +1856,11 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset())); } + if (UseSVE > 0) { + // Make sure that jni code does not change SVE vector length. + __ verify_sve_vector_length(); + } + // check for safepoint operation in progress and/or pending suspend requests Label safepoint_in_progress, safepoint_in_progress_done; { @@ -2759,6 +2791,12 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_t __ maybe_isb(); __ membar(Assembler::LoadLoad | Assembler::LoadStore); + if (UseSVE > 0 && save_vectors) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); __ cbz(rscratch1, noException); diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 26a54c87e..85f64c007 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -488,6 +488,11 @@ class StubGenerator: public StubCodeGenerator { __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); + if (UseSVE > 0 ) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } // we should not really care that lr is no longer the callee // address. we saved the value the handler needs in r19 so we can // just copy it to r3. however, the C2 handler will push its own @@ -5092,6 +5097,12 @@ class StubGenerator: public StubCodeGenerator { __ reset_last_Java_frame(true); __ maybe_isb(); + if (UseSVE > 0) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. 
+ __ reinitialize_ptrue(); + } + __ leave(); // check for pending exceptions diff --git a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp index 03d7a6e2d..42f301531 100644 --- a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp @@ -1377,6 +1377,11 @@ address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) { __ push(dtos); __ push(ltos); + if (UseSVE > 0) { + // Make sure that jni code does not change SVE vector length. + __ verify_sve_vector_length(); + } + // change thread state __ mov(rscratch1, _thread_in_native_trans); __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset())); diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp index e906454f1..7ae881b74 100644 --- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp @@ -30,12 +30,14 @@ #include "runtime/java.hpp" #include "runtime/stubCodeGenerator.hpp" #include "runtime/vm_version.hpp" +#include "utilities/formatBuffer.hpp" #include "utilities/macros.hpp" #include OS_HEADER_INLINE(os) -#include <sys/auxv.h> #include <asm/hwcap.h> +#include <sys/auxv.h> +#include <sys/prctl.h> #ifndef HWCAP_AES #define HWCAP_AES (1<<3) @@ -61,12 +63,27 @@ #define HWCAP_ATOMICS (1<<8) #endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif + +#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1 << 1) +#endif + +#ifndef PR_SVE_GET_VL +// For old toolchains which do not have SVE related macros defined. +#define PR_SVE_SET_VL 50 +#define PR_SVE_GET_VL 51 +#endif + int VM_Version::_cpu; int VM_Version::_model; int VM_Version::_model2; int VM_Version::_variant; int VM_Version::_revision; int VM_Version::_stepping; +int VM_Version::_initial_sve_vector_length; VM_Version::PsrInfo VM_Version::_psr_info = { 0, }; static BufferBlob* stub_blob; @@ -164,6 +181,7 @@ void VM_Version::get_processor_features() { } uint64_t auxv = getauxval(AT_HWCAP); + uint64_t auxv2 = getauxval(AT_HWCAP2); char buf[512]; @@ -269,6 +287,8 @@ void VM_Version::get_processor_features() { if (auxv & HWCAP_SHA1) strcat(buf, ", sha1"); if (auxv & HWCAP_SHA2) strcat(buf, ", sha256"); if (auxv & HWCAP_ATOMICS) strcat(buf, ", lse"); + if (auxv & HWCAP_SVE) strcat(buf, ", sve"); + if (auxv2 & HWCAP2_SVE2) strcat(buf, ", sve2"); _features_string = os::strdup(buf); @@ -402,6 +422,18 @@ void VM_Version::get_processor_features() { FLAG_SET_DEFAULT(UseBlockZeroing, false); } + if (auxv & HWCAP_SVE) { + if (FLAG_IS_DEFAULT(UseSVE)) { + FLAG_SET_DEFAULT(UseSVE, (auxv2 & HWCAP2_SVE2) ? 2 : 1); + } + if (UseSVE > 0) { + _initial_sve_vector_length = prctl(PR_SVE_GET_VL); + } + } else if (UseSVE > 0) { + warning("UseSVE specified, but not supported on current CPU. Disabling SVE."); + FLAG_SET_DEFAULT(UseSVE, 0); + } + // This machine allows unaligned memory accesses if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) { FLAG_SET_DEFAULT(UseUnalignedAccesses, true); @@ -435,6 +467,50 @@ void VM_Version::get_processor_features() { UseMontgomerySquareIntrinsic = true; } + if (UseSVE > 0) { + if (FLAG_IS_DEFAULT(MaxVectorSize)) { + MaxVectorSize = _initial_sve_vector_length; + } else if (MaxVectorSize < 16) { + warning("SVE does not support vector length less than 16 bytes. 
Disabling SVE."); + UseSVE = 0; + } else if ((MaxVectorSize % 16) == 0 && is_power_of_2(MaxVectorSize)) { + int new_vl = prctl(PR_SVE_SET_VL, MaxVectorSize); + _initial_sve_vector_length = new_vl; + // If MaxVectorSize is larger than system largest supported SVE vector length, above prctl() + // call will set task vector length to the system largest supported value. So, we also update + // MaxVectorSize to that largest supported value. + if (new_vl < 0) { + vm_exit_during_initialization( + err_msg("Current system does not support SVE vector length for MaxVectorSize: %d", + (int)MaxVectorSize)); + } else if (new_vl != MaxVectorSize) { + warning("Current system only supports max SVE vector length %d. Set MaxVectorSize to %d", + new_vl, new_vl); + } + MaxVectorSize = new_vl; + } else { + vm_exit_during_initialization(err_msg("Unsupported MaxVectorSize: %d", (int)MaxVectorSize)); + } + } + + if (UseSVE == 0) { // NEON + int min_vector_size = 8; + int max_vector_size = 16; + if (!FLAG_IS_DEFAULT(MaxVectorSize)) { + if (!is_power_of_2(MaxVectorSize)) { + vm_exit_during_initialization(err_msg("Unsupported MaxVectorSize: %d", (int)MaxVectorSize)); + } else if (MaxVectorSize < min_vector_size) { + warning("MaxVectorSize must be at least %i on this platform", min_vector_size); + FLAG_SET_DEFAULT(MaxVectorSize, min_vector_size); + } else if (MaxVectorSize > max_vector_size) { + warning("MaxVectorSize must be at most %i on this platform", max_vector_size); + FLAG_SET_DEFAULT(MaxVectorSize, max_vector_size); + } + } else { + FLAG_SET_DEFAULT(MaxVectorSize, 16); + } + } + if (FLAG_IS_DEFAULT(OptoScheduling)) { OptoScheduling = true; } diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp index dcb6342e1..ae2715102 100644 --- a/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp @@ -40,6 +40,7 @@ protected: static int _variant; static int _revision; static int _stepping; + static int _initial_sve_vector_length; struct PsrInfo { uint32_t dczid_el0; @@ -101,6 +102,7 @@ public: static int cpu_model2() { return _model2; } static int cpu_variant() { return _variant; } static int cpu_revision() { return _revision; } + static int get_initial_sve_vector_length() { return _initial_sve_vector_length; }; static ByteSize dczid_el0_offset() { return byte_offset_of(PsrInfo, dczid_el0); } static ByteSize ctr_el0_offset() { return byte_offset_of(PsrInfo, ctr_el0); } static bool is_zva_enabled() { diff --git a/src/hotspot/cpu/aarch64/vmreg_aarch64.inline.hpp b/src/hotspot/cpu/aarch64/vmreg_aarch64.inline.hpp index c5d4383b4..dde7a7a91 100644 --- a/src/hotspot/cpu/aarch64/vmreg_aarch64.inline.hpp +++ b/src/hotspot/cpu/aarch64/vmreg_aarch64.inline.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2006, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -36,4 +36,8 @@ inline VMReg FloatRegisterImpl::as_VMReg() { ConcreteRegisterImpl::max_gpr); } +inline VMReg PRegisterImpl::as_VMReg() { + return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_fpr); +} + #endif // CPU_AARCH64_VM_VMREG_AARCH64_INLINE_HPP diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index 4a32af54e..03248b2e0 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -1093,7 +1093,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for @@ -1121,6 +1121,14 @@ const int Matcher::vector_width_in_bytes(BasicType bt) { return MaxVectorSize; } +const bool Matcher::supports_scalable_vector() { + return false; +} + +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return -1; +} + // Vector ideal reg corresponding to specified size in bytes const uint Matcher::vector_ideal_reg(int size) { assert(MaxVectorSize >= size, ""); diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index 36cbada53..571a6aeb0 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2242,7 +2242,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for @@ -2310,6 +2310,14 @@ const int Matcher::min_vector_size(const BasicType bt) { return max_vector_size(bt); // Same as max. } +const bool Matcher::supports_scalable_vector() { + return false; +} + +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return -1; +} + // PPC implementation uses VSX load/store instructions (if // SuperwordUseVSX) which support 4 byte but not arbitrary alignment const bool Matcher::misaligned_vectors_ok() { diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 96c231b0a..782c1c7c4 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1522,7 +1522,7 @@ const bool Matcher::match_rule_supported(int opcode) { // BUT: make sure match rule is not disabled by a false predicate! } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // Identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen. @@ -1573,6 +1573,14 @@ const int Matcher::min_vector_size(const BasicType bt) { return max_vector_size(bt); // Same as max. 
} +const bool Matcher::supports_scalable_vector() { + return false; +} + +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return -1; +} + const uint Matcher::vector_shift_count_ideal_reg(int size) { fatal("vector shift is not supported"); return Node::NotAMachineReg; diff --git a/src/hotspot/cpu/sparc/sparc.ad b/src/hotspot/cpu/sparc/sparc.ad index a09c795c9..3b1b1046e 100644 --- a/src/hotspot/cpu/sparc/sparc.ad +++ b/src/hotspot/cpu/sparc/sparc.ad @@ -1710,7 +1710,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index abdd7483d..93aee6d6c 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -1354,7 +1354,7 @@ const bool Matcher::match_rule_supported(int opcode) { return ret_value; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen bool ret_value = match_rule_supported(opcode); @@ -1485,6 +1485,14 @@ const int Matcher::min_vector_size(const BasicType bt) { return MIN2(size,max_size); } +const bool Matcher::supports_scalable_vector() { + return false; +} + +const int Matcher::scalable_vector_reg_size(const BasicType bt) { + return -1; +} + // Vector ideal reg corresponding to specified size in bytes const uint Matcher::vector_ideal_reg(int size) { assert(MaxVectorSize >= size, ""); diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad index 8904bba97..e09cdd061 100644 --- a/src/hotspot/cpu/x86/x86_64.ad +++ b/src/hotspot/cpu/x86/x86_64.ad @@ -2968,7 +2968,7 @@ frame RAX_H_num // Op_RegL }; // Excluded flags and vector registers. - assert(ARRAY_SIZE(hi) == _last_machine_leaf - 6, "missing type"); + assert(ARRAY_SIZE(hi) == _last_machine_leaf - 8, "missing type"); return OptoRegPair(hi[ideal_reg], lo[ideal_reg]); %} %} diff --git a/src/hotspot/share/adlc/archDesc.cpp b/src/hotspot/share/adlc/archDesc.cpp index ba61aa4c0..9e41b2dc6 100644 --- a/src/hotspot/share/adlc/archDesc.cpp +++ b/src/hotspot/share/adlc/archDesc.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -929,6 +929,7 @@ const char *ArchDesc::getIdealType(const char *idealOp) { // Match Vector types. 
if (strncmp(idealOp, "Vec",3)==0) { switch(last_char) { + case 'A': return "TypeVect::VECTA"; case 'S': return "TypeVect::VECTS"; case 'D': return "TypeVect::VECTD"; case 'X': return "TypeVect::VECTX"; @@ -939,6 +940,10 @@ const char *ArchDesc::getIdealType(const char *idealOp) { } } + if (strncmp(idealOp, "RegVMask", 8) == 0) { + return "Type::BOTTOM"; + } + // !!!!! switch(last_char) { case 'I': return "TypeInt::INT"; diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index c7b855a7e..a37866824 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -3963,6 +3963,8 @@ bool MatchRule::is_base_register(FormDict &globals) const { strcmp(opType,"RegL")==0 || strcmp(opType,"RegF")==0 || strcmp(opType,"RegD")==0 || + strcmp(opType,"RegVMask")==0 || + strcmp(opType,"VecA")==0 || strcmp(opType,"VecS")==0 || strcmp(opType,"VecD")==0 || strcmp(opType,"VecX")==0 || diff --git a/src/hotspot/share/opto/chaitin.cpp b/src/hotspot/share/opto/chaitin.cpp index 914dc43f6..710af9de8 100644 --- a/src/hotspot/share/opto/chaitin.cpp +++ b/src/hotspot/share/opto/chaitin.cpp @@ -77,6 +77,7 @@ void LRG::dump() const { if( _is_oop ) tty->print("Oop "); if( _is_float ) tty->print("Float "); if( _is_vector ) tty->print("Vector "); + if( _is_scalable ) tty->print("Scalable "); if( _was_spilled1 ) tty->print("Spilled "); if( _was_spilled2 ) tty->print("Spilled2 "); if( _direct_conflict ) tty->print("Direct_conflict "); @@ -646,7 +647,15 @@ void PhaseChaitin::Register_Allocate() { // Live ranges record the highest register in their mask. // We want the low register for the AD file writer's convenience. OptoReg::Name hi = lrg.reg(); // Get hi register - OptoReg::Name lo = OptoReg::add(hi, (1-lrg.num_regs())); // Find lo + int num_regs = lrg.num_regs(); + if (lrg.is_scalable() && OptoReg::is_stack(hi)) { + // For scalable vector registers, when they are allocated in physical + // registers, num_regs is RegMask::SlotsPerVecA for reg mask of scalable + // vector. If they are allocated on stack, we need to get the actual + // num_regs, which reflects the physical length of scalable registers. + num_regs = lrg.scalable_reg_slots(); + } + OptoReg::Name lo = OptoReg::add(hi, (1-num_regs)); // Find lo // We have to use pair [lo,lo+1] even for wide vectors because // the rest of code generation works only with pairs. It is safe // since for registers encoding only 'lo' is used. @@ -801,8 +810,19 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) { // Check for vector live range (only if vector register is used). // On SPARC vector uses RegD which could be misaligned so it is not // processes as vector in RA. - if (RegMask::is_vector(ireg)) + if (RegMask::is_vector(ireg)) { lrg._is_vector = 1; + if (ireg == Op_VecA) { + assert(Matcher::supports_scalable_vector(), "scalable vector should be supported"); + lrg._is_scalable = 1; + // For scalable vector, when it is allocated in physical register, + // num_regs is RegMask::SlotsPerVecA for reg mask, + // which may not be the actual physical register size. + // If it is allocated in stack, we need to get the actual + // physical length of scalable vector register. 
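// Worked example for the call below (vector length assumed, not from this patch):
// on a 512-bit SVE machine, Matcher::scalable_vector_reg_size(T_FLOAT) is 16, so a
// spilled VecA live range really occupies 16 32-bit stack slots, even though its
// register mask keeps modelling only RegMask::SlotsPerVecA slots while the value
// lives in a physical Z register.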
+ lrg.set_scalable_reg_slots(Matcher::scalable_vector_reg_size(T_FLOAT)); + } + } assert(n_type->isa_vect() == NULL || lrg._is_vector || ireg == Op_RegD || ireg == Op_RegL, "vector must be in vector registers"); @@ -912,6 +932,13 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) { lrg.set_reg_pressure(1); #endif break; + case Op_VecA: + assert(Matcher::supports_scalable_vector(), "does not support scalable vector"); + assert(RegMask::num_registers(Op_VecA) == RegMask::SlotsPerVecA, "sanity"); + assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecA), "vector should be aligned"); + lrg.set_num_regs(RegMask::SlotsPerVecA); + lrg.set_reg_pressure(1); + break; case Op_VecS: assert(Matcher::vector_size_supported(T_BYTE,4), "sanity"); assert(RegMask::num_registers(Op_VecS) == RegMask::SlotsPerVecS, "sanity"); @@ -1358,6 +1385,46 @@ static bool is_legal_reg(LRG &lrg, OptoReg::Name reg, int chunk) { return false; } +static OptoReg::Name find_first_set(LRG &lrg, RegMask mask, int chunk) { + int num_regs = lrg.num_regs(); + OptoReg::Name assigned = mask.find_first_set(lrg, num_regs); + + if (lrg.is_scalable()) { + // a physical register is found + if (chunk == 0 && OptoReg::is_reg(assigned)) { + return assigned; + } + + // find available stack slots for scalable register + if (lrg._is_vector) { + num_regs = lrg.scalable_reg_slots(); + // if actual scalable vector register is exactly SlotsPerVecA * 32 bits + if (num_regs == RegMask::SlotsPerVecA) { + return assigned; + } + + // mask has been cleared out by clear_to_sets(SlotsPerVecA) before choose_color, but it + // does not work for scalable size. We have to find adjacent scalable_reg_slots() bits + // instead of SlotsPerVecA bits. + assigned = mask.find_first_set(lrg, num_regs); // find highest valid reg + while (OptoReg::is_valid(assigned) && RegMask::can_represent(assigned)) { + // Verify the found reg has scalable_reg_slots() bits set. + if (mask.is_valid_reg(assigned, num_regs)) { + return assigned; + } else { + // Remove more for each iteration + mask.Remove(assigned - num_regs + 1); // Unmask the lowest reg + mask.clear_to_sets(RegMask::SlotsPerVecA); // Align by SlotsPerVecA bits + assigned = mask.find_first_set(lrg, num_regs); + } + } + return OptoReg::Bad; // will cause chunk change, and retry next chunk + } + } + + return assigned; +} + // Choose a color using the biasing heuristic OptoReg::Name PhaseChaitin::bias_color( LRG &lrg, int chunk ) { @@ -1391,7 +1458,7 @@ OptoReg::Name PhaseChaitin::bias_color( LRG &lrg, int chunk ) { RegMask tempmask = lrg.mask(); tempmask.AND(lrgs(copy_lrg).mask()); tempmask.clear_to_sets(lrg.num_regs()); - OptoReg::Name reg = tempmask.find_first_set(lrg.num_regs()); + OptoReg::Name reg = find_first_set(lrg, tempmask, chunk); if (OptoReg::is_valid(reg)) return reg; } @@ -1400,7 +1467,7 @@ OptoReg::Name PhaseChaitin::bias_color( LRG &lrg, int chunk ) { // If no bias info exists, just go with the register selection ordering if (lrg._is_vector || lrg.num_regs() == 2) { // Find an aligned set - return OptoReg::add(lrg.mask().find_first_set(lrg.num_regs()),chunk); + return OptoReg::add(find_first_set(lrg, lrg.mask(), chunk), chunk); } // CNC - Fun hack. Alternate 1st and 2nd selection. 
Enables post-allocate @@ -1455,7 +1522,6 @@ uint PhaseChaitin::Select( ) { LRG *lrg = &lrgs(lidx); _simplified = lrg->_next; - #ifndef PRODUCT if (trace_spilling()) { ttyLocker ttyl; @@ -1539,7 +1605,6 @@ uint PhaseChaitin::Select( ) { // Bump register mask up to next stack chunk chunk += RegMask::CHUNK_SIZE; lrg->Set_All(); - goto retry_next_chunk; } @@ -1564,12 +1629,21 @@ uint PhaseChaitin::Select( ) { int n_regs = lrg->num_regs(); assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity"); if (n_regs == 1 || !lrg->_fat_proj) { - assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecZ, "sanity"); + if (Matcher::supports_scalable_vector()) { + assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecA, "sanity"); + } else { + assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecZ, "sanity"); + } lrg->Clear(); // Clear the mask lrg->Insert(reg); // Set regmask to match selected reg // For vectors and pairs, also insert the low bit of the pair - for (int i = 1; i < n_regs; i++) + // We always choose the high bit, then mask the low bits by register size + if (lrg->is_scalable() && OptoReg::is_stack(lrg->reg())) { // stack + n_regs = lrg->scalable_reg_slots(); + } + for (int i = 1; i < n_regs; i++) { lrg->Insert(OptoReg::add(reg,-i)); + } lrg->set_mask_size(n_regs); } else { // Else fatproj // mask must be equal to fatproj bits, by definition diff --git a/src/hotspot/share/opto/chaitin.hpp b/src/hotspot/share/opto/chaitin.hpp index e5be5b966..5408a24ef 100644 --- a/src/hotspot/share/opto/chaitin.hpp +++ b/src/hotspot/share/opto/chaitin.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -115,7 +115,9 @@ public: _msize_valid=1; if (_is_vector) { assert(!_fat_proj, "sanity"); - _mask.verify_sets(_num_regs); + if (!(_is_scalable && OptoReg::is_stack(_reg))) { + _mask.verify_sets(_num_regs); + } } else if (_num_regs == 2 && !_fat_proj) { _mask.verify_pairs(); } @@ -139,14 +141,37 @@ public: void clear_to_pairs() { _mask.clear_to_pairs(); debug_only(_msize_valid=0;) } void clear_to_sets() { _mask.clear_to_sets(_num_regs); debug_only(_msize_valid=0;) } - // Number of registers this live range uses when it colors private: + // Number of registers this live range uses when it colors uint16_t _num_regs; // 2 for Longs and Doubles, 1 for all else // except _num_regs is kill count for fat_proj + + // For scalable register, num_regs may not be the actual physical register size. + // We need to get the actual physical length of scalable register when scalable + // register is spilled. The size of one slot is 32-bit. + uint _scalable_reg_slots; // Actual scalable register length of slots. + // Meaningful only when _is_scalable is true. public: int num_regs() const { return _num_regs; } void set_num_regs( int reg ) { assert( _num_regs == reg || !_num_regs, "" ); _num_regs = reg; } + uint scalable_reg_slots() { return _scalable_reg_slots; } + void set_scalable_reg_slots(uint slots) { + assert(_is_scalable, "scalable register"); + assert(slots > 0, "slots of scalable register is not valid"); + _scalable_reg_slots = slots; + } + + bool is_scalable() { +#ifdef ASSERT + if (_is_scalable) { + // Should only be a vector for now, but it could also be a RegVMask in future. 
+ assert(_is_vector && (_num_regs == RegMask::SlotsPerVecA), "unexpected scalable reg"); + } +#endif + return _is_scalable; + } + private: // Number of physical registers this live range uses when it colors // Architecture and register-set dependent @@ -172,6 +197,8 @@ public: uint _is_oop:1, // Live-range holds an oop _is_float:1, // True if in float registers _is_vector:1, // True if in vector registers + _is_scalable:1, // True if register size is scalable + // e.g. Arm SVE vector/predicate registers. _was_spilled1:1, // True if prior spilling on def _was_spilled2:1, // True if twice prior spilling on def _is_bound:1, // live range starts life with no diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 4cc7580a8..4fb732161 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -84,6 +84,7 @@ Matcher::Matcher() idealreg2spillmask [Op_RegF] = NULL; idealreg2spillmask [Op_RegD] = NULL; idealreg2spillmask [Op_RegP] = NULL; + idealreg2spillmask [Op_VecA] = NULL; idealreg2spillmask [Op_VecS] = NULL; idealreg2spillmask [Op_VecD] = NULL; idealreg2spillmask [Op_VecX] = NULL; @@ -97,6 +98,7 @@ Matcher::Matcher() idealreg2debugmask [Op_RegF] = NULL; idealreg2debugmask [Op_RegD] = NULL; idealreg2debugmask [Op_RegP] = NULL; + idealreg2debugmask [Op_VecA] = NULL; idealreg2debugmask [Op_VecS] = NULL; idealreg2debugmask [Op_VecD] = NULL; idealreg2debugmask [Op_VecX] = NULL; @@ -110,6 +112,7 @@ Matcher::Matcher() idealreg2mhdebugmask[Op_RegF] = NULL; idealreg2mhdebugmask[Op_RegD] = NULL; idealreg2mhdebugmask[Op_RegP] = NULL; + idealreg2mhdebugmask[Op_VecA] = NULL; idealreg2mhdebugmask[Op_VecS] = NULL; idealreg2mhdebugmask[Op_VecD] = NULL; idealreg2mhdebugmask[Op_VecX] = NULL; @@ -417,6 +420,8 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) { return rms; } +#define NOF_STACK_MASKS (3*6+6) + //---------------------------init_first_stack_mask----------------------------- // Create the initial stack mask used by values spilling to the stack. // Disallow any debug info in outgoing argument areas by setting the @@ -424,7 +429,12 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) { void Matcher::init_first_stack_mask() { // Allocate storage for spill masks as masks for the appropriate load type. - RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+5)); + RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * NOF_STACK_MASKS); + + // Initialize empty placeholder masks into the newly allocated arena + for (int i = 0; i < NOF_STACK_MASKS; i++) { + new (rms + i) RegMask(); + } idealreg2spillmask [Op_RegN] = &rms[0]; idealreg2spillmask [Op_RegI] = &rms[1]; @@ -447,11 +457,12 @@ void Matcher::init_first_stack_mask() { idealreg2mhdebugmask[Op_RegD] = &rms[16]; idealreg2mhdebugmask[Op_RegP] = &rms[17]; - idealreg2spillmask [Op_VecS] = &rms[18]; - idealreg2spillmask [Op_VecD] = &rms[19]; - idealreg2spillmask [Op_VecX] = &rms[20]; - idealreg2spillmask [Op_VecY] = &rms[21]; - idealreg2spillmask [Op_VecZ] = &rms[22]; + idealreg2spillmask [Op_VecA] = &rms[18]; + idealreg2spillmask [Op_VecS] = &rms[19]; + idealreg2spillmask [Op_VecD] = &rms[20]; + idealreg2spillmask [Op_VecX] = &rms[21]; + idealreg2spillmask [Op_VecY] = &rms[22]; + idealreg2spillmask [Op_VecZ] = &rms[23]; OptoReg::Name i; @@ -478,6 +489,7 @@ void Matcher::init_first_stack_mask() { // Keep spill masks aligned. 
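// Arithmetic behind NOF_STACK_MASKS above: the six scalar ideal types (RegN, RegI,
// RegL, RegF, RegD, RegP) each need a spill, a debug and a method-handle-debug mask
// (3*6 = 18), and the six vector types (VecA, VecS, VecD, VecX, VecY, VecZ) each need
// a spill mask (+6), giving 24 RegMask slots where the old allocation had 3*6+5 = 23.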
aligned_stack_mask.clear_to_pairs(); assert(aligned_stack_mask.is_AllStack(), "should be infinite stack"); + RegMask scalable_stack_mask = aligned_stack_mask; *idealreg2spillmask[Op_RegP] = *idealreg2regmask[Op_RegP]; #ifdef _LP64 @@ -548,28 +560,48 @@ void Matcher::init_first_stack_mask() { *idealreg2spillmask[Op_VecZ] = *idealreg2regmask[Op_VecZ]; idealreg2spillmask[Op_VecZ]->OR(aligned_stack_mask); } - if (UseFPUForSpilling) { - // This mask logic assumes that the spill operations are - // symmetric and that the registers involved are the same size. - // On sparc for instance we may have to use 64 bit moves will - // kill 2 registers when used with F0-F31. - idealreg2spillmask[Op_RegI]->OR(*idealreg2regmask[Op_RegF]); - idealreg2spillmask[Op_RegF]->OR(*idealreg2regmask[Op_RegI]); + + if (Matcher::supports_scalable_vector()) { + int k = 1; + OptoReg::Name in = OptoReg::add(_in_arg_limit, -1); + // Exclude last input arg stack slots to avoid spilling vector register there, + // otherwise vector spills could stomp over stack slots in caller frame. + for (; (in >= init_in) && (k < scalable_vector_reg_size(T_FLOAT)); k++) { + scalable_stack_mask.Remove(in); + in = OptoReg::add(in, -1); + } + + // For VecA + scalable_stack_mask.clear_to_sets(RegMask::SlotsPerVecA); + assert(scalable_stack_mask.is_AllStack(), "should be infinite stack"); + *idealreg2spillmask[Op_VecA] = *idealreg2regmask[Op_VecA]; + idealreg2spillmask[Op_VecA]->OR(scalable_stack_mask); + } else { + *idealreg2spillmask[Op_VecA] = RegMask::Empty; + } + + if (UseFPUForSpilling) { + // This mask logic assumes that the spill operations are + // symmetric and that the registers involved are the same size. + // On sparc for instance we may have to use 64 bit moves will + // kill 2 registers when used with F0-F31. + idealreg2spillmask[Op_RegI]->OR(*idealreg2regmask[Op_RegF]); + idealreg2spillmask[Op_RegF]->OR(*idealreg2regmask[Op_RegI]); #ifdef _LP64 - idealreg2spillmask[Op_RegN]->OR(*idealreg2regmask[Op_RegF]); - idealreg2spillmask[Op_RegL]->OR(*idealreg2regmask[Op_RegD]); - idealreg2spillmask[Op_RegD]->OR(*idealreg2regmask[Op_RegL]); - idealreg2spillmask[Op_RegP]->OR(*idealreg2regmask[Op_RegD]); + idealreg2spillmask[Op_RegN]->OR(*idealreg2regmask[Op_RegF]); + idealreg2spillmask[Op_RegL]->OR(*idealreg2regmask[Op_RegD]); + idealreg2spillmask[Op_RegD]->OR(*idealreg2regmask[Op_RegL]); + idealreg2spillmask[Op_RegP]->OR(*idealreg2regmask[Op_RegD]); #else - idealreg2spillmask[Op_RegP]->OR(*idealreg2regmask[Op_RegF]); + idealreg2spillmask[Op_RegP]->OR(*idealreg2regmask[Op_RegF]); #ifdef ARM - // ARM has support for moving 64bit values between a pair of - // integer registers and a double register - idealreg2spillmask[Op_RegL]->OR(*idealreg2regmask[Op_RegD]); - idealreg2spillmask[Op_RegD]->OR(*idealreg2regmask[Op_RegL]); + // ARM has support for moving 64bit values between a pair of + // integer registers and a double register + idealreg2spillmask[Op_RegL]->OR(*idealreg2regmask[Op_RegD]); + idealreg2spillmask[Op_RegD]->OR(*idealreg2regmask[Op_RegL]); #endif #endif - } + } // Make up debug masks. Any spill slot plus callee-save registers. // Caller-save registers are assumed to be trashable by the various @@ -872,6 +904,10 @@ void Matcher::init_spill_mask( Node *ret ) { idealreg2regmask[Op_RegP] = &spillP->out_RegMask(); // Vector regmasks. 
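The scalable spill-mask construction above amounts to three steps: start from the aligned stack mask, drop the trailing incoming-argument slots so a vector spill cannot stomp the caller's frame, then keep only SlotsPerVecA-aligned runs of slots. A toy model of the masking steps (std::bitset stands in for RegMask; the slot counts are invented):

#include <bitset>
#include <cstddef>
#include <iostream>

constexpr int kSlotsPerVecA = 8;

// Keep only groups of `size` adjacent bits starting at multiples of `size`;
// clear everything else (rough equivalent of RegMask::clear_to_sets()).
template <std::size_t N>
void clear_to_sets(std::bitset<N>& m, int size) {
    for (std::size_t base = 0; base + size <= N; base += size) {
        bool whole = true;
        for (int i = 0; i < size; i++) whole = whole && m.test(base + i);
        for (int i = 0; i < size; i++) m.set(base + i, whole);
    }
}

int main() {
    std::bitset<64> stack_mask;
    stack_mask.set();                     // pretend every stack slot is free
    for (int s = 0; s < 6; s++) {         // drop the incoming-argument slots
        stack_mask.reset(s);
    }
    clear_to_sets(stack_mask, kSlotsPerVecA);
    std::cout << stack_mask.count() << " slots remain usable for VecA spills\n"; // 56
    return 0;
}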
+ if (Matcher::supports_scalable_vector()) { + MachNode *spillVectA = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTA)); + idealreg2regmask[Op_VecA] = &spillVectA->out_RegMask(); + } if (Matcher::vector_size_supported(T_BYTE,4)) { TypeVect::VECTS = TypeVect::make(T_BYTE, 4); MachNode *spillVectS = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTS)); @@ -1575,7 +1611,6 @@ Node* Matcher::Label_Root(const Node* n, State* svec, Node* control, Node*& mem) } } - // Call DFA to match this node, and return svec->DFA( n->Opcode(), n ); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 244e3d1f8..9a8307102 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -310,7 +310,7 @@ public: // identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen - static const bool match_rule_supported_vector(int opcode, int vlen); + static const bool match_rule_supported_vector(int opcode, int vlen, BasicType bt); // Some microarchitectures have mask registers used on vectors static const bool has_predicated_vectors(void); @@ -333,6 +333,10 @@ public: Matcher::min_vector_size(bt) <= size); } + static const bool supports_scalable_vector(); + // Actual max scalable vector register length. + static const int scalable_vector_reg_size(const BasicType bt); + // Vector ideal reg static const uint vector_ideal_reg(int len); static const uint vector_shift_count_ideal_reg(int len); diff --git a/src/hotspot/share/opto/opcodes.cpp b/src/hotspot/share/opto/opcodes.cpp index e31e8d847..1a826d8ba 100644 --- a/src/hotspot/share/opto/opcodes.cpp +++ b/src/hotspot/share/opto/opcodes.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -38,12 +38,14 @@ const char *NodeClassNames[] = { "RegF", "RegD", "RegL", - "RegFlags", + "VecA", "VecS", "VecD", "VecX", "VecY", "VecZ", + "RegVMask", + "RegFlags", "_last_machine_leaf", #include "classes.hpp" "_last_class_name", diff --git a/src/hotspot/share/opto/opcodes.hpp b/src/hotspot/share/opto/opcodes.hpp index ae3d61ce0..ec96ba055 100644 --- a/src/hotspot/share/opto/opcodes.hpp +++ b/src/hotspot/share/opto/opcodes.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
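The NodeClassNames table above and the opcode macro list in the following hunk are index-coupled: VecA and RegVMask must be inserted at matching positions in both, or every name printed for a machine leaf shifts by one. A tiny self-checking sketch of that invariant (the static_assert is illustrative; HotSpot itself relies on the two lists being edited together):

#include <cassert>

enum Opcodes { Op_RegI, Op_VecA, Op_VecS, Op_RegVMask, Op_RegFlags, Op_last };

static const char* const NodeClassNames[] = {
    "RegI", "VecA", "VecS", "RegVMask", "RegFlags",
};

int main() {
    // Catches an entry added to one list but not the other.
    static_assert(sizeof(NodeClassNames) / sizeof(NodeClassNames[0]) == Op_last,
                  "enum and name table out of sync");
    assert(NodeClassNames[Op_VecA][0] == 'V');
    return 0;
}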
* * This code is free software; you can redistribute it and/or modify it @@ -37,11 +37,13 @@ enum Opcodes { macro(RegF) // Machine float register macro(RegD) // Machine double register macro(RegL) // Machine long register + macro(VecA) // Machine vectora register macro(VecS) // Machine vectors register macro(VecD) // Machine vectord register macro(VecX) // Machine vectorx register macro(VecY) // Machine vectory register macro(VecZ) // Machine vectorz register + macro(RegVMask) // Vector mask/predicate register macro(RegFlags) // Machine flags register _last_machine_leaf, // Split between regular opcodes and machine #include "classes.hpp" diff --git a/src/hotspot/share/opto/postaloc.cpp b/src/hotspot/share/opto/postaloc.cpp index d572ac9fe..3514b37bc 100644 --- a/src/hotspot/share/opto/postaloc.cpp +++ b/src/hotspot/share/opto/postaloc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -266,9 +266,9 @@ int PhaseChaitin::elide_copy( Node *n, int k, Block *current_block, Node_List &v Node *val = skip_copies(n->in(k)); if (val == x) return blk_adjust; // No progress? - int n_regs = RegMask::num_registers(val->ideal_reg()); uint val_idx = _lrg_map.live_range_id(val); OptoReg::Name val_reg = lrgs(val_idx).reg(); + int n_regs = RegMask::num_registers(val->ideal_reg(), lrgs(val_idx)); // See if it happens to already be in the correct register! // (either Phi's direct register, or the common case of the name @@ -305,8 +305,26 @@ int PhaseChaitin::elide_copy( Node *n, int k, Block *current_block, Node_List &v } Node *vv = value[reg]; + // For scalable register, number of registers may be inconsistent between + // "val_reg" and "reg". For example, when "val" resides in register + // but "reg" is located in stack. + if (lrgs(val_idx).is_scalable()) { + assert(val->ideal_reg() == Op_VecA, "scalable vector register"); + if (OptoReg::is_stack(reg)) { + n_regs = lrgs(val_idx).scalable_reg_slots(); + } else { + n_regs = RegMask::SlotsPerVecA; + } + } if (n_regs > 1) { // Doubles and vectors check for aligned-adjacent set - uint last = (n_regs-1); // Looking for the last part of a set + uint last; + if (lrgs(val_idx).is_scalable()) { + assert(val->ideal_reg() == Op_VecA, "scalable vector register"); + // For scalable vector register, regmask is always SlotsPerVecA bits aligned + last = RegMask::SlotsPerVecA - 1; + } else { + last = (n_regs-1); // Looking for the last part of a set + } if ((reg&last) != last) continue; // Wrong part of a set if (!register_contains_value(vv, reg, n_regs, value)) continue; // Different value } @@ -591,7 +609,7 @@ void PhaseChaitin::post_allocate_copy_removal() { uint k; Node *phi = block->get_node(j); uint pidx = _lrg_map.live_range_id(phi); - OptoReg::Name preg = lrgs(_lrg_map.live_range_id(phi)).reg(); + OptoReg::Name preg = lrgs(pidx).reg(); // Remove copies remaining on edges. Check for junk phi. 
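In the elide_copy() change above, a candidate register only helps if it names the highest slot of an aligned group, and for a scalable (VecA) value the group width in the regmask stays SlotsPerVecA even when the spilled footprint is larger. A small model of that test (the helper name and numbers are made up; like HotSpot, the bit trick assumes power-of-two set sizes):

#include <cassert>

constexpr int kSlotsPerVecA = 8;

// True when `reg` is the highest slot of an aligned group. For scalable
// vectors the regmask granularity is fixed at SlotsPerVecA; for everything
// else it is the value's own slot count (pairs, VecX, VecZ, ...).
bool is_high_slot_of_set(int reg, int n_regs, bool scalable) {
    int last = (scalable ? kSlotsPerVecA : n_regs) - 1;
    return (reg & last) == last;
}

int main() {
    assert(is_high_slot_of_set(15, 16, true));    // 15 ends an 8-aligned group
    assert(!is_high_slot_of_set(12, 16, true));   // 12 is in the middle of one
    assert(is_high_slot_of_set(3, 2, false));     // high half of a register pair
    return 0;
}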
Node *u = NULL; @@ -619,7 +637,7 @@ void PhaseChaitin::post_allocate_copy_removal() { if( pidx ) { value.map(preg,phi); regnd.map(preg,phi); - int n_regs = RegMask::num_registers(phi->ideal_reg()); + int n_regs = RegMask::num_registers(phi->ideal_reg(), lrgs(pidx)); for (int l = 1; l < n_regs; l++) { OptoReg::Name preg_lo = OptoReg::add(preg,-l); value.map(preg_lo,phi); @@ -663,7 +681,7 @@ void PhaseChaitin::post_allocate_copy_removal() { regnd.map(ureg, def); // Record other half of doubles uint def_ideal_reg = def->ideal_reg(); - int n_regs = RegMask::num_registers(def_ideal_reg); + int n_regs = RegMask::num_registers(def_ideal_reg, lrgs(_lrg_map.live_range_id(def))); for (int l = 1; l < n_regs; l++) { OptoReg::Name ureg_lo = OptoReg::add(ureg,-l); if (!value[ureg_lo] && @@ -707,7 +725,7 @@ void PhaseChaitin::post_allocate_copy_removal() { } uint n_ideal_reg = n->ideal_reg(); - int n_regs = RegMask::num_registers(n_ideal_reg); + int n_regs = RegMask::num_registers(n_ideal_reg, lrgs(lidx)); if (n_regs == 1) { // If Node 'n' does not change the value mapped by the register, // then 'n' is a useless copy. Do not update the register->node diff --git a/src/hotspot/share/opto/regmask.cpp b/src/hotspot/share/opto/regmask.cpp index 2e04c42eb..dd9b5476b 100644 --- a/src/hotspot/share/opto/regmask.cpp +++ b/src/hotspot/share/opto/regmask.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,6 +24,7 @@ #include "precompiled.hpp" #include "opto/ad.hpp" +#include "opto/chaitin.hpp" #include "opto/compile.hpp" #include "opto/matcher.hpp" #include "opto/node.hpp" @@ -116,30 +117,47 @@ const RegMask RegMask::Empty( //============================================================================= bool RegMask::is_vector(uint ireg) { - return (ireg == Op_VecS || ireg == Op_VecD || + return (ireg == Op_VecA || ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ); } int RegMask::num_registers(uint ireg) { switch(ireg) { case Op_VecZ: - return 16; + return SlotsPerVecZ; case Op_VecY: - return 8; + return SlotsPerVecY; case Op_VecX: - return 4; + return SlotsPerVecX; case Op_VecD: + return SlotsPerVecD; case Op_RegD: case Op_RegL: #ifdef _LP64 case Op_RegP: #endif return 2; + case Op_VecA: + assert(Matcher::supports_scalable_vector(), "does not support scalable vector"); + return SlotsPerVecA; } // Op_VecS and the rest ideal registers. return 1; } +int RegMask::num_registers(uint ireg, LRG &lrg) { + int n_regs = num_registers(ireg); + + // assigned is OptoReg which is selected by register allocator + OptoReg::Name assigned = lrg.reg(); + assert(OptoReg::is_valid(assigned), "should be valid opto register"); + + if (lrg.is_scalable() && OptoReg::is_stack(assigned)) { + n_regs = lrg.scalable_reg_slots(); + } + return n_regs; +} + //------------------------------find_first_pair-------------------------------- // Find the lowest-numbered register pair in the mask. Return the // HIGHEST register number in the pair, or BAD if no pairs. @@ -238,14 +256,30 @@ int RegMask::is_bound_pair() const { return true; } +// Check that whether given reg number with size is valid +// for current regmask, where reg is the highest number. 
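The comment closing this hunk introduces is_valid_reg(), whose body appears in the next hunk: with reg naming the highest slot, every slot in [reg - size + 1, reg] must be a member of the mask. A simplified equivalent over std::bitset:

#include <bitset>
#include <cassert>

// With `reg` naming the highest slot, all `size` slots ending at `reg` must
// be members of the mask for the assignment to be usable.
bool is_valid_reg(const std::bitset<64>& mask, int reg, int size) {
    for (int i = 0; i < size; i++) {
        if (!mask.test(reg - i)) {
            return false;
        }
    }
    return true;
}

int main() {
    std::bitset<64> mask;
    for (int s = 8; s < 16; s++) mask.set(s);   // one aligned 8-slot set
    assert(is_valid_reg(mask, 15, 8));          // the whole set is present
    assert(!is_valid_reg(mask, 17, 4));         // slots 16 and 17 are missing
    return 0;
}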
+bool RegMask::is_valid_reg(OptoReg::Name reg, const int size) const { + for (int i = 0; i < size; i++) { + if (!Member(reg - i)) { + return false; + } + } + return true; +} + // only indicies of power 2 are accessed, so index 3 is only filled in for storage. static int low_bits[5] = { 0x55555555, 0x11111111, 0x01010101, 0x00000000, 0x00010001 }; //------------------------------find_first_set--------------------------------- // Find the lowest-numbered register set in the mask. Return the // HIGHEST register number in the set, or BAD if no sets. // Works also for size 1. -OptoReg::Name RegMask::find_first_set(const int size) const { - verify_sets(size); +OptoReg::Name RegMask::find_first_set(LRG &lrg, const int size) const { + if (lrg.is_scalable()) { + // For scalable vector register, regmask is SlotsPerVecA bits aligned. + assert(is_aligned_sets(SlotsPerVecA), "mask is not aligned, adjacent sets"); + } else { + assert(is_aligned_sets(size), "mask is not aligned, adjacent sets"); + } for (int i = 0; i < RM_SIZE; i++) { if (_A[i]) { // Found some bits int bit = _A[i] & -_A[i]; // Extract low bit @@ -325,12 +359,16 @@ bool RegMask::is_aligned_sets(const int size) const { while (bits) { // Check bits for pairing int bit = bits & -bits; // Extract low bit // Low bit is not odd means its mis-aligned. - if ((bit & low_bits_mask) == 0) return false; + if ((bit & low_bits_mask) == 0) { + return false; + } // Do extra work since (bit << size) may overflow. int hi_bit = bit << (size-1); // high bit int set = hi_bit + ((hi_bit-1) & ~(bit-1)); // Check for aligned adjacent bits in this set - if ((bits & set) != set) return false; + if ((bits & set) != set) { + return false; + } bits -= set; // Remove this set } } diff --git a/src/hotspot/share/opto/regmask.hpp b/src/hotspot/share/opto/regmask.hpp index c64d08795..b733b87ad 100644 --- a/src/hotspot/share/opto/regmask.hpp +++ b/src/hotspot/share/opto/regmask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,6 +28,8 @@ #include "code/vmreg.hpp" #include "opto/optoreg.hpp" +class LRG; + // Some fun naming (textual) substitutions: // // RegMask::get_low_elem() ==> RegMask::find_first_elem() @@ -95,11 +97,13 @@ public: // requirement is internal to the allocator, and independent of any // particular platform. enum { SlotsPerLong = 2, + SlotsPerVecA = 8, SlotsPerVecS = 1, SlotsPerVecD = 2, SlotsPerVecX = 4, SlotsPerVecY = 8, - SlotsPerVecZ = 16 }; + SlotsPerVecZ = 16, + }; // A constructor only used by the ADLC output. All mask fields are filled // in directly. Calls to this look something like RM(1,2,3,4); @@ -204,10 +208,14 @@ public: return false; } + // Check that whether given reg number with size is valid + // for current regmask, where reg is the highest number. + bool is_valid_reg(OptoReg::Name reg, const int size) const; + // Find the lowest-numbered register set in the mask. Return the // HIGHEST register number in the set, or BAD if no sets. // Assert that the mask contains only bit sets. - OptoReg::Name find_first_set(const int size) const; + OptoReg::Name find_first_set(LRG &lrg, const int size) const; // Clear out partial bits; leave only aligned adjacent bit sets of size. 
void clear_to_sets(const int size); @@ -226,6 +234,7 @@ public: static bool is_vector(uint ireg); static int num_registers(uint ireg); + static int num_registers(uint ireg, LRG &lrg); // Fast overlap test. Non-zero if any registers in common. int overlap( const RegMask &rm ) const { diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 92f70b77d..ed67928f5 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -96,8 +96,11 @@ static const bool _do_vector_loop_experimental = false; // Experimental vectoriz //------------------------------transform_loop--------------------------- void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { assert(UseSuperWord, "should be"); - // Do vectors exist on this architecture? - if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return; + // SuperWord only works with power of two vector sizes. + int vector_width = Matcher::vector_width_in_bytes(T_BYTE); + if (vector_width < 2 || !is_power_of_2(vector_width)) { + return; + } assert(lpt->_head->is_CountedLoop(), "must be"); CountedLoopNode *cl = lpt->_head->as_CountedLoop(); diff --git a/src/hotspot/share/opto/type.cpp b/src/hotspot/share/opto/type.cpp index 223b7a1c6..1b46cb452 100644 --- a/src/hotspot/share/opto/type.cpp +++ b/src/hotspot/share/opto/type.cpp @@ -79,6 +79,7 @@ const Type::TypeInfo Type::_type_info[Type::lastype] = { { Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY { Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ #else // all other + { Bad, T_ILLEGAL, "vectora:", false, Op_VecA, relocInfo::none }, // VectorA. { Bad, T_ILLEGAL, "vectors:", false, Op_VecS, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_VecD, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX @@ -655,6 +656,10 @@ void Type::Initialize_shared(Compile* current) { // get_zero_type() should not happen for T_CONFLICT _zero_type[T_CONFLICT]= NULL; + if (Matcher::supports_scalable_vector()) { + TypeVect::VECTA = TypeVect::make(T_BYTE, Matcher::scalable_vector_reg_size(T_BYTE)); + } + // Vector predefined types, it needs initialized _const_basic_type[]. if (Matcher::vector_size_supported(T_BYTE,4)) { TypeVect::VECTS = TypeVect::make(T_BYTE,4); @@ -671,6 +676,8 @@ void Type::Initialize_shared(Compile* current) { if (Matcher::vector_size_supported(T_FLOAT,16)) { TypeVect::VECTZ = TypeVect::make(T_FLOAT,16); } + + mreg2type[Op_VecA] = TypeVect::VECTA; mreg2type[Op_VecS] = TypeVect::VECTS; mreg2type[Op_VecD] = TypeVect::VECTD; mreg2type[Op_VecX] = TypeVect::VECTX; @@ -990,6 +997,7 @@ const Type::TYPES Type::dual_type[Type::lastype] = { Bad, // Tuple - handled in v-call Bad, // Array - handled in v-call + Bad, // VectorA - handled in v-call Bad, // VectorS - handled in v-call Bad, // VectorD - handled in v-call Bad, // VectorX - handled in v-call @@ -1890,7 +1898,6 @@ const TypeTuple *TypeTuple::LONG_PAIR; const TypeTuple *TypeTuple::INT_CC_PAIR; const TypeTuple *TypeTuple::LONG_CC_PAIR; - //------------------------------make------------------------------------------- // Make a TypeTuple from the range of a method signature const TypeTuple *TypeTuple::make_range(ciSignature* sig) { @@ -2262,6 +2269,7 @@ bool TypeAry::ary_must_be_exact() const { //==============================TypeVect======================================= // Convenience common pre-built types. 
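With TypeVect::VECTA available, the type layer asks the matcher for the maximum scalable register size and types any vector that fits as length-agnostic instead of one of the fixed VecS..VecZ kinds. The sketch below mimics that dispatch; the enum and function are illustrative stand-ins rather than HotSpot's Matcher::vector_ideal_reg():

#include <iostream>

enum class IdealVec { VecS, VecD, VecX, VecY, VecZ, VecA };

// Illustrative only: on an SVE-enabled target any vector that fits the
// scalable maximum is treated as length-agnostic (VecA); otherwise fall
// back to the fixed-width kinds keyed by size in bytes.
IdealVec pick_ideal_vec(int size_in_bytes, bool scalable_supported,
                        int scalable_max_bytes) {
    if (scalable_supported && size_in_bytes <= scalable_max_bytes) {
        return IdealVec::VecA;
    }
    switch (size_in_bytes) {
        case 4:  return IdealVec::VecS;
        case 8:  return IdealVec::VecD;
        case 16: return IdealVec::VecX;
        case 32: return IdealVec::VecY;
        default: return IdealVec::VecZ;
    }
}

int main() {
    // A 32-byte (256-bit) vector maps to VecA on a 256-bit SVE machine,
    // but to VecY when only fixed-width registers are available.
    std::cout << (pick_ideal_vec(32, true, 32) == IdealVec::VecA) << "\n";  // 1
    std::cout << (pick_ideal_vec(32, false, 0) == IdealVec::VecY) << "\n";  // 1
    return 0;
}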
+const TypeVect *TypeVect::VECTA = NULL; // vector length agnostic const TypeVect *TypeVect::VECTS = NULL; // 32-bit vectors const TypeVect *TypeVect::VECTD = NULL; // 64-bit vectors const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors @@ -2272,10 +2280,11 @@ const TypeVect *TypeVect::VECTZ = NULL; // 512-bit vectors const TypeVect* TypeVect::make(const Type *elem, uint length) { BasicType elem_bt = elem->array_element_basic_type(); assert(is_java_primitive(elem_bt), "only primitive types in vector"); - assert(length > 1 && is_power_of_2(length), "vector length is power of 2"); assert(Matcher::vector_size_supported(elem_bt, length), "length in range"); int size = length * type2aelembytes(elem_bt); switch (Matcher::vector_ideal_reg(size)) { + case Op_VecA: + return (TypeVect*)(new TypeVectA(elem, length))->hashcons(); case Op_VecS: return (TypeVect*)(new TypeVectS(elem, length))->hashcons(); case Op_RegL: @@ -2307,7 +2316,7 @@ const Type *TypeVect::xmeet( const Type *t ) const { default: // All else is a mistake typerr(t); - + case VectorA: case VectorS: case VectorD: case VectorX: @@ -2362,6 +2371,8 @@ bool TypeVect::empty(void) const { #ifndef PRODUCT void TypeVect::dump2(Dict &d, uint depth, outputStream *st) const { switch (base()) { + case VectorA: + st->print("vectora["); break; case VectorS: st->print("vectors["); break; case VectorD: diff --git a/src/hotspot/share/opto/type.hpp b/src/hotspot/share/opto/type.hpp index a7eec281e..6787b947d 100644 --- a/src/hotspot/share/opto/type.hpp +++ b/src/hotspot/share/opto/type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -53,6 +53,7 @@ class TypeNarrowKlass; class TypeAry; class TypeTuple; class TypeVect; +class TypeVectA; class TypeVectS; class TypeVectD; class TypeVectX; @@ -87,6 +88,7 @@ public: Tuple, // Method signature or object layout Array, // Array types + VectorA, // (Scalable) Vector types for vector length agnostic VectorS, // 32bit Vector types VectorD, // 64bit Vector types VectorX, // 128bit Vector types @@ -758,6 +760,7 @@ public: virtual const Type *xmeet( const Type *t) const; virtual const Type *xdual() const; // Compute dual right now. + static const TypeVect *VECTA; static const TypeVect *VECTS; static const TypeVect *VECTD; static const TypeVect *VECTX; @@ -769,6 +772,11 @@ public: #endif }; +class TypeVectA : public TypeVect { + friend class TypeVect; + TypeVectA(const Type* elem, uint length) : TypeVect(VectorA, elem, length) {} +}; + class TypeVectS : public TypeVect { friend class TypeVect; TypeVectS(const Type* elem, uint length) : TypeVect(VectorS, elem, length) {} @@ -1619,12 +1627,12 @@ inline const TypeAry *Type::is_ary() const { } inline const TypeVect *Type::is_vect() const { - assert( _base >= VectorS && _base <= VectorZ, "Not a Vector" ); + assert( _base >= VectorA && _base <= VectorZ, "Not a Vector" ); return (TypeVect*)this; } inline const TypeVect *Type::isa_vect() const { - return (_base >= VectorS && _base <= VectorZ) ? (TypeVect*)this : NULL; + return (_base >= VectorA && _base <= VectorZ) ? 
(TypeVect*)this : NULL; } inline const TypePtr *Type::is_ptr() const { diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 1f2cf2c64..6867177c1 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -236,7 +236,7 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) { (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = VectorNode::opcode(opc, bt); - return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen); + return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen, bt); } return false; } @@ -653,7 +653,7 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) { (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = ReductionNode::opcode(opc, bt); - return vopc != opc && Matcher::match_rule_supported(vopc); + return vopc != opc && Matcher::match_rule_supported_vector(vopc, vlen, bt); } return false; } diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestSVEWithJNI.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestSVEWithJNI.java new file mode 100644 index 000000000..dc15ca800 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestSVEWithJNI.java @@ -0,0 +1,128 @@ +/* +* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +* Copyright (c) 2020, Arm Limited. All rights reserved. +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +/** + * @test + * + * @requires os.arch == "aarch64" & vm.compiler2.enabled + * @summary Verify VM SVE checking behavior + * @library /test/lib + * @run main/othervm/native compiler.c2.aarch64.TestSVEWithJNI + * + */ + +package compiler.c2.aarch64; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import jdk.test.lib.process.ProcessTools; +import jdk.test.lib.process.OutputAnalyzer; + +public class TestSVEWithJNI { + static { + System.loadLibrary("TestSVEWithJNI"); + } + + static final int EXIT_CODE = 99; + // Returns a nonnegative on success, or a negative value on error. + public static native int setVectorLength(int arg); + // Returns a nonnegative value on success, or a negative value on error. 
+ public static native int getVectorLength(); + + public static final String MSG = "Current Vector Size: "; + public static void testNormal() { + int vlen = getVectorLength(); + System.out.println(MSG + vlen); + // Should be fine if no vector length changed. + if (setVectorLength(vlen) < 0) { + throw new Error("Error in setting vector length."); + } + } + + public static void testAbort() { + int vlen = getVectorLength(); + if (vlen <= 16) { + throw new Error("Error: unsupported vector length."); + } + if (setVectorLength(16) < 0) { + throw new Error("Error: setting vector length failed."); + } + } + + public static ProcessBuilder createProcessBuilder(String [] args, String mode) { + List<String> vmopts = new ArrayList<>(); + String testjdkPath = System.getProperty("test.jdk"); + Collections.addAll(vmopts, "-Dtest.jdk=" + testjdkPath); + Collections.addAll(vmopts, args); + Collections.addAll(vmopts, TestSVEWithJNI.class.getName(), mode); + return ProcessTools.createJavaProcessBuilder(vmopts.toArray(new String[vmopts.size()])); + } + + public static void main(String [] args) throws Exception { + if (args.length == 0) { + int vlen = getVectorLength(); + if (vlen < 0) { + return; + } + String [][] testOpts = { + {"-Xint", "-XX:UseSVE=1"}, + {"-Xcomp", "-XX:UseSVE=1"}, + }; + ProcessBuilder pb; + OutputAnalyzer output; + for (String [] opts : testOpts) { + pb = createProcessBuilder(opts, "normal"); + output = new OutputAnalyzer(pb.start()); + output.shouldHaveExitValue(EXIT_CODE); + + pb = createProcessBuilder(opts, "abort"); + output = new OutputAnalyzer(pb.start()); + output.shouldNotHaveExitValue(EXIT_CODE); + output.shouldMatch("(error|Error|ERROR)"); + } + + // Verify MaxVectorSize + + // Any SVE architecture should support 128-bit vector size. + pb = createProcessBuilder(new String []{"-XX:UseSVE=1", "-XX:MaxVectorSize=16"}, "normal"); + output = new OutputAnalyzer(pb.start()); + output.shouldHaveExitValue(EXIT_CODE); + output.shouldContain(MSG + 16); + + // An unsupported large vector size value. + pb = createProcessBuilder(new String []{"-XX:UseSVE=1", "-XX:MaxVectorSize=512"}, "normal"); + output = new OutputAnalyzer(pb.start()); + output.shouldHaveExitValue(EXIT_CODE); + output.shouldContain("warning"); + } else if (args[0].equals("normal")) { + testNormal(); + System.exit(EXIT_CODE); + } else if (args[0].equals("abort")) { + testAbort(); + System.exit(EXIT_CODE); + } + } +} diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/libTestSVEWithJNI.c b/test/hotspot/jtreg/compiler/c2/aarch64/libTestSVEWithJNI.c new file mode 100644 index 000000000..0cb3ab0b5 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/libTestSVEWithJNI.c @@ -0,0 +1,68 @@ +/* +* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +* Copyright (c) 2020, Arm Limited. All rights reserved. +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code).
+* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +#ifdef __aarch64__ + +#include <jni.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/auxv.h> +#include <sys/prctl.h> + +#ifndef PR_SVE_GET_VL +// For old toolchains which do not have SVE related macros defined. +#define PR_SVE_SET_VL 50 +#define PR_SVE_GET_VL 51 +#endif + +int get_current_thread_vl() { + return prctl(PR_SVE_GET_VL); +} + +int set_current_thread_vl(unsigned long arg) { + return prctl(PR_SVE_SET_VL, arg); +} + +#ifdef __cplusplus +extern "C" { +#endif + +JNIEXPORT jint JNICALL Java_compiler_c2_aarch64_TestSVEWithJNI_setVectorLength +(JNIEnv * env, jclass clz, jint length) { + return set_current_thread_vl(length); +} + +JNIEXPORT jint JNICALL Java_compiler_c2_aarch64_TestSVEWithJNI_getVectorLength +(JNIEnv *env, jclass clz) { + return get_current_thread_vl(); +} + + +#ifdef __cplusplus +} +#endif + +#endif -- 2.19.0
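For reference, the native half of the new test reduces to the Linux prctl(2) interface for the per-thread SVE vector length. A minimal standalone version (assumes an AArch64 Linux kernel with SVE support; the PR_SVE_* fallback values mirror the ones defined in libTestSVEWithJNI.c):

#include <cstdio>
#include <sys/prctl.h>

#ifndef PR_SVE_GET_VL
#define PR_SVE_SET_VL 50    // fallback values, as in libTestSVEWithJNI.c
#define PR_SVE_GET_VL 51
#endif

int main() {
    int vl = prctl(PR_SVE_GET_VL);              // negative if SVE is absent
    std::printf("current SVE vector length: %d\n", vl);
    if (vl > 16 && prctl(PR_SVE_SET_VL, 16L) < 0) {
        // Shrinking to the 128-bit architectural minimum should succeed
        // on any SVE system; report the kernel's reason if it does not.
        std::perror("PR_SVE_SET_VL");
    }
    return 0;
}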