61 lines
2.6 KiB
Diff
61 lines
2.6 KiB
Diff
|
|
Subject: 8256488: Use ldpq/stpq instead of ld4/st4 for small copies in StubGenerator::copy_memory
|
||
|
|
|
||
|
|
--
|
||
|
|
.../cpu/aarch64/vm/stubGenerator_aarch64.cpp | 30 ++++++++++++++++---
|
||
|
|
1 file changed, 26 insertions(+), 4 deletions(-)
|
||
|
|
|
||
|
|
diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
|
||
|
|
index f61028d5007..cf66df296e4 100644
|
||
|
|
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
|
||
|
|
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
|
||
|
|
@@ -1149,10 +1149,10 @@ class StubGenerator: public StubCodeGenerator {
|
||
|
|
Register count, Register tmp, int step) {
|
||
|
|
copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
|
||
|
|
bool is_backwards = step < 0;
|
||
|
|
- int granularity = uabs(step);
|
||
|
|
+ unsigned granularity = uabs(step);
|
||
|
|
const Register t0 = r3, t1 = r4;
|
||
|
|
|
||
|
|
- // <= 96 bytes do inline. Direction doesn't matter because we always
|
||
|
|
+ // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
|
||
|
|
// load all the data before writing anything
|
||
|
|
Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
|
||
|
|
const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
|
||
|
|
@@ -1207,9 +1207,31 @@ class StubGenerator: public StubCodeGenerator {
|
||
|
|
// (96 bytes if SIMD because we do 32 byes per instruction)
|
||
|
|
__ bind(copy80);
|
||
|
|
if (UseSIMDForMemoryOps) {
|
||
|
|
- __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
|
||
|
|
+ __ ldpq(v0, v1, Address(s, 0));
|
||
|
|
+ __ ldpq(v2, v3, Address(s, 32));
|
||
|
|
+ // Unaligned pointers can be an issue for copying.
|
||
|
|
+ // The issue has more chances to happen when granularity of data is
|
||
|
|
+ // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
|
||
|
|
+ // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
|
||
|
|
+ // The largest performance drop has been seen for the range 65-80 bytes.
|
||
|
|
+ // For such cases, using a pair of ldp/stp instead of the third pair of
|
||
|
|
+ // ldpq/stpq fixes the performance issue.
|
||
|
|
+ if (granularity < sizeof (jint)) {
|
||
|
|
+ Label copy96;
|
||
|
|
+ __ cmp(count, u1(80/granularity));
|
||
|
|
+ __ br(Assembler::HI, copy96);
|
||
|
|
+ __ ldp(t0, t1, Address(send, -16));
|
||
|
|
+
|
||
|
|
+ __ stpq(v0, v1, Address(d, 0));
|
||
|
|
+ __ stpq(v2, v3, Address(d, 32));
|
||
|
|
+ __ stp(t0, t1, Address(dend, -16));
|
||
|
|
+ __ b(finish);
|
||
|
|
+
|
||
|
|
+ __ bind(copy96);
|
||
|
|
+ }
|
||
|
|
__ ldpq(v4, v5, Address(send, -32));
|
||
|
|
- __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
|
||
|
|
+ __ stpq(v0, v1, Address(d, 0));
|
||
|
|
+ __ stpq(v2, v3, Address(d, 32));
|
||
|
|
__ stpq(v4, v5, Address(dend, -32));
|
||
|
|
} else {
|
||
|
|
__ ldp(t0, t1, Address(s, 0));
|
||
|
|
--
|
||
|
|
2.19.1
|
||
|
|
|