!3 Add new patch: Accelerate adler32 by SVE instructions

From: @liqiang9102 Reviewed-by: @xiezhipeng1 Signed-off-by: @xiezhipeng1
2021-09-06 08:46:45 +00:00 · 2021-09-06 08:46:45 +00:00 · 5f8562d628
commit 5f8562d628
parent 72f73f6524 f51d1baf10
2 changed files with 180 additions and 1 deletions
--- a/0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch
+++ b/0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch
@ -0,0 +1,175 @@
+From 41ebac8b7d7485a5396ae25ce2412cafcd03f1a2 Mon Sep 17 00:00:00 2001
+From: liqiang <liqiang64@huawei.com>
+Date: Thu, 2 Sep 2021 17:31:48 +0800
+Subject: [PATCH] Accelerate Adler32 using arm64 SVE instructions
+
+	This patch uses the SVE instruction set to rewrite the Adler32
+	algorithm (checksum algorithm in libz). By dividing the data into
+	blocks, a vector operation can complete a data block in parallel.
+
+	Measured on a Taishan 1951 machine that supports 256bit width SVE,
+	this algorithm is about 3~5 times faster than the algorithm implemented
+	in C language in libz. The wider the bit width, the better the
+	acceleration effect. Below are my measurements on 1 MB and 10 MB of
+	random data:
+
+		[root@xxx adler32]# ./benchmark 1000000
+		Libz alg: Time used:    608 us, 1644.7 Mb/s.
+		SVE  alg: Time used:    166 us, 6024.1 Mb/s.
+
+		[root@xxx adler32]# ./benchmark 10000000
+		Libz alg: Time used:   6484 us, 1542.3 Mb/s.
+		SVE  alg: Time used:   2034 us, 4916.4 Mb/s.
+
+	On machines that support ARM64 sve instructions, this algorithm can
+	effectively accelerate adler32, thereby achieving the effect of improving
+	the performance of the basic compression algorithm libz.
+
+	In the implementation of this patch, blocks can be of any size, so the
+	algorithm can automatically adapt to SVE hardware with different bit
+	widths without modifying the code.
+
+Signed-off-by: liqiang <liqiang64@huawei.com>
+---
+ contrib/arm/adler32_sve.S | 129 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 129 insertions(+)
+ create mode 100644 contrib/arm/adler32_sve.S
+
+diff --git a/contrib/arm/adler32_sve.S b/contrib/arm/adler32_sve.S
+new file mode 100644
+index 0000000..97c5930
+--- /dev/null
+++ b/contrib/arm/adler32_sve.S
+@@ -0,0 +1,129 @@
+/******************************************************************************
+ * Copyright (c) Huawei Technologies Co., Ltd. 2018-2020. All rights reserved.
+ * iSulad licensed under the Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *     http://license.coscl.org.cn/MulanPSL2
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+ * PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * Author: liqiang
+ * Create: 2020-07-13
+ * Description: Use SVE instruction to optimize adler32 algorithm.
+ * Enhancement: 2020-10-13 
+                Automatically support different SVE vector length(128~2048). 
+ ******************************************************************************/
+
+.file "adler32_sve.S"
+.text
+.align 4
+
+// Tap weights 256, 255, ..., 1 stored as 16-bit values. adler32_sve loads the last x of them (x = SVE vector length in bytes), so vector lengths of 128~2048 bits are supported.
+.Adler_sequence:
+    .short 256,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241,240,239,238,237,236,235,234,233,232,231,230,229,228,227,226,225,224,223,222,221,220,219,218,217,216,215,214,213,212,211,210,209,208,207,206,205,204,203,202,201,200,199,198,197,196,195,194,193,192,191,190,189,188,187,186,185,184,183,182,181,180,179,178,177,176,175,174,173,172,171,170,169,168,167,166,165,164,163,162,161,160,159,158,157,156,155,154,153,152,151,150,149,148,147,146,145,144,143,142,141,140,139,138,137,136,135,134,133,132,131,130,129,128,127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1
+
+// calc = reg0 % 65521. reg0 must be below 2^32; reg1 and reg2 are scratch.
+.macro mod65521, reg0, reg1, reg2
+    mov w\reg1, #0x8071
+    mov w\reg2, #0xfff1
+    movk w\reg1, #0x8007, lsl #16
+    umull x\reg1, w\reg0, w\reg1
+    lsr x\reg1, x\reg1, #47
+    msub w\reg0, w\reg1, w\reg2, w\reg0
+.endm
+
+// Fold the SVE vector at buf + x9 into A (x10) and B (x11), then advance x9.
+// x is the SVE vector length in bytes (x6).
+// Bn = Bn-1 + An-1 * x + x * D1 + (x-1) * D2 + ... + 1 * Dx
+// An = An-1 + D1 + D2 + D3 + ... + Dx
+.macro ADLER_BLOCK
+    ld1b z0.b, p0/z, [x1, x9]
+
+    uaddv d20, p0, z0.b // D1 + D2 + ... + Dx
+    mov x12, v20.d[0]   // mov sum to x12
+    madd x11, x10, x6, x11 // Bn = An-1 * x + Bn-1
+
+    uunpklo z26.h, z0.b
+    uunpkhi z27.h, z0.b
+    mul z26.h, p1/m, z26.h, z24.h // x * D1 + (x-1) * D2 + ... + (x/2 + 1) * D(x/2)
+    mul z27.h, p1/m, z27.h, z25.h // (x/2) * D(x/2 + 1) + (x/2 - 1) * D(x/2 + 2) + ... + 1 * Dx
+
+    uaddv d21, p1, z26.h
+    uaddv d22, p1, z27.h
+    mov x13, v21.d[0]
+    mov x14, v22.d[0]
+
+    add x11, x13, x11
+    add x11, x14, x11     // Bn += x * D1 + (x-1) * D2 + ... + 1 * Dx
+    add x10, x12, x10     // An += D1 + D2 + ... + Dx
+    incb x9
+.endm
+
+.global adler32_sve
+.type adler32_sve, %function
+// unsigned long adler32_sve(unsigned long adler, const unsigned char *buf,
+//                           unsigned long len)
+// In:  x0 = adler, x1 = buf, x2 = len
+// Out: x0 = (B << 16) | A, both reduced modulo 65521
+// Scratch: x6, x7, x9-x15, z0, z20-z22, z24-z27, p0, p1, flags
+adler32_sve:
+    // w10 : A = adler & 0xffff
+    // w11 : B = (adler >> 16) & 0xffff
+    and w10, w0, #0xffff
+    lsr w11, w0, #16
+    // x9 : byte offset into buf. Both loops index with it, so it has to be
+    // zeroed before the short-input branch below.
+    mov x9, #0
+    // x6 : SVE vector length in bytes, x7 : bytes consumed per .LBig_loop pass.
+    cntb x6
+    lsl x7, x6, #2
+    // One pass always reads four whole vectors, so anything shorter than that
+    // goes to the byte loop. len is unsigned, hence b.lo instead of b.lt.
+    cmp x2, x7
+    b.lo .Lnormal_proc
+
+    // Get the starting position of the required sequence.
+    adr x12, .Adler_sequence
+    ptrue p0.h
+    mov x13, #256
+    sub x13, x13, x6
+    ld1h z24.h, p0/z, [x12, x13, lsl #1] // taps1 to z24.h
+    inch x13
+    ld1h z25.h, p0/z, [x12, x13, lsl #1] // taps2 to z25.h
+    ptrue p0.b
+    ptrue p1.h
+.align 4
+.LBig_loop:
+    ADLER_BLOCK
+    ADLER_BLOCK
+    ADLER_BLOCK
+    ADLER_BLOCK
+
+    mod65521 10, 14, 15
+    mod65521 11, 14, 15
+
+    // Stay in the vector loop while a whole pass of input remains.
+    sub x2, x2, x7
+    cmp x2, x7
+    b.hs .LBig_loop
+
+    // Byte-at-a-time loop for the remaining x2 bytes.
+.Lnormal_proc:
+    cbz x2, .Lret
+
+    ldrb w15, [x1, x9]
+    add x9, x9, #1
+    add x10, x15, x10
+    add x11, x10, x11
+    sub x2, x2, #1
+    b .Lnormal_proc
+
+.Lret:
+    mod65521 10, 14, 15
+    mod65521 11, 14, 15
+    lsl x11, x11, #16
+    orr x0, x10, x11
+    ret
+
+.size adler32_sve, .-adler32_sve
+-- 
+2.17.1
+
--- a/zlib.spec
+++ b/zlib.spec
@ -1,6 +1,6 @@
 Name:             zlib
 Version:          1.2.11
-Release:          18
+Release:          19
 Summary:          A lossless data-compression library
 License:          zlib and Boost
 URL:              http://www.zlib.net
@ -15,6 +15,7 @@ Patch2:           0002-Porting-optimized-longest_match.patch
 Patch3:           0003-arm64-specific-build-patch.patch
 Patch4:           0004-zlib-Optimize-CRC32.patch
 Patch5:           zlib-1.2.11-SIMD.patch
+Patch6:           0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch
 %endif

 Patch6000:        fix-undefined-buffer-detected-by-oss-fuzz.patch
@ -118,6 +119,9 @@ make test
 %{_libdir}/pkgconfig/minizip.pc

 %changelog
+* Thu Sep 2 2021 liqiang <liqiang64@huawei.com> - 1.2.11-19
+- Optimize Adler32 by SVE instructions.
+
 * Mon Sep 14 2020 noah <hedongbo@huawei.com> - 1.2.11-18
 - add zlib-1.2.11-SIMD.patch