diff --git a/0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch b/0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch
new file mode 100644
index 0000000..23cf171
--- /dev/null
+++ b/0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch
@@ -0,0 +1,175 @@
+From 41ebac8b7d7485a5396ae25ce2412cafcd03f1a2 Mon Sep 17 00:00:00 2001
+From: liqiang
+Date: Thu, 2 Sep 2021 17:31:48 +0800
+Subject: [PATCH] Accelerate Adler32 using arm64 SVE instructions
+
+ This patch uses the SVE instruction set to rewrite the Adler32
+ algorithm (the checksum algorithm in libz). By dividing the data into
+ blocks, one vector operation can process a whole block in parallel.
+
+ Measured on a Taishan 1951 machine with 256-bit SVE, this algorithm
+ is about 3~5 times faster than the C implementation in libz. The
+ wider the vector, the larger the speedup. Below are my results for
+ 1M and 10M of random data:
+
+ [root@xxx adler32]# ./benchmark 1000000
+ Libz alg: Time used: 608 us, 1644.7 Mb/s.
+ SVE alg: Time used: 166 us, 6024.1 Mb/s.
+
+ [root@xxx adler32]# ./benchmark 10000000
+ Libz alg: Time used: 6484 us, 1542.3 Mb/s.
+ SVE alg: Time used: 2034 us, 4916.4 Mb/s.
+
+ On machines that support the ARM64 SVE instructions, this algorithm
+ effectively accelerates adler32 and thereby improves the performance
+ of the underlying libz compression library.
+
+ In this implementation the block size is not fixed, so the algorithm
+ automatically adapts to SVE hardware of different vector widths
+ without any code change.
+
+Signed-off-by: liqiang
+---
+ contrib/arm/adler32_sve.S | 129 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 129 insertions(+)
+ create mode 100644 contrib/arm/adler32_sve.S
+
+diff --git a/contrib/arm/adler32_sve.S b/contrib/arm/adler32_sve.S
+new file mode 100644
+index 0000000..97c5930
+--- /dev/null
++++ b/contrib/arm/adler32_sve.S
+@@ -0,0 +1,129 @@
++/******************************************************************************
++ * Copyright (c) Huawei Technologies Co., Ltd. 2018-2020. All rights reserved.
++ * iSulad licensed under the Mulan PSL v2.
++ * You can use this software according to the terms and conditions of the Mulan PSL v2.
++ * You may obtain a copy of Mulan PSL v2 at:
++ * http://license.coscl.org.cn/MulanPSL2
++ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
++ * PURPOSE.
++ * See the Mulan PSL v2 for more details.
++ * Author: liqiang
++ * Create: 2020-07-13
++ * Description: Use SVE instructions to optimize the adler32 algorithm.
++ * Enhancement: 2020-10-13
++ *              Automatically support different SVE vector lengths (128~2048).
++ ******************************************************************************/
++
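++// Algorithm notes for the code below:
++//   Adler32 keeps two running sums modulo 65521:
++//     A = 1 + D1 + D2 + ... + Dn
++//     B = A1 + A2 + ... + An        (the value of A after each byte)
++//   and the checksum is (B << 16) | A.
++//   For a block of x bytes both sums can be updated in a single step:
++//     A' = A + D1 + D2 + ... + Dx
++//     B' = B + x*A + x*D1 + (x-1)*D2 + ... + 1*Dx
++//   For example, with x = 4: A' = A + D1+D2+D3+D4 and
++//   B' = B + 4*A + 4*D1 + 3*D2 + 2*D3 + 1*D4.
++//   The descending weights x, x-1, ..., 1 are loaded from the
++//   .Adler_sequence table below, so the same code works for any
++//   SVE vector length.
++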
++.file "adler32_sve.S"
++.text
++.align 4
++
++// This Adler_sequence table supports SVE vector lengths from 128 to 2048 bits.
++.Adler_sequence:
++        .short 256,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241,240,239,238,237,236,235,234,233,232,231,230,229,228,227,226,225,224,223,222,221,220,219,218,217,216,215,214,213,212,211,210,209,208,207,206,205,204,203,202,201,200,199,198,197,196,195,194,193,192,191,190,189,188,187,186,185,184,183,182,181,180,179,178,177,176,175,174,173,172,171,170,169,168,167,166,165,164,163,162,161,160,159,158,157,156,155,154,153,152,151,150,149,148,147,146,145,144,143,142,141,140,139,138,137,136,135,134,133,132,131,130,129,128,127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1
++
++.global adler32_sve
++.type adler32_sve, %function
++adler32_sve:
++        // x0: unsigned long adler
++        // x1: const unsigned char *buf
++        // x2: unsigned long len
++
++        // w10 : A = adler & 0xffff
++        // w11 : B = (adler >> 16) & 0xffff
++        // for the first byte, A = 1 and B = 0
++        and w10, w0, #0xffff
++        lsr w11, w0, #16
++        // x9 is the byte offset into buf; Lnormal_proc also relies on it,
++        // so initialise it before the early branch below.
++        mov x9, #0
++        // 63 bytes or less: jump to the byte-by-byte path
++        cmp x2, #0x3f
++        b.le Lnormal_proc
++
++        // Get the SVE vector length in bytes into x6.
++        mov x6, #0
++        addvl x6, x6, #1
++        adr x12, .Adler_sequence
++        ptrue p0.h
++
++        // Get the starting position of the weight sequence for this vector length.
++        mov x9, #256
++        sub x9, x9, x6
++        ld1h z24.h, p0/z, [x12, x9, lsl #1] // taps1 to z24.h
++        inch x9
++        ld1h z25.h, p0/z, [x12, x9, lsl #1] // taps2 to z25.h
++        // len is known to be larger than 64 bytes here.
++        ptrue p0.b
++        ptrue p1.h
++        mov x9, #0
++.align 4
++LBig_loop:
++        // x is the SVE vector length in bytes.
++        // Bn = Bn-1 + An-1 * x + x * D1 + (x-1) * D2 + ... + 1 * Dx
++        // An = An-1 + D1 + D2 + D3 + ... + Dx
++
++        .macro ADLER_BLOCK_32
++        ld1b z0.b, p0/z, [x1, x9]
++
++        uaddv d20, p0, z0.b                  // D1 + D2 + ... + Dx
++        mov x12, v20.2d[0]                   // move the byte sum to x12
++        madd x11, x10, x6, x11               // Bn = An-1 * x + Bn-1
++
++        uunpklo z26.h, z0.b
++        uunpkhi z27.h, z0.b
++        mul z26.h, p1/m, z26.h, z24.h        // x * D1 + (x-1) * D2 + ... + (x/2 + 1) * D(x/2)
++        mul z27.h, p1/m, z27.h, z25.h        // (x/2) * D(x/2 + 1) + (x/2 - 1) * D(x/2 + 2) + ... + 1 * Dx
++
++        uaddv d21, p1, z26.h
++        uaddv d22, p1, z27.h
++        mov x13, v21.2d[0]
++        mov x14, v22.2d[0]
++
++        add x11, x13, x11
++        add x11, x14, x11                    // Bn += x * D1 + (x-1) * D2 + ... + 1 * Dx
++        add x10, x12, x10                    // An += D1 + D2 + ... + Dx
++        incb x9
++        .endm
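++        // Each LBig_loop iteration processes 4 blocks of one vector length each
++        // (x15 = 4) and then reduces A and B modulo 65521, so both sums stay
++        // within 32 bits at the point where the umull-based mod65521 reduction
++        // below is applied.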
++        mov x15, #4
++        ADLER_BLOCK_32
++        ADLER_BLOCK_32
++        ADLER_BLOCK_32
++        ADLER_BLOCK_32
++
++        // calc = reg0 % 65521 (reg1 and reg2 are used as scratch registers)
++        .macro mod65521, reg0, reg1, reg2
++        mov w\reg1, #0x8071
++        mov w\reg2, #0xfff1                  // 65521
++        movk w\reg1, #0x8007, lsl #16        // w\reg1 = 0x80078071, about 2^47 / 65521
++        umull x\reg1, w\reg0, w\reg1
++        lsr x\reg1, x\reg1, #47              // x\reg1 = reg0 / 65521
++        msub w\reg0, w\reg1, w\reg2, w\reg0  // reg0 -= (reg0 / 65521) * 65521
++        .endm
++
++        mod65521 10, 14, 16
++        mod65521 11, 14, 16
++
++Lloop_cond:
++        mul x12, x6, x15                     // bytes consumed per iteration: 4 * vector length
++        sub x2, x2, x12
++        cmp x2, x12
++        b.ge LBig_loop
++
++Lnormal_proc:
++        cmp x2, #0
++        b.eq Lret
++
++        ldrb w15, [x1, x9]
++        add x9, x9, #1
++        add x10, x15, x10
++        add x11, x10, x11
++        sub x2, x2, #1
++        b Lnormal_proc
++
++Lret:
++        mod65521 10, 14, 5
++        mod65521 11, 14, 5
++        lsl x11, x11, #16
++        orr x0, x10, x11
++        ret
++
++.size adler32_sve, .-adler32_sve
+--
+2.17.1
+
diff --git a/zlib.spec b/zlib.spec
index e6ecf27..39160cc 100644
--- a/zlib.spec
+++ b/zlib.spec
@@ -1,6 +1,6 @@
 Name: zlib
 Version: 1.2.11
-Release: 18
+Release: 19
 Summary: A lossless data-compression library
 License: zlib and Boost
 URL: http://www.zlib.net
@@ -15,6 +15,7 @@
 Patch2: 0002-Porting-optimized-longest_match.patch
 Patch3: 0003-arm64-specific-build-patch.patch
 Patch4: 0004-zlib-Optimize-CRC32.patch
 Patch5: zlib-1.2.11-SIMD.patch
+Patch6: 0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch
 %endif
 Patch6000: fix-undefined-buffer-detected-by-oss-fuzz.patch
@@ -118,6 +119,9 @@ make test
 %{_libdir}/pkgconfig/minizip.pc
 
 %changelog
+* Thu Sep 2 2021 liqiang - 1.2.11-19
+- Optimize Adler32 with SVE instructions.
+
 * Mon Sep 14 2020 noah - 1.2.11-18
 - add zlib-1.2.11-SIMD.patch