diff --git a/0001-Neon-Optimized-hash-chain-rebase.patch b/0001-Neon-Optimized-hash-chain-rebase.patch deleted file mode 100644 index e6f909f..0000000 --- a/0001-Neon-Optimized-hash-chain-rebase.patch +++ /dev/null @@ -1,170 +0,0 @@ -From f0fd8c553fa024c599f4aff65d7c603ceeaa6a58 Mon Sep 17 00:00:00 2001 -From: Adenilson Cavalcanti -Date: Mon, 9 Apr 2018 13:52:17 -0700 -Subject: [PATCH 1/3] Neon-Optimized hash chain rebase - -This should help with compression of data, using NEON instructions -(therefore useful for ARMv7/ARMv8). - -Original patch by Jun He. ---- - CMakeLists.txt | 18 ++++++++ - contrib/arm/neon_slide_hash.h | 84 +++++++++++++++++++++++++++++++++++ - deflate.c | 7 +++ - 3 files changed, 109 insertions(+) - create mode 100644 contrib/arm/neon_slide_hash.h - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 0fe939d..e9a74e9 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -136,6 +136,24 @@ if(CMAKE_COMPILER_IS_GNUCC) - set(ZLIB_ASMS contrib/amd64/amd64-match.S) - endif () - -+ if(ARM_NEON) -+ list(REMOVE_ITEM ZLIB_SRCS inflate.c) -+ set(ZLIB_ARM_NEON_HDRS -+ contrib/arm/chunkcopy.h -+ contrib/arm/inffast_chunk.h -+ contrib/arm/neon_slide_hash.h) -+ set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) -+ add_definitions(-DARM_NEON) -+ set(COMPILER ${CMAKE_C_COMPILER}) -+ # NEON is mandatory in ARMv8. -+ if(${COMPILER} MATCHES "aarch64") -+ set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a) -+ # But it was optional for ARMv7. -+ elseif(${COMPILER} MATCHES "arm") -+ set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon) -+ endif() -+ endif() -+ - if(ZLIB_ASMS) - add_definitions(-DASMV) - set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) -diff --git a/contrib/arm/neon_slide_hash.h b/contrib/arm/neon_slide_hash.h -new file mode 100644 -index 0000000..0daffa1 ---- /dev/null -+++ b/contrib/arm/neon_slide_hash.h -@@ -0,0 +1,84 @@ -+/* Copyright (C) 1995-2011, 2016 Mark Adler -+ * Copyright (C) 2017 ARM Holdings Inc. -+ * Authors: Adenilson Cavalcanti -+ * Jun He -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+#ifndef __NEON_SLIDE_HASH__ -+#define __NEON_SLIDE_HASH__ -+ -+#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) -+#include "deflate.h" -+#include -+ -+inline static void neon_slide_hash(deflate_state *s) -+{ -+ /* -+ * This is ASIMD implementation for hash table rebase -+ * it assumes: -+ * 1. hash chain offset (Pos) is 2 bytes -+ * 2. 
hash table size is multiple*128 bytes -+ * #1 should be true as Pos is defined as "ush" -+ * #2 should be true as hash_bits are greater that 7 -+ */ -+ unsigned n, m; -+ unsigned short wsize = s->w_size; -+ uint16x8_t v, *p; -+ size_t size; -+ -+ size = s->hash_size*sizeof(s->head[0]); -+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); -+ -+ Assert(sizeof(Pos) == 2, "Wrong Pos size"); -+ -+ /* slide s->head */ -+ v = vdupq_n_u16(wsize); -+ p = (uint16x8_t *)(s->head); -+ n = size / (sizeof(uint16x8_t) * 8); -+ do { -+ p[0] = vqsubq_u16(p[0], v); -+ p[1] = vqsubq_u16(p[1], v); -+ p[2] = vqsubq_u16(p[2], v); -+ p[3] = vqsubq_u16(p[3], v); -+ p[4] = vqsubq_u16(p[4], v); -+ p[5] = vqsubq_u16(p[5], v); -+ p[6] = vqsubq_u16(p[6], v); -+ p[7] = vqsubq_u16(p[7], v); -+ p += 8; -+ } while (--n); -+#ifndef FASTEST -+ /* slide s->prev */ -+ size = wsize*sizeof(s->prev[0]); -+ -+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); -+ -+ p = (uint16x8_t *)(s->prev); -+ n = size / (sizeof(uint16x8_t) * 8); -+ do { -+ p[0] = vqsubq_u16(p[0], v); -+ p[1] = vqsubq_u16(p[1], v); -+ p[2] = vqsubq_u16(p[2], v); -+ p[3] = vqsubq_u16(p[3], v); -+ p[4] = vqsubq_u16(p[4], v); -+ p[5] = vqsubq_u16(p[5], v); -+ p[6] = vqsubq_u16(p[6], v); -+ p[7] = vqsubq_u16(p[7], v); -+ p += 8; -+ } while (--n); -+#endif -+} -+ -+#endif -+#endif -diff --git a/deflate.c b/deflate.c -index 1ec7614..36f99ac 100644 ---- a/deflate.c -+++ b/deflate.c -@@ -50,6 +50,9 @@ - /* @(#) $Id$ */ - - #include "deflate.h" -+#if __ARM_NEON -+#include "contrib/arm/neon_slide_hash.h" -+#endif - - const char deflate_copyright[] = - " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler "; -@@ -201,6 +204,9 @@ local const config configuration_table[10] = { - local void slide_hash(s) - deflate_state *s; - { -+#if ARM_NEON -+ return neon_slide_hash(s); -+#else - unsigned n, m; - Posf *p; - uInt wsize = s->w_size; -@@ -222,6 +228,7 @@ local void slide_hash(s) - */ - } while (--n); - #endif -+#endif - } - - /* ========================================================================= */ --- -2.19.0 - diff --git a/0002-Porting-optimized-longest_match.patch b/0002-Porting-optimized-longest_match.patch deleted file mode 100644 index 7fda12e..0000000 --- a/0002-Porting-optimized-longest_match.patch +++ /dev/null @@ -1,218 +0,0 @@ -From 17a154db6774a4acf347cfc5189eaf2cd675e696 Mon Sep 17 00:00:00 2001 -From: Adenilson Cavalcanti -Date: Mon, 9 Apr 2018 15:14:19 -0700 -Subject: [PATCH 2/3] Porting optimized longest_match - -This patch was contributed to zlib-ng and features an improved longest_match -function using the most distant hash code to reduce number of checks -(see: http://www.gildor.org/en/projects/zlib). - -Original patch by Jun He. 
---- - CMakeLists.txt | 3 +- - contrib/arm/arm_longest_match.h | 142 ++++++++++++++++++++++++++++++++ - deflate.c | 11 ++- - 3 files changed, 152 insertions(+), 4 deletions(-) - create mode 100644 contrib/arm/arm_longest_match.h - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index e9a74e9..3826eba 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -141,7 +141,8 @@ if(CMAKE_COMPILER_IS_GNUCC) - set(ZLIB_ARM_NEON_HDRS - contrib/arm/chunkcopy.h - contrib/arm/inffast_chunk.h -- contrib/arm/neon_slide_hash.h) -+ contrib/arm/neon_slide_hash.h -+ contrib/arm/arm_longest_match.h) - set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) - add_definitions(-DARM_NEON) - set(COMPILER ${CMAKE_C_COMPILER}) -diff --git a/contrib/arm/arm_longest_match.h b/contrib/arm/arm_longest_match.h -new file mode 100644 -index 0000000..9e7083f ---- /dev/null -+++ b/contrib/arm/arm_longest_match.h -@@ -0,0 +1,142 @@ -+/* Copyright (C) 1995-2011, 2016 Mark Adler -+ * Copyright (C) 2017 ARM Holdings Inc. -+ * Authors: Adenilson Cavalcanti -+ * Jun He -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+#ifndef __ARM_LONGEST__MATCH__ -+#define __ARM_LONGEST__MATCH__ -+ -+#if defined(ARM_NEON) -+#include "deflate.h" -+#include -+static inline long get_match_len(const unsigned char *a, const unsigned char *b, long max) -+{ -+ register int len = 0; -+ register unsigned long xor = 0; -+ register int check_loops = max/sizeof(unsigned long); -+ while(check_loops-- > 0) { -+ xor = (*(unsigned long *)(a+len)) ^ (*(unsigned long *)(b+len)); -+ if (xor) break; -+ len += sizeof(unsigned long); -+ } -+ if (0 == xor) { -+ while (len < max) { -+ if (a[len] != b[len]) break; -+ len++; -+ } -+ return len; -+ } -+ xor = __builtin_ctzl(xor)>>3; -+ return len + xor; -+} -+ -+/* -+ * This implementation is based on algorithm described at: -+ * http://www.gildor.org/en/projects/zlib -+ * It uses the hash chain indexed by the most distant hash code to -+ * reduce number of checks. -+ * This also eliminates the those unnecessary check loops in legacy -+ * longest_match's do..while loop if the "most distant code" is out -+ * of search buffer -+ * -+ */ -+static inline unsigned arm_longest_match(deflate_state *const s, IPos cur_match) { -+ unsigned chain_length = s->max_chain_length;/* max hash chain length */ -+ unsigned char *scan = s->window + s->strstart; /* current string */ -+ unsigned char *match; /* matched string */ -+ unsigned int len; /* length of current match */ -+ unsigned int best_len = s->prev_length; /* best match length so far */ -+ unsigned int nice_match = s->nice_match; /* stop if match long enough */ -+ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 
-+ s->strstart - (IPos)MAX_DIST(s) : 0; -+ /* Stop when cur_match becomes <= limit. To simplify the code, -+ * we prevent matches with the string of window index 0. -+ */ -+ int offset = 0; /* offset of the head[most_distant_hash] from IN cur_match */ -+ Pos *prev = s->prev; -+ unsigned int wmask = s->w_mask; -+ unsigned char *scan_buf_base = s->window; -+ -+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. -+ * It is easy to get rid of this optimization if necessary. -+ */ -+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); -+ -+ /* Do not look for matches beyond the end of the input. This is necessary -+ * to make deflate deterministic. -+ */ -+ if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead; -+ -+ Assert((unsigned long)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); -+ -+ /* find most distant hash code for lazy_match */ -+ if (best_len > MIN_MATCH) { -+ /* search for most distant hash code */ -+ int i; -+ uint16_t hash = 0; -+ IPos pos; -+ -+ UPDATE_HASH(s, hash, scan[1]); -+ UPDATE_HASH(s, hash, scan[2]); -+ for (i = 3; i <= best_len; i++) { -+ UPDATE_HASH(s, hash, scan[i]); -+ /* get head IPos of hash calced by scan[i-2..i] */ -+ pos = s->head[hash]; -+ /* compare it to current "farthest hash" IPos */ -+ if (pos <= cur_match) { -+ /* we have a new "farthest hash" now */ -+ offset = i - 2; -+ cur_match = pos; -+ } -+ } -+ -+ /* update variables to correspond offset */ -+ limit += offset; -+ /* -+ * check if the most distant code's offset is out of search buffer -+ * if it is true, then this means scan[offset..offset+2] are not -+ * presented in the search buffer. So we just return best_len -+ * we've found. -+ */ -+ if (cur_match < limit) return best_len; -+ -+ scan_buf_base -= offset; -+ /* reduce hash search depth based on best_len */ -+ chain_length /= best_len - MIN_MATCH; -+ } -+ -+ do { -+ Assert(cur_match < s->strstart, "no future"); -+ -+ /* Determine matched length at current pos */ -+ match = scan_buf_base + cur_match; -+ len = get_match_len(match, scan, MAX_MATCH); -+ -+ if (len > best_len) { -+ /* found longer string */ -+ s->match_start = cur_match - offset; -+ best_len = len; -+ /* good enough? */ -+ if (len >= nice_match) break; -+ } -+ /* move to prev pos in this hash chain */ -+ } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0); -+ -+ return (best_len <= s->lookahead)? best_len : s->lookahead; -+} -+ -+#endif -+#endif -diff --git a/deflate.c b/deflate.c -index 36f99ac..4c42259 100644 ---- a/deflate.c -+++ b/deflate.c -@@ -50,9 +50,6 @@ - /* @(#) $Id$ */ - - #include "deflate.h" --#if __ARM_NEON --#include "contrib/arm/neon_slide_hash.h" --#endif - - const char deflate_copyright[] = - " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler "; -@@ -196,6 +193,11 @@ local const config configuration_table[10] = { - s->head[s->hash_size-1] = NIL; \ - zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head)); - -+#if defined(ARM_NEON) -+#include "contrib/arm/arm_longest_match.h" -+#include "contrib/arm/neon_slide_hash.h" -+#endif -+ - /* =========================================================================== - * Slide the hash table when sliding the window down (could be avoided with 32 - * bit values at the expense of memory usage). 
We slide even when level == 0 to -@@ -1244,6 +1246,9 @@ local uInt longest_match(s, cur_match) - deflate_state *s; - IPos cur_match; /* current match */ - { -+#if defined(ARM_NEON) -+ return arm_longest_match(s, cur_match); -+#endif - unsigned chain_length = s->max_chain_length;/* max hash chain length */ - register Bytef *scan = s->window + s->strstart; /* current string */ - register Bytef *match; /* matched string */ --- -2.19.0 - diff --git a/0003-arm64-specific-build-patch.patch b/0003-arm64-specific-build-patch.patch deleted file mode 100644 index b8b6b55..0000000 --- a/0003-arm64-specific-build-patch.patch +++ /dev/null @@ -1,115 +0,0 @@ -From e0be75f8dce27a4e32196529df2a08dca791a286 Mon Sep 17 00:00:00 2001 -From: Jeremy Linton -Date: Fri, 6 Apr 2018 11:46:42 -0500 -Subject: [PATCH 3/3] arm64 specific build patch - ---- - Makefile.in | 19 ++++++++++++------- - configure | 2 +- - contrib/minizip/zip.c | 6 ++++-- - 3 files changed, 17 insertions(+), 10 deletions(-) - -diff --git a/Makefile.in b/Makefile.in -index 5a77949..9f088e5 100644 ---- a/Makefile.in -+++ b/Makefile.in -@@ -57,7 +57,7 @@ SRCDIR= - ZINC= - ZINCOUT=-I. - --OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o -+OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inffast.o inflate.o inftrees.o trees.o zutil.o - OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o - OBJC = $(OBJZ) $(OBJG) - -@@ -163,16 +163,16 @@ crc32.o: $(SRCDIR)crc32.c - $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c - - deflate.o: $(SRCDIR)deflate.c -- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c -+ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)deflate.c - - infback.o: $(SRCDIR)infback.c - $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c - - inffast.o: $(SRCDIR)inffast.c -- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inffast.c -+ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)inffast.c - - inflate.o: $(SRCDIR)inflate.c -- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inflate.c -+ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)inflate.c - - inftrees.o: $(SRCDIR)inftrees.c - $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inftrees.c -@@ -214,7 +214,7 @@ crc32.lo: $(SRCDIR)crc32.c - - deflate.lo: $(SRCDIR)deflate.c - -@mkdir objs 2>/dev/null || test -d objs -- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c -+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c - -@mv objs/deflate.o $@ - - infback.lo: $(SRCDIR)infback.c -@@ -222,14 +222,19 @@ infback.lo: $(SRCDIR)infback.c - $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c - -@mv objs/infback.o $@ - -+arminffast.lo: $(SRCDIR)contrib/arm/inffast_chunk.c $(SRCDIR)inffast.c -+ -@mkdir objs 2>/dev/null || test -d objs -+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/arminffast.o $(SRCDIR)contrib/arm/inffast_chunk.c -+ -@mv objs/arminffast.o $@ -+ - inffast.lo: $(SRCDIR)inffast.c - -@mkdir objs 2>/dev/null || test -d objs -- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inffast.o $(SRCDIR)inffast.c -+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inffast.o $(SRCDIR)inffast.c - -@mv objs/inffast.o $@ - - inflate.lo: $(SRCDIR)inflate.c - -@mkdir objs 2>/dev/null || test -d objs -- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inflate.o $(SRCDIR)inflate.c -+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm 
-DPIC -c -o objs/inflate.o $(SRCDIR)inflate.c - -@mv objs/inflate.o $@ - - inftrees.lo: $(SRCDIR)inftrees.c -diff --git a/configure b/configure -index e974d1f..0c5f837 100755 ---- a/configure -+++ b/configure -@@ -23,7 +23,7 @@ SRCDIR=`dirname $0` - if test $SRCDIR = "."; then - ZINC="" - ZINCOUT="-I." -- SRCDIR="" -+ SRCDIR="./" - else - ZINC='-include zconf.h' - ZINCOUT='-I. -I$(SRCDIR)' -diff --git a/contrib/minizip/zip.c b/contrib/minizip/zip.c -index 44e88a9..0517930 100644 ---- a/contrib/minizip/zip.c -+++ b/contrib/minizip/zip.c -@@ -519,15 +519,17 @@ local ZPOS64_T zip64local_SearchCentralDir(const zlib_filefunc64_32_def* pzlib_f - break; - - for (i=(int)uReadSize-3; (i--)>0;) -+ { - if (((*(buf+i))==0x50) && ((*(buf+i+1))==0x4b) && - ((*(buf+i+2))==0x05) && ((*(buf+i+3))==0x06)) - { - uPosFound = uReadPos+i; - break; - } -+ } - -- if (uPosFound!=0) -- break; -+ if (uPosFound!=0) -+ break; - } - TRYFREE(buf); - return uPosFound; --- -2.19.0 - diff --git a/0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch b/0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch deleted file mode 100644 index 23cf171..0000000 --- a/0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch +++ /dev/null @@ -1,175 +0,0 @@ -From 41ebac8b7d7485a5396ae25ce2412cafcd03f1a2 Mon Sep 17 00:00:00 2001 -From: liqiang -Date: Thu, 2 Sep 2021 17:31:48 +0800 -Subject: [PATCH] Accelerate Adler32 using arm64 SVE instructions - - This patch uses the SVE instruction set to rewrite the Adler32 - algorithm (checksum algorithm in libz). By dividing the data into - blocks, a vector operation can complete a data block in parallel. - - Measured on a Taishan 1951 machine that supports 256bit width SVE, - this algorithm is about 3~5 times faster than the algorithm implemented - in C language in libz. The wider the bit width, the better the - acceleration effect. Below are the results of my measured random - data of 1M and 10M: - - [root@xxx adler32]# ./benchmark 1000000 - Libz alg: Time used: 608 us, 1644.7 Mb/s. - SVE alg: Time used: 166 us, 6024.1 Mb/s. - - [root@xxx adler32]# ./benchmark 10000000 - Libz alg: Time used: 6484 us, 1542.3 Mb/s. - SVE alg: Time used: 2034 us, 4916.4 Mb/s. - - On machines that support ARM64 sve instructions, this algorithm can - effectively accelerate adler32, thereby achieving the effect of improving - the performance of the basic compression algorithm libz. - - In the implementation of this patch, blocks can be of any size, so the - algorithm can automatically adapt to SVE hardware with different bit - widths without modifying the code. - -Signed-off-by: liqiang ---- - contrib/arm/adler32_sve.S | 129 ++++++++++++++++++++++++++++++++++++++ - 1 file changed, 129 insertions(+) - create mode 100644 contrib/arm/adler32_sve.S - -diff --git a/contrib/arm/adler32_sve.S b/contrib/arm/adler32_sve.S -new file mode 100644 -index 0000000..97c5930 ---- /dev/null -+++ b/contrib/arm/adler32_sve.S -@@ -0,0 +1,129 @@ -+/****************************************************************************** -+ * Copyright (c) Huawei Technologies Co., Ltd. 2018-2020. All rights reserved. -+ * iSulad licensed under the Mulan PSL v2. -+ * You can use this software according to the terms and conditions of the Mulan PSL v2. 
-+ * You may obtain a copy of Mulan PSL v2 at: -+ * http://license.coscl.org.cn/MulanPSL2 -+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+ * PURPOSE. -+ * See the Mulan PSL v2 for more details. -+ * Author: liqiang -+ * Create: 2020-07-13 -+ * Description: Use SVE instruction to optimize adler32 algorithm. -+ * Enhancement: 2020-10-13 -+ Automatically support different SVE vector length(128~2048). -+ ******************************************************************************/ -+ -+.file "adler32_sve.S" -+.text -+.align 4 -+ -+//The supported sve vector length range is 128~2048 by this Adler_sequence -+.Adler_sequence: -+ .short 256,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241,240,239,238,237,236,235,234,233,232,231,230,229,228,227,226,225,224,223,222,221,220,219,218,217,216,215,214,213,212,211,210,209,208,207,206,205,204,203,202,201,200,199,198,197,196,195,194,193,192,191,190,189,188,187,186,185,184,183,182,181,180,179,178,177,176,175,174,173,172,171,170,169,168,167,166,165,164,163,162,161,160,159,158,157,156,155,154,153,152,151,150,149,148,147,146,145,144,143,142,141,140,139,138,137,136,135,134,133,132,131,130,129,128,127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1 -+ -+.global adler32_sve -+.type adler32_sve, %function -+adler32_sve: -+ // x0: unsigned long adler -+ // x1: const unsigned char *buf -+ // x2: unsigned long len -+ -+ // w10 : A = adler & 0xffff -+ // w11 : B = (adler >> 16) & 0xffff -+ // first byte A = 1, B = 0 -+ and w10, w0, #0xffff -+ lsr w11, w0, #16 -+ // less than and equal 63byte, jumper to normal proc -+ cmp x2, #0x3f -+ b.le Lnormal_proc -+ -+ // Get the length of the sve vector to x6. -+ mov x6, #0 -+ addvl x6, x6, #1 -+ adr x12, .Adler_sequence -+ ptrue p0.h -+ -+ // Get the starting position of the required sequence. -+ mov x9, #256 -+ sub x9, x9, x6 -+ ld1h z24.h, p0/z, [x12, x9, lsl #1] // taps1 to z24.h -+ inch x9 -+ ld1h z25.h, p0/z, [x12, x9, lsl #1] // taps2 to z25.h -+ // must bigger than 64byte -+ ptrue p0.b -+ ptrue p1.h -+ mov x9, #0 -+.align 4 -+LBig_loop: -+ // x is SVE vector length. -+ // Bn = Bn-1 + An-1 * x + x * D1 + (x-1) * D2 + ... + 1 * Dx -+ // An = An-1 + D1 + D2 + D3 + ... + Dx -+ -+ .macro ADLER_BLOCK_32 -+ ld1b z0.b, p0/z, [x1, x9] -+ -+ uaddv d20, p0, z0.b // D1 + D2 + ... + D32 -+ mov x12, v20.2d[0] // mov sum to w12 -+ madd x11, x10, x6, x11 // Bn = An-1 * 32 + Bn-1 -+ -+ uunpklo z26.h, z0.b -+ uunpkhi z27.h, z0.b -+ mul z26.h, p1/m, z26.h, z24.h // x * D1 + (x-1) * D2 + ... + (x/2 + 1) * D(x/2) -+ mul z27.h, p1/m, z27.h, z25.h // (x/2) * D(x/2 + 1) + (x/2 - 1) * D(x/2 + 2) + ... + 1 * Dx -+ -+ uaddv d21, p1, z26.h -+ uaddv d22, p1, z27.h -+ mov x13, v21.2d[0] -+ mov x14, v22.2d[0] -+ -+ add x11, x13, x11 -+ add x11, x14, x11 // Bn += x * D1 + (x-1) * D2 + ... + 1 * Dx -+ add x10, x12, x10 // An += D1 + D2 + ... 
+ Dx -+ incb x9 -+ .endm -+ mov x15, #4 -+ ADLER_BLOCK_32 -+ ADLER_BLOCK_32 -+ ADLER_BLOCK_32 -+ ADLER_BLOCK_32 -+ -+ // calc = reg0 % 65521 -+ .macro mod65521, reg0, reg1, reg2 -+ mov w\reg1, #0x8071 -+ mov w\reg2, #0xfff1 -+ movk w\reg1, #0x8007, lsl #16 -+ umull x\reg1, w\reg0, w\reg1 -+ lsr x\reg1, x\reg1, #47 -+ msub w\reg0, w\reg1, w\reg2, w\reg0 -+ .endm -+ -+ mod65521 10, 14, 16 -+ mod65521 11, 14, 16 -+ -+Lloop_cond: -+ mul x12, x6, x15 -+ sub x2, x2, x12 -+ cmp x2, x12 -+ b.ge LBig_loop -+ -+Lnormal_proc: -+ cmp x2, #0 -+ b.eq Lret -+ -+ ldrb w15, [x1, x9] -+ add x9, x9, #1 -+ add x10, x15, x10 -+ add x11, x10, x11 -+ sub x2, x2, #1 -+ b Lnormal_proc -+ -+Lret: -+ mod65521 10, 14, 5 -+ mod65521 11, 14, 5 -+ lsl x11, x11, #16 -+ orr x0, x10, x11 -+ ret -+ -+.size adler32_sve, .-adler32_sve --- -2.17.1 - diff --git a/backport-0001-CVE-2018-25032.patch b/backport-0001-CVE-2018-25032.patch deleted file mode 100644 index 74eb73f..0000000 --- a/backport-0001-CVE-2018-25032.patch +++ /dev/null @@ -1,346 +0,0 @@ -From 5c44459c3b28a9bd3283aaceab7c615f8020c531 Mon Sep 17 00:00:00 2001 -From: Mark Adler -Date: Tue, 17 Apr 2018 22:09:22 -0700 -Subject: [PATCH] Fix a bug that can crash deflate on some input when using - Z_FIXED. - -This bug was reported by Danilo Ramos of Eideticom, Inc. It has -lain in wait 13 years before being found! The bug was introduced -in zlib 1.2.2.2, with the addition of the Z_FIXED option. That -option forces the use of fixed Huffman codes. For rare inputs with -a large number of distant matches, the pending buffer into which -the compressed data is written can overwrite the distance symbol -table which it overlays. That results in corrupted output due to -invalid distances, and can result in out-of-bound accesses, -crashing the application. - -The fix here combines the distance buffer and literal/length -buffers into a single symbol buffer. Now three bytes of pending -buffer space are opened up for each literal or length/distance -pair consumed, instead of the previous two bytes. This assures -that the pending buffer cannot overwrite the symbol table, since -the maximum fixed code compressed length/distance is 31 bits, and -since there are four bytes of pending space for every three bytes -of symbol space. - ---- - deflate.c | 74 ++++++++++++++++++++++++++++++++++++++++--------------- - deflate.h | 25 +++++++++---------- - trees.c | 49 +++++++++++------------------------- - 3 files changed, 79 insertions(+), 69 deletions(-) - -diff --git a/deflate.c b/deflate.c -index ef981b0..3db6d75 100644 ---- a/deflate.c -+++ b/deflate.c -@@ -269,11 +269,6 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, - int wrap = 1; - static const char my_version[] = ZLIB_VERSION; - -- ushf *overlay; -- /* We overlay pending_buf and d_buf+l_buf. This works since the average -- * output size for (length,distance) codes is <= 24 bits. -- */ -- - if (version == Z_NULL || version[0] != my_version[0] || - stream_size != sizeof(z_stream)) { - return Z_VERSION_ERROR; -@@ -344,9 +339,47 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, - - s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ - -- overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2); -- s->pending_buf = (uchf *) overlay; -- s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L); -+ /* We overlay pending_buf and sym_buf. 
This works since the average size -+ * for length/distance pairs over any compressed block is assured to be 31 -+ * bits or less. -+ * -+ * Analysis: The longest fixed codes are a length code of 8 bits plus 5 -+ * extra bits, for lengths 131 to 257. The longest fixed distance codes are -+ * 5 bits plus 13 extra bits, for distances 16385 to 32768. The longest -+ * possible fixed-codes length/distance pair is then 31 bits total. -+ * -+ * sym_buf starts one-fourth of the way into pending_buf. So there are -+ * three bytes in sym_buf for every four bytes in pending_buf. Each symbol -+ * in sym_buf is three bytes -- two for the distance and one for the -+ * literal/length. As each symbol is consumed, the pointer to the next -+ * sym_buf value to read moves forward three bytes. From that symbol, up to -+ * 31 bits are written to pending_buf. The closest the written pending_buf -+ * bits gets to the next sym_buf symbol to read is just before the last -+ * code is written. At that time, 31*(n-2) bits have been written, just -+ * after 24*(n-2) bits have been consumed from sym_buf. sym_buf starts at -+ * 8*n bits into pending_buf. (Note that the symbol buffer fills when n-1 -+ * symbols are written.) The closest the writing gets to what is unread is -+ * then n+14 bits. Here n is lit_bufsize, which is 16384 by default, and -+ * can range from 128 to 32768. -+ * -+ * Therefore, at a minimum, there are 142 bits of space between what is -+ * written and what is read in the overlain buffers, so the symbols cannot -+ * be overwritten by the compressed data. That space is actually 139 bits, -+ * due to the three-bit fixed-code block header. -+ * -+ * That covers the case where either Z_FIXED is specified, forcing fixed -+ * codes, or when the use of fixed codes is chosen, because that choice -+ * results in a smaller compressed block than dynamic codes. That latter -+ * condition then assures that the above analysis also covers all dynamic -+ * blocks. A dynamic-code block will only be chosen to be emitted if it has -+ * fewer bits than a fixed-code block would for the same set of symbols. -+ * Therefore its average symbol length is assured to be less than 31. So -+ * the compressed data for a dynamic block also cannot overwrite the -+ * symbols from which it is being constructed. -+ */ -+ -+ s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, 4); -+ s->pending_buf_size = (ulg)s->lit_bufsize * 4; - - if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL || - s->pending_buf == Z_NULL) { -@@ -355,8 +388,12 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, - deflateEnd (strm); - return Z_MEM_ERROR; - } -- s->d_buf = overlay + s->lit_bufsize/sizeof(ush); -- s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize; -+ s->sym_buf = s->pending_buf + s->lit_bufsize; -+ s->sym_end = (s->lit_bufsize - 1) * 3; -+ /* We avoid equality with lit_bufsize*3 because of wraparound at 64K -+ * on 16 bit machines and because stored blocks are restricted to -+ * 64K-1 bytes. 
-+ */ - - s->level = level; - s->strategy = strategy; -@@ -567,7 +604,7 @@ int ZEXPORT deflatePrime (strm, bits, value) - - if (deflateStateCheck(strm)) return Z_STREAM_ERROR; - s = strm->state; -- if ((Bytef *)(s->d_buf) < s->pending_out + ((Buf_size + 7) >> 3)) -+ if (s->sym_buf < s->pending_out + ((Buf_size + 7) >> 3)) - return Z_BUF_ERROR; - do { - put = Buf_size - s->bi_valid; -@@ -1126,7 +1163,6 @@ int ZEXPORT deflateCopy (dest, source) - #else - deflate_state *ds; - deflate_state *ss; -- ushf *overlay; - - - if (deflateStateCheck(source) || dest == Z_NULL) { -@@ -1146,8 +1182,7 @@ int ZEXPORT deflateCopy (dest, source) - ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte)); - ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos)); - ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos)); -- overlay = (ushf *) ZALLOC(dest, ds->lit_bufsize, sizeof(ush)+2); -- ds->pending_buf = (uchf *) overlay; -+ ds->pending_buf = (uchf *) ZALLOC(dest, ds->lit_bufsize, 4); - - if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL || - ds->pending_buf == Z_NULL) { -@@ -1161,8 +1196,7 @@ int ZEXPORT deflateCopy (dest, source) - zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); - - ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); -- ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush); -- ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize; -+ ds->sym_buf = ds->pending_buf + ds->lit_bufsize; - - ds->l_desc.dyn_tree = ds->dyn_ltree; - ds->d_desc.dyn_tree = ds->dyn_dtree; -@@ -1934,7 +1968,7 @@ local block_state deflate_fast(s, flush) - FLUSH_BLOCK(s, 1); - return finish_done; - } -- if (s->last_lit) -+ if (s->sym_next) - FLUSH_BLOCK(s, 0); - return block_done; - } -@@ -2065,7 +2099,7 @@ local block_state deflate_slow(s, flush) - FLUSH_BLOCK(s, 1); - return finish_done; - } -- if (s->last_lit) -+ if (s->sym_next) - FLUSH_BLOCK(s, 0); - return block_done; - } -@@ -2140,7 +2174,7 @@ local block_state deflate_rle(s, flush) - FLUSH_BLOCK(s, 1); - return finish_done; - } -- if (s->last_lit) -+ if (s->sym_next) - FLUSH_BLOCK(s, 0); - return block_done; - } -@@ -2179,7 +2213,7 @@ local block_state deflate_huff(s, flush) - FLUSH_BLOCK(s, 1); - return finish_done; - } -- if (s->last_lit) -+ if (s->sym_next) - FLUSH_BLOCK(s, 0); - return block_done; - } -diff --git a/deflate.h b/deflate.h -index 23ecdd3..d4cf1a9 100644 ---- a/deflate.h -+++ b/deflate.h -@@ -217,7 +217,7 @@ typedef struct internal_state { - /* Depth of each subtree used as tie breaker for trees of equal frequency - */ - -- uchf *l_buf; /* buffer for literals or lengths */ -+ uchf *sym_buf; /* buffer for distances and literals/lengths */ - - uInt lit_bufsize; - /* Size of match buffer for literals/lengths. There are 4 reasons for -@@ -239,13 +239,8 @@ typedef struct internal_state { - * - I can't count above 4 - */ - -- uInt last_lit; /* running index in l_buf */ -- -- ushf *d_buf; -- /* Buffer for distances. To simplify the code, d_buf and l_buf have -- * the same number of elements. To use different lengths, an extra flag -- * array would be necessary. 
-- */ -+ uInt sym_next; /* running index in sym_buf */ -+ uInt sym_end; /* symbol table full when sym_next reaches this */ - - ulg opt_len; /* bit length of current block with optimal trees */ - ulg static_len; /* bit length of current block with static trees */ -@@ -325,20 +320,22 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf, - - # define _tr_tally_lit(s, c, flush) \ - { uch cc = (c); \ -- s->d_buf[s->last_lit] = 0; \ -- s->l_buf[s->last_lit++] = cc; \ -+ s->sym_buf[s->sym_next++] = 0; \ -+ s->sym_buf[s->sym_next++] = 0; \ -+ s->sym_buf[s->sym_next++] = cc; \ - s->dyn_ltree[cc].Freq++; \ -- flush = (s->last_lit == s->lit_bufsize-1); \ -+ flush = (s->sym_next == s->sym_end); \ - } - # define _tr_tally_dist(s, distance, length, flush) \ - { uch len = (uch)(length); \ - ush dist = (ush)(distance); \ -- s->d_buf[s->last_lit] = dist; \ -- s->l_buf[s->last_lit++] = len; \ -+ s->sym_buf[s->sym_next++] = dist; \ -+ s->sym_buf[s->sym_next++] = dist >> 8; \ -+ s->sym_buf[s->sym_next++] = len; \ - dist--; \ - s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \ - s->dyn_dtree[d_code(dist)].Freq++; \ -- flush = (s->last_lit == s->lit_bufsize-1); \ -+ flush = (s->sym_next == s->sym_end); \ - } - #else - # define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c) -diff --git a/trees.c b/trees.c -index 50cf4b4..8bf5c80 100644 ---- a/trees.c -+++ b/trees.c -@@ -416,7 +416,7 @@ local void init_block(s) - - s->dyn_ltree[END_BLOCK].Freq = 1; - s->opt_len = s->static_len = 0L; -- s->last_lit = s->matches = 0; -+ s->sym_next = s->matches = 0; - } - - #define SMALLEST 1 -@@ -947,7 +947,7 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last) - - Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ", - opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len, -- s->last_lit)); -+ s->sym_next / 3)); - - if (static_lenb <= opt_lenb) opt_lenb = static_lenb; - -@@ -1016,8 +1016,9 @@ int ZLIB_INTERNAL _tr_tally (s, dist, lc) - unsigned dist; /* distance of matched string */ - unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */ - { -- s->d_buf[s->last_lit] = (ush)dist; -- s->l_buf[s->last_lit++] = (uch)lc; -+ s->sym_buf[s->sym_next++] = dist; -+ s->sym_buf[s->sym_next++] = dist >> 8; -+ s->sym_buf[s->sym_next++] = lc; - if (dist == 0) { - /* lc is the unmatched char */ - s->dyn_ltree[lc].Freq++; -@@ -1033,29 +1034,7 @@ int ZLIB_INTERNAL _tr_tally (s, dist, lc) - s->dyn_dtree[d_code(dist)].Freq++; - } - --#ifdef TRUNCATE_BLOCK -- /* Try to guess if it is profitable to stop the current block here */ -- if ((s->last_lit & 0x1fff) == 0 && s->level > 2) { -- /* Compute an upper bound for the compressed length */ -- ulg out_length = (ulg)s->last_lit*8L; -- ulg in_length = (ulg)((long)s->strstart - s->block_start); -- int dcode; -- for (dcode = 0; dcode < D_CODES; dcode++) { -- out_length += (ulg)s->dyn_dtree[dcode].Freq * -- (5L+extra_dbits[dcode]); -- } -- out_length >>= 3; -- Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ", -- s->last_lit, in_length, out_length, -- 100L - out_length*100L/in_length)); -- if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1; -- } --#endif -- return (s->last_lit == s->lit_bufsize-1); -- /* We avoid equality with lit_bufsize because of wraparound at 64K -- * on 16 bit machines and because stored blocks are restricted to -- * 64K-1 bytes. 
-- */ -+ return (s->sym_next == s->sym_end); - } - - /* =========================================================================== -@@ -1068,13 +1047,14 @@ local void compress_block(s, ltree, dtree) - { - unsigned dist; /* distance of matched string */ - int lc; /* match length or unmatched char (if dist == 0) */ -- unsigned lx = 0; /* running index in l_buf */ -+ unsigned sx = 0; /* running index in sym_buf */ - unsigned code; /* the code to send */ - int extra; /* number of extra bits to send */ - -- if (s->last_lit != 0) do { -- dist = s->d_buf[lx]; -- lc = s->l_buf[lx++]; -+ if (s->sym_next != 0) do { -+ dist = s->sym_buf[sx++] & 0xff; -+ dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8; -+ lc = s->sym_buf[sx++]; - if (dist == 0) { - send_code(s, lc, ltree); /* send a literal byte */ - Tracecv(isgraph(lc), (stderr," '%c' ", lc)); -@@ -1099,11 +1079,10 @@ local void compress_block(s, ltree, dtree) - } - } /* literal or match pair ? */ - -- /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */ -- Assert((uInt)(s->pending) < s->lit_bufsize + 2*lx, -- "pendingBuf overflow"); -+ /* Check that the overlay between pending_buf and sym_buf is ok: */ -+ Assert(s->pending < s->lit_bufsize + sx, "pendingBuf overflow"); - -- } while (lx < s->last_lit); -+ } while (sx < s->sym_next); - - send_code(s, END_BLOCK, ltree); - } --- -2.27.0 - diff --git a/backport-0001-CVE-2022-37434.patch b/backport-0001-CVE-2022-37434.patch deleted file mode 100644 index 59d75a6..0000000 --- a/backport-0001-CVE-2022-37434.patch +++ /dev/null @@ -1,35 +0,0 @@ -From eff308af425b67093bab25f80f1ae950166bece1 Mon Sep 17 00:00:00 2001 -From: Mark Adler -Date: Sat, 30 Jul 2022 15:51:11 -0700 -Subject: [PATCH] Fix a bug when getting a gzip header extra field with - inflate(). - -If the extra field was larger than the space the user provided with -inflateGetHeader(), and if multiple calls of inflate() delivered -the extra header data, then there could be a buffer overflow of the -provided space. This commit assures that provided space is not -exceeded. ---- - inflate.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/inflate.c b/inflate.c -index 2a0ac30..95a38f5 100644 ---- a/inflate.c -+++ b/inflate.c -@@ -765,9 +765,10 @@ int flush; - copy = state->length; - if (copy > have) copy = have; - if (copy) { -+ len = state->head->extra_len - state->length; - if (state->head != Z_NULL && -- state->head->extra != Z_NULL) { -- len = state->head->extra_len - state->length; -+ state->head->extra != Z_NULL && -+ len < state->head->extra_max) { - zmemcpy(state->head->extra + len, next, - len + copy > state->head->extra_max ? - state->head->extra_max - len : copy); --- -2.27.0 - diff --git a/backport-0002-CVE-2018-25032.patch b/backport-0002-CVE-2018-25032.patch deleted file mode 100644 index 20b69d0..0000000 --- a/backport-0002-CVE-2018-25032.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 4346a16853e19b45787ce933666026903fb8f3f8 Mon Sep 17 00:00:00 2001 -From: Mark Adler -Date: Tue, 17 Apr 2018 22:44:41 -0700 -Subject: [PATCH] Assure that the number of bits for deflatePrime() is valid. 
- ---- - deflate.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/deflate.c b/deflate.c -index 3db6d75..a4b0977 100644 ---- a/deflate.c -+++ b/deflate.c -@@ -604,7 +604,8 @@ int ZEXPORT deflatePrime (strm, bits, value) - - if (deflateStateCheck(strm)) return Z_STREAM_ERROR; - s = strm->state; -- if (s->sym_buf < s->pending_out + ((Buf_size + 7) >> 3)) -+ if (bits < 0 || bits > 16 || -+ s->sym_buf < s->pending_out + ((Buf_size + 7) >> 3)) - return Z_BUF_ERROR; - do { - put = Buf_size - s->bi_valid; --- -2.27.0 - diff --git a/backport-0002-CVE-2022-37434.patch b/backport-0002-CVE-2022-37434.patch deleted file mode 100644 index a115fce..0000000 --- a/backport-0002-CVE-2022-37434.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 1eb7682f845ac9e9bf9ae35bbfb3bad5dacbd91d Mon Sep 17 00:00:00 2001 -From: Mark Adler -Date: Mon, 8 Aug 2022 10:50:09 -0700 -Subject: [PATCH] Fix extra field processing bug that dereferences NULL - state->head. - -The recent commit to fix a gzip header extra field processing bug -introduced the new bug fixed here. ---- - inflate.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/inflate.c b/inflate.c -index 95a38f5..9c5934e 100644 ---- a/inflate.c -+++ b/inflate.c -@@ -765,10 +765,10 @@ int flush; - copy = state->length; - if (copy > have) copy = have; - if (copy) { -- len = state->head->extra_len - state->length; - if (state->head != Z_NULL && - state->head->extra != Z_NULL && -- len < state->head->extra_max) { -+ (len = state->head->extra_len - state->length) < -+ state->head->extra_max) { - zmemcpy(state->head->extra + len, next, - len + copy > state->head->extra_max ? - state->head->extra_max - len : copy); --- -2.27.0 - diff --git a/fix-undefined-buffer-detected-by-oss-fuzz.patch b/backport-fix-undefined-buffer-detected-by-oss-fuzz.patch similarity index 100% rename from fix-undefined-buffer-detected-by-oss-fuzz.patch rename to backport-fix-undefined-buffer-detected-by-oss-fuzz.patch diff --git a/zlib-1.2.5-minizip-fixuncrypt.patch b/backport-zlib-1.2.5-minizip-fixuncrypt.patch similarity index 100% rename from zlib-1.2.5-minizip-fixuncrypt.patch rename to backport-zlib-1.2.5-minizip-fixuncrypt.patch diff --git a/zlib-1.2.11-SIMD.patch b/zlib-1.2.11-SIMD.patch index d9f251b..df0bf7a 100644 --- a/zlib-1.2.11-SIMD.patch +++ b/zlib-1.2.11-SIMD.patch @@ -1,26 +1,247 @@ +From 91c1e78feec94739cc5da8562b3e2395bfdf6193 Mon Sep 17 00:00:00 2001 +From: hedongbo +Date: Sun, 14 Sep 2020 15:36:12 +0800 +Subject: [PATCH] zlib-1.2.11-SIMD.patch + +In the sampling of the Hive test program, it is found that inflate occupies a high proportion. +The zlib is optimized through instruction set optimization, hash replacement, and compilation option optimization. +The inflate and deflate processes of the Zlib library provided by the JDK are optimized to shorten the invoking time. 
+--- + CMakeLists.txt | 6 + + adler32.c | 169 +++++++++++++++++++++- + deflate.c | 22 ++- + inffast.c | 62 ++++++++- + inffast.h | 370 +++++++++++++++++++++++++++++++++++++++++++++++++ + inflate.c | 7 + + 6 files changed, 627 insertions(+), 9 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index b412dc7..40dc533 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -126,6 +126,12 @@ if(NOT MINGW) + ) + endif() + ++if(CMAKE_COMPILER_IS_GNUCC) ++ if(ARM_NEON) ++ add_definitions(-DHASH_ARMV8_CRC32 -march=armv8-a+crc -DUNALIGNED_OK -DADLER32_SIMD_NEON -DINFLATE_CHUNK_SIMD_NEON -O3) ++ endif() ++endif() ++ + # parse the full version number from zlib.h and include in ZLIB_FULL_VERSION + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/zlib.h _zlib_h_contents) + string(REGEX REPLACE ".*#define[ \t]+ZLIB_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" +diff --git a/adler32.c b/adler32.c +index d0be438..6ced75d 100644 +--- a/adler32.c ++++ b/adler32.c +@@ -59,7 +59,169 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2)); + # define MOD63(a) a %= BASE + #endif + +-/* ========================================================================= */ ++#if defined(ADLER32_SIMD_NEON) ++#include ++/* ++ * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. ++ */ ++uint32x4_t ZLIB_INTERNAL mul_add_bytes( ++ uint32x4_t v_s2, ++ uint16x8_t v_column_sum_1, ++ uint16x8_t v_column_sum_2, ++ uint16x8_t v_column_sum_3, ++ uint16x8_t v_column_sum_4) ++{ ++ v_s2 = vshlq_n_u32(v_s2, 5); ++ ++ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1), ++ (uint16x4_t) { 32, 31, 30, 29 }); ++ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), ++ (uint16x4_t) { 28, 27, 26, 25 }); ++ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2), ++ (uint16x4_t) { 24, 23, 22, 21 }); ++ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), ++ (uint16x4_t) { 20, 19, 18, 17 }); ++ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3), ++ (uint16x4_t) { 16, 15, 14, 13 }); ++ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), ++ (uint16x4_t) { 12, 11, 10, 9 }); ++ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4), ++ (uint16x4_t) { 8, 7, 6, 5 }); ++ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), ++ (uint16x4_t) { 4, 3, 2, 1 }); ++ return v_s2; ++} ++ ++/* ++ * Handle leftover data. ++ */ ++uLong ZLIB_INTERNAL leftover_handler(uint32_t s1, uint32_t s2, const Bytef *buf, z_size_t len) ++{ ++ if (len) { ++ if (len >= 16) { ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ s2 += (s1 += *buf++); ++ ++ len -= 16; ++ } ++ ++ while (len--) { ++ s2 += (s1 += *buf++); ++ } ++ ++ if (s1 >= BASE) ++ s1 -= BASE; ++ s2 %= BASE; ++ } ++ ++ /* ++ * Return the recombined sums. ++ */ ++ return s1 | (s2 << 16); ++} ++ ++uLong ZLIB_INTERNAL adler32_simd_(uLong adler, const Bytef *buf, z_size_t len) ++{ ++ /* ++ * Split Adler-32 into component sums. ++ */ ++ uint32_t s1 = adler & 0xffff; ++ uint32_t s2 = adler >> 16; ++ /* ++ * Serially compute s1 & s2, until the data is 16-byte aligned. ++ */ ++ if ((uintptr_t)buf & 0xf) { ++ while ((uintptr_t)buf & 0xf) { ++ s2 += (s1 += *buf++); ++ --len; ++ } ++ if (s1 >= BASE) ++ s1 -= BASE; ++ s2 %= BASE; ++ } ++ /* ++ * Process the data in blocks. 
++ */ ++ const unsigned BLOCK_SIZE = 1 << 5; ++ z_size_t blocks = len / BLOCK_SIZE; ++ len -= blocks * BLOCK_SIZE; ++ while (blocks) { ++ unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ ++ if (n > blocks) ++ n = (unsigned) blocks; ++ blocks -= n; ++ /* ++ * Process n blocks of data. At most NMAX data bytes can be ++ * processed before s2 must be reduced modulo BASE. ++ */ ++ uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n }; ++ uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 }; ++ ++ uint16x8_t v_column_sum_1 = vdupq_n_u16(0); ++ uint16x8_t v_column_sum_2 = vdupq_n_u16(0); ++ uint16x8_t v_column_sum_3 = vdupq_n_u16(0); ++ uint16x8_t v_column_sum_4 = vdupq_n_u16(0); ++ do { ++ /* ++ * Load 32 input bytes. ++ */ ++ const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf)); ++ const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16)); ++ /* ++ * Add previous block byte sum to v_s2. ++ */ ++ v_s2 = vaddq_u32(v_s2, v_s1); ++ /* ++ * Horizontally add the bytes for s1. ++ */ ++ v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); ++ /* ++ * Vertically add the bytes for s2. ++ */ ++ v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1)); ++ v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); ++ v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2)); ++ v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); ++ buf += BLOCK_SIZE; ++ } while (--n); ++ v_s2 = mul_add_bytes(v_s2, v_column_sum_1, v_column_sum_2, v_column_sum_3, v_column_sum_4); ++ /* ++ * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). ++ */ ++ uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); ++ uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); ++ uint32x2_t s1s2 = vpadd_u32(sum1, sum2); ++ ++ s1 += vget_lane_u32(s1s2, 0); ++ s2 += vget_lane_u32(s1s2, 1); ++ /* ++ * Reduce. ++ */ ++ s1 %= BASE; ++ s2 %= BASE; ++ } ++ return leftover_handler(s1, s2, buf, len); ++ ++} ++#endif ++ + uLong ZEXPORT adler32_z(adler, buf, len) + uLong adler; + const Bytef *buf; +@@ -68,6 +230,11 @@ uLong ZEXPORT adler32_z(adler, buf, len) + unsigned long sum2; + unsigned n; + ++#if defined(ADLER32_SIMD_NEON) ++ if (buf && len >= 64) ++ return adler32_simd_(adler, buf, len); ++#endif ++ + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; diff --git a/deflate.c b/deflate.c -index f30f71b..c018064 100644 +index f290783..31d1cfe 100644 --- a/deflate.c +++ b/deflate.c -@@ -184,8 +184,16 @@ local const config configuration_table[10] = { +@@ -154,7 +154,16 @@ local const config configuration_table[10] = { * characters, so that a running hash key can be computed from the previous * key instead of complete recalculation each time. 
*/ --#define UPDATE_HASH(s,h,c) (h = (((h)<hash_shift) ^ (c)) & s->hash_mask) +-#define UPDATE_HASH(s,h,c) (h = (((h) << s->hash_shift) ^ (c)) & s->hash_mask) +#if defined(HASH_ARMV8_CRC32) +#include +#define UPDATE_HASH_CRC_INTERNAL(s, h, c) \ -+ (h = __crc32w(0, (c) & 0xFFFFFF) & ((deflate_state *)s)->hash_mask) - ++ (h = __crc32w(0, (c) & 0xFFFFFF) & ((deflate_state *)s)->hash_mask) ++ +#define UPDATE_HASH(s, h, c) \ + UPDATE_HASH_CRC_INTERNAL(s, h, *(unsigned *)((uintptr_t)(&c) - (MIN_MATCH-1))) +#else +#define UPDATE_HASH(s,h,c) (h = (((h)<hash_shift) ^ (c)) & s->hash_mask) +#endif + /* =========================================================================== - * Insert string str in the dictionary and set match_head to the previous head -@@ -1198,14 +1247,15 @@ local unsigned read_buf(strm, buf, size) +@@ -1226,14 +1235,15 @@ local unsigned read_buf(strm, buf, size) strm->avail_in -= len; zmemcpy(buf, strm->next_in, len); @@ -42,10 +263,10 @@ index f30f71b..c018064 100644 strm->total_in += len; diff --git a/inffast.c b/inffast.c -index 4bfc995..2084739 100644 +index 1fec7f3..84c5aba 100644 --- a/inffast.c +++ b/inffast.c -@@ -81,6 +81,9 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ +@@ -57,6 +57,9 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ unsigned char FAR *out; /* local strm->next_out */ unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ unsigned char FAR *end; /* while out < end, enough space available */ @@ -55,7 +276,7 @@ index 4bfc995..2084739 100644 #ifdef INFLATE_STRICT unsigned dmax; /* maximum distance from zlib header */ #endif -@@ -113,7 +116,12 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ +@@ -89,7 +92,12 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ #endif wsize = state->wsize; whave = state->whave; @@ -68,7 +289,7 @@ index 4bfc995..2084739 100644 window = state->window; hold = state->hold; bits = state->bits; -@@ -221,6 +229,45 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ +@@ -197,6 +205,45 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ #endif } from = window; @@ -114,7 +335,7 @@ index 4bfc995..2084739 100644 if (wnext == 0) { /* very common case */ from += wsize - op; if (op < len) { /* some from window */ -@@ -271,8 +318,18 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ +@@ -247,8 +294,18 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ if (len > 1) *out++ = *from++; } @@ -134,7 +355,7 @@ index 4bfc995..2084739 100644 from = out - dist; /* copy direct from output */ do { /* minimum length is three */ *out++ = *from++; -@@ -284,7 +341,8 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ +@@ -260,7 +317,8 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ *out++ = *from++; if (len > 1) *out++ = *from++; @@ -145,10 +366,10 @@ index 4bfc995..2084739 100644 } else if ((op & 64) == 0) { /* 2nd level distance code */ diff --git a/inffast.h b/inffast.h -index b8da8bb..0def2e3 100644 +index e5c1aa4..259882c 100644 --- a/inffast.h +++ b/inffast.h -@@ -32,4 +32,374 @@ +@@ -8,4 +8,374 @@ subject to change. Applications should only use zlib.h. 
*/ @@ -524,10 +745,10 @@ index b8da8bb..0def2e3 100644 + +#endif //defined(INFLATE_CHUNK_SIMD_NEON) diff --git a/inflate.c b/inflate.c -index ca904e7..c78e05b 100644 +index 8acbef4..4e695b1 100644 --- a/inflate.c +++ b/inflate.c -@@ -429,9 +429,16 @@ unsigned copy; +@@ -408,9 +408,16 @@ unsigned copy; /* if it hasn't been done already, allocate space for the window */ if (state->window == Z_NULL) { @@ -544,200 +765,6 @@ index ca904e7..c78e05b 100644 if (state->window == Z_NULL) return 1; } -diff --git a/adler32.c b/adler32.c -index e148022..e024a15 100644 ---- a/adler32.c -+++ b/adler32.c -@@ -83,7 +83,169 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2)); - # define MOD63(a) a %= BASE - #endif - --/* ========================================================================= */ -+#if defined(ADLER32_SIMD_NEON) -+#include -+/* -+ * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. -+ */ -+uint32x4_t ZLIB_INTERNAL mul_add_bytes( -+ uint32x4_t v_s2, -+ uint16x8_t v_column_sum_1, -+ uint16x8_t v_column_sum_2, -+ uint16x8_t v_column_sum_3, -+ uint16x8_t v_column_sum_4) -+{ -+ v_s2 = vshlq_n_u32(v_s2, 5); -+ -+ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1), -+ (uint16x4_t) { 32, 31, 30, 29 }); -+ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), -+ (uint16x4_t) { 28, 27, 26, 25 }); -+ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2), -+ (uint16x4_t) { 24, 23, 22, 21 }); -+ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), -+ (uint16x4_t) { 20, 19, 18, 17 }); -+ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3), -+ (uint16x4_t) { 16, 15, 14, 13 }); -+ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), -+ (uint16x4_t) { 12, 11, 10, 9 }); -+ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4), -+ (uint16x4_t) { 8, 7, 6, 5 }); -+ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), -+ (uint16x4_t) { 4, 3, 2, 1 }); -+ return v_s2; -+} -+ -+/* -+ * Handle leftover data. -+ */ -+uLong ZLIB_INTERNAL leftover_handler(uint32_t s1, uint32_t s2, const Bytef *buf, z_size_t len) -+{ -+ if (len) { -+ if (len >= 16) { -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ s2 += (s1 += *buf++); -+ -+ len -= 16; -+ } -+ -+ while (len--) { -+ s2 += (s1 += *buf++); -+ } -+ -+ if (s1 >= BASE) -+ s1 -= BASE; -+ s2 %= BASE; -+ } -+ -+ /* -+ * Return the recombined sums. -+ */ -+ return s1 | (s2 << 16); -+} -+ -+uLong ZLIB_INTERNAL adler32_simd_(uLong adler, const Bytef *buf, z_size_t len) -+{ -+ /* -+ * Split Adler-32 into component sums. -+ */ -+ uint32_t s1 = adler & 0xffff; -+ uint32_t s2 = adler >> 16; -+ /* -+ * Serially compute s1 & s2, until the data is 16-byte aligned. -+ */ -+ if ((uintptr_t)buf & 0xf) { -+ while ((uintptr_t)buf & 0xf) { -+ s2 += (s1 += *buf++); -+ --len; -+ } -+ if (s1 >= BASE) -+ s1 -= BASE; -+ s2 %= BASE; -+ } -+ /* -+ * Process the data in blocks. -+ */ -+ const unsigned BLOCK_SIZE = 1 << 5; -+ z_size_t blocks = len / BLOCK_SIZE; -+ len -= blocks * BLOCK_SIZE; -+ while (blocks) { -+ unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ -+ if (n > blocks) -+ n = (unsigned) blocks; -+ blocks -= n; -+ /* -+ * Process n blocks of data. 
At most NMAX data bytes can be -+ * processed before s2 must be reduced modulo BASE. -+ */ -+ uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n }; -+ uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 }; -+ -+ uint16x8_t v_column_sum_1 = vdupq_n_u16(0); -+ uint16x8_t v_column_sum_2 = vdupq_n_u16(0); -+ uint16x8_t v_column_sum_3 = vdupq_n_u16(0); -+ uint16x8_t v_column_sum_4 = vdupq_n_u16(0); -+ do { -+ /* -+ * Load 32 input bytes. -+ */ -+ const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf)); -+ const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16)); -+ /* -+ * Add previous block byte sum to v_s2. -+ */ -+ v_s2 = vaddq_u32(v_s2, v_s1); -+ /* -+ * Horizontally add the bytes for s1. -+ */ -+ v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); -+ /* -+ * Vertically add the bytes for s2. -+ */ -+ v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1)); -+ v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); -+ v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2)); -+ v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); -+ buf += BLOCK_SIZE; -+ } while (--n); -+ v_s2 = mul_add_bytes(v_s2, v_column_sum_1, v_column_sum_2, v_column_sum_3, v_column_sum_4); -+ /* -+ * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). -+ */ -+ uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); -+ uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); -+ uint32x2_t s1s2 = vpadd_u32(sum1, sum2); -+ -+ s1 += vget_lane_u32(s1s2, 0); -+ s2 += vget_lane_u32(s1s2, 1); -+ /* -+ * Reduce. -+ */ -+ s1 %= BASE; -+ s2 %= BASE; -+ } -+ return leftover_handler(s1, s2, buf, len); -+ -+} -+#endif -+ - uLong ZEXPORT adler32_z(adler, buf, len) - uLong adler; - const Bytef *buf; -@@ -92,6 +254,11 @@ uLong ZEXPORT adler32_z(adler, buf, len) - unsigned long sum2; - unsigned n; - -+#if defined(ADLER32_SIMD_NEON) -+ if (buf && len >= 64) -+ return adler32_simd_(adler, buf, len); -+#endif -+ - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; ---- zlib-1.2.11/CMakeLists.txt 2020-08-04 14:35:44.023579477 +0800 -+++ CMakeLists.txt 2020-08-04 14:39:38.937798725 +0800 -@@ -145,6 +145,7 @@ if(CMAKE_COMPILER_IS_GNUCC) - contrib/arm/arm_longest_match.h) - set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) - add_definitions(-DARM_NEON) -+ add_definitions(-DHASH_ARMV8_CRC32 -march=armv8-a+crc -DUNALIGNED_OK -DADLER32_SIMD_NEON -DINFLATE_CHUNK_SIMD_NEON -O3) - set(COMPILER ${CMAKE_C_COMPILER}) - # NEON is mandatory in ARMv8. - if(${COMPILER} MATCHES "aarch64") +-- +2.33.0 + diff --git a/zlib-1.2.11.tar.xz b/zlib-1.2.11.tar.xz deleted file mode 100644 index 305b7a0..0000000 Binary files a/zlib-1.2.11.tar.xz and /dev/null differ diff --git a/zlib-1.2.13.tar.xz b/zlib-1.2.13.tar.xz new file mode 100644 index 0000000..c01659e Binary files /dev/null and b/zlib-1.2.13.tar.xz differ diff --git a/0004-zlib-Optimize-CRC32.patch b/zlib-Optimize-CRC32.patch similarity index 71% rename from 0004-zlib-Optimize-CRC32.patch rename to zlib-Optimize-CRC32.patch index 1068625..16d66b4 100644 --- a/0004-zlib-Optimize-CRC32.patch +++ b/zlib-Optimize-CRC32.patch @@ -6,32 +6,33 @@ Subject: [PATCH] zlib: Optimize CRC32 This patch uses the NEON instruction set to optimize the CRC32 algorithm. -On the ARM architecture, we can optimize the efficiency of +On the ARM architecture, we can optimize the efficiency of crc32 through the interface provided by the neon instruction set. Modify by Li Qiang. 
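
Strictly speaking, the CRC32 instructions exposed through arm_acle.h belong to the ARMv8 CRC extension rather than NEON proper; they compute the same reflected CRC-32 (polynomial 0xEDB88320) as zlib's table-driven code, so only the usual pre- and post-inversion are needed around them. A minimal sketch of the approach, as an illustration rather than the patch's crc32_neon itself, assuming a build with -march=armv8-a+crc:

    /* Illustrative CRC-32 over a buffer with ARMv8 CRC32 intrinsics.
     * Matches zlib's crc32() convention of inverting on entry/exit. */
    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>
    #include <arm_acle.h>

    uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, size_t len)
    {
        crc = ~crc;
        while (len >= 8) {                  /* eight bytes per CRC32X */
            uint64_t v;
            memcpy(&v, buf, sizeof v);      /* safe unaligned load */
            crc = __crc32d(crc, v);
            buf += 8;
            len -= 8;
        }
        while (len--)                       /* byte-wise tail */
            crc = __crc32b(crc, *buf++);
        return ~crc;
    }

Each CRC32X step folds eight input bytes into the running value, which is where the speedup over byte-at-a-time table lookups comes from.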
--- - crc32.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 47 insertions(+) + crc32.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 50 insertions(+) diff --git a/crc32.c b/crc32.c -index 9580440..79ebdbd 100644 +index f8357b0..5c53068 100644 --- a/crc32.c +++ b/crc32.c -@@ -29,6 +29,9 @@ +@@ -28,6 +28,9 @@ #endif /* MAKECRCH */ - #include "zutil.h" /* for STDC and FAR definitions */ + #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */ +#ifdef __aarch64__ +#include "arm_acle.h" +#endif - /* Definitions for doing the crc four data bytes at a time. */ - #if !defined(NOBYFOUR) && defined(Z_U4) -@@ -194,6 +197,47 @@ const z_crc_t FAR * ZEXPORT get_crc_table() + /* + A CRC of a message is computed on N braids of words in the message, where +@@ -600,6 +603,49 @@ const z_crc_t FAR * ZEXPORT get_crc_table() return (const z_crc_t FAR *)crc_table; } - + ++#ifdef __aarch64__ +ulg crc32_neon(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; @@ -72,20 +73,22 @@ index 9580440..79ebdbd 100644 + + return (crc_result ^ 0xffffffffL); +} ++#endif + - /* ========================================================================= */ - #define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) - #define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 -@@ -204,6 +248,9 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) + /* ========================================================================= + * Use ARM machine instructions if available. This will compute the CRC about + * ten times faster than the braided calculation. This code does not check for +@@ -750,6 +794,10 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) const unsigned char FAR *buf; z_size_t len; { + #ifdef __aarch64__ + return crc32_neon(crc, buf, len); + #endif - if (buf == Z_NULL) return 0UL; ++ + /* Return initial CRC, if requested. */ + if (buf == Z_NULL) return 0; - #ifdef DYNAMIC_CRC_TABLE -- -1.8.3.1 +2.27.0 diff --git a/zlib.spec b/zlib.spec index 9bc0cac..8cd5494 100644 --- a/zlib.spec +++ b/zlib.spec @@ -1,26 +1,17 @@ Name: zlib -Version: 1.2.11 -Release: 24 +Version: 1.2.13 +Release: 1 Summary: A lossless data-compression library License: zlib and Boost URL: http://www.zlib.net Source0: http://www.zlib.net/zlib-%{version}.tar.xz # Patch0 get from fedora -Patch0: zlib-1.2.5-minizip-fixuncrypt.patch -# Patch1 to Patch3 get from http://www.gildor.org/en/projects/zlib -Patch1: 0001-Neon-Optimized-hash-chain-rebase.patch -Patch2: 0002-Porting-optimized-longest_match.patch -Patch3: 0003-arm64-specific-build-patch.patch -Patch4: 0004-zlib-Optimize-CRC32.patch -Patch5: zlib-1.2.11-SIMD.patch -Patch6: 0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch +Patch6000: backport-zlib-1.2.5-minizip-fixuncrypt.patch +Patch6001: backport-fix-undefined-buffer-detected-by-oss-fuzz.patch -Patch6000: fix-undefined-buffer-detected-by-oss-fuzz.patch -Patch6001: backport-0001-CVE-2018-25032.patch -Patch6002: backport-0002-CVE-2018-25032.patch -Patch6003: backport-0001-CVE-2022-37434.patch -Patch6004: backport-0002-CVE-2022-37434.patch +Patch9000: zlib-Optimize-CRC32.patch +Patch9001: zlib-1.2.11-SIMD.patch BuildRequires: automake, autoconf, libtool @@ -64,25 +55,11 @@ This package contains the development-related content related to minizip. 
 %prep
 %setup -n %{name}-%{version}
-%patch0 -p1
-%ifarch aarch64
-%patch1 -p1
-%patch2 -p1
-%patch3 -p1
-%patch4 -p1
-%patch5 -p1
-%patch6 -p1
-%endif
-%patch6000 -p1
-%patch6001 -p1
-%patch6002 -p1
-%patch6003 -p1
-%patch6004 -p1
+%autosetup -b 0 -n %{name}-%{version} -p1
 
 %build
 export CFLAGS="$RPM_OPT_FLAGS"
 %ifarch aarch64
-CFLAGS+=" -DARM_NEON -O3"
 CFLAGS+=" -march=armv8-a+crc"
 %endif
@@ -135,6 +112,10 @@ make test
 %{_libdir}/pkgconfig/minizip.pc
 
 %changelog
+* Thu Dec 29 2022 zhoupengcheng - 1.2.13-1
+- update to zlib-1.2.13
+- remove openEuler patch that no longer compiles: 0005-Accelerate-Adler32-using-arm64-SVE-instructions.patch
+
 * Mon Dec 26 2022 zhoupengcheng - 1.2.11-24
 - DESC:remove unapplied patches
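
For reference, the %autosetup call introduced in %prep unpacks the source quietly and then applies every PatchN tag in numeric order, which is what allows the long list of per-patch %patch invocations to be dropped. A rough hand-written equivalent under this spec's current patch numbering, shown only to illustrate the macro's effect:

    %prep
    %setup -q -b 0 -n %{name}-%{version}
    %patch6000 -p1
    %patch6001 -p1
    %patch9000 -p1
    %patch9001 -p1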