From c1006eccfa06b410ac407f1cfbcd43e066ad751f Mon Sep 17 00:00:00 2001 From: dogsheng <960055655@qq.com> Date: Tue, 19 Nov 2019 14:09:22 +0800 Subject: [PATCH] Package init --- performance-neoncrc32-and-prfm.patch | 170 +++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 performance-neoncrc32-and-prfm.patch diff --git a/performance-neoncrc32-and-prfm.patch b/performance-neoncrc32-and-prfm.patch new file mode 100644 index 0000000..ad36f80 --- /dev/null +++ b/performance-neoncrc32-and-prfm.patch @@ -0,0 +1,170 @@ +From 134712c35ed2ec5a06c61583dce59867aeb28862 Mon Sep 17 00:00:00 2001 +From: liqiang64 +Date: Mon, 11 Nov 2019 19:47:36 +0800 +Subject: [PATCH] performance-neoncrc32-and-prfm + +Profiling gzip with the perf tool showed that crc32 and +longest_match are significant hotspots. + +On the ARM architecture, we can speed up crc32 by using the +interface provided by the NEON instruction set, and speed up +code with random memory access patterns by using prefetch +instructions. +Modified by Li Qiang. +--- + deflate.c | 27 ++++++++++++++++++++++++++- + util.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 75 insertions(+), 1 deletion(-) + +diff --git a/deflate.c b/deflate.c +index 951d7af..f15a227 100644 +--- a/deflate.c ++++ b/deflate.c +@@ -392,6 +392,9 @@ longest_match(IPos cur_match) + register int len; /* length of current match */ + int best_len = prev_length; /* best match length so far */ + IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL; ++ #ifdef __aarch64__ ++ IPos next_match; ++ #endif + /* Stop when cur_match becomes <= limit. To simplify the code, + * we prevent matches with the string of window index 0. 
+ */ +@@ -425,6 +428,10 @@ longest_match(IPos cur_match) + do { + Assert(cur_match < strstart, "no future"); + match = window + cur_match; ++ #ifdef __aarch64__ ++ next_match = prev[cur_match & WMASK]; ++ __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK]))); ++ #endif + + /* Skip to next match if the match length cannot increase + * or if the match length is less than 2: +@@ -502,8 +509,14 @@ longest_match(IPos cur_match) + scan_end = scan[best_len]; + #endif + } +- } while ((cur_match = prev[cur_match & WMASK]) > limit ++ } ++ #ifdef __aarch64__ ++ while ((cur_match = next_match) > limit + && --chain_length != 0); ++ #else ++ while ((cur_match = prev[cur_match & WMASK]) > limit ++ && --chain_length != 0); ++ #endif + + return best_len; + } +@@ -788,7 +801,19 @@ off_t deflate() + lookahead -= prev_length-1; + prev_length -= 2; + RSYNC_ROLL(strstart, prev_length+1); ++ while (prev_length >= 4) { ++ prev_length -= 4; ++ strstart++; ++ INSERT_STRING(strstart, hash_head); ++ strstart++; ++ INSERT_STRING(strstart, hash_head); ++ strstart++; ++ INSERT_STRING(strstart, hash_head); ++ strstart++; ++ INSERT_STRING(strstart, hash_head); ++ } + do { ++ if (prev_length == 0) break; + strstart++; + INSERT_STRING(strstart, hash_head); + /* strstart never exceeds WSIZE-MAX_MATCH, so there are +diff --git a/util.c b/util.c +index bb5e9f3..d0b3cb0 100644 +--- a/util.c ++++ b/util.c +@@ -31,6 +31,9 @@ + #include "gzip.h" + #include <dirname.h> + #include <xalloc.h> ++#ifdef __aarch64__ ++#include <arm_acle.h> ++#endif + + #ifndef CHAR_BIT + # define CHAR_BIT 8 +@@ -41,6 +44,7 @@ static int write_buffer (int, voidp, unsigned int); + /* ======================================================================== + * Table of CRC-32's of all single-byte values (made by makecrc.c) + */ ++#ifndef __aarch64__ + static const ulg crc_32_tab[] = { + 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, + 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, +@@ -95,6 +99,7 @@ static const ulg 
crc_32_tab[] = { + 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, + 0x2d02ef8dL + }; ++#endif + + /* =========================================================================== + * Copy input to output unchanged: zcat == cat with --force. +@@ -129,6 +134,49 @@ ulg updcrc(s, n) + uch *s; /* pointer to bytes to pump through */ + unsigned n; /* number of bytes in s[] */ + { ++ #ifdef __aarch64__ ++ register ulg c; ++ static ulg crc = (ulg)0xffffffffL; ++ register const uint8_t *buf1; ++ register const uint16_t *buf2; ++ register const uint32_t *buf4; ++ register const uint64_t *buf8; ++ int64_t length = (int64_t)n; ++ buf8 = (const uint64_t *)(const void *)s; ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ ++ while(length >= sizeof(uint64_t)) { ++ c = __crc32d(c, *buf8++); ++ length -= sizeof(uint64_t); ++ } ++ ++ buf4 = (const uint32_t *)(const void *)buf8; ++ if (length >= sizeof(uint32_t)) { ++ c = __crc32w(c, *buf4++); ++ length -= sizeof(uint32_t); ++ } ++ ++ buf2 = (const uint16_t *)(const void *)buf4; ++ if(length >= sizeof(uint16_t)) { ++ c = __crc32h(c, *buf2++); ++ length -= sizeof(uint16_t); ++ } ++ ++ buf1 = (const uint8_t *)(const void *)buf2; ++ if (length >= sizeof(uint8_t)) { ++ c = __crc32b(c, *buf1); ++ length -= sizeof(uint8_t); ++ } ++ } ++ ++ crc = c; ++ ++ return (c ^ 0xffffffffL); ++#else + register ulg c; /* temporary variable */ + + static ulg crc = (ulg)0xffffffffL; /* shift register contents */ +@@ -143,6 +191,7 @@ ulg updcrc(s, n) + } + crc = c; + return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++#endif + } + + /* =========================================================================== +-- +1.8.3.1 +