Backport the Kunpeng snappy patches

This patch backports the Kunpeng snappy from kunpengcompute upstream, all patches are from [1]. [1] https://github.com/kunpengcompute/snappy/pull/1
2020-05-09 15:15:21 +08:00 · 2020-05-09 15:15:21 +08:00 · 1af74439a8
commit 1af74439a8
parent ac69d7c8a9
2 changed files with 276 additions and 0 deletions
--- a/snappy-kunpeng-backport.patch
+++ b/snappy-kunpeng-backport.patch
@ -0,0 +1,275 @@
 diff --git a/snappy.cc b/snappy.cc
 index fd519e5..e7d7373 100644
 --- a/snappy.cc
 +++ b/snappy.cc
@@ -42,6 +42,11 @@
 #if SNAPPY_HAVE_SSE2
 #include <emmintrin.h>
 #endif
 +
 +#if ARCH_ARM
 +#include <arm_neon.h>
 +#endif
 +
 #include <stdio.h>
 #include <algorithm>
@@ -53,6 +58,7 @@ namespace snappy {
 using internal::COPY_1_BYTE_OFFSET;
 using internal::COPY_2_BYTE_OFFSET;
 +using internal::COPY_4_BYTE_OFFSET;
 using internal::LITERAL;
 using internal::char_table;
 using internal::kMaximumTagLength;
@@ -63,13 +69,22 @@ using internal::kMaximumTagLength;
 // input. Of course, it doesn't hurt if the hash function is reasonably fast
 // either, as it gets called a lot.
 static inline uint32 HashBytes(uint32 bytes, int shift) {
 -  uint32 kMul = 0x1e35a7bd;
 +  const uint32 kMul = 0x1e35a7bd;
   return (bytes * kMul) >> shift;
 }
 static inline uint32 Hash(const char* p, int shift) {
   return HashBytes(UNALIGNED_LOAD32(p), shift);
 }
 +#if ARCH_ARM
 +static inline void Prefetch(const void* data) {
 +	__asm__ __volatile__(
 +		"prfm PLDL1STRM, [%[data]]		\n\t"
 +		:: [data] "r" (data)
 +	);
 +}
 +#endif // ARCH_ARM
 +
 size_t MaxCompressedLength(size_t source_len) {
   // Compressed data can be defined as:
   //    compressed := item* literal*
@@ -184,12 +199,12 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     // abcabcxxxxx
     // abcabcabcabcxxxxx
     //    ^
 -    // The last x is 14 bytes after ^.
 -    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 14)) {
 +    // The last x is 11 bytes after ^.
 +    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
       while (pattern_size < 8) {
         UnalignedCopy64(src, op);
         op += pattern_size;
 -        pattern_size *= 2;
 +        pattern_size <<= 1;
       }
       if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
     } else {
@@ -202,9 +217,22 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
   // because expanding the pattern to at least 8 bytes guarantees that
   // op - src >= 8.
 -  while (op <= buf_limit - 16) {
 +  const char* loop_limit = buf_limit - 16;
 +  while (op <= loop_limit) {
 +#if ARCH_ARM
 +	__asm__ __volatile__(
 +		"ldr d0, [%[src]]   	\n\t"
 +		"str d0, [%[op]]    	\n\t"
 +		"ldr d1, [%[src], #8]	\n\t"
 +		"str d1, [%[op], #8]	\n\t"
 +		: [op] "+r" (op)
 +		: [src] "r" (src)
 +		: "d0", "d1"
 +	);
 +#else
     UnalignedCopy64(src, op);
     UnalignedCopy64(src + 8, op + 8);
 +#endif
     src += 16;
     op += 16;
     if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
@@ -219,7 +247,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   return IncrementalCopySlow(src, op, op_limit);
 }
 -}  // namespace
 +}  // namespaceEncode32
 static inline char* EmitLiteral(char* op,
                                 const char* literal,
@@ -237,30 +265,27 @@ static inline char* EmitLiteral(char* op,
   //     MaxCompressedLength).
   assert(len > 0);      // Zero-length literals are disallowed
   int n = len - 1;
 -  if (allow_fast_path && len <= 16) {
 +  if (allow_fast_path && SNAPPY_PREDICT_TRUE(len <= 16)) {
     // Fits in tag byte
     *op++ = LITERAL | (n << 2);
 -
 +#if ARCH_ARM
 +	vst1q_u64((uint64_t*)op, vld1q_u64((const uint64_t*)literal));
 +#else
     UnalignedCopy128(literal, op);
 +#endif
     return op + len;
   }
 -  if (n < 60) {
 +  if (SNAPPY_PREDICT_TRUE(n < 60)) {
     // Fits in tag byte
     *op++ = LITERAL | (n << 2);
   } else {
 -    // Encode in upcoming bytes
 -    char* base = op;
 -    int count = 0;
 -    op++;
 -    while (n > 0) {
 -      *op++ = n & 0xff;
 -      n >>= 8;
 -      count++;
 -    }
 +    int count = (Bits::Log2Floor(n) >> 3) + 1;
     assert(count >= 1);
     assert(count <= 4);
 -    *base = LITERAL | ((59+count) << 2);
 +    *op++ = LITERAL | ((59 + count) << 2);
 +    LittleEndian::Store32(op,n);
 +    op += count;
   }
   memcpy(op, literal, len);
   return op + len;
@@ -471,12 +496,12 @@ char* CompressFragment(const char* input,
         if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
           goto emit_remainder;
         }
 -        next_hash = Hash(next_ip, shift);
         candidate = base_ip + table[hash];
         assert(candidate >= base_ip);
         assert(candidate < ip);
         table[hash] = ip - base_ip;
 +        next_hash = Hash(next_ip, shift);
       } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
                                  UNALIGNED_LOAD32(candidate)));
@@ -496,17 +521,22 @@ char* CompressFragment(const char* input,
       // this loop via goto if we get close to exhausting the input.
       EightBytesReference input_bytes;
       uint32 candidate_bytes = 0;
 +      uint32 prev_val = 0;
 +      uint32 cur_val = 0;
 +      uint32 next_val = 0;
       do {
         // We have a 4-byte match at ip, and no need to emit any
         // "literal bytes" prior to ip.
 -        const char* base = ip;
 +#if defined(ARCH_ARM)
 +		Prefetch(ip + 256);
 +#endif
 +        size_t offset = ip - candidate;
         std::pair<size_t, bool> p =
             FindMatchLength(candidate + 4, ip + 4, ip_end);
         size_t matched = 4 + p.first;
 +        assert(0 == memcmp(ip, candidate, matched));
         ip += matched;
 -        size_t offset = base - candidate;
 -        assert(0 == memcmp(base, candidate, matched));
         op = EmitCopy(op, offset, matched, p.second);
         next_emit = ip;
         if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
@@ -516,15 +546,19 @@ char* CompressFragment(const char* input,
         // table[Hash(ip, shift)] for that.  To improve compression,
         // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
         input_bytes = GetEightBytesAt(ip - 1);
 -        uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
 -        table[prev_hash] = ip - base_ip - 1;
 -        uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
 +        prev_val = GetUint32AtOffset(input_bytes, 0);
 +        cur_val = GetUint32AtOffset(input_bytes, 1);
 +        next_val = GetUint32AtOffset(input_bytes, 2);
 +		
 +        uint32 prev_hash = HashBytes(prev_val, shift);
 +        uint32 cur_hash = HashBytes(cur_val, shift);
         candidate = base_ip + table[cur_hash];
         candidate_bytes = UNALIGNED_LOAD32(candidate);
 +        table[prev_hash] = ip - base_ip - 1;
         table[cur_hash] = ip - base_ip;
 -      } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);
 +      } while (cur_val == candidate_bytes);
 -      next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
 +      next_hash = HashBytes(next_val, shift);
       ++ip;
     }
   }
@@ -636,8 +670,7 @@ class SnappyDecompressor {
     // Length is encoded in 1..5 bytes
     *result = 0;
     uint32 shift = 0;
 -    while (true) {
 -      if (shift >= 32) return false;
 +    for (;;) {
       size_t n;
       const char* ip = reader_->Peek(&n);
       if (n == 0) return false;
@@ -650,6 +683,7 @@ class SnappyDecompressor {
         break;
       }
       shift += 7;
 +      if (shift >= 32) return false;
     }
     return true;
   }
@@ -658,10 +692,9 @@ class SnappyDecompressor {
   // Returns true if successful, false on error or end of input.
   template <class Writer>
   void DecompressAllTags(Writer* writer) {
 -    const char* ip = ip_;
     // For position-independent executables, accessing global arrays can be
     // slow.  Move wordmask array onto the stack to mitigate this.
 -    uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
 +    uint32 wordmask[5];
     // Do not use memcpy to copy internal::wordmask to
     // wordmask.  LLVM converts stack arrays to global arrays if it detects
     // const stack arrays and this hurts the performance of position
@@ -673,6 +706,7 @@ class SnappyDecompressor {
     wordmask[3] = internal::wordmask[3];
     wordmask[4] = internal::wordmask[4];
 +    const char* ip = ip_;
     // We could have put this refill fragment only at the beginning of the loop.
     // However, duplicating it at the end of each branch gives the compiler more
     // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -737,22 +771,28 @@ class SnappyDecompressor {
           if (avail == 0) return;  // Premature end of input
           ip_limit_ = ip + avail;
         }
 -        if (!writer->Append(ip, literal_length)) {
 +        bool append_res = !writer->Append(ip, literal_length);
 +        if (SNAPPY_PREDICT_TRUE(append_res)) {
           return;
         }
         ip += literal_length;
         MAYBE_REFILL();
       } else {
 +        const uint32 val = LittleEndian::Load32(ip);
         const size_t entry = char_table[c];
 -        const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
         const size_t length = entry & 0xff;
 -        ip += entry >> 11;
 +        const size_t copy_offset = entry & 0x700;
 +        const size_t mask_idx = entry >> 11;
 +        const uint32 mask = wordmask[mask_idx];
 +        const size_t trailer = val & mask;
 +        const size_t offset = copy_offset + trailer;
 +        ip += mask_idx;
         // copy_offset/256 is encoded in bits 8..10.  By just fetching
         // those bits, we get copy_offset (since the bit-field starts at
         // bit 8).
 -        const size_t copy_offset = entry & 0x700;
 -        if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
 +        bool append_res = !writer->AppendFromSelf(offset, length);
 +        if (append_res) {
           return;
         }
         MAYBE_REFILL();
--- a/snappy.spec
+++ b/snappy.spec
@ -9,6 +9,7 @@ Source1:	snappy.pc
 Patch0:		snappy-gtest.patch
 Patch1:		snappy-version-macros.patch
 Patch2:		snappy-kunpeng-backport.patch
 Patch6000:	Fix-Travis-CI-configuration-for-OSX.patch
 BuildRequires:	gcc-c++ automake autoconf gtest-devel cmake