!1 Backport the Kunpeng snappy patches
Merge pull request !1 from Yikun/kunpeng
commit 9c61785deb
snappy-kunpeng-backport.patch (new file, 275 lines)
@@ -0,0 +1,275 @@
diff --git a/snappy.cc b/snappy.cc
index fd519e5..e7d7373 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -42,6 +42,11 @@
#if SNAPPY_HAVE_SSE2
#include <emmintrin.h>
#endif
+
+#if ARCH_ARM
+#include <arm_neon.h>
+#endif
+
#include <stdio.h>

#include <algorithm>
@@ -53,6 +58,7 @@ namespace snappy {

using internal::COPY_1_BYTE_OFFSET;
using internal::COPY_2_BYTE_OFFSET;
+using internal::COPY_4_BYTE_OFFSET;
using internal::LITERAL;
using internal::char_table;
using internal::kMaximumTagLength;
@@ -63,13 +69,22 @@ using internal::kMaximumTagLength;
// input. Of course, it doesn't hurt if the hash function is reasonably fast
// either, as it gets called a lot.
static inline uint32 HashBytes(uint32 bytes, int shift) {
- uint32 kMul = 0x1e35a7bd;
+ const uint32 kMul = 0x1e35a7bd;
return (bytes * kMul) >> shift;
}
static inline uint32 Hash(const char* p, int shift) {
return HashBytes(UNALIGNED_LOAD32(p), shift);
}

+#if ARCH_ARM
+static inline void Prefetch(const void* data) {
+ __asm__ __volatile__(
+ "prfm PLDL1STRM, [%[data]] \n\t"
+ :: [data] "r" (data)
+ );
+}
+#endif // ARCH_ARM
+
size_t MaxCompressedLength(size_t source_len) {
// Compressed data can be defined as:
// compressed := item* literal*
@@ -184,12 +199,12 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// abcabcxxxxx
// abcabcabcabcxxxxx
// ^
- // The last x is 14 bytes after ^.
- if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 14)) {
+ // The last x is 11 bytes after ^.
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
while (pattern_size < 8) {
UnalignedCopy64(src, op);
op += pattern_size;
- pattern_size *= 2;
+ pattern_size <<= 1;
}
if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
} else {
@@ -202,9 +217,22 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
// because expanding the pattern to at least 8 bytes guarantees that
// op - src >= 8.
- while (op <= buf_limit - 16) {
+ const char* loop_limit = buf_limit - 16;
+ while (op <= loop_limit) {
+#if ARCH_ARM
+ __asm__ __volatile__(
+ "ldr d0, [%[src]] \n\t"
+ "str d0, [%[op]] \n\t"
+ "ldr d1, [%[src], #8] \n\t"
+ "str d1, [%[op], #8] \n\t"
+ : [op] "+r" (op)
+ : [src] "r" (src)
+ : "d0", "d1"
+ );
+#else
UnalignedCopy64(src, op);
UnalignedCopy64(src + 8, op + 8);
+#endif
src += 16;
op += 16;
if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
@@ -219,7 +247,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
return IncrementalCopySlow(src, op, op_limit);
}

-} // namespace
+} // namespace

static inline char* EmitLiteral(char* op,
const char* literal,
@@ -237,30 +265,27 @@ static inline char* EmitLiteral(char* op,
// MaxCompressedLength).
assert(len > 0); // Zero-length literals are disallowed
int n = len - 1;
- if (allow_fast_path && len <= 16) {
+ if (allow_fast_path && SNAPPY_PREDICT_TRUE(len <= 16)) {
// Fits in tag byte
*op++ = LITERAL | (n << 2);
-
+#if ARCH_ARM
+ vst1q_u64((uint64_t*)op, vld1q_u64((const uint64_t*)literal));
+#else
UnalignedCopy128(literal, op);
+#endif
return op + len;
}

- if (n < 60) {
+ if (SNAPPY_PREDICT_TRUE(n < 60)) {
// Fits in tag byte
*op++ = LITERAL | (n << 2);
} else {
- // Encode in upcoming bytes
- char* base = op;
- int count = 0;
- op++;
- while (n > 0) {
- *op++ = n & 0xff;
- n >>= 8;
- count++;
- }
+ int count = (Bits::Log2Floor(n) >> 3) + 1;
assert(count >= 1);
assert(count <= 4);
- *base = LITERAL | ((59+count) << 2);
+ *op++ = LITERAL | ((59 + count) << 2);
+ LittleEndian::Store32(op,n);
+ op += count;
}
memcpy(op, literal, len);
return op + len;
@@ -471,12 +496,12 @@ char* CompressFragment(const char* input,
if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
goto emit_remainder;
}
- next_hash = Hash(next_ip, shift);
candidate = base_ip + table[hash];
assert(candidate >= base_ip);
assert(candidate < ip);

table[hash] = ip - base_ip;
+ next_hash = Hash(next_ip, shift);
} while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
UNALIGNED_LOAD32(candidate)));

@@ -496,17 +521,22 @@ char* CompressFragment(const char* input,
// this loop via goto if we get close to exhausting the input.
EightBytesReference input_bytes;
uint32 candidate_bytes = 0;
+ uint32 prev_val = 0;
+ uint32 cur_val = 0;
+ uint32 next_val = 0;

do {
// We have a 4-byte match at ip, and no need to emit any
// "literal bytes" prior to ip.
- const char* base = ip;
+#if defined(ARCH_ARM)
+ Prefetch(ip + 256);
+#endif
+ size_t offset = ip - candidate;
std::pair<size_t, bool> p =
FindMatchLength(candidate + 4, ip + 4, ip_end);
size_t matched = 4 + p.first;
+ assert(0 == memcmp(ip, candidate, matched));
ip += matched;
- size_t offset = base - candidate;
- assert(0 == memcmp(base, candidate, matched));
op = EmitCopy(op, offset, matched, p.second);
next_emit = ip;
if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
@@ -516,15 +546,19 @@ char* CompressFragment(const char* input,
// table[Hash(ip, shift)] for that. To improve compression,
// we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
input_bytes = GetEightBytesAt(ip - 1);
- uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
- table[prev_hash] = ip - base_ip - 1;
- uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
+ prev_val = GetUint32AtOffset(input_bytes, 0);
+ cur_val = GetUint32AtOffset(input_bytes, 1);
+ next_val = GetUint32AtOffset(input_bytes, 2);
+
+ uint32 prev_hash = HashBytes(prev_val, shift);
+ uint32 cur_hash = HashBytes(cur_val, shift);
candidate = base_ip + table[cur_hash];
candidate_bytes = UNALIGNED_LOAD32(candidate);
+ table[prev_hash] = ip - base_ip - 1;
table[cur_hash] = ip - base_ip;
- } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);
+ } while (cur_val == candidate_bytes);

- next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
+ next_hash = HashBytes(next_val, shift);
++ip;
}
}
@@ -636,8 +670,7 @@ class SnappyDecompressor {
// Length is encoded in 1..5 bytes
*result = 0;
uint32 shift = 0;
- while (true) {
- if (shift >= 32) return false;
+ for (;;) {
size_t n;
const char* ip = reader_->Peek(&n);
if (n == 0) return false;
@@ -650,6 +683,7 @@ class SnappyDecompressor {
break;
}
shift += 7;
+ if (shift >= 32) return false;
}
return true;
}
@@ -658,10 +692,9 @@ class SnappyDecompressor {
// Returns true if successful, false on error or end of input.
template <class Writer>
void DecompressAllTags(Writer* writer) {
- const char* ip = ip_;
// For position-independent executables, accessing global arrays can be
// slow. Move wordmask array onto the stack to mitigate this.
- uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
+ uint32 wordmask[5];
// Do not use memcpy to copy internal::wordmask to
// wordmask. LLVM converts stack arrays to global arrays if it detects
// const stack arrays and this hurts the performance of position
@@ -673,6 +706,7 @@ class SnappyDecompressor {
wordmask[3] = internal::wordmask[3];
wordmask[4] = internal::wordmask[4];

+ const char* ip = ip_;
// We could have put this refill fragment only at the beginning of the loop.
// However, duplicating it at the end of each branch gives the compiler more
// scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -737,22 +771,28 @@ class SnappyDecompressor {
if (avail == 0) return; // Premature end of input
ip_limit_ = ip + avail;
}
- if (!writer->Append(ip, literal_length)) {
+ bool append_res = !writer->Append(ip, literal_length);
+ if (SNAPPY_PREDICT_TRUE(append_res)) {
return;
}
ip += literal_length;
MAYBE_REFILL();
} else {
+ const uint32 val = LittleEndian::Load32(ip);
const size_t entry = char_table[c];
- const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
const size_t length = entry & 0xff;
- ip += entry >> 11;
+ const size_t copy_offset = entry & 0x700;
+ const size_t mask_idx = entry >> 11;
+ const uint32 mask = wordmask[mask_idx];
+ const size_t trailer = val & mask;
+ const size_t offset = copy_offset + trailer;
+ ip += mask_idx;

// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
// bit 8).
- const size_t copy_offset = entry & 0x700;
- if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
+ bool append_res = !writer->AppendFromSelf(offset, length);
+ if (append_res) {
return;
}
MAYBE_REFILL();
@@ -9,6 +9,7 @@ Source1: snappy.pc

Patch0: snappy-gtest.patch
Patch1: snappy-version-macros.patch
Patch2: snappy-kunpeng-backport.patch
Patch6000: Fix-Travis-CI-configuration-for-OSX.patch

BuildRequires: gcc-c++ automake autoconf gtest-devel cmake
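
Note on the AArch64 assembly in the patch above: the two __asm__ __volatile__ fragments are a PLDL1STRM prefetch hint and a pair of 64-bit d-register loads and stores that move 16 bytes per iteration of IncrementalCopy's hot loop. For reference, a rough standalone equivalent using __builtin_prefetch and NEON intrinsics is sketched below; the helper names prefetch_read_stream and copy16 are made up for this sketch and are not part of the patch or of snappy.

#include <arm_neon.h>
#include <cstdint>

// Roughly what the patch's Prefetch() requests: prefetch one cache line
// for reading with streaming (low temporal locality) behaviour.
static inline void prefetch_read_stream(const void* p) {
  __builtin_prefetch(p, /*rw=*/0, /*locality=*/0);
}

// Roughly what the "ldr/str d0, d1" sequence does: copy 16 bytes as two
// 8-byte NEON loads followed by two 8-byte NEON stores.
static inline void copy16(const char* src, char* op) {
  uint8x8_t lo = vld1_u8(reinterpret_cast<const uint8_t*>(src));
  uint8x8_t hi = vld1_u8(reinterpret_cast<const uint8_t*>(src) + 8);
  vst1_u8(reinterpret_cast<uint8_t*>(op), lo);
  vst1_u8(reinterpret_cast<uint8_t*>(op) + 8, hi);
}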
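
Note on the EmitLiteral change: the byte-at-a-time loop that encoded the literal length is replaced by a byte count derived from Bits::Log2Floor plus a single 32-bit little-endian store. Below is a minimal sketch of both encodings, assuming a little-endian host; log2_floor and store32_le are hypothetical stand-ins for snappy's Bits::Log2Floor and LittleEndian::Store32, and 0 /*LITERAL*/ stands for the LITERAL tag value.

#include <cstdint>
#include <cstring>

static inline int log2_floor(uint32_t n) { return 31 - __builtin_clz(n); }
static inline void store32_le(char* p, uint32_t v) { std::memcpy(p, &v, sizeof(v)); }

// Old encoding: emit the length bytes one at a time while counting them.
static char* emit_length_bytewise(char* op, int n) {
  char* base = op;
  int count = 0;
  op++;
  while (n > 0) {
    *op++ = static_cast<char>(n & 0xff);
    n >>= 8;
    count++;
  }
  *base = 0 /*LITERAL*/ | ((59 + count) << 2);
  return op;
}

// New encoding: compute the count up front and do one unaligned 32-bit store.
static char* emit_length_store32(char* op, int n) {
  int count = (log2_floor(n) >> 3) + 1;  // 1..4 bytes for n >= 60
  *op++ = 0 /*LITERAL*/ | ((59 + count) << 2);
  store32_le(op, n);
  op += count;
  return op;
}

Both produce the same bytes; the up to three extra bytes the 32-bit store may write past count are immediately overwritten by the memcpy of the literal that follows in EmitLiteral.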
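
Note on the DecompressAllTags copy branch: the patch hoists the 32-bit load of the trailer bytes ahead of the char_table lookup and gives every extracted field a name. Restated below as a standalone decode step, using the field layout the comments in that hunk describe (length in bits 0..7, copy_offset in bits 8..10, trailer byte count in bits 11 and up); the CopyTag struct and kWordmask array here are illustrative only, not snappy API.

#include <cstdint>
#include <cstring>

// Masks selecting the low 0..4 bytes of a 32-bit word, mirroring
// snappy's internal::wordmask table.
static const uint32_t kWordmask[] = {0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu};

struct CopyTag {
  size_t length;    // entry bits 0..7
  size_t offset;    // copy_offset (entry bits 8..10) plus the trailer bytes
  size_t consumed;  // number of trailer bytes following the tag byte
};

static CopyTag decode_copy(uint16_t entry, const char* ip) {
  uint32_t val;
  std::memcpy(&val, ip, sizeof(val));        // LittleEndian::Load32(ip) on LE hosts
  const size_t length      = entry & 0xff;
  const size_t copy_offset = entry & 0x700;  // bit-field starts at bit 8, so no shift needed
  const size_t mask_idx    = entry >> 11;
  const size_t trailer     = val & kWordmask[mask_idx];
  return CopyTag{length, copy_offset + trailer, mask_idx};
}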