diff options
Diffstat (limited to 'package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch')
-rw-r--r-- | package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch | 502 |
1 files changed, 0 insertions, 502 deletions
diff --git a/package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch b/package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch index 475ed6f3e9..d181b034e5 100644 --- a/package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch +++ b/package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch @@ -1907,505 +1907,3 @@ index 00000000..ac333e8c + state = (struct inflate_state FAR *)strm->state; + return (unsigned long)(state->next - state->codes); +} - -From 247147654fe5cd11cf15d8dff91440405ea57040 Mon Sep 17 00:00:00 2001 -From: Simon Hosie <simon.hosie@arm.com> -Date: Wed, 12 Apr 2017 15:44:21 -0700 -Subject: [PATCH 2/2] Inflate using wider loads and stores - -In inflate_fast() the output pointer always has plenty of room to write. This -means that so long as the target is capable, wide un-aligned loads and stores -can be used to transfer several bytes at once. When the reference distance is -too short simply unroll the data a little to increase the distance. - -Change-Id: I59854eb25d2b1e43561c8a2afaf9175bf10cf674 ---- - contrib/arm/chunkcopy.h | 279 ++++++++++++++++++++++++++++++++++++++++++++++++ - contrib/arm/inffast.c | 96 +++++++---------- - contrib/arm/inflate.c | 22 ++-- - 3 files changed, 335 insertions(+), 62 deletions(-) - create mode 100644 contrib/arm/chunkcopy.h - -diff --git a/contrib/arm/chunkcopy.h b/contrib/arm/chunkcopy.h -new file mode 100644 -index 00000000..2d6fd6f9 ---- /dev/null -+++ b/contrib/arm/chunkcopy.h -@@ -0,0 +1,279 @@ -+/* chunkcopy.h -- fast copies and sets -+ * Copyright (C) 2017 ARM, Inc. -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+#ifndef CHUNKCOPY_H -+#define CHUNKCOPY_H -+ -+#include "zutil.h" -+#include <arm_neon.h> -+ -+#if __STDC_VERSION__ >= 199901L -+#define Z_RESTRICT restrict -+#else -+#define Z_RESTRICT -+#endif -+ -+typedef uint8x16_t chunkcopy_chunk_t; -+#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t) -+ -+/* -+ Ask the compiler to perform a wide, unaligned load with an machine -+ instruction appropriate for the chunkcopy_chunk_t type. -+ */ -+static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR *s) { -+ chunkcopy_chunk_t c; -+ __builtin_memcpy(&c, s, sizeof(c)); -+ return c; -+} -+ -+/* -+ Ask the compiler to perform a wide, unaligned store with an machine -+ instruction appropriate for the chunkcopy_chunk_t type. -+ */ -+static inline void storechunk(unsigned char FAR *d, chunkcopy_chunk_t c) { -+ __builtin_memcpy(d, &c, sizeof(c)); -+} -+ -+/* -+ Perform a memcpy-like operation, but assume that length is non-zero and that -+ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if -+ the length is shorter than this. -+ -+ It also guarantees that it will properly unroll the data if the distance -+ between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on -+ in chunkcopy_relaxed(). -+ -+ Aside from better memory bus utilisation, this means that short copies -+ (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop -+ without iteration, which will hopefully make the branch prediction more -+ reliable. -+ */ -+static inline unsigned char FAR *chunkcopy_core(unsigned char FAR *out, -+ const unsigned char FAR *from, -+ unsigned len) { -+ int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1; -+ storechunk(out, loadchunk(from)); -+ out += bump; -+ from += bump; -+ len /= CHUNKCOPY_CHUNK_SIZE; -+ while (len-- > 0) { -+ storechunk(out, loadchunk(from)); -+ out += CHUNKCOPY_CHUNK_SIZE; -+ from += CHUNKCOPY_CHUNK_SIZE; -+ } -+ return out; -+} -+ -+/* -+ Like chunkcopy_core, but avoid writing beyond of legal output. -+ -+ Accepts an additional pointer to the end of safe output. A generic safe -+ copy would use (out + len), but it's normally the case that the end of the -+ output buffer is beyond the end of the current copy, and this can still be -+ exploited. -+ */ -+static inline unsigned char FAR *chunkcopy_core_safe(unsigned char FAR *out, -+ const unsigned char FAR * from, -+ unsigned len, -+ unsigned char FAR *limit) { -+ Assert(out + len <= limit, "chunk copy exceeds safety limit"); -+ if (limit - out < CHUNKCOPY_CHUNK_SIZE) { -+ const unsigned char FAR * Z_RESTRICT rfrom = from; -+ if (len & 8) { __builtin_memcpy(out, rfrom, 8); out += 8; rfrom += 8; } -+ if (len & 4) { __builtin_memcpy(out, rfrom, 4); out += 4; rfrom += 4; } -+ if (len & 2) { __builtin_memcpy(out, rfrom, 2); out += 2; rfrom += 2; } -+ if (len & 1) { *out++ = *rfrom++; } -+ return out; -+ } -+ return chunkcopy_core(out, from, len); -+} -+ -+/* -+ Perform short copies until distance can be rewritten as being at least -+ CHUNKCOPY_CHUNK_SIZE. -+ -+ This assumes that it's OK to overwrite at least the first -+ 2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than -+ this. This assumption holds within inflate_fast() which starts every -+ iteration with at least 258 bytes of output space available (258 being the -+ maximum length output from a single token; see inffast.c). -+ */ -+static inline unsigned char FAR *chunkunroll_relaxed(unsigned char FAR *out, -+ unsigned FAR *dist, -+ unsigned FAR *len) { -+ const unsigned char FAR *from = out - *dist; -+ while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) { -+ storechunk(out, loadchunk(from)); -+ out += *dist; -+ *len -= *dist; -+ *dist += *dist; -+ } -+ return out; -+} -+ -+ -+static inline uint8x16_t chunkset_vld1q_dup_u8x8(const unsigned char FAR * Z_RESTRICT from) { -+#if defined(__clang__) || defined(__aarch64__) -+ return vreinterpretq_u8_u64(vld1q_dup_u64((void *)from)); -+#else -+ /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a -+ * void pointer, so here's an alternate implementation. -+ */ -+ uint8x8_t h = vld1_u8(from); -+ return vcombine_u8(h, h); -+#endif -+} -+ -+/* -+ Perform an overlapping copy which behaves as a memset() operation, but -+ supporting periods other than one, and assume that length is non-zero and -+ that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output -+ even if the length is shorter than this. -+ */ -+static inline unsigned char FAR *chunkset_core(unsigned char FAR *out, -+ unsigned period, -+ unsigned len) { -+ uint8x16_t f; -+ int bump = ((len - 1) % sizeof(f)) + 1; -+ -+ switch (period) { -+ case 1: -+ f = vld1q_dup_u8(out - 1); -+ vst1q_u8(out, f); -+ out += bump; -+ len -= bump; -+ while (len > 0) { -+ vst1q_u8(out, f); -+ out += sizeof(f); -+ len -= sizeof(f); -+ } -+ return out; -+ case 2: -+ f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2))); -+ vst1q_u8(out, f); -+ out += bump; -+ len -= bump; -+ if (len > 0) { -+ f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2))); -+ do { -+ vst1q_u8(out, f); -+ out += sizeof(f); -+ len -= sizeof(f); -+ } while (len > 0); -+ } -+ return out; -+ case 4: -+ f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4))); -+ vst1q_u8(out, f); -+ out += bump; -+ len -= bump; -+ if (len > 0) { -+ f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4))); -+ do { -+ vst1q_u8(out, f); -+ out += sizeof(f); -+ len -= sizeof(f); -+ } while (len > 0); -+ } -+ return out; -+ case 8: -+ f = chunkset_vld1q_dup_u8x8(out - 8); -+ vst1q_u8(out, f); -+ out += bump; -+ len -= bump; -+ if (len > 0) { -+ f = chunkset_vld1q_dup_u8x8(out - 8); -+ do { -+ vst1q_u8(out, f); -+ out += sizeof(f); -+ len -= sizeof(f); -+ } while (len > 0); -+ } -+ return out; -+ } -+ out = chunkunroll_relaxed(out, &period, &len); -+ return chunkcopy_core(out, out - period, len); -+} -+ -+/* -+ Perform a memcpy-like operation, but assume that length is non-zero and that -+ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if -+ the length is shorter than this. -+ -+ Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour -+ of overlapping buffers, regardless of the distance between the pointers. -+ This is reflected in the `restrict`-qualified pointers, allowing the -+ compiler to reorder loads and stores. -+ */ -+static inline unsigned char FAR *chunkcopy_relaxed(unsigned char FAR * Z_RESTRICT out, -+ const unsigned char FAR * Z_RESTRICT from, -+ unsigned len) { -+ return chunkcopy_core(out, from, len); -+} -+ -+/* -+ Like chunkcopy_relaxed, but avoid writing beyond of legal output. -+ -+ Unlike chunkcopy_core_safe() above, no guarantee is made regarding the -+ behaviour of overlapping buffers, regardless of the distance between the -+ pointers. This is reflected in the `restrict`-qualified pointers, allowing -+ the compiler to reorder loads and stores. -+ -+ Accepts an additional pointer to the end of safe output. A generic safe -+ copy would use (out + len), but it's normally the case that the end of the -+ output buffer is beyond the end of the current copy, and this can still be -+ exploited. -+ */ -+static inline unsigned char FAR *chunkcopy_safe(unsigned char FAR *out, -+ const unsigned char FAR * Z_RESTRICT from, -+ unsigned len, -+ unsigned char FAR *limit) { -+ Assert(out + len <= limit, "chunk copy exceeds safety limit"); -+ return chunkcopy_core_safe(out, from, len, limit); -+} -+ -+/* -+ Perform chunky copy within the same buffer, where the source and destination -+ may potentially overlap. -+ -+ Assumes that len > 0 on entry, and that it's safe to write at least -+ CHUNKCOPY_CHUNK_SIZE*3 bytes to the output. -+ */ -+static inline unsigned char FAR *chunkcopy_lapped_relaxed(unsigned char FAR *out, -+ unsigned dist, -+ unsigned len) { -+ if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) { -+ return chunkset_core(out, dist, len); -+ } -+ return chunkcopy_core(out, out - dist, len); -+} -+ -+/* -+ Behave like chunkcopy_lapped_relaxed, but avoid writing beyond of legal output. -+ -+ Accepts an additional pointer to the end of safe output. A generic safe -+ copy would use (out + len), but it's normally the case that the end of the -+ output buffer is beyond the end of the current copy, and this can still be -+ exploited. -+ */ -+static inline unsigned char FAR *chunkcopy_lapped_safe(unsigned char FAR *out, -+ unsigned dist, -+ unsigned len, -+ unsigned char FAR *limit) { -+ Assert(out + len <= limit, "chunk copy exceeds safety limit"); -+ if (limit - out < CHUNKCOPY_CHUNK_SIZE * 3) { -+ /* TODO: try harder to optimise this */ -+ while (len-- > 0) { -+ *out = *(out - dist); -+ out++; -+ } -+ return out; -+ } -+ return chunkcopy_lapped_relaxed(out, dist, len); -+} -+ -+#undef Z_RESTRICT -+ -+#endif /* CHUNKCOPY_H */ -diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast.c -index 0dbd1dbc..f7f50071 100644 ---- a/contrib/arm/inffast.c -+++ b/contrib/arm/inffast.c -@@ -7,6 +7,7 @@ - #include "inftrees.h" - #include "inflate.h" - #include "inffast.h" -+#include "chunkcopy.h" - - #ifdef ASMINF - # pragma message("Assembler code may have bugs -- use at your own risk") -@@ -57,6 +58,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ - unsigned char FAR *out; /* local strm->next_out */ - unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ - unsigned char FAR *end; /* while out < end, enough space available */ -+ unsigned char FAR *limit; /* safety limit for chunky copies */ - #ifdef INFLATE_STRICT - unsigned dmax; /* maximum distance from zlib header */ - #endif -@@ -84,12 +86,13 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ - out = strm->next_out; - beg = out - (start - strm->avail_out); - end = out + (strm->avail_out - 257); -+ limit = out + strm->avail_out; - #ifdef INFLATE_STRICT - dmax = state->dmax; - #endif - wsize = state->wsize; - whave = state->whave; -- wnext = state->wnext; -+ wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext; - window = state->window; - hold = state->hold; - bits = state->bits; -@@ -197,70 +200,51 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ - #endif - } - from = window; -- if (wnext == 0) { /* very common case */ -- from += wsize - op; -- if (op < len) { /* some from window */ -- len -= op; -- do { -- *out++ = *from++; -- } while (--op); -- from = out - dist; /* rest from output */ -- } -+ if (wnext >= op) { /* contiguous in window */ -+ from += wnext - op; - } -- else if (wnext < op) { /* wrap around window */ -- from += wsize + wnext - op; -+ else { /* wrap around window */ - op -= wnext; -+ from += wsize - op; - if (op < len) { /* some from end of window */ - len -= op; -- do { -- *out++ = *from++; -- } while (--op); -- from = window; -- if (wnext < len) { /* some from start of window */ -- op = wnext; -- len -= op; -- do { -- *out++ = *from++; -- } while (--op); -- from = out - dist; /* rest from output */ -- } -+ out = chunkcopy_safe(out, from, op, limit); -+ from = window; /* more from start of window */ -+ op = wnext; -+ /* This (rare) case can create a situation where -+ the first chunkcopy below must be checked. -+ */ - } - } -- else { /* contiguous in window */ -- from += wnext - op; -- if (op < len) { /* some from window */ -- len -= op; -- do { -- *out++ = *from++; -- } while (--op); -- from = out - dist; /* rest from output */ -- } -- } -- while (len > 2) { -- *out++ = *from++; -- *out++ = *from++; -- *out++ = *from++; -- len -= 3; -- } -- if (len) { -- *out++ = *from++; -- if (len > 1) -- *out++ = *from++; -+ if (op < len) { /* still need some from output */ -+ out = chunkcopy_safe(out, from, op, limit); -+ len -= op; -+ /* When dist is small the amount of data that can be -+ copied from the window is also small, and progress -+ towards the dangerous end of the output buffer is -+ also small. This means that for trivial memsets and -+ for chunkunroll_relaxed() a safety check is -+ unnecessary. However, these conditions may not be -+ entered at all, and in that case it's possible that -+ the main copy is near the end. -+ */ -+ out = chunkunroll_relaxed(out, &dist, &len); -+ out = chunkcopy_safe(out, out - dist, len, limit); -+ } else { -+ /* from points to window, so there is no risk of -+ overlapping pointers requiring memset-like behaviour -+ */ -+ out = chunkcopy_safe(out, from, len, limit); - } - } - else { -- from = out - dist; /* copy direct from output */ -- do { /* minimum length is three */ -- *out++ = *from++; -- *out++ = *from++; -- *out++ = *from++; -- len -= 3; -- } while (len > 2); -- if (len) { -- *out++ = *from++; -- if (len > 1) -- *out++ = *from++; -- } -+ /* Whole reference is in range of current output. No -+ range checks are necessary because we start with room -+ for at least 258 bytes of output, so unroll and roundoff -+ operations can write beyond `out+len` so long as they -+ stay within 258 bytes of `out`. -+ */ -+ out = chunkcopy_lapped_relaxed(out, dist, len); - } - } - else if ((op & 64) == 0) { /* 2nd level distance code */ -diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c -index ac333e8c..e40322c3 100644 ---- a/contrib/arm/inflate.c -+++ b/contrib/arm/inflate.c -@@ -84,6 +84,7 @@ - #include "inftrees.h" - #include "inflate.h" - #include "inffast.h" -+#include "contrib/arm/chunkcopy.h" - - #ifdef MAKEFIXED - # ifndef BUILDFIXED -@@ -405,10 +406,20 @@ unsigned copy; - - /* if it hasn't been done already, allocate space for the window */ - if (state->window == Z_NULL) { -+ unsigned wsize = 1U << state->wbits; - state->window = (unsigned char FAR *) -- ZALLOC(strm, 1U << state->wbits, -+ ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE, - sizeof(unsigned char)); - if (state->window == Z_NULL) return 1; -+#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED -+ /* Copies from the overflow portion of this buffer are undefined and -+ may cause analysis tools to raise a warning if we don't initialize -+ it. However, this undefined data overwrites other undefined data -+ and is subsequently either overwritten or left deliberately -+ undefined at the end of decode; so there's really no point. -+ */ -+ memset(state->window + wsize, 0, CHUNKCOPY_CHUNK_SIZE); -+#endif - } - - /* if window not in use yet, initialize */ -@@ -1175,17 +1186,16 @@ int flush; - else - from = state->window + (state->wnext - copy); - if (copy > state->length) copy = state->length; -+ if (copy > left) copy = left; -+ put = chunkcopy_safe(put, from, copy, put + left); - } - else { /* copy from output */ -- from = put - state->offset; - copy = state->length; -+ if (copy > left) copy = left; -+ put = chunkcopy_lapped_safe(put, state->offset, copy, put + left); - } -- if (copy > left) copy = left; - left -= copy; - state->length -= copy; -- do { -- *put++ = *from++; -- } while (--copy); - if (state->length == 0) state->mode = LEN; - break; - case LIT: |