From 247147654fe5cd11cf15d8dff91440405ea57040 Mon Sep 17 00:00:00 2001
From: Simon Hosie
Date: Wed, 12 Apr 2017 15:44:21 -0700
Subject: [PATCH 2/2] Inflate using wider loads and stores

In inflate_fast() the output pointer always has plenty of room to write. This
means that, so long as the target is capable, wide unaligned loads and stores
can be used to transfer several bytes at once. When the reference distance is
too short, simply unroll the data a little to increase the distance.

Change-Id: I59854eb25d2b1e43561c8a2afaf9175bf10cf674
---
 contrib/arm/chunkcopy.h | 279 ++++++++++++++++++++++++++++++++++++++++++++++++
 contrib/arm/inffast.c   |  96 +++++++----------
 contrib/arm/inflate.c   |  22 ++--
 3 files changed, 335 insertions(+), 62 deletions(-)
 create mode 100644 contrib/arm/chunkcopy.h

diff --git a/contrib/arm/chunkcopy.h b/contrib/arm/chunkcopy.h
new file mode 100644
index 00000000..2d6fd6f9
--- /dev/null
+++ b/contrib/arm/chunkcopy.h
@@ -0,0 +1,279 @@
+/* chunkcopy.h -- fast copies and sets
+ * Copyright (C) 2017 ARM, Inc.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNKCOPY_H
+#define CHUNKCOPY_H
+
+#include "zutil.h"
+#include <arm_neon.h>
+
+#if __STDC_VERSION__ >= 199901L
+#define Z_RESTRICT restrict
+#else
+#define Z_RESTRICT
+#endif
+
+typedef uint8x16_t chunkcopy_chunk_t;
+#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t)
+
+/*
+   Ask the compiler to perform a wide, unaligned load with a machine
+   instruction appropriate for the chunkcopy_chunk_t type.
+ */
+static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR *s) {
+    chunkcopy_chunk_t c;
+    __builtin_memcpy(&c, s, sizeof(c));
+    return c;
+}
+
+/*
+   Ask the compiler to perform a wide, unaligned store with a machine
+   instruction appropriate for the chunkcopy_chunk_t type.
+ */
+static inline void storechunk(unsigned char FAR *d, chunkcopy_chunk_t c) {
+    __builtin_memcpy(d, &c, sizeof(c));
+}
+
+/*
+   Perform a memcpy-like operation, but assume that length is non-zero and that
+   it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
+   the length is shorter than this.
+
+   It also guarantees that it will properly unroll the data if the distance
+   between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
+   in chunkcopy_relaxed().
+
+   Aside from better memory bus utilisation, this means that short copies
+   (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
+   without iteration, which will hopefully make the branch prediction more
+   reliable.
+ */
+static inline unsigned char FAR *chunkcopy_core(unsigned char FAR *out,
+                                                const unsigned char FAR *from,
+                                                unsigned len) {
+    int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
+    storechunk(out, loadchunk(from));
+    out += bump;
+    from += bump;
+    len /= CHUNKCOPY_CHUNK_SIZE;
+    while (len-- > 0) {
+        storechunk(out, loadchunk(from));
+        out += CHUNKCOPY_CHUNK_SIZE;
+        from += CHUNKCOPY_CHUNK_SIZE;
+    }
+    return out;
+}
+
+/*
+   Like chunkcopy_core, but avoid writing beyond the end of legal output.
+
+   Accepts an additional pointer to the end of safe output.  A generic safe
+   copy would use (out + len), but it's normally the case that the end of the
+   output buffer is beyond the end of the current copy, and this can still be
+   exploited.
+ */
+static inline unsigned char FAR *chunkcopy_core_safe(unsigned char FAR *out,
+                                                     const unsigned char FAR * from,
+                                                     unsigned len,
+                                                     unsigned char FAR *limit) {
+    Assert(out + len <= limit, "chunk copy exceeds safety limit");
+    if (limit - out < CHUNKCOPY_CHUNK_SIZE) {
+        const unsigned char FAR * Z_RESTRICT rfrom = from;
+        if (len & 8) { __builtin_memcpy(out, rfrom, 8); out += 8; rfrom += 8; }
+        if (len & 4) { __builtin_memcpy(out, rfrom, 4); out += 4; rfrom += 4; }
+        if (len & 2) { __builtin_memcpy(out, rfrom, 2); out += 2; rfrom += 2; }
+        if (len & 1) { *out++ = *rfrom++; }
+        return out;
+    }
+    return chunkcopy_core(out, from, len);
+}
+
+/*
+   Perform short copies until distance can be rewritten as being at least
+   CHUNKCOPY_CHUNK_SIZE.
+
+   This assumes that it's OK to overwrite at least the first
+   2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than
+   this.  This assumption holds within inflate_fast() which starts every
+   iteration with at least 258 bytes of output space available (258 being the
+   maximum length output from a single token; see inffast.c).
+ */
+static inline unsigned char FAR *chunkunroll_relaxed(unsigned char FAR *out,
+                                                     unsigned FAR *dist,
+                                                     unsigned FAR *len) {
+    const unsigned char FAR *from = out - *dist;
+    while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
+        storechunk(out, loadchunk(from));
+        out += *dist;
+        *len -= *dist;
+        *dist += *dist;
+    }
+    return out;
+}
+
+
+static inline uint8x16_t chunkset_vld1q_dup_u8x8(const unsigned char FAR * Z_RESTRICT from) {
+#if defined(__clang__) || defined(__aarch64__)
+    return vreinterpretq_u8_u64(vld1q_dup_u64((void *)from));
+#else
+    /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a
+     * void pointer, so here's an alternate implementation.
+     */
+    uint8x8_t h = vld1_u8(from);
+    return vcombine_u8(h, h);
+#endif
+}
+
+/*
+   Perform an overlapping copy which behaves as a memset() operation but
+   supports periods other than one; assume that length is non-zero and that
+   it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output even
+   if the length is shorter than this.
+ */
+static inline unsigned char FAR *chunkset_core(unsigned char FAR *out,
+                                               unsigned period,
+                                               unsigned len) {
+    uint8x16_t f;
+    int bump = ((len - 1) % sizeof(f)) + 1;
+
+    switch (period) {
+    case 1:
+        f = vld1q_dup_u8(out - 1);
+        vst1q_u8(out, f);
+        out += bump;
+        len -= bump;
+        while (len > 0) {
+            vst1q_u8(out, f);
+            out += sizeof(f);
+            len -= sizeof(f);
+        }
+        return out;
+    case 2:
+        f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2)));
+        vst1q_u8(out, f);
+        out += bump;
+        len -= bump;
+        if (len > 0) {
+            f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2)));
+            do {
+                vst1q_u8(out, f);
+                out += sizeof(f);
+                len -= sizeof(f);
+            } while (len > 0);
+        }
+        return out;
+    case 4:
+        f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4)));
+        vst1q_u8(out, f);
+        out += bump;
+        len -= bump;
+        if (len > 0) {
+            f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4)));
+            do {
+                vst1q_u8(out, f);
+                out += sizeof(f);
+                len -= sizeof(f);
+            } while (len > 0);
+        }
+        return out;
+    case 8:
+        f = chunkset_vld1q_dup_u8x8(out - 8);
+        vst1q_u8(out, f);
+        out += bump;
+        len -= bump;
+        if (len > 0) {
+            f = chunkset_vld1q_dup_u8x8(out - 8);
+            do {
+                vst1q_u8(out, f);
+                out += sizeof(f);
+                len -= sizeof(f);
+            } while (len > 0);
+        }
+        return out;
+    }
+    out = chunkunroll_relaxed(out, &period, &len);
+    return chunkcopy_core(out, out - period, len);
+}
+
+/*
+   Perform a memcpy-like operation, but assume that length is non-zero and that
+   it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
+   the length is shorter than this.
+
+   Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
+   of overlapping buffers, regardless of the distance between the pointers.
+   This is reflected in the `restrict`-qualified pointers, allowing the
+   compiler to reorder loads and stores.
+ */
+static inline unsigned char FAR *chunkcopy_relaxed(unsigned char FAR * Z_RESTRICT out,
+                                                   const unsigned char FAR * Z_RESTRICT from,
+                                                   unsigned len) {
+    return chunkcopy_core(out, from, len);
+}
+
+/*
+   Like chunkcopy_relaxed, but avoid writing beyond the end of legal output.
+
+   Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
+   behaviour of overlapping buffers, regardless of the distance between the
+   pointers.  This is reflected in the `restrict`-qualified pointers, allowing
+   the compiler to reorder loads and stores.
+
+   Accepts an additional pointer to the end of safe output.  A generic safe
+   copy would use (out + len), but it's normally the case that the end of the
+   output buffer is beyond the end of the current copy, and this can still be
+   exploited.
+ */
+static inline unsigned char FAR *chunkcopy_safe(unsigned char FAR *out,
+                                                const unsigned char FAR * Z_RESTRICT from,
+                                                unsigned len,
+                                                unsigned char FAR *limit) {
+    Assert(out + len <= limit, "chunk copy exceeds safety limit");
+    return chunkcopy_core_safe(out, from, len, limit);
+}
+
+/*
+   Perform a chunky copy within the same buffer, where the source and
+   destination may potentially overlap.
+
+   Assumes that len > 0 on entry, and that it's safe to write at least
+   CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
+ */
+static inline unsigned char FAR *chunkcopy_lapped_relaxed(unsigned char FAR *out,
+                                                          unsigned dist,
+                                                          unsigned len) {
+    if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
+        return chunkset_core(out, dist, len);
+    }
+    return chunkcopy_core(out, out - dist, len);
+}
+
+/*
+   Behave like chunkcopy_lapped_relaxed, but avoid writing beyond the end of
+   legal output.
+
+   Accepts an additional pointer to the end of safe output.  A generic safe
+   copy would use (out + len), but it's normally the case that the end of the
+   output buffer is beyond the end of the current copy, and this can still be
+   exploited.
+ */
+static inline unsigned char FAR *chunkcopy_lapped_safe(unsigned char FAR *out,
+                                                       unsigned dist,
+                                                       unsigned len,
+                                                       unsigned char FAR *limit) {
+    Assert(out + len <= limit, "chunk copy exceeds safety limit");
+    if (limit - out < CHUNKCOPY_CHUNK_SIZE * 3) {
+        /* TODO: try harder to optimise this */
+        while (len-- > 0) {
+            *out = *(out - dist);
+            out++;
+        }
+        return out;
+    }
+    return chunkcopy_lapped_relaxed(out, dist, len);
+}
+
+#undef Z_RESTRICT
+
+#endif /* CHUNKCOPY_H */
diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast.c
index 0dbd1dbc..f7f50071 100644
--- a/contrib/arm/inffast.c
+++ b/contrib/arm/inffast.c
@@ -7,6 +7,7 @@
 #include "inftrees.h"
 #include "inflate.h"
 #include "inffast.h"
+#include "chunkcopy.h"
 
 #ifdef ASMINF
 #  pragma message("Assembler code may have bugs -- use at your own risk")
@@ -57,6 +58,7 @@ unsigned start;         /* inflate()'s starting value for strm->avail_out */
     unsigned char FAR *out;     /* local strm->next_out */
     unsigned char FAR *beg;     /* inflate()'s initial strm->next_out */
     unsigned char FAR *end;     /* while out < end, enough space available */
+    unsigned char FAR *limit;   /* safety limit for chunky copies */
 #ifdef INFLATE_STRICT
     unsigned dmax;              /* maximum distance from zlib header */
 #endif
@@ -84,12 +86,13 @@ unsigned start;         /* inflate()'s starting value for strm->avail_out */
     out = strm->next_out;
     beg = out - (start - strm->avail_out);
     end = out + (strm->avail_out - 257);
+    limit = out + strm->avail_out;
 #ifdef INFLATE_STRICT
     dmax = state->dmax;
 #endif
     wsize = state->wsize;
     whave = state->whave;
-    wnext = state->wnext;
+    wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext;
     window = state->window;
     hold = state->hold;
     bits = state->bits;
@@ -197,70 +200,51 @@ unsigned start;         /* inflate()'s starting value for strm->avail_out */
 #endif
                 }
                 from = window;
-                if (wnext == 0) {           /* very common case */
-                    from += wsize - op;
-                    if (op < len) {         /* some from window */
-                        len -= op;
-                        do {
-                            *out++ = *from++;
-                        } while (--op);
-                        from = out - dist;  /* rest from output */
-                    }
+                if (wnext >= op) {          /* contiguous in window */
+                    from += wnext - op;
                 }
-                else if (wnext < op) {      /* wrap around window */
-                    from += wsize + wnext - op;
+                else {                      /* wrap around window */
                     op -= wnext;
+                    from += wsize - op;
                     if (op < len) {         /* some from end of window */
                         len -= op;
-                        do {
-                            *out++ = *from++;
-                        } while (--op);
-                        from = window;
-                        if (wnext < len) {  /* some from start of window */
-                            op = wnext;
-                            len -= op;
-                            do {
-                                *out++ = *from++;
-                            } while (--op);
-                            from = out - dist;      /* rest from output */
-                        }
+                        out = chunkcopy_safe(out, from, op, limit);
+                        from = window;      /* more from start of window */
+                        op = wnext;
+                        /* This (rare) case can create a situation where
+                           the first chunkcopy below must be checked.
+                         */
                     }
                 }
-                else {                      /* contiguous in window */
-                    from += wnext - op;
-                    if (op < len) {         /* some from window */
-                        len -= op;
-                        do {
-                            *out++ = *from++;
-                        } while (--op);
-                        from = out - dist;  /* rest from output */
-                    }
-                }
-                while (len > 2) {
-                    *out++ = *from++;
-                    *out++ = *from++;
-                    *out++ = *from++;
-                    len -= 3;
-                }
-                if (len) {
-                    *out++ = *from++;
-                    if (len > 1)
-                        *out++ = *from++;
+                if (op < len) {             /* still need some from output */
+                    out = chunkcopy_safe(out, from, op, limit);
+                    len -= op;
+                    /* When dist is small the amount of data that can be
+                       copied from the window is also small, and progress
+                       towards the dangerous end of the output buffer is
+                       also small.  This means that for trivial memsets and
+                       for chunkunroll_relaxed() a safety check is
+                       unnecessary.  However, these conditions may not be
+                       entered at all, and in that case it's possible that
+                       the main copy is near the end.
+                     */
+                    out = chunkunroll_relaxed(out, &dist, &len);
+                    out = chunkcopy_safe(out, out - dist, len, limit);
+                } else {
+                    /* from points to window, so there is no risk of
+                       overlapping pointers requiring memset-like behaviour
+                     */
+                    out = chunkcopy_safe(out, from, len, limit);
                 }
             }
             else {
-                from = out - dist;          /* copy direct from output */
-                do {                        /* minimum length is three */
-                    *out++ = *from++;
-                    *out++ = *from++;
-                    *out++ = *from++;
-                    len -= 3;
-                } while (len > 2);
-                if (len) {
-                    *out++ = *from++;
-                    if (len > 1)
-                        *out++ = *from++;
-                }
+                /* Whole reference is in range of current output.  No
+                   range checks are necessary because we start with room
+                   for at least 258 bytes of output, so unroll and roundoff
+                   operations can write beyond `out+len` so long as they
+                   stay within 258 bytes of `out`.
+                 */
+                out = chunkcopy_lapped_relaxed(out, dist, len);
             }
         }
         else if ((op & 64) == 0) {          /* 2nd level distance code */
diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c
index ac333e8c..e40322c3 100644
--- a/contrib/arm/inflate.c
+++ b/contrib/arm/inflate.c
@@ -84,6 +84,7 @@
 #include "inftrees.h"
 #include "inflate.h"
 #include "inffast.h"
+#include "contrib/arm/chunkcopy.h"
 
 #ifdef MAKEFIXED
 #  ifndef BUILDFIXED
@@ -405,10 +406,20 @@ unsigned copy;
 
     /* if it hasn't been done already, allocate space for the window */
     if (state->window == Z_NULL) {
+        unsigned wsize = 1U << state->wbits;
         state->window = (unsigned char FAR *)
-                        ZALLOC(strm, 1U << state->wbits,
+                        ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
                                sizeof(unsigned char));
         if (state->window == Z_NULL) return 1;
+#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED
+        /* Copies from the overflow portion of this buffer are undefined and
+           may cause analysis tools to raise a warning if we don't initialize
+           it.  However, this undefined data overwrites other undefined data
+           and is subsequently either overwritten or left deliberately
+           undefined at the end of decode; so there's really no point.
+         */
+        memset(state->window + wsize, 0, CHUNKCOPY_CHUNK_SIZE);
+#endif
     }
 
     /* if window not in use yet, initialize */
@@ -1175,17 +1186,16 @@ int flush;
             else
                 from = state->window + (state->wnext - copy);
             if (copy > state->length) copy = state->length;
+            if (copy > left) copy = left;
+            put = chunkcopy_safe(put, from, copy, put + left);
         }
         else {                              /* copy from output */
-            from = put - state->offset;
             copy = state->length;
+            if (copy > left) copy = left;
+            put = chunkcopy_lapped_safe(put, state->offset, copy, put + left);
         }
-        if (copy > left) copy = left;
         left -= copy;
         state->length -= copy;
-        do {
-            *put++ = *from++;
-        } while (--copy);
         if (state->length == 0) state->mode = LEN;
         break;
     case LIT:
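
A quick way to sanity-check the overwrite contract outside of inflate is a tiny
stand-alone harness. The sketch below is illustrative only and not part of the
patch: the file name demo_chunkcopy.c and the build line are assumptions, and it
presumes an AArch64 (NEON) host building inside the zlib source tree so that
zutil.h is available, e.g.: gcc -O2 -I. -Icontrib/arm demo_chunkcopy.c

/* demo_chunkcopy.c -- illustrative harness, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "chunkcopy.h"      /* assumes -Icontrib/arm and an ARM/NEON build */

int main(void) {
    /* Destination padded with slop, mirroring the CHUNKCOPY_CHUNK_SIZE
       over-allocation this patch adds to the window in inflate.c.
       chunkcopy_lapped_relaxed() assumes it may write at least
       CHUNKCOPY_CHUNK_SIZE*3 bytes at `out`, so leave plenty of room. */
    unsigned char buf[64 + CHUNKCOPY_CHUNK_SIZE];
    unsigned char *out;

    memset(buf, 0, sizeof(buf));
    memcpy(buf, "abc", 3);          /* seed a 3-byte period at the start */

    /* Ask for 20 bytes at distance 3: the short-distance LZ77 run that
       chunkunroll_relaxed()/chunkset_core() exist to handle. */
    out = chunkcopy_lapped_relaxed(buf + 3, 3, 20);

    /* Only the first 3 + 20 bytes are defined output; bytes beyond that are
       slop and may have been overwritten, which is exactly the contract. */
    assert(out == buf + 23);
    assert(memcmp(buf, "abcabcabcabcabcabcabcab", 23) == 0);
    printf("ok: %.23s\n", (char *)buf);
    return 0;
}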