Skip to content

Commit 94aacd8

Browse files
KungFuJesusDead2
authored andcommitted
Try to simply the inflate loop by collapsing most cases to chunksets
1 parent e874b34 commit 94aacd8

12 files changed

Lines changed: 90 additions & 83 deletions

arch/arm/arm_functions.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#ifdef ARM_NEON
99
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
1010
uint32_t chunksize_neon(void);
11-
uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
11+
uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
1212

1313
# ifdef HAVE_BUILTIN_CTZLL
1414
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);

arch/generic/generic_functions.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
2222
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
2323

2424
uint32_t chunksize_c(void);
25-
uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
25+
uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
2626
void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
2727

2828
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);

arch/power/power_functions.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ void slide_hash_vmx(deflate_state *s);
1515
#ifdef POWER8_VSX
1616
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
1717
uint32_t chunksize_power8(void);
18-
uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
18+
uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
1919
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
2020
void slide_hash_power8(deflate_state *s);
2121
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);

arch/riscv/riscv_functions.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
1414
uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
1515
uint32_t chunksize_rvv(void);
16-
uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
16+
uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
1717
uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
1818

1919
uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);

arch/x86/chunkset_avx2.c

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ typedef __m128i halfchunk_t;
1515
#define HAVE_CHUNKMEMSET_4
1616
#define HAVE_CHUNKMEMSET_8
1717
#define HAVE_CHUNKMEMSET_16
18+
#define HAVE_CHUNKMEMSET_1
1819
#define HAVE_CHUNK_MAG
1920
#define HAVE_HALF_CHUNK
2021

@@ -125,24 +126,6 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
125126
return ret_vec;
126127
}
127128

128-
static inline void halfchunkmemset_2(uint8_t *from, halfchunk_t *chunk) {
129-
int16_t tmp;
130-
memcpy(&tmp, from, sizeof(tmp));
131-
*chunk = _mm_set1_epi16(tmp);
132-
}
133-
134-
static inline void halfchunkmemset_4(uint8_t *from, halfchunk_t *chunk) {
135-
int32_t tmp;
136-
memcpy(&tmp, from, sizeof(tmp));
137-
*chunk = _mm_set1_epi32(tmp);
138-
}
139-
140-
static inline void halfchunkmemset_8(uint8_t *from, halfchunk_t *chunk) {
141-
int64_t tmp;
142-
memcpy(&tmp, from, sizeof(tmp));
143-
*chunk = _mm_set1_epi64x(tmp);
144-
}
145-
146129
static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
147130
*chunk = _mm_loadu_si128((__m128i *)s);
148131
}
@@ -151,10 +134,10 @@ static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
151134
_mm_storeu_si128((__m128i *)out, *chunk);
152135
}
153136

154-
static inline chunk_t halfchunk2whole(halfchunk_t chunk) {
137+
static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
155138
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
156139
* unlikely to be actually written or read from */
157-
return _mm256_zextsi128_si256(chunk);
140+
return _mm256_zextsi128_si256(*chunk);
158141
}
159142

160143
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {

arch/x86/x86_functions.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#ifdef X86_SSE2
1010
uint32_t chunksize_sse2(void);
11-
uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
11+
uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
1212

1313
# ifdef HAVE_BUILTIN_CTZ
1414
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
@@ -21,7 +21,7 @@ uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsign
2121

2222
#ifdef X86_SSSE3
2323
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
24-
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
24+
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
2525
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
2626
#endif
2727

@@ -33,7 +33,7 @@ uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *sr
3333
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
3434
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
3535
uint32_t chunksize_avx2(void);
36-
uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
36+
uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
3737

3838
# ifdef HAVE_BUILTIN_CTZ
3939
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);

chunkset_tpl.h

Lines changed: 62 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include "zbuild.h"
66
#include <stdlib.h>
7+
#include <stdio.h>
78

89
/* Returns the chunk size */
910
Z_INTERNAL uint32_t CHUNKSIZE(void) {
@@ -69,18 +70,18 @@ static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len)
6970
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
7071
/* This code takes string of length dist from "from" and repeats
7172
* it for as many times as can fit in a chunk_t (vector register) */
72-
uint32_t cpy_dist;
73-
uint32_t bytes_remaining = sizeof(chunk_t);
73+
uint64_t cpy_dist;
74+
uint64_t bytes_remaining = sizeof(chunk_t);
7475
chunk_t chunk_load;
7576
uint8_t *cur_chunk = (uint8_t *)&chunk_load;
7677
while (bytes_remaining) {
7778
cpy_dist = MIN(dist, bytes_remaining);
78-
memcpy(cur_chunk, buf, cpy_dist);
79+
memcpy(cur_chunk, buf, (size_t)cpy_dist);
7980
bytes_remaining -= cpy_dist;
8081
cur_chunk += cpy_dist;
8182
/* This allows us to bypass an expensive integer division since we're effectively
8283
* counting in this loop, anyway */
83-
*chunk_rem = cpy_dist;
84+
*chunk_rem = (uint32_t)cpy_dist;
8485
}
8586

8687
return chunk_load;
@@ -109,21 +110,33 @@ static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned
109110

110111
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
111112
Return OUT + LEN. */
112-
static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
113+
static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
113114
/* Debug performance related issues when len < sizeof(uint64_t):
114115
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
115-
Assert(dist > 0, "chunkmemset cannot have a distance 0");
116+
Assert(from != out, "chunkmemset cannot have a distance 0");
116117

117-
uint8_t *from = out - dist;
118118
chunk_t chunk_load;
119119
uint32_t chunk_mod = 0;
120120
uint32_t adv_amount;
121+
int64_t sdist = out - from;
122+
uint64_t dist = llabs(sdist);
123+
124+
/* We are supporting the case for when we are reading bytes from ahead in the buffer.
125+
* We now have to handle this, though it wasn't _quite_ clear if this rare circumstance
126+
* always needed to be handled here or if we're just now seeing it because we are
127+
* dispatching to this function, more */
128+
if (sdist < 0 && dist < len) {
129+
/* Here the memmove semantics match perfectly, as when this happens we are
130+
* effectively sliding down the contents of memory by dist bytes */
131+
memmove(out, from, len);
132+
return out + len;
133+
}
121134

122135
if (dist == 1) {
123136
memset(out, *from, len);
124137
return out + len;
125-
} else if (dist > sizeof(chunk_t)) {
126-
return CHUNKCOPY(out, out - dist, len);
138+
} else if (dist >= sizeof(chunk_t)) {
139+
return CHUNKCOPY(out, from, len);
127140
}
128141

129142
/* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
@@ -135,33 +148,22 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
135148
* making the code a little smaller. */
136149
#ifdef HAVE_HALF_CHUNK
137150
if (len <= sizeof(halfchunk_t)) {
138-
if (dist > sizeof(halfchunk_t)) {
139-
return HALFCHUNKCOPY(out, out - dist, len);
140-
}
151+
if (dist >= sizeof(halfchunk_t))
152+
return HALFCHUNKCOPY(out, from, len);
141153

142-
halfchunk_t halfchunk_load;
143-
144-
if (dist == 2) {
145-
halfchunkmemset_2(from, &halfchunk_load);
146-
} else if (dist == 4) {
147-
halfchunkmemset_4(from, &halfchunk_load);
148-
} else if (dist == 8) {
149-
halfchunkmemset_8(from, &halfchunk_load);
150-
} else if (dist == 16) {
151-
loadhalfchunk(from, &halfchunk_load);
152-
} else {
153-
halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
154-
}
154+
if ((dist % 2) != 0 || dist == 6) {
155+
halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, (unsigned)dist);
155156

156-
adv_amount = sizeof(halfchunk_t) - chunk_mod;
157-
while (len >= sizeof(halfchunk_t)) {
158-
storehalfchunk(out, &halfchunk_load);
159-
len -= adv_amount;
160-
out += adv_amount;
161-
}
157+
adv_amount = sizeof(halfchunk_t) - chunk_mod;
158+
if (len == sizeof(halfchunk_t)) {
159+
storehalfchunk(out, &halfchunk_load);
160+
len -= adv_amount;
161+
out += adv_amount;
162+
}
162163

163-
chunk_load = halfchunk2whole(halfchunk_load);
164-
goto rem_bytes;
164+
chunk_load = halfchunk2whole(&halfchunk_load);
165+
goto rem_bytes;
166+
}
165167
}
166168
#endif
167169

@@ -185,11 +187,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
185187
chunkmemset_16(from, &chunk_load);
186188
} else
187189
#endif
188-
if (dist == sizeof(chunk_t)) {
189-
loadchunk(from, &chunk_load);
190-
} else {
191-
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
192-
}
190+
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, (unsigned)dist);
193191

194192
adv_amount = sizeof(chunk_t) - chunk_mod;
195193

@@ -221,7 +219,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
221219
return out;
222220
}
223221

224-
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
222+
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len, unsigned left) {
225223
#if !defined(UNALIGNED64_OK)
226224
# if !defined(UNALIGNED_OK)
227225
static const uint32_t align_mask = 7;
@@ -231,23 +229,45 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len,
231229
#endif
232230

233231
len = MIN(len, left);
234-
uint8_t *from = out - dist;
232+
235233
#if !defined(UNALIGNED64_OK)
236234
while (((uintptr_t)out & align_mask) && (len > 0)) {
237235
*out++ = *from++;
238236
--len;
239237
--left;
240238
}
241239
#endif
242-
if (left < (unsigned)(3 * sizeof(chunk_t))) {
240+
if (UNLIKELY(left < sizeof(chunk_t))) {
243241
while (len > 0) {
244242
*out++ = *from++;
245243
--len;
246244
}
245+
247246
return out;
248247
}
248+
249249
if (len)
250-
return CHUNKMEMSET(out, dist, len);
250+
out = CHUNKMEMSET(out, from, len);
251251

252252
return out;
253253
}
254+
255+
static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe)
256+
{
257+
if (out == from)
258+
return out + len;
259+
260+
uint64_t safelen = (safe - out);
261+
len = MIN(len, (unsigned)safelen);
262+
263+
uint64_t from_dist = (uint64_t)llabs(safe - from);
264+
if (UNLIKELY(from_dist < sizeof(chunk_t) || safelen < sizeof(chunk_t))) {
265+
while (len--) {
266+
*out++ = *from++;
267+
}
268+
269+
return out;
270+
}
271+
272+
return CHUNKMEMSET(out, from, len);
273+
}

functable.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,9 +273,9 @@ static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8
273273
return functable.adler32_fold_copy(adler, dst, src, len);
274274
}
275275

276-
static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
276+
static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
277277
init_functable();
278-
return functable.chunkmemset_safe(out, dist, len, left);
278+
return functable.chunkmemset_safe(out, from, len, left);
279279
}
280280

281281
static uint32_t chunksize_stub(void) {

functable.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ struct functable_s {
2727
void (* force_init) (void);
2828
uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len);
2929
uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
30-
uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left);
30+
uint8_t* (* chunkmemset_safe) (uint8_t *out, uint8_t *from, unsigned len, unsigned left);
3131
uint32_t (* chunksize) (void);
3232
uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1);
3333
uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len);

inffast_tpl.h

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
235235
from += wsize - op;
236236
if (op < len) { /* some from end of window */
237237
len -= op;
238-
out = chunkcopy_safe(out, from, op, safe);
238+
out = CHUNKCOPY_SAFE(out, from, op, safe);
239239
from = window; /* more from start of window */
240240
op = wnext;
241241
/* This (rare) case can create a situation where
@@ -245,19 +245,23 @@ void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
245245
}
246246
if (op < len) { /* still need some from output */
247247
len -= op;
248-
out = chunkcopy_safe(out, from, op, safe);
249-
if (!extra_safe)
248+
if (!extra_safe) {
249+
out = CHUNKCOPY_SAFE(out, from, op, safe);
250250
out = CHUNKUNROLL(out, &dist, &len);
251-
out = chunkcopy_safe(out, out - dist, len, safe);
251+
out = CHUNKCOPY_SAFE(out, out - dist, len, safe);
252+
} else {
253+
out = chunkcopy_safe(out, from, op, safe);
254+
out = chunkcopy_safe(out, out - dist, len, safe);
255+
}
252256
} else {
253-
out = chunkcopy_safe(out, from, len, safe);
257+
if (!extra_safe)
258+
out = CHUNKCOPY_SAFE(out, from, len, safe);
259+
else
260+
out = chunkcopy_safe(out, from, len, safe);
254261
}
255262
} else if (extra_safe) {
256263
/* Whole reference is in range of current output. */
257-
if (dist >= len || dist >= state->chunksize)
258264
out = chunkcopy_safe(out, out - dist, len, safe);
259-
else
260-
out = CHUNKMEMSET_SAFE(out, dist, len, (unsigned)((safe - out)));
261265
} else {
262266
/* Whole reference is in range of current output. No range checks are
263267
necessary because we start with room for at least 258 bytes of output,
@@ -267,7 +271,7 @@ void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
267271
if (dist >= len || dist >= state->chunksize)
268272
out = CHUNKCOPY(out, out - dist, len);
269273
else
270-
out = CHUNKMEMSET(out, dist, len);
274+
out = CHUNKMEMSET(out, out - dist, len);
271275
}
272276
} else if ((op & 64) == 0) { /* 2nd level distance code */
273277
here = dcode + here->val + BITS(op);

0 commit comments

Comments
 (0)