44
55#include "zbuild.h"
66#include <stdlib.h>
7+ #include <stdio.h>
78
89/* Returns the chunk size */
910Z_INTERNAL uint32_t CHUNKSIZE (void ) {
@@ -69,18 +70,18 @@ static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len)
6970static inline chunk_t GET_CHUNK_MAG (uint8_t * buf , uint32_t * chunk_rem , uint32_t dist ) {
7071 /* This code takes a string of length dist from "buf" and repeats
7172 * it as many times as can fit in a chunk_t (vector register) */
72- uint32_t cpy_dist ;
73- uint32_t bytes_remaining = sizeof (chunk_t );
73+ uint64_t cpy_dist ;
74+ uint64_t bytes_remaining = sizeof (chunk_t );
7475 chunk_t chunk_load ;
7576 uint8_t * cur_chunk = (uint8_t * )& chunk_load ;
7677 while (bytes_remaining ) {
7778 cpy_dist = MIN (dist , bytes_remaining );
78- memcpy (cur_chunk , buf , cpy_dist );
79+ memcpy (cur_chunk , buf , ( size_t ) cpy_dist );
7980 bytes_remaining -= cpy_dist ;
8081 cur_chunk += cpy_dist ;
8182 /* This allows us to bypass an expensive integer division since we're effectively
8283 * counting in this loop, anyway */
83- * chunk_rem = cpy_dist ;
84+ * chunk_rem = ( uint32_t ) cpy_dist ;
8485 }
8586
8687 return chunk_load ;
@@ -109,21 +110,33 @@ static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned
109110
110111/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
111112 Return OUT + LEN. */
112- static inline uint8_t * CHUNKMEMSET (uint8_t * out , unsigned dist , unsigned len ) {
113+ static inline uint8_t * CHUNKMEMSET (uint8_t * out , uint8_t * from , unsigned len ) {
113114 /* Debug performance related issues when len < sizeof(uint64_t):
114115 Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
115- Assert (dist > 0 , "chunkmemset cannot have a distance 0" );
116+ Assert (from != out , "chunkmemset cannot have a distance 0" );
116117
117- uint8_t * from = out - dist ;
118118 chunk_t chunk_load ;
119119 uint32_t chunk_mod = 0 ;
120120 uint32_t adv_amount ;
121+ int64_t sdist = out - from ;
122+ uint64_t dist = llabs (sdist );
123+
124+ /* We support the case where the bytes being read lie ahead of `out` in the buffer.
125+ * This now has to be handled here, though it wasn't _quite_ clear whether this rare
126+ * circumstance always needed to be handled or whether we are only seeing it now
127+ * because we are dispatching to this function more often */
128+ if (sdist < 0 && dist < len ) {
129+ /* Here the memmove semantics match perfectly, as when this happens we are
130+ * effectively sliding down the contents of memory by dist bytes */
131+ memmove (out , from , len );
132+ return out + len ;
133+ }
121134
122135 if (dist == 1 ) {
123136 memset (out , * from , len );
124137 return out + len ;
125- } else if (dist > sizeof (chunk_t )) {
126- return CHUNKCOPY (out , out - dist , len );
138+ } else if (dist >= sizeof (chunk_t )) {
139+ return CHUNKCOPY (out , from , len );
127140 }
128141
129142 /* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
@@ -135,33 +148,22 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
135148 * making the code a little smaller. */
136149#ifdef HAVE_HALF_CHUNK
137150 if (len <= sizeof (halfchunk_t )) {
138- if (dist > sizeof (halfchunk_t )) {
139- return HALFCHUNKCOPY (out , out - dist , len );
140- }
151+ if (dist >= sizeof (halfchunk_t ))
152+ return HALFCHUNKCOPY (out , from , len );
141153
142- halfchunk_t halfchunk_load ;
143-
144- if (dist == 2 ) {
145- halfchunkmemset_2 (from , & halfchunk_load );
146- } else if (dist == 4 ) {
147- halfchunkmemset_4 (from , & halfchunk_load );
148- } else if (dist == 8 ) {
149- halfchunkmemset_8 (from , & halfchunk_load );
150- } else if (dist == 16 ) {
151- loadhalfchunk (from , & halfchunk_load );
152- } else {
153- halfchunk_load = GET_HALFCHUNK_MAG (from , & chunk_mod , dist );
154- }
154+ if ((dist % 2 ) != 0 || dist == 6 ) {
155+ halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG (from , & chunk_mod , (unsigned )dist );
155156
156- adv_amount = sizeof (halfchunk_t ) - chunk_mod ;
157- while (len > = sizeof (halfchunk_t )) {
158- storehalfchunk (out , & halfchunk_load );
159- len -= adv_amount ;
160- out += adv_amount ;
161- }
157+ adv_amount = sizeof (halfchunk_t ) - chunk_mod ;
158+ if (len = = sizeof (halfchunk_t )) {
159+ storehalfchunk (out , & halfchunk_load );
160+ len -= adv_amount ;
161+ out += adv_amount ;
162+ }
162163
163- chunk_load = halfchunk2whole (halfchunk_load );
164- goto rem_bytes ;
164+ chunk_load = halfchunk2whole (& halfchunk_load );
165+ goto rem_bytes ;
166+ }
165167 }
166168#endif
167169
@@ -185,11 +187,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
185187 chunkmemset_16 (from , & chunk_load );
186188 } else
187189#endif
188- if (dist == sizeof (chunk_t )) {
189- loadchunk (from , & chunk_load );
190- } else {
191- chunk_load = GET_CHUNK_MAG (from , & chunk_mod , dist );
192- }
190+ chunk_load = GET_CHUNK_MAG (from , & chunk_mod , (unsigned )dist );
193191
194192 adv_amount = sizeof (chunk_t ) - chunk_mod ;
195193
@@ -221,7 +219,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
221219 return out ;
222220}
223221
224- Z_INTERNAL uint8_t * CHUNKMEMSET_SAFE (uint8_t * out , unsigned dist , unsigned len , unsigned left ) {
222+ Z_INTERNAL uint8_t * CHUNKMEMSET_SAFE (uint8_t * out , uint8_t * from , unsigned len , unsigned left ) {
225223#if !defined(UNALIGNED64_OK )
226224# if !defined(UNALIGNED_OK )
227225 static const uint32_t align_mask = 7 ;
@@ -231,23 +229,45 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len,
231229#endif
232230
233231 len = MIN (len , left );
234- uint8_t * from = out - dist ;
232+
235233#if !defined(UNALIGNED64_OK )
236234 while (((uintptr_t )out & align_mask ) && (len > 0 )) {
237235 * out ++ = * from ++ ;
238236 -- len ;
239237 -- left ;
240238 }
241239#endif
242- if (left < ( unsigned )( 3 * sizeof (chunk_t ))) {
240+ if (UNLIKELY ( left < sizeof (chunk_t ))) {
243241 while (len > 0 ) {
244242 * out ++ = * from ++ ;
245243 -- len ;
246244 }
245+
247246 return out ;
248247 }
248+
249249 if (len )
250- return CHUNKMEMSET (out , dist , len );
250+ out = CHUNKMEMSET (out , from , len );
251251
252252 return out ;
253253}
254+
255+ static inline uint8_t * CHUNKCOPY_SAFE (uint8_t * out , uint8_t * from , unsigned len , uint8_t * safe ) /* Copies from `from` to `out`, with len clamped to (safe - out); returns the advanced out pointer. */
256+ {
257+ if (out == from ) /* Source and destination coincide: nothing is copied. */
258+ return out + len ; /* Note: uses the caller's len, before any clamping against `safe`. */
259+
260+ uint64_t safelen = (safe - out ); /* Pointer difference stored as unsigned; presumably safe >= out -- confirm against callers. */
261+ len = MIN (len , (unsigned )safelen ); /* Clamp the copy length to safelen (safelen is narrowed to unsigned first). */
262+
263+ uint64_t from_dist = (uint64_t )llabs (safe - from ); /* Absolute distance between `from` and `safe`. */
264+ if (UNLIKELY (from_dist < sizeof (chunk_t ) || safelen < sizeof (chunk_t ))) { /* Either distance is under one chunk_t: take the bytewise path. */
265+ while (len -- ) { /* Plain byte-by-byte copy of the clamped length. */
266+ * out ++ = * from ++ ;
267+ }
268+
269+ return out ;
270+ }
271+
272+ return CHUNKMEMSET (out , from , len ); /* Otherwise delegate to CHUNKMEMSET with the clamped length. */
273+ }
0 commit comments