@@ -62,20 +62,22 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
6262 _mm256_storeu_si256 ((__m256i * )out , * chunk );
6363}
6464
65- static inline void storechunk_mask (uint8_t * out , mask_t mask , chunk_t * chunk ) {
66- _mm256_mask_storeu_epi8 (out , mask , * chunk );
67- }
68-
6965static inline uint8_t * CHUNKCOPY (uint8_t * out , uint8_t const * from , unsigned len ) {
7066 Assert (len > 0 , "chunkcopy should never have a length 0" );
7167
72- unsigned rem = len % sizeof (chunk_t );
73- mask_t rem_mask = gen_mask (rem );
74-
75- /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
7668 chunk_t chunk ;
69+ uint32_t rem = len % sizeof (chunk_t );
70+
71+ if (len < sizeof (chunk_t )) {
72+ mask_t rem_mask = gen_mask (rem );
73+ chunk = _mm256_maskz_loadu_epi8 (rem_mask , from );
74+ _mm256_mask_storeu_epi8 (out , rem_mask , chunk );
75+ return out + rem ;
76+ }
77+
7778 loadchunk (from , & chunk );
78- _mm256_mask_storeu_epi8 (out , rem_mask , chunk );
79+ rem = (rem == 0 ) ? sizeof (chunk_t ) : rem ;
80+ storechunk (out , & chunk );
7981 out += rem ;
8082 from += rem ;
8183 len -= rem ;
@@ -122,10 +124,6 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
122124 return ret_vec ;
123125}
124126
125- static inline void loadhalfchunk (uint8_t const * s , halfchunk_t * chunk ) {
126- * chunk = _mm_loadu_si128 ((__m128i * )s );
127- }
128-
129127static inline void storehalfchunk (uint8_t * out , halfchunk_t * chunk ) {
130128 _mm_storeu_si128 ((__m128i * )out , * chunk );
131129}
@@ -151,27 +149,18 @@ static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, u
151149
152150static inline uint8_t * HALFCHUNKCOPY (uint8_t * out , uint8_t const * from , unsigned len ) {
153151 Assert (len > 0 , "chunkcopy should never have a length 0" );
154-
155- unsigned rem = len % sizeof (halfchunk_t );
156- halfmask_t rem_mask = gen_half_mask (rem );
157-
158- /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
159152 halfchunk_t chunk ;
160- loadhalfchunk (from , & chunk );
161- _mm_mask_storeu_epi8 (out , rem_mask , chunk );
162- out += rem ;
163- from += rem ;
164- len -= rem ;
165153
166- while (len > 0 ) {
167- loadhalfchunk (from , & chunk );
168- storehalfchunk (out , & chunk );
169- out += sizeof (halfchunk_t );
170- from += sizeof (halfchunk_t );
171- len -= sizeof (halfchunk_t );
154+ uint32_t rem = len % sizeof (halfchunk_t );
155+ if (rem == 0 ) {
156+ rem = sizeof (halfchunk_t );
172157 }
173158
174- return out ;
159+ halfmask_t rem_mask = gen_half_mask (rem );
160+ chunk = _mm_maskz_loadu_epi8 (rem_mask , from );
161+ _mm_mask_storeu_epi8 (out , rem_mask , chunk );
162+
163+ return out + rem ;
175164}
176165
/* Dispatch alias: this translation unit supplies the AVX-512 chunkset variant. */
#define CHUNKSIZE chunksize_avx512
0 commit comments