
Commit 43d74a2

KungFuJesus authored and Dead2 committed
Improve pipelining for AVX512 chunking
For reasons that aren't entirely clear, using the masked writes here did not pipeline very well: either setting up the mask stalled things, or masked moves have trouble overlapping regular moves. Simply putting the masked moves behind a branch that is rarely taken seemed to do the trick in improving ILP. While here, the masked loads were put behind the same branch, in case there was ever a hazard from overreading.
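As a rough illustration of the pattern the message describes (a hedged sketch, not the zlib-ng code: the function names, the fixed 32-byte chunk size, and the use of _bzhi_u32 for mask generation are assumptions made here for a self-contained example), the old path issued a masked store on every call, while the new path only touches mask registers on a rarely-taken short-length branch:

#include <immintrin.h>
#include <stdint.h>

/* Old pattern: build a mask and issue a masked store on every call, even when
 * len is a full 32-byte chunk. Assumes reading a whole chunk from `from` is safe. */
static inline void copy_tail_masked(uint8_t *out, const uint8_t *from, unsigned len) {
    __mmask32 m = _bzhi_u32(~0u, len);              /* low `len` bits set (BMI2) */
    __m256i v = _mm256_loadu_si256((const __m256i *)from);
    _mm256_mask_storeu_epi8(out, m, v);
}

/* New pattern: the common full-chunk case uses plain unaligned load/store that
 * the core can overlap freely; the masked load and store both sit behind a
 * branch that is rarely taken. Requires AVX512BW + AVX512VL (and BMI2). */
static inline void copy_tail_branched(uint8_t *out, const uint8_t *from, unsigned len) {
    if (len < 32) {                                 /* rare short-tail case */
        __mmask32 m = _bzhi_u32(~0u, len);
        __m256i v = _mm256_maskz_loadu_epi8(m, from);
        _mm256_mask_storeu_epi8(out, m, v);
        return;
    }
    _mm256_storeu_si256((__m256i *)out, _mm256_loadu_si256((const __m256i *)from));
}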
1 parent a4e7c34 commit 43d74a2

2 files changed

Lines changed: 19 additions & 34 deletions

File tree

arch/x86/chunkset_avx512.c

Lines changed: 19 additions & 30 deletions
@@ -62,20 +62,22 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
     _mm256_storeu_si256((__m256i *)out, *chunk);
 }
 
-static inline void storechunk_mask(uint8_t *out, mask_t mask, chunk_t *chunk) {
-    _mm256_mask_storeu_epi8(out, mask, *chunk);
-}
-
 static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
     Assert(len > 0, "chunkcopy should never have a length 0");
 
-    unsigned rem = len % sizeof(chunk_t);
-    mask_t rem_mask = gen_mask(rem);
-
-    /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
     chunk_t chunk;
+    uint32_t rem = len % sizeof(chunk_t);
+
+    if (len < sizeof(chunk_t)) {
+        mask_t rem_mask = gen_mask(rem);
+        chunk = _mm256_maskz_loadu_epi8(rem_mask, from);
+        _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+        return out + rem;
+    }
+
     loadchunk(from, &chunk);
-    _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+    rem = (rem == 0) ? sizeof(chunk_t) : rem;
+    storechunk(out, &chunk);
     out += rem;
     from += rem;
     len -= rem;
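One subtlety of the rewritten CHUNKCOPY is the first pass when len is at least a chunk but not a multiple of one: it stores a full chunk, yet advances out and from by only rem bytes, so the remaining length becomes an exact multiple of the chunk size and every later pass can use plain, unmasked loads and stores. The worked example below is editorial, assuming a 32-byte chunk_t and an illustrative length of 70; it only traces the arithmetic, and the "writable slack past len" behavior it relies on is the usual chunked-copy assumption, not something shown in this diff.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical trace of the new CHUNKCOPY length handling, assuming a 32-byte
 * chunk_t. Not zlib-ng code; it only prints how rem steers the first advance. */
int main(void) {
    const uint32_t chunk_sz = 32;
    uint32_t len = 70;                         /* illustrative length */
    uint32_t rem = len % chunk_sz;             /* 70 % 32 = 6 */

    /* First pass: a full 32-byte store, but advance by rem (or a whole chunk
     * when rem == 0) so the remaining length becomes a multiple of 32. */
    uint32_t advance = (rem == 0) ? chunk_sz : rem;
    len -= advance;
    printf("first store: 32 bytes, advance %u, %u bytes left (%u full chunks)\n",
           advance, len, len / chunk_sz);      /* -> advance 6, 64 left, 2 chunks */
    return 0;
}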
@@ -122,10 +124,6 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
     return ret_vec;
 }
 
-static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
-    *chunk = _mm_loadu_si128((__m128i *)s);
-}
-
 static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
     _mm_storeu_si128((__m128i *)out, *chunk);
 }
@@ -151,27 +149,18 @@ static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, u
 
 static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
     Assert(len > 0, "chunkcopy should never have a length 0");
-
-    unsigned rem = len % sizeof(halfchunk_t);
-    halfmask_t rem_mask = gen_half_mask(rem);
-
-    /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
     halfchunk_t chunk;
-    loadhalfchunk(from, &chunk);
-    _mm_mask_storeu_epi8(out, rem_mask, chunk);
-    out += rem;
-    from += rem;
-    len -= rem;
 
-    while (len > 0) {
-        loadhalfchunk(from, &chunk);
-        storehalfchunk(out, &chunk);
-        out += sizeof(halfchunk_t);
-        from += sizeof(halfchunk_t);
-        len -= sizeof(halfchunk_t);
+    uint32_t rem = len % sizeof(halfchunk_t);
+    if (rem == 0) {
+        rem = sizeof(halfchunk_t);
     }
 
-    return out;
+    halfmask_t rem_mask = gen_half_mask(rem);
+    chunk = _mm_maskz_loadu_epi8(rem_mask, from);
+    _mm_mask_storeu_epi8(out, rem_mask, chunk);
+
+    return out + rem;
 }
 
 #define CHUNKSIZE chunksize_avx512

chunkset_tpl.h

Lines changed: 0 additions & 4 deletions
@@ -219,11 +219,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
 rem_bytes:
 #endif
     if (len) {
-#ifndef HAVE_MASKED_READWRITE
         memcpy(out, &chunk_load, len);
-#else
-        storechunk_mask(out, gen_mask(len), &chunk_load);
-#endif
         out += len;
     }
 