Skip to content

Commit b52e703

Browse files
KungFuJesus authored and Dead2 committed
Simplify avx2 chunkset a bit
Put length 16 in the length checking ladder and take care of it there since it's also a simple case to handle. We kind of went out of our way to pretend 128 bit vectors didn't exist when using avx2 but this can be handled in a single instruction. Strangely the intrinsic uses vector register operands but the instruction itself assumes a memory operand for the source. This also means we don't have to handle this case in our "GET_CHUNK_MAG" function.
1 parent dae668d commit b52e703

2 files changed

Lines changed: 14 additions & 7 deletions

File tree

arch/x86/chunkset_avx2.c

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ typedef __m256i chunk_t;
1414
#define HAVE_CHUNKMEMSET_2
1515
#define HAVE_CHUNKMEMSET_4
1616
#define HAVE_CHUNKMEMSET_8
17+
#define HAVE_CHUNKMEMSET_16
1718
#define HAVE_CHUNK_MAG
1819

1920
/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
@@ -68,6 +69,10 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
6869
*chunk = _mm256_set1_epi64x(tmp);
6970
}
7071

72+
static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
73+
*chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
74+
}
75+
7176
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
7277
*chunk = _mm256_loadu_si256((__m256i *)s);
7378
}
@@ -99,10 +104,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
99104
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
100105
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
101106
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
102-
} else if (dist == 16) {
103-
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
104-
return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
105-
} else {
107+
} else {
106108
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
107109
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
108110
/* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */

chunkset_tpl.h

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -130,11 +130,16 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
130130
#ifdef HAVE_CHUNKMEMSET_8
131131
if (dist == 8) {
132132
chunkmemset_8(from, &chunk_load);
133-
} else if (dist == sizeof(chunk_t)) {
134-
loadchunk(from, &chunk_load);
135133
} else
136134
#endif
137-
{
135+
#ifdef HAVE_CHUNKMEMSET_16
136+
if (dist == 16) {
137+
chunkmemset_16(from, &chunk_load);
138+
} else
139+
#endif
140+
if (dist == sizeof(chunk_t)) {
141+
loadchunk(from, &chunk_load);
142+
} else {
138143
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
139144
}
140145

0 commit comments

Comments
 (0)