Skip to content

Commit e128537

Browse files
nmoinvaz authored and Dead2 committed
Use static inline functions for crc32 folding load/save.
1 parent d802e89 commit e128537

1 file changed

Lines changed: 34 additions & 39 deletions

File tree

arch/x86/crc32_fold_pclmulqdq.c

Lines changed: 34 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,6 @@
2525

2626
#include "../../crc32_fold.h"
2727

28-
Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
29-
/* CRC_SAVE */
30-
_mm_storeu_si128((__m128i *)crc->fold + 0, _mm_cvtsi32_si128(0x9db42487));
31-
_mm_storeu_si128((__m128i *)crc->fold + 1, _mm_setzero_si128());
32-
_mm_storeu_si128((__m128i *)crc->fold + 2, _mm_setzero_si128());
33-
_mm_storeu_si128((__m128i *)crc->fold + 3, _mm_setzero_si128());
34-
35-
return 0;
36-
}
37-
3828
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
3929
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
4030
0x00000001, 0xc6e41596);
@@ -227,24 +217,45 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
227217
*xmm_crc3 = _mm_castps_si128(ps_res);
228218
}
229219

220+
/* Load the four 128-bit CRC folding accumulators from the fold buffer.
 * Uses aligned loads, so `fold` must be 16-byte aligned — TODO confirm the
 * crc32_fold storage guarantees this (the paired save uses unaligned stores). */
static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
    __m128i *dst[4] = { fold0, fold1, fold2, fold3 };
    for (int i = 0; i < 4; i++) {
        *dst[i] = _mm_load_si128(fold + i);
    }
}
226+
/* Store the four 128-bit CRC folding accumulators back to the fold buffer.
 * Uses unaligned stores, so `fold` carries no alignment requirement here. */
static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1, __m128i fold2, __m128i fold3) {
    const __m128i vals[4] = { fold0, fold1, fold2, fold3 };
    for (int i = 0; i < 4; i++) {
        _mm_storeu_si128(fold + i, vals[i]);
    }
}
233+
/* Store the partial-block register into the fifth 16-byte slot of the fold
 * buffer, directly after the four accumulators. Aligned store: `fold` must
 * be 16-byte aligned — TODO confirm against the crc32_fold definition. */
static inline void crc32_fold_save_partial(__m128i *fold, __m128i foldp) {
    _mm_store_si128(&fold[4], foldp);
}
237+
238+
Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
239+
__m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
240+
__m128i xmm_zero = _mm_setzero_si128();
241+
crc32_fold_save((__m128i *)&crc->fold, xmm_crc0, xmm_zero, xmm_zero, xmm_zero);
242+
return 0;
243+
}
244+
230245
Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
231246
unsigned long algn_diff;
232247
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
248+
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part;
233249
char ALIGNED_(16) partial_buf[16] = { 0 };
234250

235-
/* CRC_LOAD */
236-
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
237-
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
238-
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
239-
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
240-
__m128i xmm_crc_part;
251+
crc32_fold_load((__m128i *)&crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
241252

242253
if (len < 16) {
243254
if (len == 0)
244255
return;
245256

246257
memcpy(partial_buf, src, len);
247-
xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
258+
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
248259
memcpy(dst, partial_buf, len);
249260
goto partial;
250261
}
@@ -264,19 +275,11 @@ Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const u
264275
}
265276

266277
while (len >= 64) {
267-
/* CRC_LOAD */
268-
xmm_t0 = _mm_load_si128((__m128i *)src);
269-
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
270-
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
271-
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
278+
crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
272279

273280
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
274281

275-
/* CRC_SAVE */
276-
_mm_storeu_si128((__m128i *)dst, xmm_t0);
277-
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
278-
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
279-
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
282+
crc32_fold_save((__m128i *)dst, xmm_t0, xmm_t1, xmm_t2, xmm_t3);
280283

281284
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
282285
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
@@ -356,12 +359,8 @@ Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const u
356359
partial:
357360
partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
358361
done:
359-
/* CRC_SAVE */
360-
_mm_storeu_si128((__m128i *)crc->fold + 0, xmm_crc0);
361-
_mm_storeu_si128((__m128i *)crc->fold + 1, xmm_crc1);
362-
_mm_storeu_si128((__m128i *)crc->fold + 2, xmm_crc2);
363-
_mm_storeu_si128((__m128i *)crc->fold + 3, xmm_crc3);
364-
_mm_storeu_si128((__m128i *)crc->fold + 4, xmm_crc_part);
362+
crc32_fold_save((__m128i *)&crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3);
363+
crc32_fold_save_partial((__m128i *)&crc->fold, xmm_crc_part);
365364
}
366365

367366
static const unsigned ALIGNED_(16) crc_k[] = {
@@ -384,14 +383,10 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
384383
Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) {
385384
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
386385
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
387-
386+
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
388387
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
389388

390-
/* CRC_LOAD */
391-
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
392-
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
393-
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
394-
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
389+
crc32_fold_load((__m128i *)&crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
395390

396391
/*
397392
* k1

0 commit comments

Comments
 (0)