
Commit b2192e6

Reuse fold_4 and add fold_12

1 parent 2c4ca03 commit b2192e6

2 files changed: 50 additions & 99 deletions

arch/x86/crc32_fold_pclmulqdq_tpl.h

Lines changed: 8 additions & 99 deletions
@@ -107,8 +107,6 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint

     /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
     while (len >= 512 + 64 + 16*8) {
-        __m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
-        __m128i shift1568_shift1504 = _mm_set_epi64x(0xF5E48C85, 0x596C8D81);
         __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
         __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
         __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
@@ -140,19 +138,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);

-        // fold 12x because we took an extra 8 values to begin
-        __m128i fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift1568_shift1504, 0x11);
-        __m128i fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift1568_shift1504, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        __m128i fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift1568_shift1504, 0x11);
-        __m128i fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift1568_shift1504, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        __m128i fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift1568_shift1504, 0x11);
-        __m128i fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift1568_shift1504, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        __m128i fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift1568_shift1504, 0x11);
-        __m128i fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift1568_shift1504, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -174,18 +160,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);

-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -208,18 +183,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);

-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -242,18 +206,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);

-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -276,18 +229,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);

-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -310,18 +252,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);

-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -344,18 +275,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);

-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -377,18 +297,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);

-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
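Note on the removed blocks (not part of the commit, my own summary): each deleted block performed the same four-register fold, carry-less multiplying every 128-bit CRC accumulator against the two lanes of a shift constant and XORing the products back together. The constant names suggest the fold distances: shift1568_shift1504 is 12 × 128 = 1536 bits (±32), applied once per iteration because the loop preloads an extra eight chorba vectors, while shift544_shift480 is 4 × 128 = 512 bits (±32) for the remaining steps. A minimal sketch of that shared step, with fold_block and k as hypothetical names (the committed helpers are fold_4 and fold_12 in crc32_pclmulqdq_tpl.h):

/* Hypothetical sketch of the per-register fold step; illustrative only. */
#include <immintrin.h>

static inline __m128i fold_block(__m128i crc, __m128i k) {
    __m128i hi = _mm_clmulepi64_si128(crc, k, 0x11);  /* high qword of crc x high lane of k */
    __m128i lo = _mm_clmulepi64_si128(crc, k, 0x00);  /* low qword of crc x low lane of k  */
    return _mm_xor_si128(hi, lo);                     /* recombine the two 128-bit products */
}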

arch/x86/crc32_pclmulqdq_tpl.h

Lines changed: 42 additions & 0 deletions
@@ -168,6 +168,48 @@ static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m1
     *xmm_crc3 = _mm_castps_si128(ps_res3);
 }

+static void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);
+    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
+    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
+    __m128 ps_res0, ps_res1, ps_res2, ps_res3;
+
+    x_tmp0 = *xmm_crc0;
+    x_tmp1 = *xmm_crc1;
+    x_tmp2 = *xmm_crc2;
+    x_tmp3 = *xmm_crc3;
+
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01);
+    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold12, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_t0 = _mm_castsi128_ps(x_tmp0);
+    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
+
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01);
+    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold12, 0x10);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_t1 = _mm_castsi128_ps(x_tmp1);
+    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
+
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01);
+    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold12, 0x10);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_t2 = _mm_castsi128_ps(x_tmp2);
+    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
+
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01);
+    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold12, 0x10);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_t3 = _mm_castsi128_ps(x_tmp3);
+    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
+
+    *xmm_crc0 = _mm_castps_si128(ps_res0);
+    *xmm_crc1 = _mm_castps_si128(ps_res1);
+    *xmm_crc2 = _mm_castps_si128(ps_res2);
+    *xmm_crc3 = _mm_castps_si128(ps_res3);
+}
+
 static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
     0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
     0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
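One detail worth noting (my own check, not part of the commit): the new xmm_fold12 constant is the old shift1568_shift1504 pair with its 64-bit lanes swapped, and the selectors change from 0x11/0x00 to 0x01/0x10. Both forms pair the high qword of the CRC register with 0xF5E48C85 and the low qword with 0x596C8D81, so the products, and therefore the folded result, are identical. A small standalone program to confirm the equivalence (assumes a PCLMULQDQ-capable x86 and something like gcc -mpclmul):

/* Standalone equivalence check for the old inline fold vs. fold_12's
   constant/selector layout; not part of the zlib-ng sources. */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    __m128i crc = _mm_set_epi64x(0x0123456789abcdefULL, 0xfedcba9876543210ULL);

    /* Old form: shift1568_shift1504 with selectors 0x11 / 0x00 */
    __m128i k_old = _mm_set_epi64x(0xF5E48C85, 0x596C8D81);
    __m128i old_res = _mm_xor_si128(_mm_clmulepi64_si128(crc, k_old, 0x11),   /* hi(crc) * 0xF5E48C85 */
                                    _mm_clmulepi64_si128(crc, k_old, 0x00));  /* lo(crc) * 0x596C8D81 */

    /* New form: xmm_fold12 (lanes swapped) with selectors 0x01 / 0x10 */
    __m128i k_new = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);
    __m128i new_res = _mm_xor_si128(_mm_clmulepi64_si128(crc, k_new, 0x01),   /* hi(crc) * 0xF5E48C85 */
                                    _mm_clmulepi64_si128(crc, k_new, 0x10));  /* lo(crc) * 0x596C8D81 */

    printf("%s\n", memcmp(&old_res, &new_res, sizeof(old_res)) == 0 ? "identical" : "different");
    return 0;
}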
