@@ -107,8 +107,6 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
 
     /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
     while (len >= 512 + 64 + 16 * 8) {
-        __m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
-        __m128i shift1568_shift1504 = _mm_set_epi64x(0xF5E48C85, 0x596C8D81);
         __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
         __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
         __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
@@ -140,19 +138,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
 
-        // fold 12x because we took an extra 8 values to begin
-        __m128i fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift1568_shift1504, 0x11);
-        __m128i fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift1568_shift1504, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        __m128i fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift1568_shift1504, 0x11);
-        __m128i fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift1568_shift1504, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        __m128i fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift1568_shift1504, 0x11);
-        __m128i fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift1568_shift1504, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        __m128i fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift1568_shift1504, 0x11);
-        __m128i fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift1568_shift1504, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
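The definition of fold_12() is not part of the hunks shown here, but the block removed above pins down what it has to compute: one carry-less-multiply fold of each of the four accumulators through the shift1568_shift1504 constant pair that the first hunk deletes from the loop body. A minimal sketch of an equivalent helper follows; the fold_through name is introduced only for this sketch, and the project's actual implementation may be structured differently.

#include <immintrin.h>

/* Sketch only: reconstructed from the inline code removed in this commit,
 * not the project's actual fold_12() definition. */
static inline __m128i fold_through(__m128i crc, __m128i k) {
    /* Multiply the high and low 64-bit halves by the two constants in k,
     * then XOR the two products, exactly as the removed lines did. */
    return _mm_xor_si128(_mm_clmulepi64_si128(crc, k, 0x11),
                         _mm_clmulepi64_si128(crc, k, 0x00));
}

static inline void fold_12(__m128i *crc0, __m128i *crc1, __m128i *crc2, __m128i *crc3) {
    const __m128i shift1568_shift1504 = _mm_set_epi64x(0xF5E48C85, 0x596C8D81);
    *crc0 = fold_through(*crc0, shift1568_shift1504);
    *crc1 = fold_through(*crc1, shift1568_shift1504);
    *crc2 = fold_through(*crc2, shift1568_shift1504);
    *crc3 = fold_through(*crc3, shift1568_shift1504);
}

Moving the constants into the helpers is presumably why the first hunk can drop the two _mm_set_epi64x() calls that used to sit at the top of the while loop.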
@@ -174,18 +160,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
 
-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
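fold_4() plays the same role for the shorter fold: each of the remaining hunks collapses an identical twelve-statement block built on the shift544_shift480 pair into a single call. Under the same caveat as above (a reconstruction from the removed code, not the actual definition used by the project), a matching sketch looks like this:

#include <immintrin.h>

/* Sketch only: the real fold_4() is not shown in this diff and may differ.
 * Functionally it must fold all four accumulators through the
 * shift544_shift480 constants removed in the first hunk. */
static inline void fold_4(__m128i *crc0, __m128i *crc1, __m128i *crc2, __m128i *crc3) {
    const __m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
    __m128i *acc[4] = { crc0, crc1, crc2, crc3 };
    for (int i = 0; i < 4; i++) {
        /* high half times high constant, XORed with low half times low constant */
        __m128i hi = _mm_clmulepi64_si128(*acc[i], shift544_shift480, 0x11);
        __m128i lo = _mm_clmulepi64_si128(*acc[i], shift544_shift480, 0x00);
        *acc[i] = _mm_xor_si128(hi, lo);
    }
}

Whether the helpers are written as loops or fully unrolled, the effect of the refactor is the same: the hunks shown shrink by a net 91 lines, and the fold constants live in one place instead of being re-created on every pass of the while loop.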
@@ -208,18 +183,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
 
-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -242,18 +206,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
 
-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -276,18 +229,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
 
-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -310,18 +252,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
 
-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -344,18 +275,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
 
-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
@@ -377,18 +297,7 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
         xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
 
-        fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
-        fold_low1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x00);
-        xmm_crc0 = _mm_xor_si128(fold_high1, fold_low1);
-        fold_high2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x11);
-        fold_low2 = _mm_clmulepi64_si128(xmm_crc1, shift544_shift480, 0x00);
-        xmm_crc1 = _mm_xor_si128(fold_high2, fold_low2);
-        fold_high3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x11);
-        fold_low3 = _mm_clmulepi64_si128(xmm_crc2, shift544_shift480, 0x00);
-        xmm_crc2 = _mm_xor_si128(fold_high3, fold_low3);
-        fold_high4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x11);
-        fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
-        xmm_crc3 = _mm_xor_si128(fold_high4, fold_low4);
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 #ifdef COPY
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);