@@ -107,25 +107,25 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint


     while (len >= 512 + 64 + 16 * 8) {
-        __m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
-        // __m128i shift672_shift608 = _mm_set_epi64x(0xAE0B5394, 0x1C279815);
-        // __m128i shift800_shift736 = _mm_set_epi64x(0x57C54819, 0xDF068DC2);
-        // __m128i shift1568_shift1504 = _mm_set_epi64x(0x910EEEC1, 0x33FFF533);
-        __m128i shift1568_shift1504 = _mm_set_epi64x(0xF5E48C85, 0x596C8D81);
-        __m128i bonus8 = _mm_loadu_si128(src + (16 * 4));
-        __m128i bonus7 = _mm_loadu_si128(src + (16 * 5));
-        __m128i bonus6 = _mm_loadu_si128(src + (16 * 6));
-        __m128i bonus5 = _mm_loadu_si128(src + (16 * 7));
-        __m128i bonus4 = _mm_loadu_si128(src + (16 * 8));
-        __m128i bonus3 = _mm_loadu_si128(src + (16 * 9));
-        __m128i bonus2 = _mm_loadu_si128(src + (16 * 10)) ^ bonus8;
-        __m128i bonus1 = _mm_loadu_si128(src + (16 * 11)) ^ bonus7;
+        __m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
+        // __m128i shift672_shift608 = _mm_set_epi64x(0xAE0B5394, 0x1C279815);
+        // __m128i shift800_shift736 = _mm_set_epi64x(0x57C54819, 0xDF068DC2);
+        // __m128i shift1568_shift1504 = _mm_set_epi64x(0x910EEEC1, 0x33FFF533);
+        __m128i shift1568_shift1504 = _mm_set_epi64x(0xF5E48C85, 0x596C8D81);
+        __m128i bonus8 = _mm_loadu_si128((__m128i *)src);
+        __m128i bonus7 = _mm_loadu_si128((__m128i *)src + 1);
+        __m128i bonus6 = _mm_loadu_si128((__m128i *)src + 2);
+        __m128i bonus5 = _mm_loadu_si128((__m128i *)src + 3);
+        __m128i bonus4 = _mm_loadu_si128((__m128i *)src + 4);
+        __m128i bonus3 = _mm_loadu_si128((__m128i *)src + 5);
+        __m128i bonus2 = _mm_loadu_si128((__m128i *)src + 6) ^ bonus8;
+        __m128i bonus1 = _mm_loadu_si128((__m128i *)src + 7) ^ bonus7;
         src += 16 * 8;
         len -= 16 * 8;
-        xmm_t0 = _mm_loadu_si128(src + (16 * 4)) ^ bonus6;
-        xmm_t1 = _mm_loadu_si128(src + (16 * 5)) ^ bonus5 ^ bonus8;
-        xmm_t2 = _mm_loadu_si128(src + (16 * 6)) ^ bonus4 ^ bonus8 ^ bonus7;
-        xmm_t3 = _mm_loadu_si128(src + (16 * 7)) ^ bonus3 ^ bonus7 ^ bonus6;
+        xmm_t0 = _mm_loadu_si128((__m128i *)src + (4 - 4)) ^ bonus6;
+        xmm_t1 = _mm_loadu_si128((__m128i *)src + (5 - 4)) ^ bonus5 ^ bonus8;
+        xmm_t2 = _mm_loadu_si128((__m128i *)src + (6 - 4)) ^ bonus4 ^ bonus8 ^ bonus7;
+        xmm_t3 = _mm_loadu_si128((__m128i *)src + (7 - 4)) ^ bonus3 ^ bonus7 ^ bonus6;

         // now we fold xmm_crc0 onto xmm_crc1
         // fold 8x because we stole a value
@@ -145,10 +145,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         __m128i fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift1568_shift1504, 0x00);
         xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
-        xmm_t0 = _mm_loadu_si128(src + (16 * 8)) ^ bonus2 ^ bonus6 ^ bonus5;
-        xmm_t1 = _mm_loadu_si128(src + (16 * 9)) ^ bonus1 ^ bonus4 ^ bonus5;
-        xmm_t2 = _mm_loadu_si128(src + (16 * 10)) ^ bonus3 ^ bonus4;
-        xmm_t3 = _mm_loadu_si128(src + (16 * 11)) ^ bonus2 ^ bonus3;
+        xmm_t0 = _mm_loadu_si128((__m128i *)src + (8 - 4)) ^ bonus2 ^ bonus6 ^ bonus5;
+        xmm_t1 = _mm_loadu_si128((__m128i *)src + (9 - 4)) ^ bonus1 ^ bonus4 ^ bonus5;
+        xmm_t2 = _mm_loadu_si128((__m128i *)src + (10 - 4)) ^ bonus3 ^ bonus4;
+        xmm_t3 = _mm_loadu_si128((__m128i *)src + (11 - 4)) ^ bonus2 ^ bonus3;

         // now we fold xmm_crc0 onto xmm_crc1
         fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -167,10 +167,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
         xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
-        xmm_t0 = _mm_loadu_si128(src + (16 * 12)) ^ bonus1 ^ bonus2 ^ bonus8;
-        xmm_t1 = _mm_loadu_si128(src + (16 * 13)) ^ bonus1 ^ bonus7;
-        xmm_t2 = _mm_loadu_si128(src + (16 * 14)) ^ bonus6;
-        xmm_t3 = _mm_loadu_si128(src + (16 * 15)) ^ bonus5;
+        xmm_t0 = _mm_loadu_si128((__m128i *)src + (12 - 4)) ^ bonus1 ^ bonus2 ^ bonus8;
+        xmm_t1 = _mm_loadu_si128((__m128i *)src + (13 - 4)) ^ bonus1 ^ bonus7;
+        xmm_t2 = _mm_loadu_si128((__m128i *)src + (14 - 4)) ^ bonus6;
+        xmm_t3 = _mm_loadu_si128((__m128i *)src + (15 - 4)) ^ bonus5;

         // now we fold xmm_crc0 onto xmm_crc1
         fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -189,10 +189,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
         xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
-        xmm_t0 = _mm_loadu_si128(src + (16 * 16)) ^ bonus4 ^ bonus8;
-        xmm_t1 = _mm_loadu_si128(src + (16 * 17)) ^ bonus3 ^ bonus8 ^ bonus7;
-        xmm_t2 = _mm_loadu_si128(src + (16 * 18)) ^ bonus2 ^ bonus8 ^ bonus7 ^ bonus6;
-        xmm_t3 = _mm_loadu_si128(src + (16 * 19)) ^ bonus1 ^ bonus7 ^ bonus6 ^ bonus5;
+        xmm_t0 = _mm_loadu_si128((__m128i *)src + (16 - 4)) ^ bonus4 ^ bonus8;
+        xmm_t1 = _mm_loadu_si128((__m128i *)src + (17 - 4)) ^ bonus3 ^ bonus8 ^ bonus7;
+        xmm_t2 = _mm_loadu_si128((__m128i *)src + (18 - 4)) ^ bonus2 ^ bonus8 ^ bonus7 ^ bonus6;
+        xmm_t3 = _mm_loadu_si128((__m128i *)src + (19 - 4)) ^ bonus1 ^ bonus7 ^ bonus6 ^ bonus5;

         // now we fold xmm_crc0 onto xmm_crc1
         fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -211,10 +211,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
         xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
-        xmm_t0 = _mm_loadu_si128(src + (16 * 20)) ^ bonus4 ^ bonus8 ^ bonus6 ^ bonus5;
-        xmm_t1 = _mm_loadu_si128(src + (16 * 21)) ^ bonus3 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
-        xmm_t2 = _mm_loadu_si128(src + (16 * 22)) ^ bonus2 ^ bonus3 ^ bonus4 ^ bonus7 ^ bonus6;
-        xmm_t3 = _mm_loadu_si128(src + (16 * 23)) ^ bonus1 ^ bonus2 ^ bonus3 ^ bonus8 ^ bonus6 ^ bonus5;
+        xmm_t0 = _mm_loadu_si128((__m128i *)src + (20 - 4)) ^ bonus4 ^ bonus8 ^ bonus6 ^ bonus5;
+        xmm_t1 = _mm_loadu_si128((__m128i *)src + (21 - 4)) ^ bonus3 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
+        xmm_t2 = _mm_loadu_si128((__m128i *)src + (22 - 4)) ^ bonus2 ^ bonus3 ^ bonus4 ^ bonus7 ^ bonus6;
+        xmm_t3 = _mm_loadu_si128((__m128i *)src + (23 - 4)) ^ bonus1 ^ bonus2 ^ bonus3 ^ bonus8 ^ bonus6 ^ bonus5;

         // now we fold xmm_crc0 onto xmm_crc1
         fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -233,10 +233,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
         xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
-        xmm_t0 = _mm_loadu_si128(src + (16 * 24)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
-        xmm_t1 = _mm_loadu_si128(src + (16 * 25)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus7 ^ bonus6;
-        xmm_t2 = _mm_loadu_si128(src + (16 * 26)) ^ bonus2 ^ bonus3 ^ bonus8 ^ bonus6 ^ bonus5;
-        xmm_t3 = _mm_loadu_si128(src + (16 * 27)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
+        xmm_t0 = _mm_loadu_si128((__m128i *)src + (24 - 4)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
+        xmm_t1 = _mm_loadu_si128((__m128i *)src + (25 - 4)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus7 ^ bonus6;
+        xmm_t2 = _mm_loadu_si128((__m128i *)src + (26 - 4)) ^ bonus2 ^ bonus3 ^ bonus8 ^ bonus6 ^ bonus5;
+        xmm_t3 = _mm_loadu_si128((__m128i *)src + (27 - 4)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;

         // now we fold xmm_crc0 onto xmm_crc1
         fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -255,10 +255,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
         xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
-        xmm_t0 = _mm_loadu_si128(src + (16 * 28)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus6;
-        xmm_t1 = _mm_loadu_si128(src + (16 * 29)) ^ bonus2 ^ bonus3 ^ bonus7 ^ bonus6 ^ bonus5;
-        xmm_t2 = _mm_loadu_si128(src + (16 * 30)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus6 ^ bonus5;
-        xmm_t3 = _mm_loadu_si128(src + (16 * 31)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus5;
+        xmm_t0 = _mm_loadu_si128((__m128i *)src + (28 - 4)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus6;
+        xmm_t1 = _mm_loadu_si128((__m128i *)src + (29 - 4)) ^ bonus2 ^ bonus3 ^ bonus7 ^ bonus6 ^ bonus5;
+        xmm_t2 = _mm_loadu_si128((__m128i *)src + (30 - 4)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus6 ^ bonus5;
+        xmm_t3 = _mm_loadu_si128((__m128i *)src + (31 - 4)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus5;

         // now we fold xmm_crc0 onto xmm_crc1
         fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -277,10 +277,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
         fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
         xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
-        xmm_t0 = _mm_loadu_si128(src + (16 * 32)) ^ bonus2 ^ bonus3 ^ bonus4;
-        xmm_t1 = _mm_loadu_si128(src + (16 * 33)) ^ bonus1 ^ bonus2 ^ bonus3;
-        xmm_t2 = _mm_loadu_si128(src + (16 * 34)) ^ bonus1 ^ bonus2;
-        xmm_t3 = _mm_loadu_si128(src + (16 * 35)) ^ bonus1;
+        xmm_t0 = _mm_loadu_si128((__m128i *)src + (32 - 4)) ^ bonus2 ^ bonus3 ^ bonus4;
+        xmm_t1 = _mm_loadu_si128((__m128i *)src + (33 - 4)) ^ bonus1 ^ bonus2 ^ bonus3;
+        xmm_t2 = _mm_loadu_si128((__m128i *)src + (34 - 4)) ^ bonus1 ^ bonus2;
+        xmm_t3 = _mm_loadu_si128((__m128i *)src + (35 - 4)) ^ bonus1;

         // now we fold xmm_crc0 onto xmm_crc1
         fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
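Why the bonus loads are sound, as far as I can tell from the code: carry-less folding is linear over GF(2), so fold(x ^ y) = fold(x) ^ fold(y). The eight bonusN vectors are the 16 * 8 bytes the loop consumes up front ("fold 8x because we stole a value"), and rather than folding them through every step they are XORed directly into whichever later xmm_tN blocks they would align with after the corresponding shift; the changing bonus masks on each load are that schedule written out by hand. The constant names appear to confirm the distances: 1568 - 544 = 1504 - 480 = 1024 bits, exactly the 128 stolen bytes, which is why only the first fold of each pass uses the wider shift1568_shift1504 pair while the rest use shift544_shift480.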