Skip to content

Commit 6a2aaa6

Browse files
committed
Fix pclmul bug
1 parent 63814ad commit 6a2aaa6

1 file changed

Lines changed: 45 additions & 45 deletions

File tree

arch/x86/crc32_fold_pclmulqdq_tpl.h

Lines changed: 45 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -107,25 +107,25 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
107107

108108

109109
while (len >= 512 + 64 + 16*8) {
110-
__m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
111-
// __m128i shift672_shift608 = _mm_set_epi64x(0xAE0B5394, 0x1C279815);
112-
// __m128i shift800_shift736 = _mm_set_epi64x(0x57C54819, 0xDF068DC2);
113-
// __m128i shift1568_shift1504 = _mm_set_epi64x(0x910EEEC1, 0x33FFF533);
114-
__m128i shift1568_shift1504 = _mm_set_epi64x(0xF5E48C85, 0x596C8D81);
115-
__m128i bonus8 = _mm_loadu_si128(src + (16 * 4));
116-
__m128i bonus7 = _mm_loadu_si128(src + (16 * 5));
117-
__m128i bonus6 = _mm_loadu_si128(src + (16 * 6));
118-
__m128i bonus5 = _mm_loadu_si128(src + (16 * 7));
119-
__m128i bonus4 = _mm_loadu_si128(src + (16 * 8));
120-
__m128i bonus3 = _mm_loadu_si128(src + (16 * 9));
121-
__m128i bonus2 = _mm_loadu_si128(src + (16 * 10)) ^ bonus8;
122-
__m128i bonus1 = _mm_loadu_si128(src + (16 * 11)) ^ bonus7;
110+
__m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
111+
// __m128i shift672_shift608 = _mm_set_epi64x(0xAE0B5394, 0x1C279815);
112+
// __m128i shift800_shift736 = _mm_set_epi64x(0x57C54819, 0xDF068DC2);
113+
// __m128i shift1568_shift1504 = _mm_set_epi64x(0x910EEEC1, 0x33FFF533);
114+
__m128i shift1568_shift1504 = _mm_set_epi64x(0xF5E48C85, 0x596C8D81);
115+
__m128i bonus8 = _mm_loadu_si128((__m128i *)src);
116+
__m128i bonus7 = _mm_loadu_si128((__m128i *)src + 1);
117+
__m128i bonus6 = _mm_loadu_si128((__m128i *)src + 2);
118+
__m128i bonus5 = _mm_loadu_si128((__m128i *)src + 3);
119+
__m128i bonus4 = _mm_loadu_si128((__m128i *)src + 4);
120+
__m128i bonus3 = _mm_loadu_si128((__m128i *)src + 5);
121+
__m128i bonus2 = _mm_loadu_si128((__m128i *)src + 6) ^ bonus8;
122+
__m128i bonus1 = _mm_loadu_si128((__m128i *)src + 7) ^ bonus7;
123123
src += 16*8;
124124
len -= 16*8;
125-
xmm_t0 = _mm_loadu_si128(src + (16 * 4)) ^ bonus6;
126-
xmm_t1 = _mm_loadu_si128(src + (16 * 5)) ^ bonus5 ^ bonus8;
127-
xmm_t2 = _mm_loadu_si128(src + (16 * 6)) ^ bonus4 ^ bonus8 ^ bonus7;
128-
xmm_t3 = _mm_loadu_si128(src + (16 * 7)) ^ bonus3 ^ bonus7 ^ bonus6;
125+
xmm_t0 = _mm_loadu_si128((__m128i *)src + (4-4)) ^ bonus6;
126+
xmm_t1 = _mm_loadu_si128((__m128i *)src + (5-4)) ^ bonus5 ^ bonus8;
127+
xmm_t2 = _mm_loadu_si128((__m128i *)src + (6-4)) ^ bonus4 ^ bonus8 ^ bonus7;
128+
xmm_t3 = _mm_loadu_si128((__m128i *)src + (7-4)) ^ bonus3 ^ bonus7 ^ bonus6;
129129

130130
// now we fold xmm_crc0 onto xmm_crc1
131131
// fold 8x because we stole a value
@@ -145,10 +145,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
145145
__m128i fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift1568_shift1504, 0x00);
146146
xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
147147
xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
148-
xmm_t0 = _mm_loadu_si128(src + (16 * 8)) ^ bonus2 ^ bonus6 ^ bonus5;
149-
xmm_t1 = _mm_loadu_si128(src + (16 * 9)) ^ bonus1 ^ bonus4 ^ bonus5;
150-
xmm_t2 = _mm_loadu_si128(src + (16 * 10)) ^ bonus3 ^ bonus4;
151-
xmm_t3 = _mm_loadu_si128(src + (16 * 11)) ^ bonus2 ^ bonus3;
148+
xmm_t0 = _mm_loadu_si128((__m128i *)src + (8-4)) ^ bonus2 ^ bonus6 ^ bonus5;
149+
xmm_t1 = _mm_loadu_si128((__m128i *)src + (9-4)) ^ bonus1 ^ bonus4 ^ bonus5;
150+
xmm_t2 = _mm_loadu_si128((__m128i *)src + (10-4)) ^ bonus3 ^ bonus4;
151+
xmm_t3 = _mm_loadu_si128((__m128i *)src + (11-4)) ^ bonus2 ^ bonus3;
152152

153153
// now we fold xmm_crc0 onto xmm_crc1
154154
fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -167,10 +167,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
167167
fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
168168
xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
169169
xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
170-
xmm_t0 = _mm_loadu_si128(src + (16 * 12)) ^ bonus1 ^ bonus2 ^ bonus8;
171-
xmm_t1 = _mm_loadu_si128(src + (16 * 13)) ^ bonus1 ^ bonus7;
172-
xmm_t2 = _mm_loadu_si128(src + (16 * 14)) ^ bonus6;
173-
xmm_t3 = _mm_loadu_si128(src + (16 * 15)) ^ bonus5;
170+
xmm_t0 = _mm_loadu_si128((__m128i *)src + (12-4)) ^ bonus1 ^ bonus2 ^ bonus8;
171+
xmm_t1 = _mm_loadu_si128((__m128i *)src + (13-4)) ^ bonus1 ^ bonus7;
172+
xmm_t2 = _mm_loadu_si128((__m128i *)src + (14-4)) ^ bonus6;
173+
xmm_t3 = _mm_loadu_si128((__m128i *)src + (15-4)) ^ bonus5;
174174

175175
// now we fold xmm_crc0 onto xmm_crc1
176176
fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -189,10 +189,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
189189
fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
190190
xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
191191
xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
192-
xmm_t0 = _mm_loadu_si128(src + (16 * 16)) ^ bonus4 ^ bonus8;
193-
xmm_t1 = _mm_loadu_si128(src + (16 * 17)) ^ bonus3 ^ bonus8 ^ bonus7;
194-
xmm_t2 = _mm_loadu_si128(src + (16 * 18)) ^ bonus2 ^ bonus8 ^ bonus7 ^ bonus6;
195-
xmm_t3 = _mm_loadu_si128(src + (16 * 19)) ^ bonus1 ^ bonus7 ^ bonus6 ^ bonus5;
192+
xmm_t0 = _mm_loadu_si128((__m128i *)src + (16-4)) ^ bonus4 ^ bonus8;
193+
xmm_t1 = _mm_loadu_si128((__m128i *)src + (17-4)) ^ bonus3 ^ bonus8 ^ bonus7;
194+
xmm_t2 = _mm_loadu_si128((__m128i *)src + (18-4)) ^ bonus2 ^ bonus8 ^ bonus7 ^ bonus6;
195+
xmm_t3 = _mm_loadu_si128((__m128i *)src + (19-4)) ^ bonus1 ^ bonus7 ^ bonus6 ^ bonus5;
196196

197197
// now we fold xmm_crc0 onto xmm_crc1
198198
fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -211,10 +211,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
211211
fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
212212
xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
213213
xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
214-
xmm_t0 = _mm_loadu_si128(src + (16 * 20)) ^ bonus4 ^ bonus8 ^ bonus6 ^ bonus5;
215-
xmm_t1 = _mm_loadu_si128(src + (16 * 21)) ^ bonus3 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
216-
xmm_t2 = _mm_loadu_si128(src + (16 * 22)) ^ bonus2 ^ bonus3 ^ bonus4 ^ bonus7 ^ bonus6;
217-
xmm_t3 = _mm_loadu_si128(src + (16 * 23)) ^ bonus1 ^ bonus2 ^ bonus3 ^ bonus8 ^ bonus6 ^ bonus5;
214+
xmm_t0 = _mm_loadu_si128((__m128i *)src + (20-4)) ^ bonus4 ^ bonus8 ^ bonus6 ^ bonus5;
215+
xmm_t1 = _mm_loadu_si128((__m128i *)src + (21-4)) ^ bonus3 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
216+
xmm_t2 = _mm_loadu_si128((__m128i *)src + (22-4)) ^ bonus2 ^ bonus3 ^ bonus4 ^ bonus7 ^ bonus6;
217+
xmm_t3 = _mm_loadu_si128((__m128i *)src + (23-4)) ^ bonus1 ^ bonus2 ^ bonus3 ^ bonus8 ^ bonus6 ^ bonus5;
218218

219219
// now we fold xmm_crc0 onto xmm_crc1
220220
fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -233,10 +233,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
233233
fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
234234
xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
235235
xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
236-
xmm_t0 = _mm_loadu_si128(src + (16 * 24)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
237-
xmm_t1 = _mm_loadu_si128(src + (16 * 25)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus7 ^ bonus6;
238-
xmm_t2 = _mm_loadu_si128(src + (16 * 26)) ^ bonus2 ^ bonus3 ^ bonus8 ^ bonus6 ^ bonus5;
239-
xmm_t3 = _mm_loadu_si128(src + (16 * 27)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
236+
xmm_t0 = _mm_loadu_si128((__m128i *)src + (24-4)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
237+
xmm_t1 = _mm_loadu_si128((__m128i *)src + (25-4)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus7 ^ bonus6;
238+
xmm_t2 = _mm_loadu_si128((__m128i *)src + (26-4)) ^ bonus2 ^ bonus3 ^ bonus8 ^ bonus6 ^ bonus5;
239+
xmm_t3 = _mm_loadu_si128((__m128i *)src + (27-4)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus5;
240240

241241
// now we fold xmm_crc0 onto xmm_crc1
242242
fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -255,10 +255,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
255255
fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
256256
xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
257257
xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
258-
xmm_t0 = _mm_loadu_si128(src + (16 * 28)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus6;
259-
xmm_t1 = _mm_loadu_si128(src + (16 * 29)) ^ bonus2 ^ bonus3 ^ bonus7 ^ bonus6 ^ bonus5;
260-
xmm_t2 = _mm_loadu_si128(src + (16 * 30)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus6 ^ bonus5;
261-
xmm_t3 = _mm_loadu_si128(src + (16 * 31)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus5;
258+
xmm_t0 = _mm_loadu_si128((__m128i *)src + (28-4)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus8 ^ bonus7 ^ bonus6;
259+
xmm_t1 = _mm_loadu_si128((__m128i *)src + (29-4)) ^ bonus2 ^ bonus3 ^ bonus7 ^ bonus6 ^ bonus5;
260+
xmm_t2 = _mm_loadu_si128((__m128i *)src + (30-4)) ^ bonus1 ^ bonus2 ^ bonus4 ^ bonus6 ^ bonus5;
261+
xmm_t3 = _mm_loadu_si128((__m128i *)src + (31-4)) ^ bonus1 ^ bonus3 ^ bonus4 ^ bonus5;
262262

263263
// now we fold xmm_crc0 onto xmm_crc1
264264
fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);
@@ -277,10 +277,10 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
277277
fold_low4 = _mm_clmulepi64_si128(xmm_crc3, shift544_shift480, 0x00);
278278
xmm_crc3 = _mm_xor_si128(xmm_t3, fold_high4);
279279
xmm_crc3 = _mm_xor_si128(xmm_crc3, fold_low4);
280-
xmm_t0 = _mm_loadu_si128(src + (16 * 32)) ^ bonus2 ^ bonus3 ^ bonus4;
281-
xmm_t1 = _mm_loadu_si128(src + (16 * 33)) ^ bonus1 ^ bonus2 ^ bonus3;
282-
xmm_t2 = _mm_loadu_si128(src + (16 * 34)) ^ bonus1 ^ bonus2;
283-
xmm_t3 = _mm_loadu_si128(src + (16 * 35)) ^ bonus1;
280+
xmm_t0 = _mm_loadu_si128((__m128i *)src + (32-4)) ^ bonus2 ^ bonus3 ^ bonus4;
281+
xmm_t1 = _mm_loadu_si128((__m128i *)src + (33-4)) ^ bonus1 ^ bonus2 ^ bonus3;
282+
xmm_t2 = _mm_loadu_si128((__m128i *)src + (34-4)) ^ bonus1 ^ bonus2;
283+
xmm_t3 = _mm_loadu_si128((__m128i *)src + (35-4)) ^ bonus1;
284284

285285
// now we fold xmm_crc0 onto xmm_crc1
286286
fold_high1 = _mm_clmulepi64_si128(xmm_crc0, shift544_shift480, 0x11);

0 commit comments

Comments
 (0)