2525
2626#include "../../crc32_fold.h"
2727
28- Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq (crc32_fold * crc ) {
29- /* CRC_SAVE */
30- _mm_storeu_si128 ((__m128i * )crc -> fold + 0 , _mm_cvtsi32_si128 (0x9db42487 ));
31- _mm_storeu_si128 ((__m128i * )crc -> fold + 1 , _mm_setzero_si128 ());
32- _mm_storeu_si128 ((__m128i * )crc -> fold + 2 , _mm_setzero_si128 ());
33- _mm_storeu_si128 ((__m128i * )crc -> fold + 3 , _mm_setzero_si128 ());
34-
35- return 0 ;
36- }
37-
3828static void fold_1 (__m128i * xmm_crc0 , __m128i * xmm_crc1 , __m128i * xmm_crc2 , __m128i * xmm_crc3 ) {
3929 const __m128i xmm_fold4 = _mm_set_epi32 ( 0x00000001 , 0x54442bd4 ,
4030 0x00000001 , 0xc6e41596 );
@@ -227,24 +217,45 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
227217 * xmm_crc3 = _mm_castps_si128 (ps_res );
228218}
229219
/* Load four consecutive 16-byte lanes from 'fold' into the four output
 * registers. Uses aligned loads, so 'fold' must be 16-byte aligned
 * (callers pass either the fold state buffer or a source pointer that has
 * already been advanced to a 16-byte boundary). */
static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
    __m128i *out[4] = { fold0, fold1, fold2, fold3 };

    for (int i = 0; i < 4; i++)
        *out[i] = _mm_load_si128(fold + i);
}
226+
/* Store four 16-byte lanes to 'fold'. Unaligned stores are used because
 * this helper is also called with arbitrary destination pointers (e.g. the
 * copy destination in crc32_fold_copy), not just the fold state buffer. */
static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1, __m128i fold2, __m128i fold3) {
    const __m128i lanes[4] = { fold0, fold1, fold2, fold3 };

    for (int i = 0; i < 4; i++)
        _mm_storeu_si128(fold + i, lanes[i]);
}
233+
/* Store the leftover (< 16 bytes of input) fold register into the fifth
 * 16-byte lane of the fold buffer, just past the four lanes written by
 * crc32_fold_save(). Uses an unaligned store for consistency with
 * crc32_fold_save(), which writes lanes 0-3 of the very same buffer with
 * _mm_storeu_si128 (see the done: path in crc32_fold_copy); the previous
 * aligned _mm_store_si128 would fault there if the buffer ever lacked
 * 16-byte alignment, while the other lanes were stored safely. */
static inline void crc32_fold_save_partial(__m128i *fold, __m128i foldp) {
    _mm_storeu_si128(fold + 4, foldp);
}
237+
238+ Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq (crc32_fold * crc ) {
239+ __m128i xmm_crc0 = _mm_cvtsi32_si128 (0x9db42487 );
240+ __m128i xmm_zero = _mm_setzero_si128 ();
241+ crc32_fold_save ((__m128i * )& crc -> fold , xmm_crc0 , xmm_zero , xmm_zero , xmm_zero );
242+ return 0 ;
243+ }
244+
230245Z_INTERNAL void crc32_fold_copy_pclmulqdq (crc32_fold * crc , uint8_t * dst , const uint8_t * src , size_t len ) {
231246 unsigned long algn_diff ;
232247 __m128i xmm_t0 , xmm_t1 , xmm_t2 , xmm_t3 ;
248+ __m128i xmm_crc0 , xmm_crc1 , xmm_crc2 , xmm_crc3 , xmm_crc_part ;
233249 char ALIGNED_ (16 ) partial_buf [16 ] = { 0 };
234250
235- /* CRC_LOAD */
236- __m128i xmm_crc0 = _mm_loadu_si128 ((__m128i * )crc -> fold + 0 );
237- __m128i xmm_crc1 = _mm_loadu_si128 ((__m128i * )crc -> fold + 1 );
238- __m128i xmm_crc2 = _mm_loadu_si128 ((__m128i * )crc -> fold + 2 );
239- __m128i xmm_crc3 = _mm_loadu_si128 ((__m128i * )crc -> fold + 3 );
240- __m128i xmm_crc_part ;
251+ crc32_fold_load ((__m128i * )& crc -> fold , & xmm_crc0 , & xmm_crc1 , & xmm_crc2 , & xmm_crc3 );
241252
242253 if (len < 16 ) {
243254 if (len == 0 )
244255 return ;
245256
246257 memcpy (partial_buf , src , len );
247- xmm_crc_part = _mm_loadu_si128 ((const __m128i * )partial_buf );
258+ xmm_crc_part = _mm_load_si128 ((const __m128i * )partial_buf );
248259 memcpy (dst , partial_buf , len );
249260 goto partial ;
250261 }
@@ -264,19 +275,11 @@ Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const u
264275 }
265276
266277 while (len >= 64 ) {
267- /* CRC_LOAD */
268- xmm_t0 = _mm_load_si128 ((__m128i * )src );
269- xmm_t1 = _mm_load_si128 ((__m128i * )src + 1 );
270- xmm_t2 = _mm_load_si128 ((__m128i * )src + 2 );
271- xmm_t3 = _mm_load_si128 ((__m128i * )src + 3 );
278+ crc32_fold_load ((__m128i * )src , & xmm_t0 , & xmm_t1 , & xmm_t2 , & xmm_t3 );
272279
273280 fold_4 (& xmm_crc0 , & xmm_crc1 , & xmm_crc2 , & xmm_crc3 );
274281
275- /* CRC_SAVE */
276- _mm_storeu_si128 ((__m128i * )dst , xmm_t0 );
277- _mm_storeu_si128 ((__m128i * )dst + 1 , xmm_t1 );
278- _mm_storeu_si128 ((__m128i * )dst + 2 , xmm_t2 );
279- _mm_storeu_si128 ((__m128i * )dst + 3 , xmm_t3 );
282+ crc32_fold_save ((__m128i * )dst , xmm_t0 , xmm_t1 , xmm_t2 , xmm_t3 );
280283
281284 xmm_crc0 = _mm_xor_si128 (xmm_crc0 , xmm_t0 );
282285 xmm_crc1 = _mm_xor_si128 (xmm_crc1 , xmm_t1 );
@@ -356,12 +359,8 @@ Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const u
356359partial :
357360 partial_fold ((size_t )len , & xmm_crc0 , & xmm_crc1 , & xmm_crc2 , & xmm_crc3 , & xmm_crc_part );
358361done :
359- /* CRC_SAVE */
360- _mm_storeu_si128 ((__m128i * )crc -> fold + 0 , xmm_crc0 );
361- _mm_storeu_si128 ((__m128i * )crc -> fold + 1 , xmm_crc1 );
362- _mm_storeu_si128 ((__m128i * )crc -> fold + 2 , xmm_crc2 );
363- _mm_storeu_si128 ((__m128i * )crc -> fold + 3 , xmm_crc3 );
364- _mm_storeu_si128 ((__m128i * )crc -> fold + 4 , xmm_crc_part );
362+ crc32_fold_save ((__m128i * )& crc -> fold , xmm_crc0 , xmm_crc1 , xmm_crc2 , xmm_crc3 );
363+ crc32_fold_save_partial ((__m128i * )& crc -> fold , xmm_crc_part );
365364}
366365
367366static const unsigned ALIGNED_ (16 ) crc_k [] = {
@@ -384,14 +383,10 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
384383Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq (crc32_fold * crc ) {
385384 const __m128i xmm_mask = _mm_load_si128 ((__m128i * )crc_mask );
386385 const __m128i xmm_mask2 = _mm_load_si128 ((__m128i * )crc_mask2 );
387-
386+ __m128i xmm_crc0 , xmm_crc1 , xmm_crc2 , xmm_crc3 ;
388387 __m128i x_tmp0 , x_tmp1 , x_tmp2 , crc_fold ;
389388
390- /* CRC_LOAD */
391- __m128i xmm_crc0 = _mm_loadu_si128 ((__m128i * )crc -> fold + 0 );
392- __m128i xmm_crc1 = _mm_loadu_si128 ((__m128i * )crc -> fold + 1 );
393- __m128i xmm_crc2 = _mm_loadu_si128 ((__m128i * )crc -> fold + 2 );
394- __m128i xmm_crc3 = _mm_loadu_si128 ((__m128i * )crc -> fold + 3 );
389+ crc32_fold_load ((__m128i * )& crc -> fold , & xmm_crc0 , & xmm_crc1 , & xmm_crc2 , & xmm_crc3 );
395390
396391 /*
397392 * k1
0 commit comments