@@ -496,6 +496,41 @@ inline void write8(char* buffer, uint64_t value) noexcept {
496496 memcpy (buffer, &value, 8 );
497497}
498498
499+ #if ZMIJ_USE_SSE
// Builds a 128-bit SSE value with the 64-bit quantity x in both lanes.
// Uses the GCC/Clang vector-literal form because _mm_set1_epi64x is not
// constexpr-usable.
constexpr auto splat64(uint64_t x) -> __m128i {
  return __m128i{static_cast<long long>(x), static_cast<long long>(x)};
}
// Replicates a 32-bit value into all four 32-bit lanes. Multiplying by
// 0x0000000100000001 copies x into both halves of a 64-bit word with no
// carries (x < 2^32), which equals (x << 32) | x.
constexpr auto splat32(uint32_t x) -> __m128i {
  return splat64(uint64_t{x} * 0x0000000100000001ULL);
}
// Replicates a 16-bit value into all eight 16-bit lanes; the multiplier
// duplicates x into both 16-bit fields of a 32-bit word with no carries
// (x < 2^16), which equals (x << 16) | x.
constexpr auto splat16(uint16_t x) -> __m128i {
  return splat32(uint32_t{x} * 0x00010001u);
}
// Packs eight bytes into a uint64_t in little-endian order: a becomes the
// least significant byte and h the most significant.
constexpr auto pack8(uint8_t a, uint8_t b, uint8_t c, uint8_t d,  //
                     uint8_t e, uint8_t f, uint8_t g, uint8_t h) -> uint64_t {
  const uint8_t bytes[8] = {a, b, c, d, e, f, g, h};
  uint64_t packed = 0;
  for (int i = 7; i >= 0; --i) packed = packed << 8 | bytes[i];
  return packed;
}
515+
516+ alignas (64 ) constexpr struct {
517+ __m128i div10k = splat64(div10k_sig);
518+ __m128i neg10k = splat64(::neg10k);
519+ __m128i div100 = splat32(div100_sig);
520+ __m128i div10 = splat16((1 << 16 ) / 10 + 1 );
521+ # if ZMIJ_USE_SSE4_1
522+ __m128i neg100 = splat32(::neg100);
523+ __m128i neg10 = splat16((1 << 8 ) - 10 );
524+ __m128i bswap = __m128i{pack8 (15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 ),
525+ pack8 (7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 )};
526+ # else
527+ __m128i hundred = splat32(100 );
528+ __m128i moddiv10 = splat16(10 * (1 << 8 ) - 1 );
529+ # endif
530+ __m128i zeros = splat64(::zeros);
531+ } consts;
532+ #endif // ZMIJ_USE_SSE
533+
499534// Writes a significand consisting of up to 17 decimal digits (16-17 for
500535// normals) and removes trailing zeros. The significant digits start
501536// from buffer[1]. buffer[0] may contain '0' after this function if
@@ -585,47 +620,44 @@ auto write_significand17(char* buffer, uint64_t value,
585620 uint32_t abcdefgh = digits_16 / uint64_t (1e8 );
586621 uint32_t ijklmnop = digits_16 % uint64_t (1e8 );
587622
588- alignas (64 ) static const struct {
589- __m128i div10k = _mm_set1_epi64x(div10k_sig);
590- __m128i neg10k = _mm_set1_epi64x(::neg10k);
591- __m128i div100 = _mm_set1_epi32(div100_sig);
592- __m128i div10 = _mm_set1_epi16((1 << 16 ) / 10 + 1 );
623+ const __m128i div10k = _mm_load_si128 (&consts.div10k );
624+ const __m128i neg10k = _mm_load_si128 (&consts.neg10k );
625+ const __m128i div100 = _mm_load_si128 (&consts.div100 );
626+ const __m128i div10 = _mm_load_si128 (&consts.div10 );
593627# if ZMIJ_USE_SSE4_1
594- __m128i neg100 = _mm_set1_epi32(::neg100);
595- __m128i neg10 = _mm_set1_epi16((1 << 8 ) - 10 );
596- __m128i bswap =
597- _mm_set_epi8 (0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 );
628+ const __m128i neg100 = _mm_load_si128 (&consts.neg100 );
629+ const __m128i neg10 = _mm_load_si128 (&consts.neg10 );
630+ const __m128i bswap = _mm_load_si128 (&consts.bswap );
598631# else
599- __m128i hundred = _mm_set1_epi32(100 );
600- __m128i moddiv10 = _mm_set1_epi16(10 * (1 << 8 ) - 1 );
601- # endif // ZMIJ_USE_SSE4_1
602- __m128i zeros = _mm_set1_epi64x(::zeros);
603- } c;
632+ const __m128i hundred = _mm_load_si128 (&consts.hundred );
633+ const __m128i moddiv10 = _mm_load_si128 (&consts.moddiv10 );
634+ # endif
635+ const __m128i zeros = _mm_load_si128 (&consts.zeros );
604636
605637 // The BCD sequences are based on ones provided by Xiang JunBo.
606638 __m128i x = _mm_set_epi64x (abcdefgh, ijklmnop);
607639 __m128i y = _mm_add_epi64 (
608- x, _mm_mul_epu32 (c. neg10k ,
609- _mm_srli_epi64 (_mm_mul_epu32 (x, c. div10k ), div10k_exp)));
640+ x, _mm_mul_epu32 (neg10k,
641+ _mm_srli_epi64 (_mm_mul_epu32 (x, div10k), div10k_exp)));
610642# if ZMIJ_USE_SSE4_1
611643 // _mm_mullo_epi32 is SSE 4.1
612644 __m128i z = _mm_add_epi64 (
613- y, _mm_mullo_epi32 (c. neg100 ,
614- _mm_srli_epi32 (_mm_mulhi_epu16 (y, c. div100 ), 3 )));
645+ y, _mm_mullo_epi32 (neg100,
646+ _mm_srli_epi32 (_mm_mulhi_epu16 (y, div100), 3 )));
615647 __m128i big_endian_bcd =
616- _mm_add_epi16 (z, _mm_mullo_epi16 (c. neg10 , _mm_mulhi_epu16 (z, c. div10 )));
617- __m128i bcd = _mm_shuffle_epi8 (big_endian_bcd, c. bswap ); // SSSE3
648+ _mm_add_epi16 (z, _mm_mullo_epi16 (neg10, _mm_mulhi_epu16 (z, div10)));
649+ __m128i bcd = _mm_shuffle_epi8 (big_endian_bcd, bswap); // SSSE3
618650# else
619- __m128i y_div_100 = _mm_srli_epi16 (_mm_mulhi_epu16 (y, c. div100 ), 3 );
620- __m128i y_mod_100 = _mm_sub_epi16 (y, _mm_mullo_epi16 (y_div_100, c. hundred ));
651+ __m128i y_div_100 = _mm_srli_epi16 (_mm_mulhi_epu16 (y, div100), 3 );
652+ __m128i y_mod_100 = _mm_sub_epi16 (y, _mm_mullo_epi16 (y_div_100, hundred));
621653 __m128i z = _mm_or_si128 (_mm_slli_epi32 (y_mod_100, 16 ), y_div_100);
622654 __m128i bcd_shuffled =
623655 _mm_sub_epi16 (_mm_slli_epi16 (z, 8 ),
624- _mm_mullo_epi16 (c. moddiv10 , _mm_mulhi_epu16 (z, c. div10 )));
656+ _mm_mullo_epi16 (moddiv10, _mm_mulhi_epu16 (z, div10)));
625657 __m128i bcd = _mm_shuffle_epi32 (bcd_shuffled, _MM_SHUFFLE (0 , 1 , 2 , 3 ));
626658# endif // ZMIJ_USE_SSE4_1
627659
628- auto digits = _mm_or_si128 (bcd, c. zeros );
660+ auto digits = _mm_or_si128 (bcd, zeros);
629661
630662 // determine number of leading zeros
631663 __m128i mask128 = _mm_cmpgt_epi8 (bcd, _mm_setzero_si128 ());
0 commit comments