@@ -497,13 +497,20 @@ inline void write8(char* buffer, uint64_t value) noexcept {
497497}
498498
499499#if ZMIJ_USE_SSE
500- constexpr auto splat64 (uint64_t x) -> __m128i {
501- return __m128i{static_cast <long long >(x), static_cast <long long >(x)};
500+ using m128i = __m128i;
501+ #else
502+ struct m128i {
503+ long long data[2 ];
504+ };
505+ #endif
506+
507+ constexpr auto splat64 (uint64_t x) -> m128i {
508+ return m128i{static_cast <long long >(x), static_cast <long long >(x)};
502509}
503- constexpr auto splat32 (uint32_t x) -> __m128i {
510+ constexpr auto splat32 (uint32_t x) -> m128i {
504511 return splat64 (uint64_t (x) << 32 | x);
505512}
506- constexpr auto splat16 (uint16_t x) -> __m128i {
513+ constexpr auto splat16 (uint16_t x) -> m128i {
507514 return splat32 (uint32_t (x) << 16 | x);
508515}
509516constexpr auto pack8 (uint8_t a, uint8_t b, uint8_t c, uint8_t d, //
@@ -513,24 +520,6 @@ constexpr auto pack8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, //
513520 u64 (d) << 24 | u64 (c) << 16 | u64 (b) << +8 | u64 (a);
514521}
515522
516- alignas (64 ) constexpr struct {
517- __m128i div10k = splat64(div10k_sig);
518- __m128i neg10k = splat64(::neg10k);
519- __m128i div100 = splat32(div100_sig);
520- __m128i div10 = splat16((1 << 16 ) / 10 + 1 );
521- # if ZMIJ_USE_SSE4_1
522- __m128i neg100 = splat32(::neg100);
523- __m128i neg10 = splat16((1 << 8 ) - 10 );
524- __m128i bswap = __m128i{pack8 (15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 ),
525- pack8 (7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 )};
526- # else
527- __m128i hundred = splat32(100 );
528- __m128i moddiv10 = splat16(10 * (1 << 8 ) - 1 );
529- # endif
530- __m128i zeros = splat64(::zeros);
531- } consts;
532- #endif // ZMIJ_USE_SSE
533-
534523// Writes a significand consisting of up to 17 decimal digits (16-17 for
535524// normals) and removes trailing zeros. The significant digits start
536525// from buffer[1]. buffer[0] may contain '0' after this function if
@@ -620,6 +609,23 @@ auto write_significand17(char* buffer, uint64_t value,
620609 uint32_t abcdefgh = digits_16 / uint64_t (1e8 );
621610 uint32_t ijklmnop = digits_16 % uint64_t (1e8 );
622611
612+ alignas (64 ) static constexpr struct {
613+ __m128i div10k = splat64(div10k_sig);
614+ __m128i neg10k = splat64(::neg10k);
615+ __m128i div100 = splat32(div100_sig);
616+ __m128i div10 = splat16((1 << 16 ) / 10 + 1 );
617+ # if ZMIJ_USE_SSE4_1
618+ __m128i neg100 = splat32(::neg100);
619+ __m128i neg10 = splat16((1 << 8 ) - 10 );
620+ __m128i bswap = __m128i{pack8 (15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 ),
621+ pack8 (7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 )};
622+ # else
623+ __m128i hundred = splat32(100 );
624+ __m128i moddiv10 = splat16(10 * (1 << 8 ) - 1 );
625+ # endif
626+ __m128i zeros = splat64(::zeros);
627+ } consts;
628+
623629 const __m128i div10k = _mm_load_si128 (&consts.div10k );
624630 const __m128i neg10k = _mm_load_si128 (&consts.neg10k );
625631 const __m128i div100 = _mm_load_si128 (&consts.div100 );
@@ -642,8 +648,8 @@ auto write_significand17(char* buffer, uint64_t value,
642648# if ZMIJ_USE_SSE4_1
643649 // _mm_mullo_epi32 is SSE 4.1
644650 __m128i z = _mm_add_epi64 (
645- y, _mm_mullo_epi32 (neg100,
646- _mm_srli_epi32 (_mm_mulhi_epu16 (y, div100), 3 )));
651+ y,
652+ _mm_mullo_epi32 (neg100, _mm_srli_epi32 (_mm_mulhi_epu16 (y, div100), 3 )));
647653 __m128i big_endian_bcd =
648654 _mm_add_epi16 (z, _mm_mullo_epi16 (neg10, _mm_mulhi_epu16 (z, div10)));
649655 __m128i bcd = _mm_shuffle_epi8 (big_endian_bcd, bswap); // SSSE3
0 commit comments