Skip to content

Commit 4f718c5

Browse files
committed
Cleanup SSE
1 parent 242429c commit 4f718c5

File tree

1 file changed

+30
-24
lines changed

1 file changed

+30
-24
lines changed

zmij.cc

Lines changed: 30 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -497,13 +497,20 @@ inline void write8(char* buffer, uint64_t value) noexcept {
497497
}
498498

499499
#if ZMIJ_USE_SSE
500-
constexpr auto splat64(uint64_t x) -> __m128i {
501-
return __m128i{static_cast<long long>(x), static_cast<long long>(x)};
500+
using m128i = __m128i;
501+
#else
502+
struct m128i {
503+
long long data[2];
504+
};
505+
#endif
506+
507+
constexpr auto splat64(uint64_t x) -> m128i {
508+
return m128i{static_cast<long long>(x), static_cast<long long>(x)};
502509
}
503-
constexpr auto splat32(uint32_t x) -> __m128i {
510+
constexpr auto splat32(uint32_t x) -> m128i {
504511
return splat64(uint64_t(x) << 32 | x);
505512
}
506-
constexpr auto splat16(uint16_t x) -> __m128i {
513+
constexpr auto splat16(uint16_t x) -> m128i {
507514
return splat32(uint32_t(x) << 16 | x);
508515
}
509516
constexpr auto pack8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, //
@@ -513,24 +520,6 @@ constexpr auto pack8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, //
513520
u64(d) << 24 | u64(c) << 16 | u64(b) << +8 | u64(a);
514521
}
515522

516-
alignas(64) constexpr struct {
517-
__m128i div10k = splat64(div10k_sig);
518-
__m128i neg10k = splat64(::neg10k);
519-
__m128i div100 = splat32(div100_sig);
520-
__m128i div10 = splat16((1 << 16) / 10 + 1);
521-
# if ZMIJ_USE_SSE4_1
522-
__m128i neg100 = splat32(::neg100);
523-
__m128i neg10 = splat16((1 << 8) - 10);
524-
__m128i bswap = __m128i{pack8(15, 14, 13, 12, 11, 10, 9, 8),
525-
pack8(7, 6, 5, 4, 3, 2, 1, 0)};
526-
# else
527-
__m128i hundred = splat32(100);
528-
__m128i moddiv10 = splat16(10 * (1 << 8) - 1);
529-
# endif
530-
__m128i zeros = splat64(::zeros);
531-
} consts;
532-
#endif // ZMIJ_USE_SSE
533-
534523
// Writes a significand consisting of up to 17 decimal digits (16-17 for
535524
// normals) and removes trailing zeros. The significant digits start
536525
// from buffer[1]. buffer[0] may contain '0' after this function if
@@ -620,6 +609,23 @@ auto write_significand17(char* buffer, uint64_t value,
620609
uint32_t abcdefgh = digits_16 / uint64_t(1e8);
621610
uint32_t ijklmnop = digits_16 % uint64_t(1e8);
622611

612+
alignas(64) static constexpr struct {
613+
__m128i div10k = splat64(div10k_sig);
614+
__m128i neg10k = splat64(::neg10k);
615+
__m128i div100 = splat32(div100_sig);
616+
__m128i div10 = splat16((1 << 16) / 10 + 1);
617+
# if ZMIJ_USE_SSE4_1
618+
__m128i neg100 = splat32(::neg100);
619+
__m128i neg10 = splat16((1 << 8) - 10);
620+
__m128i bswap = __m128i{pack8(15, 14, 13, 12, 11, 10, 9, 8),
621+
pack8(7, 6, 5, 4, 3, 2, 1, 0)};
622+
# else
623+
__m128i hundred = splat32(100);
624+
__m128i moddiv10 = splat16(10 * (1 << 8) - 1);
625+
# endif
626+
__m128i zeros = splat64(::zeros);
627+
} consts;
628+
623629
const __m128i div10k = _mm_load_si128(&consts.div10k);
624630
const __m128i neg10k = _mm_load_si128(&consts.neg10k);
625631
const __m128i div100 = _mm_load_si128(&consts.div100);
@@ -642,8 +648,8 @@ auto write_significand17(char* buffer, uint64_t value,
642648
# if ZMIJ_USE_SSE4_1
643649
// _mm_mullo_epi32 is SSE 4.1
644650
__m128i z = _mm_add_epi64(
645-
y, _mm_mullo_epi32(neg100,
646-
_mm_srli_epi32(_mm_mulhi_epu16(y, div100), 3)));
651+
y,
652+
_mm_mullo_epi32(neg100, _mm_srli_epi32(_mm_mulhi_epu16(y, div100), 3)));
647653
__m128i big_endian_bcd =
648654
_mm_add_epi16(z, _mm_mullo_epi16(neg10, _mm_mulhi_epu16(z, div10)));
649655
__m128i bcd = _mm_shuffle_epi8(big_endian_bcd, bswap); // SSSE3

0 commit comments

Comments
 (0)