Skip to content

Commit 6523396

Browse files
committed
Drop the guard
1 parent 4ae872e commit 6523396

File tree

1 file changed

+56
-24
lines changed

1 file changed

+56
-24
lines changed

zmij.cc

Lines changed: 56 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -496,6 +496,41 @@ inline void write8(char* buffer, uint64_t value) noexcept {
496496
memcpy(buffer, &value, 8);
497497
}
498498

499+
#if ZMIJ_USE_SSE
500+
// Returns an SSE register whose two 64-bit lanes both hold `x`.
//
// The register is built with a brace initializer rather than an intrinsic so
// that the call can appear in a constant expression. NOTE(review): this relies
// on __m128i accepting a two-element `long long` initializer list - confirm
// for every compiler that enables ZMIJ_USE_SSE.
constexpr auto splat64(uint64_t x) -> __m128i {
  return __m128i{static_cast<long long>(x), static_cast<long long>(x)};
}
503+
// Returns an SSE register whose four 32-bit lanes all hold `x`.
//
// `x` is duplicated into both halves of a 64-bit word, which is then
// broadcast to both 64-bit lanes via splat64.
constexpr auto splat32(uint32_t x) -> __m128i {
  return splat64(uint64_t(x) << 32 | x);
}
506+
// Returns an SSE register whose eight 16-bit lanes all hold `x`.
//
// `x` is duplicated into both halves of a 32-bit word, which is then
// broadcast to all 32-bit lanes via splat32.
constexpr auto splat16(uint16_t x) -> __m128i {
  return splat32(uint32_t(x) << 16 | x);
}
509+
// Packs eight bytes into one 64-bit word.
//
// `a` lands in the least significant byte and `h` in the most significant
// one, i.e. the arguments are listed in increasing byte-significance order.
constexpr auto pack8(uint8_t a, uint8_t b, uint8_t c, uint8_t d,  //
                     uint8_t e, uint8_t f, uint8_t g, uint8_t h) -> uint64_t {
  using u64 = uint64_t;
  // Widen every byte before shifting so no bits are lost to int promotion.
  return u64(h) << 56 | u64(g) << 48 | u64(f) << 40 | u64(e) << 32 |
         u64(d) << 24 | u64(c) << 16 | u64(b) << 8 | u64(a);
}
515+
516+
alignas(64) constexpr struct {
517+
__m128i div10k = splat64(div10k_sig);
518+
__m128i neg10k = splat64(::neg10k);
519+
__m128i div100 = splat32(div100_sig);
520+
__m128i div10 = splat16((1 << 16) / 10 + 1);
521+
# if ZMIJ_USE_SSE4_1
522+
__m128i neg100 = splat32(::neg100);
523+
__m128i neg10 = splat16((1 << 8) - 10);
524+
__m128i bswap = __m128i{pack8(15, 14, 13, 12, 11, 10, 9, 8),
525+
pack8(7, 6, 5, 4, 3, 2, 1, 0)};
526+
# else
527+
__m128i hundred = splat32(100);
528+
__m128i moddiv10 = splat16(10 * (1 << 8) - 1);
529+
# endif
530+
__m128i zeros = splat64(::zeros);
531+
} consts;
532+
#endif // ZMIJ_USE_SSE
533+
499534
// Writes a significand consisting of up to 17 decimal digits (16-17 for
500535
// normals) and removes trailing zeros. The significant digits start
501536
// from buffer[1]. buffer[0] may contain '0' after this function if
@@ -585,47 +620,44 @@ auto write_significand17(char* buffer, uint64_t value,
585620
uint32_t abcdefgh = digits_16 / uint64_t(1e8);
586621
uint32_t ijklmnop = digits_16 % uint64_t(1e8);
587622

588-
alignas(64) static const struct {
589-
__m128i div10k = _mm_set1_epi64x(div10k_sig);
590-
__m128i neg10k = _mm_set1_epi64x(::neg10k);
591-
__m128i div100 = _mm_set1_epi32(div100_sig);
592-
__m128i div10 = _mm_set1_epi16((1 << 16) / 10 + 1);
623+
const __m128i div10k = _mm_load_si128(&consts.div10k);
624+
const __m128i neg10k = _mm_load_si128(&consts.neg10k);
625+
const __m128i div100 = _mm_load_si128(&consts.div100);
626+
const __m128i div10 = _mm_load_si128(&consts.div10);
593627
# if ZMIJ_USE_SSE4_1
594-
__m128i neg100 = _mm_set1_epi32(::neg100);
595-
__m128i neg10 = _mm_set1_epi16((1 << 8) - 10);
596-
__m128i bswap =
597-
_mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
628+
const __m128i neg100 = _mm_load_si128(&consts.neg100);
629+
const __m128i neg10 = _mm_load_si128(&consts.neg10);
630+
const __m128i bswap = _mm_load_si128(&consts.bswap);
598631
# else
599-
__m128i hundred = _mm_set1_epi32(100);
600-
__m128i moddiv10 = _mm_set1_epi16(10 * (1 << 8) - 1);
601-
# endif // ZMIJ_USE_SSE4_1
602-
__m128i zeros = _mm_set1_epi64x(::zeros);
603-
} c;
632+
const __m128i hundred = _mm_load_si128(&consts.hundred);
633+
const __m128i moddiv10 = _mm_load_si128(&consts.moddiv10);
634+
# endif
635+
const __m128i zeros = _mm_load_si128(&consts.zeros);
604636

605637
// The BCD sequences are based on ones provided by Xiang JunBo.
606638
__m128i x = _mm_set_epi64x(abcdefgh, ijklmnop);
607639
__m128i y = _mm_add_epi64(
608-
x, _mm_mul_epu32(c.neg10k,
609-
_mm_srli_epi64(_mm_mul_epu32(x, c.div10k), div10k_exp)));
640+
x, _mm_mul_epu32(neg10k,
641+
_mm_srli_epi64(_mm_mul_epu32(x, div10k), div10k_exp)));
610642
# if ZMIJ_USE_SSE4_1
611643
// _mm_mullo_epi32 is SSE 4.1
612644
__m128i z = _mm_add_epi64(
613-
y, _mm_mullo_epi32(c.neg100,
614-
_mm_srli_epi32(_mm_mulhi_epu16(y, c.div100), 3)));
645+
y, _mm_mullo_epi32(neg100,
646+
_mm_srli_epi32(_mm_mulhi_epu16(y, div100), 3)));
615647
__m128i big_endian_bcd =
616-
_mm_add_epi16(z, _mm_mullo_epi16(c.neg10, _mm_mulhi_epu16(z, c.div10)));
617-
__m128i bcd = _mm_shuffle_epi8(big_endian_bcd, c.bswap); // SSSE3
648+
_mm_add_epi16(z, _mm_mullo_epi16(neg10, _mm_mulhi_epu16(z, div10)));
649+
__m128i bcd = _mm_shuffle_epi8(big_endian_bcd, bswap); // SSSE3
618650
# else
619-
__m128i y_div_100 = _mm_srli_epi16(_mm_mulhi_epu16(y, c.div100), 3);
620-
__m128i y_mod_100 = _mm_sub_epi16(y, _mm_mullo_epi16(y_div_100, c.hundred));
651+
__m128i y_div_100 = _mm_srli_epi16(_mm_mulhi_epu16(y, div100), 3);
652+
__m128i y_mod_100 = _mm_sub_epi16(y, _mm_mullo_epi16(y_div_100, hundred));
621653
__m128i z = _mm_or_si128(_mm_slli_epi32(y_mod_100, 16), y_div_100);
622654
__m128i bcd_shuffled =
623655
_mm_sub_epi16(_mm_slli_epi16(z, 8),
624-
_mm_mullo_epi16(c.moddiv10, _mm_mulhi_epu16(z, c.div10)));
656+
_mm_mullo_epi16(moddiv10, _mm_mulhi_epu16(z, div10)));
625657
__m128i bcd = _mm_shuffle_epi32(bcd_shuffled, _MM_SHUFFLE(0, 1, 2, 3));
626658
# endif // ZMIJ_USE_SSE4_1
627659

628-
auto digits = _mm_or_si128(bcd, c.zeros);
660+
auto digits = _mm_or_si128(bcd, zeros);
629661

630662
// determine number of leading zeros
631663
__m128i mask128 = _mm_cmpgt_epi8(bcd, _mm_setzero_si128());

0 commit comments

Comments
 (0)