@@ -524,8 +524,8 @@ constexpr auto pack8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, //
524524// normals) and removes trailing zeros. The significant digits start
525525// from buffer[1]. buffer[0] may contain '0' after this function if
526526// the significand has length 16.
527- auto write_significand17 (char * buffer, uint64_t value,
528- bool has17digits ) noexcept -> char* {
527+ auto write_significand17 (char * buffer, uint64_t value, bool has17digits,
528+ long long value_div10 ) noexcept -> char* {
529529 if (!ZMIJ_USE_NEON && !ZMIJ_USE_SSE) {
530530 char * start = buffer + 1 ;
531531 // Each digit is denoted by a letter so value is abbccddeeffgghhii.
@@ -612,8 +612,7 @@ auto write_significand17(char* buffer, uint64_t value,
612612 buffer += 16 - ((zeroes != 0 ? clz (zeroes) : 64 ) >> 2 );
613613 return buffer - int (buffer - start == 1 );
614614#elif ZMIJ_USE_SSE
615- uint64_t digits_16 =
616- use_umul128_hi64 ? umul128_hi64 (value, div10_sig64) : value / 10 ;
615+ uint64_t digits_16 = value_div10;
617616 uint32_t last_digit = value - digits_16 * 10 ;
618617
619618 // We always write 17 digits into the buffer, but the first one can be zero.
@@ -630,15 +629,15 @@ auto write_significand17(char* buffer, uint64_t value,
630629 __m128i neg10k = splat64(::neg10k);
631630 __m128i div100 = splat32(div100_sig);
632631 __m128i div10 = splat16((1 << 16 ) / 10 + 1 );
633- # if ZMIJ_USE_SSE4_1
632+ # if ZMIJ_USE_SSE4_1
634633 __m128i neg100 = splat32(::neg100);
635634 __m128i neg10 = splat16((1 << 8 ) - 10 );
636635 __m128i bswap = __m128i{pack8 (15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 ),
637636 pack8 (7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 )};
638- # else
637+ # else
639638 __m128i hundred = splat32(100 );
640639 __m128i moddiv10 = splat16(10 * (1 << 8 ) - 1 );
641- # endif
640+ # endif
642641 __m128i zeros = splat64(::zeros);
643642 } consts;
644643
@@ -693,7 +692,7 @@ auto write_significand17(char* buffer, uint64_t value,
693692
694693 _mm_storeu_si128 (reinterpret_cast <__m128i*>(buffer), digits);
695694 return buffer + ((last_digit != 0 ) ? 17 : len - (len == 1 ));
696- #endif // ZMIJ_USE_SSE
695+ #endif // ZMIJ_USE_SSE
697696}
698697
699698// Writes a significand consisting of up to 9 decimal digits (7-9 for normals)
@@ -708,20 +707,40 @@ auto write_significand9(char* buffer, uint32_t value, bool has9digits) noexcept
708707 return buffer - int (buffer - start == 1 );
709708}
710709
710+ struct to_decimal_result {
711+ long long sig;
712+ int exp;
713+ static constexpr long long sig_div10 = 0 ;
714+ void set_div10 (long long ) {}
715+ };
716+
717+ struct to_decimal_result_sse {
718+ long long sig;
719+ int exp;
720+ long long sig_div10;
721+ void set_div10 (long long value) { sig_div10 = value; }
722+ };
723+
724+ using to_decimal_result_t =
725+ std::conditional_t <ZMIJ_USE_SSE != 0 , to_decimal_result_sse,
726+ to_decimal_result>;
727+
711728template <int num_bits>
712- auto normalize (zmij::dec_fp dec, bool subnormal) noexcept -> zmij::dec_fp {
729+ auto normalize (to_decimal_result_t dec, bool subnormal) noexcept
730+ -> to_decimal_result_t {
713731 if (!subnormal) [[ZMIJ_LIKELY]]
714732 return dec;
715733 while (dec.sig < (num_bits == 64 ? uint64_t (1e16 ) : uint64_t (1e8 ))) {
716734 dec.sig *= 10 ;
717735 --dec.exp ;
718736 }
737+ dec.set_div10 (dec.sig / 10 );
719738 return dec;
720739}
721740
722741template <bool subnormal = false , typename UInt>
723742auto to_decimal_schubfach (UInt bin_sig, int64_t bin_exp, bool regular) noexcept
724- -> zmij::dec_fp {
743+ -> to_decimal_result_t {
725744 constexpr int num_bits = std::numeric_limits<UInt>::digits;
726745 int dec_exp = compute_dec_exp (bin_exp, regular);
727746 unsigned char exp_shift = compute_exp_shift<num_bits>(bin_exp, dec_exp);
@@ -745,9 +764,13 @@ auto to_decimal_schubfach(UInt bin_sig, int64_t bin_exp, bool regular) noexcept
745764
746765 // The idea of using a single shorter candidate is by Cassio Neri.
747766 // It is less or equal to the upper bound by construction.
748- UInt shorter = 10 * ((upper >> bound_shift) / 10 );
749- if ((shorter << bound_shift) >= lower)
750- return normalize<num_bits>({int64_t (shorter), dec_exp}, subnormal);
767+ long long div10 = (upper >> bound_shift) / 10 ;
768+ UInt shorter = div10 * 10 ;
769+ if ((shorter << bound_shift) >= lower) {
770+ to_decimal_result_t result = {int64_t (shorter), dec_exp};
771+ result.set_div10 (div10);
772+ return normalize<num_bits>(result, subnormal);
773+ }
751774
752775 UInt scaled_sig =
753776 umulhi_inexact_to_odd (pow10.hi , pow10.lo , bin_sig_shifted << exp_shift);
@@ -761,15 +784,18 @@ auto to_decimal_schubfach(UInt bin_sig, int64_t bin_exp, bool regular) noexcept
761784 bool below_closer = cmp < 0 || (cmp == 0 && (longer_below & 1 ) == 0 );
762785 bool below_in = (longer_below << bound_shift) >= lower;
763786 UInt dec_sig = (below_closer & below_in) ? longer_below : longer_above;
764- return normalize<num_bits>({int64_t (dec_sig), dec_exp}, subnormal);
787+ to_decimal_result_t result = {int64_t (dec_sig), dec_exp};
788+ result.set_div10 (dec_sig / 10 );
789+ return normalize<num_bits>(result, subnormal);
765790}
766791
767792// Here be 🐉s.
768793// Converts a binary FP number bin_sig * 2**bin_exp to the shortest decimal
769794// representation, where bin_exp = raw_exp - exp_offset.
770795template <typename Float, typename UInt>
771796ZMIJ_INLINE auto to_decimal_normal (UInt bin_sig, int64_t raw_exp,
772- bool regular) noexcept -> zmij::dec_fp {
797+ bool regular) noexcept
798+ -> to_decimal_result_t {
773799 using traits = float_traits<Float>;
774800 int64_t bin_exp = raw_exp - traits::exp_offset;
775801 constexpr int num_bits = std::numeric_limits<UInt>::digits;
@@ -799,16 +825,13 @@ ZMIJ_INLINE auto to_decimal_normal(UInt bin_sig, int64_t raw_exp,
799825 if (cmp == 0 ) [[ZMIJ_UNLIKELY]]
800826 break ;
801827
802- uint64_t digit;
803- if (ZMIJ_USE_INT128) {
804- // An optimization of integral % 10 by Dougall Johnson.
805- // Relies on range calculation: (max_bin_sig << max_exp_shift) * max_u128.
806- digit = integral - umul128_hi64 (integral, div10_sig64) * 10 ;
807- // or it narrows to 32-bit and doesn't use madd/msub
808- ZMIJ_ASM ((" " : " +r" (digit)));
809- } else {
810- digit = integral % 10 ;
811- }
828+ // An optimization of integral % 10 by Dougall Johnson.
829+ // Relies on range calculation: (max_bin_sig << max_exp_shift) * max_u128.
830+ long long div10 =
831+ ZMIJ_USE_INT128 ? umul128_hi64 (integral, div10_sig64) : integral / 10 ;
832+ uint64_t digit = integral - div10 * 10 ;
833+ // or it narrows to 32-bit and doesn't use madd/msub
834+ ZMIJ_ASM ((" " : " +r" (digit)));
812835
813836 // Switch to a fixed-point representation with the least significant
814837 // integral digit in the upper bits and fractional digits in the lower bits.
@@ -863,7 +886,9 @@ ZMIJ_INLINE auto to_decimal_normal(UInt bin_sig, int64_t raw_exp,
863886 }
864887 shorter += round_up * 10 ;
865888 bool use_shorter = (scaled_sig_mod10 <= scaled_half_ulp) + round_up != 0 ;
866- return {use_shorter ? shorter : longer, dec_exp};
889+ to_decimal_result_t result = {use_shorter ? shorter : longer, dec_exp};
890+ result.set_div10 (div10 + use_shorter * round_up);
891+ return result;
867892 }
868893 return to_decimal_schubfach (bin_sig, bin_exp, regular);
869894}
@@ -877,7 +902,7 @@ inline auto to_decimal(double value) noexcept -> dec_fp {
877902 auto bits = traits::to_bits (value);
878903 auto bin_exp = traits::get_exp (bits); // binary exponent
879904 auto bin_sig = traits::get_sig (bits); // binary significand
880- dec_fp dec;
905+ to_decimal_result_t dec;
881906 if (bin_exp == 0 || bin_exp == traits::exp_mask) [[ZMIJ_UNLIKELY]] {
882907 if (bin_exp != 0 ) return {0 , int (~0u >> 1 )};
883908 if (bin_sig == 0 ) return {0 , 0 };
@@ -903,7 +928,7 @@ auto write(Float value, char* buffer) noexcept -> char* {
903928 *buffer = ' -' ;
904929 buffer += traits::is_negative (bits);
905930
906- dec_fp dec;
931+ to_decimal_result_t dec;
907932 if (bin_exp == 0 || bin_exp == traits::exp_mask) [[ZMIJ_UNLIKELY]] {
908933 if (bin_exp != 0 ) {
909934 memcpy (buffer, bin_sig == 0 ? " inf" : " nan" , 4 );
@@ -925,7 +950,7 @@ auto write(Float value, char* buffer) noexcept -> char* {
925950 if (traits::num_bits == 64 ) {
926951 bool has17digits = dec.sig >= uint64_t (1e16 );
927952 dec_exp += traits::max_digits10 - 2 + has17digits;
928- buffer = write_significand17 (buffer, dec.sig , has17digits);
953+ buffer = write_significand17 (buffer, dec.sig , has17digits, dec. sig_div10 );
929954 } else {
930955 if (dec.sig < uint32_t (1e7 )) [[ZMIJ_UNLIKELY]] {
931956 dec.sig *= 10 ;
0 commit comments