Skip to content

Commit e9bf7b3

Browse files
committed
Apply pow10 compression method by Dougall Johnson
1 parent b9487ea commit e9bf7b3

File tree

1 file changed

+71
-44
lines changed

1 file changed

+71
-44
lines changed

zmij.cc

Lines changed: 71 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -337,51 +337,81 @@ constexpr auto floor_log2_pow10(int e) noexcept -> int {
337337
return e * 1741647 >> 19;
338338
}
339339

340-
constexpr uint64_t powers_of_5[] = {
341-
0x0000000000000001, 0x0000000000000005, 0x0000000000000019,
342-
0x000000000000007d, 0x0000000000000271, 0x0000000000000c35,
343-
0x0000000000003d09, 0x000000000001312d, 0x000000000005f5e1,
344-
0x00000000001dcd65, 0x00000000009502f9, 0x0000000002e90edd,
345-
0x000000000e8d4a51, 0x0000000048c27395, 0x000000016bcc41e9,
346-
0x000000071afd498d, 0x0000002386f26fc1, 0x000000b1a2bc2ec5,
347-
0x000003782dace9d9, 0x00001158e460913d, 0x000056bc75e2d631,
348-
0x0001b1ae4d6e2ef5, 0x000878678326eac9, 0x002a5a058fc295ed,
349-
0x00d3c21bcecceda1, 0x0422ca8b0a00a425, 0x14adf4b7320334b9};
350-
351340
// 128-bit significands of powers of 10 rounded down.
352341
// Generated using 192-bit arithmetic method by Dougall Johnson.
353342
struct pow10_significands_table {
354343
static constexpr bool compress = ZMIJ_OPTIMIZE_SIZE != 0;
355344
static constexpr bool split_tables = !compress && ZMIJ_AARCH64 != 0;
356345
static constexpr int num_pow10 = 617;
357-
static constexpr int compression_ratio = compress ? 27 : 1;
358-
uint64_t data[(num_pow10 / compression_ratio + compress) * 2] = {};
346+
uint64_t data[compress ? 1 : num_pow10 * 2] = {};
359347

360348
ZMIJ_CONSTEXPR auto operator[](int dec_exp) const noexcept -> uint128 {
361349
constexpr int dec_exp_min = -292;
362350
if (compress) {
363-
int base_index = (dec_exp - dec_exp_min) / compression_ratio;
364-
int base_dec_exp = base_index * compression_ratio + dec_exp_min;
365-
int offset = dec_exp - base_dec_exp;
366-
367-
uint64_t base_hi = data[base_index * 2];
368-
uint64_t base_lo = data[base_index * 2 + 1];
369-
if (offset == 0) return {base_hi, base_lo};
370-
371-
int shift = floor_log2_pow10(base_dec_exp + offset) -
372-
floor_log2_pow10(base_dec_exp) - offset;
373-
374-
uint64_t pow5 = powers_of_5[offset];
375-
uint128_t p_hi = umul128(base_hi, pow5);
376-
uint128_t p_lo = umul128(base_lo, pow5);
377-
378-
uint64_t r_hi = uint64_t(p_hi >> 64);
379-
uint64_t r_lo = uint64_t(p_hi) + uint64_t(p_lo >> 64);
380-
if (r_lo < uint64_t(p_hi)) ++r_hi;
351+
unsigned i = dec_exp - dec_exp_min;
352+
// 672 bytes of data
353+
static const uint64_t pow10s[28] = {
354+
0x8000000000000000, 0xa000000000000000, 0xc800000000000000,
355+
0xfa00000000000000, 0x9c40000000000000, 0xc350000000000000,
356+
0xf424000000000000, 0x9896800000000000, 0xbebc200000000000,
357+
0xee6b280000000000, 0x9502f90000000000, 0xba43b74000000000,
358+
0xe8d4a51000000000, 0x9184e72a00000000, 0xb5e620f480000000,
359+
0xe35fa931a0000000, 0x8e1bc9bf04000000, 0xb1a2bc2ec5000000,
360+
0xde0b6b3a76400000, 0x8ac7230489e80000, 0xad78ebc5ac620000,
361+
0xd8d726b7177a8000, 0x878678326eac9000, 0xa968163f0a57b400,
362+
0xd3c21bcecceda100, 0x84595161401484a0, 0xa56fa5b99019a5c8,
363+
0xcecb8f27f4200f3a,
364+
};
365+
366+
static const uint128 high_parts[23] = {
367+
{0xaf8e5410288e1b6f, 0x07ecf0ae5ee44dda},
368+
{0xb1442798f49ffb4a, 0x99cd11cfdf41779d},
369+
{0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15},
370+
{0xb4bca50b065abe63, 0x0fed077a756b53aa},
371+
{0xb67f6455292cbf08, 0x1a3bc84c17b1d543},
372+
{0xb84687c269ef3bfb, 0x3d5d514f40eea742},
373+
{0xba121a4650e4ddeb, 0x92f34d62616ce413},
374+
{0xbbe226efb628afea, 0x890489f70a55368c},
375+
{0xbdb6b8e905cb600f, 0x5400e987bbc1c921},
376+
{0xbf8fdb78849a5f96, 0xde98520472bdd034},
377+
{0xc16d9a0095928a27, 0x75b7053c0f178294},
378+
{0xc350000000000000, 0x0000000000000000},
379+
{0xc5371912364ce305, 0x6c28000000000000},
380+
{0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef6},
381+
{0xc913936dd571c84c, 0x03bc3a19cd1e38ea},
382+
{0xcb090c8001ab551c, 0x5cadf5bfd3072cc6},
383+
{0xcd036837130890a1, 0x36dba887c37a8c10},
384+
{0xcf02b2c21207ef2e, 0x94f967e45e03f4bc},
385+
{0xd106f86e69d785c7, 0xe13336d701beba52},
386+
{0xd31045a8341ca07c, 0x1ede48111209a051},
387+
{0xd51ea6fa85785631, 0x552a74227f3ea566},
388+
{0xd732290fbacaf133, 0xa97c177947ad4096},
389+
{0xd94ad8b1c7380874, 0x18375281ae7822bc},
390+
};
391+
392+
uint32_t fixups[20] = {0x05271b1f, 0x00000c20, 0x00003200, 0x12100020,
393+
0x00000000, 0x06000000, 0xc16409c0, 0xaf26700f,
394+
0xeb987b07, 0x0000000d, 0x00000000, 0x66fbfffe,
395+
0xb74100ec, 0xa0669fe8, 0xedb21280, 0x00000686,
396+
0x0a021200, 0x29b89c20, 0x08bc0eda, 0x00000000};
397+
398+
uint64_t m = pow10s[(i + 11) % 28];
399+
uint128 h = high_parts[(i + 11) / 28];
400+
401+
uint64_t h1 = umul128_hi64(h.lo, m);
402+
403+
uint64_t c0 = h.lo * m;
404+
uint64_t c1 = h1 + h.hi * m;
405+
uint64_t c2 = (c1 < h1) + umul128_hi64(h.hi, m);
406+
407+
uint128 result;
408+
if (c2 >> 63)
409+
result = uint128{c2, c1};
410+
else
411+
result = uint128{(c2 << 1) | (c1 >> 63), (c1 << 1) | (c0 >> 63)};
381412

382-
uint64_t lo = ((uint64_t(p_lo) >> shift) | (r_lo << (64 - shift))) + 1;
383-
uint64_t hi = (r_lo >> shift) | (r_hi << (64 - shift));
384-
return {lo != 0 ? hi : hi + 1, lo};
413+
result.lo -= (fixups[i >> 5] >> (i & 31)) & 1;
414+
return result;
385415
}
386416
if (!split_tables) {
387417
int index = (dec_exp - dec_exp_min) * 2;
@@ -406,16 +436,13 @@ struct pow10_significands_table {
406436
uint192 current = {0xe000000000000000, 0x25e8e89c13bb0f7a,
407437
0xff77b1fcbebcdc4f};
408438
uint64_t ten = 0xa000000000000000;
409-
for (int i = 0; i < num_pow10; ++i) {
410-
if (i % compression_ratio == 0) {
411-
int index = i / compression_ratio;
412-
if (split_tables) {
413-
data[num_pow10 - index - 1] = current.w2;
414-
data[num_pow10 * 2 - index - 1] = current.w1;
415-
} else {
416-
data[index * 2] = current.w2;
417-
data[index * 2 + 1] = current.w1;
418-
}
439+
for (int i = 0; i < num_pow10 && !compress; ++i) {
440+
if (split_tables) {
441+
data[num_pow10 - i - 1] = current.w2;
442+
data[num_pow10 * 2 - i - 1] = current.w1;
443+
} else {
444+
data[i * 2] = current.w2;
445+
data[i * 2 + 1] = current.w1;
419446
}
420447

421448
uint64_t h0 = umul128_hi64(current.w0, ten);

0 commit comments

Comments
 (0)