Skip to content

Clang/LLVM optimizes division and modulo worse than MSVC, part 2 #37565

@StephanTLavavej

Description

@StephanTLavavej
Bugzilla Link 38217
Version trunk
OS Windows NT
Attachments Test case, Clang codegen for workaround, Clang codegen for modulo (this is the bug), MSVC codegen for workaround, MSVC codegen for modulo (this is fine)
CC @topperc,@LebedevRI,@RKSimon,@nico,@rotateright

Extended Description

This appears to be a different bug than #37331 "Clang/LLVM optimizes division and modulo worse than MSVC" (which is probably a duplicate of #23480 "Division followed by modulo generates longer machine code than vice versa") because it involves modulo followed by division.

This affects the Ryu algorithm for printing floating-point numbers (https://github.com/ulfjack/ryu ) and therefore affects C++17 floating-point std::to_chars().

I observe that MSVC's codegen is unaffected by WORKAROUND, while Clang/LLVM generates less assembly code (which is faster when profiled in the real algorithm) for WORKAROUND.

Here's a Godbolt link demonstrating the codegen difference (this isn't Windows-specific): https://godbolt.org/g/uX1AD8

C:\Temp\TESTING_X64>cl
Microsoft (R) C/C++ Optimizing Compiler Version 19.16.26504 for x64
Copyright (C) Microsoft Corporation.  All rights reserved.

usage: cl [ option... ] filename... [ /link linkoption... ]

C:\Temp\TESTING_X64>clang-cl -m64 -v
clang version 6.0.0 (tags/RELEASE_600/final)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: S:\msvc\src\vctools\NonShip\ClangLLVM\bin

C:\Temp\TESTING_X64>type d2s.cpp
#include <stdint.h>
#include <string.h>

static const char DIGIT_TABLE[] =
"0001020304050607080910111213141516171819"
"2021222324252627282930313233343536373839"
"4041424344454647484950515253545556575859"
"6061626364656667686970717273747576777879"
"8081828384858687888990919293949596979899";

void d2s_buffered(uint64_t output, char * result) {
    uint32_t i = 0;

    while (output >= 10000) {
#ifdef WORKAROUND
        const uint32_t c = (uint32_t) (output - 10000 * (output / 10000));
#else
        const uint32_t c = (uint32_t) (output % 10000);
#endif

        output /= 10000;
        const uint32_t c0 = (c % 100) << 1;
        const uint32_t c1 = (c / 100) << 1;
        memcpy(result - i - 1, DIGIT_TABLE + c0, 2);
        memcpy(result - i - 3, DIGIT_TABLE + c1, 2);
        i += 4;
    }
}

C:\Temp\TESTING_X64>cl /EHsc /nologo /W4 /MT /O2 /c d2s.cpp /FAsc /Famsvc_workaround.cod /DWORKAROUND
d2s.cpp

C:\Temp\TESTING_X64>cl /EHsc /nologo /W4 /MT /O2 /c d2s.cpp /FAsc /Famsvc_modulo.cod
d2s.cpp

C:\Temp\TESTING_X64>git diff msvc_workaround.cod msvc_modulo.cod
diff --git a/msvc_workaround.cod b/msvc_modulo.cod
index 1be1419..aff234c 100644
--- a/msvc_workaround.cod
+++ b/msvc_modulo.cod
@@ -86,11 +86,11 @@ $LL2@d2s_buffer:

 ; 15   : #ifdef WORKAROUND
 ; 16   :         const uint32_t c = (uint32_t) (output - 10000 * (output / 10000));
+; 17   : #else^M
+; 18   :         const uint32_t c = (uint32_t) (output % 10000);^M

   00030        48 8b c7         mov     rax, rdi

-; 17   : #else
-; 18   :         const uint32_t c = (uint32_t) (output % 10000);
 ; 19   : #endif
 ; 20   :
 ; 21   :         output /= 10000;

C:\Temp\TESTING_X64>clang-cl -m64 /EHsc /nologo /W4 /MT /O2 /c d2s.cpp /FA /Faclang_workaround.asm /DWORKAROUND

C:\Temp\TESTING_X64>clang-cl -m64 /EHsc /nologo /W4 /MT /O2 /c d2s.cpp /FA /Faclang_modulo.asm

C:\Temp\TESTING_X64>git diff clang_workaround.asm clang_modulo.asm
diff --git a/clang_workaround.asm b/clang_modulo.asm
index 2a8cb49..6026638 100644
--- a/clang_workaround.asm
+++ b/clang_modulo.asm
@@ -29,19 +29,22 @@
        movq    %r9, %rax
        mulq    %r10
        shrq    $11, %rdx
-       imulq   $-10000, %rdx, %rax     # imm = 0xD8F0
-       addq    %r9, %rax
-       movl    %eax, %esi
-       imulq   $1374389535, %rsi, %rsi # imm = 0x51EB851F
-       shrq    $37, %rsi
-       imull   $100, %esi, %edi
-       subl    %edi, %eax
+       imulq   $10000, %rdx, %rax      # imm = 0x2710
+       movq    %r9, %rsi
+       subq    %rax, %rsi
+       imulq   $1374389535, %rsi, %rax # imm = 0x51EB851F
+       movq    %rax, %rdi
+       shrq    $37, %rdi
+       imull   $100, %edi, %edi
+       subl    %edi, %esi
+       shrq    $36, %rax
+       andl    $510, %eax              # imm = 0x1FE
        movl    %ecx, %edi
        movq    %r8, %rbx
        subq    %rdi, %rbx
-       movzwl  (%r11,%rax,2), %eax
-       movw    %ax, -1(%rbx)
-       movzwl  (%r11,%rsi,2), %eax
+       movzwl  (%r11,%rsi,2), %esi
+       movw    %si, -1(%rbx)
+       movzwl  (%rax,%r11), %eax
        movw    %ax, -3(%rbx)
        addl    $4, %ecx
        cmpq    $99999999, %r9          # imm = 0x5F5E0FF

Metadata

Metadata

Assignees

No one assigned

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions