Skip to content

RyuJIT: missed opportunity for LICM #6666

@pgavlin

Description

@pgavlin

The following program contains a number of hoistable expressions:

class C
{
    static int M(int searchBound, int targetNumber)
    {
        int nresults = 0;

        for (int i = 0; i <= searchBound; i++)
        {
            for (int j = 0; j >= (-1 * searchBound); j--)
            {
                for (int k = (-1) * searchBound; k <= searchBound; k++)
                {
                    if ((i * i * i) + (j * j * j) + (k * k * k) == targetNumber)
                    {
                        nresults++;
                    }
                }
            }
        }

        return nresults;
    }

    static int Main()
    {
        return M(1000, 9);
    }
}

Layout optimization changes the flow graph for M to the following:

int nresults = 0;
int i = 0;
if (i <= searchBound)
{
    do
    {
        int j = 0;
        if (-searchBound <= j)
        {
            do
            {
                int k = -searchBound;
                if (k <= searchBound)
                {
                    do
                    {
                        if ((i * i * i) + (j * j * j) + (k * k * k) == targetNumber)
                            nresults++;
                        k++;
                    } while (k <= searchBound);
                }
                j++;
            } while (-searchBound <= j);
        }
        i++;
    } while (i <= searchBound);
}

LICM then walks the loops from outer- to innermost and performs the following hoists:

  1. The occurrence of -searchBound in if (searchBound <= j) is hoisted out of the outermost loop
  2. The occurrence of (i * i * i) + (j * j * j) is hoisted out of the innermost loop

Notably, LICM fails to hoist (i * i * i) out of the second loop once it has been placed there by (2).

These transformations give the following structure:

int nresults = 0;
int i = 0;
if (i <= searchBound)
{
    (-searchBound);
    do
    {
        int j = 0;
        if (-searchBound <= j)
        {
            do
            {
                int k = -searchBound;
                if (k <= searchBound)
                {
                    ((i * i * i) + (j * j * j));
                    do
                    {
                        if ((i * i * i) + (j * j * j) + (k * k * k) == targetNumber)
                            nresults++;
                        k++;
                    } while (k <= searchBound);
                }
                j++;
            } while (-searchBound <= j);
        }
        i++;
    } while (i <= searchBound);
}

Once CSE cleans things up, we end up with:

int nresults = 0;
int i = 0;
if (i <= searchBound)
{
    int cse1 = -searchBound;
    do
    {
        int j = 0;
        if (cse1 <= j)
        {
            do
            {
                int k = cse1;
                if (k <= searchBound)
                {
                    int cse0 = (i * i * i) + (j * j * j);
                    do
                    {
                        if (cse0 + (k * k * k) == targetNumber)
                            nresults++;
                        k++;
                    } while (k <= searchBound);
                }
                j++;
            } while (cse1 <= j);
        }
        i++;
    } while (i <= searchBound);
}

And the generated assembly, for good measure:

; Assembly listing for method C:M(int,int):int
; Emitting BLENDED_CODE for X64 CPU with SSE2
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
;  V00 arg0         [V00,T02] (  7,  88  )     int  ->  rcx        
;  V01 arg1         [V01,T04] (  3,  66  )     int  ->  rdx        
;  V02 loc0         [V02,T05] (  4,  66  )     int  ->  rax        
;  V03 loc1         [V03,T06] (  7,  61  )     int  ->   r8        
;  V04 loc2         [V04,T01] (  7, 100  )     int  ->  r10        
;  V05 loc3         [V05,T00] (  8, 416  )     int  ->  r11        
;# V06 OutArgs      [V06    ] (  1,   1  )  lclBlk ( 0) [rsp+0x00]  
;  V07 cse0         [V07,T03] (  2,  80  )     int  ->  rsi        
;  V08 cse1         [V08,T07] (  4,  37  )     int  ->   r9        
;
; Lcl frame size = 0

G_M28799_IG01:
       57                   push     rdi
       56                   push     rsi

G_M28799_IG02:
       33C0                 xor      eax, eax
       4533C0               xor      r8d, r8d
       85C9                 test     ecx, ecx
       7C59                 jl       SHORT G_M28799_IG09
       448BC9               mov      r9d, ecx
       41F7D9               neg      r9d

G_M28799_IG03:
       4533D2               xor      r10d, r10d
       4585C9               test     r9d, r9d
       7F43                 jg       SHORT G_M28799_IG08

G_M28799_IG04:
       458BD9               mov      r11d, r9d
       443BD9               cmp      r11d, ecx
       7F33                 jg       SHORT G_M28799_IG07
       418BF0               mov      esi, r8d
       410FAFF0             imul     esi, r8d
       410FAFF0             imul     esi, r8d
       418BFA               mov      edi, r10d
       410FAFFA             imul     edi, r10d
       410FAFFA             imul     edi, r10d
       03F7                 add      esi, edi

G_M28799_IG05:
       418BFB               mov      edi, r11d
       410FAFFB             imul     edi, r11d
       410FAFFB             imul     edi, r11d
       03FE                 add      edi, esi
       3BFA                 cmp      edi, edx
       7502                 jne      SHORT G_M28799_IG06
       FFC0                 inc      eax

G_M28799_IG06:
       41FFC3               inc      r11d
       443BD9               cmp      r11d, ecx
       7EE5                 jle      SHORT G_M28799_IG05

G_M28799_IG07:
       41FFCA               dec      r10d
       453BCA               cmp      r9d, r10d
       7EBD                 jle      SHORT G_M28799_IG04

G_M28799_IG08:
       41FFC0               inc      r8d
       443BC1               cmp      r8d, ecx
       7EAD                 jle      SHORT G_M28799_IG03

G_M28799_IG09:
       5E                   pop      rsi
       5F                   pop      rdi
       C3                   ret      

; Total bytes of code 103, prolog size 2 for method C:M(int,int):int
; ============================================================

cc @JosephTremoulet

category:cq
theme:loop-opt
skill-level:expert
cost:medium

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclrCLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMIenhancementProduct code improvement that does NOT require public API changes/additionsoptimizationtenet-performancePerformance related issue

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions