-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Description
Arm64 provides branchless conditional selection and comparison instructions that should be utilized by RyuJIT in the code it generates.
Reference: https://eclecticlight.co/2021/07/20/code-in-arm-assembly-conditions-without-branches/
RyuJIT already has support for them as seen below:
runtime/src/coreclr/jit/instrsarm64.h
Lines 1353 to 1375 in f0b7773
| INST1(csel, "csel", 0, IF_DR_3D, 0x1A800000) | |
| // csel Rd,Rn,Rm,cond DR_3D X0011010100mmmmm cccc00nnnnnddddd 1A80 0000 cond | |
| INST1(csinc, "csinc", 0, IF_DR_3D, 0x1A800400) | |
| // csinc Rd,Rn,Rm,cond DR_3D X0011010100mmmmm cccc01nnnnnddddd 1A80 0400 cond | |
| INST1(csinv, "csinv", 0, IF_DR_3D, 0x5A800000) | |
| // csinv Rd,Rn,Rm,cond DR_3D X1011010100mmmmm cccc00nnnnnddddd 5A80 0000 cond | |
| INST1(csneg, "csneg", 0, IF_DR_3D, 0x5A800400) | |
| // csneg Rd,Rn,Rm,cond DR_3D X1011010100mmmmm cccc01nnnnnddddd 5A80 0400 cond | |
| INST1(cinc, "cinc", 0, IF_DR_2D, 0x1A800400) | |
| // cinc Rd,Rn,cond DR_2D X0011010100nnnnn cccc01nnnnnddddd 1A80 0400 cond | |
| INST1(cinv, "cinv", 0, IF_DR_2D, 0x5A800000) | |
| // cinv Rd,Rn,cond DR_2D X1011010100nnnnn cccc00nnnnnddddd 5A80 0000 cond | |
| INST1(cneg, "cneg", 0, IF_DR_2D, 0x5A800400) | |
| // cneg Rd,Rn,cond DR_2D X1011010100nnnnn cccc01nnnnnddddd 5A80 0400 cond | |
| INST1(cset, "cset", 0, IF_DR_1D, 0x1A9F07E0) | |
| // cset Rd,cond DR_1D X001101010011111 cccc0111111ddddd 1A9F 07E0 Rd cond |
runtime/src/coreclr/jit/instrsarm64.h
Lines 633 to 639 in f0b7773
| INST2(ccmp, "ccmp", CMP, IF_EN2F, 0x7A400000, 0x7A400800) | |
| // ccmp Rn,Rm, nzcv,cond DR_2I X1111010010mmmmm cccc00nnnnn0nzcv 7A40 0000 nzcv, cond | |
| // ccmp Rn,imm5,nzcv,cond DI_1F X1111010010iiiii cccc10nnnnn0nzcv 7A40 0800 imm5, nzcv, cond | |
| INST2(ccmn, "ccmn", CMP, IF_EN2F, 0x3A400000, 0x3A400800) | |
| // ccmn Rn,Rm, nzcv,cond DR_2I X0111010010mmmmm cccc00nnnnn0nzcv 3A40 0000 nzcv, cond | |
| // ccmn Rn,imm5,nzcv,cond DI_1F X0111010010iiiii cccc10nnnnn0nzcv 3A40 0800 imm5, nzcv, cond |
Currently, the methods emitIns_R_R_R_COND and emitIns_R_I_FLAGS_COND that produce these instructions are barely utilized: emitIns_R_I_FLAGS_COND is not used at all, and emitIns_R_R_R_COND was only recently used in #66407 to generate the csneg instruction. Once these instructions are used more widely, we could produce much better code. Below are some examples:
Example# 1:
static void Test(uint op1, uint op2) {
if (op1 > 0 && op2 > 0) {
op1 = 5;
} else {
op1 = 10;
}
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/5ov9TKx6P
Current code:
G_M2878_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M2878_IG02:
cbz w0, G_M2878_IG04
;; bbWeight=1 PerfScore 1.00
G_M2878_IG03:
cbz w1, G_M2878_IG04
mov w0, #5
b G_M2878_IG05
;; bbWeight=0.50 PerfScore 1.25
G_M2878_IG04:
mov w0, #10
;; bbWeight=0.50 PerfScore 0.25
G_M2878_IG05:
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.00
G_M2878_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Example# 2:
static void Test(uint op1, uint op2) {
op1 = op1 > 0 ? 5 : 6;
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/GTnc4jjfG
Current code:
G_M9565_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M9565_IG02:
cmp w0, #0
bgt G_M9565_IG04
;; bbWeight=1 PerfScore 1.50
G_M9565_IG03:
mov w0, #6
b G_M9565_IG05
;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
mov w0, #5
;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.00
G_M9565_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Example# 3:
static void Test(uint op1, uint op2) {
op1 = (op1 > 0) ? 0 : 1;
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/GoqcsM1Tf
Current code:
G_M9565_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M9565_IG02:
cmp w0, #0
bgt G_M9565_IG04
;; bbWeight=1 PerfScore 1.50
G_M9565_IG03:
mov w0, #1
b G_M9565_IG05
;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
mov w0, wzr
;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.00
G_M9565_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Example# 4:
static void Test(uint op1, uint op2, uint xyz, uint def) {
op1 = op1 > 0 ? xyz : def;
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/1EfxPn48q
Current code:
G_M9565_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M9565_IG02:
cbnz w0, G_M9565_IG04
;; bbWeight=1 PerfScore 1.00
G_M9565_IG03:
b G_M9565_IG05
;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
mov w3, w2
;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
mov w0, w3
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.50
G_M9565_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Example# 5:
static void Test(int op1, int op2, int xyz, int def) {
op1 = ((op1 & op2) == 0) ? 5 : def;
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/fc3eddPx3
Current code:
G_M9565_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M9565_IG02:
tst w0, w1
beq G_M9565_IG04
;; bbWeight=1 PerfScore 1.50
G_M9565_IG03:
b G_M9565_IG05
;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
mov w3, #5
;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
mov w0, w3
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.50
G_M9565_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Some related issues:
- #6749 RyuJit: avoid conditional jumps using cmov and similar instructions
- #41549 RyuJIT: Optimize "X / POW2_CNS" via cmovns
- #43440 [RyuJIT][arm64] Optimize "x<0" and "x>=0"
Presumably, some parts of the analysis can be implemented in a platform-agnostic way and benefit both the Arm64 and x86 platforms.
category:cq
theme:intrinsics
skill-level:expert
cost:large
impact:medium
Metadata
Metadata
Assignees
Labels
Type
Projects
Status
