[RISC-V] Improve clamped subtract & increment#118530
[RISC-V] Improve clamped subtract & increment #118530 jakobbotsch merged 7 commits into dotnet:main from
Conversation
|
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch |
|
For future PRs: these constructs are good candidates for augmenting the if-conversion pass (#116581) |
|
Diffs are based on 172,775 contexts (51,182 MinOpts, 121,593 FullOpts). Overall (-256 bytes)
MinOpts (-8 bytes)
FullOpts (-248 bytes)
Example diffs
linux.riscv64.Checked.2.mch
-12 (-5.36%) : 38740.dasm - StackallocTests:Test20000_SkipLocalsInit(byref):System.Guid (FullOpts)
@@ -41,17 +41,14 @@ G_M62621_IG02: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0200 {s1}, byre
jalr a0 // <unknown method>
lui a0, 0xD1FFAB1E
addiw a0, a0, 0xD1FFAB1E
- sltu a1, sp, a0
- sub a0, sp, a0
- beqz a1, pc+8 (2 instructions)
- mv a0, zero
+ maxu a1, sp, a0
+ sub a0, a1, a0
lui a2, 0xD1FFAB1E
mv a1, sp
lw zero, 0xD1FFAB1E(a1)
sub a1, a1, a2
bgeu a1, a0, pc-8 (-2 instructions)
mv sp, a0
- mv a0, sp
lui t6, 0xD1FFAB1E
addiw t6, t6, 0xD1FFAB1E
slli t6, t6, 16
@@ -70,7 +67,7 @@ G_M62621_IG02: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0200 {s1}, byre
auipc ra, 0xD1FFAB1E
jalr ra // CORINFO_HELP_FAIL_FAST
; byrRegs -[s1]
- ;; size=140 bbWeight=1 PerfScore 62.50
+ ;; size=128 bbWeight=1 PerfScore 58.00
G_M62621_IG03: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
addi sp, fp, -24
ld s1, 40(sp)
@@ -81,7 +78,7 @@ G_M62621_IG03: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
RWD00 dq 9ABCDEF012345678h
-; Total bytes of code 224, prolog size 56, PerfScore 98.00, instruction count 42, allocated bytes for code 224 (MethodHash=43680b62) for method StackallocTests:Test20000_SkipLocalsInit(byref):System.Guid (FullOpts)
+; Total bytes of code 212, prolog size 56, PerfScore 93.50, instruction count 39, allocated bytes for code 212 (MethodHash=43680b62) for method StackallocTests:Test20000_SkipLocalsInit(byref):System.Guid (FullOpts)
; ============================================================
Unwind Info:
@@ -92,7 +89,7 @@ Unwind Info:
E bit : 0
X bit : 0
Vers : 0
- Function Length : 56 (0x00038) Actual length = 224 (0x0000e0)
+ Function Length : 53 (0x00035) Actual length = 212 (0x0000d4)
---- Epilog scopes ----
---- Scope 0
Epilog Start Offset : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-8 (-2.35%) : 124769.dasm - T:dirtyStack() (MinOpts)
@@ -61,18 +61,16 @@ G_M4777_IG04: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
mv a1, a0
addi a1, a1, 0xD1FFAB1E
andi a1, a1, -16
- sltu a2, sp, a1
- sub a1, sp, a1
- beqz a2, pc+8 (2 instructions)
- mv a1, zero
+ maxu a2, sp, a1
+ sub a1, a2, a1
lui a3, 0xD1FFAB1E
mv a2, sp
lw zero, 0xD1FFAB1E(a2)
sub a2, a2, a3
bgeu a2, a1, pc-8 (-2 instructions)
mv sp, a1
- mv a0, sp
- ;; size=88 bbWeight=1 PerfScore 25.50
+ mv a0, a1
+ ;; size=80 bbWeight=1 PerfScore 21.50
G_M4777_IG05: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
sd a0, -8(fp)
sw zero, -12(fp)
@@ -119,7 +117,7 @@ G_M4777_IG08: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
RWD00 dq 9ABCDEF012345678h
-; Total bytes of code 340, prolog size 52, PerfScore 123.00, instruction count 60, allocated bytes for code 340 (MethodHash=759ced56) for method T:dirtyStack() (MinOpts)
+; Total bytes of code 332, prolog size 52, PerfScore 119.00, instruction count 58, allocated bytes for code 332 (MethodHash=759ced56) for method T:dirtyStack() (MinOpts)
; ============================================================
Unwind Info:
@@ -130,7 +128,7 @@ Unwind Info:
E bit : 0
X bit : 0
Vers : 0
- Function Length : 85 (0x00055) Actual length = 340 (0x000154)
+ Function Length : 83 (0x00053) Actual length = 332 (0x00014c)
---- Epilog scopes ----
---- Scope 0
Epilog Start Offset : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-24 (-1.46%) : 599.dasm - (dynamicClass):IL_STUB_PInvoke(int,System.String,System.String,System.String):int (FullOpts)
@@ -139,18 +139,16 @@ G_M26182_IG05: ; bbWeight=0.50, gcrefRegs=C0200 {s1 s2 s3}, byrefRegs=000
mv a0, a2
addi a0, a0, 0xD1FFAB1E
andi a0, a0, -16
- sltu a1, sp, a0
- sub a0, sp, a0
- beqz a1, pc+8 (2 instructions)
- mv a0, zero
+ maxu a1, sp, a0
+ sub a0, a1, a0
lui a3, 0xD1FFAB1E
mv a1, sp
lw zero, 0xD1FFAB1E(a1)
sub a1, a1, a3
bgeu a1, a0, pc-8 (-2 instructions)
mv sp, a0
- mv a2, sp
- ;; size=116 bbWeight=0.50 PerfScore 19.00
+ mv a2, a0
+ ;; size=108 bbWeight=0.50 PerfScore 17.00
G_M26182_IG06: ; bbWeight=0.50, gcrefRegs=C0200 {s1 s2 s3}, byrefRegs=0000 {}, byref
sd a2, -24(fp)
;; size=4 bbWeight=0.50 PerfScore 2.00
@@ -208,18 +206,16 @@ G_M26182_IG09: ; bbWeight=0.50, gcrefRegs=C0000 {s2 s3}, byrefRegs=0000 {
mv a0, a2
addi a0, a0, 0xD1FFAB1E
andi a0, a0, -16
- sltu a1, sp, a0
- sub a0, sp, a0
- beqz a1, pc+8 (2 instructions)
- mv a0, zero
+ maxu a1, sp, a0
+ sub a0, a1, a0
lui a3, 0xD1FFAB1E
mv a1, sp
lw zero, 0xD1FFAB1E(a1)
sub a1, a1, a3
bgeu a1, a0, pc-8 (-2 instructions)
mv sp, a0
- mv a2, sp
- ;; size=116 bbWeight=0.50 PerfScore 19.00
+ mv a2, a0
+ ;; size=108 bbWeight=0.50 PerfScore 17.00
G_M26182_IG10: ; bbWeight=0.50, gcrefRegs=C0000 {s2 s3}, byrefRegs=0000 {}, byref
sd a2, -40(fp)
;; size=4 bbWeight=0.50 PerfScore 2.00
@@ -277,18 +273,16 @@ G_M26182_IG13: ; bbWeight=0.50, gcrefRegs=80000 {s3}, byrefRegs=0000 {},
mv a0, a2
addi a0, a0, 0xD1FFAB1E
andi a0, a0, -16
- sltu a1, sp, a0
- sub a0, sp, a0
- beqz a1, pc+8 (2 instructions)
- mv a0, zero
+ maxu a1, sp, a0
+ sub a0, a1, a0
lui a3, 0xD1FFAB1E
mv a1, sp
lw zero, 0xD1FFAB1E(a1)
sub a1, a1, a3
bgeu a1, a0, pc-8 (-2 instructions)
mv sp, a0
- mv a2, sp
- ;; size=116 bbWeight=0.50 PerfScore 19.00
+ mv a2, a0
+ ;; size=108 bbWeight=0.50 PerfScore 17.00
G_M26182_IG14: ; bbWeight=0.50, gcrefRegs=80000 {s3}, byrefRegs=0000 {}, byref
sd a2, -56(fp)
;; size=4 bbWeight=0.50 PerfScore 2.00
@@ -556,7 +550,7 @@ G_M26182_IG34: ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
RWD00 dq 9ABCDEF012345678h
-; Total bytes of code 1640, prolog size 116, PerfScore 396.75, instruction count 311, allocated bytes for code 1640 (MethodHash=57cb99b9) for method (dynamicClass):IL_STUB_PInvoke(int,System.String,System.String,System.String):int (FullOpts)
+; Total bytes of code 1616, prolog size 116, PerfScore 390.75, instruction count 305, allocated bytes for code 1616 (MethodHash=57cb99b9) for method (dynamicClass):IL_STUB_PInvoke(int,System.String,System.String,System.String):int (FullOpts)
; ============================================================
Unwind Info:
@@ -567,7 +561,7 @@ Unwind Info:
E bit : 0
X bit : 0
Vers : 0
- Function Length : 336 (0x00150) Actual length = 1344 (0x000540)
+ Function Length : 330 (0x0014a) Actual length = 1320 (0x000528)
---- Epilog scopes ----
---- Scope 0
Epilog Start Offset : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
+0 (0.00%) : 172672.dasm - Microsoft.Diagnostics.Tracing.ZippedETLWriter:GetNGenPdbs(System.String,Microsoft.Diagnostics.Symbols.SymbolReader,System.IO.TextWriter):System.Collections.Generic.List`1[System.String] (FullOpts)
No diffs found?
+0 (0.00%) : 171872.dasm - Microsoft.Diagnostics.Symbols.SymbolReader:BypassSystem32FileRedirection(System.String):System.String (FullOpts)
No diffs found?
+0 (0.00%) : 171504.dasm - JIT.HardwareIntrinsics.X86._Avx512DQ.UnaryOpTest__ConvertToVector512Int64DoubletoInt64ToPositiveInfinity+TestStruct:Create():JIT.HardwareIntrinsics.X86._Avx512DQ.UnaryOpTest__ConvertToVector512Int64DoubletoInt64ToPositiveInfinity+TestStruct (MinOpts)
No diffs found?
Details
Size improvements/regressions per collection
PerfScore improvements/regressions per collection
Context information
jit-analyze output |
|
@risc-vv /run |
RISC-V pull_request-CLR-QEMU: 9092 / 9131 (99.57%)report.xml, report.md, failures.xml, testclr_details.tar.zst RISC-V pull_request-CLR-VF2: 9091 / 9130 (99.57%)report.xml, report.md, failures.xml, testclr_details.tar.zst RISC-V pull_request-FX-QEMU: 0 / 1 (0.00%)report.xml, report.md, failures.xml, testclr_details.tar.zst RISC-V pull_request-FX-VF2: 0 / 62 (0.00%)report.xml, report.md, failures.xml, testclr_details.tar.zst Build information and commandsGIT: |
1840916 to
b1572f8
Compare
|
@risc-vv /run |
RISC-V pull_request-CLR-QEMU: 9101 / 9131 (99.67%)report.xml, report.md, failures.xml, testclr_details.tar.zst RISC-V pull_request-CLR-VF2: 9101 / 9131 (99.67%)report.xml, report.md, failures.xml, testclr_details.tar.zst RISC-V pull_request-FX-QEMU: 0 / 0 (100.00%)report.xml, report.md, failures.xml, testclr_details.tar.zst RISC-V pull_request-FX-VF2: 0 / 58 (0.00%)report.xml, report.md, failures.xml, testclr_details.tar.zst Build information and commandsGIT: |
|
We will review .NET 11 PRs once we are done with the last-minute .NET 10 work and are less busy. Please give us a few more weeks. |
Improve codegen for the hardcoded subtract clamped to 0 (in localloc) and increment clamped to maximum (in inc_saturate).
Part of #84834, cc @dotnet/samsung