Extend redundant zero init optimization to recognize assignments to GT_OBJ(lcl_addr) #38314
Merged
erozenfeld merged 1 commit into dotnet:master on Jun 24, 2020
Merged
Conversation
Member
Author
|
Diff for the example from #38070:
; Assembly listing for method DoubleZero:SingleMethod():System.Threading.Tasks.ValueTask`1[ReadResult]:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd
; V01 RetBuf [V01,T00] ( 4, 4 ) byref -> rsi
-; V02 loc0 [V02 ] ( 4, 4 ) struct (48) [rsp+0x20] do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
+; V02 loc0 [V02 ] ( 3, 3 ) struct (48) [rsp+0x20] do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
; V03 OutArgs [V03 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
;* V04 tmp1 [V04 ] ( 0, 0 ) struct (40) zero-ref do-not-enreg[SB] ld-addr-op "Inline ldloca(s) first use temp"
;
; Lcl frame size = 80
G_M9604_IG01:
push rsi
sub rsp, 80
- vzeroupper
vxorps xmm4, xmm4
vmovdqa xmmword ptr [rsp+20H], xmm4
vmovdqa xmmword ptr [rsp+30H], xmm4
vmovdqa xmmword ptr [rsp+40H], xmm4
mov rsi, rdx
- ;; bbWeight=1 PerfScore 5.83
+ ;; bbWeight=1 PerfScore 4.83
G_M9604_IG02:
- xor ecx, ecx
- vxorps xmm0, xmm0
- vmovdqu xmmword ptr [rsp+28H], xmm0
- vmovdqu xmmword ptr [rsp+38H], xmm0
- mov qword ptr [rsp+48H], rcx
mov dword ptr [rsp+20H], -1
lea rcx, [rsp+20H]
call System.Runtime.CompilerServices.AsyncMethodBuilderCore:Start(byref)
lea rcx, bword ptr [rsp+28H]
mov rdx, rsi
call System.Runtime.CompilerServices.AsyncValueTaskMethodBuilder`1[ReadResult][DoubleZero+ReadResult]:get_Task():System.Threading.Tasks.ValueTask`1[ReadResult]:this
mov rax, rsi
- ;; bbWeight=1 PerfScore 8.08
+ ;; bbWeight=1 PerfScore 4.50
G_M9604_IG03:
add rsp, 80
pop rsi
ret
;; bbWeight=1 PerfScore 1.75
-; Total bytes of code 96, prolog size 30, PerfScore 25.97, (MethodHash=85efda7b) for method DoubleZero:SingleMethod():System.Threading.Tasks.ValueTask`1[ReadResult]:this
+; Total bytes of code 70, prolog size 27, PerfScore 18.48, (MethodHash=85efda7b) for method DoubleZero:SingleMethod():System.Threading.Tasks.ValueTask`1[ReadResult]:this
; ============================================================
|
Member
Author
|
@AndyAyersMS PTAL /cc @dotnet/jit-contrib |
Member
|
Could also be that PMI is not instantiating enough interesting value classes to show broad applicability here -- you might experiment with adding a value class with a GC ref field to the "types to try" array. |
AndyAyersMS
approved these changes
Jun 24, 2020
Member
AndyAyersMS
left a comment
There was a problem hiding this comment.
LGTM. Left a couple of suggestions.
src/coreclr/src/jit/optimizer.cpp
Outdated
Member
There was a problem hiding this comment.
Should we check for GT_BLK too?
Member
Author
There was a problem hiding this comment.
Good suggestion, thanks.
…`GT_OBJ(lcl_addr)` and `GT_BLK(lcl_addr)`. Fixes dotnet#38070.
6ca7b19 to
9d35b6d
Compare
Member
Author
|
There are a few diffs after adding x64 framework pmi: |
Member
Author
|
One of the framework diffs:
; Assembly listing for method TimeSpanParse:TryParse(ReadOnlySpan`1,IFormatProvider,byref):bool
G_M4265_IG01:
push rsi
sub rsp, 80
vzeroupper
vxorps xmm4, xmm4
vmovdqa xmmword ptr [rsp+30H], xmm4
vmovdqa xmmword ptr [rsp+40H], xmm4
mov rsi, r8
;; bbWeight=1 PerfScore 4.83
G_M4265_IG02:
mov r9, bword ptr [rcx]
mov r8d, dword ptr [rcx+8]
- xor eax, eax
- mov qword ptr [rsp+30H], rax
- mov byte ptr [rsp+38H], 0
lea rax, bword ptr [rsp+40H]
mov bword ptr [rax], r9
mov dword ptr [rax+8], r8d
G_M4265_IG03:
vmovdqu xmm0, xmmword ptr [rcx]
vmovdqu xmmword ptr [rsp+20H], xmm0
G_M4265_IG04:
lea rcx, bword ptr [rsp+20H]
lea r9, [rsp+30H]
mov r8, rdx
mov edx, 3
call TimeSpanParse:TryParseTimeSpan(ReadOnlySpan`1,ubyte,IFormatProvider,byref):bool
test eax, eax
je SHORT G_M4265_IG07
G_M4265_IG05:
mov rax, qword ptr [rsp+30H]
mov qword ptr [rsi], rax
mov eax, 1
G_M4265_IG06:
add rsp, 80
pop rsi
ret
G_M4265_IG07:
xor eax, eax
mov qword ptr [rsi], rax
G_M4265_IG08:
add rsp, 80
pop rsi
ret
-; Total bytes of code 125, prolog size 24, PerfScore 36.83, (MethodHash=8014ef56) for method TimeSpanParse:TryParse(ReadOnlySpan`1,IFormatProvider,byref):bool
+; Total bytes of code 113, prolog size 24, PerfScore 33.38, (MethodHash=8014ef56) for method TimeSpanParse:TryParse(ReadOnlySpan`1,IFormatProvider,byref):bool
|
Member
|
Not sure why that leg timed out, but all of the jobs completed correctly. So you can go ahead and merge despite that failure. |
Member
|
Thank you! 😍 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to subscribe to this conversation on GitHub.
Already have an account?
Sign in.
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Fixes #38070.
No diffs in frameworks and benchmarks. Diffs are expected once we start using a Roslyn build that includes @benadams's change (dotnet/roslyn#45262).