Skip to content

[arm64] Volatile.Read/Write is 2x faster than "volatile" loads/stores #60232

@EgorBo

Description

@EgorBo

The following code (the same applies to volatile loads):

// Two instance fields declared with the C# `volatile` modifier, so every
// access to them in the method below is a volatile access.
private volatile int A;
private volatile int B;

// Writes the loop counter into both volatile fields on each of 1000 iterations.
// The stores are plain assignments; the volatile semantics come solely from
// the field declarations above.
public void SetValueInLoop_volatile()
{
    for (int i = 0; i < 1000; i++)
    {
        A = i; // volatile store (field is declared `volatile`)
        B = i; // volatile store (field is declared `volatile`)
    }
}
; Method VolatileKeyword:SetValueInLoop():this
; arm64 JIT output for the `volatile`-field loop: each store in the loop body
; is preceded by its own `dmb ish` barrier.
G_M8342_IG01:
            stp     fp, lr, [sp,#-16]!  ; prologue: save frame pointer and link register
            mov     fp, sp
G_M8342_IG02:
            mov     w1, wzr  ; w1 = 0 (loop counter)
G_M8342_IG03:
            dmb     ish ;; <--- full memory barrier 
            str     w1, [x0,#8]  ; store counter at x0+8 (presumably field A; x0 presumably `this`)
            dmb     ish ;; <--- full memory barrier 
            str     w1, [x0,#12]  ; store counter at x0+12 (presumably field B)
            add     w1, w1, #1  ; counter++
            cmp     w1, #1000
            blt     G_M8342_IG03  ; loop while counter < 1000
G_M8342_IG04:
            ldp     fp, lr, [sp],#16  ; epilogue: restore fp/lr
            ret     lr
; Total bytes of code: 48

is twice as slow as:

// Two ordinary (non-volatile) instance fields; volatile semantics are
// requested per access via Volatile.Write in the method below.
private int A;
private int B;

// Writes the loop counter into both fields on each of 1000 iterations,
// using Volatile.Write for every store instead of `volatile` fields.
public void SetValueInLoop_VolatileWrite()
{
    for (int i = 0; i < 1000; i++)
    {
        Volatile.Write(ref A, i); // explicit volatile store to A
        Volatile.Write(ref B, i); // explicit volatile store to B
    }
}
; Method VolatileClass:SetValueInLoop():this
; arm64 JIT output for the Volatile.Write loop: the stores are emitted as
; `stlr` (store-release) instructions and no `dmb ish` barrier appears.
G_M39137_IG01:
            stp     fp, lr, [sp,#-16]!  ; prologue: save frame pointer and link register
            mov     fp, sp
G_M39137_IG02:
            mov     w1, wzr  ; w1 = 0 (loop counter)
            add     x2, x0, #8  ; x2 = x0+8, computed once before the loop (presumably &A)
            add     x0, x0, #12  ; x0 = x0+12, computed once before the loop (presumably &B)
G_M39137_IG03:
            mov     x3, x2  ; copy the first address into x3 on every iteration
            stlr    w1, [x3]  ; store-release of the counter to [x2]
            mov     x3, x0  ; copy the second address into x3 on every iteration
            stlr    w1, [x3]  ; store-release of the counter to [x0]
            add     w1, w1, #1  ; counter++
            cmp     w1, #1000
            blt     G_M39137_IG03  ; loop while counter < 1000
G_M39137_IG04:
            ldp     fp, lr, [sp],#16  ; epilogue: restore fp/lr
            ret     lr
; Total bytes of code: 56

Benchmark (Apple M1, arm64):

|                       Method |       Mean |   Error |  StdDev |
|----------------------------- |-----------:|--------:|--------:|
| SetValueInLoop_VolatileWrite |   623.7 ns | 0.12 ns | 0.11 ns |
|      SetValueInLoop_volatile | 1,257.0 ns | 0.26 ns | 0.23 ns |

Unrelated, but just for fun - same benchmark, same M1 machine but under Rosetta (x64 emulation):

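|                       Method |       Mean |   Error |  StdDev |
|----------------------------- |-----------:|--------:|--------:|
| SetValueInLoop_VolatileWrite |   320.9 ns | 0.11 ns | 0.10 ns |
|      SetValueInLoop_volatile |   325.5 ns | 1.31 ns | 1.22 ns |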

cc @dotnet/jit-contrib

category:cq
theme:volatile

Metadata

Metadata

Assignees

Labels

arch-arm64 · area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI) · tenet-performance (Performance related issue)

Type

No type

Projects

Status

Done

Relationships

None yet

Development

No branches or pull requests

Issue actions