Skip to content

Vectorized memory access in TensorIterator GPU loop for 1d contiguous case#32383

Closed
zasdfgbnm wants to merge 65 commits into pytorch:master from
zasdfgbnm:vectorized_memory_access
Closed

Vectorized memory access in TensorIterator GPU loop for 1d contiguous case#32383
zasdfgbnm wants to merge 65 commits into pytorch:master from
zasdfgbnm:vectorized_memory_access

Conversation

@zasdfgbnm
Copy link
Collaborator

@zasdfgbnm zasdfgbnm commented Jan 18, 2020

Step 2 of #31975

Vectorized memory access is enabled. Generated code: https://github.com/zasdfgbnm/things/blob/master/2020Q1/disassembly-elementwise-vec.ipynb

void at::native::modern::elementwise_kernel<4, 64, 4, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3>)

**ASM:**

	.section	.text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,"ax",@progbits
	.sectioninfo	@"SHI_REGISTERS=20"
	.align	128
        .global         _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_
        .type           _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,@function
        .size           _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,(.L_40898 - _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_)
        .other          _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,@"STO_CUDA_ENTRY STV_DEFAULT"
_ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
.text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
        /*0020*/                   S2R R9, SR_CTAID.X ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 177
        /*0030*/                   S2R R0, SR_TID.X ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
        /*0040*/                   IMAD.SHL.U32 R9, R9, 0x100, RZ ;
        /*0050*/                   IADD3 R5, -R9, c[0x0][0x160], RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0060*/                   SHF.R.S32.HI R17, RZ, 0x1f, R9 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 296
        /*0070*/                   ISETP.GE.AND P0, PT, R5, 0x100, PT ;
        /*0080*/              @!P0 BRA `(.L_3173) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0090*/                   IMAD.SHL.U32 R12, R9.reuse, 0x4, RZ ;
        /*00a0*/                   SHF.L.U64.HI R17, R9, 0x2, R17 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 260
        /*00b0*/                   IADD3 R8, P0, R12.reuse, c[0x0][0x188], RZ ;
        /*00c0*/                   IADD3 R2, P1, R12, c[0x0][0x190], RZ ;
        /*00d0*/                   IADD3.X R9, R17.reuse, c[0x0][0x18c], RZ, P0, !PT ;
        /*00e0*/                   IADD3.X R3, R17, c[0x0][0x194], RZ, P1, !PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 218
        /*00f0*/                   IMAD.WIDE R8, R0, 0x10, R8 ;
        /*0100*/                   IMAD.WIDE R2, R0, 0x10, R2 ;
        /*0110*/                   LDG.E.128.SYS R8, [R8] ;
        /*0120*/                   LDG.E.128.SYS R4, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0130*/                   IADD3 R12, P0, R12, c[0x0][0x180], RZ ;
        /*0140*/                   IADD3.X R13, R17, c[0x0][0x184], RZ, P0, !PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
        /*0150*/                   IMAD.WIDE R12, R0, 0x10, R12 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0160*/                   FFMA R7, R7, c[0x0][0x168], R11 ;
        /*0170*/                   FFMA R6, R6, c[0x0][0x168], R10 ;
        /*0180*/                   FFMA R5, R5, c[0x0][0x168], R9 ;
        /*0190*/                   FFMA R4, R4, c[0x0][0x168], R8 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
        /*01a0*/                   STG.E.128.SYS [R12], R4 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 301
        /*01b0*/                   EXIT ;
.L_3173:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*01c0*/                   ISETP.GE.AND P0, PT, R0, R5, PT ;
        /*01d0*/                   BMOV.32.CLEAR RZ, B0 ;
        /*01e0*/                   BSSY B0, `(.L_3174) ;
        /*01f0*/               @P0 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0200*/                   IADD3 R3, P1, R9, R0, RZ ;
        /*0210*/                   LEA.HI.X.SX32 R4, R0, R17, 0x1, P1 ;
        /*0220*/                   LEA R2, P1, R3, c[0x0][0x188], 0x2 ;
        /*0230*/                   LEA.HI.X R3, R3, c[0x0][0x18c], R4, 0x2, P1 ;
        /*0240*/                   LDG.E.SYS R8, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0250*/                   IADD3 R4, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0260*/                   ISETP.GE.AND P1, PT, R4, R5, PT ;
        /*0270*/               @P1 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0280*/                   LDG.E.SYS R4, [R2+0x100] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0290*/                   IADD3 R6, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*02a0*/                   ISETP.GE.AND P1, PT, R6, R5, PT ;
        /*02b0*/               @P1 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*02c0*/                   IADD3 R10, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*02d0*/                   LDG.E.SYS R7, [R2+0x200] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*02e0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*02f0*/              @!P1 LDG.E.SYS R6, [R2+0x300] ;
.L_3175:
        /*0300*/                   BSYNC B0 ;
.L_3174:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0310*/                   BMOV.32.CLEAR RZ, B0 ;
        /*0320*/                   BSSY B0, `(.L_3176) ;
        /*0330*/               @P0 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0340*/                   IADD3 R3, P1, R9, R0, RZ ;
        /*0350*/                   LEA.HI.X.SX32 R10, R0, R17, 0x1, P1 ;
        /*0360*/                   LEA R2, P1, R3, c[0x0][0x190], 0x2 ;
        /*0370*/                   LEA.HI.X R3, R3, c[0x0][0x194], R10, 0x2, P1 ;
        /*0380*/                   LDG.E.SYS R11, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0390*/                   IADD3 R10, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*03a0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
        /*03b0*/               @P1 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*03c0*/                   LDG.E.SYS R13, [R2+0x100] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*03d0*/                   IADD3 R10, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*03e0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
        /*03f0*/               @P1 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0400*/                   IADD3 R10, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0410*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0420*/                   LDG.E.SYS R10, [R2+0x200] ;
        /*0430*/              @!P1 LDG.E.SYS R15, [R2+0x300] ;
.L_3177:
        /*0440*/                   BSYNC B0 ;
.L_3176:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0450*/               @P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0460*/                   IADD3 R9, P0, R9, R0, RZ ;
        /*0470*/                   FFMA R11, R11, c[0x0][0x168], R8 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*0480*/                   IADD3 R14, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0490*/                   LEA.HI.X.SX32 R12, R0, R17, 0x1, P0 ;
        /*04a0*/                   LEA R2, P0, R9.reuse, c[0x0][0x180], 0x2 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*04b0*/                   ISETP.GE.AND P1, PT, R14, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*04c0*/                   LEA.HI.X R3, R9, c[0x0][0x184], R12, 0x2, P0 ;
        /*04d0*/                   STG.E.SYS [R2], R11 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*04e0*/               @P1 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*04f0*/                   IADD3 R8, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0500*/                   FFMA R13, R13, c[0x0][0x168], R4 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0510*/                   ISETP.GE.AND P0, PT, R8, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0520*/                   STG.E.SYS [R2+0x100], R13 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0530*/               @P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*0540*/                   IADD3 R0, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0550*/                   FFMA R7, R10, c[0x0][0x168], R7 ;
        /*0560*/                   FFMA R15, R15, c[0x0][0x168], R6 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0570*/                   ISETP.GE.AND P0, PT, R0, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0580*/                   STG.E.SYS [R2+0x200], R7 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0590*/               @P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*05a0*/                   STG.E.SYS [R2+0x300], R15 ;
        /*05b0*/                   EXIT ;
.L_3178:
        /*05c0*/                   BRA `(.L_3178);
        /*05d0*/                   NOP;
        /*05e0*/                   NOP;
        /*05f0*/                   NOP;
.L_40898:

We can clearly see the LDG.E.128 in it, which is a result of vectorization.

Benchmark: https://github.com/zasdfgbnm/things/blob/master/2020Q1/benchmark-vec.ipynb

Benchmark on P100, dtype uint8:

before:

1.4.0a0+a5b4d78
e1d97025eeeddcf083e9bee0c8f6a53168991a71
22.2 µs ± 89.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
34.7 µs ± 38.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
52 µs ± 312 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
86.9 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
154 µs ± 204 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
291 µs ± 668 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
566 µs ± 1.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.18 ms ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.29 ms ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.4 ms ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

after:

1.4.0a0+a5b4d78
1281cdfd8188fe86241ecaf71d001809d016c3a3
24 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
30.5 µs ± 355 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.1 µs ± 300 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
67.6 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
116 µs ± 275 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
215 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
413 µs ± 791 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
824 µs ± 891 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.63 ms ± 478 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.19 ms ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Benchmark on P100, dtype half:

Before:

1.4.0a0+a5b4d78
1c017f0c14c91bd5125ab387a90441b0c0e2f3ad
30.8 µs ± 226 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.4 µs ± 164 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
69.1 µs ± 83 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
119 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
224 µs ± 99.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
418 µs ± 206 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
865 µs ± 237 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.69 ms ± 695 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.3 ms ± 527 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.77 ms ± 741 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)

After

1.4.0a0+a5b4d78
7e50ee27333e7047072d328d03767b4845286356
28.9 µs ± 61.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
40.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
63.8 µs ± 350 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
109 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
199 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
380 µs ± 446 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
743 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.47 ms ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.91 ms ± 9.17 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.8 ms ± 296 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)

cc: @csarofeen @ptrblck

@kostmo
Copy link
Member

kostmo commented Jan 18, 2020

💊 CircleCI build failures summary and remediations

As of commit 6eb6c2e:

Commit 6eb6c2e was recently pushed. Waiting for builds...


This comment was automatically generated by Dr. CI (expand for details). Follow this link to opt-out of these comments for your Pull Requests.

Please report bugs/suggestions on the GitHub issue tracker.

This comment has been revised 65 times.

@zasdfgbnm zasdfgbnm closed this Jan 21, 2020
@zasdfgbnm zasdfgbnm reopened this Jan 21, 2020
@zasdfgbnm zasdfgbnm restored the vectorized_memory_access branch January 31, 2020 18:37
@zasdfgbnm zasdfgbnm reopened this Jan 31, 2020
@zasdfgbnm
Copy link
Collaborator Author

should be working this time

@zasdfgbnm
Copy link
Collaborator Author

@iotamudelta It seems to be hard to write code that has great perf on both CUDA and ROCm, so I made Loops.cuh:

// Note:
// CUDA and ROCm get diverged in this PR:
//   https://github.com/pytorch/pytorch/pull/32383
// Because for some reason trying to enable vectorized
// memory access introduce regression on ROCm.

#ifndef __HIP_PLATFORM_HCC__
#include <ATen/native/cuda/CUDALoops.cuh>
#else
#include <ATen/native/cuda/ROCmLoops.cuh>
#endif

where the ROCmLoops.cuh is copied from the master branch unchanged. If you want them to be merged together, this could be done in the future, but I don't want this PR to be delayed any more.

Copy link
Contributor

@facebook-github-bot facebook-github-bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ngimel has imported this pull request. If you are a Facebook employee, you can view this diff on Phabricator.

@ngimel
Copy link
Collaborator

ngimel commented Feb 3, 2020

@iotamudelta please take a look at this PR. There should not be regressions because literally ROCm code paths are unchanged, but we don't have a way to test it.

Copy link
Contributor

@facebook-github-bot facebook-github-bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ngimel has imported this pull request. If you are a Facebook employee, you can view this diff on Phabricator.

Copy link
Contributor

@iotamudelta iotamudelta left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK for me. We will want to unify the divergence there somewhat soonish.

@iotamudelta
Copy link
Contributor

We are also testing on CUDA if we make changes to shared code. I am unsure why this cannot be done equally for ROCm?

@zasdfgbnm
Copy link
Collaborator Author

@iotamudelta I believe there must be a way to get it done equally in both CUDA and ROCm, but in practice, since the compilers for CUDA and ROCm do not work exactly the same (for example, in loop unrolling), writing shared code could require lots of trial and error, or detailed knowledge of these compilers (things like: does an aligned array generate vectorized instructions? under which conditions is the compiler able to unroll a loop?). I tried to install ROCm and compile PyTorch with it locally, but had no luck. And since I don't know enough about the details of ROCm's compiler, I had to diverge the code here.

@facebook-github-bot
Copy link
Contributor

@ngimel merged this pull request in 9c2ed25.

@zasdfgbnm zasdfgbnm deleted the vectorized_memory_access branch February 4, 2020 01:35
facebook-github-bot pushed a commit that referenced this pull request Feb 11, 2020
Summary:
`where` is special because the arguments do not have the same type, which does not satisfy the assumption in modern #32383. I migrate it to TensorIterator so that there is something to test that this case is not broken. Currently, this case fallback to using legacy (not vectorized, not unrolled) code. It should be supported in the future when I cleanup `Loops.cuh`.

I also move some sharing part of `CUDALoops.cuh` and `ROCmLoops.cuh` into `Loops.cuh` so that to logic for checking whether `func_t` has the same arg types could be shared.
Pull Request resolved: #32984

Differential Revision: D19825127

Pulled By: ngimel

fbshipit-source-id: bbf4682349d96b4480c4d657f3c18a3a67a9bf17
BowenBao pushed a commit to BowenBao/pytorch that referenced this pull request Feb 12, 2020
… case (pytorch#32383)

Summary:
Step 2 of pytorch#31975

Vectorized memory access is enabled. Generated code: https://github.com/zasdfgbnm/things/blob/master/2020Q1/disassembly-elementwise-vec.ipynb

```
void at::native::modern::elementwise_kernel<4, 64, 4, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()https://github.com/pytorch/pytorch/issues/1}::operator()() const::{lambda()https://github.com/pytorch/pytorch/issues/4}::operator()() const::{lambda(float, float)pytorch#1}, at::detail::Array<char*, 3> >(int, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()https://github.com/pytorch/pytorch/issues/1}::operator()() const::{lambda()https://github.com/pytorch/pytorch/issues/4}::operator()() const::{lambda(float, float)pytorch#1}, at::detail::Array<char*, 3>)

**ASM:**

	.section	.text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,"ax",progbits
	.sectioninfo	@"SHI_REGISTERS=20"
	.align	128
        .global         _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_
        .type           _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,function
        .size           _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,(.L_40898 - _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_)
        .other          _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,@"STO_CUDA_ENTRY STV_DEFAULT"
_ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
.text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
        /*0020*/                   S2R R9, SR_CTAID.X ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 177
        /*0030*/                   S2R R0, SR_TID.X ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
        /*0040*/                   IMAD.SHL.U32 R9, R9, 0x100, RZ ;
        /*0050*/                   IADD3 R5, -R9, c[0x0][0x160], RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0060*/                   SHF.R.S32.HI R17, RZ, 0x1f, R9 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 296
        /*0070*/                   ISETP.GE.AND P0, PT, R5, 0x100, PT ;
        /*0080*/              @!P0 BRA `(.L_3173) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0090*/                   IMAD.SHL.U32 R12, R9.reuse, 0x4, RZ ;
        /*00a0*/                   SHF.L.U64.HI R17, R9, 0x2, R17 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 260
        /*00b0*/                   IADD3 R8, P0, R12.reuse, c[0x0][0x188], RZ ;
        /*00c0*/                   IADD3 R2, P1, R12, c[0x0][0x190], RZ ;
        /*00d0*/                   IADD3.X R9, R17.reuse, c[0x0][0x18c], RZ, P0, !PT ;
        /*00e0*/                   IADD3.X R3, R17, c[0x0][0x194], RZ, P1, !PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 218
        /*00f0*/                   IMAD.WIDE R8, R0, 0x10, R8 ;
        /*0100*/                   IMAD.WIDE R2, R0, 0x10, R2 ;
        /*0110*/                   LDG.E.128.SYS R8, [R8] ;
        /*0120*/                   LDG.E.128.SYS R4, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0130*/                   IADD3 R12, P0, R12, c[0x0][0x180], RZ ;
        /*0140*/                   IADD3.X R13, R17, c[0x0][0x184], RZ, P0, !PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
        /*0150*/                   IMAD.WIDE R12, R0, 0x10, R12 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0160*/                   FFMA R7, R7, c[0x0][0x168], R11 ;
        /*0170*/                   FFMA R6, R6, c[0x0][0x168], R10 ;
        /*0180*/                   FFMA R5, R5, c[0x0][0x168], R9 ;
        /*0190*/                   FFMA R4, R4, c[0x0][0x168], R8 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
        /*01a0*/                   STG.E.128.SYS [R12], R4 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 301
        /*01b0*/                   EXIT ;
.L_3173:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*01c0*/                   ISETP.GE.AND P0, PT, R0, R5, PT ;
        /*01d0*/                   BMOV.32.CLEAR RZ, B0 ;
        /*01e0*/                   BSSY B0, `(.L_3174) ;
        /*01f0*/               P0 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0200*/                   IADD3 R3, P1, R9, R0, RZ ;
        /*0210*/                   LEA.HI.X.SX32 R4, R0, R17, 0x1, P1 ;
        /*0220*/                   LEA R2, P1, R3, c[0x0][0x188], 0x2 ;
        /*0230*/                   LEA.HI.X R3, R3, c[0x0][0x18c], R4, 0x2, P1 ;
        /*0240*/                   LDG.E.SYS R8, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0250*/                   IADD3 R4, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0260*/                   ISETP.GE.AND P1, PT, R4, R5, PT ;
        /*0270*/               P1 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0280*/                   LDG.E.SYS R4, [R2+0x100] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0290*/                   IADD3 R6, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*02a0*/                   ISETP.GE.AND P1, PT, R6, R5, PT ;
        /*02b0*/               P1 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*02c0*/                   IADD3 R10, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*02d0*/                   LDG.E.SYS R7, [R2+0x200] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*02e0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*02f0*/              @!P1 LDG.E.SYS R6, [R2+0x300] ;
.L_3175:
        /*0300*/                   BSYNC B0 ;
.L_3174:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0310*/                   BMOV.32.CLEAR RZ, B0 ;
        /*0320*/                   BSSY B0, `(.L_3176) ;
        /*0330*/               P0 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0340*/                   IADD3 R3, P1, R9, R0, RZ ;
        /*0350*/                   LEA.HI.X.SX32 R10, R0, R17, 0x1, P1 ;
        /*0360*/                   LEA R2, P1, R3, c[0x0][0x190], 0x2 ;
        /*0370*/                   LEA.HI.X R3, R3, c[0x0][0x194], R10, 0x2, P1 ;
        /*0380*/                   LDG.E.SYS R11, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0390*/                   IADD3 R10, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*03a0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
        /*03b0*/               P1 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*03c0*/                   LDG.E.SYS R13, [R2+0x100] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*03d0*/                   IADD3 R10, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*03e0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
        /*03f0*/               P1 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0400*/                   IADD3 R10, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0410*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0420*/                   LDG.E.SYS R10, [R2+0x200] ;
        /*0430*/              @!P1 LDG.E.SYS R15, [R2+0x300] ;
.L_3177:
        /*0440*/                   BSYNC B0 ;
.L_3176:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0450*/               P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0460*/                   IADD3 R9, P0, R9, R0, RZ ;
        /*0470*/                   FFMA R11, R11, c[0x0][0x168], R8 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*0480*/                   IADD3 R14, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0490*/                   LEA.HI.X.SX32 R12, R0, R17, 0x1, P0 ;
        /*04a0*/                   LEA R2, P0, R9.reuse, c[0x0][0x180], 0x2 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*04b0*/                   ISETP.GE.AND P1, PT, R14, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*04c0*/                   LEA.HI.X R3, R9, c[0x0][0x184], R12, 0x2, P0 ;
        /*04d0*/                   STG.E.SYS [R2], R11 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*04e0*/               P1 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*04f0*/                   IADD3 R8, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0500*/                   FFMA R13, R13, c[0x0][0x168], R4 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0510*/                   ISETP.GE.AND P0, PT, R8, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0520*/                   STG.E.SYS [R2+0x100], R13 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0530*/               P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*0540*/                   IADD3 R0, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0550*/                   FFMA R7, R10, c[0x0][0x168], R7 ;
        /*0560*/                   FFMA R15, R15, c[0x0][0x168], R6 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0570*/                   ISETP.GE.AND P0, PT, R0, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0580*/                   STG.E.SYS [R2+0x200], R7 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0590*/               P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*05a0*/                   STG.E.SYS [R2+0x300], R15 ;
        /*05b0*/                   EXIT ;
.L_3178:
        /*05c0*/                   BRA `(.L_3178);
        /*05d0*/                   NOP;
        /*05e0*/                   NOP;
        /*05f0*/                   NOP;
.L_40898:
```

We can clearly see the `LDG.E.128` in it, which is a result of vectorization.

Benchmark: https://github.com/zasdfgbnm/things/blob/master/2020Q1/benchmark-vec.ipynb

Benchmark on P100, dtype `uint8`:

before:
```
1.4.0a0+a5b4d78
e1d9702
22.2 µs ± 89.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
34.7 µs ± 38.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
52 µs ± 312 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
86.9 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
154 µs ± 204 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
291 µs ± 668 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
566 µs ± 1.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.18 ms ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.29 ms ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.4 ms ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

after:
```
1.4.0a0+a5b4d78
1281cdf
24 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
30.5 µs ± 355 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.1 µs ± 300 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
67.6 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
116 µs ± 275 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
215 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
413 µs ± 791 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
824 µs ± 891 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.63 ms ± 478 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.19 ms ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

Benchmark on P100, dtype `half`:

Before:
```
1.4.0a0+a5b4d78
1c017f0
30.8 µs ± 226 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.4 µs ± 164 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
69.1 µs ± 83 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
119 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
224 µs ± 99.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
418 µs ± 206 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
865 µs ± 237 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.69 ms ± 695 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.3 ms ± 527 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.77 ms ± 741 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

After

```
1.4.0a0+a5b4d78
7e50ee2
28.9 µs ± 61.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
40.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
63.8 µs ± 350 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
109 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
199 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
380 µs ± 446 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
743 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.47 ms ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.91 ms ± 9.17 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.8 ms ± 296 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

cc: csarofeen ptrblck
Pull Request resolved: pytorch#32383

Differential Revision: D19697455

Pulled By: ngimel

fbshipit-source-id: 0707481c2f334e6634c000b4afd275b2fee8fbe1
ttumiel pushed a commit to ttumiel/pytorch that referenced this pull request Mar 4, 2020
Summary:
Recent PR pytorch#31974 and upcoming PR pytorch#32383 are changing the behavior of the elementwise_kernel infrastructure on CUDA.

In order to stay in sync, change the nd-loop behavior to match ROCm and CUDA for now. Once the full rework is done, the ROCm settings will likely diverge again.
Pull Request resolved: pytorch#32609

Differential Revision: D19580121

Pulled By: ezyang

fbshipit-source-id: 4c8dcf6db3ac973e48ece6a665615cfe7d7cb764
ttumiel pushed a commit to ttumiel/pytorch that referenced this pull request Mar 4, 2020
… case (pytorch#32383)

Summary:
Step 2 of pytorch#31975

Vectorized memory access is enabled. Generated code: https://github.com/zasdfgbnm/things/blob/master/2020Q1/disassembly-elementwise-vec.ipynb

```
void at::native::modern::elementwise_kernel<4, 64, 4, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3>)

**ASM:**

	.section	.text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,"ax",@progbits
	.sectioninfo	@"SHI_REGISTERS=20"
	.align	128
        .global         _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_
        .type           _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,@function
        .size           _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,(.L_40898 - _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_)
        .other          _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,@"STO_CUDA_ENTRY STV_DEFAULT"
_ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
.text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
        /*0020*/                   S2R R9, SR_CTAID.X ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 177
        /*0030*/                   S2R R0, SR_TID.X ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
        /*0040*/                   IMAD.SHL.U32 R9, R9, 0x100, RZ ;
        /*0050*/                   IADD3 R5, -R9, c[0x0][0x160], RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0060*/                   SHF.R.S32.HI R17, RZ, 0x1f, R9 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 296
        /*0070*/                   ISETP.GE.AND P0, PT, R5, 0x100, PT ;
        /*0080*/              @!P0 BRA `(.L_3173) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0090*/                   IMAD.SHL.U32 R12, R9.reuse, 0x4, RZ ;
        /*00a0*/                   SHF.L.U64.HI R17, R9, 0x2, R17 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 260
        /*00b0*/                   IADD3 R8, P0, R12.reuse, c[0x0][0x188], RZ ;
        /*00c0*/                   IADD3 R2, P1, R12, c[0x0][0x190], RZ ;
        /*00d0*/                   IADD3.X R9, R17.reuse, c[0x0][0x18c], RZ, P0, !PT ;
        /*00e0*/                   IADD3.X R3, R17, c[0x0][0x194], RZ, P1, !PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 218
        /*00f0*/                   IMAD.WIDE R8, R0, 0x10, R8 ;
        /*0100*/                   IMAD.WIDE R2, R0, 0x10, R2 ;
        /*0110*/                   LDG.E.128.SYS R8, [R8] ;
        /*0120*/                   LDG.E.128.SYS R4, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0130*/                   IADD3 R12, P0, R12, c[0x0][0x180], RZ ;
        /*0140*/                   IADD3.X R13, R17, c[0x0][0x184], RZ, P0, !PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
        /*0150*/                   IMAD.WIDE R12, R0, 0x10, R12 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0160*/                   FFMA R7, R7, c[0x0][0x168], R11 ;
        /*0170*/                   FFMA R6, R6, c[0x0][0x168], R10 ;
        /*0180*/                   FFMA R5, R5, c[0x0][0x168], R9 ;
        /*0190*/                   FFMA R4, R4, c[0x0][0x168], R8 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
        /*01a0*/                   STG.E.128.SYS [R12], R4 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 301
        /*01b0*/                   EXIT ;
.L_3173:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*01c0*/                   ISETP.GE.AND P0, PT, R0, R5, PT ;
        /*01d0*/                   BMOV.32.CLEAR RZ, B0 ;
        /*01e0*/                   BSSY B0, `(.L_3174) ;
        /*01f0*/               P0 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0200*/                   IADD3 R3, P1, R9, R0, RZ ;
        /*0210*/                   LEA.HI.X.SX32 R4, R0, R17, 0x1, P1 ;
        /*0220*/                   LEA R2, P1, R3, c[0x0][0x188], 0x2 ;
        /*0230*/                   LEA.HI.X R3, R3, c[0x0][0x18c], R4, 0x2, P1 ;
        /*0240*/                   LDG.E.SYS R8, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0250*/                   IADD3 R4, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0260*/                   ISETP.GE.AND P1, PT, R4, R5, PT ;
        /*0270*/               P1 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0280*/                   LDG.E.SYS R4, [R2+0x100] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0290*/                   IADD3 R6, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*02a0*/                   ISETP.GE.AND P1, PT, R6, R5, PT ;
        /*02b0*/               P1 BRA `(.L_3175) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*02c0*/                   IADD3 R10, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*02d0*/                   LDG.E.SYS R7, [R2+0x200] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*02e0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*02f0*/              @!P1 LDG.E.SYS R6, [R2+0x300] ;
.L_3175:
        /*0300*/                   BSYNC B0 ;
.L_3174:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0310*/                   BMOV.32.CLEAR RZ, B0 ;
        /*0320*/                   BSSY B0, `(.L_3176) ;
        /*0330*/               P0 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0340*/                   IADD3 R3, P1, R9, R0, RZ ;
        /*0350*/                   LEA.HI.X.SX32 R10, R0, R17, 0x1, P1 ;
        /*0360*/                   LEA R2, P1, R3, c[0x0][0x190], 0x2 ;
        /*0370*/                   LEA.HI.X R3, R3, c[0x0][0x194], R10, 0x2, P1 ;
        /*0380*/                   LDG.E.SYS R11, [R2] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0390*/                   IADD3 R10, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*03a0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
        /*03b0*/               P1 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*03c0*/                   LDG.E.SYS R13, [R2+0x100] ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*03d0*/                   IADD3 R10, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*03e0*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
        /*03f0*/               P1 BRA `(.L_3177) ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0400*/                   IADD3 R10, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0410*/                   ISETP.GE.AND P1, PT, R10, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0420*/                   LDG.E.SYS R10, [R2+0x200] ;
        /*0430*/              @!P1 LDG.E.SYS R15, [R2+0x300] ;
.L_3177:
        /*0440*/                   BSYNC B0 ;
.L_3176:
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0450*/               P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0460*/                   IADD3 R9, P0, R9, R0, RZ ;
        /*0470*/                   FFMA R11, R11, c[0x0][0x168], R8 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*0480*/                   IADD3 R14, R0, 0x40, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0490*/                   LEA.HI.X.SX32 R12, R0, R17, 0x1, P0 ;
        /*04a0*/                   LEA R2, P0, R9.reuse, c[0x0][0x180], 0x2 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*04b0*/                   ISETP.GE.AND P1, PT, R14, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*04c0*/                   LEA.HI.X R3, R9, c[0x0][0x184], R12, 0x2, P0 ;
        /*04d0*/                   STG.E.SYS [R2], R11 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*04e0*/               P1 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*04f0*/                   IADD3 R8, R0, 0x80, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0500*/                   FFMA R13, R13, c[0x0][0x168], R4 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0510*/                   ISETP.GE.AND P0, PT, R8, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0520*/                   STG.E.SYS [R2+0x100], R13 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0530*/               P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*0540*/                   IADD3 R0, R0, 0xc0, RZ ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0550*/                   FFMA R7, R10, c[0x0][0x168], R7 ;
        /*0560*/                   FFMA R15, R15, c[0x0][0x168], R6 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0570*/                   ISETP.GE.AND P0, PT, R0, R5, PT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0580*/                   STG.E.SYS [R2+0x200], R7 ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0590*/               P0 EXIT ;
	//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*05a0*/                   STG.E.SYS [R2+0x300], R15 ;
        /*05b0*/                   EXIT ;
.L_3178:
        /*05c0*/                   BRA `(.L_3178);
        /*05d0*/                   NOP;
        /*05e0*/                   NOP;
        /*05f0*/                   NOP;
.L_40898:
```

We can clearly see the `LDG.E.128` in it, which is a result of vectorization.

Benchmark: https://github.com/zasdfgbnm/things/blob/master/2020Q1/benchmark-vec.ipynb

Benchmark on P100, dtype `uint8`:

before:
```
1.4.0a0+a5b4d78
e1d9702
22.2 µs ± 89.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
34.7 µs ± 38.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
52 µs ± 312 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
86.9 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
154 µs ± 204 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
291 µs ± 668 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
566 µs ± 1.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.18 ms ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.29 ms ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.4 ms ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

after:
```
1.4.0a0+a5b4d78
1281cdf
24 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
30.5 µs ± 355 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.1 µs ± 300 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
67.6 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
116 µs ± 275 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
215 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
413 µs ± 791 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
824 µs ± 891 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.63 ms ± 478 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.19 ms ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

Benchmark on P100, dtype `half`:

Before:
```
1.4.0a0+a5b4d78
1c017f0
30.8 µs ± 226 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.4 µs ± 164 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
69.1 µs ± 83 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
119 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
224 µs ± 99.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
418 µs ± 206 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
865 µs ± 237 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.69 ms ± 695 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.3 ms ± 527 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.77 ms ± 741 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

After

```
1.4.0a0+a5b4d78
7e50ee2
28.9 µs ± 61.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
40.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
63.8 µs ± 350 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
109 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
199 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
380 µs ± 446 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
743 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.47 ms ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.91 ms ± 9.17 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.8 ms ± 296 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

cc: csarofeen ptrblck
Pull Request resolved: pytorch#32383

Differential Revision: D19697455

Pulled By: ngimel

fbshipit-source-id: 0707481c2f334e6634c000b4afd275b2fee8fbe1
ttumiel pushed a commit to ttumiel/pytorch that referenced this pull request Mar 4, 2020
Summary:
`where` is special because the arguments do not have the same type, which does not satisfy the assumption in modern pytorch#32383. I migrate it to TensorIterator so that there is something to test that this case is not broken. Currently, this case fallback to using legacy (not vectorized, not unrolled) code. It should be supported in the future when I cleanup `Loops.cuh`.

I also move some sharing part of `CUDALoops.cuh` and `ROCmLoops.cuh` into `Loops.cuh` so that to logic for checking whether `func_t` has the same arg types could be shared.
Pull Request resolved: pytorch#32984

Differential Revision: D19825127

Pulled By: ngimel

fbshipit-source-id: bbf4682349d96b4480c4d657f3c18a3a67a9bf17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Merged open source triaged This issue has been looked at by a team member, and triaged and prioritized into an appropriate module

Projects

None yet

Development

Successfully merging this pull request may close these issues.

8 participants