Vectorized memory access in TensorIterator GPU loop for 1d contiguous case #32383

zasdfgbnm wants to merge 65 commits into pytorch:master from zasdfgbnm:vectorized_memory_access
Conversation
should be working this time
@iotamudelta It seems to be hard to write code that has great perf on both CUDA and ROCm, so I make CUDA and ROCm diverge here:

```
// Note:
// CUDA and ROCm get diverged in this PR:
// https://github.com/pytorch/pytorch/pull/32383
// Because for some reason trying to enable vectorized
// memory access introduce regression on ROCm.
#ifndef __HIP_PLATFORM_HCC__
#include <ATen/native/cuda/CUDALoops.cuh>
#else
#include <ATen/native/cuda/ROCmLoops.cuh>
#endif
```

where `ROCmLoops.cuh` keeps the previous (non-vectorized) code path unchanged.
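(For context, the kind of vectorized access being enabled on the CUDA side can be sketched as below. This is a minimal illustration under assumed names — `aligned_vector` and `vec4_copy` are hypothetical, not the PR's actual `MemoryAccess.cuh` code: an `alignas(16)` wrapper over four floats is what lets the compiler emit a single 128-bit `LDG.E.128` instead of four 32-bit loads.)

```cuda
// Minimal sketch, not the PR's actual implementation: the hypothetical
// aligned_vector type tells the compiler the data is 16-byte aligned,
// so one thread can move four floats with a single 128-bit load/store.
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
  scalar_t val[vec_size];
};

__global__ void vec4_copy(float* dst, const float* src, int n) {
  using vec4 = aligned_vector<float, 4>;
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Assumes n is a multiple of 4 and both pointers are 16-byte aligned.
  if (i * 4 < n) {
    vec4 v = reinterpret_cast<const vec4*>(src)[i];  // one LDG.E.128
    reinterpret_cast<vec4*>(dst)[i] = v;             // one STG.E.128
  }
}
```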
facebook-github-bot left a comment:
@ngimel has imported this pull request. If you are a Facebook employee, you can view this diff on Phabricator.
@iotamudelta please take a look at this PR. There should not be any regressions, because the ROCm code paths are literally unchanged, but we don't have a way to test it.
iotamudelta left a comment:
OK for me. We will want to unify the divergence there somewhat soonish.
We also test on CUDA when we make changes to shared code. I am unsure why this cannot be done equally for ROCm?
@iotamudelta I believe there must be a way to get it done equally on both CUDA and ROCm, but in practice, since the CUDA and ROCm compilers do not behave exactly the same (for example around loop unrolling), writing shared code would take a lot of trial and error, or detailed knowledge of these compilers (things like: does an aligned array generate vectorized instructions? under which conditions can the compiler unroll the loop?). I tried to install ROCm and compile PyTorch with it locally, with no luck. And since I don't know enough about the details of ROCm's compiler, I had to diverge the code here.
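(To make the alignment question concrete: whether a 128-bit load is even legal also depends on the runtime alignment of the data pointers, so a vectorized path needs a guard along these lines. A hypothetical sketch, not the PR's actual dispatch code:)

```cpp
#include <cstdint>

// Hypothetical helper, not the PR's actual API: pick the widest vector
// width (in elements) that a pointer's address alignment permits. The
// vectorized kernel can only be used when every operand allows width 4.
template <typename scalar_t>
int vec_width_for(const void* ptr) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  if (addr % (4 * sizeof(scalar_t)) == 0) return 4;  // e.g. 16B for float: LDG.E.128
  if (addr % (2 * sizeof(scalar_t)) == 0) return 2;  // e.g. 8B: LDG.E.64
  return 1;                                          // scalar fallback
}
```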
Summary: `where` is special because its arguments do not all have the same type, which does not satisfy the assumption of the `modern` code path added in #32383. I migrate it to TensorIterator so that there is something to test that this case is not broken. Currently, this case falls back to the legacy (not vectorized, not unrolled) code. It should be supported in the future when I clean up `Loops.cuh`. I also move some shared parts of `CUDALoops.cuh` and `ROCmLoops.cuh` into `Loops.cuh` so that the logic for checking whether `func_t` has all-same argument types can be shared.

Pull Request resolved: #32984
Differential Revision: D19825127
Pulled By: ngimel
fbshipit-source-id: bbf4682349d96b4480c4d657f3c18a3a67a9bf17
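(The precondition mentioned above — every argument of `func_t` having one and the same type — is the kind of thing a small compile-time trait can check. A hypothetical sketch, with names that are not PyTorch's actual ones:)

```cpp
#include <type_traits>

// Hypothetical sketch: true iff all argument types of a functor's
// operator() are identical. A `where`-style functor (bool, T, T) fails
// this check and would take the legacy (non-vectorized) path.
template <typename... Ts>
struct all_same : std::true_type {};

template <typename T, typename U, typename... Rest>
struct all_same<T, U, Rest...>
    : std::conditional<std::is_same<T, U>::value,
                       all_same<U, Rest...>,
                       std::false_type>::type {};

template <typename F>
struct same_arg_types : same_arg_types<decltype(&F::operator())> {};

template <typename C, typename Ret, typename... Args>
struct same_arg_types<Ret (C::*)(Args...) const> : all_same<Args...> {};

// Usage:
//   auto add = [](float a, float b) { return a + b; };
//   static_assert(same_arg_types<decltype(add)>::value, "vectorizable");
```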
Vectorized memory access in TensorIterator GPU loop for 1d contiguous case (pytorch#32383)

Summary:
Step 2 of pytorch#31975

Vectorized memory access is enabled. Generated code: https://github.com/zasdfgbnm/things/blob/master/2020Q1/disassembly-elementwise-vec.ipynb

```
void at::native::modern::elementwise_kernel<4, 64, 4, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3>)
```

**ASM:**

```
.section .text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,"ax",progbits
.sectioninfo @"SHI_REGISTERS=20"
.align 128
.global _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_
.type _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,function
.size _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,(.L_40898 - _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_)
.other _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,@"STO_CUDA_ENTRY STV_DEFAULT"
_ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
.text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
        /*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
        /*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
        /*0020*/ S2R R9, SR_CTAID.X ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 177
        /*0030*/ S2R R0, SR_TID.X ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
        /*0040*/ IMAD.SHL.U32 R9, R9, 0x100, RZ ;
        /*0050*/ IADD3 R5, -R9, c[0x0][0x160], RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0060*/ SHF.R.S32.HI R17, RZ, 0x1f, R9 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 296
        /*0070*/ ISETP.GE.AND P0, PT, R5, 0x100, PT ;
        /*0080*/ @!P0 BRA `(.L_3173) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0090*/ IMAD.SHL.U32 R12, R9.reuse, 0x4, RZ ;
        /*00a0*/ SHF.L.U64.HI R17, R9, 0x2, R17 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 260
        /*00b0*/ IADD3 R8, P0, R12.reuse, c[0x0][0x188], RZ ;
        /*00c0*/ IADD3 R2, P1, R12, c[0x0][0x190], RZ ;
        /*00d0*/ IADD3.X R9, R17.reuse, c[0x0][0x18c], RZ, P0, !PT ;
        /*00e0*/ IADD3.X R3, R17, c[0x0][0x194], RZ, P1, !PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 218
        /*00f0*/ IMAD.WIDE R8, R0, 0x10, R8 ;
        /*0100*/ IMAD.WIDE R2, R0, 0x10, R2 ;
        /*0110*/ LDG.E.128.SYS R8, [R8] ;
        /*0120*/ LDG.E.128.SYS R4, [R2] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
        /*0130*/ IADD3 R12, P0, R12, c[0x0][0x180], RZ ;
        /*0140*/ IADD3.X R13, R17, c[0x0][0x184], RZ, P0, !PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
        /*0150*/ IMAD.WIDE R12, R0, 0x10, R12 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0160*/ FFMA R7, R7, c[0x0][0x168], R11 ;
        /*0170*/ FFMA R6, R6, c[0x0][0x168], R10 ;
        /*0180*/ FFMA R5, R5, c[0x0][0x168], R9 ;
        /*0190*/ FFMA R4, R4, c[0x0][0x168], R8 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
        /*01a0*/ STG.E.128.SYS [R12], R4 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 301
        /*01b0*/ EXIT ;
.L_3173:
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*01c0*/ ISETP.GE.AND P0, PT, R0, R5, PT ;
        /*01d0*/ BMOV.32.CLEAR RZ, B0 ;
        /*01e0*/ BSSY B0, `(.L_3174) ;
        /*01f0*/ P0 BRA `(.L_3175) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0200*/ IADD3 R3, P1, R9, R0, RZ ;
        /*0210*/ LEA.HI.X.SX32 R4, R0, R17, 0x1, P1 ;
        /*0220*/ LEA R2, P1, R3, c[0x0][0x188], 0x2 ;
        /*0230*/ LEA.HI.X R3, R3, c[0x0][0x18c], R4, 0x2, P1 ;
        /*0240*/ LDG.E.SYS R8, [R2] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0250*/ IADD3 R4, R0, 0x40, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0260*/ ISETP.GE.AND P1, PT, R4, R5, PT ;
        /*0270*/ P1 BRA `(.L_3175) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0280*/ LDG.E.SYS R4, [R2+0x100] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0290*/ IADD3 R6, R0, 0x80, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*02a0*/ ISETP.GE.AND P1, PT, R6, R5, PT ;
        /*02b0*/ P1 BRA `(.L_3175) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*02c0*/ IADD3 R10, R0, 0xc0, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*02d0*/ LDG.E.SYS R7, [R2+0x200] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*02e0*/ ISETP.GE.AND P1, PT, R10, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*02f0*/ @!P1 LDG.E.SYS R6, [R2+0x300] ;
.L_3175:
        /*0300*/ BSYNC B0 ;
.L_3174:
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0310*/ BMOV.32.CLEAR RZ, B0 ;
        /*0320*/ BSSY B0, `(.L_3176) ;
        /*0330*/ P0 BRA `(.L_3177) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0340*/ IADD3 R3, P1, R9, R0, RZ ;
        /*0350*/ LEA.HI.X.SX32 R10, R0, R17, 0x1, P1 ;
        /*0360*/ LEA R2, P1, R3, c[0x0][0x190], 0x2 ;
        /*0370*/ LEA.HI.X R3, R3, c[0x0][0x194], R10, 0x2, P1 ;
        /*0380*/ LDG.E.SYS R11, [R2] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0390*/ IADD3 R10, R0, 0x40, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*03a0*/ ISETP.GE.AND P1, PT, R10, R5, PT ;
        /*03b0*/ P1 BRA `(.L_3177) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*03c0*/ LDG.E.SYS R13, [R2+0x100] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*03d0*/ IADD3 R10, R0, 0x80, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*03e0*/ ISETP.GE.AND P1, PT, R10, R5, PT ;
        /*03f0*/ P1 BRA `(.L_3177) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
        /*0400*/ IADD3 R10, R0, 0xc0, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
        /*0410*/ ISETP.GE.AND P1, PT, R10, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
        /*0420*/ LDG.E.SYS R10, [R2+0x200] ;
        /*0430*/ @!P1 LDG.E.SYS R15, [R2+0x300] ;
.L_3177:
        /*0440*/ BSYNC B0 ;
.L_3176:
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0450*/ P0 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0460*/ IADD3 R9, P0, R9, R0, RZ ;
        /*0470*/ FFMA R11, R11, c[0x0][0x168], R8 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*0480*/ IADD3 R14, R0, 0x40, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0490*/ LEA.HI.X.SX32 R12, R0, R17, 0x1, P0 ;
        /*04a0*/ LEA R2, P0, R9.reuse, c[0x0][0x180], 0x2 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*04b0*/ ISETP.GE.AND P1, PT, R14, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*04c0*/ LEA.HI.X R3, R9, c[0x0][0x184], R12, 0x2, P0 ;
        /*04d0*/ STG.E.SYS [R2], R11 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*04e0*/ P1 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*04f0*/ IADD3 R8, R0, 0x80, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0500*/ FFMA R13, R13, c[0x0][0x168], R4 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0510*/ ISETP.GE.AND P0, PT, R8, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0520*/ STG.E.SYS [R2+0x100], R13 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0530*/ P0 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
        /*0540*/ IADD3 R0, R0, 0xc0, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
        /*0550*/ FFMA R7, R10, c[0x0][0x168], R7 ;
        /*0560*/ FFMA R15, R15, c[0x0][0x168], R6 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0570*/ ISETP.GE.AND P0, PT, R0, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*0580*/ STG.E.SYS [R2+0x200], R7 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
        /*0590*/ P0 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
        /*05a0*/ STG.E.SYS [R2+0x300], R15 ;
        /*05b0*/ EXIT ;
.L_3178:
        /*05c0*/ BRA `(.L_3178);
        /*05d0*/ NOP;
        /*05e0*/ NOP;
        /*05f0*/ NOP;
.L_40898:
```

We can clearly see the `LDG.E.128` in it, which is a result of vectorization.

Benchmark: https://github.com/zasdfgbnm/things/blob/master/2020Q1/benchmark-vec.ipynb

Benchmark on P100, dtype `uint8`:

before:

```
1.4.0a0+a5b4d78 e1d9702
22.2 µs ± 89.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
34.7 µs ± 38.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
52 µs ± 312 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
86.9 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
154 µs ± 204 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
291 µs ± 668 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
566 µs ± 1.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.18 ms ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.29 ms ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.4 ms ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

after:

```
1.4.0a0+a5b4d78 1281cdf
24 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
30.5 µs ± 355 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.1 µs ± 300 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
67.6 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
116 µs ± 275 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
215 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
413 µs ± 791 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
824 µs ± 891 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.63 ms ± 478 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.19 ms ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

Benchmark on P100, dtype `half`:

Before:

```
1.4.0a0+a5b4d78 1c017f0
30.8 µs ± 226 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.4 µs ± 164 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
69.1 µs ± 83 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
119 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
224 µs ± 99.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
418 µs ± 206 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
865 µs ± 237 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.69 ms ± 695 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.3 ms ± 527 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.77 ms ± 741 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

After:

```
1.4.0a0+a5b4d78 7e50ee2
28.9 µs ± 61.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
40.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
63.8 µs ± 350 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
109 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
199 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
380 µs ± 446 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
743 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.47 ms ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.91 ms ± 9.17 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.8 ms ± 296 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

cc: csarofeen ptrblck

Pull Request resolved: pytorch#32383
Differential Revision: D19697455
Pulled By: ngimel
fbshipit-source-id: 0707481c2f334e6634c000b4afd275b2fee8fbe1
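(The shape of this disassembly maps onto two code paths: blocks that own a full 256-element tile take the branch with `LDG.E.128`/`STG.E.128` and no bounds checks, while the last partial block falls through to the bounds-checked scalar loop of `LDG.E.SYS` instructions. Roughly, as a hypothetical sketch rather than the literal `Loops.cuh` code:)

```cuda
// Hypothetical sketch of the control flow above, not the literal code.
// Each block owns a 256-element tile (64 threads x 4 elements per thread);
// launch would be <<<(n + 255) / 256, 64>>>.
template <typename scalar_t>
struct alignas(4 * sizeof(scalar_t)) vec4_t { scalar_t val[4]; };

__global__ void add_sketch(float* out, const float* a, const float* b,
                           float alpha, int n) {
  constexpr int kThreads = 64, kBlockWork = kThreads * 4;
  int base = blockIdx.x * kBlockWork;
  int remaining = n - base;
  if (remaining >= kBlockWork) {
    // Fast path: one 128-bit load per operand per thread, no bounds checks.
    using vec4 = vec4_t<float>;
    int i = base / 4 + threadIdx.x;
    vec4 va = reinterpret_cast<const vec4*>(a)[i];
    vec4 vb = reinterpret_cast<const vec4*>(b)[i];
    vec4 vo;
#pragma unroll
    for (int k = 0; k < 4; ++k) {
      vo.val[k] = va.val[k] + alpha * vb.val[k];  // the FFMA block
    }
    reinterpret_cast<vec4*>(out)[i] = vo;
  } else {
    // Tail: strided scalar loop, every access bounds-checked against `remaining`.
    for (int i = threadIdx.x; i < remaining; i += kThreads) {
      out[base + i] = a[base + i] + alpha * b[base + i];
    }
  }
}
```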
Summary: Recent PR pytorch#31974 and upcoming PR pytorch#32383 change the behavior of the elementwise_kernel infrastructure on CUDA. In order to stay in sync, change the nd-loop behavior so that ROCm and CUDA match for now. Once the full rework is done, the ROCm settings will likely diverge again.

Pull Request resolved: pytorch#32609
Differential Revision: D19580121
Pulled By: ezyang
fbshipit-source-id: 4c8dcf6db3ac973e48ece6a665615cfe7d7cb764
Step 2 of #31975

Vectorized memory access is enabled. Generated code: https://github.com/zasdfgbnm/things/blob/master/2020Q1/disassembly-elementwise-vec.ipynb

We can clearly see the `LDG.E.128` in it, which is a result of vectorization.

Benchmark: https://github.com/zasdfgbnm/things/blob/master/2020Q1/benchmark-vec.ipynb

Benchmark on P100, dtype `uint8`:

before:

```
1.4.0a0+a5b4d78 e1d9702
22.2 µs ± 89.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
34.7 µs ± 38.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
52 µs ± 312 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
86.9 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
154 µs ± 204 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
291 µs ± 668 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
566 µs ± 1.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.18 ms ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.29 ms ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.4 ms ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

after:

```
1.4.0a0+a5b4d78 1281cdf
24 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
30.5 µs ± 355 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.1 µs ± 300 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
67.6 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
116 µs ± 275 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
215 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
413 µs ± 791 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
824 µs ± 891 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.63 ms ± 478 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.19 ms ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

Benchmark on P100, dtype `half`:

Before:

```
1.4.0a0+a5b4d78 1c017f0
30.8 µs ± 226 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.4 µs ± 164 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
69.1 µs ± 83 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
119 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
224 µs ± 99.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
418 µs ± 206 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
865 µs ± 237 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.69 ms ± 695 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.3 ms ± 527 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.77 ms ± 741 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

After:

```
1.4.0a0+a5b4d78 7e50ee2
28.9 µs ± 61.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
40.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
63.8 µs ± 350 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
109 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
199 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
380 µs ± 446 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
743 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.47 ms ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.91 ms ± 9.17 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.8 ms ± 296 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

cc: @csarofeen @ptrblck
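(The notebook above drives the benchmark with `%timeit` from Python. For the record, a rough libtorch equivalent of the size sweep might look like the following sketch — the sizes and iteration counts here are assumptions, not the notebook's actual parameters:)

```cpp
#include <torch/torch.h>
#include <chrono>
#include <iostream>

int main() {
  // Sweep tensor sizes, timing an elementwise add on CUDA uint8 tensors.
  for (int64_t n = 1 << 20; n <= (1 << 29); n <<= 1) {
    auto opts = torch::dtype(torch::kUInt8).device(torch::kCUDA);
    auto a = torch::randint(0, 256, {n}, opts);
    auto b = torch::randint(0, 256, {n}, opts);
    auto c = a + b;  // warm-up launch
    torch::cuda::synchronize();

    constexpr int iters = 100;
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i) c = a + b;
    torch::cuda::synchronize();  // wait for all queued kernels before stopping the clock
    auto t1 = std::chrono::steady_clock::now();

    std::cout << n << " elements: "
              << std::chrono::duration<double, std::micro>(t1 - t0).count() / iters
              << " µs per add\n";
  }
}
```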