Diff
checker
Text
Text
Images
Documents
Excel
Folders
Legal
Enterprise
Desktop
Pricing
Sign in
Download Diffchecker Desktop
Compare text
Find the difference between two text files
Tools
History
Real-time editor
Hide unchanged lines
Disable line wrap
Layout
Split
Unified
Diff precision
Smart
Word
Char
Syntax highlighting
Choose syntax
Ignore
Transform text
Go to first change
Edit input
Diffchecker Desktop
The most secure way to run Diffchecker. Get the Diffchecker Desktop app: your diffs never leave your computer!
Get Desktop
Bad left good right
Created
last year
Diff never expires
Clear
Export
Share
Explain
370 removals
Lines
Total
Removed
Characters
Total
Removed
To continue using this feature, upgrade to
Diff
checker
Pro
View Pricing
559 lines
Copy
322 additions
Lines
Total
Added
Characters
Total
Added
To continue using this feature, upgrade to
Diff
checker
Pro
View Pricing
508 lines
Copy
//
//
// Generated by LLVM NVPTX Back-End
// Generated by LLVM NVPTX Back-End
//
//
.version 8.4
.version 8.4
.target sm_90a
.target sm_90a
.address_size 64
.address_size 64
// .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel
// .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel
// @triton_f4_to_bf16_kernel
// @triton_f4_to_bf16_kernel
.visible .entry triton_f4_to_bf16_kernel(
.visible .entry triton_f4_to_bf16_kernel(
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1,
Copy
Copied
Copy
Copied
.param .u32 triton_f4_to_bf16_kernel_param_2
,
.param .u32 triton_f4_to_bf16_kernel_param_2
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_3
)
)
.reqntid 128, 1, 1
.reqntid 128, 1, 1
{
{
.reg .pred %p<25>;
.reg .pred %p<25>;
Copy
Copied
Copy
Copied
.reg .b16 %rs<
61
>;
.reg .b16 %rs<
65
>;
.reg .b32 %r<
132>;
.reg .b32 %r<
111
>;
.reg .f32 %f<9
>;
.reg .b64 %rd<13>;
.reg .b64 %rd<13>;
.loc 1 178 0 // custom_cast.py:178:0
.loc 1 178 0 // custom_cast.py:178:0
$L__func_begin0:
$L__func_begin0:
.loc 1 178 0 // custom_cast.py:178:0
.loc 1 178 0 // custom_cast.py:178:0
// %bb.0:
// %bb.0:
ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0];
ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0];
ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1];
ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1];
$L__tmp0:
$L__tmp0:
.loc 1 194 24 // custom_cast.py:194:24
.loc 1 194 24 // custom_cast.py:194:24
Copy
Copied
Copy
Copied
mov.u32
%r5
, %ctaid.x;
// begin inline asm
ld.param.u32 %
r6
, [triton_f4_to_bf16_kernel_param_2];
mov.u32
%r1
, %ctaid.x;
// end inline asm
ld.param.u32 %
r14
, [triton_f4_to_bf16_kernel_param_2];
.loc 1 195 37 // custom_cast.py:195:37
.loc 1 195 37 // custom_cast.py:195:37
Copy
Copied
Copy
Copied
shl.b32 %
r7
, %
r6
, 1;
shl.b32 %
r15
, %
r14
, 1;
.loc 1 198 27 // custom_cast.py:198:27
.loc 1 198 27 // custom_cast.py:198:27
Copy
Copied
Copy
Copied
shl.b32 %
r8
, %
r5
, 9;
shl.b32 %
r16
, %
r1
, 9;
.loc 1 199 47 // custom_cast.py:199:47
.loc 1 199 47 // custom_cast.py:199:47
Copy
Copied
Copy
Copied
mov.u32 %
r9
, %tid.x;
mov.u32 %
r17
, %tid.x;
shl.b32 %r1
0
, %
r9
, 2;
shl.b32 %r1
8
, %
r17
, 2;
and.b32 %r1
1
, %r1
0
, 508;
and.b32 %r1
9
, %r1
8
, 508;
.loc 1 199 34 // custom_cast.py:199:34
.loc 1 199 34 // custom_cast.py:199:34
Copy
Copied
Copy
Copied
or.b32 %r
12
, %r1
1
, %
r8
;
or.b32 %r
20
, %r1
6
, %
r19
;
or.b32 %r
13
, %r
12
, 1;
or.b32 %r
21
, %r
20
, 1;
or.b32 %
r14
, %r
12
, 2;
or.b32 %
r22
, %r
20
, 2;
or.b32 %
r15
, %r
12
, 3;
or.b32 %
r23
, %r
20
, 3;
.loc 1 201 27 // custom_cast.py:201:27
.loc 1 201 27 // custom_cast.py:201:27
Copy
Copied
Copy
Copied
setp.lt.s32 %p1, %r
12
, %
r6
;
setp.lt.s32 %p1, %r
20
, %
r14
;
setp.lt.s32 %p2, %r
13
, %
r6
;
setp.lt.s32 %p2, %r
21
, %
r14
;
setp.lt.s32 %p3, %
r14
, %
r6
;
setp.lt.s32 %p3, %
r22
, %
r14
;
setp.lt.s32 %p4, %
r15
, %
r6
;
setp.lt.s32 %p4, %
r23
, %
r14
;
.loc 1 204 31 // custom_cast.py:204:31
.loc 1 204 31 // custom_cast.py:204:31
Copy
Copied
Copy
Copied
cvt.s64.s32 %rd11, %r
12
;
cvt.s64.s32 %rd11, %r
20
;
add.s64 %rd1, %rd9, %rd11;
add.s64 %rd1, %rd9, %rd11;
add.s64 %rd2, %rd1, 1;
add.s64 %rd2, %rd1, 1;
add.s64 %rd3, %rd1, 2;
add.s64 %rd3, %rd1, 2;
add.s64 %rd4, %rd1, 3;
add.s64 %rd4, %rd1, 3;
.loc 1 204 23 // custom_cast.py:204:23
.loc 1 204 23 // custom_cast.py:204:23
// begin inline asm
// begin inline asm
mov.u16 %rs1, 0x0;
mov.u16 %rs1, 0x0;
@%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ];
@%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs2, 0x0;
mov.u16 %rs2, 0x0;
@%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ];
@%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs3, 0x0;
mov.u16 %rs3, 0x0;
@%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ];
@%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs4, 0x0;
mov.u16 %rs4, 0x0;
@%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ];
@%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ];
// end inline asm
// end inline asm
$L__tmp1:
$L__tmp1:
Copy
Copied
Copy
Copied
.loc 1 151 26 // custom_cast.py:151:26
and.b16 %rs5, %rs1, 1;
and.b16 %rs6, %rs2, 1;
and.b16 %rs7, %rs3, 1;
and.b16 %rs8, %rs4, 1;
.loc 1 123 29 // custom_cast.py:123:29
.loc 1 123 29 // custom_cast.py:123:29
Copy
Copied
Copy
Copied
shr.u16 %
rs9
, %rs1, 4;
shr.u16 %
rs13
, %rs1, 4;
and.b
16
%rs1
0
, %rs
9, 15
;
shr.u
16
%rs14, %rs2, 4;
shr.u16
%rs1
5
, %rs
3, 4;
shr.u16 %rs16, %rs4, 4
;
.loc 1 131 18 // custom_cast.py:131:18
.loc 1 131 18 // custom_cast.py:131:18
Copy
Copied
Copy
Copied
mov.b32 %r16, {
%rs1
0
, %rs1
1}
;
and.b16
%rs1
7
, %rs1
3, 8
;
mov.b32
%
r17, {
%rs1
0
, %rs1
}
;
and.b16
%
rs18,
%rs1
4, 8;
and.
b32
%
r18
, %
r17, 524296
;
and.b16 %rs19
, %rs1
5, 8
;
and.
b16
%
rs20
, %
rs16, 8
;
.loc 1 134 16 // custom_cast.py:134:16
.loc 1 134 16 // custom_cast.py:134:16
Copy
Copied
Copy
Copied
and.
b32 %r19
, %
r17, 458759
;
and.
b16 %rs21, %rs13, 7;
and.b16 %rs22, %rs1, 6;
and.b16 %rs23, %rs1, 7;
and.b16 %rs24, %rs14, 7;
and.b16 %rs25, %rs2, 6;
and.b16 %rs26, %rs2, 7;
and.b16 %rs27, %rs15, 7;
and.b16 %rs28, %rs3, 6;
and.b16 %rs29
, %
rs3, 7;
and.b16 %rs30, %rs16, 7;
and.b16 %rs31, %rs4, 6;
and.b16 %rs32, %rs4, 7
;
.loc 1 137 25 // custom_cast.py:137:25
.loc 1 137 25 // custom_cast.py:137:25
Copy
Copied
Copy
Copied
mov.b32 {%rs12, %rs13}, %r19;
setp.eq.s16 %p9, %rs
21
, 0;
setp.eq.s16 %p9, %rs
12
, 0;
setp.eq.s16 %p10, %rs
2
3, 0;
setp.eq.s16 %p10, %rs
1
3, 0;
setp.eq.s16 %p11, %rs24, 0;
setp.eq.s16 %p12, %rs26, 0;
setp.eq.s16 %p13, %rs27, 0;
setp.eq.s16 %p14, %rs29, 0;
setp.eq.s16 %p15, %rs30, 0;
setp.eq.s16 %p16, %rs32, 0;
.loc 1 143 29 // custom_cast.py:143:29
.loc 1 143 29 // custom_cast.py:143:29
Copy
Copied
Copy
Copied
setp.eq.s16 %p1
1
, %
rs13
, 1;
setp.eq.s16 %p1
7
, %
rs21, 1;
setp.eq.s16 %
p12, %rs12, 1;
setp.eq.s16 %p18, %rs23
, 1;
setp.eq.s16 %
p19, %rs24, 1;
setp.eq.s16 %p20, %rs26, 1;
setp.eq.s16 %p21, %rs27, 1;
setp.eq.s16 %p22, %rs29, 1;
setp.eq.s16 %p23, %rs30, 1;
setp.eq.s16 %p24, %rs32, 1;
.loc 1 146 29 // custom_cast.py:146:29
.loc 1 146 29 // custom_cast.py:146:29
Copy
Copied
Copy
Copied
shr.u16 %
rs14
, %
rs12
, 1;
shr.u16 %
rs33, %rs1, 5;
shr.u16 %
rs15, %rs13, 1;
and.b16 %rs34
, %
rs33, 3;
shr.u16 %rs35, %rs22
, 1;
shr.u16 %
rs36, %rs2, 5;
and.b16 %rs37, %rs36, 3;
shr.u16 %rs38, %rs25, 1;
shr.u16 %rs39, %rs3, 5;
and.b16 %rs40, %rs39, 3;
shr.u16 %rs41, %rs28, 1;
shr.u16 %rs42, %rs4, 5;
and.b16 %rs43, %rs42, 3;
shr.u16 %rs44, %rs31, 1;
.loc 1 147 56 // custom_cast.py:147:56
.loc 1 147 56 // custom_cast.py:147:56
Copy
Copied
Copy
Copied
add.s16 %
rs16
, %rs
1
4, 126;
add.s16 %
rs45
, %rs
3
4, 126;
.loc 1 151 26 // custom_cast.py:151:26
add.s16 %rs46
, %rs
35, 126
;
mov.b32 %r20, {%rs17
, %rs
15}
;
a
dd.s16
%
rs47
, %
rs37, 126
;
a
nd.b32
%
r21
, %
r16, 1;
add.s16
%
rs48
, %
rs38, 126
;
mov.b32 %r22, 8257536
;
add.s
16
%rs49
, %
rs40, 126;
add.s16
x2
%
r23
, %
r20, %r22
;
add.s
16 %
rs50
, %rs
41, 126
;
.loc 1 152 34 // custom_cast.py:152:34
add.s
16
%rs
51
, %
rs43, 126;
{ .reg .b
16
tmp; mov.b32 {tmp
, %
rs18}, %r23; }
add.s
16 %
rs52
, %
rs44, 126
;
cvt.u32.u
16 %
r24
, %rs
18
;
{ .reg .b
16
tmp; mov.b32 {
%rs
19, tmp}
, %
r21; }
cvt.u32.u
16 %
r25
, %
rs19
;
.loc 1 148 39 // custom_cast.py:148:39
.loc 1 148 39 // custom_cast.py:148:39
Copy
Copied
Copy
Copied
cvt.u32.u16 %r26, %
rs16
;
cvt.u32.u16 %r24, %rs45;
cvt.u32.u16 %r27, %
rs5
;
cvt.u32.u16 %r25, %rs46;
.loc 1 152 48 // custom_cast.py:152:48
cvt.u32.u16 %r26, %
rs47
;
shl.b32
%r28, %
r24, 23
;
cvt.u32.u16 %r27, %
rs48
;
shl.b32
%r29, %
r25, 22
;
cvt.u32.u16
%r28, %
rs49
;
cvt.u32.u16
%r29, %
rs50;
cvt.u32.u16 %r30, %rs51;
cvt.u32.u16 %r31, %rs52
;
.loc 1 148 52 // custom_cast.py:148:52
.loc 1 148 52 // custom_cast.py:148:52
Copy
Copied
Copy
Copied
shl.b32 %r3
0
, %r2
6
, 23;
shl.b32 %r3
2
, %r2
4
, 23;
shl.b32 %
r31, %r27, 22;
shl.b32 %
r33, %r25, 23;
.loc 1 156 30 // custom_cast.py:156:30
shl.b32 %r34, %r26, 23;
or.b32 %r32, %r29, %r30;
shl.b32 %r35, %r27, 23;
or.b32 %r33, %r28, %r31;
shl.b32 %r36, %r28, 23;
.loc 1 158 48 // custom_cast.py:158:48
shl.b32 %r37, %r29, 23;
selp.b32 %r34, 0, %r33, %p10;
shl.b32
%r38, %r30, 23;
selp.b32 %r35, 0, %r32, %p9;
shl.b32 %r39, %r31, 23;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r36, 1056964608, %r35, %p12;
selp.b32 %r37, 1056964608, %r34, %p11;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs20, %rs21}, %r18;
cvt.u32.u16 %r38, %rs21;
cvt.u32.u16 %r39, %rs20;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r40, %r39, 28;
shl.b32
%r41, %r38, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r42, %r37, %r41;
or.b32 %r43, %r36, %r40;
.loc 1 173 23 // custom_cast.py:173:23
mov.b32 %f1, %r43;
mov.b32 %f2, %r42;
.loc 1 174 23 // custom_cast.py:174:23
cvt.rn.bf16x2.f32 %r1, %f2, %f1;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs22, %rs2, 4;
and.b16 %rs23, %rs22, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r44, {%rs23, %rs24};
mov.b32 %r45, {%rs23, %rs2};
and.b32 %r46, %r45, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r47, %r45, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs25, %rs26}, %r47;
setp.eq.s16 %p13, %rs25, 0;
setp.eq.s16 %p14, %rs26, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p15, %rs26, 1;
setp.eq.s16 %p16, %rs25, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs27, %rs25, 1;
shr.u16 %rs28, %rs26, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs29, %rs27, 126;
.loc 1 151 26 // custom_cast.py:151:26
.loc 1 151 26 // custom_cast.py:151:26
Copy
Copied
Copy
Copied
mov.b32 %r48, {
%rs
30
, %
rs28}
;
and.b16
%rs
53
, %
rs13, 1
;
and.
b32
%
r49
, %
r44
, 1;
and.
b16
%
rs54
, %
rs14
, 1;
a
dd.s16x2
%
r50
, %
r48
, %
r22
;
a
nd.b16
%
rs55
, %
rs15, 1;
and.b16 %rs56
, %
rs16, 1
;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
Copy
Copied
Copy
Copied
{ .reg .b16 tmp; mov.b32 {tmp, %rs31}, %r50; }
cvt.u32.u16 %r40, %rs53;
cvt.u32.u16 %r51, %rs31;
$L__tmp2:
{ .reg .b16 tmp; mov.b32 {%rs32, tmp}, %r49; }
.loc 1 204 23 // custom_cast.py:204:23
cvt.u32.u16 %r52, %rs32;
cvt.
u32.u16 %r41, %rs1;
.loc 1 148 39 // custom_cast.py:148:39
$L__tmp3:
cvt.u32.u16 %r53, %rs29;
cvt.u32.u16 %r54, %rs6;
.loc 1 152 48 // custom_cast.py:152:48
shl.b32 %r55, %r51, 23;
shl.b32 %r56, %r52, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r57, %r53, 23;
shl.b32 %r58, %r54, 22;
.loc 1 156 30 // custom_cast.py:156:30
or.b32 %r59, %r56, %r57;
or.b32 %r60, %r55, %r58;
.loc 1 158 48 // custom_cast.py:158:48
selp.b32 %r61, 0, %r60, %p14;
selp.b32 %r62, 0, %r59, %p13;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r63, 1056964608, %r62, %p16;
selp.b32 %r64, 1056964608, %r61, %p15;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs33, %rs34}, %r46;
cvt.u32.u16 %r65, %rs34;
cvt.u32.u16 %r66, %rs33;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r67, %r66, 28;
shl.b32 %r68, %r65, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r69, %r64, %r68;
or.b32 %r70, %r63, %r67;
.loc 1 173 23 // custom_cast.py:173:23
mov.b32 %f3, %r70;
mov.b32 %f4, %r69;
.loc 1 174 23 // custom_cast.py:174:23
cvt.
rn.bf16x2.f32 %r2, %f4, %f3;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs35, %rs3, 4;
and.b16 %rs36, %rs35, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r71, {%rs36, %rs37};
mov.b32 %r72, {%rs36, %rs3};
and.b32 %r73, %r72, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r74, %r72, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs38, %rs39}, %r74;
setp.eq.s16 %p17, %rs38, 0;
setp.eq.s16 %p18, %rs39, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p19, %rs39, 1;
setp.eq.s16 %p20, %rs38, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs40, %rs38, 1;
shr.u16 %rs41, %rs39, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs42, %rs40, 126;
.loc 1 151 26 // custom_cast.py:151:26
mov.b32 %r75, {%rs43, %rs41};
and.b32 %r76, %r71, 1;
add.s16x2 %r77, %r75, %r22;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
Copy
Copied
Copy
Copied
{ .reg .b16 tmp; mov.b32 {tmp, %rs44}, %r77; }
and.b32 %r42, %r41, 1;
cvt.u32.u16 %r78, %rs44;
cvt.u32.u16 %r43, %rs54;
{ .reg .b16 tmp; mov.b32 {%rs45, tmp}, %r76; }
$L__tmp4:
cvt.u32.u16 %r79, %rs45;
.loc 1 204
23 // custom_cast.py:
204:23
.loc 1 148 39 // custom_cast.py:148:39
cvt.
u32.u16 %r44, %rs2;
cvt.u32.u16 %r80, %rs42;
$L__tmp5:
cvt.u32.u16 %r81, %rs7;
.loc 1 152 48 // custom_cast.py:152:48
shl.b32 %r82, %r78, 23;
shl.b32 %r83, %r79, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r84, %r80, 23;
shl.b32 %r85, %r81, 22;
.loc 1 156 30 // custom_cast.py:156:30
or.b32 %r86, %r83, %r84;
or.b32 %r87, %r82, %r85;
.loc 1 158 48 // custom_cast.py:158:48
selp.b32 %r88, 0, %r87, %p18;
selp.b32 %r89, 0, %r86, %p17;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r90, 1056964608, %r89, %p20;
selp.b32 %r91, 1056964608, %r88, %p19;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs46, %rs47}, %r73;
cvt.u32.u16 %r92, %rs47;
cvt.u32.u16 %r93, %rs46;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r94, %r93, 28;
shl.b32 %r95, %r92, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r96, %r91, %r95;
or.b32 %r97, %r90, %r94;
.loc 1 173
23 // custom_cast.py:
173:23
mov.b32 %f5, %r97;
mov.b32 %f6, %r96;
.loc 1 174 23 // custom_cast.py:174:23
cvt.
rn.bf16x2.f32 %r3, %f6, %f5;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs48, %rs4, 4;
and.b16 %rs49, %rs48, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r98, {%rs49, %rs50};
mov.b32 %r99, {%rs49, %rs4};
and.b32 %r100, %r99, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r101, %r99, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs51, %rs52}, %r101;
setp.eq.s16 %p21, %rs51, 0;
setp.eq.s16 %p22, %rs52, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p23, %rs52, 1;
setp.eq.s16 %p24, %rs51, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs53, %rs51, 1;
shr.u16 %rs54, %rs52, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs55, %rs53, 126;
.loc 1 151 26 // custom_cast.py:151:26
mov.b32 %r102, {%rs56, %rs54};
and.b32 %r103, %r98, 1;
add.s16x2 %r104, %r102, %r22;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
Copy
Copied
Copy
Copied
{ .reg .b16 tmp; mov
.b32
{tmp, %rs57}
, %
r104; }
and
.b32
%r45
, %
r44, 1;
cvt.u32.u16 %
r105
, %rs5
7;
cvt.u32.u16 %
r46
, %rs5
5;
{ .reg .b16
tmp
; mov.b32 {%rs58, tmp}, %r103; }
$L__
tmp
6:
cvt.u32.u16 %
r106
, %
rs58;
.loc 1 204 23 // custom_cast.py:204:23
.loc 1
148 39
// custom_cast.py:
148:39
cvt.u32.u16 %
r47
, %
rs3;
cvt.u32.u16 %
r107
, %rs5
5;
$L__tmp7:
cvt.u32.u16 %
r108
, %rs
8;
.loc 1
152 34
// custom_cast.py:
152:34
and.b32 %r48, %r47, 1;
cvt.u32.u16 %
r49
, %rs5
6;
$L__tmp8:
.loc 1 204 23 // custom_cast.py:204:23
cvt.u32.u16 %
r50
, %rs
4;
$L__tmp9:
.loc 1 152 34 // custom_cast.py:152:34
and.b32 %r51, %r50, 1;
.loc 1 152 48 // custom_cast.py:152:48
.loc 1 152 48 // custom_cast.py:152:48
Copy
Copied
Copy
Copied
shl.b32 %
r109
, %
r105, 23
;
shl.b32 %
r52
, %
r40, 22;
shl.b32 %
r110
, %
r106
, 22;
shl.b32 %r53, %r42, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r54, %r43, 22;
shl.b32 %
r111
, %
r107, 23
;
shl.b32 %r55, %r45, 22
;
shl.b32 %
r112
, %
r108
, 22;
shl.b32 %
r56
, %
r46
, 22;
shl.b32 %r57, %r48, 22;
shl.b32 %
r58
, %
r49, 22
;
shl.b32 %
r59
, %
r51
, 22;
.loc 1 156 30 // custom_cast.py:156:30
.loc 1 156 30 // custom_cast.py:156:30
Copy
Copied
Copy
Copied
or.b32 %
r113
, %
r110
, %
r111
;
or.b32 %
r60, %r32, %r52;
or.b32 %
r114
, %
r109
, %
r112
;
or.b32 %r61, %r33
, %
r53;
or.b32 %r62
, %
r34, %r54
;
or.b32 %
r63, %r35, %r55;
or.b32 %r64
, %
r36, %r56;
or.b32 %r65, %r37, %r57;
or.b32 %r66, %r38, %r58;
or.b32 %r67, %r39
, %
r59
;
.loc 1 158 48 // custom_cast.py:158:48
.loc 1 158 48 // custom_cast.py:158:48
Copy
Copied
Copy
Copied
selp.b32 %
r115, 0, %r114, %p2
2;
selp.b32 %
r68, 0, %r60, %p9;
selp.b32 %
r116
, 0, %
r113, %p21;
selp.b32 %r69, 0, %r61, %p10;
selp.b32 %r70, 0, %r62, %p11;
selp.b32 %r71, 0, %r63, %p1
2;
selp.b32 %
r72
, 0, %
r64, %p13;
selp.b32 %r73, 0, %r65, %p14;
selp.b32 %r74, 0, %r66, %p15;
selp.b32 %r75, 0, %r67, %p16;
.loc 1 160 63 // custom_cast.py:160:63
.loc 1 160 63 // custom_cast.py:160:63
Copy
Copied
Copy
Copied
selp.b32 %
r117
, 1056964608, %
r116
, %
p24
;
selp.b32 %
r76, 1056964608, %r68, %p17;
selp.b32 %
r118
, 1056964608, %
r115, %p23;
selp.b32 %r77
, 1056964608, %
r69
, %
p18
;
selp.b32 %
r78
, 1056964608, %
r70, %p19;
selp.b32 %r79, 1056964608, %r71, %p20;
selp.b32 %r80, 1056964608, %r72, %p21;
selp.b32 %r81, 1056964608, %r73, %p22;
selp.b32 %r82, 1056964608, %r74, %p23;
selp.b32 %r83, 1056964608, %r75, %p24;
.loc 1 163 26 // custom_cast.py:163:26
.loc 1 163 26 // custom_cast.py:163:26
Copy
Copied
Copy
Copied
mov
.b32
{%rs59
, %
rs60}
, %
r100
;
cvt.u32.u16 %r84, %rs17;
cvt.u32.u16 %
r119
, %
rs60
;
and
.b32
%r85, %r41, 8;
cvt.u32.u16 %
r120
, %
rs59
;
cvt.u32.u16 %r86
, %
rs18;
and.b32 %r87
, %
r44, 8
;
cvt.u32.u16 %
r88
, %
rs19;
and.b32 %r89, %r47, 8
;
cvt.u32.u16 %
r90
, %
rs20;
and.b32 %r91, %r50, 8
;
.loc 1 164 8 // custom_cast.py:164:8
.loc 1 164 8 // custom_cast.py:164:8
Copy
Copied
Copy
Copied
shl.b32 %
r121, %r120
, 28;
shl.b32 %
r92, %r84, 28;
shl.b32 %
r122
, %
r119
, 28;
shl.b32 %r93, %r85, 28;
shl.b32 %r94, %r86, 28;
shl.b32 %r95, %r87, 28;
shl.b32 %r96, %r88, 28;
shl.b32 %r97, %r89, 28;
shl.b32 %r98, %r90
, 28;
shl.b32 %
r99
, %
r91
, 28;
.loc 1 166 22 // custom_cast.py:166:22
.loc 1 166 22 // custom_cast.py:166:22
Copy
Copied
Copy
Copied
or.b32 %r
123
, %
r118
, %
r122
;
or.b32 %r
2
, %
r76
, %
r92
;
or.b32 %
r124, %r117, %r121;
or.b32 %
r3, %r77, %r93;
.loc 1 173 23 // custom_cast.py:173:23
or.b32 %r4, %r78, %r94;
mov.b32 %f7, %r124;
or.b32 %r5, %r79, %r95;
mov.b32 %f8, %r123;
or.b32 %r6, %r80, %r96;
or.b32 %r7, %r81, %r97;
or.b32 %r8, %r82, %r98;
or.b32 %r9, %r83, %r99;
.loc 1 174 23 // custom_cast.py:174:23
.loc 1 174 23 // custom_cast.py:174:23
Copy
Copied
Copy
Copied
cvt.rn.bf16x2.f32 %r4, %f8, %f7;
// begin inline asm
$L__tmp2:
cvt.rn.bf16.f32 %rs5, %r2;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs6, %r3;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs7, %r4;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs8, %r5;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs9, %r6;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs10, %r7;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs11, %r8;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs12, %r9;
// end inline asm
$L__tmp10:
.loc 1 220 28 // custom_cast.py:220:28
.loc 1 220 28 // custom_cast.py:220:28
Copy
Copied
Copy
Copied
shl.b32 %
r125
, %
r5
, 10;
shl.b32 %
r100
, %
r1
, 10;
.loc 1 221 49 // custom_cast.py:221:49
.loc 1 221 49 // custom_cast.py:221:49
Copy
Copied
Copy
Copied
shl.b32 %
r126
, %
r9
, 3;
shl.b32 %
r101
, %
r17
, 3;
and.b32 %r1
27
, %
r126
, 1016;
and.b32 %r1
02
, %
r101
, 1016;
.loc 1 221 36 // custom_cast.py:221:36
.loc 1 221 36 // custom_cast.py:221:36
Copy
Copied
Copy
Copied
or.b32 %
r128
, %
r127
, %r1
25
;
or.b32 %
r103
, %
r100
, %r1
02
;
or.b32 %
r129
, %
r128
, 2;
or.b32 %
r104
, %
r103
, 2;
or.b32 %r1
30
, %
r128
, 4;
or.b32 %r1
05
, %
r103
, 4;
or.b32 %
r131
, %
r128
, 6;
or.b32 %
r106
, %
r103
, 6;
.loc 1 222 29 // custom_cast.py:222:29
.loc 1 222 29 // custom_cast.py:222:29
Copy
Copied
Copy
Copied
setp.lt.s32 %p5, %
r128
, %
r7
;
setp.lt.s32 %p5, %
r103
, %
r15
;
setp.lt.s32 %p6, %
r129
, %
r7
;
setp.lt.s32 %p6, %
r104
, %
r15
;
setp.lt.s32 %p7, %r1
30
, %
r7
;
setp.lt.s32 %p7, %r1
05
, %
r15
;
setp.lt.s32 %p8, %
r131
, %
r7
;
setp.lt.s32 %p8, %
r106
, %
r15
;
.loc 1 224 26 // custom_cast.py:224:26
.loc 1 224 26 // custom_cast.py:224:26
Copy
Copied
Copy
Copied
mul.wide.s32 %rd12, %
r128
, 2;
mul.wide.s32 %rd12, %
r103
, 2;
add.s64 %rd5, %rd10, %rd12;
add.s64 %rd5, %rd10, %rd12;
add.s64 %rd6, %rd5, 4;
add.s64 %rd6, %rd5, 4;
add.s64 %rd7, %rd5, 8;
add.s64 %rd7, %rd5, 8;
add.s64 %rd8, %rd5, 12;
add.s64 %rd8, %rd5, 12;
.loc 1 224 39 // custom_cast.py:224:39
.loc 1 224 39 // custom_cast.py:224:39
Copy
Copied
Copy
Copied
mov.b32 %r107, {%rs5, %rs6};
// begin inline asm
// begin inline asm
Copy
Copied
Copy
Copied
@%p5 st.global.b32 [ %rd5 + 0 ], { %r1
};
@%p5 st.global.b32 [ %rd5 + 0 ], { %r1
07
};
// end inline asm
// end inline asm
Copy
Copied
Copy
Copied
mov.b32 %r108, {%rs7, %rs8};
// begin inline asm
// begin inline asm
Copy
Copied
Copy
Copied
@%p6 st.global.b32 [ %rd6 + 0 ], { %
r2
};
@%p6 st.global.b32 [ %rd6 + 0 ], { %
r108
};
// end inline asm
// end inline asm
Copy
Copied
Copy
Copied
mov.b32 %r109, {%rs9, %rs10};
// begin inline asm
// begin inline asm
Copy
Copied
Copy
Copied
@%p7 st.global.b32 [ %rd7 + 0 ], { %
r3
};
@%p7 st.global.b32 [ %rd7 + 0 ], { %
r109
};
// end inline asm
// end inline asm
Copy
Copied
Copy
Copied
mov.b32 %r110, {%rs11, %rs12};
// begin inline asm
// begin inline asm
Copy
Copied
Copy
Copied
@%p8 st.global.b32 [ %rd8 + 0 ], { %
r4
};
@%p8 st.global.b32 [ %rd8 + 0 ], { %
r110
};
// end inline asm
// end inline asm
.loc 1 224 4 // custom_cast.py:224:4
.loc 1 224 4 // custom_cast.py:224:4
ret;
ret;
Copy
Copied
Copy
Copied
$L__tmp
3
:
$L__tmp
11
:
$L__func_end0:
$L__func_end0:
// -- End function
// -- End function
}
}
.file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py"
.file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py"
.section .debug_abbrev
.section .debug_abbrev
{
{
.b8 1 // Abbreviation Code
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
.b8 0 // EOM(3)
}
}
.section .debug_info
.section .debug_info
{
{
.b32 161 // Length of Unit
.b32 161 // Length of Unit
.b8 2 // DWARF version number
.b8 2 // DWARF version number
.b8 0
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit
.b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 116 // DW_AT_producer
.b8 114
.b8 114
.b8 105
.b8 105
.b8 116
.b8 116
.b8 111
.b8 111
.b8 110
.b8 110
.b8 0
.b8 0
.b8 2 // DW_AT_language
.b8 2 // DW_AT_language
.b8 0
.b8 0
.b8 99 // DW_AT_name
.b8 99 // DW_AT_name
.b8 117
.b8 117
.b8 115
.b8 115
.b8 116
.b8 116
.b8 111
.b8 111
.b8 109
.b8 109
.b8 95
.b8 95
.b8 99
.b8 99
.b8 97
.b8 97
.b8 115
.b8 115
.b8 116
.b8 116
.b8 46
.b8 46
.b8 112
.b8 112
.b8 121
.b8 121
.b8 0
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 47 // DW_AT_comp_dir
.b8 104
.b8 104
.b8 111
.b8 111
.b8 109
.b8 109
.b8 101
.b8 101
.b8 47
.b8 47
.b8 100
.b8 100
.b8 114
.b8 114
.b8 105
.b8 105
.b8 115
.b8 115
.b8 115
.b8 115
.b8 112
.b8 112
.b8 103
.b8 103
.b8 47
.b8 47
.b8 109
.b8 109
.b8 101
.b8 101
.b8 116
.b8 116
.b8 97
.b8 97
.b8 47
.b8 47
.b8 97
.b8 97
.b8 111
.b8 111
.b8 47
.b8 47
.b8 116
.b8 116
.b8 111
.b8 111
.b8 114
.b8 114
.b8 99
.b8 99
.b8 104
.b8 104
.b8 97
.b8 97
.b8 111
.b8 111
.b8 47
.b8 47
.b8 112
.b8 112
.b8 114
.b8 114
.b8 111
.b8 111
.b8 116
.b8 116
.b8 111
.b8 111
.b8 116
.b8 116
.b8 121
.b8 121
.b8 112
.b8 112
.b8 101
.b8 101
.b8 47
.b8 47
.b8 109
.b8 109
.b8 120
.b8 120
.b8 95
.b8 95
.b8 102
.b8 102
.b8 111
.b8 111
.b8 114
.b8 114
.b8 109
.b8 109
.b8 97
.b8 97
.b8 116
.b8 116
.b8 115
.b8 115
.b8 0
.b8 0
.b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram
.b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram
.b8 116 // DW_AT_name
.b8 116 // DW_AT_name
.b8 114
.b8 114
.b8 105
.b8 105
.b8 116
.b8 116
.b8 111
.b8 111
.b8 110
.b8 110
.b8 95
.b8 95
.b8 102
.b8 102
.b8 52
.b8 52
.b8 95
.b8 95
.b8 116
.b8 116
.b8 111
.b8 111
.b8 95
.b8 95
.b8 98
.b8 98
.b8 102
.b8 102
.b8 49
.b8 49
.b8 54
.b8 54
.b8 95
.b8 95
.b8 107
.b8 107
.b8 101
.b8 101
.b8 114
.b8 114
.b8 110
.b8 110
.b8 101
.b8 101
.b8 108
.b8 108
.b8 0
.b8 0
.b8 1 // DW_AT_inline
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram
.b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 91 // DW_AT_abstract_origin
.b32 91 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine
.b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine
.b32 91 // DW_AT_abstract_origin
.b32 91 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp1 // DW_AT_low_pc
Copy
Copied
Copy
Copied
.b64 $L__tmp
2
// DW_AT_high_pc
.b64 $L__tmp
10
// DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 1 // DW_AT_call_file
.b8 216 // DW_AT_call_line
.b8 216 // DW_AT_call_line
.b8 8 // DW_AT_call_column
.b8 8 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
}
.section .debug_macinfo { }
.section .debug_macinfo { }
Saved diffs
Original text
Open file
// // Generated by LLVM NVPTX Back-End // .version 8.4 .target sm_90a .address_size 64 // .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel // @triton_f4_to_bf16_kernel .visible .entry triton_f4_to_bf16_kernel( .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0, .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1, .param .u32 triton_f4_to_bf16_kernel_param_2, .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_3 ) .reqntid 128, 1, 1 { .reg .pred %p<25>; .reg .b16 %rs<61>; .reg .b32 %r<132>; .reg .f32 %f<9>; .reg .b64 %rd<13>; .loc 1 178 0 // custom_cast.py:178:0 $L__func_begin0: .loc 1 178 0 // custom_cast.py:178:0 // %bb.0: ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0]; ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1]; $L__tmp0: .loc 1 194 24 // custom_cast.py:194:24 mov.u32 %r5, %ctaid.x; ld.param.u32 %r6, [triton_f4_to_bf16_kernel_param_2]; .loc 1 195 37 // custom_cast.py:195:37 shl.b32 %r7, %r6, 1; .loc 1 198 27 // custom_cast.py:198:27 shl.b32 %r8, %r5, 9; .loc 1 199 47 // custom_cast.py:199:47 mov.u32 %r9, %tid.x; shl.b32 %r10, %r9, 2; and.b32 %r11, %r10, 508; .loc 1 199 34 // custom_cast.py:199:34 or.b32 %r12, %r11, %r8; or.b32 %r13, %r12, 1; or.b32 %r14, %r12, 2; or.b32 %r15, %r12, 3; .loc 1 201 27 // custom_cast.py:201:27 setp.lt.s32 %p1, %r12, %r6; setp.lt.s32 %p2, %r13, %r6; setp.lt.s32 %p3, %r14, %r6; setp.lt.s32 %p4, %r15, %r6; .loc 1 204 31 // custom_cast.py:204:31 cvt.s64.s32 %rd11, %r12; add.s64 %rd1, %rd9, %rd11; add.s64 %rd2, %rd1, 1; add.s64 %rd3, %rd1, 2; add.s64 %rd4, %rd1, 3; .loc 1 204 23 // custom_cast.py:204:23 // begin inline asm mov.u16 %rs1, 0x0; @%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs2, 0x0; @%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs3, 0x0; @%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs4, 0x0; @%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ]; // end inline asm $L__tmp1: .loc 1 151 26 // custom_cast.py:151:26 and.b16 %rs5, %rs1, 1; and.b16 %rs6, %rs2, 1; and.b16 %rs7, %rs3, 1; and.b16 %rs8, %rs4, 1; .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs9, %rs1, 4; and.b16 %rs10, %rs9, 15; .loc 1 131 18 // custom_cast.py:131:18 mov.b32 %r16, {%rs10, %rs11}; mov.b32 %r17, {%rs10, %rs1}; and.b32 %r18, %r17, 524296; .loc 1 134 16 // custom_cast.py:134:16 and.b32 %r19, %r17, 458759; .loc 1 137 25 // custom_cast.py:137:25 mov.b32 {%rs12, %rs13}, %r19; setp.eq.s16 %p9, %rs12, 0; setp.eq.s16 %p10, %rs13, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p11, %rs13, 1; setp.eq.s16 %p12, %rs12, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs14, %rs12, 1; shr.u16 %rs15, %rs13, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs16, %rs14, 126; .loc 1 151 26 // custom_cast.py:151:26 mov.b32 %r20, {%rs17, %rs15}; and.b32 %r21, %r16, 1; mov.b32 %r22, 8257536; add.s16x2 %r23, %r20, %r22; .loc 1 152 34 // custom_cast.py:152:34 { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r23; } cvt.u32.u16 %r24, %rs18; { .reg .b16 tmp; mov.b32 {%rs19, tmp}, %r21; } cvt.u32.u16 %r25, %rs19; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r26, %rs16; cvt.u32.u16 %r27, %rs5; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r28, %r24, 23; shl.b32 %r29, %r25, 22; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r30, %r26, 23; shl.b32 %r31, %r27, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r32, %r29, %r30; or.b32 %r33, %r28, %r31; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r34, 0, %r33, %p10; selp.b32 %r35, 0, %r32, %p9; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r36, 1056964608, %r35, %p12; selp.b32 %r37, 1056964608, %r34, %p11; .loc 1 163 26 // custom_cast.py:163:26 mov.b32 {%rs20, %rs21}, %r18; cvt.u32.u16 %r38, %rs21; cvt.u32.u16 %r39, %rs20; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r40, %r39, 28; shl.b32 %r41, %r38, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r42, %r37, %r41; or.b32 %r43, %r36, %r40; .loc 1 173 23 // custom_cast.py:173:23 mov.b32 %f1, %r43; mov.b32 %f2, %r42; .loc 1 174 23 // custom_cast.py:174:23 cvt.rn.bf16x2.f32 %r1, %f2, %f1; .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs22, %rs2, 4; and.b16 %rs23, %rs22, 15; .loc 1 131 18 // custom_cast.py:131:18 mov.b32 %r44, {%rs23, %rs24}; mov.b32 %r45, {%rs23, %rs2}; and.b32 %r46, %r45, 524296; .loc 1 134 16 // custom_cast.py:134:16 and.b32 %r47, %r45, 458759; .loc 1 137 25 // custom_cast.py:137:25 mov.b32 {%rs25, %rs26}, %r47; setp.eq.s16 %p13, %rs25, 0; setp.eq.s16 %p14, %rs26, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p15, %rs26, 1; setp.eq.s16 %p16, %rs25, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs27, %rs25, 1; shr.u16 %rs28, %rs26, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs29, %rs27, 126; .loc 1 151 26 // custom_cast.py:151:26 mov.b32 %r48, {%rs30, %rs28}; and.b32 %r49, %r44, 1; add.s16x2 %r50, %r48, %r22; .loc 1 152 34 // custom_cast.py:152:34 { .reg .b16 tmp; mov.b32 {tmp, %rs31}, %r50; } cvt.u32.u16 %r51, %rs31; { .reg .b16 tmp; mov.b32 {%rs32, tmp}, %r49; } cvt.u32.u16 %r52, %rs32; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r53, %rs29; cvt.u32.u16 %r54, %rs6; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r55, %r51, 23; shl.b32 %r56, %r52, 22; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r57, %r53, 23; shl.b32 %r58, %r54, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r59, %r56, %r57; or.b32 %r60, %r55, %r58; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r61, 0, %r60, %p14; selp.b32 %r62, 0, %r59, %p13; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r63, 1056964608, %r62, %p16; selp.b32 %r64, 1056964608, %r61, %p15; .loc 1 163 26 // custom_cast.py:163:26 mov.b32 {%rs33, %rs34}, %r46; cvt.u32.u16 %r65, %rs34; cvt.u32.u16 %r66, %rs33; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r67, %r66, 28; shl.b32 %r68, %r65, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r69, %r64, %r68; or.b32 %r70, %r63, %r67; .loc 1 173 23 // custom_cast.py:173:23 mov.b32 %f3, %r70; mov.b32 %f4, %r69; .loc 1 174 23 // custom_cast.py:174:23 cvt.rn.bf16x2.f32 %r2, %f4, %f3; .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs35, %rs3, 4; and.b16 %rs36, %rs35, 15; .loc 1 131 18 // custom_cast.py:131:18 mov.b32 %r71, {%rs36, %rs37}; mov.b32 %r72, {%rs36, %rs3}; and.b32 %r73, %r72, 524296; .loc 1 134 16 // custom_cast.py:134:16 and.b32 %r74, %r72, 458759; .loc 1 137 25 // custom_cast.py:137:25 mov.b32 {%rs38, %rs39}, %r74; setp.eq.s16 %p17, %rs38, 0; setp.eq.s16 %p18, %rs39, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p19, %rs39, 1; setp.eq.s16 %p20, %rs38, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs40, %rs38, 1; shr.u16 %rs41, %rs39, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs42, %rs40, 126; .loc 1 151 26 // custom_cast.py:151:26 mov.b32 %r75, {%rs43, %rs41}; and.b32 %r76, %r71, 1; add.s16x2 %r77, %r75, %r22; .loc 1 152 34 // custom_cast.py:152:34 { .reg .b16 tmp; mov.b32 {tmp, %rs44}, %r77; } cvt.u32.u16 %r78, %rs44; { .reg .b16 tmp; mov.b32 {%rs45, tmp}, %r76; } cvt.u32.u16 %r79, %rs45; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r80, %rs42; cvt.u32.u16 %r81, %rs7; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r82, %r78, 23; shl.b32 %r83, %r79, 22; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r84, %r80, 23; shl.b32 %r85, %r81, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r86, %r83, %r84; or.b32 %r87, %r82, %r85; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r88, 0, %r87, %p18; selp.b32 %r89, 0, %r86, %p17; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r90, 1056964608, %r89, %p20; selp.b32 %r91, 1056964608, %r88, %p19; .loc 1 163 26 // custom_cast.py:163:26 mov.b32 {%rs46, %rs47}, %r73; cvt.u32.u16 %r92, %rs47; cvt.u32.u16 %r93, %rs46; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r94, %r93, 28; shl.b32 %r95, %r92, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r96, %r91, %r95; or.b32 %r97, %r90, %r94; .loc 1 173 23 // custom_cast.py:173:23 mov.b32 %f5, %r97; mov.b32 %f6, %r96; .loc 1 174 23 // custom_cast.py:174:23 cvt.rn.bf16x2.f32 %r3, %f6, %f5; .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs48, %rs4, 4; and.b16 %rs49, %rs48, 15; .loc 1 131 18 // custom_cast.py:131:18 mov.b32 %r98, {%rs49, %rs50}; mov.b32 %r99, {%rs49, %rs4}; and.b32 %r100, %r99, 524296; .loc 1 134 16 // custom_cast.py:134:16 and.b32 %r101, %r99, 458759; .loc 1 137 25 // custom_cast.py:137:25 mov.b32 {%rs51, %rs52}, %r101; setp.eq.s16 %p21, %rs51, 0; setp.eq.s16 %p22, %rs52, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p23, %rs52, 1; setp.eq.s16 %p24, %rs51, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs53, %rs51, 1; shr.u16 %rs54, %rs52, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs55, %rs53, 126; .loc 1 151 26 // custom_cast.py:151:26 mov.b32 %r102, {%rs56, %rs54}; and.b32 %r103, %r98, 1; add.s16x2 %r104, %r102, %r22; .loc 1 152 34 // custom_cast.py:152:34 { .reg .b16 tmp; mov.b32 {tmp, %rs57}, %r104; } cvt.u32.u16 %r105, %rs57; { .reg .b16 tmp; mov.b32 {%rs58, tmp}, %r103; } cvt.u32.u16 %r106, %rs58; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r107, %rs55; cvt.u32.u16 %r108, %rs8; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r109, %r105, 23; shl.b32 %r110, %r106, 22; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r111, %r107, 23; shl.b32 %r112, %r108, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r113, %r110, %r111; or.b32 %r114, %r109, %r112; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r115, 0, %r114, %p22; selp.b32 %r116, 0, %r113, %p21; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r117, 1056964608, %r116, %p24; selp.b32 %r118, 1056964608, %r115, %p23; .loc 1 163 26 // custom_cast.py:163:26 mov.b32 {%rs59, %rs60}, %r100; cvt.u32.u16 %r119, %rs60; cvt.u32.u16 %r120, %rs59; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r121, %r120, 28; shl.b32 %r122, %r119, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r123, %r118, %r122; or.b32 %r124, %r117, %r121; .loc 1 173 23 // custom_cast.py:173:23 mov.b32 %f7, %r124; mov.b32 %f8, %r123; .loc 1 174 23 // custom_cast.py:174:23 cvt.rn.bf16x2.f32 %r4, %f8, %f7; $L__tmp2: .loc 1 220 28 // custom_cast.py:220:28 shl.b32 %r125, %r5, 10; .loc 1 221 49 // custom_cast.py:221:49 shl.b32 %r126, %r9, 3; and.b32 %r127, %r126, 1016; .loc 1 221 36 // custom_cast.py:221:36 or.b32 %r128, %r127, %r125; or.b32 %r129, %r128, 2; or.b32 %r130, %r128, 4; or.b32 %r131, %r128, 6; .loc 1 222 29 // custom_cast.py:222:29 setp.lt.s32 %p5, %r128, %r7; setp.lt.s32 %p6, %r129, %r7; setp.lt.s32 %p7, %r130, %r7; setp.lt.s32 %p8, %r131, %r7; .loc 1 224 26 // custom_cast.py:224:26 mul.wide.s32 %rd12, %r128, 2; add.s64 %rd5, %rd10, %rd12; add.s64 %rd6, %rd5, 4; add.s64 %rd7, %rd5, 8; add.s64 %rd8, %rd5, 12; .loc 1 224 39 // custom_cast.py:224:39 // begin inline asm @%p5 st.global.b32 [ %rd5 + 0 ], { %r1 }; // end inline asm // begin inline asm @%p6 st.global.b32 [ %rd6 + 0 ], { %r2 }; // end inline asm // begin inline asm @%p7 st.global.b32 [ %rd7 + 0 ], { %r3 }; // end inline asm // begin inline asm @%p8 st.global.b32 [ %rd8 + 0 ], { %r4 }; // end inline asm .loc 1 224 4 // custom_cast.py:224:4 ret; $L__tmp3: $L__func_end0: // -- End function } .file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py" .section .debug_abbrev { .b8 1 // Abbreviation Code .b8 17 // DW_TAG_compile_unit .b8 1 // DW_CHILDREN_yes .b8 37 // DW_AT_producer .b8 8 // DW_FORM_string .b8 19 // DW_AT_language .b8 5 // DW_FORM_data2 .b8 3 // DW_AT_name .b8 8 // DW_FORM_string .b8 16 // DW_AT_stmt_list .b8 6 // DW_FORM_data4 .b8 27 // DW_AT_comp_dir .b8 8 // DW_FORM_string .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 2 // Abbreviation Code .b8 46 // DW_TAG_subprogram .b8 0 // DW_CHILDREN_no .b8 3 // DW_AT_name .b8 8 // DW_FORM_string .b8 32 // DW_AT_inline .b8 11 // DW_FORM_data1 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 3 // Abbreviation Code .b8 46 // DW_TAG_subprogram .b8 1 // DW_CHILDREN_yes .b8 17 // DW_AT_low_pc .b8 1 // DW_FORM_addr .b8 18 // DW_AT_high_pc .b8 1 // DW_FORM_addr .b8 49 // DW_AT_abstract_origin .b8 19 // DW_FORM_ref4 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 4 // Abbreviation Code .b8 29 // DW_TAG_inlined_subroutine .b8 0 // DW_CHILDREN_no .b8 49 // DW_AT_abstract_origin .b8 19 // DW_FORM_ref4 .b8 17 // DW_AT_low_pc .b8 1 // DW_FORM_addr .b8 18 // DW_AT_high_pc .b8 1 // DW_FORM_addr .b8 88 // DW_AT_call_file .b8 11 // DW_FORM_data1 .b8 89 // DW_AT_call_line .b8 11 // DW_FORM_data1 .b8 87 // DW_AT_call_column .b8 11 // DW_FORM_data1 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 0 // EOM(3) } .section .debug_info { .b32 161 // Length of Unit .b8 2 // DWARF version number .b8 0 .b32 .debug_abbrev // Offset Into Abbrev. Section .b8 8 // Address Size (in bytes) .b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit .b8 116 // DW_AT_producer .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0 .b8 2 // DW_AT_language .b8 0 .b8 99 // DW_AT_name .b8 117 .b8 115 .b8 116 .b8 111 .b8 109 .b8 95 .b8 99 .b8 97 .b8 115 .b8 116 .b8 46 .b8 112 .b8 121 .b8 0 .b32 .debug_line // DW_AT_stmt_list .b8 47 // DW_AT_comp_dir .b8 104 .b8 111 .b8 109 .b8 101 .b8 47 .b8 100 .b8 114 .b8 105 .b8 115 .b8 115 .b8 112 .b8 103 .b8 47 .b8 109 .b8 101 .b8 116 .b8 97 .b8 47 .b8 97 .b8 111 .b8 47 .b8 116 .b8 111 .b8 114 .b8 99 .b8 104 .b8 97 .b8 111 .b8 47 .b8 112 .b8 114 .b8 111 .b8 116 .b8 111 .b8 116 .b8 121 .b8 112 .b8 101 .b8 47 .b8 109 .b8 120 .b8 95 .b8 102 .b8 111 .b8 114 .b8 109 .b8 97 .b8 116 .b8 115 .b8 0 .b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram .b8 116 // DW_AT_name .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 102 .b8 52 .b8 95 .b8 116 .b8 111 .b8 95 .b8 98 .b8 102 .b8 49 .b8 54 .b8 95 .b8 107 .b8 101 .b8 114 .b8 110 .b8 101 .b8 108 .b8 0 .b8 1 // DW_AT_inline .b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram .b64 $L__func_begin0 // DW_AT_low_pc .b64 $L__func_end0 // DW_AT_high_pc .b32 91 // DW_AT_abstract_origin .b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine .b32 91 // DW_AT_abstract_origin .b64 $L__tmp1 // DW_AT_low_pc .b64 $L__tmp2 // DW_AT_high_pc .b8 1 // DW_AT_call_file .b8 216 // DW_AT_call_line .b8 8 // DW_AT_call_column .b8 0 // End Of Children Mark .b8 0 // End Of Children Mark } .section .debug_macinfo { }
Changed text
Open file
// // Generated by LLVM NVPTX Back-End // .version 8.4 .target sm_90a .address_size 64 // .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel // @triton_f4_to_bf16_kernel .visible .entry triton_f4_to_bf16_kernel( .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0, .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1, .param .u32 triton_f4_to_bf16_kernel_param_2 ) .reqntid 128, 1, 1 { .reg .pred %p<25>; .reg .b16 %rs<65>; .reg .b32 %r<111>; .reg .b64 %rd<13>; .loc 1 178 0 // custom_cast.py:178:0 $L__func_begin0: .loc 1 178 0 // custom_cast.py:178:0 // %bb.0: ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0]; ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1]; $L__tmp0: .loc 1 194 24 // custom_cast.py:194:24 // begin inline asm mov.u32 %r1, %ctaid.x; // end inline asm ld.param.u32 %r14, [triton_f4_to_bf16_kernel_param_2]; .loc 1 195 37 // custom_cast.py:195:37 shl.b32 %r15, %r14, 1; .loc 1 198 27 // custom_cast.py:198:27 shl.b32 %r16, %r1, 9; .loc 1 199 47 // custom_cast.py:199:47 mov.u32 %r17, %tid.x; shl.b32 %r18, %r17, 2; and.b32 %r19, %r18, 508; .loc 1 199 34 // custom_cast.py:199:34 or.b32 %r20, %r16, %r19; or.b32 %r21, %r20, 1; or.b32 %r22, %r20, 2; or.b32 %r23, %r20, 3; .loc 1 201 27 // custom_cast.py:201:27 setp.lt.s32 %p1, %r20, %r14; setp.lt.s32 %p2, %r21, %r14; setp.lt.s32 %p3, %r22, %r14; setp.lt.s32 %p4, %r23, %r14; .loc 1 204 31 // custom_cast.py:204:31 cvt.s64.s32 %rd11, %r20; add.s64 %rd1, %rd9, %rd11; add.s64 %rd2, %rd1, 1; add.s64 %rd3, %rd1, 2; add.s64 %rd4, %rd1, 3; .loc 1 204 23 // custom_cast.py:204:23 // begin inline asm mov.u16 %rs1, 0x0; @%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs2, 0x0; @%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs3, 0x0; @%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs4, 0x0; @%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ]; // end inline asm $L__tmp1: .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs13, %rs1, 4; shr.u16 %rs14, %rs2, 4; shr.u16 %rs15, %rs3, 4; shr.u16 %rs16, %rs4, 4; .loc 1 131 18 // custom_cast.py:131:18 and.b16 %rs17, %rs13, 8; and.b16 %rs18, %rs14, 8; and.b16 %rs19, %rs15, 8; and.b16 %rs20, %rs16, 8; .loc 1 134 16 // custom_cast.py:134:16 and.b16 %rs21, %rs13, 7; and.b16 %rs22, %rs1, 6; and.b16 %rs23, %rs1, 7; and.b16 %rs24, %rs14, 7; and.b16 %rs25, %rs2, 6; and.b16 %rs26, %rs2, 7; and.b16 %rs27, %rs15, 7; and.b16 %rs28, %rs3, 6; and.b16 %rs29, %rs3, 7; and.b16 %rs30, %rs16, 7; and.b16 %rs31, %rs4, 6; and.b16 %rs32, %rs4, 7; .loc 1 137 25 // custom_cast.py:137:25 setp.eq.s16 %p9, %rs21, 0; setp.eq.s16 %p10, %rs23, 0; setp.eq.s16 %p11, %rs24, 0; setp.eq.s16 %p12, %rs26, 0; setp.eq.s16 %p13, %rs27, 0; setp.eq.s16 %p14, %rs29, 0; setp.eq.s16 %p15, %rs30, 0; setp.eq.s16 %p16, %rs32, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p17, %rs21, 1; setp.eq.s16 %p18, %rs23, 1; setp.eq.s16 %p19, %rs24, 1; setp.eq.s16 %p20, %rs26, 1; setp.eq.s16 %p21, %rs27, 1; setp.eq.s16 %p22, %rs29, 1; setp.eq.s16 %p23, %rs30, 1; setp.eq.s16 %p24, %rs32, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs33, %rs1, 5; and.b16 %rs34, %rs33, 3; shr.u16 %rs35, %rs22, 1; shr.u16 %rs36, %rs2, 5; and.b16 %rs37, %rs36, 3; shr.u16 %rs38, %rs25, 1; shr.u16 %rs39, %rs3, 5; and.b16 %rs40, %rs39, 3; shr.u16 %rs41, %rs28, 1; shr.u16 %rs42, %rs4, 5; and.b16 %rs43, %rs42, 3; shr.u16 %rs44, %rs31, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs45, %rs34, 126; add.s16 %rs46, %rs35, 126; add.s16 %rs47, %rs37, 126; add.s16 %rs48, %rs38, 126; add.s16 %rs49, %rs40, 126; add.s16 %rs50, %rs41, 126; add.s16 %rs51, %rs43, 126; add.s16 %rs52, %rs44, 126; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r24, %rs45; cvt.u32.u16 %r25, %rs46; cvt.u32.u16 %r26, %rs47; cvt.u32.u16 %r27, %rs48; cvt.u32.u16 %r28, %rs49; cvt.u32.u16 %r29, %rs50; cvt.u32.u16 %r30, %rs51; cvt.u32.u16 %r31, %rs52; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r32, %r24, 23; shl.b32 %r33, %r25, 23; shl.b32 %r34, %r26, 23; shl.b32 %r35, %r27, 23; shl.b32 %r36, %r28, 23; shl.b32 %r37, %r29, 23; shl.b32 %r38, %r30, 23; shl.b32 %r39, %r31, 23; .loc 1 151 26 // custom_cast.py:151:26 and.b16 %rs53, %rs13, 1; and.b16 %rs54, %rs14, 1; and.b16 %rs55, %rs15, 1; and.b16 %rs56, %rs16, 1; .loc 1 152 34 // custom_cast.py:152:34 cvt.u32.u16 %r40, %rs53; $L__tmp2: .loc 1 204 23 // custom_cast.py:204:23 cvt.u32.u16 %r41, %rs1; $L__tmp3: .loc 1 152 34 // custom_cast.py:152:34 and.b32 %r42, %r41, 1; cvt.u32.u16 %r43, %rs54; $L__tmp4: .loc 1 204 23 // custom_cast.py:204:23 cvt.u32.u16 %r44, %rs2; $L__tmp5: .loc 1 152 34 // custom_cast.py:152:34 and.b32 %r45, %r44, 1; cvt.u32.u16 %r46, %rs55; $L__tmp6: .loc 1 204 23 // custom_cast.py:204:23 cvt.u32.u16 %r47, %rs3; $L__tmp7: .loc 1 152 34 // custom_cast.py:152:34 and.b32 %r48, %r47, 1; cvt.u32.u16 %r49, %rs56; $L__tmp8: .loc 1 204 23 // custom_cast.py:204:23 cvt.u32.u16 %r50, %rs4; $L__tmp9: .loc 1 152 34 // custom_cast.py:152:34 and.b32 %r51, %r50, 1; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r52, %r40, 22; shl.b32 %r53, %r42, 22; shl.b32 %r54, %r43, 22; shl.b32 %r55, %r45, 22; shl.b32 %r56, %r46, 22; shl.b32 %r57, %r48, 22; shl.b32 %r58, %r49, 22; shl.b32 %r59, %r51, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r60, %r32, %r52; or.b32 %r61, %r33, %r53; or.b32 %r62, %r34, %r54; or.b32 %r63, %r35, %r55; or.b32 %r64, %r36, %r56; or.b32 %r65, %r37, %r57; or.b32 %r66, %r38, %r58; or.b32 %r67, %r39, %r59; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r68, 0, %r60, %p9; selp.b32 %r69, 0, %r61, %p10; selp.b32 %r70, 0, %r62, %p11; selp.b32 %r71, 0, %r63, %p12; selp.b32 %r72, 0, %r64, %p13; selp.b32 %r73, 0, %r65, %p14; selp.b32 %r74, 0, %r66, %p15; selp.b32 %r75, 0, %r67, %p16; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r76, 1056964608, %r68, %p17; selp.b32 %r77, 1056964608, %r69, %p18; selp.b32 %r78, 1056964608, %r70, %p19; selp.b32 %r79, 1056964608, %r71, %p20; selp.b32 %r80, 1056964608, %r72, %p21; selp.b32 %r81, 1056964608, %r73, %p22; selp.b32 %r82, 1056964608, %r74, %p23; selp.b32 %r83, 1056964608, %r75, %p24; .loc 1 163 26 // custom_cast.py:163:26 cvt.u32.u16 %r84, %rs17; and.b32 %r85, %r41, 8; cvt.u32.u16 %r86, %rs18; and.b32 %r87, %r44, 8; cvt.u32.u16 %r88, %rs19; and.b32 %r89, %r47, 8; cvt.u32.u16 %r90, %rs20; and.b32 %r91, %r50, 8; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r92, %r84, 28; shl.b32 %r93, %r85, 28; shl.b32 %r94, %r86, 28; shl.b32 %r95, %r87, 28; shl.b32 %r96, %r88, 28; shl.b32 %r97, %r89, 28; shl.b32 %r98, %r90, 28; shl.b32 %r99, %r91, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r2, %r76, %r92; or.b32 %r3, %r77, %r93; or.b32 %r4, %r78, %r94; or.b32 %r5, %r79, %r95; or.b32 %r6, %r80, %r96; or.b32 %r7, %r81, %r97; or.b32 %r8, %r82, %r98; or.b32 %r9, %r83, %r99; .loc 1 174 23 // custom_cast.py:174:23 // begin inline asm cvt.rn.bf16.f32 %rs5, %r2; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs6, %r3; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs7, %r4; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs8, %r5; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs9, %r6; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs10, %r7; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs11, %r8; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs12, %r9; // end inline asm $L__tmp10: .loc 1 220 28 // custom_cast.py:220:28 shl.b32 %r100, %r1, 10; .loc 1 221 49 // custom_cast.py:221:49 shl.b32 %r101, %r17, 3; and.b32 %r102, %r101, 1016; .loc 1 221 36 // custom_cast.py:221:36 or.b32 %r103, %r100, %r102; or.b32 %r104, %r103, 2; or.b32 %r105, %r103, 4; or.b32 %r106, %r103, 6; .loc 1 222 29 // custom_cast.py:222:29 setp.lt.s32 %p5, %r103, %r15; setp.lt.s32 %p6, %r104, %r15; setp.lt.s32 %p7, %r105, %r15; setp.lt.s32 %p8, %r106, %r15; .loc 1 224 26 // custom_cast.py:224:26 mul.wide.s32 %rd12, %r103, 2; add.s64 %rd5, %rd10, %rd12; add.s64 %rd6, %rd5, 4; add.s64 %rd7, %rd5, 8; add.s64 %rd8, %rd5, 12; .loc 1 224 39 // custom_cast.py:224:39 mov.b32 %r107, {%rs5, %rs6}; // begin inline asm @%p5 st.global.b32 [ %rd5 + 0 ], { %r107 }; // end inline asm mov.b32 %r108, {%rs7, %rs8}; // begin inline asm @%p6 st.global.b32 [ %rd6 + 0 ], { %r108 }; // end inline asm mov.b32 %r109, {%rs9, %rs10}; // begin inline asm @%p7 st.global.b32 [ %rd7 + 0 ], { %r109 }; // end inline asm mov.b32 %r110, {%rs11, %rs12}; // begin inline asm @%p8 st.global.b32 [ %rd8 + 0 ], { %r110 }; // end inline asm .loc 1 224 4 // custom_cast.py:224:4 ret; $L__tmp11: $L__func_end0: // -- End function } .file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py" .section .debug_abbrev { .b8 1 // Abbreviation Code .b8 17 // DW_TAG_compile_unit .b8 1 // DW_CHILDREN_yes .b8 37 // DW_AT_producer .b8 8 // DW_FORM_string .b8 19 // DW_AT_language .b8 5 // DW_FORM_data2 .b8 3 // DW_AT_name .b8 8 // DW_FORM_string .b8 16 // DW_AT_stmt_list .b8 6 // DW_FORM_data4 .b8 27 // DW_AT_comp_dir .b8 8 // DW_FORM_string .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 2 // Abbreviation Code .b8 46 // DW_TAG_subprogram .b8 0 // DW_CHILDREN_no .b8 3 // DW_AT_name .b8 8 // DW_FORM_string .b8 32 // DW_AT_inline .b8 11 // DW_FORM_data1 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 3 // Abbreviation Code .b8 46 // DW_TAG_subprogram .b8 1 // DW_CHILDREN_yes .b8 17 // DW_AT_low_pc .b8 1 // DW_FORM_addr .b8 18 // DW_AT_high_pc .b8 1 // DW_FORM_addr .b8 49 // DW_AT_abstract_origin .b8 19 // DW_FORM_ref4 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 4 // Abbreviation Code .b8 29 // DW_TAG_inlined_subroutine .b8 0 // DW_CHILDREN_no .b8 49 // DW_AT_abstract_origin .b8 19 // DW_FORM_ref4 .b8 17 // DW_AT_low_pc .b8 1 // DW_FORM_addr .b8 18 // DW_AT_high_pc .b8 1 // DW_FORM_addr .b8 88 // DW_AT_call_file .b8 11 // DW_FORM_data1 .b8 89 // DW_AT_call_line .b8 11 // DW_FORM_data1 .b8 87 // DW_AT_call_column .b8 11 // DW_FORM_data1 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 0 // EOM(3) } .section .debug_info { .b32 161 // Length of Unit .b8 2 // DWARF version number .b8 0 .b32 .debug_abbrev // Offset Into Abbrev. Section .b8 8 // Address Size (in bytes) .b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit .b8 116 // DW_AT_producer .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0 .b8 2 // DW_AT_language .b8 0 .b8 99 // DW_AT_name .b8 117 .b8 115 .b8 116 .b8 111 .b8 109 .b8 95 .b8 99 .b8 97 .b8 115 .b8 116 .b8 46 .b8 112 .b8 121 .b8 0 .b32 .debug_line // DW_AT_stmt_list .b8 47 // DW_AT_comp_dir .b8 104 .b8 111 .b8 109 .b8 101 .b8 47 .b8 100 .b8 114 .b8 105 .b8 115 .b8 115 .b8 112 .b8 103 .b8 47 .b8 109 .b8 101 .b8 116 .b8 97 .b8 47 .b8 97 .b8 111 .b8 47 .b8 116 .b8 111 .b8 114 .b8 99 .b8 104 .b8 97 .b8 111 .b8 47 .b8 112 .b8 114 .b8 111 .b8 116 .b8 111 .b8 116 .b8 121 .b8 112 .b8 101 .b8 47 .b8 109 .b8 120 .b8 95 .b8 102 .b8 111 .b8 114 .b8 109 .b8 97 .b8 116 .b8 115 .b8 0 .b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram .b8 116 // DW_AT_name .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 102 .b8 52 .b8 95 .b8 116 .b8 111 .b8 95 .b8 98 .b8 102 .b8 49 .b8 54 .b8 95 .b8 107 .b8 101 .b8 114 .b8 110 .b8 101 .b8 108 .b8 0 .b8 1 // DW_AT_inline .b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram .b64 $L__func_begin0 // DW_AT_low_pc .b64 $L__func_end0 // DW_AT_high_pc .b32 91 // DW_AT_abstract_origin .b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine .b32 91 // DW_AT_abstract_origin .b64 $L__tmp1 // DW_AT_low_pc .b64 $L__tmp10 // DW_AT_high_pc .b8 1 // DW_AT_call_file .b8 216 // DW_AT_call_line .b8 8 // DW_AT_call_column .b8 0 // End Of Children Mark .b8 0 // End Of Children Mark } .section .debug_macinfo { }
Find difference