Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Matt Arsenault (arsenm) Changes: This fixes 2 cases when the AMDGPU ABI is fixed to pass <2 x i16> values as packed on gfx6/gfx7. The ABI does not pack values currently; this is a pre-fix for that change. Insert a bitcast if there is a single part with a different size. Previously this would miscompile by going through the scalarization and extend path, dropping the high element. Also fix assertions in odd cases, like <3 x i16> -> i32. This needs to unmerge with excess elements from the widened source vector. All of this code is in need of a cleanup; this should look more like the DAG version using getVectorTypeBreakdown. Full diff: https://github.com/llvm/llvm-project/pull/175780.diff 1 Files Affected:
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index e2ed45eec0ecd..0da360d8038b6 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -568,6 +568,13 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
const TypeSize PartSize = PartTy.getSizeInBits();
+ if (PartSize == SrcTy.getSizeInBits() && DstRegs.size() == 1) {
+ // TODO: Handle int<->ptr casts. It just happens the ABI lowering
+ // assignments are not pointer aware.
+ B.buildBitcast(DstRegs[0], SrcReg);
+ return;
+ }
+
if (PartTy.isVector() == SrcTy.isVector() &&
PartTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits()) {
assert(DstRegs.size() == 1);
@@ -576,9 +583,11 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
}
if (SrcTy.isVector() && !PartTy.isVector() &&
- TypeSize::isKnownGT(PartSize, SrcTy.getElementType().getSizeInBits())) {
+ TypeSize::isKnownGT(PartSize, SrcTy.getElementType().getSizeInBits()) &&
+ SrcTy.getElementCount() == ElementCount::getFixed(DstRegs.size())) {
// Vector was scalarized, and the elements extended.
auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
+
for (int i = 0, e = DstRegs.size(); i != e; ++i)
B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
return;
@@ -645,9 +654,22 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
}
}
- if (LCMTy.isVector() && CoveringSize != SrcSize)
+ if (LCMTy.isVector() && CoveringSize != SrcSize) {
UnmergeSrc = B.buildPadVectorWithUndefElements(LCMTy, SrcReg).getReg(0);
+ unsigned ExcessBits = CoveringSize - DstSize * DstRegs.size();
+ if (ExcessBits != 0) {
+ SmallVector<Register, 8> PaddedDstRegs(DstRegs.begin(), DstRegs.end());
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+ for (unsigned I = 0; I != ExcessBits; I += PartSize)
+ PaddedDstRegs.push_back(MRI.createGenericVirtualRegister(PartTy));
+
+ B.buildUnmerge(PaddedDstRegs, UnmergeSrc);
+ return;
+ }
+ }
+
B.buildUnmerge(DstRegs, UnmergeSrc);
}
|
186d520 to
aa92839
Compare
aa92839 to
29cc6a8
Compare
|
Do we have a test for this or will it be exercised later? |
Later, this combination isn't used until #175781 |
|
ping |
| if (LCMTy.isVector() && CoveringSize != SrcSize) { | ||
| UnmergeSrc = B.buildPadVectorWithUndefElements(LCMTy, SrcReg).getReg(0); | ||
|
|
||
| unsigned ExcessBits = CoveringSize - DstSize * DstRegs.size(); | ||
| if (ExcessBits != 0) { | ||
| SmallVector<Register, 8> PaddedDstRegs(DstRegs.begin(), DstRegs.end()); | ||
|
|
||
| MachineRegisterInfo &MRI = *B.getMRI(); | ||
| for (unsigned I = 0; I != ExcessBits; I += PartSize) | ||
| PaddedDstRegs.push_back(MRI.createGenericVirtualRegister(PartTy)); | ||
|
|
||
| B.buildUnmerge(PaddedDstRegs, UnmergeSrc); | ||
| return; | ||
| } | ||
| } |
There was a problem hiding this comment.
%23:_(s16), %24:_(s16), %25:_(s16) = G_UNMERGE_VALUES %20(<3 x s16>)
%26:_(s16) = G_IMPLICIT_DEF
%27:_(<6 x s16>) = G_BUILD_VECTOR %23(s16), %24(s16), %25(s16), %26(s16), %26(s16), %26(s16)
%21:_(s32), %22:_(s32), %28:_(s32) = G_UNMERGE_VALUES %27(<6 x s16>)
$vgpr0 = COPY %21(s32)
$vgpr1 = COPY %22(s32)
Not sure how this part is meant to work but padding DstRegs looks wrong
it is asked to return
<3 x s16>
in 2 s32 (DstRegs)
LCMTy is making a larger type than needed, it seems,
and we return in more registers than the target asked for
SmallVector<Register, 8> PaddedDstRegs(DstRegs.begin(), DstRegs.end());
This might be getting into target specific territory
Instead of LCMTy
need to calculate how many elements to pad SrcTy with to have its size equal to (DstRegs.size() * PartTy.getSizeInBits())
There was a problem hiding this comment.
For example when calculating LCMTy like this there is no need to pad DstRegs
LLT LCMTy = getCoverTy(SrcTy, PartTy);
if (SrcTy.isVector() && DstRegs.size() > 1) {
unsigned CoverSize = DstRegs.size() * DstTy.getSizeInBits();
LLT EltTy = SrcTy.getElementType();
unsigned EltSize = EltTy.getSizeInBits();
if (CoverSize % EltSize == 0)
LCMTy = LLT::fixed_vector(CoverSize / EltSize, EltTy);
}
There was a problem hiding this comment.
This returns exactly as many registers as the target asked, the padding is just to conform to the requirements of the unmerge.
e.g. for this:
define <3 x i16> @ret_v3i16() {
ret <3 x i16> poison
}
We get:
%8:_(<3 x s16>) = G_IMPLICIT_DEF
%11:_(s16), %12:_(s16), %13:_(s16) = G_UNMERGE_VALUES %8(<3 x s16>)
%14:_(s16) = G_IMPLICIT_DEF
%15:_(<6 x s16>) = G_BUILD_VECTOR %11(s16), %12(s16), %13(s16), %14(s16), %14(s16), %14(s16)
%9:_(s32), %10:_(s32), %16:_(s32) = G_UNMERGE_VALUES %15(<6 x s16>)
$vgpr0 = COPY %9(s32)
$vgpr1 = COPY %10(s32)
SI_RETURN implicit $vgpr0, implicit $vgpr1
The padding is the unused %16
There was a problem hiding this comment.
What are the requirements of the unmerge?
If LCMTy is calculated in different way (see comment above) there is no need for if (ExcessBits != 0) ...
There was a problem hiding this comment.
The result pieces need to cover the input part piece, which is the widened LCMTy, so you need the extra unused result register
There was a problem hiding this comment.
If LCMTy is calculated in a better way (get V4S16 instead of V6S16), can get:
%8:_(<3 x s16>) = G_IMPLICIT_DEF
%11:_(s16), %12:_(s16), %13:_(s16) = G_UNMERGE_VALUES %8(<3 x s16>)
%14:_(s16) = G_IMPLICIT_DEF
%15:_(<4 x s16>) = G_BUILD_VECTOR %11(s16), %12(s16), %13(s16), %14(s16)
%9:_(s32), %10:_(s32) = G_UNMERGE_VALUES %15(<4 x s16>)
$vgpr0 = COPY %9(s32)
$vgpr1 = COPY %10(s32)
SI_RETURN implicit $vgpr0, implicit $vgpr1
which does not require padding of DstRegs, can just unmerge
There was a problem hiding this comment.
OK, I have your suggestion working. It happens to give worse codegen in 2 tests, but that's a downstream issue
There was a problem hiding this comment.
Will look into regressions, it is something in legalizer/artifact combiner
This fixes 2 cases when the AMDGPU ABI is fixed to pass <2 x i16> values as packed on gfx6/gfx7. The ABI does not pack values currently; this is a pre-fix for that change. Insert a bitcast if there is a single part with a different size. Previously this would miscompile by going through the scalarization and extend path, dropping the high element. Also fix assertions in odd cases, like <3 x i16> -> i32. This needs to unmerge with excess elements from the widened source vector. All of this code is in need of a cleanup; this should look more like the DAG version using getVectorTypeBreakdown.
29cc6a8 to
85d881b
Compare
…75780) This fixes 2 cases when the AMDGPU ABI is fixed to pass <2 x i16> values as packed on gfx6/gfx7. The ABI does not pack values currently; this is a pre-fix for that change. Insert a bitcast if there is a single part with a different size. Previously this would miscompile by going through the scalarization and extend path, dropping the high element. Also fix assertions in odd cases, like <3 x i16> -> i32. This needs to unmerge with excess elements from the widened source vector. All of this code is in need of a cleanup; this should look more like the DAG version using getVectorTypeBreakdown.
…75780) This fixes 2 cases when the AMDGPU ABI is fixed to pass <2 x i16> values as packed on gfx6/gfx7. The ABI does not pack values currently; this is a pre-fix for that change. Insert a bitcast if there is a single part with a different size. Previously this would miscompile by going through the scalarization and extend path, dropping the high element. Also fix assertions in odd cases, like <3 x i16> -> i32. This needs to unmerge with excess elements from the widened source vector. All of this code is in need of a cleanup; this should look more like the DAG version using getVectorTypeBreakdown.

This fixes 2 cases when the AMDGPU ABI is fixed to pass <2 x i16>
values as packed on gfx6/gfx7. The ABI does not pack values
currently; this is a pre-fix for that change.
Insert a bitcast if there is a single part with a different size.
Previously this would miscompile by going through the scalarization
and extend path, dropping the high element.
Also fix assertions in odd cases, like <3 x i16> -> i32. This needs
to unmerge with excess elements from the widened source vector.
All of this code is in need of a cleanup; this should look more
like the DAG version using getVectorTypeBreakdown.