[AArch64] SimplifyDemandedBitsForTargetNode - add AArch64ISD::BICi handling#76644
Conversation
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64 Author: Sizov Nikita (snikitav) Changes: Fold BICi if all destination bits are already known to be zeroes define <8 x i16> @<!-- -->haddu_known(<8 x i8> %a0, <8 x i8> %a1) {
%x0 = zext <8 x i8> %a0 to <8 x i16>
%x1 = zext <8 x i8> %a1 to <8 x i16>
%hadd = call <8 x i16> @<!-- -->llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
%res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511,i16 511, i16 511, i16 511, i16 511>
ret <8 x i16> %res
}
declare <8 x i16> @<!-- -->llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)Full diff: https://github.com/llvm/llvm-project/pull/76644.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 81facf92e55ae9..c79d3d8246daaf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3390,10 +3390,15 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = KnownBits::mulhs(Known, Known2);
break;
}
- case ISD::AVGCEILU: {
+ case ISD::AVGFLOORU:
+ case ISD::AVGCEILU:
+ case ISD::AVGFLOORS:
+ case ISD::AVGCEILS: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
- Known = Known.zext(BitWidth + 1);
+ Known = (Opcode == ISD::AVGFLOORU || Opcode == ISD::AVGCEILU)
+ ? Known.zext(BitWidth + 1)
+ : Known.sext(BitWidth + 1);
Known2 = Known2.zext(BitWidth + 1);
KnownBits One = KnownBits::makeConstant(APInt(1, 1));
Known = KnownBits::computeForAddCarry(Known, Known2, One);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dffe69bdb900db..c5f906aa7473f7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23672,6 +23672,18 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
return R;
return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
+ case AArch64ISD::BICi: {
+ KnownBits Known;
+ APInt DemandedElts(32, N->getValueType(0).getVectorNumElements());
+ APInt EltSize(32, N->getValueType(0).getScalarSizeInBits());
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (SimplifyDemandedBitsForTargetNode(SDValue(N, 0), EltSize, DemandedElts,
+ Known, TLO, 0)) {
+ return TLO.New;
+ }
+ break;
+ }
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
@@ -26658,6 +26670,18 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
// used - simplify to just Val.
return TLO.CombineTo(Op, ShiftR->getOperand(0));
}
+ case AArch64ISD::BICi: {
+ // Fold BICi if all destination bits already known to be zeroed
+ SDValue Op0 = Op.getOperand(0);
+ KnownBits KnownOp0 = TLO.DAG.computeKnownBits(Op0, 0);
+ APInt Shift = Op.getConstantOperandAPInt(2);
+ APInt Op1Val = Op.getConstantOperandAPInt(1);
+ APInt BitsToClear = Op1Val.shl(Shift).zextOrTrunc(KnownOp0.getBitWidth());
+ APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
+ if (AlreadyZeroedBitsToClear == BitsToClear)
+ return TLO.CombineTo(Op, Op0);
+ return false;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
if (auto ElementSize = IsSVECntIntrinsic(Op)) {
unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-bici.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-bici.ll
new file mode 100644
index 00000000000000..55ac9c2f1b4075
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-bici.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-neon < %s | FileCheck %s
+
+declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @haddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: haddu_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %x0 = zext <8 x i8> %a0 to <8 x i16>
+ %x1 = zext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511,i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @rhaddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: rhaddu_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %x0 = zext <8 x i8> %a0 to <8 x i16>
+ %x1 = zext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @hadds_zext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: hadds_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %x0 = zext <8 x i8> %a0 to <8 x i16>
+ %x1 = zext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @shaddu_zext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: shaddu_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %x0 = zext <8 x i8> %a0 to <8 x i16>
+ %x1 = zext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+; ; negative tests
+
+define <8 x i16> @haddu_sext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: haddu_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #254, lsl #8
+; CHECK-NEXT: ret
+ %x0 = sext <8 x i8> %a0 to <8 x i16>
+ %x1 = sext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511,i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @urhadd_sext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: urhadd_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #254, lsl #8
+; CHECK-NEXT: ret
+ %x0 = sext <8 x i8> %a0 to <8 x i16>
+ %x1 = sext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511,i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @hadds_sext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: hadds_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #254, lsl #8
+; CHECK-NEXT: ret
+ %x0 = sext <8 x i8> %a0 to <8 x i16>
+ %x1 = sext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @shaddu_sext(<8 x i8> %a0, <8 x i8> %a1) {
+; CHECK-LABEL: shaddu_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: bic v0.8h, #254, lsl #8
+; CHECK-NEXT: ret
+ %x0 = sext <8 x i8> %a0 to <8 x i16>
+ %x1 = sext <8 x i8> %a1 to <8 x i16>
+ %hadd = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1)
+ %res = and <8 x i16> %hadd, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
+ ret <8 x i16> %res
+}
|
08f70b5 to
00fd477
Compare
| case AArch64ISD::BICi: { | ||
| KnownBits Known; | ||
| APInt DemandedElts(32, N->getValueType(0).getVectorNumElements()); | ||
| APInt EltSize(32, N->getValueType(0).getScalarSizeInBits()); |
There was a problem hiding this comment.
Why is this hardcoded to 32?
| APInt EltSize(32, N->getValueType(0).getScalarSizeInBits()); | ||
| TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), | ||
| !DCI.isBeforeLegalizeOps()); | ||
| if (SimplifyDemandedBitsForTargetNode(SDValue(N, 0), EltSize, DemandedElts, |
There was a problem hiding this comment.
Use TLI.SimplifyDemandedBits here
| SDValue Op0 = Op.getOperand(0); | ||
| KnownBits KnownOp0 = TLO.DAG.computeKnownBits(Op0, 0); | ||
| APInt Shift = Op.getConstantOperandAPInt(2); | ||
| APInt Op1Val = Op.getConstantOperandAPInt(1); |
There was a problem hiding this comment.
Just use getConstantOperandVal for these?
| APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero; | ||
| if (AlreadyZeroedBitsToClear == BitsToClear) | ||
| return TLO.CombineTo(Op, Op0); | ||
| return false; |
There was a problem hiding this comment.
Return the calculated KnownBits (copy the AArch64TargetLowering::computeKnownBitsForTargetNode implementation).
| case AArch64ISD::BICi: { | ||
| // Fold BICi if all destination bits already known to be zeroed | ||
| SDValue Op0 = Op.getOperand(0); | ||
| KnownBits KnownOp0 = TLO.DAG.computeKnownBits(Op0, 0); |
There was a problem hiding this comment.
Use the computeKnownBits variant with OriginalDemandedElts
Don't reset the recursion depth - it must be Depth + 1
|
@snikitav reverse-ping. Are you still looking at this? |
Yes. Sorry, been distracted. Will come up with patch this weekend. |
c220a3e to
40ce707
Compare
RKSimon
left a comment
There was a problem hiding this comment.
Ideally we need to separate the ISD::AVG knownbits handling into a followup patch - @davemgreen can you suggest any simpler tests that will create BICi nodes that the target node KnownBits handling could show?
| : Known.sext(BitWidth + 1); | ||
| Known2 = Known2.zext(BitWidth + 1); | ||
| KnownBits One = KnownBits::makeConstant(APInt(1, 1)); | ||
| Known = KnownBits::computeForAddCarry(Known, Known2, One); |
There was a problem hiding this comment.
Floor variants don't add the One:
bool IsCeil = (Opcode == ISD::AVGCEILU || Opcode == ISD::AVGCEILS);
KnownBits Carry = KnownBits::makeConstant(APInt(1, IsCeil ? 1 : 0));
Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
| declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) | ||
| declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) | ||
| declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) | ||
| declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) |
There was a problem hiding this comment.
These need pre-committing to trunk so the patch shows the codegen diff
There was a problem hiding this comment.
Should I make a separate PR for this test?
There was a problem hiding this comment.
Yeah that sounds good, if you don't have access to push them directly. Hopefully it is a simple one to get committed either way.
|
Also, please can you update the title of the patch to be more descriptive? Does this sound OK? "[AArch64] SimplifyDemandedBitsForTargetNode - add AArch64ISD::BICi handling" |
I'm not sure. Considering where they are created it will be common for simple cases to already be optimized. Maybe they could be added to llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp? I agree it would be good if there were separate prs for the BIC vs RADD parts. |
53709ee to
f1f275e
Compare
RKSimon
left a comment
There was a problem hiding this comment.
Please consult FoldValue in SelectionDAG.cpp which has the full evaluation for each of the AVG nodes
| Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | ||
| Known = Known.zext(BitWidth + 1); | ||
| Known = IsSigned ? Known.sext(BitWidth + 1) : Known.zext(BitWidth + 1); | ||
| Known2 = Known2.zext(BitWidth + 1); |
There was a problem hiding this comment.
Known2 = IsSigned ? Known2.sext(BitWidth + 1) : Known2.zext(BitWidth + 1);
There was a problem hiding this comment.
BTW, shouldn't we sext/zext(BitWidth + 2) for Ceil versions?
There was a problem hiding this comment.
Good point - we don't do this in FoldValue either
There was a problem hiding this comment.
Actually, this seems fine as we don't care about the MSB: https://alive2.llvm.org/ce/z/iQdZVc panic over
There was a problem hiding this comment.
BTW, shouldn't we sext/zext(BitWidth + 2) for Ceil versions?
It's not necessary even though the Ceil versions add 1 for carry. E.g. with 8 bits the maximum you could get would be 0xFF + 0xFF + 1 which is 0x1FF which still fits in 9 bits - you don't need 10 bits.
| uint64_t BitsToClear = Op->getConstantOperandVal(1) | ||
| << Op->getConstantOperandVal(2); | ||
| APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero; | ||
| if (AlreadyZeroedBitsToClear == BitsToClear) |
| return TLO.CombineTo(Op, ShiftR->getOperand(0)); | ||
| } | ||
| case AArch64ISD::BICi: { | ||
| // Fold BICi if all destination bits already known to be zeroed |
There was a problem hiding this comment.
Its always helpful to include in the comment what each of the immediate operands do, or name them:
uint64_t foo = Op->getConstantOperandVal(1);
uint64_t bar = Op->getConstantOperandVal(2);
uint64_t BitsToClear = foo << bar;
f1f275e to
5a1ca5a
Compare
Add neon bici test for haddu and shadd, prerequisite for #76644
5a1ca5a to
afa5a78
Compare
|
Any luck with creating a test in AArch64SelectionDAGTest.cpp for the BICi known bits? |
afa5a78 to
6ea8481
Compare
| } | ||
|
|
||
| TEST_F(AArch64SelectionDAGTest, | ||
| computeKnownBits_AVGFLOORU_AVGFLOORS_AVGCEILU_AVGCEILS) { |
There was a problem hiding this comment.
I meant add BICi test coverage here, no need for the AVG nodes - the idea being that this PR can be purely about BICi demanded/knownbits and then you can create a followup PR that deals with the AVG known bits support
There was a problem hiding this comment.
Or these tests could be used to create a PR just for the AVG nodes part - to get that part in and reviewed, and the BIC can be rebase on top.
There was a problem hiding this comment.
Either is fine for me - I just want to see the generic ISD::AVG knownbits patch separate from the AArch64 specific BICi patch.
There was a problem hiding this comment.
I'll do a separate patch then
There was a problem hiding this comment.
@RKSimon @davemgreen #86754 PTAL
BTW, now this particular PR is temporary broken, because it relies on missing ISD::AVG* computeKnownBits
| .isSubsetOf(AlreadyZeroedBitsToClear)) | ||
| return TLO.CombineTo(Op, Op0); | ||
|
|
||
| Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear)); |
There was a problem hiding this comment.
Could this be Known = KnownOp0 & KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));? There is already some code for this in AArch64TargetLowering::computeKnownBitsForTargetNode, I would guess these should match in terms of what they produce, but sometimes I mis-understand exactly what SimplifyDemandedBitsForTargetNode should be doing. I think this could be quite similar to the ISD::AND case inside SimplifyDemandedBits, just with an always-constant second argument.
| EXPECT_EQ(Known.One, APInt(32, 0xfffffff0)); | ||
| } | ||
|
|
||
| TEST_F(AArch64SelectionDAGTest, |
There was a problem hiding this comment.
Thanks for adding these, they are useful for checking the produced results more precisely. Would it be possible to add something for the BICi changes too?
Are you sure? I don't see any AArch64ISD namespace usage in AArch64SelectionDAGTest.cpp. I feel like I'm missing something. |
6ea8481 to
1e04454
Compare
1e04454 to
c155e8f
Compare
|
@snikitav reverse-ping |
c155e8f to
f02f7b5
Compare
@RKSimon Are you sure? I don't see any AArch64ISD namespace usage in AArch64SelectionDAGTest.cpp. I feel like I'm missing something. Could you give me a bit of guidance here? |
|
I don't think that will be necessary anymore unless @davemgreen disagrees? |
davemgreen
left a comment
There was a problem hiding this comment.
AArch64SelectionDAGTest.cpp
I see what you mean, with them being specified in AArch64ISelLowering.h. I think so long as this is tested and looks sensible it should be OK (the C++ tests just make it easier to write more comprehensive tests).
If you can remove the unused variable then this LGTM too.
| return R; | ||
| return performFlagSettingCombine(N, DCI, AArch64ISD::SBC); | ||
| case AArch64ISD::BICi: { | ||
| KnownBits Known; |
| case AArch64ISD::BICi: { | ||
| KnownBits Known; | ||
| APInt DemandedBits = | ||
| APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits()); |
There was a problem hiding this comment.
I would expect this not be all bits, but to only demand the bits that are not cleared by the BIC. Again this might be difficult to test though, so it might be best to leave it as-is for the moment.
f02f7b5 to
dc85ce6
Compare
Fold BICi if all destination bits are already known to be zeroes
Fixes #53881
Fixes #53622