[SDAG] Implement missing legalization for ISD::VECTOR_FIND_LAST_ACTIVE#180290
[SDAG] Implement missing legalization for ISD::VECTOR_FIND_LAST_ACTIVE#180290
ISD::VECTOR_FIND_LAST_ACTIVE#180290Conversation
|
@llvm/pr-subscribers-backend-webassembly @llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue) Changes: This lowers the splitting as: `any_active(hi_mask) ? (find_last_active(hi_mask) + lo_mask.getVectorElementCount()) : find_last_active(lo_mask)`. The lowering likely can be improved. This patch is for completeness. Should fix: #178862 (comment) Full diff: https://github.com/llvm/llvm-project/pull/180290.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a5efffa2b441c..6646802117f9a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1040,6 +1040,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue SplitVecOp_VP_CttzElements(SDNode *N);
SDValue SplitVecOp_VECTOR_HISTOGRAM(SDNode *N);
SDValue SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N);
+ SDValue SplitVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
//===--------------------------------------------------------------------===//
// Vector Widening Support: LegalizeVectorTypes.cpp
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index e1e6252fd8abc..5bc5115ef139f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3572,6 +3572,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::INSERT_SUBVECTOR: Res = SplitVecOp_INSERT_SUBVECTOR(N, OpNo); break;
case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
+ Res = SplitVecOp_VECTOR_FIND_LAST_ACTIVE(N);
+ break;
case ISD::VP_TRUNCATE:
case ISD::TRUNCATE:
Res = SplitVecOp_TruncateHelper(N);
@@ -3732,6 +3735,39 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
return false;
}
+SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N) {
+ SDLoc DL(N);
+ // Legalize N by splitting its mask (operand 0) into low and high halves.
+ SDValue LoMask, HiMask;
+ GetSplitVector(N->getOperand(0), LoMask, HiMask);
+
+ EVT VT = N->getValueType(0);
+ EVT SplitVT = LoMask.getValueType();
+ ElementCount SplitEC = SplitVT.getVectorElementCount();
+
+ // Find the last active lane in each half; both results keep N's type (VT).
+ SDValue LoFind = DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, DL, VT, LoMask);
+ SDValue HiFind = DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, DL, VT, HiMask);
+
+ // Check if any lane is active in the high mask: zero-extend it to i32 lanes,
+ // add-reduce, and compare the sum against zero. FIXME: Not needed if
+ // VECTOR_FIND_LAST_ACTIVE returned a sentinel value for "none active".
+ EVT WideSplitVT = SplitVT.changeElementType(*DAG.getContext(), MVT::i32);
+ SDValue WideHiMask = DAG.getNode(ISD::ZERO_EXTEND, DL, WideSplitVT, HiMask);
+ SDValue ReduceHiMask =
+ DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideHiMask);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+ EVT CmpVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+ SDValue AnyHiActive = DAG.getSetCC(DL, CmpVT, ReduceHiMask, Zero, ISD::SETNE);
+
+ // Return: AnyHiActive ? (HiFind + SplitEC) : LoFind;
+ return DAG.getSelect(DL, VT, AnyHiActive,
+ DAG.getNode(ISD::ADD, DL, VT, HiFind,
+ DAG.getElementCount(DL, VT, SplitEC)),
+ LoFind);
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
// The only possibility for an illegal operand is the mask, since result type
// legalization would have handled this node already otherwise.
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index 7ee8f6fda93f5..0b9b69f3af49c 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -549,6 +549,109 @@ define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
ret i32 %res
}
+; Exercise extract.last.active with a <32 x i1> mask (operand-splitting path).
+define i8 @extract_last_active_split(<32 x i8> %data, <32 x i8> %mask, i8 %passthru) {
+; NEON-FIXED-LABEL: extract_last_active_split:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: cmtst v4.16b, v3.16b, v3.16b
+; NEON-FIXED-NEXT: movi v16.4s, #1
+; NEON-FIXED-NEXT: adrp x8, .LCPI19_0
+; NEON-FIXED-NEXT: ext v5.16b, v4.16b, v4.16b, #8
+; NEON-FIXED-NEXT: zip2 v6.8b, v4.8b, v0.8b
+; NEON-FIXED-NEXT: zip1 v4.8b, v4.8b, v0.8b
+; NEON-FIXED-NEXT: zip2 v7.8b, v5.8b, v0.8b
+; NEON-FIXED-NEXT: zip1 v5.8b, v5.8b, v0.8b
+; NEON-FIXED-NEXT: ushll v6.4s, v6.4h, #0
+; NEON-FIXED-NEXT: ushll v4.4s, v4.4h, #0
+; NEON-FIXED-NEXT: and v6.16b, v6.16b, v16.16b
+; NEON-FIXED-NEXT: ushll v7.4s, v7.4h, #0
+; NEON-FIXED-NEXT: ushll v5.4s, v5.4h, #0
+; NEON-FIXED-NEXT: and v4.16b, v4.16b, v16.16b
+; NEON-FIXED-NEXT: and v7.16b, v7.16b, v16.16b
+; NEON-FIXED-NEXT: and v5.16b, v5.16b, v16.16b
+; NEON-FIXED-NEXT: cmeq v16.16b, v3.16b, #0
+; NEON-FIXED-NEXT: add v6.4s, v6.4s, v7.4s
+; NEON-FIXED-NEXT: cmeq v7.16b, v2.16b, #0
+; NEON-FIXED-NEXT: add v4.4s, v4.4s, v5.4s
+; NEON-FIXED-NEXT: ldr q5, [x8, :lo12:.LCPI19_0]
+; NEON-FIXED-NEXT: orr v2.16b, v2.16b, v3.16b
+; NEON-FIXED-NEXT: bic v3.16b, v5.16b, v16.16b
+; NEON-FIXED-NEXT: add v4.4s, v4.4s, v6.4s
+; NEON-FIXED-NEXT: bic v5.16b, v5.16b, v7.16b
+; NEON-FIXED-NEXT: cmtst v2.16b, v2.16b, v2.16b
+; NEON-FIXED-NEXT: umaxv b3, v3.16b
+; NEON-FIXED-NEXT: addv s4, v4.4s
+; NEON-FIXED-NEXT: umaxv b5, v5.16b
+; NEON-FIXED-NEXT: umaxv b2, v2.16b
+; NEON-FIXED-NEXT: fmov w8, s4
+; NEON-FIXED-NEXT: fmov w9, s3
+; NEON-FIXED-NEXT: fmov w10, s5
+; NEON-FIXED-NEXT: cmp w8, #0
+; NEON-FIXED-NEXT: add w9, w9, #16
+; NEON-FIXED-NEXT: csel x8, x9, x10, ne
+; NEON-FIXED-NEXT: stp q0, q1, [sp, #-32]!
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 32
+; NEON-FIXED-NEXT: and x8, x8, #0x1f
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldrb w8, [x9, x8]
+; NEON-FIXED-NEXT: fmov w9, s2
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: add sp, sp, #32
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_active_split:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: cmtst v4.16b, v3.16b, v3.16b
+; SVE-FIXED-NEXT: movi v16.4s, #1
+; SVE-FIXED-NEXT: cmeq v17.16b, v3.16b, #0
+; SVE-FIXED-NEXT: ext v5.16b, v4.16b, v4.16b, #8
+; SVE-FIXED-NEXT: zip2 v6.8b, v4.8b, v0.8b
+; SVE-FIXED-NEXT: zip1 v4.8b, v4.8b, v0.8b
+; SVE-FIXED-NEXT: zip2 v7.8b, v5.8b, v0.8b
+; SVE-FIXED-NEXT: zip1 v5.8b, v5.8b, v0.8b
+; SVE-FIXED-NEXT: ushll v6.4s, v6.4h, #0
+; SVE-FIXED-NEXT: ushll v4.4s, v4.4h, #0
+; SVE-FIXED-NEXT: and v6.16b, v6.16b, v16.16b
+; SVE-FIXED-NEXT: ushll v7.4s, v7.4h, #0
+; SVE-FIXED-NEXT: ushll v5.4s, v5.4h, #0
+; SVE-FIXED-NEXT: and v4.16b, v4.16b, v16.16b
+; SVE-FIXED-NEXT: and v7.16b, v7.16b, v16.16b
+; SVE-FIXED-NEXT: and v5.16b, v5.16b, v16.16b
+; SVE-FIXED-NEXT: index z16.b, #0, #1
+; SVE-FIXED-NEXT: add v6.4s, v6.4s, v7.4s
+; SVE-FIXED-NEXT: add v4.4s, v4.4s, v5.4s
+; SVE-FIXED-NEXT: cmeq v5.16b, v2.16b, #0
+; SVE-FIXED-NEXT: orr v2.16b, v2.16b, v3.16b
+; SVE-FIXED-NEXT: bic v3.16b, v16.16b, v17.16b
+; SVE-FIXED-NEXT: add v4.4s, v4.4s, v6.4s
+; SVE-FIXED-NEXT: bic v5.16b, v16.16b, v5.16b
+; SVE-FIXED-NEXT: umaxv b3, v3.16b
+; SVE-FIXED-NEXT: cmtst v2.16b, v2.16b, v2.16b
+; SVE-FIXED-NEXT: addv s4, v4.4s
+; SVE-FIXED-NEXT: umaxv b5, v5.16b
+; SVE-FIXED-NEXT: fmov w9, s3
+; SVE-FIXED-NEXT: umaxv b2, v2.16b
+; SVE-FIXED-NEXT: fmov w8, s4
+; SVE-FIXED-NEXT: fmov w10, s5
+; SVE-FIXED-NEXT: add w9, w9, #16
+; SVE-FIXED-NEXT: cmp w8, #0
+; SVE-FIXED-NEXT: csel x8, x9, x10, ne
+; SVE-FIXED-NEXT: stp q0, q1, [sp, #-32]!
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 32
+; SVE-FIXED-NEXT: and x8, x8, #0x1f
+; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: ldrb w8, [x9, x8]
+; SVE-FIXED-NEXT: fmov w9, s2
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: add sp, sp, #32
+; SVE-FIXED-NEXT: ret
+ %notzero = icmp ne <32 x i8> %mask, zeroinitializer
+ %res = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> %data, <32 x i1> %notzero, i8 %passthru)
+ ret i8 %res
+}
+
+
declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
index e891b27de2756..40de65a96f421 100644
--- a/llvm/test/CodeGen/X86/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
@@ -271,3 +271,166 @@ define i32 @extract_last_active_v4i32_penryn(<4 x i32> %a, <4 x i1> %c) "target-
%res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 poison)
ret i32 %res
}
+
+; Exercise extract.last.active with a <32 x i1> mask (operand-splitting path).
+define i8 @extract_last_active_split(<32 x i8> %data, <32 x i8> %mask, i8 %passthru) {
+; CHECK-LABEL: extract_last_active_split:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pxor %xmm5, %xmm5
+; CHECK-NEXT: movdqa %xmm2, %xmm6
+; CHECK-NEXT: por %xmm3, %xmm6
+; CHECK-NEXT: pcmpeqb %xmm5, %xmm3
+; CHECK-NEXT: pcmpeqd %xmm7, %xmm7
+; CHECK-NEXT: movdqa %xmm3, %xmm4
+; CHECK-NEXT: pxor %xmm7, %xmm4
+; CHECK-NEXT: pcmpeqb %xmm5, %xmm2
+; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT: pandn %xmm8, %xmm2
+; CHECK-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pandn %xmm8, %xmm3
+; CHECK-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pcmpeqb %xmm5, %xmm6
+; CHECK-NEXT: pxor %xmm7, %xmm6
+; CHECK-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: cmoval %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: cmpb %dl, %al
+; CHECK-NEXT: cmovbel %edx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: cmpb %cl, %al
+; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %cl, %dl
+; CHECK-NEXT: cmoval %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: cmpb %sil, %cl
+; CHECK-NEXT: cmovbel %esi, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: cmpb %dl, %cl
+; CHECK-NEXT: cmovbel %edx, %ecx
+; CHECK-NEXT: addl $16, %ecx
+; CHECK-NEXT: movdqa %xmm4, %xmm0
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-NEXT: pand %xmm2, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: movdqa %xmm4, %xmm3
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: pand %xmm2, %xmm3
+; CHECK-NEXT: paddd %xmm1, %xmm3
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: pand %xmm2, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: pand %xmm2, %xmm4
+; CHECK-NEXT: paddd %xmm0, %xmm4
+; CHECK-NEXT: paddd %xmm3, %xmm4
+; CHECK-NEXT: movd %xmm4, %edx
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
+; CHECK-NEXT: movd %xmm0, %esi
+; CHECK-NEXT: addl %edx, %esi
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; CHECK-NEXT: movd %xmm0, %edx
+; CHECK-NEXT: addl %esi, %edx
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
+; CHECK-NEXT: movd %xmm0, %esi
+; CHECK-NEXT: addl %edx, %esi
+; CHECK-NEXT: cmoveq %rax, %rcx
+; CHECK-NEXT: andl $31, %ecx
+; CHECK-NEXT: movzbl -40(%rsp,%rcx), %eax
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: testb $1, %cl
+; CHECK-NEXT: cmovel %edi, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %notzero = icmp ne <32 x i8> %mask, zeroinitializer
+ %res = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> %data, <32 x i1> %notzero, i8 %passthru)
+ ret i8 %res
+}
|
ISD::VECTOR_FIND_LAST_ACTIVEISD::VECTOR_FIND_LAST_ACTIVE
643dbf6 to
1a8f360
Compare
This lowers the splitting as: ``` any_active(hi_mask) ? (find_last_active(hi_mask) + lo_mask.getVectorElementCount()) : find_last_active(lo_mask) ``` The lowering likely can be improved. This patch is for completeness.
1a8f360 to
780716c
Compare
| // FIXME: This would not be necessary if VECTOR_FIND_LAST_ACTIVE returned a | ||
| // sentinel value for "none active". |
There was a problem hiding this comment.
What is currently the behaviour for VECTOR_FIND_LAST_ACTIVE if no lane is active? (is that a poison value?)
There was a problem hiding this comment.
I can't find it specified, though looking at the possible lowerings, it seems it could be zero or it could be the index of the last lane. Neither is useful (since both are within the range of valid elements). I think a more useful result would be -1 (since then you could use a signed-max of both results).
There was a problem hiding this comment.
It seems in SelectionDAGBuilder it creates a select (similar to what you've done here) for the passthru value when the mask is all zero, so the predicate mask can't be all-zero and it's indeed unspecified what the result of VECTOR_FIND_LAST_ACTIVE is otherwise (e.g. for a split vector case like here).
fhahn
left a comment
There was a problem hiding this comment.
Thanks!
I think this may also fix a crash on WebAssembly, for which we currently return invalid costs as a workaround. If so, it would be great if we could remove the cost workaround.
For a reproducer of the wasm crash, see
https://github.com/kripken/emscripten_/blob/vectorize_crash/test/vectorize_crash.cpp
clang++ -target wasm32-unknown-emscripten -O3 -c vectorize_crash.cpp
ISD::VECTOR_FIND_LAST_ACTIVEISD::VECTOR_FIND_LAST_ACTIVE
It does now that I've added scalarization for the |
| // FIXME: This would not be necessary if VECTOR_FIND_LAST_ACTIVE returned a | ||
| // sentinel value for "none active". |
There was a problem hiding this comment.
It seems in SelectionDAGBuilder it creates a select (similar to what you've done here) for the passthru value when the mask is all zero, so the predicate mask can't be all-zero and it's indeed unspecified what the result of VECTOR_FIND_LAST_ACTIVE is otherwise (e.g. for a split vector case like here).
…ons" (#180708) This patch extends the support added in #158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: ``` int simple_csa_int_load( int* a, int* b, int default_val, int N, int threshold) { int result = default_val; for (int i = 0; i < N; ++i) if (a[i] > threshold) result = b[i]; return result; } ``` It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence. --- Reverts #180275 (original PR: #178862) Additional type legalization for `ISD::VECTOR_FIND_LAST_ACTIVE` was added in #180290, which should resolve the backend crashes on x86.
The failures should have been resolved with llvm#180290 (which also added WebAssembly tests). This reverts commit 811fb22.
…nvalid cost." (#181545) The failures should have been resolved with llvm/llvm-project#180290 (which also added WebAssembly tests). This reverts commit llvm/llvm-project@811fb22. --- This is the same as #180942, but with a `lit.local.cfg` added to the CostModel test folder.
…." (llvm#180942) The failures should have been resolved with llvm#180290 (which also added WebAssembly tests). This reverts commit 811fb22.
…." (llvm#181545) The failures should have been resolved with llvm#180290 (which also added WebAssembly tests). This reverts commit llvm@811fb22. --- This is the same as llvm#180942, but with a `lit.local.cfg` added to the CostModel test folder.
…." (llvm#181545) The failures should have been resolved with llvm#180290 (which also added WebAssembly tests). This reverts commit llvm@811fb22. --- This is the same as llvm#180942, but with a `lit.local.cfg` added to the CostModel test folder.
This lowers the splitting as: `any_active(hi_mask) ? (find_last_active(hi_mask) + lo_mask.getVectorElementCount()) : find_last_active(lo_mask)`
And trivially lowers `<1 x i1>` scalarization to returning zero, which is a natural result of the splitting (and the lack of a sentinel "none-active" result value).
The lowerings likely can be improved. This patch is for completeness.
Should fix: #178862 (comment)
Fixes #180212