diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a5efffa2b441c..3274ef1e25358 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -931,6 +931,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue ScalarizeVecOp_VECREDUCE_SEQ(SDNode *N);
   SDValue ScalarizeVecOp_CMP(SDNode *N);
   SDValue ScalarizeVecOp_FAKE_USE(SDNode *N);
+  SDValue ScalarizeVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Splitting Support: LegalizeVectorTypes.cpp
@@ -1040,6 +1041,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SplitVecOp_VP_CttzElements(SDNode *N);
   SDValue SplitVecOp_VECTOR_HISTOGRAM(SDNode *N);
   SDValue SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N);
+  SDValue SplitVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Support: LegalizeVectorTypes.cpp
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index e1e6252fd8abc..dadcca16bd9b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -885,6 +885,9 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::UCMP:
     Res = ScalarizeVecOp_CMP(N);
     break;
+  case ISD::VECTOR_FIND_LAST_ACTIVE:
+    Res = ScalarizeVecOp_VECTOR_FIND_LAST_ACTIVE(N);
+    break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1185,6 +1188,21 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CMP(SDNode *N) {
   return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Cmp);
 }
 
+/// Scalarize the <1 x ty> mask operand of VECTOR_FIND_LAST_ACTIVE. The result
+/// is always the constant 0 of the node's result type, because index 0 is the
+/// only lane a single-element mask can report.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N) {
+  // There is no "none-active" result, so the only valid return for <1 x ty>
+  // is 0. This is safe because splitting checks the high mask before using
+  // its index. E.g., a <2 x ty> operation splits to:
+  //   any_active(%hi_mask) ? (1 + last_active(%hi_mask))
+  //                        : last_active(%lo_mask)
+  // which then scalarizes to:
+  //   %mask[1] ? 1 : 0
+  EVT VT = N->getValueType(0);
+  return DAG.getConstant(0, SDLoc(N), VT);
+}
+
 //===----------------------------------------------------------------------===//
 // Result Vector Splitting
 //===----------------------------------------------------------------------===//
@@ -3572,6 +3590,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::INSERT_SUBVECTOR:  Res = SplitVecOp_INSERT_SUBVECTOR(N, OpNo); break;
   case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::CONCAT_VECTORS:    Res = SplitVecOp_CONCAT_VECTORS(N); break;
+  case ISD::VECTOR_FIND_LAST_ACTIVE:
+    Res = SplitVecOp_VECTOR_FIND_LAST_ACTIVE(N);
+    break;
   case ISD::VP_TRUNCATE:
   case ISD::TRUNCATE:
     Res = SplitVecOp_TruncateHelper(N);
@@ -3732,6 +3753,38 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   return false;
 }
 
+/// Split the mask operand of VECTOR_FIND_LAST_ACTIVE into low and high halves
+/// and combine the per-half results: if any high lane is active the answer is
+/// the high half's index offset by the element count of a half, otherwise it
+/// is the low half's index.
+SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N) {
+  SDLoc DL(N);
+
+  SDValue LoMask, HiMask;
+  GetSplitVector(N->getOperand(0), LoMask, HiMask);
+
+  EVT VT = N->getValueType(0);
+  EVT SplitVT = LoMask.getValueType();
+  ElementCount SplitEC = SplitVT.getVectorElementCount();
+
+  // Find the last active in both the low and the high masks.
+  SDValue LoFind = DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, DL, VT, LoMask);
+  SDValue HiFind = DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, DL, VT, HiMask);
+
+  // Check if any lane is active in the high mask.
+  // FIXME: This would not be necessary if VECTOR_FIND_LAST_ACTIVE returned a
+  // sentinel value for "none active".
+  SDValue AnyHiActive = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, HiMask);
+  SDValue Cond = DAG.getBoolExtOrTrunc(AnyHiActive, DL,
+                                       getSetCCResultType(MVT::i1), MVT::i1);
+
+  // Return: AnyHiActive ? (HiFind + SplitEC) : LoFind;
+  return DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                     DAG.getNode(ISD::ADD, DL, VT, HiFind,
+                                 DAG.getElementCount(DL, VT, SplitEC)),
+                     LoFind);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
   // The only possibility for an illegal operand is the mask, since result type
   // legalization would have handled this node already otherwise.
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index 7ee8f6fda93f5..3f5c30b0c1fe3 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -549,22 +549,34 @@ define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
   ret i32 %res
 }
 
-declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
-declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
-declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
-declare i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
-declare half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half>, <8 x i1>, half)
-declare bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat>, <8 x i1>, bfloat)
-declare float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float>, <4 x i1>, float)
-declare double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double>, <2 x i1>, double)
-declare i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
-declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
-declare i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
-declare i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
-declare half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
-declare bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat)
-declare float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
-declare double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
-declare i1 @llvm.experimental.vector.extract.last.active.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i1)
+define i8 @extract_last_active_split(<vscale x 32 x i8> %data, <vscale x 32 x i1> %mask, i8 %passthru) #0 {
+; CHECK-LABEL: extract_last_active_split:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    ptest p1, p1.b
+; CHECK-NEXT:    rdvl x10, #1
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    lastb w8, p1, z2.b
+; CHECK-NEXT:    lastb w9, p0, z2.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT:    add x8, x8, x10
+; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    csel x8, x8, x9, ne
+; CHECK-NEXT:    sub x9, x10, #1
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ptest p0, p0.b
+; CHECK-NEXT:    ldrb w8, [x9, x8]
+; CHECK-NEXT:    csel w0, w8, w0, ne
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> %data, <vscale x 32 x i1> %mask, i8 %passthru)
+  ret i8 %res
+}
 
-attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
+attributes #0 = { nounwind "target-features"="+sve" vscale_range(1, 16) }
diff --git a/llvm/test/CodeGen/WebAssembly/vector-extract-last-active.ll b/llvm/test/CodeGen/WebAssembly/vector-extract-last-active.ll
new file mode 100644
index 0000000000000..ba1af61433cb3
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/vector-extract-last-active.ll
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define i32 @extract_last_active_v4i32(<4 x i32> %a, <4 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v4i32:
+; CHECK:         .functype extract_last_active_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get __stack_pointer
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.sub
+; CHECK-NEXT:    local.tee 8
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i32.store 12
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.store 8
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.store 4
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.store 0
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    i32.const 3
+; CHECK-NEXT:    i32.const 2
+; CHECK-NEXT:    local.get 7
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 5
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    local.get 7
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    i32.const 2
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.load 0
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    local.get 4
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    local.get 5
+; CHECK-NEXT:    local.get 7
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    # fallthrough-return
+  %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 -1)
+  ret i32 %res
+}
+
+define i32 @extract_last_active_v4i32_no_default(<4 x i32> %a, <4 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v4i32_no_default:
+; CHECK:         .functype extract_last_active_v4i32_no_default (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get __stack_pointer
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.sub
+; CHECK-NEXT:    local.tee 8
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i32.store 12
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.store 8
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.store 4
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.store 0
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    i32.const 3
+; CHECK-NEXT:    i32.const 2
+; CHECK-NEXT:    local.get 7
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 5
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    local.get 7
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    i32.const 2
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.load 0
+; CHECK-NEXT:    # fallthrough-return
+  %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 poison)
+  ret i32 %res
+}
+
+; Test v2i32 - smaller vector: the split mask halves scalarize to %c[1] ? 1 : 0.
+define i32 @extract_last_active_v2i32(<2 x i32> %a, <2 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v2i32:
+; CHECK:         .functype extract_last_active_v2i32 (i32, i32, i32, i32) -> (i32)
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get __stack_pointer
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.sub
+; CHECK-NEXT:    local.tee 4
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.store 12
+; CHECK-NEXT:    local.get 4
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.store 8
+; CHECK-NEXT:    local.get 4
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32.const 4
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.load 0
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    # fallthrough-return
+  %res = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> %a, <2 x i1> %c, i32 -1)
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
index e891b27de2756..2504d4952aeb3 100644
--- a/llvm/test/CodeGen/X86/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
@@ -271,3 +271,157 @@ define i32 @extract_last_active_v4i32_penryn(<4 x i32> %a, <4 x i1> %c) "target-
   %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 poison)
   ret i32 %res
 }
+
+define i8 @extract_last_active_split(<32 x i8> %data, <32 x i8> %mask, i8 %passthru) {
+; CHECK-LABEL: extract_last_active_split:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pxor %xmm4, %xmm4
+; CHECK-NEXT:    movdqa %xmm2, %xmm5
+; CHECK-NEXT:    por %xmm3, %xmm5
+; CHECK-NEXT:    pcmpeqb %xmm4, %xmm3
+; CHECK-NEXT:    pcmpeqd %xmm6, %xmm6
+; CHECK-NEXT:    movdqa %xmm3, %xmm7
+; CHECK-NEXT:    pxor %xmm6, %xmm7
+; CHECK-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pcmpeqb %xmm4, %xmm2
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT:    pandn %xmm7, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pandn %xmm7, %xmm3
+; CHECK-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pcmpeqb %xmm4, %xmm5
+; CHECK-NEXT:    pxor %xmm6, %xmm5
+; CHECK-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %cl, %dl
+; CHECK-NEXT:    cmoval %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    cmpb %sil, %cl
+; CHECK-NEXT:    cmovbel %esi, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    cmpb %dl, %cl
+; CHECK-NEXT:    cmovbel %edx, %ecx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %dl, %sil
+; CHECK-NEXT:    cmoval %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %r8d
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    cmpb %r8b, %dl
+; CHECK-NEXT:    cmovbel %r8d, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    cmpb %sil, %dl
+; CHECK-NEXT:    cmovbel %esi, %edx
+; CHECK-NEXT:    addl $16, %edx
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    cmoveq %rcx, %rdx
+; CHECK-NEXT:    andl $31, %edx
+; CHECK-NEXT:    movzbl -40(%rsp,%rdx), %eax
+; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    cmovel %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %notzero = icmp ne <32 x i8> %mask, zeroinitializer
+  %res = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> %data, <32 x i1> %notzero, i8 %passthru)
+  ret i8 %res
+}