diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9f1fa43d46066..0debab4a2a0ee 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8334,7 +8334,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // failures. VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range); DenseMap IVEndValues; - VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues); + VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues, + CM.foldTailByMasking()); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to @@ -8451,7 +8452,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { // TODO: We can't call runPass on the transform yet, due to verifier // failures. DenseMap IVEndValues; - VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues); + VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues, + /*FoldTail=*/false); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 7455129ff91e8..df66dd6cee59a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1267,8 +1267,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, VScale, /// Compute the exiting value of a wide induction after vectorization, that /// is the value of the last lane of the induction increment (i.e. its - /// backedge value). Takes the wide induction recipe and the original - /// backedge value as operands. + /// backedge value). Has the wide induction recipe as operand. ExitingIVValue, MaskedCond, OpsEnd = MaskedCond, diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index d7074a2ecf2c1..8c82e4be88ce4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -638,8 +638,8 @@ createWidenInductionRecipe(PHINode *Phi, VPPhi *PhiR, VPIRValue *Start, Plan.getScalarPreheader()) && "last lane must be extracted in the middle block"); VPBuilder Builder(ExtractLastLane); - ExtractLastLane->replaceAllUsesWith(Builder.createNaryOp( - VPInstruction::ExitingIVValue, {WideIV, BackedgeVal})); + ExtractLastLane->replaceAllUsesWith( + Builder.createNaryOp(VPInstruction::ExitingIVValue, {WideIV})); ExtractLastLane->eraseFromParent(); ExtractLastPart->eraseFromParent(); } @@ -1266,17 +1266,19 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { // Update resume phis for inductions in the scalar preheader. If AnyNaNLane is // true, the resume from the start of the last vector iteration via the // canonical IV, otherwise from the original value. + auto IsTC = [&Plan](VPValue *V) { + return V == &Plan.getVectorTripCount() || V == Plan.getTripCount(); + }; for (auto &R : Plan.getScalarPreheader()->phis()) { auto *ResumeR = cast(&R); VPValue *VecV = ResumeR->getOperand(0); if (RdxResults.contains(VecV)) continue; if (auto *DerivedIV = dyn_cast(VecV)) { - if (DerivedIV->getNumUsers() == 1 && - DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) { - auto *NewSel = - MiddleBuilder.createSelect(AnyNaNLane, LoopRegion->getCanonicalIV(), - &Plan.getVectorTripCount()); + VPValue *DIVTC = DerivedIV->getOperand(1); + if (DerivedIV->getNumUsers() == 1 && IsTC(DIVTC)) { + auto *NewSel = MiddleBuilder.createSelect( + AnyNaNLane, LoopRegion->getCanonicalIV(), DIVTC); DerivedIV->moveAfter(&*MiddleBuilder.getInsertPoint()); DerivedIV->setOperand(1, NewSel); continue; @@ -1284,7 +1286,7 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { } // Bail out and abandon the current, partially modified, VPlan if we // encounter resume phi that cannot be updated yet. - if (VecV != &Plan.getVectorTripCount()) { + if (!IsTC(VecV)) { LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with " "FMaxNum/FMinNum reduction.\n"); return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 7ba302733762a..2b582a607dddc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -542,10 +542,10 @@ inline VPInstruction_match m_StepVector() { return m_VPInstruction(); } -template -inline VPInstruction_match -m_ExitingIVValue(const Op0_t &Op0, const Op1_t &Op1) { - return m_VPInstruction(Op0, Op1); +template +inline VPInstruction_match +m_ExitingIVValue(const Op0_t &Op0) { + return m_VPInstruction(Op0); } template diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index dbc2e71c785ee..ad4fd69882f30 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -328,9 +328,7 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { VPBuilder B(Plan.getMiddleBlock()->getTerminator()); for (VPRecipeBase &R : *Plan.getMiddleBlock()) { VPValue *Op; - if (!match(&R, m_CombineOr( - m_ExitingIVValue(m_VPValue(), m_VPValue(Op)), - m_ExtractLastLane(m_ExtractLastPart(m_VPValue(Op)))))) + if (!match(&R, m_ExtractLastLane(m_ExtractLastPart(m_VPValue(Op))))) continue; // Compute the index of the last active lane. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f7810c65d000e..a61b9d71183a0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -453,6 +453,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const { case Instruction::Load: case VPInstruction::BranchOnCond: case VPInstruction::Broadcast: + case VPInstruction::ExitingIVValue: case VPInstruction::ExplicitVectorLength: case VPInstruction::ExtractLastLane: case VPInstruction::ExtractLastPart: @@ -469,7 +470,6 @@ unsigned VPInstruction::getNumOperandsForOpcode() const { case Instruction::Store: case VPInstruction::BranchOnCount: case VPInstruction::BranchOnTwoConds: - case VPInstruction::ExitingIVValue: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::LogicalOr: diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e62c0d58ec2ad..3c9d79a62db14 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -987,13 +987,15 @@ static VPValue *optimizeLatchExitInductionUser( DenseMap &EndValues, PredicatedScalarEvolution &PSE) { VPValue *Incoming; VPWidenInductionRecipe *WideIV = nullptr; - if (match(Op, m_ExitingIVValue(m_VPValue(), m_VPValue(Incoming)))) { - WideIV = getOptimizableIVOf(Op->getDefiningRecipe()->getOperand(0), PSE); - assert(WideIV && "must have an optimizable IV"); - } else if (match(Op, m_ExtractLastLaneOfLastPart(m_VPValue(Incoming)))) { + if (match(Op, m_ExitingIVValue(m_VPValue(Incoming)))) { WideIV = getOptimizableIVOf(Incoming, PSE); + assert(WideIV && "must have an optimizable IV"); + return EndValues.lookup(WideIV); } + if (match(Op, m_ExtractLastLaneOfLastPart(m_VPValue(Incoming)))) + WideIV = getOptimizableIVOf(Incoming, PSE); + if (!WideIV) return nullptr; @@ -5515,7 +5517,7 @@ static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, } void VPlanTransforms::updateScalarResumePhis( - VPlan &Plan, DenseMap &IVEndValues) { + VPlan &Plan, DenseMap &IVEndValues, bool FoldTail) { VPTypeAnalysis TypeInfo(Plan); auto *ScalarPH = Plan.getScalarPreheader(); auto *MiddleVPBB = cast(ScalarPH->getPredecessors()[0]); @@ -5530,8 +5532,12 @@ void VPlanTransforms::updateScalarResumePhis( // pre-computed end value together in optimizeInductionExitUsers. auto *VectorPhiR = cast(ResumePhiR->getOperand(0)); if (auto *WideIVR = dyn_cast(VectorPhiR)) { + // TODO: Check if tail is folded directly in VPlan. + VPValue *TC = !FoldTail + ? static_cast(&Plan.getVectorTripCount()) + : Plan.getTripCount(); if (VPValue *EndValue = tryToComputeEndValueForInduction( - WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) { + WideIVR, VectorPHBuilder, TypeInfo, TC)) { IVEndValues[WideIVR] = EndValue; ResumePhiR->setOperand(0, EndValue); ResumePhiR->setName("bc.resume.val"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3e86b4044bbb1..0dce486cb1c2c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -463,9 +463,8 @@ struct VPlanTransforms { /// Update the resume phis in the scalar preheader after creating wide recipes /// for first-order recurrences, reductions and inductions. End values for /// inductions are added to \p IVEndValues. - static void - updateScalarResumePhis(VPlan &Plan, - DenseMap &IVEndValues); + static void updateScalarResumePhis( + VPlan &Plan, DenseMap &IVEndValues, bool FoldTail); /// Handle users in the exit block for first order reductions in the original /// exit block. The penultimate value of recurrences is fed to their LCSSA phi diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll index 1305f103d0f5f..60f7258ca7c2c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll @@ -81,20 +81,60 @@ exit: define ptr @low_trip_count_small_with_live_out(i32 %x, ptr %dst) { ; CHECK-LABEL: define ptr @low_trip_count_small_with_live_out( ; CHECK-SAME: i32 [[X:%.*]], ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[X]], i32 1) ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SMAX]], i32 4) +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SMAX]] to i64 +; CHECK-NEXT: [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 4) ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[PTR_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR_NEXT]] = getelementptr i8, ptr [[PTR]], i64 1 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMIN1]], 1 +; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = getelementptr i8, ptr [[DST]], i64 [[UMIN1]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[DST]], i64 0 +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[DST]], i64 1 +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DST]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> poison, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> [[TMP2]], ptr [[NEXT_GEP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> [[TMP3]], ptr [[NEXT_GEP3]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> [[TMP4]], ptr [[NEXT_GEP4]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <4 x i64> , [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[PTR_NEXT:%.*]] = getelementptr i8, ptr [[PTR]], i64 1 ; CHECK-NEXT: store i8 0, ptr [[PTR_NEXT]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], [[UMIN]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[EXIT:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 1 +; CHECK-NEXT: store i8 0, ptr [[TMP10]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; CHECK: [[PRED_STORE_CONTINUE6]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2 +; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; CHECK: [[PRED_STORE_IF7]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 1 +; CHECK-NEXT: store i8 0, ptr [[TMP12]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; CHECK: [[PRED_STORE_CONTINUE8]]: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; CHECK: [[PRED_STORE_IF9]]: +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 1 +; CHECK-NEXT: store i8 0, ptr [[TMP14]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; CHECK: [[PRED_STORE_CONTINUE10]]: +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT1:.*]] +; CHECK: [[EXIT1]]: ; CHECK-NEXT: ret ptr [[PTR_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/fold-tail-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/X86/fold-tail-low-trip-count.ll index 48fb579f93b74..ecfe7b2788831 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fold-tail-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fold-tail-low-trip-count.ll @@ -3,24 +3,79 @@ target triple = "x86_64-pc-windows-gnu" -define ptr @low_trip_count_via_profile_info_with_live_out(ptr align 16 %start, ptr align 16 %end, ptr noalias %src) #0 { -; CHECK-LABEL: define ptr @low_trip_count_via_profile_info_with_live_out( +define i8 @low_trip_count_via_profile_info_with_live_out(ptr align 16 %start, ptr align 16 %end, ptr noalias %src) #0 { +; CHECK-LABEL: define i8 @low_trip_count_via_profile_info_with_live_out( ; CHECK-SAME: ptr align 16 [[START:%.*]], ptr align 16 [[END:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[LOOP:.*]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[START]], %[[LOOP]] ], [ [[PTR_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1 ; CHECK-NEXT: [[PTR_NEXT]] = getelementptr i8, ptr [[PTR]], i64 1 ; CHECK-NEXT: store i8 [[L]], ptr [[PTR]], align 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq ptr [[PTR]], [[END]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[LOOP]], !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT1:.*]], label %[[VECTOR_BODY]], !prof [[PROF0:![0-9]+]] +; CHECK: [[EXIT1]]: +; CHECK-NEXT: [[L_LCSSA:%.*]] = phi i8 [ [[L]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: ret i8 [[L_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %ptr = phi ptr [ %start, %entry ], [ %ptr.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i8, ptr %src, i64 %iv.next + %l = load i8, ptr %gep.src, align 1 + %ptr.next = getelementptr i8, ptr %ptr, i64 1 + store i8 %l, ptr %ptr, align 1 + %exitcond = icmp eq ptr %ptr, %end + br i1 %exitcond, label %exit, label %loop, !prof !0 + +exit: + ret i8 %l +} + +define ptr @low_trip_count_via_profile_info_with_iv_live_out(ptr align 16 %start, ptr align 16 %end, ptr noalias %src) #0 { +; CHECK-LABEL: define ptr @low_trip_count_via_profile_info_with_iv_live_out( +; CHECK-SAME: ptr align 16 [[START:%.*]], ptr align 16 [[END:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], 63 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i64> [[BROADCAST_SPLATINSERT]], <64 x i64> poison, <64 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <64 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <64 x i64> [[BROADCAST_SPLATINSERT3]], <64 x i64> poison, <64 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i64> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <64 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 [[TMP5]], <64 x i1> [[TMP3]], <64 x i8> poison) +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[WIDE_MASKED_LOAD]], ptr align 1 [[NEXT_GEP]], <64 x i1> [[TMP3]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret ptr [[PTR_NEXT_LCSSA]] +; CHECK-NEXT: ret ptr [[TMP2]] ; entry: br label %loop @@ -70,7 +125,7 @@ define void @low_trip_count_via_profile_info(ptr align 16 %start, ptr align 16 % ; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[WIDE_MASKED_LOAD]], ptr align 1 [[NEXT_GEP]], <64 x i1> [[TMP2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index 93cf59c019d5f..6e74d5afe57ca 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -529,7 +529,7 @@ define i64 @example23d(ptr noalias nocapture %src, ptr noalias nocapture %dst) o ; CHECK-NEXT: br label [[TMP1:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2 @@ -544,7 +544,7 @@ define i64 @example23d(ptr noalias nocapture %src, ptr noalias nocapture %dst) o ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[TMP32]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] ; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP6]], i64 12 -; CHECK-NEXT: [[TMP33:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257) +; CHECK-NEXT: [[TMP33:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP33]], i64 0 ; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: @@ -584,18 +584,13 @@ define i64 @example23d(ptr noalias nocapture %src, ptr noalias nocapture %dst) o ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] ; CHECK: pred.store.continue14: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[TMP1]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[TMP30:%.*]] ; CHECK: 25: -; CHECK-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP33]], splat (i1 true) -; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP25]], i1 false) -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], -1 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = add nsw i64 [[TMP28]], 1 -; CHECK-NEXT: ret i64 [[TMP29]] +; CHECK-NEXT: ret i64 257 ; br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index bf03cfc2e8437..9748409a0094d 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -325,7 +325,7 @@ define float @fmaxnum_tailfold(ptr %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP52]], <4 x float> [[VEC_PHI1]] ; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP57]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP53]] ; CHECK-NEXT: [[TMP61:%.*]] = select i1 [[TMP57]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP54]] -; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP57]], i64 [[INDEX]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP57]], i64 [[INDEX]], i64 [[TMP0]] ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP60]], <4 x float> [[TMP61]]) ; CHECK-NEXT: [[TMP63:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: [[TMP64:%.*]] = xor i1 [[TMP57]], true diff --git a/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll b/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll index 1e63271cf26d5..fd9d1f67b8b07 100644 --- a/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll +++ b/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll @@ -89,14 +89,6 @@ define void @test3(ptr %p) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY1:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 6, 1 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 7, 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 8, 1 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 9, 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP3]], i32 3 ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P:%.*]], i64 0, i64 6 @@ -123,12 +115,9 @@ define void @test3(ptr %p) { ; CHECK: pred.store.continue6: ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> , i1 false) -; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 -; CHECK-NEXT: [[INC46_LCSSA:%.*]] = extractelement <4 x i32> [[TMP11]], i64 [[LAST_ACTIVE_LANE]] ; CHECK-NEXT: br label [[WHILE_END:%.*]] ; CHECK: while.end: -; CHECK-NEXT: [[ADD58:%.*]] = add i32 [[INC46_LCSSA]], 4 +; CHECK-NEXT: [[ADD58:%.*]] = add i32 8, 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll index 252f2942452b4..41cc313b418de 100644 --- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll +++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll @@ -101,6 +101,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; FORCED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 ; FORCED-TF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; FORCED-TF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[SIZE]], 1 +; FORCED-TF-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[SIZE]] ; FORCED-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 ; FORCED-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; FORCED-TF-NEXT: br label [[VECTOR_BODY:%.*]] @@ -110,7 +111,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; FORCED-TF-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 ; FORCED-TF-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 ; FORCED-TF-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 -; FORCED-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[TMP0]] +; FORCED-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP0]] ; FORCED-TF-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP1]] ; FORCED-TF-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP2]] ; FORCED-TF-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP3]] @@ -122,17 +123,10 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; FORCED-TF-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer ; FORCED-TF-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], ; FORCED-TF-NEXT: [[TMP8:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]] -; FORCED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 -; FORCED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1 -; FORCED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1 -; FORCED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1 -; FORCED-TF-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP9]], i32 0 -; FORCED-TF-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr> [[TMP13]], ptr [[TMP10]], i32 1 -; FORCED-TF-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 2 -; FORCED-TF-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 3 ; FORCED-TF-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 ; FORCED-TF-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FORCED-TF: pred.store.if: +; FORCED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 ; FORCED-TF-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP9]], align 1 ; FORCED-TF-NEXT: store i8 [[TMP18]], ptr [[NEXT_GEP]], align 1 ; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE]] @@ -140,6 +134,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; FORCED-TF-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 ; FORCED-TF-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] ; FORCED-TF: pred.store.if6: +; FORCED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1 ; FORCED-TF-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP10]], align 1 ; FORCED-TF-NEXT: store i8 [[TMP20]], ptr [[NEXT_GEP1]], align 1 ; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE7]] @@ -147,6 +142,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; FORCED-TF-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 ; FORCED-TF-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] ; FORCED-TF: pred.store.if8: +; FORCED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1 ; FORCED-TF-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP11]], align 1 ; FORCED-TF-NEXT: store i8 [[TMP22]], ptr [[NEXT_GEP2]], align 1 ; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE9]] @@ -154,6 +150,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; FORCED-TF-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 ; FORCED-TF-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]] ; FORCED-TF: pred.store.if10: +; FORCED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1 ; FORCED-TF-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP12]], align 1 ; FORCED-TF-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP3]], align 1 ; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE11]] @@ -162,10 +159,6 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; FORCED-TF-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FORCED-TF: middle.block: -; FORCED-TF-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) -; FORCED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP26]], i1 false) -; FORCED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 -; FORCED-TF-NEXT: [[TMP29:%.*]] = extractelement <4 x ptr> [[TMP16]], i64 [[TMP28]] ; FORCED-TF-NEXT: br label [[END:%.*]] ; FORCED-TF: end: ; FORCED-TF-NEXT: store ptr [[TMP29]], ptr [[POS]], align 4 @@ -180,6 +173,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[SIZE]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[SIZE]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -189,7 +183,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP0]] ; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP1]] ; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP2]] ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP3]] @@ -201,17 +195,10 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], ; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr> [[TMP13]], ptr [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 3 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 ; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: store i8 [[TMP18]], ptr [[NEXT_GEP]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] @@ -219,6 +206,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 ; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] ; CHECK: pred.store.if6: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1 ; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP10]], align 1 ; CHECK-NEXT: store i8 [[TMP20]], ptr [[NEXT_GEP1]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]] @@ -226,6 +214,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 ; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] ; CHECK: pred.store.if8: +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1 ; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP11]], align 1 ; CHECK-NEXT: store i8 [[TMP22]], ptr [[NEXT_GEP2]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] @@ -233,6 +222,7 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 ; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]] ; CHECK: pred.store.if10: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1 ; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP12]], align 1 ; CHECK-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP3]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] @@ -241,10 +231,6 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP26]], i1 false) -; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x ptr> [[TMP16]], i64 [[TMP28]] ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: store ptr [[TMP29]], ptr [[POS]], align 4