[VPlan] Simplify ExitingIVValue and use for tail-folded IVs.#182507
Conversation
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) ChangesNow that we have ExitingIVValue, we can also use it for tail-folded loops; the only difference is that we have to compute the end value with the original trip count instead the vector trip count. This allows removing the induction increment operand only used when tail-folding and also fixes a mis-compile in @fmaxnum_tailfold, where we previously were scaling with the vector trip count, not the original trip count. Patch is 36.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182507.diff 14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6299e8c2dbd32..c264b1d468db3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8345,7 +8345,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// failures.
VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range);
DenseMap<VPValue *, VPValue *> IVEndValues;
- VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues);
+ VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues,
+ CM.foldTailByMasking());
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8462,7 +8463,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
// TODO: We can't call runPass on the transform yet, due to verifier
// failures.
DenseMap<VPValue *, VPValue *> IVEndValues;
- VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues);
+ VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues,
+ /*FoldTail=*/false);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a0c23df0b3c38..6451ced3aa55b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1267,8 +1267,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
VScale,
/// Compute the exiting value of a wide induction after vectorization, that
/// is the value of the last lane of the induction increment (i.e. its
- /// backedge value). Takes the wide induction recipe and the original
- /// backedge value as operands.
+ /// backedge value). Has the wide induction recipe as operand.
ExitingIVValue,
OpsEnd = ExitingIVValue,
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 1af7392b904da..c2081d05a79c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -643,7 +643,7 @@ createWidenInductionRecipe(PHINode *Phi, VPPhi *PhiR, VPIRValue *Start,
"last lane must be extracted in the middle block");
VPBuilder Builder(ExtractLastLane);
ExtractLastLane->replaceAllUsesWith(Builder.createNaryOp(
- VPInstruction::ExitingIVValue, {WideIV, BackedgeVal}));
+ VPInstruction::ExitingIVValue, {WideIV}));
ExtractLastLane->eraseFromParent();
ExtractLastPart->eraseFromParent();
}
@@ -1292,17 +1292,19 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
// Update resume phis for inductions in the scalar preheader. If AnyNaNLane is
// true, the resume from the start of the last vector iteration via the
// canonical IV, otherwise from the original value.
+ auto IsTC = [&Plan](VPValue *V) {
+ return V == &Plan.getVectorTripCount() || V == Plan.getTripCount();
+ };
for (auto &R : Plan.getScalarPreheader()->phis()) {
auto *ResumeR = cast<VPPhi>(&R);
VPValue *VecV = ResumeR->getOperand(0);
if (RdxResults.contains(VecV))
continue;
if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) {
- if (DerivedIV->getNumUsers() == 1 &&
- DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) {
- auto *NewSel =
- MiddleBuilder.createSelect(AnyNaNLane, LoopRegion->getCanonicalIV(),
- &Plan.getVectorTripCount());
+ VPValue *DIVTC = DerivedIV->getOperand(1);
+ if (DerivedIV->getNumUsers() == 1 && IsTC(DIVTC)) {
+ auto *NewSel = MiddleBuilder.createSelect(
+ AnyNaNLane, LoopRegion->getCanonicalIV(), DIVTC);
DerivedIV->moveAfter(&*MiddleBuilder.getInsertPoint());
DerivedIV->setOperand(1, NewSel);
continue;
@@ -1310,7 +1312,7 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
}
// Bail out and abandon the current, partially modified, VPlan if we
// encounter resume phi that cannot be updated yet.
- if (VecV != &Plan.getVectorTripCount()) {
+ if (!IsTC(VecV)) {
LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
"FMaxNum/FMinNum reduction.\n");
return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index db229300af17a..a5946cc761302 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -542,10 +542,10 @@ inline VPInstruction_match<VPInstruction::StepVector> m_StepVector() {
return m_VPInstruction<VPInstruction::StepVector>();
}
-template <typename Op0_t, typename Op1_t>
-inline VPInstruction_match<VPInstruction::ExitingIVValue, Op0_t, Op1_t>
-m_ExitingIVValue(const Op0_t &Op0, const Op1_t &Op1) {
- return m_VPInstruction<VPInstruction::ExitingIVValue>(Op0, Op1);
+template <typename Op0_t>
+inline VPInstruction_match<VPInstruction::ExitingIVValue, Op0_t>
+m_ExitingIVValue(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExitingIVValue>(Op0);
}
template <unsigned Opcode, typename Op0_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index dbc2e71c785ee..ad4fd69882f30 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -328,9 +328,7 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
VPBuilder B(Plan.getMiddleBlock()->getTerminator());
for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
VPValue *Op;
- if (!match(&R, m_CombineOr(
- m_ExitingIVValue(m_VPValue(), m_VPValue(Op)),
- m_ExtractLastLane(m_ExtractLastPart(m_VPValue(Op))))))
+ if (!match(&R, m_ExtractLastLane(m_ExtractLastPart(m_VPValue(Op)))))
continue;
// Compute the index of the last active lane.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33cb1509565d5..2cd2109d1a829 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -453,6 +453,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case Instruction::Load:
case VPInstruction::BranchOnCond:
case VPInstruction::Broadcast:
+ case VPInstruction::ExitingIVValue:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractLastPart:
@@ -468,7 +469,6 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case Instruction::Store:
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnTwoConds:
- case VPInstruction::ExitingIVValue:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::LogicalOr:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22a8edaf30eb6..339e05fe9d049 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -997,9 +997,14 @@ static VPValue *optimizeLatchExitInductionUser(
DenseMap<VPValue *, VPValue *> &EndValues, PredicatedScalarEvolution &PSE) {
VPValue *Incoming;
VPWidenInductionRecipe *WideIV = nullptr;
- if (match(Op, m_ExitingIVValue(m_VPValue(), m_VPValue(Incoming)))) {
+ if (match(Op, m_ExitingIVValue(m_VPValue()))) {
WideIV = getOptimizableIVOf(Op->getDefiningRecipe()->getOperand(0), PSE);
assert(WideIV && "must have an optimizable IV");
+ // ExitingIVValue always uses the post-increment (backedge) value, so
+ // the end value can be used directly without subtracting the step.
+ VPValue *EndValue = EndValues.lookup(WideIV);
+ assert(EndValue && "end value must have been pre-computed");
+ return EndValue;
} else if (match(Op, m_ExtractLastLaneOfLastPart(m_VPValue(Incoming)))) {
WideIV = getOptimizableIVOf(Incoming, PSE);
}
@@ -5509,7 +5514,7 @@ static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
}
void VPlanTransforms::updateScalarResumePhis(
- VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
+ VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues, bool FoldTail) {
VPTypeAnalysis TypeInfo(Plan);
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
@@ -5524,8 +5529,11 @@ void VPlanTransforms::updateScalarResumePhis(
// pre-computed end value together in optimizeInductionExitUsers.
auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
+ VPValue *TC = !FoldTail
+ ? static_cast<VPValue *>(&Plan.getVectorTripCount())
+ : Plan.getTripCount();
if (VPValue *EndValue = tryToComputeEndValueForInduction(
- WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
+ WideIVR, VectorPHBuilder, TypeInfo, TC)) {
IVEndValues[WideIVR] = EndValue;
ResumePhiR->setOperand(0, EndValue);
ResumePhiR->setName("bc.resume.val");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f2dfc166cecc9..4112a9db2c929 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -465,7 +465,8 @@ struct VPlanTransforms {
/// inductions are added to \p IVEndValues.
static void
updateScalarResumePhis(VPlan &Plan,
- DenseMap<VPValue *, VPValue *> &IVEndValues);
+ DenseMap<VPValue *, VPValue *> &IVEndValues,
+ bool FoldTail);
/// Handle users in the exit block for first order reductions in the original
/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll
index 1305f103d0f5f..60f7258ca7c2c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll
@@ -81,20 +81,60 @@ exit:
define ptr @low_trip_count_small_with_live_out(i32 %x, ptr %dst) {
; CHECK-LABEL: define ptr @low_trip_count_small_with_live_out(
; CHECK-SAME: i32 [[X:%.*]], ptr [[DST:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[X]], i32 1)
; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SMAX]], i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SMAX]] to i64
+; CHECK-NEXT: [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 4)
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[PTR_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[PTR_NEXT]] = getelementptr i8, ptr [[PTR]], i64 1
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMIN1]], 1
+; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = getelementptr i8, ptr [[DST]], i64 [[UMIN1]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[DST]], i64 0
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[DST]], i64 1
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DST]], i64 2
+; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DST]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> poison, ptr [[PTR]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> [[TMP2]], ptr [[NEXT_GEP2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> [[TMP3]], ptr [[NEXT_GEP3]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> [[TMP4]], ptr [[NEXT_GEP4]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[PTR_NEXT:%.*]] = getelementptr i8, ptr [[PTR]], i64 1
; CHECK-NEXT: store i8 0, ptr [[PTR_NEXT]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], [[UMIN]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
+; CHECK-NEXT: br i1 [[TMP9]], label %[[EXIT:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 1
+; CHECK-NEXT: store i8 0, ptr [[TMP10]], align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
+; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 1
+; CHECK-NEXT: store i8 0, ptr [[TMP12]], align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 1
+; CHECK-NEXT: store i8 0, ptr [[TMP14]], align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT1:.*]]
+; CHECK: [[EXIT1]]:
; CHECK-NEXT: ret ptr [[PTR_NEXT_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fold-tail-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/X86/fold-tail-low-trip-count.ll
index 48fb579f93b74..2a7164cd05ae8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fold-tail-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fold-tail-low-trip-count.ll
@@ -6,20 +6,38 @@ target triple = "x86_64-pc-windows-gnu"
define ptr @low_trip_count_via_profile_info_with_live_out(ptr align 16 %start, ptr align 16 %end, ptr noalias %src) #0 {
; CHECK-LABEL: define ptr @low_trip_count_via_profile_info_with_live_out(
; CHECK-SAME: ptr align 16 [[START:%.*]], ptr align 16 [[END:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], 63
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 64
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i64> [[BROADCAST_SPLATINSERT]], <64 x i64> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[LOOP]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[IV]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <64 x i64> poison, i64 [[IV]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <64 x i64> [[BROADCAST_SPLATINSERT3]], <64 x i64> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i64> [[BROADCAST_SPLAT4]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31, i64 32, i64 33, i64 34, i64 35, i64 36, i64 37, i64 38, i64 39, i64 40, i64 41, i64 42, i64 43, i64 44, i64 45, i64 46, i64 47, i64 48, i64 49, i64 50, i64 51, i64 52, i64 53, i64 54, i64 55, i64 56, i64 57, i64 58, i64 59, i64 60, i64 61, i64 62, i64 63>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <64 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add i64 [[IV]], 1
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
-; CHECK-NEXT: [[PTR_NEXT]] = getelementptr i8, ptr [[PTR]], i64 1
-; CHECK-NEXT: store i8 [[L]], ptr [[PTR]], align 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq ptr [[PTR]], [[END]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[LOOP]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 [[GEP_SRC]], <64 x i1> [[TMP3]], <64 x i8> poison)
+; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[WIDE_MASKED_LOAD]], ptr align 1 [[NEXT_GEP]], <64 x i1> [[TMP3]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[IV]], 64
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: br label %[[EXIT1:.*]]
+; CHECK: [[EXIT1]]:
; CHECK-NEXT: ret ptr [[PTR_NEXT_LCSSA]]
;
entry:
@@ -70,7 +88,7 @@ define void @low_trip_count_via_profile_info(ptr align 16 %start, ptr align 16 %
; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[WIDE_MASKED_LOAD]], ptr align 1 [[NEXT_GEP]], <64 x i1> [[TMP2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index 93cf59c019d5f..6e74d5afe57ca 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -529,7 +529,7 @@ defin...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
effc66b to
f5f4870
Compare
Now that we have ExitingIVValue, we can also use it for tail-folded loops; the only difference is that we have to compute the end value with the original trip count instead the vector trip count. This allows removing the induction increment operand only used when tail-folding and also fixes a mis-compile in @fmaxnum_tailfold, where we previously were scaling with the vector trip count, not the original trip count.
f5f4870 to
b016f68
Compare
| /// is the value of the last lane of the induction increment (i.e. its | ||
| /// backedge value). Takes the wide induction recipe and the original | ||
| /// backedge value as operands. | ||
| /// backedge value). Has the wide induction recipe as operand. |
There was a problem hiding this comment.
Just to confirm is removing the backedge value operand NFC?
There was a problem hiding this comment.
Yep, the only use was in the predicator, which is no longer needed
| // pre-computed end value together in optimizeInductionExitUsers. | ||
| auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0)); | ||
| if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { | ||
| VPValue *TC = !FoldTail |
There was a problem hiding this comment.
Instead of plumbing in the cost model can we check Plan.hasScalarTail() instead?
There was a problem hiding this comment.
Unfortunately not at the moment; currently we create a branch-on-cond with a constant for tail-folding/forced scalar iteration. Looking into having branch-on-const simplifications handle this early, but there's a few things that need fixing first it seems
There was a problem hiding this comment.
Ok, do you think it's worth leaving a TODO? It would be nice to move in the direction of removing tail folding specific code across the pipeline, and instead keep it in one place
There was a problem hiding this comment.
Put up #183397 to simplify constant branches earlier, then we can rely on checking if there's a scalar tail
| ; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP57]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP53]] | ||
| ; CHECK-NEXT: [[TMP61:%.*]] = select i1 [[TMP57]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP54]] | ||
| ; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP57]], i64 [[INDEX]], i64 [[N_VEC]] | ||
| ; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP57]], i64 [[INDEX]], i64 [[TMP0]] |
There was a problem hiding this comment.
Just confirming, it looks like this won't cause any miscompiles in practice? Since to get the wrong exiting IV we need to be TMP57 to be false, i.e. there's no NaNs. If there's no NaNs then we won't execute the scalar PH.
| // pre-computed end value together in optimizeInductionExitUsers. | ||
| auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0)); | ||
| if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { | ||
| VPValue *TC = !FoldTail |
There was a problem hiding this comment.
Ok, do you think it's worth leaving a TODO? It would be nice to move in the direction of removing tail folding specific code across the pipeline, and instead keep it in one place
| ; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_NEXT]], %[[LOOP]] ] | ||
| ; CHECK-NEXT: br label %[[EXIT1:.*]] | ||
| ; CHECK: [[EXIT1]]: | ||
| ; CHECK-NEXT: ret ptr [[PTR_NEXT_LCSSA]] |
There was a problem hiding this comment.
Do we want another test with a live out that can't be simplified to make sure we still test we don't tail fold low trip count loops w/ a middle block?
There was a problem hiding this comment.
Sounds good, I update the existing test to use %l as live-out + added one with the ptr.next
| // true, the resume from the start of the last vector iteration via the | ||
| // canonical IV, otherwise from the original value. | ||
| auto IsTC = [&Plan](VPValue *V) { | ||
| return V == &Plan.getVectorTripCount() || V == Plan.getTripCount(); |
There was a problem hiding this comment.
I presume the only case where the IV operand of a VPDerivedIVRecipe would be Plan.getTripCount() is when using tail-folding? That then means the only way we could even enter the scalar preheader is if:
- We skipped the tail-folded vector loop due to failing runtime checks, in which case we couldn't possibly be entering the scalar pre-header via the middle block and hence we're not resuming anything, or
- The tail-folded vector loop deliberately exits the loop early with the intention of mopping up the remaining iterations in the scalar loop.
Is my understanding correct here?
| ; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[PTR_NEXT:%.*]], %[[LOOP]] ] | ||
| ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] | ||
| ; CHECK-NEXT: [[PTR_NEXT]] = getelementptr i8, ptr [[PTR]], i64 1 | ||
| ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMIN1]], 1 |
There was a problem hiding this comment.
Do you know why we are now tail-folding this loop? Is it due to the lower costs in the middle block?
There was a problem hiding this comment.
Yep, now the cost should be very close to @low_trip_count_small above
| ; CHECK-NEXT: [[PTR_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_NEXT]], %[[LOOP]] ] | ||
| ; CHECK-NEXT: br label %[[EXIT1:.*]] | ||
| ; CHECK: [[EXIT1]]: | ||
| ; CHECK-NEXT: ret ptr [[PTR_NEXT_LCSSA]] |
…s. (#182507) Now that we have ExitingIVValue, we can also use it for tail-folded loops; the only difference is that we have to compute the end value with the original trip count instead the vector trip count. This allows removing the induction increment operand only used when tail-folding. PR: llvm/llvm-project#182507
…2507) Now that we have ExitingIVValue, we can also use it for tail-folded loops; the only difference is that we have to compute the end value with the original trip count instead the vector trip count. This allows removing the induction increment operand only used when tail-folding. PR: llvm#182507
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows to check if there's a scalar tail instead to check if the tail has been folded, as mentioned in #182507 This requires to remove recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on #181252. PR: #183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows to check if there's a scalar tail instead to check if the tail has been folded, as mentioned in llvm/llvm-project#182507 This requires to remove recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm/llvm-project#181252. PR: llvm/llvm-project#183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows to check if there's a scalar tail instead to check if the tail has been folded, as mentioned in llvm/llvm-project#182507 This requires to remove recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm/llvm-project#181252. PR: llvm/llvm-project#183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows to check if there's a scalar tail instead to check if the tail has been folded, as mentioned in llvm#182507 This requires to remove recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm#181252. PR: llvm#183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows to check if there's a scalar tail instead to check if the tail has been folded, as mentioned in llvm/llvm-project#182507 This requires to remove recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm/llvm-project#181252. PR: llvm/llvm-project#183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows to check if there's a scalar tail instead to check if the tail has been folded, as mentioned in llvm#182507 This requires to remove recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm#181252. PR: llvm#183397
Now that we have ExitingIVValue, we can also use it for tail-folded loops; the only difference is that we have to compute the end value with the original trip count instead the vector trip count.
This allows removing the induction increment operand only used when tail-folding and also fixes a mis-compile in @fmaxnum_tailfold, where we previously were scaling with the vector trip count, not the original trip count.