[VPlan] Remove manual region removal when simplifying for VF and UF.#181252
Conversation
|
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) Changes: Replace manual region dissolution code in simplifyBranchConditionForVFAndUF with using general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. Patch is 173.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/181252.diff 20 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9b2641c2b9bb7..6a711f7a20a0c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7362,37 +7362,43 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
else
return;
- auto *EpiRedHeaderPhi = cast_if_present<VPReductionPHIRecipe>(
- vputils::findRecipe(BackedgeVal, IsaPred<VPReductionPHIRecipe>));
- if (!EpiRedHeaderPhi) {
- match(BackedgeVal,
- VPlanPatternMatch::m_Select(VPlanPatternMatch::m_VPValue(),
- VPlanPatternMatch::m_VPValue(BackedgeVal),
- VPlanPatternMatch::m_VPValue()));
- EpiRedHeaderPhi = cast<VPReductionPHIRecipe>(
+ Value *MainResumeValue;
+ if (match(BackedgeVal, m_VPInstruction<VPInstruction::ReductionStartVector>(
+ m_VPValue(Incoming), m_VPValue(), m_VPValue()))) {
+ MainResumeValue = Incoming->getUnderlyingValue();
+ } else {
+ auto *EpiRedHeaderPhi = cast_if_present<VPReductionPHIRecipe>(
vputils::findRecipe(BackedgeVal, IsaPred<VPReductionPHIRecipe>));
- }
+ if (!EpiRedHeaderPhi) {
+ match(BackedgeVal, VPlanPatternMatch::m_Select(
+ VPlanPatternMatch::m_VPValue(),
+ VPlanPatternMatch::m_VPValue(BackedgeVal),
+ VPlanPatternMatch::m_VPValue()));
+ EpiRedHeaderPhi = cast<VPReductionPHIRecipe>(
+ vputils::findRecipe(BackedgeVal, IsaPred<VPReductionPHIRecipe>));
+ }
- Value *MainResumeValue;
- if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
- assert((VPI->getOpcode() == VPInstruction::Broadcast ||
- VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
- "unexpected start recipe");
- MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
- } else
- MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
- if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
- [[maybe_unused]] Value *StartV =
- EpiRedResult->getOperand(0)->getLiveInIRValue();
- auto *Cmp = cast<ICmpInst>(MainResumeValue);
- assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
- "AnyOf expected to start with ICMP_NE");
- assert(Cmp->getOperand(1) == StartV &&
- "AnyOf expected to start by comparing main resume value to original "
- "start value");
- MainResumeValue = Cmp->getOperand(0);
- } else if (IsFindIV) {
- MainResumeValue = cast<SelectInst>(MainResumeValue)->getFalseValue();
+ if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
+ assert((VPI->getOpcode() == VPInstruction::Broadcast ||
+ VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
+ "unexpected start recipe");
+ MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
+ } else
+ MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
+ if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
+ [[maybe_unused]] Value *StartV =
+ EpiRedResult->getOperand(0)->getLiveInIRValue();
+ auto *Cmp = cast<ICmpInst>(MainResumeValue);
+ assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
+ "AnyOf expected to start with ICMP_NE");
+ assert(
+ Cmp->getOperand(1) == StartV &&
+ "AnyOf expected to start by comparing main resume value to original "
+ "start value");
+ MainResumeValue = Cmp->getOperand(0);
+ } else if (IsFindIV) {
+ MainResumeValue = cast<SelectInst>(MainResumeValue)->getFalseValue();
+ }
}
PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
@@ -7467,6 +7473,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::expandBranchOnTwoConds(BestVPlan);
// Canonicalize EVL loops after regions are dissolved.
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
+ // Remove dead edges for single-iteration loops with BranchOnCond(true).
+ VPlanTransforms::removeBranchOnConst(BestVPlan);
VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
VPlanTransforms::materializeVectorTripCount(
BestVPlan, VectorPH, CM.foldTailByMasking(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 89a270d1219e7..7e5fe3a3a7e86 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -77,6 +77,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPCanonicalIVPHISC:
case VPBranchOnMaskSC:
case VPDerivedIVSC:
+ case VPEVLBasedIVPHISC:
case VPFirstOrderRecurrencePHISC:
case VPReductionPHISC:
case VPScalarIVStepsSC:
@@ -127,6 +128,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
case VPBranchOnMaskSC:
case VPDerivedIVSC:
+ case VPEVLBasedIVPHISC:
case VPFirstOrderRecurrencePHISC:
case VPReductionPHISC:
case VPPredInstPHISC:
@@ -164,6 +166,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
case VPActiveLaneMaskPHISC:
case VPDerivedIVSC:
+ case VPEVLBasedIVPHISC:
case VPFirstOrderRecurrencePHISC:
case VPReductionPHISC:
case VPPredInstPHISC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d49cbfadcf079..122a311f84cb4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1518,7 +1518,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
return;
}
- if (isa<VPPhi, VPWidenPHIRecipe>(Def)) {
+ if (isa<VPPhi, VPWidenPHIRecipe, VPHeaderPHIRecipe>(Def)) {
if (Def->getNumOperands() == 1)
Def->replaceAllUsesWith(Def->getOperand(0));
return;
@@ -2061,74 +2061,17 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
return false;
}
- // The vector loop region only executes once. If possible, completely remove
- // the region, otherwise replace the terminator controlling the latch with
- // (BranchOnCond true).
- // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
- // support for other non-canonical widen induction recipes (e.g.,
- // VPWidenPointerInductionRecipe).
- // TODO: fold branch-on-constant after dissolving region.
- auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
- if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
- if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
- return R->isCanonical();
- return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
- VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
- })) {
- for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
- if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
- VPBuilder Builder(Plan.getVectorPreheader());
- VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
- R->getScalarType());
- HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
- HeaderR.eraseFromParent();
- continue;
- }
- auto *Phi = cast<VPPhiAccessors>(&HeaderR);
- HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
- HeaderR.eraseFromParent();
- }
-
- VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
- SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
- VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
- for (VPBlockBase *Exit : Exits)
- VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
-
- for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
- B->setParent(nullptr);
-
- VPBlockUtils::connectBlocks(Preheader, Header);
-
- for (VPBlockBase *Exit : Exits)
- VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
-
- // Replace terminating branch-on-two-conds with branch-on-cond to early
- // exit.
- if (Exits.size() != 1) {
- assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
- "BranchOnTwoConds needs 2 remaining exits");
- VPBuilder(Term).createNaryOp(VPInstruction::BranchOnCond,
- Term->getOperand(0));
- }
- VPlanTransforms::simplifyRecipes(Plan);
- } else {
- // The vector region contains header phis for which we cannot remove the
- // loop region yet.
-
- // For BranchOnTwoConds, set the latch exit condition to true directly.
- if (match(Term, m_BranchOnTwoConds())) {
- Term->setOperand(1, Plan.getTrue());
- return true;
- }
-
- auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
- {}, {}, Term->getDebugLoc());
- ExitingVPBB->appendRecipe(BOC);
+ // The vector loop region only executes once. Convert terminator of the
+ // exiting block to exit in the first iteration.
+ if (match(Term, m_BranchOnTwoConds())) {
+ Term->setOperand(1, Plan.getTrue());
+ return true;
}
+ auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
+ {}, {}, Term->getDebugLoc());
+ ExitingVPBB->appendRecipe(BOC);
Term->eraseFromParent();
-
return true;
}
@@ -2728,8 +2671,9 @@ void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
continue;
- assert(VPBB->getNumSuccessors() == 2 &&
- "Two successors expected for BranchOnCond");
+ // BranchOnCond requires exactly 2 successors.
+ if (VPBB->getNumSuccessors() != 2)
+ continue;
unsigned RemovedIdx;
if (match(Cond, m_True()))
RemovedIdx = 1;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index e054c916de6e0..d54e8582676d6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -8,28 +8,17 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[TMP8]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], splat (i64 3)
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[TMP8]], splat (i64 3)
; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr align 1 [[NEXT_GEP]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr align 1 [[DST]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]])
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
@@ -68,28 +57,17 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SHR]] to i64
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[TMP8]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], splat (i64 3)
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[TMP8]], splat (i64 3)
; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr align 1 [[NEXT_GEP]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr align 1 [[DST]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]])
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
@@ -124,9 +102,3 @@ for.body: ; preds = %for.body.preheader,
for.cond.cleanup: ; preds = %for.body
ret void
}
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 6ea9809dc8ff8..443f51094ffcf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -12,11 +12,9 @@ define double @test_reduction_costs() {
; COMMON: [[VECTOR_PH]]:
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
-; COMMON-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP0:%.*]], %[[VECTOR_BODY]] ]
-; COMMON-NEXT: [[VEC_PHI1:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
-; COMMON-NEXT: [[TMP0]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI]], <2 x double> splat (double 3.000000e+00))
-; COMMON-NEXT: [[TMP1]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI1]], <2 x double> splat (double 9.000000e+00))
-; COMMON-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; COMMON-NEXT: [[TMP0:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> splat (double 3.000000e+00))
+; COMMON-NEXT: [[TMP1:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> splat (double 9.000000e+00))
+; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; COMMON: [[MIDDLE_BLOCK]]:
; COMMON-NEXT: br label %[[EXIT:.*]]
; COMMON: [[EXIT]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index c340cfc9ad6cc..6acda0d4b3294 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -464,13 +464,9 @@ define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> zeroinitializer, [[WIDE_LOAD]]
+; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br label %[[EXIT:.*]]
@@ -509,7 +505,7 @@ define i32 @tc4_from_profile(ptr noundef readonly captures(none) %tmp, i64 %N) v
; CHECK-NEXT: [[ADD]] = add i32 [[SUM_0179]], [[TMP0]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !prof [[PROF8:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], ...
[truncated]
|
| @@ -125,11 +125,12 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture | |||
| ; CHECK: vector.ph: | |||
| ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] | |||
| ; CHECK: vector.body: | |||
| ; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[SRC:%.*]], <vscale x 4 x i1> splat (i1 true), i32 8) | |||
| ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 8, i32 4, i1 true) | |||
There was a problem hiding this comment.
this I think is just the result of now having one less run of simplifyRecipes
There was a problem hiding this comment.
Is it possible to preserve the simplifyRecipes call in the new code? Would be good to not introduce regressions?
There was a problem hiding this comment.
Unfortunately there's no good place to do this now, unless we would like to run it unconditionally, as we don't simplify the phis early on now.
There was a problem hiding this comment.
Maybe run it unconditionally, paying the compile-time cost for now? At least the regressions would be avoided?
There was a problem hiding this comment.
Looks like it wasn't simplifyRecipes after all, but a different EVL simplification that now did not trigger. Hopefully this should be addressed by moving the fold to simplifyRecipes itself: #183392
There was a problem hiding this comment.
Thanks for looking into this!
There was a problem hiding this comment.
Found some cases where #183392 did not help; moved simplifyKnownEVL to run later, after region has been removed + simplified to avoid regression here
03c9fe5 to
30f7b23
Compare
| @@ -1526,7 +1526,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { | |||
| return; | |||
| } | |||
|
|
|||
| if (isa<VPPhi, VPWidenPHIRecipe>(Def)) { | |||
| if (isa<VPPhi, VPWidenPHIRecipe, VPHeaderPHIRecipe>(Def)) { | |||
There was a problem hiding this comment.
Just checking this is because removeBranchOnConst only removes the latch incoming value, so we need to simplify the header recipes to the start value?
There was a problem hiding this comment.
Yep exactly, we remove the incoming value and then leave it for simplifyRecipes to remove the trivial phis
| @@ -125,11 +125,12 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture | |||
| ; CHECK: vector.ph: | |||
| ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] | |||
| ; CHECK: vector.body: | |||
| ; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[SRC:%.*]], <vscale x 4 x i1> splat (i1 true), i32 8) | |||
| ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 8, i32 4, i1 true) | |||
There was a problem hiding this comment.
Is it possible to preserve the simplifyRecipes call in the new code? Would be good to not introduce regressions?
Move the logic to simplify known EVL to simplifyRecipe as a fold once we have a single VF. It does not look like this only triggers when folding EVL with constant arguments. It cannot be a complete constant fold, as it needs to access the function's vscale range. Needed to avoid regressions in combination with llvm#181252.
609ea43 to
7930fa4
Compare
7930fa4 to
766b3b1
Compare
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
766b3b1 to
566485b
Compare
artagnon
left a comment
There was a problem hiding this comment.
Nice, thanks for handling the regression!
…lvm#181252) Replace manual region dissolution code in simplifyBranchConditionForVFAndUF with using general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: llvm#181252
…and UF. (llvm#181252)" This reverts commit 9c53215. Appears to cause crashes with ordered reductions, revert while I investigate
When narrowInterleaveGroups transforms a plan, VF and VFxUF are materialized (replaced with concrete values). This patch also materializes the VectorTripCount in the same transform. This ensures that VectorTripCount is properly computed when the narrow interleave transform is applied, instead of using the original VF + UF to compute the vector trip count. The previous behavior generated correct code, but executed fewer iterations in the vector loop. The change also enables stricter verification to prevent accesses of VPSymbolicValues after materialization as a follow-up. Note that in some cases we now miss branch folding, but that should be addressed separately, llvm#181252
… and UF. (#181252)" This reverts commit d7e037c. Recommit with a small fix to properly handle ordered reductions when connecting the epilogue. Original message: Replace manual region dissolution code in simplifyBranchConditionForVFAndUF with using general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: #181252
…ying for VF and UF. (#181252)" This reverts commit d7e037c. Recommit with a small fix to properly handle ordered reductions when connecting the epilogue. Original message: Replace manual region dissolution code in simplifyBranchConditionForVFAndUF with using general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: llvm/llvm-project#181252
|
The reland is still causing assertions. See #185345 for details. Reverting for now to get things back to green. Sorry for the inconvenience. |
When narrowInterleaveGroups transforms a plan, VF and VFxUF are materialized (replaced with concrete values). This patch also materializes the VectorTripCount in the same transform. This ensures that VectorTripCount is properly computed when the narrow interleave transform is applied, instead of using the original VF + UF to compute the vector trip count. The previous behavior generated correct code, but executed fewer iterations in the vector loop. The change also enables stricter verification to prevent accesses of UF, VF, VFxUF etc. after materialization as a follow-up. Note that in some cases we now miss branch folding, but that should be addressed separately, #181252 Fixes one of the violations accessing a VectorTripCount after UF and VF being materialized PR: #182146
…ups. (#182146) When narrowInterleaveGroups transforms a plan, VF and VFxUF are materialized (replaced with concrete values). This patch also materializes the VectorTripCount in the same transform. This ensures that VectorTripCount is properly computed when the narrow interleave transform is applied, instead of using the original VF + UF to compute the vector trip count. The previous behavior generated correct code, but executed fewer iterations in the vector loop. The change also enables stricter verification to prevent accesses of UF, VF, VFxUF etc. after materialization as a follow-up. Note that in some cases we now miss branch folding, but that should be addressed separately, llvm/llvm-project#181252 Fixes one of the violations accessing a VectorTripCount after UF and VF being materialized PR: llvm/llvm-project#182146
… and UF. (llvm#181252)" This reverts commit e30f9c1. Re-land, now that the reported crash causing the revert has been fixed as part of 77fb848 (llvm#187504). Original message: Replace manual region dissolution code in simplifyBranchConditionForVFAndUF with using general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: llvm#181252
… and UF. (#181252)" (#188589) This reverts commit e30f9c1. Re-land, now that the reported crash causing the revert has been fixed as part of 77fb848 (#187504). Original message: Replace manual region dissolution code in simplifyBranchConditionForVFAndUF with using general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: #181252
…ying for VF and UF. (#181252)" (#188589) This reverts commit e30f9c1. Re-land, now that the reported crash causing the revert has been fixed as part of 77fb848 (#187504). Original message: Replace manual region dissolution code in simplifyBranchConditionForVFAndUF with using general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: llvm/llvm-project#181252
… and UF. (llvm#181252)" (llvm#188589) This reverts commit e30f9c1. Re-land, now that the reported crash causing the revert has been fixed as part of 77fb848 (llvm#187504). Original message: Replace manual region dissolution code in simplifyBranchConditionForVFAndUF with using general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: llvm#181252
… and UF. (llvm#181252)" (llvm#188589) This reverts commit e30f9c1. Re-land, now that the reported crash causing the revert has been fixed as part of 77fb848 (llvm#187504). Original message: Replace the manual region dissolution code in simplifyBranchConditionForVFAndUF with the general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: llvm#181252
… and UF. (#181252)" (#188589) This reverts commit e30f9c1. Re-land, now that the reported crash causing the revert has been fixed as part of 77fb848 (#187504). Original message: Replace the manual region dissolution code in simplifyBranchConditionForVFAndUF with the general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds. The loop then gets automatically removed by running removeBranchOnConst. This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains. Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted. The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value. PR: llvm/llvm-project#181252 (cherry picked from commit 40304d8)
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows checking whether there's a scalar tail instead of checking whether the tail has been folded, as mentioned in #182507. This requires removing recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on #181252. PR: #183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows checking whether there's a scalar tail instead of checking whether the tail has been folded, as mentioned in llvm/llvm-project#182507. This requires removing recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm/llvm-project#181252. PR: llvm/llvm-project#183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows checking whether there's a scalar tail instead of checking whether the tail has been folded, as mentioned in llvm/llvm-project#182507. This requires removing recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm/llvm-project#181252. PR: llvm/llvm-project#183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows checking whether there's a scalar tail instead of checking whether the tail has been folded, as mentioned in llvm#182507. This requires removing recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm#181252. PR: llvm#183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows checking whether there's a scalar tail instead of checking whether the tail has been folded, as mentioned in llvm/llvm-project#182507. This requires removing recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm/llvm-project#181252. PR: llvm/llvm-project#183397
Simplify constant branches early, after introducing the check in the middle block. This removes any trivial branches in the input CFG (e.g. over-reduced test cases) early and also folds branches on true/false created by addMiddleChecks. This allows checking whether there's a scalar tail instead of checking whether the tail has been folded, as mentioned in llvm#182507. This requires removing recipes in the new unreachable blocks, as otherwise we would fail during verification, due to uses in unreachable blocks. Alternatively, we may be able to skip verification for uses in unreachable blocks. Depends on llvm#181252. PR: llvm#183397
Replace the manual region dissolution code in
simplifyBranchConditionForVFAndUF with the general removeBranchOnConst. simplifyBranchConditionForVFAndUF now just creates a (BranchOnCond true) or updates BranchOnTwoConds.
The loop then gets automatically removed by running removeBranchOnConst.
This removes a bunch of special logic to handle header phi replacements and CFG updates. With the new code, there's no restriction on what kind of header phi recipes the loop contains.
Note that VPEVLBasedIVRecipe needs to be marked as readnone. This is technically unrelated, but I could not find an independent test that would be impacted.
The code to deal with epilogue resume values now needs updating, because we may simplify a reduction directly to the start value.