[VPlan] Introduce explicit ExtractFromEnd recipes for live-outs.#100658
Conversation
Introduce explicit ExtractFromEnd recipes to extract the final values for live-outs instead of implicitly extracting in VPLiveOut::fixPhi. This is a follow-up to the recent changes of modeling extracts for recurrences and consolidates live-out extract creation for fixed-order recurrences at a single place: addLiveOutsForFirstOrderRecurrences. It is also in preparation of replacing VPLiveOut with VPIRInstructions wrapping the original scalar phis.
|
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesIntroduce explicit ExtractFromEnd recipes to extract the final values for live-outs instead of implicitly extracting in VPLiveOut::fixPhi. This is a follow-up to the recent changes of modeling extracts for recurrences and consolidates live-out extract creation for fixed-order recurrences at a single place: addLiveOutsForFirstOrderRecurrences. It is also in preparation of replacing VPLiveOut with VPIRInstructions wrapping the original scalar phis. Patch is 26.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100658.diff 9 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f52680..61b29295d07b5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8452,7 +8452,104 @@ static void addUsersInExitBlock(
return P && Inductions.contains(P);
})))
continue;
- Plan.addLiveOut(&ExitPhi, V);
+
+ auto MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ VPBuilder B(MiddleVPBB);
+ if (auto *Terminator = MiddleVPBB->getTerminator()) {
+ auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
+ assert((!Condition || Condition->getParent() == MiddleVPBB) &&
+ "Condition expected in MiddleVPBB");
+ B.setInsertPoint(Condition ? Condition : Terminator);
+ }
+
+ VPValue *Ext;
+ if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(
+ V->getDefiningRecipe())) {
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have the
+ // following loop with some use after the loop of the last a[i-1],
+ //
+ // for (int i = 0; i < n; ++i) {
+ // t = a[i - 1];
+ // b[i] = a[i] - t;
+ // }
+ // use t;
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // use = lcssa.phi [s1, scalar.body]
+ //
+ // In this example, s1 is a recurrence because it's value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // vector phi v1 for s1. We now complete the vectorization and produce the
+ // shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // s_penultimate = v2(2) = v3(3)
+ // s_resume = v2(3)
+ // br cond, scalar.ph, exit.block
+ //
+ // scalar.ph:
+ // s_init' = phi [s_resume, middle.block], [s_init, otherwise]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init', scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
+ //
+ // After execution completes the vector loop, we extract the next value of
+ // the recurrence (x) to use as the initial value in the scalar loop. This
+ // is modeled by ExtractFromEnd.
+ //
+ // Extract the penultimate value of the recurrence and update VPLiveOut
+ // users of the recurrence splice. Note that the extract of the final
+ // value used to resume in the scalar loop is created earlier during VPlan
+ // construction.
+ Ext =
+ B.createNaryOp(VPInstruction::ExtractFromEnd,
+ {FOR->getBackedgeValue(),
+ Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(ExitBB->getContext(), 32), 2))},
+ {}, "vector.recur.extract.for.phi");
+ } else {
+ Ext = B.createNaryOp(
+ VPInstruction::ExtractFromEnd,
+ {V, Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(ExitBB->getContext(), 32), 1))});
+ }
+ Plan.addLiveOut(&ExitPhi, Ext);
}
}
@@ -8660,6 +8757,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// After here, VPBB should not be used.
VPBB = nullptr;
+ assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
+ !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
+ "entry block must be set to a VPRegionBlock having a non-empty entry "
+ "VPBasicBlock");
+ RecipeBuilder.fixHeaderPhis();
+
+ addLiveOutsForFirstOrderRecurrences(*Plan);
+
if (CM.requiresScalarEpilogue(Range)) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
@@ -8668,13 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
Legal->getInductionVars());
- assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
- !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
- "entry block must be set to a VPRegionBlock having a non-empty entry "
- "VPBasicBlock");
- RecipeBuilder.fixHeaderPhis();
-
- addLiveOutsForFirstOrderRecurrences(*Plan);
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8884,10 +8982,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (unsigned I = 0; I != Worklist.size(); ++I) {
VPSingleDefRecipe *Cur = Worklist[I];
for (VPUser *U : Cur->users()) {
- auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
- if (!UserRecipe) {
- assert(isa<VPLiveOut>(U) &&
- "U must either be a VPSingleDef or VPLiveOut");
+ auto *UserRecipe = cast<VPSingleDefRecipe>(U);
+ if (!UserRecipe->getParent()->getParent()) {
+ assert(cast<VPInstruction>(U) &&
+ cast<VPInstruction>(U)->getOpcode() ==
+ VPInstruction::ExtractFromEnd &&
+ "U must be an ExtractFromEnd VPInstruction");
continue;
}
Worklist.insert(UserRecipe);
@@ -9105,8 +9205,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
OrigExitingVPV->replaceUsesWithIf(
- FinalReductionResult,
- [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
+ FinalReductionResult, [](VPUser &User, unsigned) {
+ auto *R = dyn_cast<VPInstruction>(&User);
+ return R && R->getOpcode() == VPInstruction::ExtractFromEnd;
+ });
}
VPlanTransforms::clearReductionWrapFlags(*Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d..798d178e5a963 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -194,9 +194,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
VPValue *ExitValue = getOperand(0);
- auto Lane = vputils::isUniformAfterVectorization(ExitValue)
- ? VPLane::getFirstLane()
- : VPLane::getLastLaneForVF(State.VF);
VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
@@ -207,10 +204,7 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
? MiddleVPBB
: ExitingVPBB;
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
- // Set insertion point in PredBB in case an extract needs to be generated.
- // TODO: Model extracts explicitly.
- State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
- Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+ Value *V = State.get(ExitValue, VPIteration(0, 0));
if (Phi->getBasicBlockIndex(PredBB) != -1)
Phi->setIncomingValueForBlock(PredBB, V);
else
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e31..967939c4854c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -826,20 +826,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
RecurrencePhis.push_back(FOR);
- VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
- VPBuilder MiddleBuilder;
- // Set insert point so new recipes are inserted before terminator and
- // condition, if there is either the former or both.
- if (auto *Term =
- dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator())) {
- if (auto *Cmp = dyn_cast<VPInstruction>(Term->getOperand(0)))
- MiddleBuilder.setInsertPoint(Cmp);
- else
- MiddleBuilder.setInsertPoint(Term);
- } else
- MiddleBuilder.setInsertPoint(MiddleVPBB);
-
for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
@@ -872,86 +858,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
// Set the first operand of RecurSplice to FOR again, after replacing
// all users.
RecurSplice->setOperand(0, FOR);
-
- // This is the second phase of vectorizing first-order recurrences. An
- // overview of the transformation is described below. Suppose we have the
- // following loop with some use after the loop of the last a[i-1],
- //
- // for (int i = 0; i < n; ++i) {
- // t = a[i - 1];
- // b[i] = a[i] - t;
- // }
- // use t;
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s_init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // use = lcssa.phi [s1, scalar.body]
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // vector phi v1 for s1. We now complete the vectorization and produce the
- // shorthand vector IR shown below (for VF = 4, UF = 1).
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v3
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // s_penultimate = v2(2) = v3(3)
- // s_resume = v2(3)
- // br cond, scalar.ph, exit.block
- //
- // scalar.ph:
- // s_init' = phi [s_resume, middle.block], [s_init, otherwise]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init', scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
- //
- // After execution completes the vector loop, we extract the next value of
- // the recurrence (x) to use as the initial value in the scalar loop. This
- // is modeled by ExtractFromEnd.
- Type *IntTy = Plan.getCanonicalIV()->getScalarType();
-
- // Extract the penultimate value of the recurrence and update VPLiveOut
- // users of the recurrence splice. Note that the extract of the final value
- // used to resume in the scalar loop is created earlier during VPlan
- // construction.
- auto *Penultimate = cast<VPInstruction>(MiddleBuilder.createNaryOp(
- VPInstruction::ExtractFromEnd,
- {FOR->getBackedgeValue(),
- Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))},
- {}, "vector.recur.extract.for.phi"));
- RecurSplice->replaceUsesWithIf(
- Penultimate, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
}
return true;
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 16db6cf828af8..f14ffe854a3a6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -55,6 +55,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: middle.block:
; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir<true>
; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; IF-EVL-INLOOP-EMPTY:
@@ -64,7 +65,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: scalar.ph:
; IF-EVL-INLOOP-NEXT: No successors
; IF-EVL-INLOOP-EMPTY:
-; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; IF-EVL-INLOOP-NEXT: }
;
@@ -93,6 +94,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: middle.block:
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -103,7 +105,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: scalar.ph:
; NO-VP-OUTLOOP-NEXT: No successors
; NO-VP-OUTLOOP-EMPTY:
-; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; NO-VP-OUTLOOP-NEXT: }
;
@@ -132,6 +134,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: middle.block:
; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -142,7 +145,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: scalar.ph:
; NO-VP-INLOOP-NEXT: No successors
; NO-VP-INLOOP-EMPTY:
-; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; NO-VP-INLOOP-NEXT: }
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 06fbeafba31c0..9e49cf6b42c6b 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -220,6 +220,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]>
; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1>
+; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
; CHECK-NEXT: EMIT branch-on-cond ir<true>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
@@ -230,8 +231,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]>
; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
+; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]>
; CHECK-NEXT: }
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
index 5f5cd78dc2d30..fcc6b7376d408 100644
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -34,7 +34,7 @@ define i32 @one_direct_branch(ptr %src) {
; CHECK-NEXT: [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ]
; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
@@ -205,16 +205,14 @@ define i32 @optimizable_trunc_used_outside() {
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -226,7 +224,7 @@ define i32 @optimizable_trunc_used_outside() {
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NE...
[truncated]
|
| ; NO-VP-OUTLOOP-EMPTY: | ||
| ; NO-VP-OUTLOOP-NEXT: middle.block: | ||
| ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> | ||
| ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> |
There was a problem hiding this comment.
redundant extracts can be folded by VPlan-transform as follow-up
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Add a new VPIRInstruction recipe to wrap existing IR instructions not to be modified during execution, execept for PHIs. For PHIs, a single VPValue operand is allowed, and it is used to add a new incoming value for the single predecessor VPBB. Expect PHIs, VPIRInstructions cannot have any operands. Depends on llvm#100658.
ayalz
left a comment
There was a problem hiding this comment.
Good simplification! Raising some thoughts.
| auto MiddleVPBB = | ||
| cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()); | ||
| VPBuilder B(MiddleVPBB); | ||
| if (auto *Terminator = MiddleVPBB->getTerminator()) { | ||
| auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0)); | ||
| assert((!Condition || Condition->getParent() == MiddleVPBB) && | ||
| "Condition expected in MiddleVPBB"); | ||
| B.setInsertPoint(Condition ? Condition : Terminator); | ||
| } |
There was a problem hiding this comment.
Can B be set once, before entering the loop over ExitBB's phis?
There was a problem hiding this comment.
Yep, moved out, thanks!
| auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0)); | ||
| assert((!Condition || Condition->getParent() == MiddleVPBB) && | ||
| "Condition expected in MiddleVPBB"); | ||
| B.setInsertPoint(Condition ? Condition : Terminator); |
There was a problem hiding this comment.
Could B be set to MiddleVPBB->getFirstNonPhi()?
There was a problem hiding this comment.
Unfortunately this will cause a number of test changes, maybe best done as follow-up?
| VPValue *Ext; | ||
| if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>( |
There was a problem hiding this comment.
Deal first with the simpler non-FOR case, potentially early-exiting?
| {V, Plan.getOrAddLiveIn(ConstantInt::get( | ||
| IntegerType::get(ExitBB->getContext(), 32), 1))}); |
There was a problem hiding this comment.
getOrAdd this "1" once, before the loop?
There was a problem hiding this comment.
Moved the code now to addLiveOutsForFirstOrderRecurrences and the changes here should be gone; will adjust separately
| Plan.getOrAddLiveIn(ConstantInt::get( | ||
| IntegerType::get(ExitBB->getContext(), 32), 2))}, |
There was a problem hiding this comment.
getOrAdd this "2" once, before the loop?
| assert(cast<VPInstruction>(U) && | ||
| cast<VPInstruction>(U)->getOpcode() == | ||
| VPInstruction::ExtractFromEnd && |
There was a problem hiding this comment.
Use match?
The excessively long method deserves to be shortened rather than further extended.
There was a problem hiding this comment.
updated to use match, thanks!
There was a problem hiding this comment.
Will see how this can be split up logically separately. Should also be moved to VPlanTransforms.cpp as much as possible, but it still uses CM.blockNeedsPredicationForAnyReason(BB) so maybe not all can e moved yet
| FinalReductionResult, [](VPUser &User, unsigned) { | ||
| auto *R = dyn_cast<VPInstruction>(&User); | ||
| return R && R->getOpcode() == VPInstruction::ExtractFromEnd; |
| @@ -8452,7 +8452,104 @@ static void addUsersInExitBlock( | |||
| return P && Inductions.contains(P); | |||
There was a problem hiding this comment.
Should this also bail out for FOR's, to be handled explicitly elsewhere?
There was a problem hiding this comment.
FOR should now be handled below
| V->getDefiningRecipe())) { | ||
| // This is the second phase of vectorizing first-order recurrences. An | ||
| // overview of the transformation is described below. Suppose we have the | ||
| // following loop with some use after the loop of the last a[i-1], |
There was a problem hiding this comment.
This explanation is moved from adjustFixedOrderRecurrences() to here (addUsersInExitBlock()), yet deserves to still be in a method dedicated to (the 2nd phase of) FORs. How about moving it, along with setting the users of FOR in exit block, to addLiveOutsForFixedOrderRecurrences()?
The documentation of these methods may need updating, once settled.
There was a problem hiding this comment.
Moved toaddLiveOutsForFixedOrderRecurrences , thanks!
There's a bit of duplication at the moment to figure out if live-outs are needed or not. This should go away with #100735 when this should be handled automatically by the exit block not existing in VPlan in those cases
| {FOR->getBackedgeValue(), | ||
| Plan.getOrAddLiveIn(ConstantInt::get( | ||
| IntegerType::get(ExitBB->getContext(), 32), 2))}, | ||
| {}, "vector.recur.extract.for.phi"); |
There was a problem hiding this comment.
Independent note: the last element can be extracted from the spliced v3 instead of the penultimate element from the FOR's backedge value v2, clearly providing the last element as live-out in both (all) cases; then Ext could be created once - depending only on the VPValue from which the last element is to be extracted (and the recipe's name). But this spliced v3 is introduced later, by VPlanTransforms::adjustFixedOrderRecurrences().
| @@ -8470,22 +8480,29 @@ static void addUsersInExitBlock( | |||
| // live-outs. | |||
There was a problem hiding this comment.
Also comment that exit values of FORs are handled elsewhere. I.e., we only handle exit values of reductions here. Worth renaming the method more accurately.
There was a problem hiding this comment.
added a comment are and above, live outs added here may be reductions or non-induction/FOR values, any suggestions for a compact yet more descriptive name for the function?
There was a problem hiding this comment.
..., any suggestions for a compact yet more descriptive name for the function?
Ah, truncated inductions are also apparently handled here, although they should arguably be handled similar to non-truncated inductions. Live-out of live-in values should best be excluded from VPlan altogether.
If/when both are addressed, as follow-ups, addUsersInExitBlock() could be renamed fixExitValuesOfReductions(), as it not only adds users to lcssa phis of Exit block but also introduces extracts in middle block.
Similarly, addLiveOutsForFirstOrderRecurrences() could be renamed fixExitValuesOfFirstOrderRecurrences(), as it not only adds live outs but also introduces extracts in middle block.
For consistency, RecipeBuilder.fixHeaderPhis() could be renamed addUsersToHeaderPhis(), which is more accurate than a general fix which could also introduce new recipes.
WDYT?
| if ((isa<VPWidenIntOrFpInductionRecipe>(V) && | ||
| !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || | ||
| isa<VPWidenPointerInductionRecipe>(V) || | ||
| isa<VPWidenPointerInductionRecipe, VPFirstOrderRecurrencePHIRecipe>( |
There was a problem hiding this comment.
Easier to check directly for reductions than by elimination of neither inductions nor FORs? Can be done as a follow-up.
There was a problem hiding this comment.
It could also be a non-induction/FOR live out I think unfortunately.
There was a problem hiding this comment.
Wonder if it would be better to maintain an ExitingValuesToFix list, similar to [Header]PhisToFix, where induction (w/o truncates) or FORs that are dealt with elsewhere take themselves off the list, followed by the general extract-last treatment applied here to all remaining ones, rather than trying to identify here all cases treated elsewhere only to skip them.
There was a problem hiding this comment.
Added collecting of exitingvaluestofix here, unfortunately we still need to skip inductions explicitly there as they are only fixed later.
|
|
||
| VPValue *Ext = B.createNaryOp( | ||
| VPInstruction::ExtractFromEnd, | ||
| {V, Plan.getOrAddLiveIn(ConstantInt::get( |
There was a problem hiding this comment.
Independently: may be good to provide VPlan::getOrAddConstant(), with type derived somehows.
| /// Feed a resume value for every FOR from the vector loop to the scalar loop, | ||
| /// if middle block branches to scalar preheader, by introducing ExtractFromEnd | ||
| /// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the | ||
| /// latter and corresponds to the scalar header. |
There was a problem hiding this comment.
Comment deserves augmenting to also address uses in exit block.
There was a problem hiding this comment.
Updated comment, thanks!
| // | ||
| // middle.block: | ||
| // s_penultimate = v2(2) = v3(3) | ||
| // s_resume = v2(3) |
There was a problem hiding this comment.
Independently: would be more consistent to use the same names that are emitted, i.e., s_resume and vector.recur.extract should read the same, as should s_init' and scalar.recur.init, s_penultimate and vector.recur.extract.for.phi (the latter is ambiguous - both extracts are for phi's).
There was a problem hiding this comment.
Will do separately, thanks!
| @@ -8517,20 +8534,121 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { | |||
| } | |||
There was a problem hiding this comment.
| } | |
| // TODO: set MiddleBuilder to MiddleVPBB->getFirstNonPhi(). | |
| } |
| ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 | ||
| ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] | ||
| ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] |
There was a problem hiding this comment.
Induction is now widened? (Sorry if addressed already, can't seem to find if/where/how)
There was a problem hiding this comment.
At the moment, ExtractFromEnd isn't marked as usesScalars, which is causing the change here
There was a problem hiding this comment.
ok, worth indicating that this patch is mostly NFC, but may alter the generated code in seldom cases.
Worth leaving behind a TODO?
| ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 | ||
| ; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] | ||
| ; CHECK: middle.block: | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 |
There was a problem hiding this comment.
Independent of this patch: a live-out induction exiting value is computed by extracting its last iteration value, as in reduction, rather than being pre-computed?
There was a problem hiding this comment.
Currently we only pre-compute the value of the IV and the increment, but here the truncate is used outside, hence the extract.
| ; CHECK-NEXT: Successor(s): middle.block | ||
| ; CHECK-EMPTY: | ||
| ; CHECK-NEXT: middle.block: | ||
| ; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = extract-from-end ir<%add>, ir<1> |
There was a problem hiding this comment.
(Extracting live-out induction, as noted above)
There was a problem hiding this comment.
It could be pre-computed, but at the moment we use extracts, pre-computation is only done for the IV or its increment directly
| "VPBasicBlock"); | ||
| RecipeBuilder.fixHeaderPhis(); | ||
|
|
||
| addLiveOutsForFirstOrderRecurrences(*Plan); |
There was a problem hiding this comment.
Hmm, recurrences of orders greater than first are supported, for some time now, involving series of pairwise dependent header phi's; but their live-outs can correspond only to first order, i.e., penultimate value?
Now that the branches to the scalar epilogue are modeled in VPlan directly, check the VPlan to see if a scalar epilogue is required. Preparation for #100658.
ayalz
left a comment
There was a problem hiding this comment.
This LGTM, thanks!
Adding several thoughts and suggestions.
| @@ -8470,22 +8480,29 @@ static void addUsersInExitBlock( | |||
| // live-outs. | |||
There was a problem hiding this comment.
..., any suggestions for a compact yet more descriptive name for the function?
Ah, truncated inductions are also apparently handled here, although they should arguably be handled similar to non-truncated inductions. Live-out of live-in values should best be excluded from VPlan altogether.
If/when both are addressed, as follow-ups, addUsersInExitBlock() could be renamed fixExitValuesOfReductions(), as it not only adds users to lcssa phis of Exit block but also introduces extracts in middle block.
Similarly, addLiveOutsForFirstOrderRecurrences() could be renamed fixExitValuesOfFirstOrderRecurrences(), as it not only adds live outs but also introduces extracts in middle block.
For consistency, RecipeBuilder.fixHeaderPhis() could be renamed addUsersToHeaderPhis(), which is more accurate than a general fix which could also introduce new recipes.
WDYT?
| if ((isa<VPWidenIntOrFpInductionRecipe>(V) && | ||
| !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || | ||
| isa<VPWidenPointerInductionRecipe>(V) || | ||
| isa<VPWidenPointerInductionRecipe, VPFirstOrderRecurrencePHIRecipe>( |
There was a problem hiding this comment.
Wonder if it would be better to maintain an ExitingValuesToFix list, similar to [Header]PhisToFix, where induction (w/o truncates) or FORs that are dealt with elsewhere take themselves off the list, followed by the general extract-last treatment applied here to all remaining ones, rather than trying to identify here all cases treated elsewhere only to skip them.
| /// ExtractFromEnd and ResumePhi recipes in each, respectively, and a | ||
| /// VPLiveOut which uses the latter and corresponds to the scalar header. | ||
| /// 2. Feed the penultimate value of recurrences to their LCSSA phi users in | ||
| /// the original exit block using a VPLiveOut. |
There was a problem hiding this comment.
| /// the original exit block using a VPLiveOut. | |
| /// the original exit block using a VPLiveOut. |
| // vector.body | ||
| // i = phi [0, vector.ph], [i+4, vector.body] | ||
| // v1 = phi [v_init, vector.ph], [v2, vector.body] | ||
| // v2 = a[i] |
There was a problem hiding this comment.
| // v2 = a[i] | |
| // v2 = a[i, i+1, i+2, i+3] |
the first phase vectorizes the load from a[i].
| // b[i] = v2 - v1 | ||
| // b[i, i+1, i+2, i+3] = v2 - v3 |
There was a problem hiding this comment.
| // b[i] = v2 - v1 | |
| // b[i, i+1, i+2, i+3] = v2 - v3 | |
| // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) | |
| // b[i, i+1, i+2, i+3] = v2 - v1 |
if we want to postpone v3/v1' whose splice is introduced later.
There was a problem hiding this comment.
Setting b[i] = v2 - v1 stores a vector difference into a single scalar element, only to be overwritten.
Setting b[i, i+1, i+2, i+3] = v2 - v3 uses an undefined v3. Both should be corrected:
The IR corresponding to the current phase should have b[i,i+1,i+2,i+3] = v2 - v1 (at-least), which results from having vectorized the original b[i] = s2 - s1, carelessly.
The third phase will correct this IR by introducing the splice - that explanation can be left for the splice introducing pass (where it is now), rather than provided here (as a comment), if preferred.
Sounds right?
There was a problem hiding this comment.
Sounds good, added here with the comment thanks!
| // br cond, vector.body, middle.block | ||
| // | ||
| // middle.block: | ||
| // s.penultimate = v2(2) |
There was a problem hiding this comment.
| // s.penultimate = v2(2) | |
| // vector.recur.extract.for.phi = v2(2) |
? Admittedly worth renaming.
| // br cond, scalar.body, exit.block | ||
| // | ||
| // exit.block: | ||
| // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] |
There was a problem hiding this comment.
| // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] | |
| // lo = lcssa.phi [s1, scalar.body], [vector.recur.extract.for.phi, middle.block] |
| ; NO-VP-OUTLOOP-EMPTY: | ||
| ; NO-VP-OUTLOOP-NEXT: middle.block: | ||
| ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> | ||
| ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> |
| ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] | ||
| ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ] | ||
| ; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ] | ||
| ; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY:%.*]] ] |
There was a problem hiding this comment.
nit: change where ENTRY is defined and used, intentionally? Related?
There was a problem hiding this comment.
Removed the change, thanks
| ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 | ||
| ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] | ||
| ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] |
There was a problem hiding this comment.
ok, worth indicating that this patch is mostly NFC, but may alter the generated code in seldom cases.
Worth leaving behind a TODO?
| // b[i] = v2 - v1 | ||
| // b[i, i+1, i+2, i+3] = v2 - v3 |
There was a problem hiding this comment.
Setting b[i] = v2 - v1 stores a vector difference into a single scalar element, only to be overwritten.
Setting b[i, i+1, i+2, i+3] = v2 - v3 uses an undefined v3. Both should be corrected:
The IR corresponding to the current phase should have b[i,i+1,i+2,i+3] = v2 - v1 (at-least), which results from having vectorized the original b[i] = s2 - s1, carelessly.
The third phase will correct this IR by introducing the splice - that explanation can be left for the splice introducing pass (where it is now), rather than provided here (as a comment), if preferred.
Sounds right?
| // and there is nothing to fix from vector loop; phis should have incoming | ||
| // from scalar loop only. | ||
| if (ExitingValuesToFix.empty()) | ||
| continue; |
There was a problem hiding this comment.
It's a bit odd that the existence of an edge from middle block to exit block can be so easily detected, and exitBB set, compared to an edge from middle block to scalar preheader done above: "// Start by finding out if middle block branches to scalar preheader ....".
There was a problem hiding this comment.
Should be better now, thanks!
| // If there are multiple successors of the middle block, their order is | ||
| // fixed; the first successor must be the original exit block. | ||
| BasicBlock *ExitBB = | ||
| cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); |
There was a problem hiding this comment.
Better to set ExitBB alongside ScalarPH, before the loop? Erasing Values from ExitingValuesToFix should not cause subsequent iterations to early continue, right?
Perhaps something like
BasicBlock *ExitBB = nullptr;
VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar preheader.
ExitBB = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock();
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else if (ExitingValuesToFix.empty()) {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
} else {
ExitBB = cast<VPIRBasicBlock>(MiddleVPBB->getSingleSuccessor())->getIRBasicBlock();
}
ok to early exit at the outset if ScalarPHVPBB is null, w/o updating exit block?
There was a problem hiding this comment.
Updated and added assert that we don't skip any exiting values when exiting early., thanks!
| if (MiddleVPBB->getNumSuccessors() != 2) | ||
| continue; | ||
| BasicBlock *ExitBB = | ||
| cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); |
There was a problem hiding this comment.
And so the second must be the scalar preheader?
| VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, | ||
| "vector.recur.extract.for.phi"); | ||
| Plan.addLiveOut(cast<PHINode>(UI), Ext); | ||
| ExitingValuesToFix.erase(cast<PHINode>(UI)); |
There was a problem hiding this comment.
Note that it's fine for a FOR to have no users in ExitBB. Can it have duplicates/more than one? They can reuse the same extract (and lcssa phi).
There was a problem hiding this comment.
There can be multiple duplicated exit phis, but those should be handled correctly (and cleaned up earlier). Potentially can also handle this de-duplication here as follow up
| // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the | ||
| // original exit block. | ||
| static void addUsersInExitBlock( | ||
| // Collect (ExitPhi, ExitingValue) pairs phis in the original exit block that |
There was a problem hiding this comment.
| // Collect (ExitPhi, ExitingValue) pairs phis in the original exit block that | |
| // Collect (ExitPhi, ExitingValue) pairs for phis in the original exit block that |
| @@ -8538,9 +8540,8 @@ static void addUsersInExitBlock( | |||
| // and there is nothing to fix from vector loop; phis should have incoming | |||
| // from scalar loop only. | |||
| if (MiddleVPBB->getNumSuccessors() != 2) | |||
There was a problem hiding this comment.
So non-empty exitingValuesToFix implies two successors; having exit block as the only successor is not considered.
Looking forward following our roadmap, initial VPlan should probably always start with both successors, and optimized later to have a single successor if scalar epilog is required or redundant.
Does empty exitingValuesToFix imply no edge from middle block to exit block, i.e., can such an edge exist w/o any lcssa phi's to fix?
| } else if (ExitingValuesToFix.empty()) { | ||
| ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor()); | ||
| } else { | ||
| ExitBB = cast<VPIRBasicBlock>(MiddleVPBB->getSingleSuccessor()) | ||
| ->getIRBasicBlock(); |
There was a problem hiding this comment.
| } else if (ExitingValuesToFix.empty()) { | |
| ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor()); | |
| } else { | |
| ExitBB = cast<VPIRBasicBlock>(MiddleVPBB->getSingleSuccessor()) | |
| ->getIRBasicBlock(); | |
| } else { | |
| // Single successor - only scalar preheader case supported. | |
| assert(exitingValuesToFix.empty() && "Exiting values to fix w/o edge to exit block"); | |
| ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor()); | |
| } |
| if (!ScalarPHVPBB) { | ||
| assert(ExitingValuesToFix.empty() && | ||
| "missed inserting extracts for exiting values"); | ||
| return; | ||
| } |
| // i = phi [0, vector.ph], [i+4, vector.body] | ||
| // v1 = phi [v_init, vector.ph], [v2, vector.body] | ||
| // v2 = a[i, i+1, i+2, i+3] | ||
| // b[i] = v2 - v1 |
There was a problem hiding this comment.
| // b[i] = v2 - v1 |
this line still needs to drop.
| if (ExitingValuesToFix.empty()) | ||
| continue; |
There was a problem hiding this comment.
We can instead check
if (!ExitBB)
continue;
invariantly, but prefer to possibly bail out earlier - as soon as all phi's have been fixed?
Anyhow, the comment talks about no edge to exit block, i.e., !ExitBB, rather than exhausting ExitingsValuesToFix.
| ; NO-VP-OUTLOOP-EMPTY: | ||
| ; NO-VP-OUTLOOP-NEXT: middle.block: | ||
| ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> | ||
| ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> |
Introduce explicit ExtractFromEnd recipes to extract the final values for live-outs instead of implicitly extracting in VPLiveOut::fixPhi.
This is a follow-up to the recent changes of modeling extracts for recurrences and consolidates live-out extract creation for fixed-order recurrences at a single place: addLiveOutsForFirstOrderRecurrences.
It is also in preparation of replacing VPLiveOut with VPIRInstructions wrapping the original scalar phis.