[LV] Use VPReductionRecipe for partial reductions#147513
Conversation
|
@llvm/pr-subscribers-vectorizers Author: Sam Tebbs (SamTebbs33) ChangesPartial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Depends on #144281 . Patch is 170.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147513.diff 11 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0adff8d957e98..570fcf2f71bde 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7045,7 +7045,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
// The VPlan-based cost model is more accurate for partial reduction and
// comparing against the legacy cost isn't desirable.
- if (isa<VPPartialReductionRecipe>(&R))
+ if (auto *VPR = dyn_cast<VPReductionRecipe>(&R);
+ VPR && VPR->isPartialReduction())
return true;
/// If a VPlan transform folded a recipe to one producing a single-scalar,
@@ -8307,11 +8308,14 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
// If the PHI is used by a partial reduction, set the scale factor.
- unsigned ScaleFactor =
- getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
- PhiRecipe = new VPReductionPHIRecipe(
- Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
- CM.useOrderedReductions(RdxDesc), ScaleFactor);
+ bool UseInLoopReduction = CM.isInLoopReduction(Phi);
+ bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
+ auto ScaleFactor =
+ (UseOrderedReductions || UseInLoopReduction)
+ ? 0
+ : getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
+ PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
+ UseOrderedReductions, ScaleFactor);
} else {
// TODO: Currently fixed-order recurrences are modeled as chains of
// first-order recurrences. If there are no users of the intermediate
@@ -8375,7 +8379,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
VPValue *Accumulator = Operands[1];
VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
- isa<VPPartialReductionRecipe>(BinOpRecipe))
+ (isa<VPReductionRecipe>(BinOpRecipe) &&
+ cast<VPReductionRecipe>(BinOpRecipe)->isPartialReduction()))
std::swap(BinOp, Accumulator);
unsigned ReductionOpcode = Reduction->getOpcode();
@@ -8396,12 +8401,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
Cond = getBlockInMask(Builder.getInsertBlock());
- VPValue *Zero =
- Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
- BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
- return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
- ScaleFactor, Reduction);
+
+ return new VPReductionRecipe(RecurKind::Add, FastMathFlags(), Reduction,
+ Accumulator, BinOp, Cond, false, ScaleFactor);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9189,9 +9192,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
? RdxDesc.getFastMathFlags()
: FastMathFlags();
+ bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
+ unsigned VFScaleFactor = !UseOrderedReductions;
auto *RedRecipe = new VPReductionRecipe(
Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
- CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
+ UseOrderedReductions, VFScaleFactor, CurrentLinkI->getDebugLoc());
// Append the recipe to the end of the VPBasicBlock because we need to
// ensure that it comes after all of it's inputs, including CondOp.
// Delete CurrentLink as it will be invalid if its operand is replaced
@@ -9225,8 +9230,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// Don't output selects for partial reductions because they have an output
// with fewer lanes than the VF. So the operands of the select would have
// different numbers of lanes. Partial reductions mask the input instead.
+ auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
- !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
+ (!RR || !RR->isPartialReduction())) {
VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
std::optional<FastMathFlags> FMFs =
PhiTy->isFloatingPointTy()
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 30f3566332d79..a3cc5f335e82e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -552,7 +552,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntOrFpInductionSC:
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
- case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
@@ -2194,26 +2193,29 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Descriptor for the reduction.
const RecurrenceDescriptor &RdxDesc;
- /// The phi is part of an in-loop reduction.
- bool IsInLoop;
-
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
bool IsOrdered;
- /// When expanding the reduction PHI, the plan's VF element count is divided
- /// by this factor to form the reduction phi's VF.
- unsigned VFScaleFactor = 1;
+ /// The scaling factor, relative to the VF, that this recipe's output is
+ /// divided by.
+ /// For outer-loop reductions this is equal to 1.
+ /// For in-loop reductions this is equal to 0, to specify that this is equal
+ /// to the VF (which may not be known yet). For partial-reductions this is
+ /// equal to another scalar value.
+ unsigned VFScaleFactor;
public:
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
- VPValue &Start, bool IsInLoop = false,
- bool IsOrdered = false, unsigned VFScaleFactor = 1)
+ VPValue &Start, bool IsOrdered = false,
+ unsigned VFScaleFactor = 1)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
- RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
- VFScaleFactor(VFScaleFactor) {
- assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
+ RdxDesc(RdxDesc), IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
+ assert((!IsOrdered || isInLoop()) &&
+ "IsOrdered requires the reduction to be in-loop");
+ assert(((!isInLoop() && !IsOrdered) || isInLoop()) &&
+ "Invalid VFScaleFactor");
}
~VPReductionPHIRecipe() override = default;
@@ -2221,7 +2223,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
VPReductionPHIRecipe *clone() override {
auto *R = new VPReductionPHIRecipe(
dyn_cast_or_null<PHINode>(getUnderlyingValue()), RdxDesc,
- *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
+ *getOperand(0), IsOrdered, VFScaleFactor);
R->addOperand(getBackedgeValue());
return R;
}
@@ -2247,8 +2249,11 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Returns true, if the phi is part of an ordered reduction.
bool isOrdered() const { return IsOrdered; }
- /// Returns true, if the phi is part of an in-loop reduction.
- bool isInLoop() const { return IsInLoop; }
+ /// Returns true if the phi is part of an in-loop reduction.
+ bool isInLoop() const { return VFScaleFactor == 0; }
+
+ /// Returns true if the reduction outputs a vector with a scaled down VF.
+ bool isPartialReduction() const { return VFScaleFactor > 1; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -2421,23 +2426,32 @@ class VPInterleaveRecipe : public VPRecipeBase {
Instruction *getInsertPos() const { return IG->getInsertPos(); }
};
-/// A recipe to represent inloop reduction operations, performing a reduction on
-/// a vector operand into a scalar value, and adding the result to a chain.
-/// The Operands are {ChainOp, VecOp, [Condition]}.
+/// A recipe to represent inloop, ordered or partial reduction operations. It
+/// performs a reduction on a vector operand into a scalar (vector in the case
+/// of a partial reduction) value, and adds the result to a chain. The Operands
+/// are {ChainOp, VecOp, [Condition]}.
class VPReductionRecipe : public VPRecipeWithIRFlags {
/// The recurrence kind for the reduction in question.
RecurKind RdxKind;
bool IsOrdered;
/// Whether the reduction is conditional.
bool IsConditional = false;
+ /// The scaling factor, relative to the VF, that this recipe's output is
+ /// divided by.
+ /// For outer-loop reductions this is equal to 1.
+ /// For in-loop reductions this is equal to 0, to specify that this is equal
+ /// to the VF (which may not be known yet).
+ /// For partial-reductions this is equal to another scalar value.
+ unsigned VFScaleFactor;
protected:
VPReductionRecipe(const unsigned char SC, RecurKind RdxKind,
FastMathFlags FMFs, Instruction *I,
ArrayRef<VPValue *> Operands, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL)
+ bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL)
: VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind),
- IsOrdered(IsOrdered) {
+ IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
+ assert((!IsOrdered || VFScaleFactor == 0) && "Invalid scale factor");
if (CondOp) {
IsConditional = true;
addOperand(CondOp);
@@ -2448,30 +2462,29 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = {})
+ bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
- IsOrdered, DL) {}
+ IsOrdered, VFScaleFactor, DL) {}
VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = {})
+ bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
- IsOrdered, DL) {}
+ IsOrdered, VFScaleFactor, DL) {}
~VPReductionRecipe() override = default;
VPReductionRecipe *clone() override {
- return new VPReductionRecipe(RdxKind, getFastMathFlags(),
- getUnderlyingInstr(), getChainOp(), getVecOp(),
- getCondOp(), IsOrdered, getDebugLoc());
+ return new VPReductionRecipe(
+ RdxKind, getFastMathFlags(), getUnderlyingInstr(), getChainOp(),
+ getVecOp(), getCondOp(), IsOrdered, VFScaleFactor, getDebugLoc());
}
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
}
static inline bool classof(const VPUser *U) {
@@ -2498,6 +2511,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
bool isOrdered() const { return IsOrdered; };
/// Return true if the in-loop reduction is conditional.
bool isConditional() const { return IsConditional; };
+ /// Returns true if the reduction outputs a vector with a scaled down VF.
+ bool isPartialReduction() const { return VFScaleFactor > 1; }
/// The VPValue of the scalar Chain being accumulated.
VPValue *getChainOp() const { return getOperand(0); }
/// The VPValue of the vector value to be reduced.
@@ -2506,68 +2521,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
VPValue *getCondOp() const {
return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
}
-};
-
-/// A recipe for forming partial reductions. In the loop, an accumulator and
-/// vector operand are added together and passed to the next iteration as the
-/// next accumulator. After the loop body, the accumulator is reduced to a
-/// scalar value.
-class VPPartialReductionRecipe : public VPReductionRecipe {
- unsigned Opcode;
-
- /// The divisor by which the VF of this recipe's output should be divided
- /// during execution.
- unsigned VFScaleFactor;
-
-public:
- VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
- VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor)
- : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond,
- VFScaleFactor, ReductionInst) {}
- VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
- VPValue *Cond, unsigned ScaleFactor,
- Instruction *ReductionInst = nullptr)
- : VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add,
- FastMathFlags(), ReductionInst,
- ArrayRef<VPValue *>({Op0, Op1}), Cond, false, {}),
- Opcode(Opcode), VFScaleFactor(ScaleFactor) {
- [[maybe_unused]] auto *AccumulatorRecipe =
- getChainOp()->getDefiningRecipe();
- // When cloning as part of a VPExpressionRecipe, the chain op could have
- // been removed from the plan and so doesn't have a defining recipe.
- assert((!AccumulatorRecipe ||
- isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
- isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
- "Unexpected operand order for partial reduction recipe");
- }
- ~VPPartialReductionRecipe() override = default;
-
- VPPartialReductionRecipe *clone() override {
- return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
- getCondOp(), VFScaleFactor,
- getUnderlyingInstr());
- }
-
- VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
-
- /// Generate the reduction in the loop.
- void execute(VPTransformState &State) override;
-
- /// Return the cost of this VPPartialReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
- /// Get the binary op's opcode.
- unsigned getOpcode() const { return Opcode; }
-
/// Get the factor that the VF of this recipe's output should be scaled by.
unsigned getVFScaleFactor() const { return VFScaleFactor; }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
};
/// A recipe to represent inloop reduction operations with vector-predication
@@ -2583,7 +2538,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
R.getFastMathFlags(),
cast_or_null<Instruction>(R.getUnderlyingValue()),
ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
- R.isOrdered(), DL) {}
+ R.isOrdered(), 0, DL) {}
~VPReductionEVLRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 92db9674ef42b..6aa3750897309 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,10 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
- VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe>(
+ [this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
@@ -396,7 +396,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
static unsigned getVFScaleFactor(VPRecipeBase *R) {
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
return RR->getVFScaleFactor();
- if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+ if (auto *RR = dyn_cast<VPReductionRecipe>(R))
return RR->getVFScaleFactor();
assert(
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
@@ -566,8 +566,9 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
} else {
// The output from scaled phis and scaled reductions actually has
// fewer lanes than the VF.
- unsigned ScaleFactor = getVFScaleFactor(R);
- ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
+ ElementCount VF = VFs[J];
+ if (unsigned ScaleFactor = getVFScaleFactor(R))
+ VF = VF.divideCoefficientBy(ScaleFactor);
LLVM_DEBUG(if (VF != VFs[J]) {
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
<< " for " << *R << "\n";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b485fa3d86e40..9dc3fb1b97447 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -164,7 +164,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
case VPBlendSC:
case VPReductionEVLSC:
- case VPPartialReductionSC:
case VPReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
@@ -294,104 +293,6 @@ bool VPRecipeBase::isScalarCast() const {
return VPI && Instruction::isCast(VPI->getOpcode());
}
-InstructionCost
-VPPartialReductionRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- std::optional<unsigned> Opcode;
- VPValue *Op = getOperand(0);
- VPRecipeBase *OpR = Op->getDefiningRecipe();
-
- // If the partial reduction is predicated, a select will be operand 0
- using namespace llvm::VPlanPatternMatch;
- if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
- OpR = Op->getDefiningRecipe();
- }
-
- Type *InputTypeA = nullptr, *InputTypeB = nullptr;
- TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
- ExtBType = TTI::PR_None;
-
- auto GetExtendKind = [](VPRecipeBase *R) {
- if (!R)
- return TTI::PR_None;
- auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
- if (!WidenCastR)
- return TTI::PR_None;
- if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
- return TTI::PR_ZeroExtend;
- if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
- return TTI::PR_SignExtend;
- return TTI::PR_None;
- };
-
- // Pick out opcode, type/ext information and use sub side effects from a widen
- // recipe....
[truncated]
|
|
@llvm/pr-subscribers-llvm-transforms Author: Sam Tebbs (SamTebbs33) ChangesPartial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Depends on #144281 . Patch is 170.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147513.diff 11 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0adff8d957e98..570fcf2f71bde 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7045,7 +7045,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
// The VPlan-based cost model is more accurate for partial reduction and
// comparing against the legacy cost isn't desirable.
- if (isa<VPPartialReductionRecipe>(&R))
+ if (auto *VPR = dyn_cast<VPReductionRecipe>(&R);
+ VPR && VPR->isPartialReduction())
return true;
/// If a VPlan transform folded a recipe to one producing a single-scalar,
@@ -8307,11 +8308,14 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
// If the PHI is used by a partial reduction, set the scale factor.
- unsigned ScaleFactor =
- getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
- PhiRecipe = new VPReductionPHIRecipe(
- Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
- CM.useOrderedReductions(RdxDesc), ScaleFactor);
+ bool UseInLoopReduction = CM.isInLoopReduction(Phi);
+ bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
+ auto ScaleFactor =
+ (UseOrderedReductions || UseInLoopReduction)
+ ? 0
+ : getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
+ PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
+ UseOrderedReductions, ScaleFactor);
} else {
// TODO: Currently fixed-order recurrences are modeled as chains of
// first-order recurrences. If there are no users of the intermediate
@@ -8375,7 +8379,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
VPValue *Accumulator = Operands[1];
VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
- isa<VPPartialReductionRecipe>(BinOpRecipe))
+ (isa<VPReductionRecipe>(BinOpRecipe) &&
+ cast<VPReductionRecipe>(BinOpRecipe)->isPartialReduction()))
std::swap(BinOp, Accumulator);
unsigned ReductionOpcode = Reduction->getOpcode();
@@ -8396,12 +8401,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
Cond = getBlockInMask(Builder.getInsertBlock());
- VPValue *Zero =
- Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
- BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
- return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
- ScaleFactor, Reduction);
+
+ return new VPReductionRecipe(RecurKind::Add, FastMathFlags(), Reduction,
+ Accumulator, BinOp, Cond, false, ScaleFactor);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9189,9 +9192,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
? RdxDesc.getFastMathFlags()
: FastMathFlags();
+ bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
+ unsigned VFScaleFactor = !UseOrderedReductions;
auto *RedRecipe = new VPReductionRecipe(
Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
- CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
+ UseOrderedReductions, VFScaleFactor, CurrentLinkI->getDebugLoc());
// Append the recipe to the end of the VPBasicBlock because we need to
// ensure that it comes after all of it's inputs, including CondOp.
// Delete CurrentLink as it will be invalid if its operand is replaced
@@ -9225,8 +9230,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// Don't output selects for partial reductions because they have an output
// with fewer lanes than the VF. So the operands of the select would have
// different numbers of lanes. Partial reductions mask the input instead.
+ auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
- !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
+ (!RR || !RR->isPartialReduction())) {
VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
std::optional<FastMathFlags> FMFs =
PhiTy->isFloatingPointTy()
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 30f3566332d79..a3cc5f335e82e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -552,7 +552,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntOrFpInductionSC:
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
- case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
@@ -2194,26 +2193,29 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Descriptor for the reduction.
const RecurrenceDescriptor &RdxDesc;
- /// The phi is part of an in-loop reduction.
- bool IsInLoop;
-
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
bool IsOrdered;
- /// When expanding the reduction PHI, the plan's VF element count is divided
- /// by this factor to form the reduction phi's VF.
- unsigned VFScaleFactor = 1;
+ /// The scaling factor, relative to the VF, that this recipe's output is
+ /// divided by.
+ /// For outer-loop reductions this is equal to 1.
+ /// For in-loop reductions this is equal to 0, to specify that this is equal
+ /// to the VF (which may not be known yet). For partial-reductions this is
+ /// equal to another scalar value.
+ unsigned VFScaleFactor;
public:
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
- VPValue &Start, bool IsInLoop = false,
- bool IsOrdered = false, unsigned VFScaleFactor = 1)
+ VPValue &Start, bool IsOrdered = false,
+ unsigned VFScaleFactor = 1)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
- RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
- VFScaleFactor(VFScaleFactor) {
- assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
+ RdxDesc(RdxDesc), IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
+ assert((!IsOrdered || isInLoop()) &&
+ "IsOrdered requires the reduction to be in-loop");
+ assert(((!isInLoop() && !IsOrdered) || isInLoop()) &&
+ "Invalid VFScaleFactor");
}
~VPReductionPHIRecipe() override = default;
@@ -2221,7 +2223,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
VPReductionPHIRecipe *clone() override {
auto *R = new VPReductionPHIRecipe(
dyn_cast_or_null<PHINode>(getUnderlyingValue()), RdxDesc,
- *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
+ *getOperand(0), IsOrdered, VFScaleFactor);
R->addOperand(getBackedgeValue());
return R;
}
@@ -2247,8 +2249,11 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Returns true, if the phi is part of an ordered reduction.
bool isOrdered() const { return IsOrdered; }
- /// Returns true, if the phi is part of an in-loop reduction.
- bool isInLoop() const { return IsInLoop; }
+ /// Returns true if the phi is part of an in-loop reduction.
+ bool isInLoop() const { return VFScaleFactor == 0; }
+
+ /// Returns true if the reduction outputs a vector with a scaled down VF.
+ bool isPartialReduction() const { return VFScaleFactor > 1; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -2421,23 +2426,32 @@ class VPInterleaveRecipe : public VPRecipeBase {
Instruction *getInsertPos() const { return IG->getInsertPos(); }
};
-/// A recipe to represent inloop reduction operations, performing a reduction on
-/// a vector operand into a scalar value, and adding the result to a chain.
-/// The Operands are {ChainOp, VecOp, [Condition]}.
+/// A recipe to represent inloop, ordered or partial reduction operations. It
+/// performs a reduction on a vector operand into a scalar (vector in the case
+/// of a partial reduction) value, and adds the result to a chain. The Operands
+/// are {ChainOp, VecOp, [Condition]}.
class VPReductionRecipe : public VPRecipeWithIRFlags {
/// The recurrence kind for the reduction in question.
RecurKind RdxKind;
bool IsOrdered;
/// Whether the reduction is conditional.
bool IsConditional = false;
+ /// The scaling factor, relative to the VF, that this recipe's output is
+ /// divided by.
+ /// For outer-loop reductions this is equal to 1.
+ /// For in-loop reductions this is equal to 0, to specify that this is equal
+ /// to the VF (which may not be known yet).
+ /// For partial-reductions this is equal to another scalar value.
+ unsigned VFScaleFactor;
protected:
VPReductionRecipe(const unsigned char SC, RecurKind RdxKind,
FastMathFlags FMFs, Instruction *I,
ArrayRef<VPValue *> Operands, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL)
+ bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL)
: VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind),
- IsOrdered(IsOrdered) {
+ IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
+ assert((!IsOrdered || VFScaleFactor == 0) && "Invalid scale factor");
if (CondOp) {
IsConditional = true;
addOperand(CondOp);
@@ -2448,30 +2462,29 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = {})
+ bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
- IsOrdered, DL) {}
+ IsOrdered, VFScaleFactor, DL) {}
VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL = {})
+ bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
- IsOrdered, DL) {}
+ IsOrdered, VFScaleFactor, DL) {}
~VPReductionRecipe() override = default;
VPReductionRecipe *clone() override {
- return new VPReductionRecipe(RdxKind, getFastMathFlags(),
- getUnderlyingInstr(), getChainOp(), getVecOp(),
- getCondOp(), IsOrdered, getDebugLoc());
+ return new VPReductionRecipe(
+ RdxKind, getFastMathFlags(), getUnderlyingInstr(), getChainOp(),
+ getVecOp(), getCondOp(), IsOrdered, VFScaleFactor, getDebugLoc());
}
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
}
static inline bool classof(const VPUser *U) {
@@ -2498,6 +2511,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
bool isOrdered() const { return IsOrdered; };
/// Return true if the in-loop reduction is conditional.
bool isConditional() const { return IsConditional; };
+ /// Returns true if the reduction outputs a vector with a scaled down VF.
+ bool isPartialReduction() const { return VFScaleFactor > 1; }
/// The VPValue of the scalar Chain being accumulated.
VPValue *getChainOp() const { return getOperand(0); }
/// The VPValue of the vector value to be reduced.
@@ -2506,68 +2521,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
VPValue *getCondOp() const {
return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
}
-};
-
-/// A recipe for forming partial reductions. In the loop, an accumulator and
-/// vector operand are added together and passed to the next iteration as the
-/// next accumulator. After the loop body, the accumulator is reduced to a
-/// scalar value.
-class VPPartialReductionRecipe : public VPReductionRecipe {
- unsigned Opcode;
-
- /// The divisor by which the VF of this recipe's output should be divided
- /// during execution.
- unsigned VFScaleFactor;
-
-public:
- VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
- VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor)
- : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond,
- VFScaleFactor, ReductionInst) {}
- VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
- VPValue *Cond, unsigned ScaleFactor,
- Instruction *ReductionInst = nullptr)
- : VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add,
- FastMathFlags(), ReductionInst,
- ArrayRef<VPValue *>({Op0, Op1}), Cond, false, {}),
- Opcode(Opcode), VFScaleFactor(ScaleFactor) {
- [[maybe_unused]] auto *AccumulatorRecipe =
- getChainOp()->getDefiningRecipe();
- // When cloning as part of a VPExpressionRecipe, the chain op could have
- // been removed from the plan and so doesn't have a defining recipe.
- assert((!AccumulatorRecipe ||
- isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
- isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
- "Unexpected operand order for partial reduction recipe");
- }
- ~VPPartialReductionRecipe() override = default;
-
- VPPartialReductionRecipe *clone() override {
- return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
- getCondOp(), VFScaleFactor,
- getUnderlyingInstr());
- }
-
- VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
-
- /// Generate the reduction in the loop.
- void execute(VPTransformState &State) override;
-
- /// Return the cost of this VPPartialReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
- /// Get the binary op's opcode.
- unsigned getOpcode() const { return Opcode; }
-
/// Get the factor that the VF of this recipe's output should be scaled by.
unsigned getVFScaleFactor() const { return VFScaleFactor; }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
};
/// A recipe to represent inloop reduction operations with vector-predication
@@ -2583,7 +2538,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
R.getFastMathFlags(),
cast_or_null<Instruction>(R.getUnderlyingValue()),
ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
- R.isOrdered(), DL) {}
+ R.isOrdered(), 0, DL) {}
~VPReductionEVLRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 92db9674ef42b..6aa3750897309 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,10 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
- VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe>(
+ [this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
@@ -396,7 +396,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
static unsigned getVFScaleFactor(VPRecipeBase *R) {
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
return RR->getVFScaleFactor();
- if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+ if (auto *RR = dyn_cast<VPReductionRecipe>(R))
return RR->getVFScaleFactor();
assert(
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
@@ -566,8 +566,9 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
} else {
// The output from scaled phis and scaled reductions actually has
// fewer lanes than the VF.
- unsigned ScaleFactor = getVFScaleFactor(R);
- ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
+ ElementCount VF = VFs[J];
+ if (unsigned ScaleFactor = getVFScaleFactor(R))
+ VF = VF.divideCoefficientBy(ScaleFactor);
LLVM_DEBUG(if (VF != VFs[J]) {
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
<< " for " << *R << "\n";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b485fa3d86e40..9dc3fb1b97447 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -164,7 +164,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
case VPBlendSC:
case VPReductionEVLSC:
- case VPPartialReductionSC:
case VPReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
@@ -294,104 +293,6 @@ bool VPRecipeBase::isScalarCast() const {
return VPI && Instruction::isCast(VPI->getOpcode());
}
-InstructionCost
-VPPartialReductionRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- std::optional<unsigned> Opcode;
- VPValue *Op = getOperand(0);
- VPRecipeBase *OpR = Op->getDefiningRecipe();
-
- // If the partial reduction is predicated, a select will be operand 0
- using namespace llvm::VPlanPatternMatch;
- if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
- OpR = Op->getDefiningRecipe();
- }
-
- Type *InputTypeA = nullptr, *InputTypeB = nullptr;
- TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
- ExtBType = TTI::PR_None;
-
- auto GetExtendKind = [](VPRecipeBase *R) {
- if (!R)
- return TTI::PR_None;
- auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
- if (!WidenCastR)
- return TTI::PR_None;
- if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
- return TTI::PR_ZeroExtend;
- if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
- return TTI::PR_SignExtend;
- return TTI::PR_None;
- };
-
- // Pick out opcode, type/ext information and use sub side effects from a widen
- // recipe....
[truncated]
|
it does, so that chained partial reductions with subs will be supported straight away. |
This PR allows the loop vectorizer to handle in-loop sub reductions by forming a normal in-loop add reduction with a negated input. Stacked PRs: 1. -> llvm/llvm-project#147026 2. llvm/llvm-project#147255 3. llvm/llvm-project#147302 4. llvm/llvm-project#147513
This PR bundles sub reductions into the VPExpressionRecipe class and adjusts the cost functions to take the negation into account. Stacked PRs: 1. llvm/llvm-project#147026 2. -> llvm/llvm-project#147255 3. llvm/llvm-project#147302 4. llvm/llvm-project#147513
|
Ping. |
7263435 to
d634251
Compare
0a4dd84 to
9130747
Compare
| class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { | ||
| /// The recurrence kind for the reduction in question. | ||
| RecurKind RdxKind; | ||
| bool IsOrdered; | ||
| /// Whether the reduction is conditional. | ||
| bool IsConditional = false; | ||
| /// The scaling factor, relative to the VF, that this recipe's output is | ||
| /// divided by. | ||
| /// For outer-loop reductions this is equal to 1. |
There was a problem hiding this comment.
I think I'm getting a bit confused. In which case do we get a VPReductionRecipe for "outer-loop" reductions? I thought it was only used when the reduction result was computed inside the loop (sometimes partially). For "outer-loop" reductions, don't we "widen everything" and compute the reduction result in the middle block?
There was a problem hiding this comment.
I think you're right actually! I've removed that part of the comment.
|
|
||
| /// Get the binary op's opcode. | ||
| unsigned getOpcode() const { return Opcode; } | ||
|
|
||
| /// Get the factor that the VF of this recipe's output should be scaled by. | ||
| unsigned getVFScaleFactor() const { return VFScaleFactor; } |
There was a problem hiding this comment.
I think this interface is now a bit misleading. We might return 0, which actually means the reduction is "scaled" by whatever the loop VF is, not 0.
Maybe something like below would be clearer:
/// Get the factor that the VF of this recipe's output should be scaled by,
/// if it represents a partial reduction.
std::optional<unsigned> getPartialVFScaleFactor() const {
return isPartial() ? VFScaleFactor : std::nullopt;
}
This way, it forces users of getPartialVFScaleFactor() to question what the result (or lack thereof) means. What do you think?
Edit: getPartialReductionScaleFactor() might be a clearer name.
There was a problem hiding this comment.
I like that, thanks. I'd prefer to keep the current name though.
| /// For in-loop reductions this is equal to 0, to specify that this is equal | ||
| /// to the VF (which may not be known yet). For partial-reductions this is | ||
| /// equal to another scalar value. | ||
| unsigned VFScaleFactor; |
There was a problem hiding this comment.
I would prefer not to overload the meaning of this variable, and keep IsInLoop semantically separate from the scale factor for partial reductions.
I wonder if a variant with empty types would help here, something like:
struct RdxOrderedInLoop {};
struct RdxInLoop {};
struct RdxNormal {};
struct RdxPartial {
unsigned VFScaleFactor;
};
std::variant<RdxOrderedInLoop, RdxInLoop, RdxNormal, RdxPartial> RdxStyle;
That way we can remove the separate boolean flags and asserts that they are consistent, and we don't need to overload the meaning of the scale factor.
(I checked that this has been used elsewhere in the codebase; there's a couple places so far, but LockFileManager.h has the best example).
@fhahn any objections?
There was a problem hiding this comment.
Done, thanks for the suggestion!
…#147302) This PR bundles partial reductions inside the VPExpressionRecipe class. Stacked PRs: 1. llvm/llvm-project#147026 2. llvm/llvm-project#147255 3. llvm/llvm-project#156976 4. llvm/llvm-project#160154 5. -> llvm/llvm-project#147302 6. llvm/llvm-project#162503 7. llvm/llvm-project#147513
This PR bundles partial reductions inside the VPExpressionRecipe class. Stacked PRs: 1. llvm#147026 2. llvm#147255 3. llvm#156976 4. llvm#160154 5. -> llvm#147302 6. llvm#162503 7. llvm#147513
ac146a7 to
0fe3c78
Compare
0fe3c78 to
9b8e26c
Compare
| @@ -410,8 +410,8 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { | |||
| ; CHECK-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> | |||
| ; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> | |||
| ; CHECK-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] | |||
There was a problem hiding this comment.
It’s a bit difficult to see with the rest changes, but I assume this the change is NFC and the test changes are just stray updates?
There was a problem hiding this comment.
They are stray updates, indeed, so I've reverted these unnecessary changes. Thanks!
| PhiRecipe = new VPReductionPHIRecipe( | ||
| Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi), | ||
| CM.useOrderedReductions(RdxDesc), ScaleFactor); | ||
| RdxStyle Style(RdxNormal{}); |
There was a problem hiding this comment.
nit:
| RdxStyle Style(RdxNormal{}); | |
| RdxStyle Style(RdxNormal()); |
I think generally () is used to call constructors instead of {}, not sure if there's a benefit of using {}?
There was a problem hiding this comment.
It's a struct, so I think we have to use {}. My compiler complains at me when I use ():
warning: parentheses were disambiguated as a function declaration
note: replace parentheses with braces to declare a variable
and
error: assignment of function ‘llvm::RdxStyle llvm::Style(RdxNormal (*)())’
| if (isConditional()) { | ||
| CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; | ||
| auto *VecTy = Ctx.Types.inferScalarType(Op); | ||
| auto *CondTy = Ctx.Types.inferScalarType(getCondOp()); | ||
| CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, | ||
| Pred, Ctx.CostKind); | ||
| } |
There was a problem hiding this comment.
I think this should be moved to VPReductionRecipe::computeCost. It seems like we may be missing test coverage for this?
There was a problem hiding this comment.
Yeah you're right, that compute cost function isn't considering the predicate. I moved it over locally and that regressed the codegen for the mve-reductions.ll tests. I think this change should be done separately since it's outside the scope of the PR. Are you okay with that?
There was a problem hiding this comment.
Oh that’s surprising, are we forming partial reductions for those cases? If not than it would be good to check first why the behavior changed, because that would be somewhat unexpected. I think we also have the predicate cost on main, but maybe the difference would be that after moving we may already have and pass the correct vector types?
There was a problem hiding this comment.
For regular reductions there would be a [VP]select in the plan to represent the Condition, but from what I can see for in-loop/ordered reductions the condition is just not represented in the VPReductionRecipe::computeCost function so adding it would introduce changes for MVE. I agree this is best fixed in a separate PR.
For the purpose of this PR though, we don't want to miss out on the condition for the partial reduction case. Can you move the CmpSelInstrCost into the block where it calls getPartialReductionCost, so that we don't regress for partial reductions either?
(I'm not sure if we have a proper test for this at the moment, if not can you add one?)
There was a problem hiding this comment.
Oh that’s surprising, are we forming partial reductions for those cases? If not than it would be good to check first why the behavior changed, because that would be somewhat unexpected. I think we also have the predicate cost on main, but maybe the difference would be that after moving we may already have and pass the correct vector types?
When I first tried moving it over, I applied the cost to all cases in VPReductionRecipe::computeCost, which is what introduced the regressions in mve-reductions.ll. If I just apply the extra cost to the partial reduction case then nothing changes so we should be good to go with that approach and fix up the cost model for other conditional reductions later.
🐧 Linux x64 Test Results
|
| // This reduction is in-loop. | ||
| struct RdxInLoop {}; | ||
| // This reduction isn't partial, ordered or in-loop. | ||
| struct RdxNormal {}; |
There was a problem hiding this comment.
I'd rather see RdxNormal and RdxPartial merged into one RdxUnordered, because:
- a partial reduction is an unordered reduction with a scale factor > 1.
- a normal reduction is an unordered reduction with a scale factor of 1.
There was a problem hiding this comment.
Isn't a normal reduction an unordered reduction with a scale factor of VF? Which is actually why it gets a separate variant, given that VF can be unknown at compile time.
There was a problem hiding this comment.
Sorry I confused "normal" and "inloop". When do we actually get a "full unordered out-of-loop" reduction? Isn't that supposed to be a ComputeReductionResult recipe instead?
There was a problem hiding this comment.
ComputeReductionResult should be the final node in the middle block for a normal reduction (or more specialized recipes for AnyOf or Find'X' reductions). But there needs to be a header phi and an update operation inside the loop as well, we just don't reduce across lanes within the vector body.
There was a problem hiding this comment.
We can't do the whole reduction out of the loop, there needs to be an update operation inside the loop. For a simple example of an add reduction, we would have something like this for a 'normal' reduction:
%header = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %update, %vector.body ]
...
%update = add <4 x i32> %header, %new.data
...
middle.block
%result = call i32 @llvm.vector.reduce.add(<4 x i32> %update)
So there's an update in the loop, but we don't perform any operation across vector lanes until we exit the vector body.
For an in-loop reduction, we would have a scalar phi doing something like this:
%header = phi i32 [ 0, %vector.ph ], [ %update, %vector.body ]
...
%in.loop.reduction = call i32 @llvm.vector.reduce.add(<4 x i32> %new.data)
%update = add i32 %header, %in.loop.reduction
...
;; No need for extra operations, as you already have the result
Partial reductions are like normal reductions in that they accumulate into a vector in the loop, but the VF of the phi and the VF of the new data don't have to be the same. So there can be some cross-lane reductions, e.g. when going from <8 x i32> to <4 x i32> after the inputs were extended from <8 x i16>.
Ordered reductions are inner loop reductions that produce a scalar, but we must emit an update operation that does not reassociate the update calculations.
(I think I've got that right...)
There was a problem hiding this comment.
I did spend some time double checking because this contradicted my understanding of "normal" reductions.
The corresponding recipe for the %update will just be a WIDEN of the scalar add, not a WidenReduction recipe. I also checked the code, and I think the only place where we create a RdxNormal reduction is alongside a ScaleFactor > 1. So AFAIU, RdxNormal can be renamed to RdxPartial. Which is a form of inloop reduction where the elements are reduced to VF / ScaleFactor in the loop, and to a scalar in the middle block.
For other styles,I do agree with your comment, thanks for writing all the details :)
There was a problem hiding this comment.
For a normal reduction, ScaleFactor == 1, so there's no change in VF.
AIUI, @paulwalker-arm 's long-term plan was to unify 'normal', 'partial', and 'in-loop' reductions to always use a partial.reduction intrinsic for the update, since the intrinsic can handle all 3 cases. It's only the ordered case that's different. If the VF is the same then a later pass could exchange it for a normal widened operation for targets which aren't partial-reduction aware.
There was a problem hiding this comment.
I think that plan makes sense, even if it will require to have a partial.reduce.xxxx intrinsic for every type of reduction being supported. But if it harmonises the codegen for reductions, that's probably worth it.
Still, my 2 cents: for now I think RdxNormal (Edit: now called RdxUnordered) is a confusing name, because it's really only used for partial accumulation within the loop. It would be clearer to rename it to RdxPartial and enforce ScaleFactor>1 in its constructor. We can easily rename that ReductionStyle when it actually gains new capabilities.
There was a problem hiding this comment.
IIRC VPReductionRecipe was introduced because there were no suitable intrinsics at that time. The variants that map directly to an intrinsic should not need a dedicated recipe at all down the line, as we can just generate the appropriate intrinsic directly
| RdxStyle Style(RdxNormal{}); | ||
| if (UseInLoopReduction) { | ||
| if (UseOrderedReductions) | ||
| Style = RdxOrderedInLoop{}; | ||
| else | ||
| Style = RdxInLoop{}; | ||
| } else if (ScaleFactor > 1) { | ||
| Style = RdxPartial{/*VFScaleFactor=*/ScaleFactor}; | ||
| } |
There was a problem hiding this comment.
nit: maybe create a convenience function for this?
There was a problem hiding this comment.
Done. I'd like to have namespaced it in ReductionStyle but that's just an alias for the std::variant.
There was a problem hiding this comment.
I'd vouch for that as well. Maybe ReductionStyle::get(UseInLoopReduction, UseOrderedReductions, ScaleFactor)?
There was a problem hiding this comment.
You can't add extra methods to a std::variant, but maybe a static getReductionStyle helper function?
| if (UseInLoopReduction) { | ||
| if (UseOrderedReductions) | ||
| Style = RdxOrderedInLoop{}; | ||
| else | ||
| Style = RdxInLoop{}; |
There was a problem hiding this comment.
nit: this can be simplified to:
Style = RdxOrderedInLoop{};
else if (UseInLoopReductions)
Style = RdxInLoop{};
else ..
| auto *Partial = std::get_if<RdxPartial>(&Style); | ||
| return Partial ? std::make_optional(Partial->VFScaleFactor) : std::nullopt; |
There was a problem hiding this comment.
Can this function be changed so that calling getVFScaleFactor always returns a value. This would mean it can only ever be called if for unordered reductions. Especially when partial- and normal reductions are merged, this would remove the need for .value_or(1) I see in various places.
There was a problem hiding this comment.
Done. I've made 1 the default case.
There was a problem hiding this comment.
Note: I think I suggested this here due to ScaleFactor having a confusing definition: #147513 (comment)
Now that we have a variant for the reduction style, we might be able to do better.
| // Possible variants of a reduction. | ||
|
|
||
| // This reduction is ordered and in-loop. | ||
| struct RdxOrderedInLoop {}; |
There was a problem hiding this comment.
nit:
| struct RdxOrderedInLoop {}; | |
| struct RdxOrdered {}; |
because ordered implies in-loop.
| if (isConditional()) { | ||
| CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; | ||
| auto *VecTy = Ctx.Types.inferScalarType(Op); | ||
| auto *CondTy = Ctx.Types.inferScalarType(getCondOp()); | ||
| CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, | ||
| Pred, Ctx.CostKind); | ||
| } |
There was a problem hiding this comment.
For regular reductions there would be a [VP]select in the plan to represent the Condition, but from what I can see for in-loop/ordered reductions the condition is just not represented in the VPReductionRecipe::computeCost function so adding it would introduce changes for MVE. I agree this is best fixed in a separate PR.
For the purpose of this PR though, we don't want to miss out on the condition for the partial reduction case. Can you move the CmpSelInstrCost into the block where it calls getPartialReductionCost, so that we don't regress for partial reductions either?
(I'm not sure if we have a proper test for this at the moment, if not can you add one?)
| PrevInChain->getType(), Intrinsic::vector_partial_reduce_add, | ||
| {PrevInChain, NewVecOp}, nullptr, "partial.reduce"); | ||
| PrevInChain = NewRed; | ||
| NextInChain = NewRed; | ||
| } else { |
There was a problem hiding this comment.
nit: maybe add an assert that this is an in-loop reduction?
| RdxStyle Style(RdxNormal{}); | ||
| if (UseInLoopReduction) { | ||
| if (UseOrderedReductions) | ||
| Style = RdxOrderedInLoop{}; | ||
| else | ||
| Style = RdxInLoop{}; | ||
| } else if (ScaleFactor > 1) { | ||
| Style = RdxPartial{/*VFScaleFactor=*/ScaleFactor}; | ||
| } |
There was a problem hiding this comment.
I'd vouch for that as well. Maybe ReductionStyle::get(UseInLoopReduction, UseOrderedReductions, ScaleFactor)?
| // This reduction is in-loop. | ||
| struct RdxInLoop {}; | ||
| // This reduction isn't partial, ordered or in-loop. | ||
| struct RdxNormal {}; |
There was a problem hiding this comment.
Isn't a normal reduction an unordered reduction with a scale factor of VF? Which is actually why it gets a separate variant, given that VF can be unknown at compile time.
| auto *Partial = std::get_if<RdxPartial>(&Style); | ||
| return Partial ? std::make_optional(Partial->VFScaleFactor) : std::nullopt; |
There was a problem hiding this comment.
Note: I think I suggested this here due to ScaleFactor having a confusing definition: #147513 (comment)
Now that we have a variant for the reduction style, we might be able to do better.
| /// When expanding the reduction PHI, the plan's VF element count is divided | ||
| /// by this factor to form the reduction phi's VF. | ||
| unsigned VFScaleFactor = 1; | ||
| RdxStyle Style; |
d37f98c to
e675a11
Compare
| InstructionCost CondCost = 0; | ||
| if (isConditional()) { | ||
| CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; | ||
| auto *CondTy = Ctx.Types.inferScalarType(getCondOp()); |
There was a problem hiding this comment.
Should this be:
auto *CondTy = cast<VectorType>(toVectorTy(Ctx.Types.inferScalartype(getCondOp()), VF));
?
There was a problem hiding this comment.
Yeah I think that's correct, done.
| struct RdxInLoop {}; | ||
| // This reduction is normal with a >= 1 factor that the vector length is | ||
| // scaled down by. | ||
| struct RdxNormal { |
There was a problem hiding this comment.
nit: I think RdxUnordered is a better name.
sdesmalen-arm
left a comment
There was a problem hiding this comment.
LGTM with nits addressed
| bool UseOrderedReductions = PhiR->isOrdered(); | ||
| ReductionStyle Style = getReductionStyle(true, UseOrderedReductions, 1); |
There was a problem hiding this comment.
nit:
| bool UseOrderedReductions = PhiR->isOrdered(); | |
| ReductionStyle Style = getReductionStyle(true, UseOrderedReductions, 1); | |
| ReductionStyle Style = getReductionStyle(true, PhiR->isOrdered(), 1); |
| // This reduction is normal with a >= 1 factor that the vector length is | ||
| // scaled down by. |
There was a problem hiding this comment.
| // This reduction is normal with a >= 1 factor that the vector length is | |
| // scaled down by. | |
| // This reduction is unordered with the partial result scaled down by some factor. |
| // This reduction is normal with a >= 1 factor that the vector length is | ||
| // scaled down by. | ||
| struct RdxUnordered { | ||
| // The factor by which the output is scaled down from the VF. |
There was a problem hiding this comment.
nit: superfluous comment, given the comment above.
| @@ -2918,7 +2892,7 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { | |||
| R.getFastMathFlags(), | |||
| cast_or_null<Instruction>(R.getUnderlyingValue()), | |||
| ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp, | |||
| R.isOrdered(), DL) {} | |||
| getReductionStyle(true, R.isOrdered(), 1), DL) {} | |||
There was a problem hiding this comment.
| getReductionStyle(true, R.isOrdered(), 1), DL) {} | |
| getReductionStyle(/*InLoop=*/true, R.isOrdered(), 1), DL) {} |
Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Depends on llvm#144281
373f118 to
573e002
Compare
| if (auto *VPR = dyn_cast<VPReductionRecipe>(&R); | ||
| VPR && VPR->isPartialReduction()) |
There was a problem hiding this comment.
| if (auto *VPR = dyn_cast<VPReductionRecipe>(&R); | |
| VPR && VPR->isPartialReduction()) | |
| if (auto *VPR = dyn_cast<VPReductionRecipe>(&R)) | |
| if (VPR->isPartialReduction()) |
assigning as part of the if doesn't seem to save lines and I think we generally avoid it in the vectorizer code
| static ReductionStyle getReductionStyle(bool InLoop, bool Ordered, | ||
| unsigned ScaleFactor) { | ||
| assert((!Ordered || InLoop) && "Ordered implies in-loop"); | ||
| if (Ordered) | ||
| return RdxOrdered{}; | ||
| if (InLoop) | ||
| return RdxInLoop{}; | ||
| return RdxUnordered{/*VFScaleFactor=*/ScaleFactor}; | ||
| } | ||
|
|
There was a problem hiding this comment.
Looks like this is only used in LoopVectorize.cpp, can you move it there?
| // Possible variants of a reduction. | ||
|
|
||
| // This reduction is ordered and in-loop. | ||
| struct RdxOrdered {}; | ||
| // This reduction is in-loop. | ||
| struct RdxInLoop {}; | ||
| // This reduction is unordered with the partial result scaled down by some | ||
| // factor. | ||
| struct RdxUnordered { | ||
| unsigned VFScaleFactor; | ||
| }; | ||
| using ReductionStyle = std::variant<RdxOrdered, RdxInLoop, RdxUnordered>; |
There was a problem hiding this comment.
Would be good to use /// here so this gets included in the docs properly.
There was a problem hiding this comment.
It's used in VPlan.h as well, in the VPReductionEVLRecipe constructor.
There was a problem hiding this comment.
Ah right, if left in the header it should probably be inline instead of static
There was a problem hiding this comment.
Done! Just realised I responded to the wrong comment as well 😆
Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Stacked PRs: 1. llvm/llvm-project#147026 2. llvm/llvm-project#147255 3. llvm/llvm-project#156976 4. llvm/llvm-project#160154 5. llvm/llvm-project#147302 6. llvm/llvm-project#162503 7. -> llvm/llvm-project#147513 Replaces llvm/llvm-project#146073 .
Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Stacked PRs: 1. llvm#147026 2. llvm#147255 3. llvm#156976 4. llvm#160154 5. llvm#147302 6. llvm#162503 7. -> llvm#147513 Replaces llvm#146073 .
Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time.
Stacked PRs:
Replaces #146073 .