diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a63956c0cba6b..8daa85671c3b6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7021,10 +7021,11 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, VPInstruction::FirstOrderRecurrenceSplice>()))) return true; } - // The VPlan-based cost model is more accurate for partial reduction and + // The VPlan-based cost model is more accurate for partial reductions and // comparing against the legacy cost isn't desirable. - if (isa(&R)) - return true; + if (auto *VPR = dyn_cast(&R)) + if (VPR->isPartialReduction()) + return true; // The VPlan-based cost model can analyze if recipes are scalar // recursively, but the legacy cost model cannot. @@ -8207,11 +8208,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); // If the PHI is used by a partial reduction, set the scale factor. + bool UseInLoopReduction = CM.isInLoopReduction(Phi); + bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc); unsigned ScaleFactor = getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); + PhiRecipe = new VPReductionPHIRecipe( - Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc), ScaleFactor); + Phi, RdxDesc.getRecurrenceKind(), *StartV, + getReductionStyle(UseInLoopReduction, UseOrderedReductions, + ScaleFactor)); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. If there are no users of the intermediate @@ -8280,16 +8285,18 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, VPValue *BinOp = Reduction->getOperand(0); VPValue *Accumulator = Reduction->getOperand(1); - if (isa(BinOp) || isa(BinOp)) + VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); + if (isa(BinOpRecipe) || + (isa(BinOpRecipe) && + cast(BinOpRecipe)->isPartialReduction())) std::swap(BinOp, Accumulator); assert(ScaleFactor == vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) && "all accumulators in chain must have same scale factor"); - unsigned ReductionOpcode = Reduction->getOpcode(); auto *ReductionI = Reduction->getUnderlyingInstr(); - if (ReductionOpcode == Instruction::Sub) { + if (Reduction->getOpcode() == Instruction::Sub) { auto *const Zero = ConstantInt::get(ReductionI->getType(), 0); SmallVector Ops; Ops.push_back(Plan.getOrAddLiveIn(Zero)); @@ -8297,14 +8304,15 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRFlags(*ReductionI), VPIRMetadata(), ReductionI->getDebugLoc()); Builder.insert(BinOp->getDefiningRecipe()); - ReductionOpcode = Instruction::Add; } VPValue *Cond = nullptr; if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) Cond = getBlockInMask(Builder.getInsertBlock()); - return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, - ScaleFactor, ReductionI); + + return new VPReductionRecipe( + RecurKind::Add, FastMathFlags(), ReductionI, Accumulator, BinOp, Cond, + RdxUnordered{/*VFScaleFactor=*/ScaleFactor}, ReductionI->getDebugLoc()); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, @@ -8794,9 +8802,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent())) CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent()); - auto *RedRecipe = new VPReductionRecipe( - Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp, - PhiR->isOrdered(), CurrentLinkI->getDebugLoc()); + ReductionStyle Style = getReductionStyle(true, PhiR->isOrdered(), 1); + auto *RedRecipe = + new VPReductionRecipe(Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, + CondOp, Style, CurrentLinkI->getDebugLoc()); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. // Delete CurrentLink as it will be invalid if its operand is replaced @@ -8831,8 +8840,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Don't output selects for partial reductions because they have an output // with fewer lanes than the VF. So the operands of the select would have // different numbers of lanes. Partial reductions mask the input instead. + auto *RR = dyn_cast(OrigExitingVPV->getDefiningRecipe()); if (!PhiR->isInLoop() && CM.foldTailByMasking() && - !isa(OrigExitingVPV)) { + (!RR || !RR->isPartialReduction())) { VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent()); std::optional FMFs = PhiTy->isFloatingPointTy() diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0c7d9c0193a03..d957d9110def9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -44,6 +44,7 @@ #include #include #include +#include namespace llvm { @@ -566,7 +567,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenIntOrFpInductionSC: case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: - case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveEVLSC: @@ -2392,6 +2392,29 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { #endif }; +/// Possible variants of a reduction. + +/// This reduction is ordered and in-loop. +struct RdxOrdered {}; +/// This reduction is in-loop. +struct RdxInLoop {}; +/// This reduction is unordered with the partial result scaled down by some +/// factor. +struct RdxUnordered { + unsigned VFScaleFactor; +}; +using ReductionStyle = std::variant; + +inline ReductionStyle getReductionStyle(bool InLoop, bool Ordered, + unsigned ScaleFactor) { + assert((!Ordered || InLoop) && "Ordered implies in-loop"); + if (Ordered) + return RdxOrdered{}; + if (InLoop) + return RdxInLoop{}; + return RdxUnordered{/*VFScaleFactor=*/ScaleFactor}; +} + /// A recipe for handling reduction phis. The start value is the first operand /// of the recipe and the incoming value from the backedge is the second /// operand. @@ -2400,32 +2423,21 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// The recurrence kind of the reduction. const RecurKind Kind; - /// The phi is part of an in-loop reduction. - bool IsInLoop; - - /// The phi is part of an ordered reduction. Requires IsInLoop to be true. - bool IsOrdered; - - /// When expanding the reduction PHI, the plan's VF element count is divided - /// by this factor to form the reduction phi's VF. - unsigned VFScaleFactor = 1; + ReductionStyle Style; public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi. VPReductionPHIRecipe(PHINode *Phi, RecurKind Kind, VPValue &Start, - bool IsInLoop = false, bool IsOrdered = false, - unsigned VFScaleFactor = 1) + ReductionStyle Style) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), Kind(Kind), - IsInLoop(IsInLoop), IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) { - assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); - } + Style(Style) {} ~VPReductionPHIRecipe() override = default; VPReductionPHIRecipe *clone() override { auto *R = new VPReductionPHIRecipe( dyn_cast_or_null(getUnderlyingValue()), getRecurrenceKind(), - *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor); + *getOperand(0), Style); R->addOperand(getBackedgeValue()); return R; } @@ -2435,8 +2447,12 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Generate the phi/select nodes. void execute(VPTransformState &State) override; - /// Get the factor that the VF of this recipe's output should be scaled by. - unsigned getVFScaleFactor() const { return VFScaleFactor; } + /// Get the factor that the VF of this recipe's output should be scaled by, or + /// 1 if it isn't scaled. + unsigned getVFScaleFactor() const { + auto *Partial = std::get_if(&Style); + return Partial ? Partial->VFScaleFactor : 1; + } /// Returns the number of incoming values, also number of incoming blocks. /// Note that at the moment, VPWidenPointerInductionRecipe only has a single @@ -2447,10 +2463,16 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, RecurKind getRecurrenceKind() const { return Kind; } /// Returns true, if the phi is part of an ordered reduction. - bool isOrdered() const { return IsOrdered; } + bool isOrdered() const { return std::holds_alternative(Style); } - /// Returns true, if the phi is part of an in-loop reduction. - bool isInLoop() const { return IsInLoop; } + /// Returns true if the phi is part of an in-loop reduction. + bool isInLoop() const { + return std::holds_alternative(Style) || + std::holds_alternative(Style); + } + + /// Returns true if the reduction outputs a vector with a scaled down VF. + bool isPartialReduction() const { return getVFScaleFactor() > 1; } /// Returns true if the recipe only uses the first lane of operand \p Op. bool usesFirstLaneOnly(const VPValue *Op) const override { @@ -2732,23 +2754,25 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { #endif }; -/// A recipe to represent inloop reduction operations, performing a reduction on -/// a vector operand into a scalar value, and adding the result to a chain. -/// The Operands are {ChainOp, VecOp, [Condition]}. +/// A recipe to represent inloop, ordered or partial reduction operations. It +/// performs a reduction on a vector operand into a scalar (vector in the case +/// of a partial reduction) value, and adds the result to a chain. The Operands +/// are {ChainOp, VecOp, [Condition]}. class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { + /// The recurrence kind for the reduction in question. RecurKind RdxKind; - bool IsOrdered; /// Whether the reduction is conditional. bool IsConditional = false; + ReductionStyle Style; protected: VPReductionRecipe(const unsigned char SC, RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, ArrayRef Operands, VPValue *CondOp, - bool IsOrdered, DebugLoc DL) + ReductionStyle Style, DebugLoc DL) : VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind), - IsOrdered(IsOrdered) { + Style(Style) { if (CondOp) { IsConditional = true; addOperand(CondOp); @@ -2759,30 +2783,29 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) + ReductionStyle Style, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I, - ArrayRef({ChainOp, VecOp}), CondOp, - IsOrdered, DL) {} + ArrayRef({ChainOp, VecOp}), CondOp, Style, + DL) {} VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) + ReductionStyle Style, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, - ArrayRef({ChainOp, VecOp}), CondOp, - IsOrdered, DL) {} + ArrayRef({ChainOp, VecOp}), CondOp, Style, + DL) {} ~VPReductionRecipe() override = default; VPReductionRecipe *clone() override { return new VPReductionRecipe(RdxKind, getFastMathFlags(), getUnderlyingInstr(), getChainOp(), getVecOp(), - getCondOp(), IsOrdered, getDebugLoc()); + getCondOp(), Style, getDebugLoc()); } static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || - R->getVPDefID() == VPRecipeBase::VPPartialReductionSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; } static inline bool classof(const VPUser *U) { @@ -2809,9 +2832,16 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { /// Return the recurrence kind for the in-loop reduction. RecurKind getRecurrenceKind() const { return RdxKind; } /// Return true if the in-loop reduction is ordered. - bool isOrdered() const { return IsOrdered; }; + bool isOrdered() const { return std::holds_alternative(Style); }; /// Return true if the in-loop reduction is conditional. bool isConditional() const { return IsConditional; }; + /// Returns true if the reduction outputs a vector with a scaled down VF. + bool isPartialReduction() const { return getVFScaleFactor() > 1; } + /// Returns true if the reduction is in-loop. + bool isInLoop() const { + return std::holds_alternative(Style) || + std::holds_alternative(Style); + } /// The VPValue of the scalar Chain being accumulated. VPValue *getChainOp() const { return getOperand(0); } /// The VPValue of the vector value to be reduced. @@ -2820,69 +2850,12 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { VPValue *getCondOp() const { return isConditional() ? getOperand(getNumOperands() - 1) : nullptr; } - -protected: -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif -}; - -/// A recipe for forming partial reductions. In the loop, an accumulator and -/// vector operand are added together and passed to the next iteration as the -/// next accumulator. After the loop body, the accumulator is reduced to a -/// scalar value. -class VPPartialReductionRecipe : public VPReductionRecipe { - unsigned Opcode; - - /// The divisor by which the VF of this recipe's output should be divided - /// during execution. - unsigned VFScaleFactor; - -public: - VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, - VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor) - : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond, - VFScaleFactor, ReductionInst) {} - VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, - VPValue *Cond, unsigned ScaleFactor, - Instruction *ReductionInst = nullptr) - : VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add, - FastMathFlags(), ReductionInst, - ArrayRef({Op0, Op1}), Cond, false, {}), - Opcode(Opcode), VFScaleFactor(ScaleFactor) { - [[maybe_unused]] auto *AccumulatorRecipe = - getChainOp()->getDefiningRecipe(); - // When cloning as part of a VPExpressionRecipe the chain op could have - // replaced by a temporary VPValue, so it doesn't have a defining recipe. - assert((!AccumulatorRecipe || - isa(AccumulatorRecipe) || - isa(AccumulatorRecipe)) && - "Unexpected operand order for partial reduction recipe"); - } - ~VPPartialReductionRecipe() override = default; - - VPPartialReductionRecipe *clone() override { - return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), - getCondOp(), VFScaleFactor, - getUnderlyingInstr()); - } - - VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) - - /// Generate the reduction in the loop. - void execute(VPTransformState &State) override; - - /// Return the cost of this VPPartialReductionRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - - /// Get the binary op's opcode. - unsigned getOpcode() const { return Opcode; } - - /// Get the factor that the VF of this recipe's output should be scaled by. - unsigned getVFScaleFactor() const { return VFScaleFactor; } + /// Get the factor that the VF of this recipe's output should be scaled by, or + /// 1 if it isn't scaled. + unsigned getVFScaleFactor() const { + auto *Partial = std::get_if(&Style); + return Partial ? Partial->VFScaleFactor : 1; + } protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2905,7 +2878,7 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { R.getFastMathFlags(), cast_or_null(R.getUnderlyingValue()), ArrayRef({R.getChainOp(), R.getVecOp(), &EVL}), CondOp, - R.isOrdered(), DL) {} + getReductionStyle(/*InLoop=*/true, R.isOrdered(), 1), DL) {} ~VPReductionEVLRecipe() override = default; @@ -3173,7 +3146,7 @@ class VPExpressionRecipe : public VPSingleDefRecipe { void decompose(); unsigned getVFScaleFactor() const { - auto *PR = dyn_cast(ExpressionRecipes.back()); + auto *PR = dyn_cast(ExpressionRecipes.back()); return PR ? PR->getVFScaleFactor() : 1; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..f84a7914ec850 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -288,10 +288,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const auto *R) { return R->getScalarType(); }) .Case([this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe>( + [this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) // VPInstructionWithType must be handled before VPInstruction. .Case( @@ -561,11 +561,12 @@ SmallVector llvm::calculateRegisterUsageForPlan( // fewer lanes than the VF. unsigned ScaleFactor = vputils::getVFScaleFactor(VPV->getDefiningRecipe()); - ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); - LLVM_DEBUG(if (VF != VFs[J]) { - dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF - << " for " << *R << "\n"; - }); + ElementCount VF = VFs[J]; + if (ScaleFactor > 1) { + VF = VFs[J].divideCoefficientBy(ScaleFactor); + LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J] + << " to " << VF << " for " << *R << "\n";); + } Type *ScalarTy = TypeInfo.inferScalarType(VPV); unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e41f67103e096..86c1732b9e16e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -180,7 +180,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { return cast(this)->mayHaveSideEffects(); case VPBlendSC: case VPReductionEVLSC: - case VPPartialReductionSC: case VPReductionSC: case VPScalarIVStepsSC: case VPVectorPointerSC: @@ -314,134 +313,6 @@ bool VPRecipeBase::isScalarCast() const { return VPI && Instruction::isCast(VPI->getOpcode()); } -InstructionCost -VPPartialReductionRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - std::optional Opcode; - VPValue *Op = getVecOp(); - uint64_t MulConst; - - InstructionCost CondCost = 0; - if (isConditional()) { - CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; - auto *VecTy = Ctx.Types.inferScalarType(Op); - auto *CondTy = Ctx.Types.inferScalarType(getCondOp()); - CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, - Pred, Ctx.CostKind); - } - - // If the partial reduction is predicated, a select will be operand 1. - // If it isn't predicated and the mul isn't operating on a constant, then it - // should have been turned into a VPExpressionRecipe. - // FIXME: Replace the entire function with this once all partial reduction - // variants are bundled into VPExpressionRecipe. - if (!match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) { - auto *PhiType = Ctx.Types.inferScalarType(getChainOp()); - auto *InputType = Ctx.Types.inferScalarType(getVecOp()); - return CondCost + Ctx.TTI.getPartialReductionCost( - getOpcode(), InputType, InputType, PhiType, VF, - TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind); - } - - VPRecipeBase *OpR = Op->getDefiningRecipe(); - Type *InputTypeA = nullptr, *InputTypeB = nullptr; - TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, - ExtBType = TTI::PR_None; - - auto GetExtendKind = [](VPRecipeBase *R) { - if (!R) - return TTI::PR_None; - auto *WidenCastR = dyn_cast(R); - if (!WidenCastR) - return TTI::PR_None; - if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) - return TTI::PR_ZeroExtend; - if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) - return TTI::PR_SignExtend; - return TTI::PR_None; - }; - - // Pick out opcode, type/ext information and use sub side effects from a widen - // recipe. - auto HandleWiden = [&](VPWidenRecipe *Widen) { - if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) { - Widen = dyn_cast(Op); - } - Opcode = Widen->getOpcode(); - VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe(); - VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe(); - InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) - : Widen->getOperand(0)); - InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0) - : Widen->getOperand(1)); - ExtAType = GetExtendKind(ExtAR); - ExtBType = GetExtendKind(ExtBR); - - using namespace VPlanPatternMatch; - const APInt *C; - if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) && - canConstantBeExtended(C, InputTypeA, ExtAType)) { - InputTypeB = InputTypeA; - ExtBType = ExtAType; - } - }; - - if (isa(OpR)) { - InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0)); - ExtAType = GetExtendKind(OpR); - } else if (isa(OpR)) { - if (auto RedPhiOp1R = dyn_cast_or_null(getOperand(1))) { - InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0)); - ExtAType = GetExtendKind(RedPhiOp1R); - } else if (auto Widen = dyn_cast_or_null(getOperand(1))) - HandleWiden(Widen); - } else if (auto Widen = dyn_cast(OpR)) { - HandleWiden(Widen); - } else if (auto Reduction = dyn_cast(OpR)) { - return CondCost + Reduction->computeCost(VF, Ctx); - } - auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); - return CondCost + Ctx.TTI.getPartialReductionCost( - getOpcode(), InputTypeA, InputTypeB, PhiType, VF, - ExtAType, ExtBType, Opcode, Ctx.CostKind); - ; -} - -void VPPartialReductionRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - - assert(getOpcode() == Instruction::Add && - "Unhandled partial reduction opcode"); - - Value *BinOpVal = State.get(getVecOp()); - Value *PhiVal = State.get(getChainOp()); - assert(PhiVal && BinOpVal && "Phi and Mul must be set"); - - Type *RetTy = PhiVal->getType(); - - if (isConditional()) { - Value *Cond = State.get(getCondOp()); - Value *Zero = ConstantInt::get(BinOpVal->getType(), 0); - BinOpVal = Builder.CreateSelect(Cond, BinOpVal, Zero); - } - - CallInst *V = - Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, - {PhiVal, BinOpVal}, nullptr, "partial.reduce"); - - State.set(this, V); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPartialReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "PARTIAL-REDUCE "; - printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; - printOperands(O, SlotTracker); -} -#endif - void VPIRFlags::intersectFlags(const VPIRFlags &Other) { assert(OpType == Other.OpType && "OpType must match"); switch (OpType) { @@ -2706,7 +2577,6 @@ void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent, void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Lane && "Reduction being replicated."); - Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); RecurKind Kind = getRecurrenceKind(); assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && "In-loop AnyOf reductions aren't currently supported"); @@ -2728,7 +2598,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { } Value *NewRed; Value *NextInChain; - if (IsOrdered) { + if (isOrdered()) { + Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); if (State.VF.isVector()) NewRed = createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain); @@ -2738,8 +2609,18 @@ void VPReductionRecipe::execute(VPTransformState &State) { PrevInChain, NewVecOp); PrevInChain = NewRed; NextInChain = NewRed; + } else if (isPartialReduction()) { + assert(Kind == RecurKind::Add && "Unexpected partial reduction kind"); + Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false); + NewRed = State.Builder.CreateIntrinsic( + PrevInChain->getType(), Intrinsic::vector_partial_reduce_add, + {PrevInChain, NewVecOp}, nullptr, "partial.reduce"); + PrevInChain = NewRed; + NextInChain = NewRed; } else { - PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); + assert(isInLoop() && + "The reduction must either be ordered, partial or in-loop"); + Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain); @@ -2748,7 +2629,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), PrevInChain, NewRed); } - State.set(this, NextInChain, /*IsScalar*/ true); + State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction()); } void VPReductionEVLRecipe::execute(VPTransformState &State) { @@ -2795,6 +2676,22 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, std::optional OptionalFMF = ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; + if (isPartialReduction()) { + InstructionCost CondCost = 0; + if (isConditional()) { + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + auto *CondTy = cast( + toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF)); + CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, + CondTy, Pred, Ctx.CostKind); + } + return CondCost + Ctx.TTI.getPartialReductionCost( + Opcode, ElementTy, ElementTy, ElementTy, VF, + TargetTransformInfo::PR_None, + TargetTransformInfo::PR_None, std::nullopt, + Ctx.CostKind); + } + // TODO: Support any-of reductions. assert( (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || @@ -2900,7 +2797,9 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, unsigned Opcode = RecurrenceDescriptor::getOpcode( cast(ExpressionRecipes[1])->getRecurrenceKind()); auto *ExtR = cast(ExpressionRecipes[0]); - return isa(ExpressionRecipes.back()) + + return cast(ExpressionRecipes.back()) + ->isPartialReduction() ? Ctx.TTI.getPartialReductionCost( Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF, @@ -2920,7 +2819,8 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, Opcode = Instruction::Sub; [[fallthrough]]; case ExpressionTypes::ExtMulAccReduction: { - if (isa(ExpressionRecipes.back())) { + auto *RedR = cast(ExpressionRecipes.back()); + if (RedR->isPartialReduction()) { auto *Ext0R = cast(ExpressionRecipes[0]); auto *Ext1R = cast(ExpressionRecipes[1]); auto *Mul = cast(ExpressionRecipes[2]); @@ -2959,8 +2859,8 @@ bool VPExpressionRecipe::mayHaveSideEffects() const { bool VPExpressionRecipe::isSingleScalar() const { // Cannot use vputils::isSingleScalar(), because all external operands // of the expression will be live-ins while bundled. - return isa(ExpressionRecipes.back()) && - !isa(ExpressionRecipes.back()); + auto *RR = dyn_cast(ExpressionRecipes.back()); + return RR && !RR->isPartialReduction(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2972,12 +2872,11 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, O << " = "; auto *Red = cast(ExpressionRecipes.back()); unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); - bool IsPartialReduction = isa(Red); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { getOperand(1)->printAsOperand(O, SlotTracker); - O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; O << Instruction::getOpcodeName(Opcode) << " ("; getOperand(0)->printAsOperand(O, SlotTracker); Red->printFlags(O); @@ -2994,7 +2893,7 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, } case ExpressionTypes::ExtNegatedMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " (sub (0, mul"; @@ -3019,7 +2918,7 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, case ExpressionTypes::MulAccReduction: case ExpressionTypes::ExtMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " ("; @@ -3056,7 +2955,10 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "REDUCE "; + if (isPartialReduction()) + O << Indent << "PARTIAL-REDUCE "; + else + O << Indent << "REDUCE "; printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); @@ -4444,7 +4346,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // this value when we vectorize all of the instructions that use the PHI. BasicBlock *VectorPH = State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); - bool ScalarPHI = State.VF.isScalar() || IsInLoop; + bool ScalarPHI = State.VF.isScalar() || isInLoop(); Value *StartV = State.get(StartVPV, ScalarPHI); Type *VecTy = StartV->getType(); @@ -4453,7 +4355,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { "recipe must be in the vector loop header"); auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); Phi->insertBefore(HeaderBB->getFirstInsertionPt()); - State.set(this, Phi, IsInLoop); + State.set(this, Phi, isInLoop()); Phi->addIncoming(StartV, VectorPH); } @@ -4466,8 +4368,8 @@ void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); - if (VFScaleFactor != 1) - O << " (VF scaled by 1/" << VFScaleFactor << ")"; + if (getVFScaleFactor() > 1) + O << " (VF scaled by 1/" << getVFScaleFactor() << ")"; } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9174058baad65..8ccd3ef9b68a7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3675,7 +3675,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, cast(VecOp)->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); - if (isa(Red)) { + if (Red->isPartialReduction()) { TargetTransformInfo::PartialReductionExtendKind ExtKind = TargetTransformInfo::getPartialReductionExtendKind(ExtOpc); // FIXME: Move partial reduction creation, costing and clamping @@ -3716,8 +3716,6 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { - bool IsPartialReduction = isa(Red); - unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return nullptr; @@ -3735,7 +3733,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; InstructionCost MulAccCost; - if (IsPartialReduction) { + if (Red->isPartialReduction()) { Type *SrcTy2 = Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr; // FIXME: Move partial reduction creation, costing and clamping diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 839a304904e8b..c7a0fd7407a4e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -195,10 +195,9 @@ bool vputils::isSingleScalar(const VPValue *VPV) { return VPI->isSingleScalar() || VPI->isVectorToScalar() || (preservesUniformity(VPI->getOpcode()) && all_of(VPI->operands(), isSingleScalar)); - if (isa(VPV)) - return false; - if (isa( - VPV)) + if (auto *RR = dyn_cast(VPV)) + return !RR->isPartialReduction(); + if (isa(VPV)) return true; if (auto *Expr = dyn_cast(VPV)) return Expr->isSingleScalar(); @@ -270,7 +269,7 @@ unsigned vputils::getVFScaleFactor(VPRecipeBase *R) { return 1; if (auto *RR = dyn_cast(R)) return RR->getVFScaleFactor(); - if (auto *RR = dyn_cast(R)) + if (auto *RR = dyn_cast(R)) return RR->getVFScaleFactor(); if (auto *ER = dyn_cast(R)) return ER->getVFScaleFactor(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 63eacd3d75721..b9f5847ec731c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -349,7 +349,6 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, - VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, VPVectorPointerSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index 6df3f1b418eb6..a1d03c4a7fbc6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -80,8 +80,8 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+do ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4) +; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]> (VF scaled by 1/4) ; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index> ; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a> ; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index> @@ -89,13 +89,13 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+do ; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 ; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> -; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<16> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024> +; CHECK-NEXT: PARTIAL-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> +; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RESULT:%[0-9]+]]> = compute-reduction-result ir<%accum>, ir<%add> +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%[0-9]+]]> = compute-reduction-result ir<[[RDX]]>, ir<[[RDX_NEXT]]> ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 63776b78a2088..a6a1f672c5f70 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1318,35 +1318,29 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { } { - auto *Add = BinaryOperator::CreateAdd(PoisonValue::get(Int32), - PoisonValue::get(Int32)); VPValue *ChainOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false); + VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), ChainOp, VecOp, + CondOp, RdxUnordered{}); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); EXPECT_FALSE(Recipe.mayReadOrWriteMemory()); - delete Add; } { - auto *Add = BinaryOperator::CreateAdd(PoisonValue::get(Int32), - PoisonValue::get(Int32)); VPValue *ChainOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false); + VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), ChainOp, VecOp, + CondOp, RdxUnordered{}); VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 4)); VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_FALSE(EVLRecipe.mayHaveSideEffects()); EXPECT_FALSE(EVLRecipe.mayReadFromMemory()); EXPECT_FALSE(EVLRecipe.mayWriteToMemory()); EXPECT_FALSE(EVLRecipe.mayReadOrWriteMemory()); - delete Add; } { @@ -1689,30 +1683,27 @@ TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) { IntegerType *Int32 = IntegerType::get(C, 32); - auto *Add = BinaryOperator::CreateAdd(PoisonValue::get(Int32), - PoisonValue::get(Int32)); VPValue *ChainOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false); + VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), ChainOp, VecOp, + CondOp, RdxUnordered{}); checkVPRecipeCastImpl(&Recipe); - delete Add; + EXPECT_TRUE(isa(&Recipe)); + VPRecipeBase *BaseR = &Recipe; + EXPECT_TRUE(isa(BaseR)); } TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { IntegerType *Int32 = IntegerType::get(C, 32); - auto *Add = BinaryOperator::CreateAdd(PoisonValue::get(Int32), - PoisonValue::get(Int32)); VPValue *ChainOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, - CondOp, VecOp, false); + VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), ChainOp, VecOp, + CondOp, RdxUnordered{}); VPValue *EVL = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 0)); VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); checkVPRecipeCastImpl(&EVLRecipe); - delete Add; } } // namespace