[LV] Vectorize conditional scalar assignments#158088
Conversation
|
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-analysis Author: Graham Hunter (huntergr-arm) ChangesBased on Michael Maitland's previous work: This PR uses the existing recurrences code instead of introducing a I've enabled it by default to see the impact on tests; if there are I will be doing some performance runs on AArch64 to figure out Patch is 204.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158088.diff 19 Files Affected:
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index f9e6da6d0846a..afa175704a7b1 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -70,6 +70,9 @@ enum class RecurKind {
FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a UMax reduction.
+ FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
+ ///< can be any scalar type, one is the current recurrence
+ ///< value, and the other is an arbitrary value.
// clang-format on
// TODO: Any_of and FindLast reduction need not be restricted to integer type
// only.
@@ -183,6 +186,12 @@ class RecurrenceDescriptor {
PHINode *OrigPhi, Instruction *I,
ScalarEvolution &SE);
+ /// Returns a struct describing whether the instruction is of the form
+ /// Select(Cmp(A, B), X, Y)
+ /// where one of (X, Y) is the Phi value and the other is an arbitrary value.
+ LLVM_ABI static InstDesc isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop);
+
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
LLVM_ABI static InstDesc isConditionalRdxPattern(Instruction *I);
@@ -299,6 +308,12 @@ class RecurrenceDescriptor {
isFindLastIVRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is an arbitrary value.
+ static bool isFindLastRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::FindLast;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b8c540ce4b99d..bd87e9de46bd5 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ // TODO: Make type-agnostic.
+ case RecurKind::FindLast:
return true;
}
return false;
@@ -426,6 +428,8 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
if (isAnyOfRecurrenceKind(Kind) && IsASelect)
++NumCmpSelectPatternInst;
+ if (isFindLastRecurrenceKind(Kind) && IsASelect)
+ ++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
FoundReduxOp |= !IsAPhi && Cur != Start;
@@ -789,6 +793,38 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
return InstDesc(false, I);
}
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop) {
+ // Must be a scalar.
+ Type *Type = Phi->getType();
+ if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+ !Type->isPointerTy())
+ return InstDesc(false, I);
+
+ SelectInst *Select = dyn_cast<SelectInst>(I);
+ if (!Select)
+ return InstDesc(false, I);
+
+ // FIXME: Support more complex patterns, including multiple selects.
+ // Phi or Select must be used only outside the loop,
+ // except for each other.
+ auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
+ return all_of(V->users(), [Ignore, TheLoop](User *U) {
+ if (U == Ignore)
+ return true;
+ if (auto *I = dyn_cast<Instruction>(U))
+ return !TheLoop->contains(I);
+ return false;
+ });
+ };
+ if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+ !IsOnlyUsedOutsideLoop(Select, Phi))
+ return InstDesc(false, I);
+
+ return InstDesc(I, RecurKind::FindLast);
+}
+
RecurrenceDescriptor::InstDesc
RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
const InstDesc &Prev) {
@@ -927,6 +963,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
return isConditionalRdxPattern(I);
if (isFindIVRecurrenceKind(Kind) && SE)
return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
+ if (isFindLastRecurrenceKind(Kind))
+ return isFindLastPattern(I, OrigPhi, L);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1123,7 +1161,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
-
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
+ LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -1245,6 +1287,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
+ case RecurKind::FindLast:
return Instruction::ICmp;
case RecurKind::FMax:
case RecurKind::FMin:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 92321a76dbd80..6595c6e770be0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1004,6 +1004,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_extract_last_active:
+ if (ST->isSVEAvailable()) {
+ auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+ // This should turn into chained clastb instructions.
+ return LegalCost;
+ }
+ break;
default:
break;
}
@@ -5325,6 +5332,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
+ case RecurKind::FindLast:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b4acda80cfb93..ea85685cdf7b8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4047,6 +4047,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntrinsicSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
+ case VPDef::VPWidenSelectVectorSC:
case VPDef::VPBlendSC:
case VPDef::VPFirstOrderRecurrencePHISC:
case VPDef::VPHistogramSC:
@@ -4546,6 +4547,11 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPReductionPHIRecipe>);
+ // FIXME: implement interleaving for FindLast transform correctly.
+ for (auto &[_, RdxDesc] : Legal->getReductionVars())
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind()))
+ return 1;
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
@@ -8687,6 +8693,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences,
+ *Plan, RecipeBuilder, Legal);
+
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
@@ -8779,6 +8789,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind Kind = PhiR->getRecurrenceKind();
assert(
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
"AnyOf and FindIV reductions are not allowed for in-loop reductions");
@@ -8987,6 +8998,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
FinalReductionResult =
Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
{PhiR, Start, NewExitingVPV}, ExitDL);
+ } else if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ FinalReductionResult = Builder.createNaryOp(
+ VPInstruction::ExtractLastActive, {NewExitingVPV}, ExitDL);
} else {
VPIRFlags Flags =
RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind)
@@ -9076,7 +9091,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
@@ -10069,6 +10085,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
+ // FIXME: Enable interleaving for last_active reductions.
+ if (any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind());
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to conditional scalar assignments.\n");
+ IntDiagMsg = {
+ "ConditionalAssignmentPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to conditional "
+ "assignments"};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 75cace77ec534..7b25731af19d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24868,6 +24868,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25009,6 +25010,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25115,6 +25117,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 53291a931530f..2ffe68fedee05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -548,6 +548,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntrinsicSC:
case VPRecipeBase::VPWidenSC:
case VPRecipeBase::VPWidenSelectSC:
+ case VPRecipeBase::VPWidenSelectVectorSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
@@ -1059,6 +1060,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ // Extracts the last active lane based on a predicate vector operand.
+ ExtractLastActive,
};
private:
@@ -1749,6 +1752,47 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
unsigned getOpcode() const { return Instruction::Select; }
+ VPValue *getCond() const { return getOperand(0); }
+
+ bool isInvariantCond() const {
+ return getCond()->isDefinedOutsideLoopRegions();
+ }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getCond() && isInvariantCond();
+ }
+};
+
+/// A recipe for selecting whole vector values.
+struct VPWidenSelectVectorRecipe : public VPRecipeWithIRFlags {
+ VPWidenSelectVectorRecipe(ArrayRef<VPValue *> Operands)
+ : VPRecipeWithIRFlags(VPDef::VPWidenSelectVectorSC, Operands) {}
+
+ ~VPWidenSelectVectorRecipe() override = default;
+
+ VPWidenSelectVectorRecipe *clone() override {
+ SmallVector<VPValue *, 3> Operands(operands());
+ return new VPWidenSelectVectorRecipe(Operands);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenSelectVectorSC)
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPWidenSelectVectorRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
VPValue *getCond() const {
return getOperand(0);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..a299ab8593a2f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,7 +115,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractPenultimateElement: {
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ExtractLastActive: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
@@ -308,7 +309,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPExpressionRecipe>([this](const auto *R) {
return inferScalarType(R->getOperandOfResultType());
- });
+ })
+ .Case<VPWidenSelectVectorRecipe>(
+ [this](const VPWidenSelectVectorRecipe *R) {
+ return inferScalarType(R->getOperand(1));
+ });
assert(ResultTy && "could not infer type for the given VPValue");
CachedTypes[V] = ResultTy;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf51489543098..598fa4888fe8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -86,7 +86,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -134,7 +135,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -177,7 +179,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -522,6 +525,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ExtractLastActive:
return 3;
case VPInstruction::ComputeFindIVResult:
return 4;
@@ -983,6 +987,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::ExtractLastActive: {
+ Value *Data = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
+ Type *VTy = Data->getType();
+
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *ExtractLast = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::experimental_vector_extract_last_active, {VTy});
+ return Builder.CreateCall(ExtractLast, {Data, Mask, Default});
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1119,6 +1134,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastActive: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(this);
+ Type *VecTy = toVectorTy(ScalarTy, VF);
+ Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_extract_last_active,
+ ScalarTy, {VecTy, MaskTy, ScalarTy});
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1174,6 +1197,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
+ getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1243,6 +1267,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -1414,6 +1439,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::ExtractLastActive:
+ O << "extract-last-active";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1927,7 +1955,9 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
Mask->printAsOperand(O, SlotTracker);
}
}
+#endif
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
@@ -2002,6 +2032,42 @@ InstructionCost VPWidenS...
[truncated]
|
|
@llvm/pr-subscribers-vectorizers Author: Graham Hunter (huntergr-arm) ChangesBased on Michael Maitland's previous work: This PR uses the existing recurrences code instead of introducing a I've enabled it by default to see the impact on tests; if there are I will be doing some performance runs on AArch64 to figure out Patch is 204.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158088.diff 19 Files Affected:
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index f9e6da6d0846a..afa175704a7b1 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -70,6 +70,9 @@ enum class RecurKind {
FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a UMax reduction.
+ FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
+ ///< can be any scalar type, one is the current recurrence
+ ///< value, and the other is an arbitrary value.
// clang-format on
// TODO: Any_of and FindLast reduction need not be restricted to integer type
// only.
@@ -183,6 +186,12 @@ class RecurrenceDescriptor {
PHINode *OrigPhi, Instruction *I,
ScalarEvolution &SE);
+ /// Returns a struct describing whether the instruction is of the form
+ /// Select(Cmp(A, B), X, Y)
+ /// where one of (X, Y) is the Phi value and the other is an arbitrary value.
+ LLVM_ABI static InstDesc isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop);
+
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
LLVM_ABI static InstDesc isConditionalRdxPattern(Instruction *I);
@@ -299,6 +308,12 @@ class RecurrenceDescriptor {
isFindLastIVRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is an arbitrary value.
+ static bool isFindLastRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::FindLast;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b8c540ce4b99d..bd87e9de46bd5 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ // TODO: Make type-agnostic.
+ case RecurKind::FindLast:
return true;
}
return false;
@@ -426,6 +428,8 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
if (isAnyOfRecurrenceKind(Kind) && IsASelect)
++NumCmpSelectPatternInst;
+ if (isFindLastRecurrenceKind(Kind) && IsASelect)
+ ++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
FoundReduxOp |= !IsAPhi && Cur != Start;
@@ -789,6 +793,38 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
return InstDesc(false, I);
}
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop) {
+ // Must be a scalar.
+ Type *Type = Phi->getType();
+ if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+ !Type->isPointerTy())
+ return InstDesc(false, I);
+
+ SelectInst *Select = dyn_cast<SelectInst>(I);
+ if (!Select)
+ return InstDesc(false, I);
+
+ // FIXME: Support more complex patterns, including multiple selects.
+ // Phi or Select must be used only outside the loop,
+ // except for each other.
+ auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
+ return all_of(V->users(), [Ignore, TheLoop](User *U) {
+ if (U == Ignore)
+ return true;
+ if (auto *I = dyn_cast<Instruction>(U))
+ return !TheLoop->contains(I);
+ return false;
+ });
+ };
+ if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+ !IsOnlyUsedOutsideLoop(Select, Phi))
+ return InstDesc(false, I);
+
+ return InstDesc(I, RecurKind::FindLast);
+}
+
RecurrenceDescriptor::InstDesc
RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
const InstDesc &Prev) {
@@ -927,6 +963,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
return isConditionalRdxPattern(I);
if (isFindIVRecurrenceKind(Kind) && SE)
return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
+ if (isFindLastRecurrenceKind(Kind))
+ return isFindLastPattern(I, OrigPhi, L);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1123,7 +1161,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
-
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
+ LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -1245,6 +1287,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
+ case RecurKind::FindLast:
return Instruction::ICmp;
case RecurKind::FMax:
case RecurKind::FMin:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 92321a76dbd80..6595c6e770be0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1004,6 +1004,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_extract_last_active:
+ if (ST->isSVEAvailable()) {
+ auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+ // This should turn into chained clastb instructions.
+ return LegalCost;
+ }
+ break;
default:
break;
}
@@ -5325,6 +5332,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
+ case RecurKind::FindLast:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b4acda80cfb93..ea85685cdf7b8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4047,6 +4047,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntrinsicSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
+ case VPDef::VPWidenSelectVectorSC:
case VPDef::VPBlendSC:
case VPDef::VPFirstOrderRecurrencePHISC:
case VPDef::VPHistogramSC:
@@ -4546,6 +4547,11 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPReductionPHIRecipe>);
+ // FIXME: implement interleaving for FindLast transform correctly.
+ for (auto &[_, RdxDesc] : Legal->getReductionVars())
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind()))
+ return 1;
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
@@ -8687,6 +8693,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences,
+ *Plan, RecipeBuilder, Legal);
+
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
@@ -8779,6 +8789,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind Kind = PhiR->getRecurrenceKind();
assert(
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
"AnyOf and FindIV reductions are not allowed for in-loop reductions");
@@ -8987,6 +8998,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
FinalReductionResult =
Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
{PhiR, Start, NewExitingVPV}, ExitDL);
+ } else if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ FinalReductionResult = Builder.createNaryOp(
+ VPInstruction::ExtractLastActive, {NewExitingVPV}, ExitDL);
} else {
VPIRFlags Flags =
RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind)
@@ -9076,7 +9091,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
@@ -10069,6 +10085,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
+ // FIXME: Enable interleaving for last_active reductions.
+ if (any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind());
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to conditional scalar assignments.\n");
+ IntDiagMsg = {
+ "ConditionalAssignmentPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to conditional "
+ "assignments"};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 75cace77ec534..7b25731af19d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24868,6 +24868,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25009,6 +25010,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25115,6 +25117,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 53291a931530f..2ffe68fedee05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -548,6 +548,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntrinsicSC:
case VPRecipeBase::VPWidenSC:
case VPRecipeBase::VPWidenSelectSC:
+ case VPRecipeBase::VPWidenSelectVectorSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
@@ -1059,6 +1060,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ // Extracts the last active lane based on a predicate vector operand.
+ ExtractLastActive,
};
private:
@@ -1749,6 +1752,47 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
unsigned getOpcode() const { return Instruction::Select; }
+ VPValue *getCond() const { return getOperand(0); }
+
+ bool isInvariantCond() const {
+ return getCond()->isDefinedOutsideLoopRegions();
+ }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getCond() && isInvariantCond();
+ }
+};
+
+/// A recipe for selecting whole vector values.
+struct VPWidenSelectVectorRecipe : public VPRecipeWithIRFlags {
+ VPWidenSelectVectorRecipe(ArrayRef<VPValue *> Operands)
+ : VPRecipeWithIRFlags(VPDef::VPWidenSelectVectorSC, Operands) {}
+
+ ~VPWidenSelectVectorRecipe() override = default;
+
+ VPWidenSelectVectorRecipe *clone() override {
+ SmallVector<VPValue *, 3> Operands(operands());
+ return new VPWidenSelectVectorRecipe(Operands);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenSelectVectorSC)
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPWidenSelectVectorRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
VPValue *getCond() const {
return getOperand(0);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..a299ab8593a2f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,7 +115,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractPenultimateElement: {
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ExtractLastActive: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
@@ -308,7 +309,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPExpressionRecipe>([this](const auto *R) {
return inferScalarType(R->getOperandOfResultType());
- });
+ })
+ .Case<VPWidenSelectVectorRecipe>(
+ [this](const VPWidenSelectVectorRecipe *R) {
+ return inferScalarType(R->getOperand(1));
+ });
assert(ResultTy && "could not infer type for the given VPValue");
CachedTypes[V] = ResultTy;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf51489543098..598fa4888fe8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -86,7 +86,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -134,7 +135,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -177,7 +179,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -522,6 +525,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ExtractLastActive:
return 3;
case VPInstruction::ComputeFindIVResult:
return 4;
@@ -983,6 +987,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::ExtractLastActive: {
+ Value *Data = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
+ Type *VTy = Data->getType();
+
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *ExtractLast = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::experimental_vector_extract_last_active, {VTy});
+ return Builder.CreateCall(ExtractLast, {Data, Mask, Default});
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1119,6 +1134,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastActive: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(this);
+ Type *VecTy = toVectorTy(ScalarTy, VF);
+ Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_extract_last_active,
+ ScalarTy, {VecTy, MaskTy, ScalarTy});
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1174,6 +1197,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
+ getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1243,6 +1267,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -1414,6 +1439,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::ExtractLastActive:
+ O << "extract-last-active";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1927,7 +1955,9 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
Mask->printAsOperand(O, SlotTracker);
}
}
+#endif
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
@@ -2002,6 +2032,42 @@ InstructionCost VPWidenS...
[truncated]
|
|
@llvm/pr-subscribers-backend-aarch64 Author: Graham Hunter (huntergr-arm) ChangesBased on Michael Maitland's previous work: This PR uses the existing recurrences code instead of introducing a I've enabled it by default to see the impact on tests; if there are I will be doing some performance runs on AArch64 to figure out Patch is 204.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158088.diff 19 Files Affected:
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index f9e6da6d0846a..afa175704a7b1 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -70,6 +70,9 @@ enum class RecurKind {
FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a UMax reduction.
+ FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y
+ ///< can be any scalar type, one is the current recurrence
+ ///< value, and the other is an arbitrary value.
// clang-format on
// TODO: Any_of and FindLast reduction need not be restricted to integer type
// only.
@@ -183,6 +186,12 @@ class RecurrenceDescriptor {
PHINode *OrigPhi, Instruction *I,
ScalarEvolution &SE);
+ /// Returns a struct describing whether the instruction is of the form
+ /// Select(Cmp(A, B), X, Y)
+ /// where one of (X, Y) is the Phi value and the other is an arbitrary value.
+ LLVM_ABI static InstDesc isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop);
+
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
LLVM_ABI static InstDesc isConditionalRdxPattern(Instruction *I);
@@ -299,6 +308,12 @@ class RecurrenceDescriptor {
isFindLastIVRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is an arbitrary value.
+ static bool isFindLastRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::FindLast;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b8c540ce4b99d..bd87e9de46bd5 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ // TODO: Make type-agnostic.
+ case RecurKind::FindLast:
return true;
}
return false;
@@ -426,6 +428,8 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
if (isAnyOfRecurrenceKind(Kind) && IsASelect)
++NumCmpSelectPatternInst;
+ if (isFindLastRecurrenceKind(Kind) && IsASelect)
+ ++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
FoundReduxOp |= !IsAPhi && Cur != Start;
@@ -789,6 +793,38 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
return InstDesc(false, I);
}
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi,
+ Loop *TheLoop) {
+ // Must be a scalar.
+ Type *Type = Phi->getType();
+ if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+ !Type->isPointerTy())
+ return InstDesc(false, I);
+
+ SelectInst *Select = dyn_cast<SelectInst>(I);
+ if (!Select)
+ return InstDesc(false, I);
+
+ // FIXME: Support more complex patterns, including multiple selects.
+ // Phi or Select must be used only outside the loop,
+ // except for each other.
+ auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
+ return all_of(V->users(), [Ignore, TheLoop](User *U) {
+ if (U == Ignore)
+ return true;
+ if (auto *I = dyn_cast<Instruction>(U))
+ return !TheLoop->contains(I);
+ return false;
+ });
+ };
+ if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+ !IsOnlyUsedOutsideLoop(Select, Phi))
+ return InstDesc(false, I);
+
+ return InstDesc(I, RecurKind::FindLast);
+}
+
RecurrenceDescriptor::InstDesc
RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
const InstDesc &Prev) {
@@ -927,6 +963,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
return isConditionalRdxPattern(I);
if (isFindIVRecurrenceKind(Kind) && SE)
return isFindIVPattern(Kind, L, OrigPhi, I, *SE);
+ if (isFindLastRecurrenceKind(Kind))
+ return isFindLastPattern(I, OrigPhi, L);
[[fallthrough]];
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1123,7 +1161,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< "\n");
return true;
}
-
+ if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
+ LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -1245,6 +1287,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
+ case RecurKind::FindLast:
return Instruction::ICmp;
case RecurKind::FMax:
case RecurKind::FMin:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 92321a76dbd80..6595c6e770be0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1004,6 +1004,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_extract_last_active:
+ if (ST->isSVEAvailable()) {
+ auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
+ // This should turn into chained clastb instructions.
+ return LegalCost;
+ }
+ break;
default:
break;
}
@@ -5325,6 +5332,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
+ case RecurKind::FindLast:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b4acda80cfb93..ea85685cdf7b8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4047,6 +4047,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenIntrinsicSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
+ case VPDef::VPWidenSelectVectorSC:
case VPDef::VPBlendSC:
case VPDef::VPFirstOrderRecurrencePHISC:
case VPDef::VPHistogramSC:
@@ -4546,6 +4547,11 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPReductionPHIRecipe>);
+ // FIXME: implement interleaving for FindLast transform correctly.
+ for (auto &[_, RdxDesc] : Legal->getReductionVars())
+ if (RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind()))
+ return 1;
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
@@ -8687,6 +8693,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan, Builder))
return nullptr;
+ // Create whole-vector selects for find-last recurrences.
+ VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences,
+ *Plan, RecipeBuilder, Legal);
+
if (useActiveLaneMask(Style)) {
// TODO: Move checks to VPlanTransforms::addActiveLaneMask once
// TailFoldingStyle is visible there.
@@ -8779,6 +8789,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind Kind = PhiR->getRecurrenceKind();
assert(
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
"AnyOf and FindIV reductions are not allowed for in-loop reductions");
@@ -8987,6 +8998,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
FinalReductionResult =
Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
{PhiR, Start, NewExitingVPV}, ExitDL);
+ } else if (RecurrenceDescriptor::isFindLastRecurrenceKind(
+ RdxDesc.getRecurrenceKind())) {
+ FinalReductionResult = Builder.createNaryOp(
+ VPInstruction::ExtractLastActive, {NewExitingVPV}, ExitDL);
} else {
VPIRFlags Flags =
RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind)
@@ -9076,7 +9091,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
RecurKind RK = RdxDesc.getRecurrenceKind();
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
- !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) {
VPBuilder PHBuilder(Plan->getVectorPreheader());
VPValue *Iden = Plan->getOrAddLiveIn(
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
@@ -10069,6 +10085,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
+ // FIXME: Enable interleaving for last_active reductions.
+ if (any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RecurrenceDescriptor::isFindLastRecurrenceKind(RdxDesc.getRecurrenceKind());
+ })) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
+ << "to conditional scalar assignments.\n");
+ IntDiagMsg = {
+ "ConditionalAssignmentPreventsScalarInterleaving",
+ "Unable to interleave without vectorization due to conditional "
+ "assignments"};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 75cace77ec534..7b25731af19d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24868,6 +24868,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25009,6 +25010,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
@@ -25115,6 +25117,7 @@ class HorizontalReduction {
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
+ case RecurKind::FindLast:
case RecurKind::FMaxNum:
case RecurKind::FMinNum:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 53291a931530f..2ffe68fedee05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -548,6 +548,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntrinsicSC:
case VPRecipeBase::VPWidenSC:
case VPRecipeBase::VPWidenSelectSC:
+ case VPRecipeBase::VPWidenSelectVectorSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
case VPRecipeBase::VPCanonicalIVPHISC:
@@ -1059,6 +1060,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ // Extracts the last active lane based on a predicate vector operand.
+ ExtractLastActive,
};
private:
@@ -1749,6 +1752,47 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
unsigned getOpcode() const { return Instruction::Select; }
+ VPValue *getCond() const { return getOperand(0); }
+
+ bool isInvariantCond() const {
+ return getCond()->isDefinedOutsideLoopRegions();
+ }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getCond() && isInvariantCond();
+ }
+};
+
+/// A recipe for selecting whole vector values.
+struct VPWidenSelectVectorRecipe : public VPRecipeWithIRFlags {
+ VPWidenSelectVectorRecipe(ArrayRef<VPValue *> Operands)
+ : VPRecipeWithIRFlags(VPDef::VPWidenSelectVectorSC, Operands) {}
+
+ ~VPWidenSelectVectorRecipe() override = default;
+
+ VPWidenSelectVectorRecipe *clone() override {
+ SmallVector<VPValue *, 3> Operands(operands());
+ return new VPWidenSelectVectorRecipe(Operands);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenSelectVectorSC)
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPWidenSelectVectorRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
VPValue *getCond() const {
return getOperand(0);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..a299ab8593a2f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,7 +115,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractPenultimateElement: {
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ExtractLastActive: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
@@ -308,7 +309,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPExpressionRecipe>([this](const auto *R) {
return inferScalarType(R->getOperandOfResultType());
- });
+ })
+ .Case<VPWidenSelectVectorRecipe>(
+ [this](const VPWidenSelectVectorRecipe *R) {
+ return inferScalarType(R->getOperand(1));
+ });
assert(ResultTy && "could not infer type for the given VPValue");
CachedTypes[V] = ResultTy;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf51489543098..598fa4888fe8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -86,7 +86,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -134,7 +135,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -177,7 +179,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
- case VPWidenSelectSC: {
+ case VPWidenSelectSC:
+ case VPWidenSelectVectorSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
(void)I;
@@ -522,6 +525,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
+ case VPInstruction::ExtractLastActive:
return 3;
case VPInstruction::ComputeFindIVResult:
return 4;
@@ -983,6 +987,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
+ case VPInstruction::ExtractLastActive: {
+ Value *Data = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
+ Type *VTy = Data->getType();
+
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *ExtractLast = Intrinsic::getOrInsertDeclaration(
+ M, Intrinsic::experimental_vector_extract_last_active, {VTy});
+ return Builder.CreateCall(ExtractLast, {Data, Mask, Default});
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1119,6 +1134,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ExtractLastActive: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(this);
+ Type *VecTy = toVectorTy(ScalarTy, VF);
+ Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_extract_last_active,
+ ScalarTy, {VecTy, MaskTy, ScalarTy});
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
SmallVector<int> Mask(VF.getKnownMinValue());
@@ -1174,6 +1197,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
+ getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1243,6 +1267,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
+ case VPInstruction::ExtractLastActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
@@ -1414,6 +1439,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::ExtractLastActive:
+ O << "extract-last-active";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1927,7 +1955,9 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
Mask->printAsOperand(O, SlotTracker);
}
}
+#endif
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";
@@ -2002,6 +2032,42 @@ InstructionCost VPWidenS...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Mel-Chen
left a comment
There was a problem hiding this comment.
Thank you. I've reviewed part of the code, and will continue the review in October, as I'll be on vacation until the end of September.
| ///< (x,y) is increasing loop induction, and both x and y | ||
| ///< are integer type, producing a UMax reduction. | ||
| FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y | ||
| ///< can be any scalar type, one is the current recurrence |
There was a problem hiding this comment.
Since isIntegerRecurrenceKind returns true for FindLast, I suggest
| ///< can be any scalar type, one is the current recurrence | |
| ///< are integer type, one is the current recurrence |
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| if (isFindLastRecurrenceKind(Kind) && IsASelect) | ||
| ++NumCmpSelectPatternInst; |
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi, | ||
| Loop *TheLoop) { |
There was a problem hiding this comment.
Could we reuse RecurrenceDescriptor::isFindIVPattern?
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| case RecurKind::SMin: | ||
| case RecurKind::UMax: | ||
| case RecurKind::UMin: | ||
| case RecurKind::FindLast: |
There was a problem hiding this comment.
#162252
icmp and fcmp shouldn’t be part of the recurrence chain. I think the opcode should return Instruction::Select instead.
d066b28 to
1ff7778
Compare
|
Rebased, addressed comments. Thanks for reviewing :) |
MacDue
left a comment
There was a problem hiding this comment.
Some initial comments (mostly nitpicks), I've not looked at everything in full detail yet, but the general concept makes sense to me.
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| m_Value(NonRdxPhi))))) | ||
| return InstDesc(false, I); | ||
|
|
||
| if (isFindLastRecurrenceKind(Kind)) { |
There was a problem hiding this comment.
Above (not this line), this function is called isFindIVPattern, and has a large comment explaining that with an example. Maybe update that documentation to explain the "FindLastRecurance" case (and possibly rename the function to account for that too?).
There was a problem hiding this comment.
There's also the comment above for We are looking for selects of the form: which should now in include the FindLast case.
1ff7778 to
e6fc0cf
Compare
fhahn
left a comment
There was a problem hiding this comment.
I will be doing some performance runs on AArch64 to figure out
the cost model, as we mostly regard vector selects as per-lane
instead of selecting the whole vector at once.
Did you already get a chance to run preliminary benchmarks? Would be very curious what the impact is, possibly for some microbenchmarks (https://github.com/llvm/llvm-test-suite/tree/main/MicroBenchmarks/LoopVectorization would be a good place)
| } | ||
| break; | ||
| } | ||
| case Intrinsic::experimental_vector_extract_last_active: |
There was a problem hiding this comment.
Could this be split off, with dedicated cost model tests?
| } | ||
| }; | ||
|
|
||
| /// A recipe for selecting whole vector values based on a scalar condition. |
There was a problem hiding this comment.
do we need a dedicated recipe for this? If we have a scalar condition, broadcasting should be fine, although more vebose than needed?
We should be able to handle single-scalar conditions as well, which may be enough? #165506 (comment)
There was a problem hiding this comment.
No, we don't need a dedicated recipe. I'll rebase on top of that other PR when available.
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| m_Value(NonRdxPhi))))) | ||
| return InstDesc(false, I); | ||
|
|
||
| if (isFindLastRecurrenceKind(Kind)) { |
There was a problem hiding this comment.
There's also the comment above for We are looking for selects of the form: which should now in include the FindLast case.
e6fc0cf to
abea562
Compare
|
Rebased on top of the cost model change, and I removed the extra select recipe in favor of the broadcast approach. We can change that to the scalar condition later. |
| ; CHECK-VF4IC1: [[FOR_BODY]]: | ||
| ; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] | ||
| ; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] | ||
| ; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] |
There was a problem hiding this comment.
I think the name/comment for not_vectorized_select_icmp_truncated_iv_out_of_bound missed being updated.
| ; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE | ||
|
|
||
| target triple = "aarch64-linux-gnu" | ||
|
|
There was a problem hiding this comment.
How about testing a pointer type for the data phi, and some negative tests for the cases we bail out for in isFindPattern (e.g. multiple users in the loop)?
There was a problem hiding this comment.
Done. Added tests for pointers and floats, then multiple in-loop users of the select, multiple users of the compare, and chained selects.
abea562 to
049428d
Compare
So running with SVE enabled, we vectorize 7 loops in find-last.cpp from the test suite. I don't see any noticeable performance impact from the vectorization, but total execution time is on the order of a couple of milliseconds, so I maybe need to increase the amount of data for a microbenchmark to see anything. That said, this is the more generic form of this reduction. I intend to follow this part up with an in-loop version of the reduction, so that targets with SVE can just use clastb inside the loop instead of maintaining the extra phi for the mask. I'll try some spec runs and see if there's any interesting results there. |
MacDue
left a comment
There was a problem hiding this comment.
Generally LGTM. Thanks for addressing my comments!
Needing to add the VPLastActiveMaskPHIRecipe is a bit unfortunate, but I don't have another suggestion (maybe other reviewers do).
sdesmalen-arm
left a comment
There was a problem hiding this comment.
Left a few nits, but otherwise looks reasonable to me.
| case RecurKind::FindFirstIVUMin: | ||
| case RecurKind::FindLastIVSMax: | ||
| case RecurKind::FindLastIVUMax: | ||
| // TODO: Make type-agnostic. |
There was a problem hiding this comment.
If this is not type-agnostic, should this be reflected in the name of the recurrence kind?
There was a problem hiding this comment.
It isn't for the other FindFirst/FindLast (though that might be inferred by U/S Min/Max) or AnyOf. I think it's just the fp-based reduction types that are prefixed with an extra F.
I did experiment with treating FindLast separately in AddReductionVar when it checks the type and everything was fine, but decided to leave that out of the initial patch.
| // Override IC if user provided an interleave count. | ||
| IC = UserIC > 0 ? UserIC : IC; | ||
|
|
||
| // FIXME: Enable interleaving for last_active reductions. |
There was a problem hiding this comment.
What would be required to enable interleaving?
There was a problem hiding this comment.
Completing the final reduction outside of the loop, especially the mask phis.
fhahn
left a comment
There was a problem hiding this comment.
So running with SVE enabled, we vectorize 7 loops in find-last.cpp from the test suite. I don't see any noticeable performance impact from the vectorization, but total execution time is on the order of a couple of milliseconds, so I maybe need to increase the amount of data for a microbenchmark to see anything.
Yes that is only intended to test correctness; as I mentioned earlier, it would be good to add some MicroBenchmarks, to make this easier to measure
llvm/lib/Analysis/IVDescriptors.cpp
Outdated
| for (User *U : I->users()) { | ||
| if (U == OrigPhi) | ||
| continue; | ||
| if (auto *UI = dyn_cast<Instruction>(U); UI && !TheLoop->contains(UI)) |
There was a problem hiding this comment.
I think we generally avoid this kind of pattern in LV/IVDesc.
There was a problem hiding this comment.
Removed. This was basically a (mutated) leftover from Michael's code, since it had entirely separate analysis logic.
There was a problem hiding this comment.
Is the FIXME still relevant then?
// FIXME: Support more complex patterns, including multiple selects.
// The Select must be used only outside the loop and by the PHI.
There was a problem hiding this comment.
Yes, though some (all?) of the changes might not be in that area of code. The extra unit tests cover some of the patterns we might want.
| continue; | ||
| } | ||
| } | ||
| } else if (isa<VPLastActiveMaskPHIRecipe>(R)) { |
There was a problem hiding this comment.
why is this needed? Epilogue vectorization is disabled, right?
There was a problem hiding this comment.
I added that before disabling it entirely. Removed (along with the recipe)
| for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { | ||
| if (RecurrenceDescriptor::isFindLastRecurrenceKind( | ||
| RdxDesc.getRecurrenceKind())) { | ||
| VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi); |
There was a problem hiding this comment.
Please avoid going through Legal/RecipeBuilder here if possible and instead use the information directly available in VPlan.
| #endif | ||
| }; | ||
|
|
||
| // TODO: Can we unify the PHI recipe hierarchy a bit? VPPredInstPHISC is close |
There was a problem hiding this comment.
THis just lowers to a wide phi, right? Can you just use VPWidenPHIRecipe?
There was a problem hiding this comment.
It does, but VPWidenPHIRecipe::clone casts the underlying value to a PHINode. Since there isn't one, that will assert if/when we enable e.g. interleaving. I think that's the same for VPActiveLaneMaskPHIRecipe, which has a comment noting it would be good to drop the specialized recipe in favour of VPWidenPHIRecipe.
I could adjust ::clone() to work (using cast_if_present) if that works for you? I'm not sure if there's any other uses of the underlying value for those recipes; I'll try it and see if any test fails.
There was a problem hiding this comment.
It seems that there isn't a computeCost method for VPWidenPHIRecipe yet either.
There was a problem hiding this comment.
done; assuming you're happy with VPWidenPHIRecipe no longer requiring an underlying value.
| ; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> | ||
| ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> | ||
| ; CHECK-NEXT: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir<false>, vp<%8> | ||
| ; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> | ||
| ; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> | ||
| ; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr> | ||
| ; CHECK-NEXT: WIDEN ir<%ld> = load vp<%6> | ||
| ; CHECK-NEXT: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld> | ||
| ; CHECK-NEXT: EMIT vp<%7> = any-of ir<%select.cmp> | ||
| ; CHECK-NEXT: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4> | ||
| ; CHECK-NEXT: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi> |
There was a problem hiding this comment.
Please use patterns for the unnamed vpvalues throughout
|
Any follow-up comments now that there's a microbenchmark for this? (See llvm/llvm-test-suite#295) I'd like to commit this week if possible. |
|
|
||
| ;; The following run line caused an ICE before using a dedicated FindLast PHI recipe. | ||
| ;; We're not looking at the resulting IR, just confirming it doesn't crash. | ||
| ; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null |
There was a problem hiding this comment.
For this, it seems a phase ordering test would be more appropriate?
| ; RUN: opt -passes=loop-vectorize,instcombine -S < %s 2>&1 | FileCheck %s --check-prefix=NEON | ||
| ; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE |
There was a problem hiding this comment.
Could we remove instcombine, to avoid the test being impacted by unrelated instcombine changes
| FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of | ||
| ///< (x,y) is increasing loop induction, and both x and y | ||
| ///< are integer type, producing a UMax reduction. | ||
| FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y |
There was a problem hiding this comment.
I think at least loop-unrolling also needs to be thought about the new kind, seeing crashes when building the test suite currently. I think to reproduce you can just add a loop with CAS to https://github.com/llvm/llvm-project/blob/main/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll.
There was a problem hiding this comment.
done. I think I left it alone originally since I made FindLast a generic RecurKind that could handle int, float, and pointer types (and it therefore didn't appear in the isIntegerRecurrenceKind list.) I figured that could be a follow-up PR (and could potentially convert AnyOf at the same time).
| @@ -0,0 +1,123 @@ | |||
| ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ | |||
| ; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors \ | |||
| ; RUN: -disable-output 2>&1 < %s | FileCheck %s | |||
There was a problem hiding this comment.
Does this test need scalable vectors?
There was a problem hiding this comment.
no. switched to a fixed width VF (forced to 4)
| ; CHECK-NEXT: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]] | ||
| ; CHECK-NEXT: IR [[LD]] = load i32, ptr [[LDADDR]], align 4 | ||
| ; CHECK-NEXT: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]] | ||
| ; CHECK-NEXT: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]] | ||
| ; CHECK-NEXT: IR [[IVNEXT]] = add nuw nsw i64 [[IV]], 1 | ||
| ; CHECK-NEXT: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]] | ||
| ; CHECK-NEXT: No successors |
| ; CHECK-NEXT: No successors | ||
| ; CHECK-NEXT: } | ||
|
|
||
| ; CHECK: Cost of 1 for VF vscale x 1: induction instruction [[IVNEXT]] = add nuw nsw i64 [[IV]], 1 |
…rue. Fix a miscompile in the FindLast handling by normalizing selects with the phi node as the first op to ones that select the data value when the condition is true, by swapping operands and inverting the condition. This should ensure correct codegen for both cases. Select normalization: https://alive2.llvm.org/ce/z/yFdivK Fixes a miscompile reported for 2abd6d6 (#158088).
|
This is currently preventing us from making a new emscripten release, due to the failure mention above in #158088 (comment). If its not easily fixable, can we please revert? |
|
The miscompile should be fixed by 5995fe9 |
This is a bug in the backend, can you provide a reproducer? |
thanks! |
|
@sbc100 I've marked the problematic intrinsic as having invalid cost on WebAssembly in 811fb22. That should fix the crash, but it would be great if you could provide a reproducer for the crash and possibly also one for the loop-vectorizer emitting last.active.lane for WebAssembly, simple ones I oculd write were not profitable to vectorize |
The failure is coming from llvm-test-suite/SingleSource/UnitTests/Vectorizer/find-last.cpp which is port of the llvm test suite. I don't have that setup locally I'm afraid, this failure is coming from one of our CI machine. |
I can confirm the problem has been fixed. To be honest, the reproduction conditions are a bit complicated, it takes three iterative ThinLTO+PGO builds with avx512 enabled to finally get a clang that crashes randomly. |
Based on Michael Maitland's previous work: llvm#121222 This PR uses the existing recurrences code instead of introducing a new pass just for CSA autovec. I've also made recipes that are more generic.
Turn assertion from 2abd6d6 (llvm#158088) into a bail out to prevent crash when tail-folding. Fixes llvm#175990.
Add tests showing mis-compiles caused by 2abd6d6 (llvm#158088).
…rue. Fix a miscompile in the FindLast handling by normalizing selects with the phi node as the first op to ones that select the data value when the condition is true, by swapping operands and inverting the condition. This should ensure correct codegen for both cases. Select normalization: https://alive2.llvm.org/ce/z/yFdivK Fixes a miscompile reported for 2abd6d6 (llvm#158088).
Currently the WebAssembly backend crashes when trying to lower some extract.last.active intrinsic calls. Mark their cost as invalid temporarily, to avoid them being introduced by the loop vectorizer after 2abd6d6 (llvm#158088).
|
Emscripten tree is green again: https://ci.chromium.org/p/emscripten-releases/g/main/console. Thanks! |
|
For a reproducer of the wasm crash, see https://github.com/kripken/emscripten_/blob/vectorize_crash/test/vectorize_crash.cpp |
Based on Michael Maitland's previous work: llvm#121222 This PR uses the existing recurrences code instead of introducing a new pass just for CSA autovec. I've also made recipes that are more generic.
Turn assertion from 2abd6d6 (llvm#158088) into a bail out to prevent crash when tail-folding. Fixes llvm#175990.
Add tests showing mis-compiles caused by 2abd6d6 (llvm#158088).
…rue. Fix a miscompile in the FindLast handling by normalizing selects with the phi node as the first op to ones that select the data value when the condition is true, by swapping operands and inverting the condition. This should ensure correct codegen for both cases. Select normalization: https://alive2.llvm.org/ce/z/yFdivK Fixes a miscompile reported for 2abd6d6 (llvm#158088).
Currently the WebAssembly backend crashes when trying to lower some extract.last.active intrinsic calls. Mark their cost as invalid temporarily, to avoid them being introduced by the loop vectorizer after 2abd6d6 (llvm#158088).
This patch extends the support added in llvm#158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: ``` int simple_csa_int_load( int* a, int* b, int default_val, int N, int threshold) { int result = default_val; for (int i = 0; i < N; ++i) if (a[i] > threshold) result = b[i]; return result; } ``` It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence.
This patch extends the support added in llvm#158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: ``` int simple_csa_int_load( int* a, int* b, int default_val, int N, int threshold) { int result = default_val; for (int i = 0; i < N; ++i) if (a[i] > threshold) result = b[i]; return result; } ``` It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence.
…8862) This patch extends the support added in #158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: ``` int simple_csa_int_load( int* a, int* b, int default_val, int N, int threshold) { int result = default_val; for (int i = 0; i < N; ++i) if (a[i] > threshold) result = b[i]; return result; } ``` It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence.
…m#178862) This patch extends the support added in llvm#158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: ``` int simple_csa_int_load( int* a, int* b, int default_val, int N, int threshold) { int result = default_val; for (int i = 0; i < N; ++i) if (a[i] > threshold) result = b[i]; return result; } ``` It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence.
…m#178862) This patch extends the support added in llvm#158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: ``` int simple_csa_int_load( int* a, int* b, int default_val, int N, int threshold) { int result = default_val; for (int i = 0; i < N; ++i) if (a[i] > threshold) result = b[i]; return result; } ``` It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence.
…ons" (#180708) This patch extends the support added in #158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: ``` int simple_csa_int_load( int* a, int* b, int default_val, int N, int threshold) { int result = default_val; for (int i = 0; i < N; ++i) if (a[i] > threshold) result = b[i]; return result; } ``` It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence. --- Reverts #180275 (original PR: #178862) Additional type legalization for `ISD::VECTOR_FIND_LAST_ACTIVE` was added in #180290, which should resolve the backend crashes on x86.
…m#178862) This patch extends the support added in llvm#158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: ``` int simple_csa_int_load( int* a, int* b, int default_val, int N, int threshold) { int result = default_val; for (int i = 0; i < N; ++i) if (a[i] > threshold) result = b[i]; return result; } ``` It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence.
Based on Michael Maitland's previous work:
#121222
This PR uses the existing recurrences code instead of introducing a
new pass just for CSA autovec. I've also made recipes that are more
generic.
I've enabled it by default to see the impact on tests; if there are
regressions we can put it behind a cli option. I haven't corrected
all the comments for the tests, I'll wait until we decide whether
to keep it enabled by default first.
I will be doing some performance runs on AArch64 to figure out
the cost model, as we mostly regard vector selects as per-lane
instead of selecting the whole vector at once.