Reland "[LV] Support conditional scalar assignments of masked operations"#180708
Reland "[LV] Support conditional scalar assignments of masked operations"#180708
Conversation
… operati…" This reverts commit 703c276.
|
@llvm/pr-subscribers-vectorizers Author: Benjamin Maxwell (MacDue) ChangesThis patch extends the support added in #158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence. Reverts llvm/llvm-project#180275 (original PR: #178862) Additional type legalization for Patch is 80.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/180708.diff 7 Files Affected:
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index cf8567673e5e1..e79b609cdad85 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -261,6 +261,23 @@ static bool isMinMaxReductionPhiWithUsersOutsideReductionChain(
return true;
}
+// This matches a phi that selects between the original value (HeaderPhi) and an
+// arbitrary non-reduction value.
+static bool isFindLastLikePhi(PHINode *Phi, PHINode *HeaderPhi,
+ SmallPtrSetImpl<Instruction *> &ReductionInstrs) {
+ unsigned NumNonReduxInputs = 0;
+ for (const Value *Op : Phi->operands()) {
+ if (!ReductionInstrs.contains(dyn_cast<Instruction>(Op))) {
+ if (++NumNonReduxInputs > 1)
+ return false;
+ } else if (Op != HeaderPhi) {
+ // TODO: Remove this restriction once chained phis are supported.
+ return false;
+ }
+ }
+ return NumNonReduxInputs == 1;
+}
+
bool RecurrenceDescriptor::AddReductionVar(
PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
@@ -308,6 +325,14 @@ bool RecurrenceDescriptor::AddReductionVar(
unsigned NumCmpSelectPatternInst = 0;
InstDesc ReduxDesc(false, nullptr);
+ // To recognize find-lasts of conditional operations (such as loads or
+ // divides), that need masking, we track non-phi users and if we've found a
+ // "find-last-like" phi (see isFindLastLikePhi). We currently only support
+ // find-last reduction chains with a single "find-last-like" phi and do not
+ // allow any other operations.
+ unsigned NumNonPHIUsers = 0;
+ bool FoundFindLastLikePhi = false;
+
// Data used for determining if the recurrence has been type-promoted.
Type *RecurrenceType = Phi->getType();
SmallPtrSet<Instruction *, 4> CastInsts;
@@ -415,6 +440,8 @@ bool RecurrenceDescriptor::AddReductionVar(
return false;
bool IsAPhi = isa<PHINode>(Cur);
+ if (!IsAPhi)
+ ++NumNonPHIUsers;
// A header PHI use other than the original PHI.
if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
@@ -471,9 +498,21 @@ bool RecurrenceDescriptor::AddReductionVar(
!isAnyOfRecurrenceKind(Kind) && hasMultipleUsesOf(Cur, VisitedInsts, 1))
return false;
- // All inputs to a PHI node must be a reduction value.
- if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
- return false;
+ // All inputs to a PHI node must be a reduction value, unless the phi is a
+ // "FindLast-like" phi (described below).
+ if (IsAPhi && Cur != Phi) {
+ if (!areAllUsesIn(Cur, VisitedInsts)) {
+ // A "FindLast-like" phi acts like a conditional select between the
+ // previous reduction value, and an arbitrary value. Note: Multiple
+ // "FindLast-like" phis are not supported; see:
+ // IVDescriptorsTest.UnsupportedFindLastPhi.
+ FoundFindLastLikePhi =
+ Kind == RecurKind::FindLast && !FoundFindLastLikePhi &&
+ isFindLastLikePhi(cast<PHINode>(Cur), Phi, VisitedInsts);
+ if (!FoundFindLastLikePhi)
+ return false;
+ }
+ }
if (isIntMinMaxRecurrenceKind(Kind) && (isa<ICmpInst>(Cur) || IsASelect))
++NumCmpSelectPatternInst;
@@ -483,7 +522,7 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
- FoundReduxOp |= !IsAPhi && Cur != Start;
+ FoundReduxOp |= (!IsAPhi || FoundFindLastLikePhi) && Cur != Start;
// Process users of current instruction. Push non-PHI nodes after PHI nodes
// onto the stack. This way we are going to have seen all inputs to PHI
@@ -557,6 +596,12 @@ bool RecurrenceDescriptor::AddReductionVar(
Worklist.append(NonPHIs.begin(), NonPHIs.end());
}
+ // We only expect to match a single "find-last-like" phi per find-last
+ // reduction, with no non-phi operations in the reduction use chain.
+ assert((!FoundFindLastLikePhi ||
+ (Kind == RecurKind::FindLast && NumNonPHIUsers == 0)) &&
+ "Unexpectedly matched a 'find-last-like' phi");
+
// This means we have seen one but not the other instruction of the
// pattern or more than just a select and cmp. Zero implies that we saw a
// llvm.min/max intrinsic, which is always OK.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 91d73671cf26a..9b4ae56e7f175 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1357,11 +1357,34 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan) {
PhiR->getRecurrenceKind()))
continue;
- // Find the condition for the select.
+ // Find the condition for the select/blend.
auto *SelectR = cast<VPSingleDefRecipe>(&PhiR->getBackedgeRecipe());
VPValue *Cond = nullptr, *Op1 = nullptr, *Op2 = nullptr;
+
+ // If we're matching a blend rather than a select, there should be one
+ // incoming value which is the data, then all other incoming values should
+ // be the phi.
+ auto MatchBlend = [&](VPRecipeBase *R) {
+ auto *Blend = dyn_cast<VPBlendRecipe>(R);
+ if (!Blend)
+ return false;
+ assert(!Blend->isNormalized() && "must run before blend normalization");
+ unsigned NumIncomingDataValues = 0;
+ for (unsigned I = 0; I < Blend->getNumIncomingValues(); ++I) {
+ VPValue *Incoming = Blend->getIncomingValue(I);
+ if (Incoming != PhiR) {
+ ++NumIncomingDataValues;
+ Cond = Blend->getMask(I);
+ Op1 = Incoming;
+ Op2 = PhiR;
+ }
+ }
+ return NumIncomingDataValues == 1;
+ };
+
if (!match(SelectR,
- m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2))))
+ m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2))) &&
+ !MatchBlend(SelectR))
return false;
// Add mask phi.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 88cd5129b51ab..5708abfaf6f5b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2758,7 +2758,7 @@ void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
if (I != 0)
O << " ";
getIncomingValue(I)->printAsOperand(O, SlotTracker);
- if (I == 0)
+ if (I == 0 && isNormalized())
continue;
O << "/";
getMask(I)->printAsOperand(O, SlotTracker);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index aa31b0ff2bb1a..7053aa60b2035 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -823,6 +823,1150 @@ exit:
ret i32 %select.data
}
+; This test is derived from the following C program:
+; int simple_csa_int_load(int* a, int* b, int default_val, int N, int threshold) {
+; int result = default_val;
+; for (int i = 0; i < N; ++i)
+; if (a[i] > threshold)
+; result = b[i];
+; return result;
+; }
+define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
+; NEON-LABEL: define i32 @simple_csa_int_load(
+; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; NEON: [[IF_THEN]]:
+; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
+; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; NEON-NEXT: br label %[[LATCH]]
+; NEON: [[LATCH]]:
+; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+; SVE-LABEL: define i32 @simple_csa_int_load(
+; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; SVE-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; SVE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
+; SVE-NEXT: [[TMP8:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
+; SVE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
+; SVE-NEXT: [[TMP10]] = select i1 [[TMP9]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP11]] = select i1 [[TMP9]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
+; SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP13]])
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; SVE: [[IF_THEN]]:
+; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
+; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; SVE-NEXT: br label %[[LATCH]]
+; SVE: [[LATCH]]:
+; SVE-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
+; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+ %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
+ %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+ %ld.a = load i32, ptr %a.addr, align 4
+ %if.cond = icmp sgt i32 %ld.a, %threshold
+ br i1 %if.cond, label %if.then, label %latch
+
+if.then:
+ %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
+ %ld.b = load i32, ptr %b.addr, align 4
+ br label %latch
+
+latch:
+ %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ]
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
+
+; This test is derived from the following C program:
+; int simple_csa_int_divide(int* a, int* b, int default_val, int N, int threshold) {
+; int result = default_val;
+; for (int i = 0; i < N; ++i)
+; if (a[i] > threshold)
+; result = 42 / a[i];
+; return result;
+; }
+define i32 @simple_csa_int_divide(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
+; NEON-LABEL: define i32 @simple_csa_int_divide(
+; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; NEON: [[IF_THEN]]:
+; NEON-NEXT: [[DIV:%.*]] = sdiv i32 42, [[LD_A]]
+; NEON-NEXT: br label %[[LATCH]]
+; NEON: [[LATCH]]:
+; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[DIV]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+; SVE-LABEL: define i32 @simple_csa_int_divide(
+; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; SVE-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; SVE-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> splat (i32 1)
+; SVE-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i32> splat (i32 42), [[TMP7]]
+; SVE-NEXT: [[TMP9:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
+; SVE-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; SVE-NEXT: [[TMP11]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP12]] = select i1 [[TMP10]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
+; SVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i1> [[TMP11]], i32 [[TMP14]])
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; SVE-NEXT: br i1 [[IF_CON...
[truncated]
|
|
@llvm/pr-subscribers-llvm-analysis Author: Benjamin Maxwell (MacDue) ChangesThis patch extends the support added in #158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, the following loop can now be vectorized: It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence. Reverts llvm/llvm-project#180275 (original PR: #178862) Additional type legalization for Patch is 80.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/180708.diff 7 Files Affected:
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index cf8567673e5e1..e79b609cdad85 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -261,6 +261,23 @@ static bool isMinMaxReductionPhiWithUsersOutsideReductionChain(
return true;
}
+// This matches a phi that selects between the original value (HeaderPhi) and an
+// arbitrary non-reduction value.
+static bool isFindLastLikePhi(PHINode *Phi, PHINode *HeaderPhi,
+ SmallPtrSetImpl<Instruction *> &ReductionInstrs) {
+ unsigned NumNonReduxInputs = 0;
+ for (const Value *Op : Phi->operands()) {
+ if (!ReductionInstrs.contains(dyn_cast<Instruction>(Op))) {
+ if (++NumNonReduxInputs > 1)
+ return false;
+ } else if (Op != HeaderPhi) {
+ // TODO: Remove this restriction once chained phis are supported.
+ return false;
+ }
+ }
+ return NumNonReduxInputs == 1;
+}
+
bool RecurrenceDescriptor::AddReductionVar(
PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
@@ -308,6 +325,14 @@ bool RecurrenceDescriptor::AddReductionVar(
unsigned NumCmpSelectPatternInst = 0;
InstDesc ReduxDesc(false, nullptr);
+ // To recognize find-lasts of conditional operations (such as loads or
+ // divides), that need masking, we track non-phi users and if we've found a
+ // "find-last-like" phi (see isFindLastLikePhi). We currently only support
+ // find-last reduction chains with a single "find-last-like" phi and do not
+ // allow any other operations.
+ unsigned NumNonPHIUsers = 0;
+ bool FoundFindLastLikePhi = false;
+
// Data used for determining if the recurrence has been type-promoted.
Type *RecurrenceType = Phi->getType();
SmallPtrSet<Instruction *, 4> CastInsts;
@@ -415,6 +440,8 @@ bool RecurrenceDescriptor::AddReductionVar(
return false;
bool IsAPhi = isa<PHINode>(Cur);
+ if (!IsAPhi)
+ ++NumNonPHIUsers;
// A header PHI use other than the original PHI.
if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
@@ -471,9 +498,21 @@ bool RecurrenceDescriptor::AddReductionVar(
!isAnyOfRecurrenceKind(Kind) && hasMultipleUsesOf(Cur, VisitedInsts, 1))
return false;
- // All inputs to a PHI node must be a reduction value.
- if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
- return false;
+ // All inputs to a PHI node must be a reduction value, unless the phi is a
+ // "FindLast-like" phi (described below).
+ if (IsAPhi && Cur != Phi) {
+ if (!areAllUsesIn(Cur, VisitedInsts)) {
+ // A "FindLast-like" phi acts like a conditional select between the
+ // previous reduction value, and an arbitrary value. Note: Multiple
+ // "FindLast-like" phis are not supported; see:
+ // IVDescriptorsTest.UnsupportedFindLastPhi.
+ FoundFindLastLikePhi =
+ Kind == RecurKind::FindLast && !FoundFindLastLikePhi &&
+ isFindLastLikePhi(cast<PHINode>(Cur), Phi, VisitedInsts);
+ if (!FoundFindLastLikePhi)
+ return false;
+ }
+ }
if (isIntMinMaxRecurrenceKind(Kind) && (isa<ICmpInst>(Cur) || IsASelect))
++NumCmpSelectPatternInst;
@@ -483,7 +522,7 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
- FoundReduxOp |= !IsAPhi && Cur != Start;
+ FoundReduxOp |= (!IsAPhi || FoundFindLastLikePhi) && Cur != Start;
// Process users of current instruction. Push non-PHI nodes after PHI nodes
// onto the stack. This way we are going to have seen all inputs to PHI
@@ -557,6 +596,12 @@ bool RecurrenceDescriptor::AddReductionVar(
Worklist.append(NonPHIs.begin(), NonPHIs.end());
}
+ // We only expect to match a single "find-last-like" phi per find-last
+ // reduction, with no non-phi operations in the reduction use chain.
+ assert((!FoundFindLastLikePhi ||
+ (Kind == RecurKind::FindLast && NumNonPHIUsers == 0)) &&
+ "Unexpectedly matched a 'find-last-like' phi");
+
// This means we have seen one but not the other instruction of the
// pattern or more than just a select and cmp. Zero implies that we saw a
// llvm.min/max intrinsic, which is always OK.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 91d73671cf26a..9b4ae56e7f175 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1357,11 +1357,34 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan) {
PhiR->getRecurrenceKind()))
continue;
- // Find the condition for the select.
+ // Find the condition for the select/blend.
auto *SelectR = cast<VPSingleDefRecipe>(&PhiR->getBackedgeRecipe());
VPValue *Cond = nullptr, *Op1 = nullptr, *Op2 = nullptr;
+
+ // If we're matching a blend rather than a select, there should be one
+ // incoming value which is the data, then all other incoming values should
+ // be the phi.
+ auto MatchBlend = [&](VPRecipeBase *R) {
+ auto *Blend = dyn_cast<VPBlendRecipe>(R);
+ if (!Blend)
+ return false;
+ assert(!Blend->isNormalized() && "must run before blend normalization");
+ unsigned NumIncomingDataValues = 0;
+ for (unsigned I = 0; I < Blend->getNumIncomingValues(); ++I) {
+ VPValue *Incoming = Blend->getIncomingValue(I);
+ if (Incoming != PhiR) {
+ ++NumIncomingDataValues;
+ Cond = Blend->getMask(I);
+ Op1 = Incoming;
+ Op2 = PhiR;
+ }
+ }
+ return NumIncomingDataValues == 1;
+ };
+
if (!match(SelectR,
- m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2))))
+ m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2))) &&
+ !MatchBlend(SelectR))
return false;
// Add mask phi.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 88cd5129b51ab..5708abfaf6f5b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2758,7 +2758,7 @@ void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
if (I != 0)
O << " ";
getIncomingValue(I)->printAsOperand(O, SlotTracker);
- if (I == 0)
+ if (I == 0 && isNormalized())
continue;
O << "/";
getMask(I)->printAsOperand(O, SlotTracker);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index aa31b0ff2bb1a..7053aa60b2035 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -823,6 +823,1150 @@ exit:
ret i32 %select.data
}
+; This test is derived from the following C program:
+; int simple_csa_int_load(int* a, int* b, int default_val, int N, int threshold) {
+; int result = default_val;
+; for (int i = 0; i < N; ++i)
+; if (a[i] > threshold)
+; result = b[i];
+; return result;
+; }
+define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
+; NEON-LABEL: define i32 @simple_csa_int_load(
+; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; NEON: [[IF_THEN]]:
+; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
+; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; NEON-NEXT: br label %[[LATCH]]
+; NEON: [[LATCH]]:
+; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+; SVE-LABEL: define i32 @simple_csa_int_load(
+; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; SVE-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; SVE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
+; SVE-NEXT: [[TMP8:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
+; SVE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
+; SVE-NEXT: [[TMP10]] = select i1 [[TMP9]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP11]] = select i1 [[TMP9]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
+; SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP13]])
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; SVE: [[IF_THEN]]:
+; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
+; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; SVE-NEXT: br label %[[LATCH]]
+; SVE: [[LATCH]]:
+; SVE-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
+; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+ %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
+ %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+ %ld.a = load i32, ptr %a.addr, align 4
+ %if.cond = icmp sgt i32 %ld.a, %threshold
+ br i1 %if.cond, label %if.then, label %latch
+
+if.then:
+ %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
+ %ld.b = load i32, ptr %b.addr, align 4
+ br label %latch
+
+latch:
+ %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ]
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
+
+; This test is derived from the following loop:
+; int simple_csa_int_divide(int* a, int* b, int default_val, int N, int threshold) {
+; int result = default_val;
+; for (int i = 0; i < N; ++i)
+; if (a[i] > threshold)
+; result = 42 / a[i]
+; return result;
+; }
+define i32 @simple_csa_int_divide(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
+; NEON-LABEL: define i32 @simple_csa_int_divide(
+; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; NEON: [[IF_THEN]]:
+; NEON-NEXT: [[DIV:%.*]] = sdiv i32 42, [[LD_A]]
+; NEON-NEXT: br label %[[LATCH]]
+; NEON: [[LATCH]]:
+; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[DIV]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+; SVE-LABEL: define i32 @simple_csa_int_divide(
+; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; SVE-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; SVE-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> splat (i32 1)
+; SVE-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i32> splat (i32 42), [[TMP7]]
+; SVE-NEXT: [[TMP9:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
+; SVE-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; SVE-NEXT: [[TMP11]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP12]] = select i1 [[TMP10]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
+; SVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i1> [[TMP11]], i32 [[TMP14]])
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; SVE-NEXT: br i1 [[IF_CON...
[truncated]
|
|
@llvm/pr-subscribers-llvm-transforms Author: Benjamin Maxwell (MacDue) Changes: This patch extends the support added in #158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide). For example, loops such as the `simple_csa_int_load` test below can now be vectorized. It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence. Reverts llvm/llvm-project#180275 (original PR: #178862) Additional type legalization for `ISD::VECTOR_FIND_LAST_ACTIVE` was added in #180290, which should resolve the backend crashes on x86. Patch is 80.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/180708.diff 7 Files Affected:
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index cf8567673e5e1..e79b609cdad85 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -261,6 +261,23 @@ static bool isMinMaxReductionPhiWithUsersOutsideReductionChain(
return true;
}
+// This matches a phi that selects between the original value (HeaderPhi) and an
+// arbitrary non-reduction value.
+static bool isFindLastLikePhi(PHINode *Phi, PHINode *HeaderPhi,
+ SmallPtrSetImpl<Instruction *> &ReductionInstrs) {
+ unsigned NumNonReduxInputs = 0;
+ for (const Value *Op : Phi->operands()) {
+ if (!ReductionInstrs.contains(dyn_cast<Instruction>(Op))) {
+ if (++NumNonReduxInputs > 1)
+ return false;
+ } else if (Op != HeaderPhi) {
+ // TODO: Remove this restriction once chained phis are supported.
+ return false;
+ }
+ }
+ return NumNonReduxInputs == 1;
+}
+
bool RecurrenceDescriptor::AddReductionVar(
PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
@@ -308,6 +325,14 @@ bool RecurrenceDescriptor::AddReductionVar(
unsigned NumCmpSelectPatternInst = 0;
InstDesc ReduxDesc(false, nullptr);
+ // To recognize find-lasts of conditional operations (such as loads or
+ // divides), that need masking, we track non-phi users and if we've found a
+ // "find-last-like" phi (see isFindLastLikePhi). We currently only support
+ // find-last reduction chains with a single "find-last-like" phi and do not
+ // allow any other operations.
+ unsigned NumNonPHIUsers = 0;
+ bool FoundFindLastLikePhi = false;
+
// Data used for determining if the recurrence has been type-promoted.
Type *RecurrenceType = Phi->getType();
SmallPtrSet<Instruction *, 4> CastInsts;
@@ -415,6 +440,8 @@ bool RecurrenceDescriptor::AddReductionVar(
return false;
bool IsAPhi = isa<PHINode>(Cur);
+ if (!IsAPhi)
+ ++NumNonPHIUsers;
// A header PHI use other than the original PHI.
if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
@@ -471,9 +498,21 @@ bool RecurrenceDescriptor::AddReductionVar(
!isAnyOfRecurrenceKind(Kind) && hasMultipleUsesOf(Cur, VisitedInsts, 1))
return false;
- // All inputs to a PHI node must be a reduction value.
- if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
- return false;
+ // All inputs to a PHI node must be a reduction value, unless the phi is a
+ // "FindLast-like" phi (described below).
+ if (IsAPhi && Cur != Phi) {
+ if (!areAllUsesIn(Cur, VisitedInsts)) {
+ // A "FindLast-like" phi acts like a conditional select between the
+ // previous reduction value, and an arbitrary value. Note: Multiple
+ // "FindLast-like" phis are not supported see:
+ // IVDescriptorsTest.UnsupportedFindLastPhi.
+ FoundFindLastLikePhi =
+ Kind == RecurKind::FindLast && !FoundFindLastLikePhi &&
+ isFindLastLikePhi(cast<PHINode>(Cur), Phi, VisitedInsts);
+ if (!FoundFindLastLikePhi)
+ return false;
+ }
+ }
if (isIntMinMaxRecurrenceKind(Kind) && (isa<ICmpInst>(Cur) || IsASelect))
++NumCmpSelectPatternInst;
@@ -483,7 +522,7 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
- FoundReduxOp |= !IsAPhi && Cur != Start;
+ FoundReduxOp |= (!IsAPhi || FoundFindLastLikePhi) && Cur != Start;
// Process users of current instruction. Push non-PHI nodes after PHI nodes
// onto the stack. This way we are going to have seen all inputs to PHI
@@ -557,6 +596,12 @@ bool RecurrenceDescriptor::AddReductionVar(
Worklist.append(NonPHIs.begin(), NonPHIs.end());
}
+ // We only expect to match a single "find-last-like" phi per find-last
+ // reduction, with no non-phi operations in the reduction use chain.
+ assert((!FoundFindLastLikePhi ||
+ (Kind == RecurKind::FindLast && NumNonPHIUsers == 0)) &&
+ "Unexpectedly matched a 'find-last-like' phi");
+
// This means we have seen one but not the other instruction of the
// pattern or more than just a select and cmp. Zero implies that we saw a
// llvm.min/max intrinsic, which is always OK.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 91d73671cf26a..9b4ae56e7f175 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1357,11 +1357,34 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan) {
PhiR->getRecurrenceKind()))
continue;
- // Find the condition for the select.
+ // Find the condition for the select/blend.
auto *SelectR = cast<VPSingleDefRecipe>(&PhiR->getBackedgeRecipe());
VPValue *Cond = nullptr, *Op1 = nullptr, *Op2 = nullptr;
+
+ // If we're matching a blend rather than a select, there should be one
+ // incoming value which is the data, then all other incoming values should
+ // be the phi.
+ auto MatchBlend = [&](VPRecipeBase *R) {
+ auto *Blend = dyn_cast<VPBlendRecipe>(R);
+ if (!Blend)
+ return false;
+ assert(!Blend->isNormalized() && "must run before blend normalizaion");
+ unsigned NumIncomingDataValues = 0;
+ for (unsigned I = 0; I < Blend->getNumIncomingValues(); ++I) {
+ VPValue *Incoming = Blend->getIncomingValue(I);
+ if (Incoming != PhiR) {
+ ++NumIncomingDataValues;
+ Cond = Blend->getMask(I);
+ Op1 = Incoming;
+ Op2 = PhiR;
+ }
+ }
+ return NumIncomingDataValues == 1;
+ };
+
if (!match(SelectR,
- m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2))))
+ m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2))) &&
+ !MatchBlend(SelectR))
return false;
// Add mask phi.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 88cd5129b51ab..5708abfaf6f5b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2758,7 +2758,7 @@ void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
if (I != 0)
O << " ";
getIncomingValue(I)->printAsOperand(O, SlotTracker);
- if (I == 0)
+ if (I == 0 && isNormalized())
continue;
O << "/";
getMask(I)->printAsOperand(O, SlotTracker);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index aa31b0ff2bb1a..7053aa60b2035 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -823,6 +823,1150 @@ exit:
ret i32 %select.data
}
+; This test is derived from the following C program:
+; int simple_csa_int_load(int* a, int* b, int default_val, int N, int threshold) {
+; int result = default_val;
+; for (int i = 0; i < N; ++i)
+; if (a[i] > threshold)
+; result = b[i];
+; return result;
+; }
+define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
+; NEON-LABEL: define i32 @simple_csa_int_load(
+; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; NEON: [[IF_THEN]]:
+; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
+; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; NEON-NEXT: br label %[[LATCH]]
+; NEON: [[LATCH]]:
+; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+; SVE-LABEL: define i32 @simple_csa_int_load(
+; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; SVE-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; SVE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
+; SVE-NEXT: [[TMP8:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
+; SVE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
+; SVE-NEXT: [[TMP10]] = select i1 [[TMP9]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP11]] = select i1 [[TMP9]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
+; SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP13]])
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; SVE: [[IF_THEN]]:
+; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
+; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
+; SVE-NEXT: br label %[[LATCH]]
+; SVE: [[LATCH]]:
+; SVE-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; SVE: [[EXIT]]:
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
+; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+ %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
+ %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+ %ld.a = load i32, ptr %a.addr, align 4
+ %if.cond = icmp sgt i32 %ld.a, %threshold
+ br i1 %if.cond, label %if.then, label %latch
+
+if.then:
+ %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
+ %ld.b = load i32, ptr %b.addr, align 4
+ br label %latch
+
+latch:
+ %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ]
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cmp = icmp eq i64 %iv.next, %N
+ br i1 %exit.cmp, label %exit, label %loop
+
+exit:
+ ret i32 %select.data
+}
+
+; This test is derived from the following loop:
+; int simple_csa_int_divide(int* a, int* b, int default_val, int N, int threshold) {
+; int result = default_val;
+; for (int i = 0; i < N; ++i)
+; if (a[i] > threshold)
+; result = 42 / a[i]
+; return result;
+; }
+define i32 @simple_csa_int_divide(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
+; NEON-LABEL: define i32 @simple_csa_int_divide(
+; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
+; NEON-NEXT: [[ENTRY:.*]]:
+; NEON-NEXT: br label %[[LOOP:.*]]
+; NEON: [[LOOP]]:
+; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
+; NEON: [[IF_THEN]]:
+; NEON-NEXT: [[DIV:%.*]] = sdiv i32 42, [[LD_A]]
+; NEON-NEXT: br label %[[LATCH]]
+; NEON: [[LATCH]]:
+; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[DIV]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
+; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; NEON: [[EXIT]]:
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
+; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
+;
+; SVE-LABEL: define i32 @simple_csa_int_divide(
+; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
+; SVE-NEXT: [[ENTRY:.*]]:
+; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SVE: [[VECTOR_PH]]:
+; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
+; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SVE: [[VECTOR_BODY]]:
+; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; SVE-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; SVE-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> splat (i32 1)
+; SVE-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i32> splat (i32 42), [[TMP7]]
+; SVE-NEXT: [[TMP9:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
+; SVE-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; SVE-NEXT: [[TMP11]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP12]] = select i1 [[TMP10]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SVE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SVE: [[MIDDLE_BLOCK]]:
+; SVE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
+; SVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i1> [[TMP11]], i32 [[TMP14]])
+; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; SVE: [[SCALAR_PH]]:
+; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
+; SVE-NEXT: br label %[[LOOP:.*]]
+; SVE: [[LOOP]]:
+; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
+; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
+; SVE-NEXT: br i1 [[IF_CON...
[truncated]
|
This patch extends the support added in #158088 to loops where the assignment is non-speculatable (e.g. a conditional load or divide).
For example, loops such as `simple_csa_int_load` and `simple_csa_int_divide` (included as tests in this patch) can now be vectorized.
It does this by extending the recurrence matching from only looking for selects, to include phis where all operands are the header phi, except for one which can be an arbitrary value outside the recurrence.
Reverts #180275 (original PR: #178862)
Additional type legalization for `ISD::VECTOR_FIND_LAST_ACTIVE` was added in #180290, which should resolve the backend crashes on x86.