[LoopVectorize] Perform loop versioning for some early exit loops#120603
[LoopVectorize] Perform loop versioning for some early exit loops#120603david-arm wants to merge 4 commits into
Conversation
|
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: David Sherwood (david-arm) ChangesFull diff: https://github.com/llvm/llvm-project/pull/120603.diff 7 Files Affected:
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b0..828f64e0e59432 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -92,7 +92,8 @@ bool isDereferenceableAndAlignedInLoop(
/// contains read-only memory accesses.
bool isDereferenceableReadOnlyLoop(
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
- SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
+ SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr,
+ SmallVectorImpl<LoadInst *> *NonDerefLoads = nullptr);
/// Return true if we know that executing a load from this value cannot trap.
///
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index fbe80eddbae07a..7f6f8f9c3f5bfc 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -425,6 +425,19 @@ class LoopVectorizationLegality {
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
+ /// Return the number of loads in the loop we have to consider that could
+ /// potentially fault in a loop with uncountable early exits.
+ unsigned getNumPotentiallyFaultingLoads() const {
+ return PotentiallyFaultingLoads.size();
+ }
+
+ /// Return a vector of all potentially faulting loads in a loop with
+ /// uncountable early exits.
+ const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *
+ getPotentiallyFaultingLoads() const {
+ return &PotentiallyFaultingLoads;
+ }
+
/// Returns a HistogramInfo* for the given instruction if it was determined
/// to be part of a load -> update -> store sequence where multiple lanes
/// may be working on the same memory address.
@@ -533,6 +546,8 @@ class LoopVectorizationLegality {
/// additional cases safely.
bool isVectorizableEarlyExitLoop();
+ bool analyzePotentiallyFaultingLoads(SmallVectorImpl<LoadInst *> *Loads);
+
/// Return true if all of the instructions in the block can be speculatively
/// executed, and record the loads/stores that require masking.
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -656,6 +671,10 @@ class LoopVectorizationLegality {
/// Keep track of the destinations of all uncountable exits if the
/// exact backedge taken count is not computable.
SmallVector<BasicBlock *, 4> UncountableExitBlocks;
+
+ /// Keep a record of all potentially faulting loads in loops with
+ /// uncountable early exits.
+ SmallVector<std::pair<LoadInst *, const SCEV *>, 4> PotentiallyFaultingLoads;
};
} // namespace llvm
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 54b9521fda8fd2..7818dcf84278fa 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -808,15 +808,26 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
bool llvm::isDereferenceableReadOnlyLoop(
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
- SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+ SmallVectorImpl<const SCEVPredicate *> *Predicates,
+ SmallVectorImpl<LoadInst *> *NonDerefLoads) {
+ bool Result = true;
for (BasicBlock *BB : L->blocks()) {
for (Instruction &I : *BB) {
if (auto *LI = dyn_cast<LoadInst>(&I)) {
- if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
+ if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC,
+ Predicates)) {
+ if (!NonDerefLoads)
+ return false;
+ NonDerefLoads->push_back(LI);
+ Result = false;
+ }
+ } else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
+ I.mayThrow()) {
+ if (!NonDerefLoads)
return false;
- } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
- return false;
+ Result = false;
+ }
}
}
- return true;
+ return Result;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 555c8435dd330d..e7cd6c433476a1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1589,6 +1589,39 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
return Result;
}
+bool LoopVectorizationLegality::analyzePotentiallyFaultingLoads(
+ SmallVectorImpl<LoadInst *> *Loads) {
+ LLVM_DEBUG(dbgs() << "Found potentially faulting loads in loop with "
+ "uncountable early exit:\n");
+ for (LoadInst *LI : *Loads) {
+ LLVM_DEBUG(dbgs() << "Load: " << *LI << '\n');
+ Value *Ptr = LI->getPointerOperand();
+ if (!Ptr)
+ return false;
+ const SCEV *PtrExpr = PSE.getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
+ // TODO: Deal with loop invariant pointers.
+ if (!AR || AR->getLoop() != TheLoop || !AR->isAffine())
+ return false;
+ auto Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
+ if (!Step)
+ return false;
+ const SCEV *Start = AR->getStart();
+
+ // Make sure the step is positive and matches the object size in memory.
+ // TODO: Extend this to cover more cases.
+ auto &DL = LI->getDataLayout();
+ APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+ DL.getTypeStoreSize(LI->getType()).getFixedValue());
+ if (EltSize != Step->getAPInt())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SCEV for Load Ptr: " << *Start << 'n');
+ PotentiallyFaultingLoads.push_back({LI, Start});
+ }
+ return true;
+}
+
bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
BasicBlock *LatchBB = TheLoop->getLoopLatch();
if (!LatchBB) {
@@ -1713,15 +1746,18 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
assert(LatchBB->getUniquePredecessor() == getUncountableEarlyExitingBlock() &&
"Expected latch predecessor to be the early exiting block");
- // TODO: Handle loops that may fault.
Predicates.clear();
- if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
- &Predicates)) {
- reportVectorizationFailure(
- "Loop may fault",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
- return false;
+ SmallVector<LoadInst *, 4> Loads;
+ if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, &Predicates,
+ &Loads)) {
+ if (!analyzePotentiallyFaultingLoads(&Loads)) {
+ reportVectorizationFailure(
+ "Loop may fault",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "We can vectorize the loop with runtime checks.\n");
}
[[maybe_unused]] const SCEV *SymbolicMaxBTC =
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a8511483e00fbe..a593259506cdd3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2135,6 +2135,29 @@ class GeneratedRTChecks {
};
} // namespace
+static void addPointerAlignmentChecks(
+ const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *Loads,
+ PredicatedScalarEvolution &PSE, ElementCount VF) {
+ ScalarEvolution *SE = PSE.getSE();
+ const DataLayout &DL = SE->getDataLayout();
+ Type *PtrIntType = DL.getIntPtrType(SE->getContext());
+
+ const SCEV *Zero = SE->getZero(PtrIntType);
+ const SCEV *ScevEC = SE->getElementCount(PtrIntType, VF);
+
+ for (auto Load : *Loads) {
+ APInt EltSize(
+ DL.getIndexTypeSizeInBits(Load.first->getPointerOperandType()),
+ DL.getTypeStoreSize(Load.first->getType()).getFixedValue());
+ const SCEV *Start = SE->getPtrToIntExpr(Load.second, PtrIntType);
+ const SCEV *Align =
+ SE->getMulExpr(ScevEC, SE->getConstant(EltSize),
+ (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW));
+ const SCEV *Rem = SE->getURemExpr(Start, Align);
+ PSE.addPredicate(*(SE->getEqualPredicate(Rem, Zero)));
+ }
+}
+
static bool useActiveLaneMask(TailFoldingStyle Style) {
return Style == TailFoldingStyle::Data ||
Style == TailFoldingStyle::DataAndControlFlow ||
@@ -10236,11 +10259,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
- reportVectorizationFailure("Auto-vectorization of loops with uncountable "
- "early exit is not enabled",
- "UncountableEarlyExitLoopsDisabled", ORE, L);
- return false;
+ if (LVL.hasUncountableEarlyExit()) {
+ if (!EnableEarlyExitVectorization) {
+ reportVectorizationFailure("Auto-vectorization of loops with uncountable "
+ "early exit is not enabled",
+ "UncountableEarlyExitLoopsDisabled", ORE, L);
+ return false;
+ }
+
+ unsigned NumPotentiallyFaultingPointers =
+ LVL.getNumPotentiallyFaultingLoads();
+ if (NumPotentiallyFaultingPointers > 1) {
+ reportVectorizationFailure("Not worth vectorizing loop with uncountable "
+ "early exit, due to number of potentially "
+ "faulting loads",
+ "UncountableEarlyExitMayFault", ORE, L);
+ return false;
+ } else if (NumPotentiallyFaultingPointers)
+ LLVM_DEBUG(dbgs() << "LV: Need to version early-exit vector loop with"
+ << "pointer alignment checks.\n");
}
// Entrance to the VPlan-native vectorization path. Outer loops are processed
@@ -10391,8 +10428,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned SelectedIC = std::max(IC, UserIC);
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
- if (VF.Width.isVector() || SelectedIC > 1)
+ if (VF.Width.isVector() || SelectedIC > 1) {
+ if (LVL.getNumPotentiallyFaultingLoads())
+ addPointerAlignmentChecks(LVL.getPotentiallyFaultingLoads(), PSE,
+ VF.Width);
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+ }
// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 8df0eaec6a8c9d..d106b0c0921de4 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -208,7 +208,7 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas'
-; CHECK: LV: Not vectorizing: Loop may fault.
+; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
entry:
%p1 = alloca [42 x i8]
%p2 = alloca [42 x i8]
@@ -238,7 +238,7 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_deref_ptrs'
-; CHECK: LV: Not vectorizing: Loop may fault.
+; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
entry:
br label %loop
@@ -264,7 +264,7 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_unknown_ptrs'
-; CHECK: LV: Not vectorizing: Loop may fault.
+; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
index c68eeac19c9ecf..12c46f7f60123d 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s
declare void @init_mem(ptr, i64);
@@ -141,3 +141,78 @@ loop.end:
%retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
ret i64 %retval
}
+
+define i64 @same_exit_block_pre_inc_use1_unknown_single_ptr(ptr %p1) {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_single_ptr(
+; CHECK-SAME: ptr [[P1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P11:%.*]] = ptrtoint ptr [[P1]] to i64
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[P11]] to i2
+; CHECK-NEXT: [[TMP1:%.*]] = add i2 [[TMP0]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = zext i2 [[TMP1]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 3)
+; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
+; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.split:
+; CHECK-NEXT: br i1 [[TMP8]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[VECTOR_SCEVCHECK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: loop.end:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, [[LOOP1]] ], [ 0, [[LOOP_INC]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 1, [[MIDDLE_SPLIT]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %cmp3 = icmp eq i8 %ld1, 3
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %retval = phi i64 [ 1, %loop ], [ 0, %loop.inc ]
+ ret i64 %retval
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
|
e0e8ca8 to
ca000fb
Compare
ca000fb to
50f8fbf
Compare
|
Gentle ping. :) |
| bool isSafeForAnyVectorWidth() const { | ||
| return Legal->isSafeForAnyVectorWidth() && | ||
| (!Legal->hasUncountableEarlyExit() || | ||
| !Legal->getNumPotentiallyFaultingLoads()); | ||
| } |
There was a problem hiding this comment.
Given all the questions are answered by LoopVectorizationLegality why not just extend its implementation instead of adding a cost model specific implementation?
| std::optional<unsigned> getMaxVScale(const Function &F, | ||
| const TargetTransformInfo &TTI) { | ||
| if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) | ||
| return MaxVScale; | ||
|
|
||
| if (F.hasFnAttribute(Attribute::VScaleRange)) | ||
| return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); | ||
|
|
||
| return std::nullopt; | ||
| } |
There was a problem hiding this comment.
Does getMaxVScale need to be moved? I could not spot any uses in addPointerAlignmentChecks but perhaps I've missed something?
There was a problem hiding this comment.
Yeah, I think in an early version of the patch I required getMaxVScale for something, but never moved it. Anyway, it's back in it's original place now!
| if (!TTI->getMinPageSize() || !analyzePotentiallyFaultingLoads(&Loads)) { | ||
| reportVectorizationFailure( |
There was a problem hiding this comment.
What happens now for the situation where the loop is not read only? Previously we would have bailed but now that fact looks ignored?
Is it possible that isDereferenceableReadOnlyLoop needs a more complex return along the lines of "yes, no, yes if alignment is proven"? This would means we can bail faster for loops that are clearly not read only.
Perhaps the LoopVectorize has just outgrown isDereferenceableReadOnlyLoop and it needs to implement something else that better fits its requirements?
There was a problem hiding this comment.
I've just removed isDereferenceableReadOnlyLoop now and inlined some of the logic into the loop above.
| assert(SelectedIC == 1 && | ||
| "Interleaving not supported for early exit loops and " | ||
| "potentially faulting loads"); |
There was a problem hiding this comment.
What makes interleaving problematic? Would it be sufficient to ensure that when choosing combinations of VF and UF that the result of VFxUF is smaller than some safe value (i.e. essentially what you're doing for VF now but extended to VFxUF).
There was a problem hiding this comment.
Yeah potentially, although currently we won't vectorise with UF>1 anyway because it's explicitly disabled in selectInterleaveCount. That's because we don't yet handle or'ing together the multiple early exit conditions for all the parts in vplan. So I didn't try adding support for runtime checks when UF>1 because there isn't any way to test it.
There was a problem hiding this comment.
I've updated the code to handle interleaving, although we can't currently test it when IC > 1. However, it seems trivial/obvious enough that it should work.
| if (!NonDerefLoads) | ||
| return false; | ||
| } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow()) | ||
| return false; | ||
| Result = false; |
There was a problem hiding this comment.
I have a comment about this below, but it seems weird to carry on for a loop we know is not read only.
| if (!Ptr) | ||
| return false; |
There was a problem hiding this comment.
Is it possible for a LoadInst pointer operand to not exist?
There was a problem hiding this comment.
Damn, forgot to look at this. I'm not sure to be honest. I thought I'd seen similar checks elsewhere, but I'll take another look.
There was a problem hiding this comment.
It should always return a valid value.
| return false; | ||
|
|
||
| LLVM_DEBUG(dbgs() << "LV: SCEV for Load Ptr: " << *Start << '\n'); | ||
| PotentiallyFaultingLoads.push_back({LI, Start}); |
There was a problem hiding this comment.
Is it necessary to record Start? I kind of assumed it would always be LI->getPointerOperand() albeit in SCEV form. Alternatively, perhaps it would be sufficient to record the pointer and type size. Doing this would mean the logic can be used for things other than LoadInst.
There was a problem hiding this comment.
We need the Start because that's guaranteed to be loop-invariant, whereas LI->getPointerOperand() will be a loop-varying expression, i.e. getelementptr i8, ptr %p, i64 %index where %index comes from a PHI, or something like that. We have to add the pointer checks in the loop preheader.
In addition to Start, we also need to know the pointer type and the loaded type, which is why I thought it was easier to pass in LI. However, I could just collect 3 things instead - {Start, PtrType, MemType}. This would be more general than restricting to loads only?
There was a problem hiding this comment.
I've changed the list to now only require Start and the load type.
| uint64_t getMaxSafeVectorWidthInBits() const { | ||
| uint64_t MaxSafeVectorWidth = Legal->getMaxSafeVectorWidthInBits(); | ||
| // The legalizer bails out if getMinPageSize does not return a value. | ||
| if (Legal->hasUncountableEarlyExit() && | ||
| Legal->getNumPotentiallyFaultingLoads()) | ||
| MaxSafeVectorWidth = | ||
| std::min(MaxSafeVectorWidth, uint64_t(*TTI.getMinPageSize()) * 8); | ||
| return MaxSafeVectorWidth; | ||
| } |
There was a problem hiding this comment.
Similar to above in that Legal->getMaxSafeVectorWidthInBits() really should give the correct answer at all times because otherwise it's too easy to tie yourself in knots not knowing which version is valid at which time.
There was a problem hiding this comment.
Are there any documented guarantees related to alignment of pointers and dereferenceability of loads on the LLVM IR level? I had brief look in LangRef but couldn't find anything.
Could the assumption cause problems with HW/software extensions, .e.g. something like AArch64 MTE or a software layer that checks out-of-bounds accesses? Not super familiar with the details of MTE, but I think it tracks sizes of allocations on smaller granularity than page sizes and traps on out-of-bound accesses.
This is something I'll look into - perhaps the worst case for now would be that loop versioning is off by default for AArch64 and would have to be enabled explicitly with a flag. Without digging into this in huge detail, is it possible that if MTE is a problem for this PR, then we also have a problem today in the loop vectoriser when we if-convert loads into unconditional loads based upon the result of |
50f8fbf to
37f8515
Compare
david-arm
left a comment
There was a problem hiding this comment.
For now, I've changed it so that loop versioning is turned off by default. When the user is confident that MTE compatibility is not required they can enable it manually. In future, it would be good to revisit this because even with MTE we can still vectorise with max vector widths of 16 bytes, and memory tagging is currently only supported in LLVM for Android targets anyway.
| if (!Ptr) | ||
| return false; |
There was a problem hiding this comment.
Damn, forgot to look at this. I'm not sure to be honest. I thought I'd seen similar checks elsewhere, but I'll take another look.
| bool isSafeForAnyVectorWidth() const { | ||
| return Legal->isSafeForAnyVectorWidth() && | ||
| (!Legal->hasUncountableEarlyExit() || | ||
| !Legal->getNumPotentiallyFaultingLoads()); | ||
| } |
| std::optional<unsigned> getMaxVScale(const Function &F, | ||
| const TargetTransformInfo &TTI) { | ||
| if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) | ||
| return MaxVScale; | ||
|
|
||
| if (F.hasFnAttribute(Attribute::VScaleRange)) | ||
| return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); | ||
|
|
||
| return std::nullopt; | ||
| } |
There was a problem hiding this comment.
Yeah, I think in an early version of the patch I required getMaxVScale for something, but never moved it. Anyway, it's back in it's original place now!
| if (!TTI->getMinPageSize() || !analyzePotentiallyFaultingLoads(&Loads)) { | ||
| reportVectorizationFailure( |
There was a problem hiding this comment.
I've just removed isDereferenceableReadOnlyLoop now and inlined some of the logic into the loop above.
| assert(SelectedIC == 1 && | ||
| "Interleaving not supported for early exit loops and " | ||
| "potentially faulting loads"); |
There was a problem hiding this comment.
I've updated the code to handle interleaving, although we can't currently test it when IC > 1. However, it seems trivial/obvious enough that it should work.
| if (!NonDerefLoads) | ||
| return false; | ||
| } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow()) | ||
| return false; | ||
| Result = false; |
| return false; | ||
|
|
||
| LLVM_DEBUG(dbgs() << "LV: SCEV for Load Ptr: " << *Start << '\n'); | ||
| PotentiallyFaultingLoads.push_back({LI, Start}); |
There was a problem hiding this comment.
I've changed the list to now only require Start and the load type.
| uint64_t getMaxSafeVectorWidthInBits() const { | ||
| uint64_t MaxSafeVectorWidth = Legal->getMaxSafeVectorWidthInBits(); | ||
| // The legalizer bails out if getMinPageSize does not return a value. | ||
| if (Legal->hasUncountableEarlyExit() && | ||
| Legal->getNumPotentiallyFaultingLoads()) | ||
| MaxSafeVectorWidth = | ||
| std::min(MaxSafeVectorWidth, uint64_t(*TTI.getMinPageSize()) * 8); | ||
| return MaxSafeVectorWidth; | ||
| } |
fhahn
left a comment
There was a problem hiding this comment.
This is something I'll look into - perhaps the worst case for now would be that loop versioning is off by default for AArch64 and would have to be enabled explicitly with a flag. Without digging into this in huge detail, is it possible that if MTE is a problem for this PR, then we also have a problem today in the loop vectoriser when we if-convert loads into unconditional loads based upon the result of
isDereferenceableAndAlignedInLoop- seeLoopVectorizationLegality::canVectorizeWithIfConvert. MTE can be used for stack tagging.
IIRC isDereferenceableAndAlignedInLoop determines dereferenceability separately from either attributes or known allocation sizes, so it should never return true if we could access out-of-bounds I think. I am not sure how the current uses would cause problems with extensions like MTE, is there any particular issue you are thinking about?
| /// uncountable early exits. | ||
| const SmallVectorImpl<std::pair<const SCEV *, Type *>> * | ||
| getPotentiallyFaultingPointers() const { | ||
| return &PotentiallyFaultingPtrs; |
There was a problem hiding this comment.
Can we just return an ArrayRef here, or do we need to return a pointer
| /// Returns true if all loads in the loop contained in \p Loads can be | ||
| /// analyzed as potentially faulting. Any loads that may fault are added to | ||
| /// the member variable PotentiallyFaultingPtrs. | ||
| bool analyzePotentiallyFaultingLoads(SmallVectorImpl<LoadInst *> *Loads); |
| if (!Ptr) | ||
| return false; |
There was a problem hiding this comment.
It should always return a valid value.
| SE->getMulExpr(ScevEC, SE->getConstant(EltSize), | ||
| (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW)); | ||
| const SCEV *Rem = SE->getURemExpr(Start, Align); | ||
| PSE.addPredicate(*(SE->getEqualPredicate(Rem, SE->getZero(PtrIntType)))); |
There was a problem hiding this comment.
| PSE.addPredicate(*(SE->getEqualPredicate(Rem, SE->getZero(PtrIntType)))); | |
| PSE.addPredicate(*SE->getEqualPredicate(Rem, SE->getZero(PtrIntType))); |
| } // namespace | ||
|
|
||
| static void addPointerAlignmentChecks( | ||
| const SmallVectorImpl<std::pair<const SCEV *, Type *>> *Ptrs, Function *F, |
There was a problem hiding this comment.
| const SmallVectorImpl<std::pair<const SCEV *, Type *>> *Ptrs, Function *F, | |
| ArrayRefl<std::pair<const SCEV *, Type *> Ptrs, Function *F, |
| const SCEV *PtrExpr = PSE.getSCEV(Ptr); | ||
| const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr); | ||
| // TODO: Deal with loop invariant pointers. | ||
| if (!AR || AR->getLoop() != TheLoop || !AR->isAffine()) |
There was a problem hiding this comment.
Would be good to explain here that we check for add-recs because the reasoning is only safe if the load executes at least once?
We should probably also restrict this just for loads from the default address space (0) for now
There was a problem hiding this comment.
Do I understand correctly that this is generating out-of-bounds loads and relying on page granularity to guarantee they don't trap? If so, we cannot perform this transform with normal loads, because it is UB at the IR level -- the behavior of the underlying hardware is irrelevant.
You're going to need a new load intrinsic to support this. I have this RFC draft on the topic lying around: https://hackmd.io/@nikic/S1O4QWYZkx I haven't submitted it, because I'm not particularly happy with the %defined_size parameter, which is needed to specify the operational semantics of the intrinsic, but not relevant for lowering.
|
I proposed a similar speculative load instruction here: https://discourse.llvm.org/t/rfc-supporting-more-early-exit-loops/84690/2?u=programmerjake |
For RVV first faulting loads, a VP variant intrinsic could support this, for example |
This patch is explicitly avoiding using first faulting loads, by relying upon a combination of vector load alignment and knowledge of the hardware minimum page size. I believe @huntergr-arm recently created a RFC about an alternative approach using first faulting loads to vectorise more complex loops involving multiple exits and/or stores in the loop. If I understand correctly, the point raised by @nikic is that the semantics of the IR load operation mean that if any portion of the vector load is out-of-bounds (even though we know it won't fault) then the entire load is UB. This essentially means the entire vectorised loop becomes UB and so in theory could be optimised away, even if in practice for existing tests and benchmark it works. Using a new out-of-bounds load intrinsic with different semantics avoids this problem, and still allows us to achieve the best performance. |
|
Hi @nikic, just for my own understanding can you point me to the part of the LangRef that says out-of-bounds loads are UB? Perhaps I'm missing something, but I did have a look and it wasn't immediately obvious to me where this is stated. |
This was my understanding as well, but I also couldn't find a clear reference from LangRef. Put up #128429 to clarify |
When attempting to vectorise a loop with an uncountable early exit, we attempt to discover if all the loads in the loop are known to be dereferenceable. If at least one load could potentially fault then we abandon vectorisation. This patch adds support for vectorising loops with one potentially faulting load by versioning the loop based on the load pointer alignment. It is required that the vector load must always fault on the first lane, i.e. the load should not straddle a page boundary. Doing so ensures that the behaviour of the vector and scalar loops is identical, i.e. if a load does fault it will fault at the same scalar iteration. Such vectorisation depends on the following conditions being met: 1. The max vector width must not exceed the minimum page size. This is done by adding a getMaxSafeVectorWidthInBits wrapper that checks if we have an uncountable early exit. For scalable vectors we must be able to determine the maximum possible value of vscale. 2. The size of the loaded type must be a power of 2. This is checked during legalisation. 3. The VF must be a power of two (so that the vector width can divide wholly into the page size which is also power of 2). For fixed-width vectors this is always true, and for scalable vectors we query the TTI hook isVScaleKnownToBeAPowerOfTwo. If the effective runtime VF could change during the loop then this cannot be vectorised via loop versioning. 4. The load pointer must be aligned to a multiple of the vector width. (NOTE: interleaving is currently disabled for these early exit loops.) We add a runtime check to ensure this is true.
37f8515 to
4884adc
Compare
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
I took a stab at a definition of a speculative load intrinsic (name subject to change): #179642 Instead of using |
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
Introduce two new intrinsics to enable vectorization of loops with early exits that have potentially faulting loads. This has previously been discussed in llvm#120603 and is similar to @nikic's https://hackmd.io/@nikic/S1O4QWYZkx, with the major difference being that there is no `%defined_size` argument and instead the load returns the stored values for the bytes within bounds and undef otherwise. I don't think we can easily compute the defined size because it may depend on the loaded values (i.e. at what lane the early exit has been taken). 1. `@llvm.speculative.load` (name subject to change) - perform a load that may access memory beyond the allocated object. It must be used in combination with `@llvm.can.load.speculatively` to ensure the load is guaranteed to not trap. 2. `@llvm.can.load.speculatively` - Returns true if it's safe to speculatively load a given number of bytes from a pointer. The semantics are target-dependent. On some targets, this may check that the access does not cross page boundaries, or stricter checks for example on AArch64 with MTE, which limits the access size to 16 bytes. `@llvm.speculative.load` is lowered to a regular load in SelectionDAG without MODereferenceable. I am not sure if we need to be more careful than this, i.e. if we could still reason about SelectionDAG loads to infer dereferencability for the pointer. `@llvm.can.load.speculatively` is lowered to regular IR in PreISel lowering, using a target-lowering hook. By default, it conservatively expands to false. These intrinsics should allow the loop vectorizer to vectorize early-exit loops with potentially non-dereferenceable loads.
When attempting to vectorise a loop with an uncountable early
exit, we attempt to discover if all the loads in the loop are
known to be dereferenceable. If at least one load could
potentially fault then we abandon vectorisation. This patch
adds support for vectorising loops with one potentially
faulting load by versioning the loop based on the load
pointer alignment. It is required that the vector load must
always fault on the first lane, i.e. the load should not
straddle a page boundary. Doing so ensures that the behaviour
of the vector and scalar loops is identical, i.e. if a load
does fault it will fault at the same scalar iteration.
Such vectorisation depends on the following conditions being
met:
This is done by adding a getMaxSafeVectorWidthInBits
wrapper that checks if we have an uncountable early exit.
For scalable vectors we must be able to determine the maximum
possible value of vscale.
checked during legalisation.
divide wholly into the page size which is also power of 2).
For fixed-width vectors this is always true, and for scalable
vectors we query the TTI hook isVScaleKnownToBeAPowerOfTwo.
If the effective runtime VF could change during the loop then
this cannot be vectorised via loop versioning.
width. (NOTE: interleaving is currently disabled for these early
exit loops.) We add a runtime check to ensure this is true.