diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index e57e0cf636501..65e89f5795c2c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -2118,19 +2118,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { for (const auto &Reduction : getReductionVars()) ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); - for (const auto &Entry : getInductionVars()) { - PHINode *OrigPhi = Entry.first; - for (User *U : OrigPhi->users()) { - auto *UI = cast(U); - if (!TheLoop->contains(UI)) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an " - "outside user for " - << *UI << "\n"); - return false; - } - } - } - // The list of pointers that we can safely read and write to remains empty. SmallPtrSet SafePointers; diff --git a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll index 404ef096b49b3..038205f6704bd 100644 --- a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll +++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll @@ -2,11 +2,7 @@ ; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s -; The vectorizer should refuse to fold the tail by masking because -; %conv is used outside of the loop. Test for this by checking that -; %n.vec, the vector trip count, is rounded down to the next multiple of -; 4. If folding the tail, it would have been rounded up instead. -; Test case for #76069(https://github.com/llvm/llvm-project/issues/76069). +; TODO: Move this test into tail-folding-iv-outside-user.ll define i32 @test(ptr %arr, i64 %n) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) { @@ -15,8 +11,7 @@ define i32 @test(ptr %arr, i64 %n) { ; CHECK-NEXT: br i1 [[CMP1]], label [[PREHEADER:%.*]], label [[DONE:%.*]] ; CHECK: preheader: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NEXT: br label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], -2 ; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i8 @@ -24,34 +19,70 @@ define i32 @test(ptr %arr, i64 %n) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i8 [[TMP8]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP1]], 255 ; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP9]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 1, [[N_VEC]] -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i8 -; CHECK-NEXT: [[IND_END1:%.*]] = add i8 1, [[DOTCAST]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] -; CHECK-NEXT: [[TMP17:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP17]] -; CHECK-NEXT: store <4 x i32> splat (i32 65), ptr [[TMP18]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: store i32 65, ptr [[TMP18]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK: pred.store.if3: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP13]] +; CHECK-NEXT: store i32 65, ptr [[TMP14]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.continue4: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2 +; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK: pred.store.if5: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP16]] +; CHECK-NEXT: store i32 65, ptr [[TMP26]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK: pred.store.continue6: +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3 +; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.if7: +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP19]] +; CHECK-NEXT: store i32 65, ptr [[TMP28]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.continue8: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[IND_END:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP22]], i1 false) ; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i64 [[IND_END]], 1 -; CHECK-NEXT: br i1 [[CMP_N]], label [[LOAD_VAL:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[IND_ESCAPE]] +; CHECK-NEXT: br label [[LOAD_VAL:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[CONV:%.*]] = phi i64 [ [[CONV2:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I:%.*]] = phi i8 [ [[INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[CONV:%.*]] = phi i64 [ [[CONV2:%.*]], [[LOOP]] ], [ 1, [[SCALAR_PH]] ] +; CHECK-NEXT: [[I:%.*]] = phi i8 [ [[INC:%.*]], [[LOOP]] ], [ 1, [[SCALAR_PH]] ] ; CHECK-NEXT: [[SUB:%.*]] = add nsw i64 [[CONV]], -1 ; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[SUB]] ; CHECK-NEXT: store i32 65, ptr [[PTR]], align 4 @@ -60,7 +91,7 @@ define i32 @test(ptr %arr, i64 %n) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[CONV2]], [[N]] ; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP]], label [[LOAD_VAL]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: load_val: -; CHECK-NEXT: [[FINAL:%.*]] = phi i64 [ [[CONV]], [[LOOP]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FINAL:%.*]] = phi i64 [ [[CONV]], [[LOOP]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[FINAL]] ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4 ; CHECK-NEXT: br label [[DONE]] diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll index 0390346f424f7..8d1b7ec5b574b 100644 --- a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 -; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefix=VF2 %s -; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -force-tail-folding-style=none -S %s | FileCheck --check-prefix=VF2 %s +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -force-tail-folding-style=none -S %s | FileCheck %s define void @test1_pr58811(ptr %dst) { ; VF2-LABEL: define void @test1_pr58811( @@ -523,41 +523,41 @@ define void @iv_start_from_shl_of_previous_iv(ptr %dst) { ; ; CHECK-LABEL: define void @iv_start_from_shl_of_previous_iv( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP_1:.*]] -; CHECK: [[LOOP_1]]: -; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ] -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_1]] -; CHECK-NEXT: store i8 0, ptr [[GEP_1]], align 1 -; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 -; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_1]], 0 -; CHECK-NEXT: br i1 [[CMP_1]], label %[[LOOP_1]], label %[[LOOP_1_EXIT:.*]] -; CHECK: [[LOOP_1_EXIT]]: -; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ] -; CHECK-NEXT: [[IV_1_SHL:%.*]] = shl i64 [[IV_1_LCSSA]], 1 -; CHECK-NEXT: br label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[IV_1_SHL]], 96 +; CHECK-NEXT: [[PRED_STORE_IF3:.*]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_1_SHL]], [[INDEX]] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[PRED_STORE_IF3]] ], [ [[IV_1_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_1]] +; CHECK-NEXT: store i8 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 +; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_1]], 0 +; CHECK-NEXT: br i1 [[CMP_1]], label %[[VECTOR_BODY]], label %[[LOOP_1_EXIT1:.*]] +; CHECK: [[LOOP_1_EXIT1]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ [[IV_1]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV_1_SHL:%.*]] = shl i64 [[TMP4]], 1 +; CHECK-NEXT: br label %[[VECTOR_BODY8:.*]] +; CHECK: [[VECTOR_BODY8]]: +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[IV_1_SHL]], 96 +; CHECK-NEXT: br label %[[VECTOR_BODY1:.*]] +; CHECK: [[VECTOR_BODY1]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_BODY8]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY1]] ] +; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 [[IV_1_SHL]], [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]] ; CHECK-NEXT: store <4 x i8> splat (i8 1), ptr [[TMP0]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP2]], label %[[SCALAR_PH:.*]], label %[[VECTOR_BODY1]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: br label %[[LOOP_2:.*]] ; CHECK: [[LOOP_2]]: -; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ] +; CHECK-NEXT: br label %[[LOOP_3:.*]] +; CHECK: [[LOOP_3]]: +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[TMP1]], %[[LOOP_2]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_3]] ] ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_2]] ; CHECK-NEXT: store i8 1, ptr [[GEP_2]], align 1 ; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], 1 ; CHECK-NEXT: [[CMP_2:%.*]] = icmp eq i64 [[IV_2]], 100 -; CHECK-NEXT: br i1 [[CMP_2]], label %[[EXIT:.*]], label %[[LOOP_2]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_2]], label %[[EXIT:.*]], label %[[LOOP_3]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-iv-outside-user.ll new file mode 100644 index 0000000000000..676fe6d5cccd0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-iv-outside-user.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt < %s -S -p loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue | FileCheck %s + +define i32 @f(ptr noalias %p, i32 %start, i32 %step, i32 %n) { +; CHECK-LABEL: define i32 @f( +; CHECK-SAME: ptr noalias [[P:%.*]], i32 [[START:%.*]], i32 [[STEP:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 1 +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP0]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> , [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[STEP]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT8]], +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) +; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-NEXT: [[IV2_LCSSA:%.*]] = extractelement <4 x i32> [[VEC_IND]], i64 [[LAST_ACTIVE_LANE]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: ret i32 [[IV2_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %loop] + %iv2 = phi i32 [%start, %entry], [%iv2.next, %loop] + %iv.next = add i32 %iv, 1 + %iv2.next = add i32 %iv2, %step + %ec = icmp eq i32 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %iv2 +}