Add constant folding for llvm.experimental.cttz.elts #182324#182895
Conversation
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-analysis Author: Kavin Gnanapandithan (KavinTheG) Changes: Resolves #182324. Modifies ConstantFoldIntrinsicCall2 to add constant folding for llvm.experimental.cttz.elts. Patch is 36.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182895.diff 2 Files Affected:
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 738d0c063a5e4..0957d8d9b18c3 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1966,6 +1966,8 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::experimental_constrained_rint:
case Intrinsic::experimental_constrained_fcmp:
case Intrinsic::experimental_constrained_fcmps:
+
+ case Intrinsic::experimental_cttz_elts:
return true;
default:
return false;
@@ -3793,6 +3795,28 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty,
break;
}
}
+
+ if (isa<Constant>(Operands[0]) && isa<ConstantInt>(Operands[1])) {
+ switch (IntrinsicID) {
+ default:
+ break;
+ case Intrinsic::experimental_cttz_elts:
+ auto *FVTy = cast<FixedVectorType>(Operands[0]->getType());
+ auto *Op2 = cast<ConstantInt>(Operands[1]);
+ if (Op2->isOne())
+ return PoisonValue::get(Ty);
+ Constant *Op1 = Operands[0];
+ for (unsigned I = 0; I < FVTy->getNumElements(); ++I) {
+ Constant *Elt = Op1->getAggregateElement(I);
+ if (!Elt)
+ return nullptr;
+ if (!Elt->isNullValue())
+ return ConstantInt::get(Ty, I);
+ }
+ return ConstantInt::get(Ty, FVTy->getNumElements());
+ break;
+ }
+ }
return nullptr;
}
diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
index 8d1b7ec5b574b..ef84bcc79b773 100644
--- a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
-; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -force-tail-folding-style=none -S %s | FileCheck --check-prefix=VF2 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -force-tail-folding-style=none -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize,instcombine -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefix=VF2 %s
+; RUN: opt -passes=loop-vectorize,instcombine -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
define void @test1_pr58811(ptr %dst) {
; VF2-LABEL: define void @test1_pr58811(
@@ -12,31 +12,24 @@ define void @test1_pr58811(ptr %dst) {
; VF2-NEXT: [[TMP0:%.*]] = sub i32 0, [[IV_1_PH]]
; VF2-NEXT: br label %[[LOOP_1:.*]]
; VF2: [[LOOP_1]]:
-; VF2-NEXT: [[INDUCTION_IV:%.*]] = phi i32 [ [[INDUCTION_IV_NEXT:%.*]], %[[LOOP_1]] ], [ [[TMP0]], %[[LOOP_1_PREHEADER]] ]
-; VF2-NEXT: [[IV_1:%.*]] = phi i32 [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ], [ [[IV_1_PH]], %[[LOOP_1_PREHEADER]] ]
-; VF2-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_2_NEXT:%.*]], %[[LOOP_1]] ], [ 0, %[[LOOP_1_PREHEADER]] ]
-; VF2-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[IV_2]], -1
-; VF2-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
-; VF2-NEXT: [[IV_1_NEXT]] = add i32 [[IV_2]], [[IV_1]]
-; VF2-NEXT: [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]]
; VF2-NEXT: br i1 false, label %[[LOOP_1]], label %[[LOOP_2_PREHEADER:.*]]
; VF2: [[LOOP_2_PREHEADER]]:
-; VF2-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ]
; VF2-NEXT: br label %[[VECTOR_PH:.*]]
; VF2: [[VECTOR_PH]]:
-; VF2-NEXT: [[TMP2:%.*]] = mul i32 198, [[INDUCTION_IV]]
-; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[INDUCTION_IV]], i64 0
+; VF2-NEXT: [[TMP1:%.*]] = mul i32 [[IV_1_PH]], -198
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[BROADCAST_SPLAT]]
-; VF2-NEXT: [[TMP4:%.*]] = shl i32 [[INDUCTION_IV]], 1
+; VF2-NEXT: [[TMP2:%.*]] = mul nuw <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
+; VF2-NEXT: [[TMP4:%.*]] = shl i32 [[TMP0]], 1
; VF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[TMP3]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i16 [[OFFSET_IDX]]
+; VF2-NEXT: [[TMP8:%.*]] = sext i16 [[OFFSET_IDX]] to i64
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]]
; VF2-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
@@ -48,19 +41,19 @@ define void @test1_pr58811(ptr %dst) {
; VF2-NEXT: br label %[[LOOP_2:.*]]
; VF2: [[LOOP_2]]:
; VF2-NEXT: [[IV_3:%.*]] = phi i16 [ [[IV_3_NEXT:%.*]], %[[LOOP_2]] ], [ 198, %[[SCALAR_PH]] ]
-; VF2-NEXT: [[IV_4:%.*]] = phi i32 [ [[IV_4_NEXT:%.*]], %[[LOOP_2]] ], [ [[TMP2]], %[[SCALAR_PH]] ]
-; VF2-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i16 [[IV_3]]
+; VF2-NEXT: [[IV_4:%.*]] = phi i32 [ [[IV_4_NEXT:%.*]], %[[LOOP_2]] ], [ [[TMP1]], %[[SCALAR_PH]] ]
+; VF2-NEXT: [[TMP7:%.*]] = sext i16 [[IV_3]] to i64
+; VF2-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]]
; VF2-NEXT: store i32 [[IV_4]], ptr [[GEP_DST]], align 4
-; VF2-NEXT: [[IV_4_NEXT]] = sub i32 [[IV_4]], [[IV_1_LCSSA]]
+; VF2-NEXT: [[IV_4_NEXT]] = sub i32 [[IV_4]], [[IV_1_PH]]
; VF2-NEXT: [[IV_3_NEXT]] = add i16 [[IV_3]], 1
; VF2-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[IV_3]], 198
; VF2-NEXT: br i1 [[CMP88_1]], label %[[LOOP_2]], label %[[LOOP_3_PREHEADER:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; VF2: [[LOOP_3_PREHEADER]]:
-; VF2-NEXT: [[IV_4_LCSSA:%.*]] = phi i32 [ [[IV_4]], %[[LOOP_2]] ]
; VF2-NEXT: br label %[[LOOP_3:.*]]
; VF2: [[LOOP_3]]:
; VF2-NEXT: [[IV_5:%.*]] = phi i32 [ [[SUB93_2]], %[[LOOP_3]] ], [ 0, %[[LOOP_3_PREHEADER]] ]
-; VF2-NEXT: [[SUB93_2]] = sub i32 [[IV_5]], [[IV_4_LCSSA]]
+; VF2-NEXT: [[SUB93_2]] = sub i32 [[IV_5]], [[IV_4]]
; VF2-NEXT: br label %[[LOOP_3]]
; VF2: [[UNREACHABLE_BB]]:
; VF2-NEXT: br label %[[LOOP_1_PREHEADER]]
@@ -74,23 +67,15 @@ define void @test1_pr58811(ptr %dst) {
; CHECK-NEXT: [[TMP0:%.*]] = sub i32 0, [[IV_1_PH]]
; CHECK-NEXT: br label %[[LOOP_1:.*]]
; CHECK: [[LOOP_1]]:
-; CHECK-NEXT: [[INDUCTION_IV:%.*]] = phi i32 [ [[INDUCTION_IV_NEXT:%.*]], %[[LOOP_1]] ], [ [[TMP0]], %[[LOOP_1_PREHEADER]] ]
-; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ], [ [[IV_1_PH]], %[[LOOP_1_PREHEADER]] ]
-; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_2_NEXT:%.*]], %[[LOOP_1]] ], [ 0, %[[LOOP_1_PREHEADER]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[IV_2]], -1
-; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
-; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_2]], [[IV_1]]
-; CHECK-NEXT: [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]]
; CHECK-NEXT: br i1 false, label %[[LOOP_1]], label %[[LOOP_2_PREHEADER:.*]]
; CHECK: [[LOOP_2_PREHEADER]]:
-; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDUCTION_IV]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[IV_1_PH]], -196
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[INDUCTION_IV]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP0]], 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -98,7 +83,8 @@ define void @test1_pr58811(ptr %dst) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[TMP3]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i16 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[OFFSET_IDX]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
@@ -110,19 +96,19 @@ define void @test1_pr58811(ptr %dst) {
; CHECK-NEXT: br label %[[LOOP_2:.*]]
; CHECK: [[LOOP_2]]:
; CHECK-NEXT: [[IV_3:%.*]] = phi i16 [ [[IV_3_NEXT:%.*]], %[[LOOP_2]] ], [ 196, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV_4:%.*]] = phi i32 [ [[IV_4_NEXT:%.*]], %[[LOOP_2]] ], [ [[IND_END]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i16 [[IV_3]]
+; CHECK-NEXT: [[IV_4:%.*]] = phi i32 [ [[IV_4_NEXT:%.*]], %[[LOOP_2]] ], [ [[TMP1]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[IV_3]] to i64
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]]
; CHECK-NEXT: store i32 [[IV_4]], ptr [[GEP_DST]], align 4
-; CHECK-NEXT: [[IV_4_NEXT]] = sub i32 [[IV_4]], [[IV_1_LCSSA]]
+; CHECK-NEXT: [[IV_4_NEXT]] = sub i32 [[IV_4]], [[IV_1_PH]]
; CHECK-NEXT: [[IV_3_NEXT]] = add i16 [[IV_3]], 1
; CHECK-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[IV_3]], 198
; CHECK-NEXT: br i1 [[CMP88_1]], label %[[LOOP_2]], label %[[LOOP_3_PREHEADER:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[LOOP_3_PREHEADER]]:
-; CHECK-NEXT: [[IV_4_LCSSA:%.*]] = phi i32 [ [[IV_4]], %[[LOOP_2]] ]
; CHECK-NEXT: br label %[[LOOP_3:.*]]
; CHECK: [[LOOP_3]]:
; CHECK-NEXT: [[IV_5:%.*]] = phi i32 [ [[SUB93_2]], %[[LOOP_3]] ], [ 0, %[[LOOP_3_PREHEADER]] ]
-; CHECK-NEXT: [[SUB93_2]] = sub i32 [[IV_5]], [[IV_4_LCSSA]]
+; CHECK-NEXT: [[SUB93_2]] = sub i32 [[IV_5]], [[IV_4]]
; CHECK-NEXT: br label %[[LOOP_3]]
; CHECK: [[UNREACHABLE_BB]]:
; CHECK-NEXT: br label %[[LOOP_1_PREHEADER]]
@@ -174,39 +160,31 @@ define void @test2_pr58811(ptr %dst) {
; VF2-NEXT: [[ENTRY:.*]]:
; VF2-NEXT: br label %[[LOOP_1_HEADER:.*]]
; VF2: [[LOOP_1_HEADER_LOOPEXIT:.*]]:
-; VF2-NEXT: [[SUB93_2_LCSSA:%.*]] = phi i32 [ [[SUB93_2:%.*]], %[[LOOP_4:.*]] ]
+; VF2-NEXT: [[SUB93_2:%.*]] = sub i32 0, [[IV_5:%.*]]
; VF2-NEXT: br label %[[LOOP_1_HEADER]]
; VF2: [[LOOP_1_HEADER]]:
-; VF2-NEXT: [[P_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB93_2_LCSSA]], %[[LOOP_1_HEADER_LOOPEXIT]] ]
-; VF2-NEXT: [[TMP0:%.*]] = mul i32 [[P_1]], -1
+; VF2-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB93_2]], %[[LOOP_1_HEADER_LOOPEXIT]] ]
+; VF2-NEXT: [[INDUCTION_IV_LCSSA:%.*]] = sub i32 0, [[IV_2_LCSSA]]
; VF2-NEXT: br label %[[LOOP_2:.*]]
; VF2: [[LOOP_2]]:
-; VF2-NEXT: [[INDUCTION_IV:%.*]] = phi i32 [ [[INDUCTION_IV_NEXT:%.*]], %[[LOOP_2]] ], [ [[TMP0]], %[[LOOP_1_HEADER]] ]
-; VF2-NEXT: [[IV_2:%.*]] = phi i32 [ [[P_1]], %[[LOOP_1_HEADER]] ], [ [[ADD101:%.*]], %[[LOOP_2]] ]
-; VF2-NEXT: [[IV_3:%.*]] = phi i32 [ 0, %[[LOOP_1_HEADER]] ], [ [[SUB93:%.*]], %[[LOOP_2]] ]
-; VF2-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[IV_3]], -1
-; VF2-NEXT: [[SUB93]] = add i32 [[IV_3]], 1
-; VF2-NEXT: [[ADD101]] = add i32 [[IV_3]], [[IV_2]]
-; VF2-NEXT: [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]]
; VF2-NEXT: br i1 false, label %[[LOOP_2]], label %[[LOOP_3_PREHEADER:.*]]
; VF2: [[LOOP_3_PREHEADER]]:
-; VF2-NEXT: [[INDUCTION_IV_LCSSA:%.*]] = phi i32 [ [[INDUCTION_IV]], %[[LOOP_2]] ]
-; VF2-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP_2]] ]
; VF2-NEXT: br label %[[VECTOR_PH:.*]]
; VF2: [[VECTOR_PH]]:
-; VF2-NEXT: [[TMP2:%.*]] = mul i32 198, [[INDUCTION_IV_LCSSA]]
+; VF2-NEXT: [[TMP1:%.*]] = mul i32 [[IV_2_LCSSA]], -198
; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[INDUCTION_IV_LCSSA]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[BROADCAST_SPLAT]]
+; VF2-NEXT: [[TMP2:%.*]] = mul nuw <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
; VF2-NEXT: [[TMP4:%.*]] = shl i32 [[INDUCTION_IV_LCSSA]], 1
; VF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[TMP3]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i16 [[OFFSET_IDX]]
+; VF2-NEXT: [[TMP8:%.*]] = sext i16 [[OFFSET_IDX]] to i64
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]]
; VF2-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
@@ -218,19 +196,17 @@ define void @test2_pr58811(ptr %dst) {
; VF2-NEXT: br label %[[LOOP_3:.*]]
; VF2: [[LOOP_3]]:
; VF2-NEXT: [[IV_4:%.*]] = phi i16 [ [[INC_1:%.*]], %[[LOOP_3]] ], [ 198, %[[SCALAR_PH]] ]
-; VF2-NEXT: [[IV_5:%.*]] = phi i32 [ [[SUB93_1:%.*]], %[[LOOP_3]] ], [ [[TMP2]], %[[SCALAR_PH]] ]
-; VF2-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i16 [[IV_4]]
+; VF2-NEXT: [[IV_5]] = phi i32 [ [[SUB93_1:%.*]], %[[LOOP_3]] ], [ [[TMP1]], %[[SCALAR_PH]] ]
+; VF2-NEXT: [[TMP7:%.*]] = sext i16 [[IV_4]] to i64
+; VF2-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]]
; VF2-NEXT: store i32 [[IV_5]], ptr [[GEP_DST]], align 4
; VF2-NEXT: [[SUB93_1]] = sub i32 [[IV_5]], [[IV_2_LCSSA]]
; VF2-NEXT: [[INC_1]] = add i16 [[IV_4]], 1
; VF2-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[IV_4]], 198
; VF2-NEXT: br i1 [[CMP88_1]], label %[[LOOP_3]], label %[[LOOP_4_PREHEADER:.*]], !llvm.loop [[LOOP5:![0-9]+]]
; VF2: [[LOOP_4_PREHEADER]]:
-; VF2-NEXT: [[IV_5_LCSSA:%.*]] = phi i32 [ [[IV_5]], %[[LOOP_3]] ]
-; VF2-NEXT: br label %[[LOOP_4]]
+; VF2-NEXT: br label %[[LOOP_4:.*]]
; VF2: [[LOOP_4]]:
-; VF2-NEXT: [[IV_6:%.*]] = phi i32 [ [[SUB93_2]], %[[LOOP_4]] ], [ 0, %[[LOOP_4_PREHEADER]] ]
-; VF2-NEXT: [[SUB93_2]] = sub i32 [[IV_6]], [[IV_5_LCSSA]]
; VF2-NEXT: br i1 false, label %[[LOOP_4]], label %[[LOOP_1_HEADER_LOOPEXIT]]
;
; CHECK-LABEL: define void @test2_pr58811(
@@ -238,30 +214,21 @@ define void @test2_pr58811(ptr %dst) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]]
; CHECK: [[LOOP_1_HEADER_LOOPEXIT:.*]]:
-; CHECK-NEXT: [[SUB93_2_LCSSA:%.*]] = phi i32 [ [[SUB93_2:%.*]], %[[LOOP_4:.*]] ]
+; CHECK-NEXT: [[SUB93_2:%.*]] = sub i32 0, [[IV_5:%.*]]
; CHECK-NEXT: br label %[[LOOP_1_HEADER]]
; CHECK: [[LOOP_1_HEADER]]:
-; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB93_2_LCSSA]], %[[LOOP_1_HEADER_LOOPEXIT]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[P_1]], -1
+; CHECK-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB93_2]], %[[LOOP_1_HEADER_LOOPEXIT]] ]
+; CHECK-NEXT: [[INDUCTION_IV_LCSSA:%.*]] = sub i32 0, [[IV_2_LCSSA]]
; CHECK-NEXT: br label %[[LOOP_2:.*]]
; CHECK: [[LOOP_2]]:
-; CHECK-NEXT: [[INDUCTION_IV:%.*]] = phi i32 [ [[INDUCTION_IV_NEXT:%.*]], %[[LOOP_2]] ], [ [[TMP0]], %[[LOOP_1_HEADER]] ]
-; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[P_1]], %[[LOOP_1_HEADER]] ], [ [[ADD101:%.*]], %[[LOOP_2]] ]
-; CHECK-NEXT: [[IV_3:%.*]] = phi i32 [ 0, %[[LOOP_1_HEADER]] ], [ [[SUB93:%.*]], %[[LOOP_2]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[IV_3]], -1
-; CHECK-NEXT: [[SUB93]] = add i32 [[IV_3]], 1
-; CHECK-NEXT: [[ADD101]] = add i32 [[IV_3]], [[IV_2]]
-; CHECK-NEXT: [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]]
; CHECK-NEXT: br i1 false, label %[[LOOP_2]], label %[[LOOP_3_PREHEADER:.*]]
; CHECK: [[LOOP_3_PREHEADER]]:
-; CHECK-NEXT: [[INDUCTION_IV_LCSSA:%.*]] = phi i32 [ [[INDUCTION_IV]], %[[LOOP_2]] ]
-; CHECK-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP_2]] ]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV_LCSSA]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[IV_2_LCSSA]], -196
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDUCTION_IV_LCSSA]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[INDUCTION_IV_LCSSA]], 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
@@ -270,7 +237,8 @@ define void @test2_pr58811(ptr %dst) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[TMP3]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i16 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[OFFSET_IDX]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
@@ -282,19 +250,17 @@ define void @test2_pr58811(ptr %dst) {
; CHECK-NEXT: br label %[[LOOP_3:.*]]
; CHECK: [[LOOP_3]]:
; CHECK-NEXT: [[IV_4:%.*]] = phi i16 [ [[INC_1:%.*]], %[[LOOP_3]] ], [ 196, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV_5:%.*]] = phi i32 [ [[SUB93_1:%.*]], %[[LOOP_3]] ], [ [[IND_END]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelement...
[truncated]
|
|
@lukel97, Gentle ping for review. Summarized changes below
|
There was a problem hiding this comment.
Can you add a separate dedicated test file in llvm/test/Transforms/InstSimplify/ConstProp/cttz-elts.ll? You should precommit the test in the PR so you can see the diff of the change: https://llvm.org/docs/InstCombineContributorGuide.html#precommit-tests
You'll probably want to test a few scenarios, here are a few off the top of my head
- a normal cttz.elts with some trailing zeros
- a cttz.elts with undef in one of the lanes of the input vector
- a cttz.elts with all zeros in the input vector and iszeropoison=0
- a cttz.elts with all zeros in the input vector and iszeropoison=1
Modifies ConstantFoldIntrinsicCall2 to add constant folding for llvm.experimental.cttz.elts.
a54acaa to
9653dd4
Compare
You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/test/Transforms/InstSimplify/ConstProp/cttz-elts.ll llvm/lib/Analysis/ConstantFolding.cpp
The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef. For example, this is considered a bad practice:
define void @fn() {
...
br i1 undef, ...
}
Please use the following instead:
define void @fn(i1 %cond) {
...
br i1 %cond, ...
}
Please refer to the Undefined Behavior Manual for more information. |
|
lukel97
left a comment
There was a problem hiding this comment.
Thanks for the updates. Just FYI you should generally avoid force pushing to your branch if you can avoid it because it can mess up the review history on github: https://llvm.org/docs/GitHub.html#id9
|
|
||
| if (IntrinsicID == Intrinsic::experimental_cttz_elts) { | ||
| auto *FVTy = dyn_cast<FixedVectorType>(Operands[0]->getType()); | ||
| auto *Op2 = dyn_cast<ConstantInt>(Operands[1]); |
There was a problem hiding this comment.
Op2 I think always needs to be a ConstantInt?
| auto *Op2 = dyn_cast<ConstantInt>(Operands[1]); | |
| auto *Op2 = cast<ConstantInt>(Operands[1]); |
| if (isa<PoisonValue>(Elt)) | ||
| return PoisonValue::get(Ty); | ||
| if (isa<UndefValue>(Elt)) | ||
| return UndefValue::get(Ty); |
There was a problem hiding this comment.
I'm not sure if an undef element results in an undef result. Normally it just means that we can assume the value to be whatever is convenient for us. Can you double check on https://alive2.llvm.org if folding cttz.elts(<2 x i2> <i2 1, i2 undef>, i1 false) -> undef is valid or not?
I think in this case it still needs to be 0 but would be good to verify.
It would also be good to check on alive2 that a poison input element gives a poison result. I think that's correct, but would be good to verify.
There was a problem hiding this comment.
I misunderstood the nature of undef and I think you're correct, that the compiler would treat it as a zero in this context.
I tried alive2 but it seems that alive2 does not know about this intrinsic: https://alive2.llvm.org/ce/z/nizepk
Approximations done:
- Unknown libcall: @llvm.experimental.cttz.elts.i32.v2i2
This might be an invalid alternative, but I tried instead using llc to see the return value of cttz.elts, and the results seem to be in line with what you were saying in that undefs are treated as zeros (although poison also shows the same behaviour?)
https://godbolt.org/z/eKGdo3PMs
There was a problem hiding this comment.
Oh that's a shame that alive2 doesn't support it yet. But now that I think about it I don't think a poison element should propagate to a poison result, since it's quite normal in tail folded loops to have poison lanes from e.g. llvm.masked.load or llvm.vp.load.
The LangRef doesn't seem entirely clear if it should be treated as undef, so probably the easiest thing to do here is just to treat it the same as an UndefValue i.e. assume it's zero or one. PoisonValue is a subclass of UndefValue so you can just use the one isa check.
|
Apologies about the rebase, and thanks for letting me know! Will keep that in mind. This commit addresses the undef/poison and treating it as 0 in this case. I think this matches the behaviour of llc.
|
| declare i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1>, i1) | ||
| declare i32 @llvm.experimental.cttz.elts.i32.v4i32(<4 x i32>, i1) |
There was a problem hiding this comment.
You don't need to declare intrinsics anymore these days so you can remove these lines
| declare i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1>, i1) | |
| declare i32 @llvm.experimental.cttz.elts.i32.v4i32(<4 x i32>, i1) |
| ; CHECK-LABEL: @cttz_elts_v4i1( | ||
| ; CHECK-NEXT: ret i32 3 | ||
| ; | ||
| %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 false) |
There was a problem hiding this comment.
Can you add a test with a return type that's too small to fit the maximum number of elements? Something like a call i1 cttz.elts(<4 x i8> <0, 0, 0, 0>). The LangRef says this is "undefined" so should probably return poison in this case
There was a problem hiding this comment.
Ah, I missed this, thanks for the catch. Is my interpretation correct in that even if the first lane is non-zero, if the return type is too small it should still return poison? For example would call i1 cttz.elts(<4 x i8> <1, 1, 1, 1>) return poison as well?
Also, another part that has me confused is whether "too small" should be judged against the signed max or the unsigned max of type Ty. I ask because the test case below prints out a negative value (I understand 0b11 as signed is interpreted as -1).
define i2 @test() {
;
; CHECK-LABEL: @test(
; CHECK-NEXT: ret i2 -1
;
%res = call i2 @llvm.experimental.cttz.elts.i2.v3i32(<3x i32> <i32 0, i32 0, i32 0>, i1 false)
ret i2 %res
}
There was a problem hiding this comment.
Is my interpretation correct in that even if the first lane is a non-zero, if the return type is too small it should still return a poison? For example would call i1 cttz.elts(<4 x i8> <1, 1, 1, 1>) return poison as well?
I think so, the LangRef says it needs to hold the maximum number of elements.
Also, another part that has me confused is whether the return type is too small using max or unsigned max of type Ty
I think that example you provided is correct, the IR printer is just printing it as a signed integer but there's no notion of signed-ness in LLVM IR types. Interpreted as unsigned that's 3 which fits the max number of elements.
… of vector elements.
|
This patch ensures |
|
Ping! |
lukel97
left a comment
There was a problem hiding this comment.
LGTM, thanks
Let me know if you'd like me to commit this for you
|
Implemented the review to use
Yes, please! |
Resolves #182324.
Modifies ConstantFoldIntrinsicCall2 to add constant folding for llvm.experimental.cttz.elts.