diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 814a414a52ac8..91a0bda2973a7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -415,9 +415,9 @@ class VPBuilder { return createScalarCast(CastOp, Op, ResultTy, DL); } - VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op, - Type *ResultTy) { - return tryInsertInstruction(new VPWidenCastRecipe( + VPInstructionWithType *createWidenCast(Instruction::CastOps Opcode, + VPValue *Op, Type *ResultTy) { + return tryInsertInstruction(VPInstructionWithType::createWide( Opcode, Op, ResultTy, nullptr, VPIRFlags::getDefaultFlags(Opcode))); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 78163b5fe35d5..0b6aaed866c43 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3245,8 +3245,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( .Case([](const VPWidenLoadRecipe *R) { return Instruction::Load; }) .Case( [](const auto *R) { return Instruction::Call; }) - .Case( + .Case( [](const auto *R) { return R->getOpcode(); }) .Case([](const VPInterleaveRecipe *R) { return R->getStoredValues().empty() ? 
Instruction::Load @@ -3317,7 +3316,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPRecipeBase::VPDerivedIVSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPReplicateSC: - case VPRecipeBase::VPInstructionSC: case VPRecipeBase::VPCurrentIterationPHISC: case VPRecipeBase::VPVectorPointerSC: case VPRecipeBase::VPVectorEndPointerSC: @@ -3325,11 +3323,21 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPRecipeBase::VPPredInstPHISC: case VPRecipeBase::VPBranchOnMaskSC: continue; + case VPRecipeBase::VPInstructionSC: { + // VPInstructionWithType for wide casts still produces a vector + // result, so only skip single-scalar VPInstructions. All other + // VPInstructions are treated as not producing vectors to preserve + // existing behavior. + if (auto *VPIT = dyn_cast(&R)) { + if (Instruction::isCast(VPIT->getOpcode()) && !VPIT->isSingleScalar()) + break; + } + continue; + } case VPRecipeBase::VPReductionSC: case VPRecipeBase::VPActiveLaneMaskPHISC: case VPRecipeBase::VPWidenCallSC: case VPRecipeBase::VPWidenCanonicalIVSC: - case VPRecipeBase::VPWidenCastSC: case VPRecipeBase::VPWidenGEPSC: case VPRecipeBase::VPWidenIntrinsicSC: case VPRecipeBase::VPWidenSC: @@ -6708,7 +6716,7 @@ bool VPRecipeBuilder::replaceWithFinalIfReductionStore( return false; } -VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, +VPSingleDefRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, VFRange &Range) { auto *I = VPI->getUnderlyingInstr(); bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -6766,6 +6774,15 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || (Range.Start.isScalable() && isa(I))) && "Should not predicate a uniform recipe"); + if (IsUniform && Instruction::isCast(VPI->getOpcode())) { + assert(!IsPredicated && "IsUniform implies unpredicated"); + auto *CastR = cast(VPI); + auto 
*Recipe = new VPInstructionWithType( + VPI->getOpcode(), VPI->operandsWithoutMask(), CastR->getResultType(), + *VPI, *VPI, VPI->getDebugLoc(), I->getName()); + Recipe->setUnderlyingValue(I); + return Recipe; + } auto *Recipe = new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform, BlockInMask, *VPI, *VPI, VPI->getDebugLoc()); @@ -6809,9 +6826,9 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, if (Instruction::isCast(VPI->getOpcode())) { auto *CI = cast(Instr); auto *CastR = cast(VPI); - return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0), - CastR->getResultType(), CI, *VPI, *VPI, - VPI->getDebugLoc()); + return VPInstructionWithType::createWide( + CI->getOpcode(), VPI->getOperand(0), CastR->getResultType(), CI, *VPI, + *VPI, VPI->getDebugLoc()); } return tryToWiden(VPI); @@ -7297,10 +7314,10 @@ void LoopVectorizationPlanner::addReductionResultComputation( assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) && "Unexpected truncated min-max recurrence!"); Type *RdxTy = RdxDesc.getRecurrenceType(); - VPWidenCastRecipe *Trunc; + VPInstructionWithType *Trunc; Instruction::CastOps ExtendOpc = RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt; - VPWidenCastRecipe *Extnd; + VPInstructionWithType *Extnd; { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint( diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index a84c77d614673..37114a1414a39 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -88,10 +88,10 @@ class VPRecipeBuilder { bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder); - /// Build a VPReplicationRecipe for \p VPI. If it is predicated, add the mask - /// as last operand. Range.End may be decreased to ensure same recipe behavior - /// from \p Range.Start to \p Range.End. 
- VPReplicateRecipe *handleReplication(VPInstruction *VPI, VFRange &Range); + /// Build a VPReplicateRecipe or VPInstructionWithType for \p VPI. If it is + /// predicated, add the mask as last operand. Range.End may be decreased to + /// ensure same recipe behavior from \p Range.Start to \p Range.End. + VPSingleDefRecipe *handleReplication(VPInstruction *VPI, VFRange &Range); }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 4a5420185224b..cdd77928efdaf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -437,7 +437,6 @@ class LLVM_ABI_FOR_TEST VPRecipeBase VPVectorEndPointerSC, VPWidenCallSC, VPWidenCanonicalIVSC, - VPWidenCastSC, VPWidenGEPSC, VPWidenIntrinsicSC, VPWidenLoadEVLSC, @@ -626,7 +625,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPRecipeValue { case VPRecipeBase::VPVectorEndPointerSC: case VPRecipeBase::VPWidenCallSC: case VPRecipeBase::VPWidenCanonicalIVSC: - case VPRecipeBase::VPWidenCastSC: case VPRecipeBase::VPWidenGEPSC: case VPRecipeBase::VPWidenIntrinsicSC: case VPRecipeBase::VPWidenSC: @@ -1118,7 +1116,6 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { R->getVPRecipeID() == VPRecipeBase::VPWidenSC || R->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC || R->getVPRecipeID() == VPRecipeBase::VPWidenCallSC || - R->getVPRecipeID() == VPRecipeBase::VPWidenCastSC || R->getVPRecipeID() == VPRecipeBase::VPWidenIntrinsicSC || R->getVPRecipeID() == VPRecipeBase::VPReductionSC || R->getVPRecipeID() == VPRecipeBase::VPReductionEVLSC || @@ -1512,6 +1509,17 @@ class VPInstructionWithType : public VPInstruction { /// Scalar result type produced by the recipe. Type *ResultTy; + /// Whether the recipe produces a single scalar result (as opposed to a + /// vector/wide result with one lane per VF). 
+ bool IsSingleScalar; + + /// Returns the default value of IsSingleScalar for \p Opcode: true for + /// opcodes that produce a single scalar (loads, casts, vscale). + static bool defaultIsSingleScalar(unsigned Opcode) { + return Instruction::isCast(Opcode) || Opcode == Instruction::Load || + Opcode == VPInstruction::VScale; + } + public: VPInstructionWithType(unsigned Opcode, ArrayRef Operands, Type *ResultTy, const VPIRFlags &Flags = {}, @@ -1519,7 +1527,22 @@ class VPInstructionWithType : public VPInstruction { DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name), - ResultTy(ResultTy) {} + ResultTy(ResultTy), IsSingleScalar(defaultIsSingleScalar(Opcode)) {} + + /// Create a new VPInstructionWithType representing a wide (vector-producing) + /// cast that mirrors the semantics of the legacy VPWidenCastRecipe. + static VPInstructionWithType * + createWide(unsigned Opcode, VPValue *Op, Type *ResultTy, + CastInst *CI = nullptr, const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, + DebugLoc DL = DebugLoc::getUnknown()) { + assert(Instruction::isCast(Opcode) && "Expected a cast opcode"); + auto *VPI = + new VPInstructionWithType(Opcode, {Op}, ResultTy, Flags, Metadata, DL); + VPI->IsSingleScalar = false; + VPI->setUnderlyingValue(CI); + return VPI; + } static inline bool classof(const VPRecipeBase *R) { // VPInstructionWithType are VPInstructions with specific opcodes requiring @@ -1544,10 +1567,20 @@ class VPInstructionWithType : public VPInstruction { return isa(cast(R)); } + static inline bool classof(const VPValue *V) { + auto *R = V->getDefiningRecipe(); + return R && classof(R); + } + + static inline bool classof(const VPSingleDefRecipe *R) { + return classof(static_cast(R)); + } + VPInstruction *clone() override { auto *New = new VPInstructionWithType(getOpcode(), operands(), getResultType(), *this, *this, getDebugLoc(), getName()); + New->IsSingleScalar = IsSingleScalar; 
New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -1556,13 +1589,20 @@ class VPInstructionWithType : public VPInstruction { /// Return the cost of this VPInstruction. InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override { - // TODO: Compute accurate cost after retiring the legacy cost model. - return 0; - } + VPCostContext &Ctx) const override; Type *getResultType() const { return ResultTy; } + /// Returns the cast opcode of this recipe; the opcode must be a cast. + Instruction::CastOps getCastOpcode() const { + assert(Instruction::isCast(getOpcode()) && "not a cast opcode"); + return static_cast(getOpcode()); + } + + /// Returns true if this recipe produces a single scalar result (rather than + /// a vector with VF lanes). + bool isSingleScalar() const { return IsSingleScalar; } + protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -1828,58 +1868,6 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, } }; -/// VPWidenCastRecipe is a recipe to create vector cast instructions. -class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { - /// Cast instruction opcode. - Instruction::CastOps Opcode; - - /// Result type for the cast. 
- Type *ResultTy; - -public: - VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - CastInst *CI = nullptr, const VPIRFlags &Flags = {}, - const VPIRMetadata &Metadata = {}, - DebugLoc DL = DebugLoc::getUnknown()) - : VPRecipeWithIRFlags(VPRecipeBase::VPWidenCastSC, Op, Flags, DL), - VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) { - assert(flagsValidForOpcode(Opcode) && - "Set flags not supported for the provided opcode"); - assert(hasRequiredFlagsForOpcode(Opcode) && - "Opcode requires specific flags to be set"); - setUnderlyingValue(CI); - } - - ~VPWidenCastRecipe() override = default; - - VPWidenCastRecipe *clone() override { - return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy, - cast_or_null(getUnderlyingValue()), - *this, *this, getDebugLoc()); - } - - VP_CLASSOF_IMPL(VPRecipeBase::VPWidenCastSC) - - /// Produce widened copies of the cast. - LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override; - - /// Return the cost of this VPWidenCastRecipe. - LLVM_ABI_FOR_TEST InstructionCost - computeCost(ElementCount VF, VPCostContext &Ctx) const override; - - Instruction::CastOps getOpcode() const { return Opcode; } - - /// Returns the result type of the cast. - Type *getResultType() const { return ResultTy; } - -protected: -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif -}; - /// A recipe for widening vector intrinsics. class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { /// ID of the vector intrinsic to widen. 
@@ -3349,15 +3337,15 @@ class VPExpressionRecipe : public VPSingleDefRecipe { ArrayRef ExpressionRecipes); public: - VPExpressionRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red) + VPExpressionRecipe(VPInstructionWithType *Ext, VPReductionRecipe *Red) : VPExpressionRecipe(ExpressionTypes::ExtendedReduction, {Ext, Red}) {} VPExpressionRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red) : VPExpressionRecipe(ExpressionTypes::MulAccReduction, {Mul, Red}) {} - VPExpressionRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1, + VPExpressionRecipe(VPInstructionWithType *Ext0, VPInstructionWithType *Ext1, VPWidenRecipe *Mul, VPReductionRecipe *Red) : VPExpressionRecipe(ExpressionTypes::ExtMulAccReduction, {Ext0, Ext1, Mul, Red}) {} - VPExpressionRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1, + VPExpressionRecipe(VPInstructionWithType *Ext0, VPInstructionWithType *Ext1, VPWidenRecipe *Mul, VPWidenRecipe *Sub, VPReductionRecipe *Red) : VPExpressionRecipe(ExpressionTypes::ExtNegatedMulAccReduction, @@ -4094,11 +4082,10 @@ struct CastInfo /// Used by isa. static inline bool isPossible(VPRecipeBase *R) { // NOTE: Each recipe inheriting from VPIRMetadata must be listed here. - return isa( - R); + return isa(R); } /// Used by cast. @@ -4108,8 +4095,6 @@ struct CastInfo return cast(R); case VPRecipeBase::VPWidenSC: return cast(R); - case VPRecipeBase::VPWidenCastSC: - return cast(R); case VPRecipeBase::VPWidenIntrinsicSC: return cast(R); case VPRecipeBase::VPWidenCallSC: diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 103dff1889a6a..837d060830448 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -308,8 +308,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { return inferScalarType(R->getOperand(0)); }) // VPInstructionWithType must be handled before VPInstruction. 
- .Case( + .Case( [](const auto *R) { return R->getResultType(); }) .Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 3cafeae7c4aea..c5b6e0d863632 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -303,8 +303,7 @@ struct Recipe_match { template using AllRecipe_match = Recipe_match, Opcode, /*Commutative*/ false, - VPWidenRecipe, VPReplicateRecipe, VPWidenCastRecipe, - VPInstruction>; + VPWidenRecipe, VPReplicateRecipe, VPInstruction>; template using AllRecipe_commutative_match = @@ -545,7 +544,8 @@ m_ZExtOrSExt(const Op0_t &Op0) { } template inline auto m_WidenAnyExtend(const Op0_t &Op0) { - return m_Isa(m_CombineOr(m_ZExtOrSExt(Op0), m_FPExt(Op0))); + return m_Isa( + m_CombineOr(m_ZExtOrSExt(Op0), m_FPExt(Op0))); } template diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2225dfa310c6c..b0b85ecefdb5a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -87,7 +87,6 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPReductionSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: - case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenLoadEVLSC: @@ -140,7 +139,6 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPReductionSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: - case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenPHISC: @@ -190,7 +188,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPScalarIVStepsSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: - case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: case VPWidenPHISC: @@ -1306,7 +1303,9 @@ bool VPInstruction::isSingleScalar() const { case VPInstruction::VScale: return true; default: - return isScalarCast(); + if (auto *VPI = 
dyn_cast(this)) + return VPI->isSingleScalar(); + return false; } } @@ -1589,11 +1588,20 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, void VPInstructionWithType::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); - if (isScalarCast()) { - Value *Op = State.get(getOperand(0), VPLane(0)); - Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()), - Op, ResultTy); - State.set(this, Cast, VPLane(0)); + if (Instruction::isCast(getOpcode())) { + Type *DestTy = + isSingleScalar() ? ResultTy : VectorType::get(ResultTy, State.VF); + Value *Op = isSingleScalar() ? State.get(getOperand(0), VPLane(0)) + : State.get(getOperand(0)); + Value *Cast = State.Builder.CreateCast(getCastOpcode(), Op, DestTy); + if (auto *I = dyn_cast(Cast)) { + applyFlags(*I); + applyMetadata(*I); + } + if (isSingleScalar()) + State.set(this, Cast, VPLane(0)); + else + State.set(this, Cast); return; } switch (getOpcode()) { @@ -1614,6 +1622,15 @@ void VPInstructionWithType::execute(VPTransformState &State) { } } +InstructionCost VPInstructionWithType::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + // TODO: Compute cost for VPInstructions without underlying values. + if (!getUnderlyingValue()) + return 0; + return getCostForRecipeWithOpcode( + getOpcode(), isSingleScalar() ? 
ElementCount::getFixed(1) : VF, Ctx); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -1638,7 +1655,8 @@ void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent, break; default: assert(Instruction::isCast(getOpcode()) && "unhandled opcode"); - O << Instruction::getOpcodeName(getOpcode()) << " "; + O << Instruction::getOpcodeName(getOpcode()); + printFlags(O); printOperands(O, SlotTracker); O << " to " << *ResultTy; } @@ -2479,43 +2497,6 @@ void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent, } #endif -void VPWidenCastRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - /// Vectorize casts. - assert(State.VF.isVector() && "Not vectorizing?"); - Type *DestTy = VectorType::get(getResultType(), State.VF); - VPValue *Op = getOperand(0); - Value *A = State.get(Op); - Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); - State.set(this, Cast); - if (auto *CastOp = dyn_cast(Cast)) { - applyFlags(*CastOp); - applyMetadata(*CastOp); - } -} - -InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // TODO: In some cases, VPWidenCastRecipes are created but not considered in - // the legacy cost model, including truncates/extends when evaluating a - // reduction in a smaller type. 
- if (!getUnderlyingValue()) - return 0; - return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-CAST "; - printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode); - printFlags(O); - printOperands(O, SlotTracker); - O << " to " << *getResultType(); -} -#endif - InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); @@ -3032,13 +3013,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, case ExpressionTypes::ExtendedReduction: { unsigned Opcode = RecurrenceDescriptor::getOpcode( cast(ExpressionRecipes[1])->getRecurrenceKind()); - auto *ExtR = cast(ExpressionRecipes[0]); + auto *ExtR = cast(ExpressionRecipes[0]); auto *RedR = cast(ExpressionRecipes.back()); if (RedR->isPartialReduction()) return Ctx.TTI.getPartialReductionCost( Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF, - TargetTransformInfo::getPartialReductionExtendKind(ExtR->getOpcode()), + TargetTransformInfo::getPartialReductionExtendKind( + ExtR->getCastOpcode()), TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind, RedTy->isFloatingPointTy() ? 
std::optional{RedR->getFastMathFlags()} : std::nullopt); @@ -3061,22 +3043,22 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, case ExpressionTypes::ExtMulAccReduction: { auto *RedR = cast(ExpressionRecipes.back()); if (RedR->isPartialReduction()) { - auto *Ext0R = cast(ExpressionRecipes[0]); - auto *Ext1R = cast(ExpressionRecipes[1]); + auto *Ext0R = cast(ExpressionRecipes[0]); + auto *Ext1R = cast(ExpressionRecipes[1]); auto *Mul = cast(ExpressionRecipes[2]); return Ctx.TTI.getPartialReductionCost( Opcode, Ctx.Types.inferScalarType(getOperand(0)), Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF, TargetTransformInfo::getPartialReductionExtendKind( - Ext0R->getOpcode()), + Ext0R->getCastOpcode()), TargetTransformInfo::getPartialReductionExtendKind( - Ext1R->getOpcode()), + Ext1R->getCastOpcode()), Mul->getOpcode(), Ctx.CostKind, RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()} : std::nullopt); } return Ctx.TTI.getMulAccReductionCost( - cast(ExpressionRecipes.front())->getOpcode() == + cast(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, Opcode, RedTy, SrcVecTy, Ctx.CostKind); } @@ -3123,7 +3105,7 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, getOperand(0)->printAsOperand(O, SlotTracker); Red->printFlags(O); - auto *Ext0 = cast(ExpressionRecipes[0]); + auto *Ext0 = cast(ExpressionRecipes[0]); O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " << *Ext0->getResultType(); if (Red->isConditional()) { @@ -3143,11 +3125,11 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, Mul->printFlags(O); O << "("; getOperand(0)->printAsOperand(O, SlotTracker); - auto *Ext0 = cast(ExpressionRecipes[0]); + auto *Ext0 = cast(ExpressionRecipes[0]); O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " << *Ext0->getResultType() << "), ("; getOperand(1)->printAsOperand(O, SlotTracker); - auto *Ext1 = cast(ExpressionRecipes[1]); + auto *Ext1 = 
cast(ExpressionRecipes[1]); O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " << *Ext1->getResultType() << ")"; if (Red->isConditional()) { @@ -3173,7 +3155,7 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, O << "("; getOperand(0)->printAsOperand(O, SlotTracker); if (IsExtended) { - auto *Ext0 = cast(ExpressionRecipes[0]); + auto *Ext0 = cast(ExpressionRecipes[0]); O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " << *Ext0->getResultType() << "), ("; } else { @@ -3181,7 +3163,7 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, } getOperand(1)->printAsOperand(O, SlotTracker); if (IsExtended) { - auto *Ext1 = cast(ExpressionRecipes[1]); + auto *Ext1 = cast(ExpressionRecipes[1]); O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " << *Ext1->getResultType() << ")"; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 262f4798b3d63..f43e45b14bc74 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -125,7 +125,7 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPIRFlags(*CI), *VPI, CI->getDebugLoc()); } } else if (auto *CI = dyn_cast(Inst)) { - NewRecipe = new VPWidenCastRecipe( + NewRecipe = VPInstructionWithType::createWide( CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI, VPIRFlags(*CI), VPIRMetadata(*CI)); } else { @@ -1260,8 +1260,7 @@ static std::optional> getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { return TypeSwitch>>(R) - .Case( + .Case( [](auto *I) { return std::make_pair(false, I->getOpcode()); }) .Case([](const VPWidenIntrinsicRecipe *I) { return std::make_pair(true, I->getVectorIntrinsicID()); @@ -1390,15 +1389,25 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { Def->replaceAllUsesWith(A); } else { // Don't replace a non-widened cast recipe with a 
widened cast. - if (!isa(Def)) + auto *VPIT = dyn_cast(Def); + if (!VPIT || VPIT->isSingleScalar()) return; if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue())) ? Instruction::SExt : Instruction::ZExt; - auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A, - TruncTy); + VPSingleDefRecipe *Ext; + if (vputils::isSingleScalar(Def)) { + Ext = new VPInstructionWithType( + Instruction::CastOps(ExtOpcode), {A}, TruncTy, + VPIRFlags::getDefaultFlags(ExtOpcode), {}, Def->getDebugLoc()); + Builder.getInsertBlock()->insert(Ext, Builder.getInsertPoint()); + } else { + Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A, + TruncTy); + } + if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) { // UnderlyingExt has distinct return type, used to retain legacy cost. Ext->setUnderlyingValue(UnderlyingExt); @@ -2081,7 +2090,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, auto *NewStep = Plan.getConstantInt(NewIVTy, 1); WideIV->setStepValue(NewStep); - auto *NewBTC = new VPWidenCastRecipe( + auto *NewBTC = VPInstructionWithType::createWide( Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy, nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc)); Plan.getVectorPreheader()->appendRecipe(NewBTC); @@ -2571,13 +2580,13 @@ void VPlanTransforms::truncateToMinimalBitwidths( // cannot use RAUW after creating a new truncate, as this would could make // other uses have different types for their operands, making them invalidly // typed. - DenseMap ProcessedTruncs; + DenseMap ProcessedTruncs; VPTypeAnalysis TypeInfo(Plan); VPBasicBlock *PH = Plan.getVectorPreheader(); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_deep(Plan.getVectorLoopRegion()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - if (!isa(&R)) continue; @@ -2591,7 +2600,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( // type. 
Skip those here, after incrementing NumProcessedRecipes. Also // skip casts which do not need to be handled explicitly here, as // redundant casts will be removed during recipe simplification. - if (isa(&R)) + if (isa(&R)) continue; Type *OldResTy = TypeInfo.inferScalarType(ResultVPV); @@ -2610,7 +2619,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( if (OldResSizeInBits != NewResSizeInBits && !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) { // Extend result to original width. - auto *Ext = new VPWidenCastRecipe( + auto *Ext = VPInstructionWithType::createWide( Instruction::ZExt, ResultVPV, OldResTy, nullptr, VPIRFlags::getDefaultFlags(Instruction::ZExt)); Ext->insertAfter(&R); @@ -2647,7 +2656,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( Builder.setInsertPoint(PH); else Builder.setInsertPoint(&R); - VPWidenCastRecipe *NewOp = + VPInstructionWithType *NewOp = Builder.createWidenCast(Instruction::Trunc, Op, NewResTy); ProcessedIter->second = NewOp; R.setOperand(Idx, NewOp); @@ -4309,7 +4318,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, InstructionCost ExtRedCost = InstructionCost::getInvalid(); InstructionCost ExtCost = - cast(VecOp)->computeCost(VF, Ctx); + cast(VecOp)->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); assert(!RedTy->isFloatingPointTy() && @@ -4324,12 +4333,12 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VPValue *A; // Match reduce(ext)). 
- if (match(VecOp, m_Isa(m_ZExtOrSExt(m_VPValue(A)))) && + if (match(VecOp, m_Isa(m_ZExtOrSExt(m_VPValue(A)))) && IsExtendedRedValidAndClampRange( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()), - cast(VecOp)->getOpcode(), + cast(VecOp)->getCastOpcode(), Ctx.Types.inferScalarType(A))) - return new VPExpressionRecipe(cast(VecOp), Red); + return new VPExpressionRecipe(cast(VecOp), Red); return nullptr; } @@ -4357,8 +4366,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Clamp the range if using multiply-accumulate-reduction is profitable. auto IsMulAccValidAndClampRange = - [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1, - VPWidenCastRecipe *OuterExt) -> bool { + [&](VPWidenRecipe *Mul, VPInstructionWithType *Ext0, + VPInstructionWithType *Ext1, + VPInstructionWithType *OuterExt) -> bool { return LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -4414,13 +4424,13 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // creates two uniform extends that can more easily be matched by the rest of // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all // replaced with the new extend of the constant. - auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA, - VPWidenCastRecipe *&ExtB, + auto ExtendAndReplaceConstantOp = [&Ctx](VPInstructionWithType *ExtA, + VPInstructionWithType *&ExtB, VPValue *&ValB, VPWidenRecipe *Mul) { if (!ExtA || ExtB || !isa(ValB)) return; Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0)); - Instruction::CastOps ExtOpc = ExtA->getOpcode(); + Instruction::CastOps ExtOpc = ExtA->getCastOpcode(); const APInt *Const; if (!match(ValB, m_APInt(Const)) || !llvm::canConstantBeExtended( @@ -4441,8 +4451,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Try to match reduce.add(mul(...)). 
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { - auto *RecipeA = dyn_cast(A); - auto *RecipeB = dyn_cast(B); + auto *RecipeA = dyn_cast(A); + auto *RecipeB = dyn_cast(B); auto *Mul = cast(VecOp); // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const))) @@ -4468,10 +4478,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Match reduce.add(ext(mul(A, B))). if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) { - auto *Ext = cast(VecOp); + auto *Ext = cast(VecOp); auto *Mul = cast(Ext->getOperand(0)); - auto *Ext0 = dyn_cast(A); - auto *Ext1 = dyn_cast(B); + auto *Ext0 = dyn_cast(A); + auto *Ext1 = dyn_cast(B); // reduce.add(ext(mul(ext, const))) // -> reduce.add(ext(mul(ext, ext(const)))) @@ -4487,16 +4497,16 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && Ext0->getOpcode() == Ext1->getOpcode() && IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) { - auto *NewExt0 = new VPWidenCastRecipe( + auto *NewExt0 = VPInstructionWithType::createWide( Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr, *Ext0, *Ext0, Ext0->getDebugLoc()); NewExt0->insertBefore(Ext0); - VPWidenCastRecipe *NewExt1 = NewExt0; + VPInstructionWithType *NewExt1 = NewExt0; if (Ext0 != Ext1) { - NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0), - Ext->getResultType(), nullptr, *Ext1, - *Ext1, Ext1->getDebugLoc()); + NewExt1 = VPInstructionWithType::createWide( + Ext1->getOpcode(), Ext1->getOperand(0), Ext->getResultType(), + nullptr, *Ext1, *Ext1, Ext1->getDebugLoc()); NewExt1->insertBefore(Ext1); } Mul->setOperand(0, NewExt0); @@ -5157,7 +5167,7 @@ static bool canNarrowOps(ArrayRef Ops, bool IsScalable) { if (!WideMember0) return false; for (VPValue *V : Ops) { - if (!isa(V)) + if (!isa(V)) return false; auto *R = cast(V); if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0)) @@ -5250,7 
+5260,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl &NarrowedOps) { if (isAlreadyNarrow(V)) return V; - if (isa(R)) { + if (isa(R)) { auto *WideMember0 = cast(R); for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) WideMember0->setOperand( @@ -5894,8 +5904,8 @@ optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op, // -> reduce.add(mul(ext(A), ext(trunc(C)))) const APInt *Const; if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) { - auto *ExtA = cast(Op->getOperand(0)); - Instruction::CastOps ExtOpc = ExtA->getOpcode(); + auto *ExtA = cast(Op->getOperand(0)); + Instruction::CastOps ExtOpc = ExtA->getCastOpcode(); Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0)); if (!Op->hasOneUse() || !llvm::canConstantBeExtended( @@ -5916,9 +5926,9 @@ optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op, if (match(Op, m_WidenIntrinsic(m_Sub( m_ZExtOrSExt(m_VPValue(X)), m_ZExtOrSExt(m_VPValue(Y)))))) { auto *Sub = Op->getOperand(0)->getDefiningRecipe(); - auto *Ext = cast(Sub->getOperand(0)); + auto *Ext = cast(Sub->getOperand(0)); assert(Ext->getOpcode() == - cast(Sub->getOperand(1))->getOpcode() && + cast(Sub->getOperand(1))->getOpcode() && "Expected both the LHS and RHS extends to be the same"); bool IsSigned = Ext->getOpcode() == Instruction::SExt; VPBuilder Builder(Op); @@ -5942,21 +5952,21 @@ optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op, // TODO: Support this optimization for float types. 
if (match(Op, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()), m_ZExtOrSExt(m_VPValue()))))) { - auto *Ext = cast(Op); + auto *Ext = cast(Op); auto *Mul = cast(Ext->getOperand(0)); - auto *MulLHS = cast(Mul->getOperand(0)); - auto *MulRHS = cast(Mul->getOperand(1)); + auto *MulLHS = cast(Mul->getOperand(0)); + auto *MulRHS = cast(Mul->getOperand(1)); if (!Mul->hasOneUse() || (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) || MulLHS->getOpcode() != MulRHS->getOpcode()) return Op; VPBuilder Builder(Mul); - Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(), + Mul->setOperand(0, Builder.createWidenCast(MulLHS->getCastOpcode(), MulLHS->getOperand(0), Ext->getResultType())); Mul->setOperand(1, MulLHS == MulRHS ? Mul->getOperand(0) - : Builder.createWidenCast(MulRHS->getOpcode(), + : Builder.createWidenCast(MulRHS->getCastOpcode(), MulRHS->getOperand(0), Ext->getResultType())); return Mul; @@ -5972,7 +5982,7 @@ createPartialReductionExpression(VPReductionRecipe *Red) { // reduce.[f]add(ext(op)) // -> VPExpressionRecipe(op, red) if (match(VecOp, m_WidenAnyExtend(m_VPValue()))) - return new VPExpressionRecipe(cast(VecOp), Red); + return new VPExpressionRecipe(cast(VecOp), Red); // reduce.[f]add([f]mul(ext(a), ext(b))) // -> VPExpressionRecipe(a, b, mul, red) @@ -5980,8 +5990,8 @@ createPartialReductionExpression(VPReductionRecipe *Red) { match(VecOp, m_Mul(m_ZExtOrSExt(m_VPValue()), m_ZExtOrSExt(m_VPValue())))) { auto *Mul = cast(VecOp); - auto *ExtA = cast(Mul->getOperand(0)); - auto *ExtB = cast(Mul->getOperand(1)); + auto *ExtA = cast(Mul->getOperand(0)); + auto *ExtB = cast(Mul->getOperand(1)); return new VPExpressionRecipe(ExtA, ExtB, Mul, Red); } @@ -5991,8 +6001,8 @@ createPartialReductionExpression(VPReductionRecipe *Red) { m_ZExtOrSExt(m_VPValue()))))) { auto *Sub = cast(VecOp); auto *Mul = cast(Sub->getOperand(1)); - auto *ExtA = cast(Mul->getOperand(0)); - auto *ExtB = cast(Mul->getOperand(1)); + auto *ExtA = cast(Mul->getOperand(0)); + 
auto *ExtB = cast(Mul->getOperand(1)); return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red); } @@ -6133,8 +6143,8 @@ getPartialReductionLinkCost(VPCostContext &CostCtx, CostCtx.CostKind, Flags); } -static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) { - return TTI::getPartialReductionExtendKind(Cast->getOpcode()); +static ExtendKind getPartialReductionExtendKind(VPInstructionWithType *Cast) { + return TTI::getPartialReductionExtendKind(Cast->getCastOpcode()); } /// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction @@ -6170,8 +6180,8 @@ matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op, m_WidenAnyExtend(m_VPValue(Y))))))) { auto *Abs = cast(Op); auto *Sub = cast(Abs->getOperand(0)); - auto *LHSExt = cast(Sub->getOperand(0)); - auto *RHSExt = cast(Sub->getOperand(1)); + auto *LHSExt = cast(Sub->getOperand(0)); + auto *RHSExt = cast(Sub->getOperand(1)); Type *LHSInputType = TypeInfo.inferScalarType(X); Type *RHSInputType = TypeInfo.inferScalarType(Y); if (LHSInputType != RHSInputType || @@ -6187,7 +6197,7 @@ matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op, std::optional OuterExtKind; if (match(Op, m_WidenAnyExtend(m_VPValue()))) { - auto *CastRecipe = cast(Op); + auto *CastRecipe = cast(Op); VPValue *CastSource = CastRecipe->getOperand(0); OuterExtKind = getPartialReductionExtendKind(CastRecipe); if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) || @@ -6231,21 +6241,21 @@ matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op, if (!match(LHS, m_WidenAnyExtend(m_VPValue()))) return std::nullopt; - auto *LHSCast = cast(LHS); + auto *LHSCast = cast(LHS); Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0)); ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast); // The RHS of the operation can be an extend or a constant integer. 
const APInt *RHSConst = nullptr; - VPWidenCastRecipe *RHSCast = nullptr; + VPInstructionWithType *RHSCast = nullptr; if (match(RHS, m_WidenAnyExtend(m_VPValue()))) - RHSCast = cast(RHS); + RHSCast = cast(RHS); else if (!match(RHS, m_APInt(RHSConst)) || !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind)) return std::nullopt; // The outer extend kind must match the inner extends for folding. - for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast}) + for (VPInstructionWithType *Cast : {LHSCast, RHSCast}) if (Cast && OuterExtKind && getPartialReductionExtendKind(Cast) != OuterExtKind) return std::nullopt; @@ -6362,9 +6372,11 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan, // something that isn't another partial reduction. This is because the // extends are intended to be lowered along with the reduction itself. auto ExtendUsersValid = [&](VPValue *Ext) { - return !isa(Ext) || all_of(Ext->users(), [&](VPUser *U) { - return PartialReductionOps.contains(cast(U)); - }); + auto *VPI = dyn_cast(Ext); + return !VPI || !Instruction::isCast(VPI->getOpcode()) || + all_of(Ext->users(), [&](VPUser *U) { + return PartialReductionOps.contains(cast(U)); + }); }; auto IsProfitablePartialReductionChainForVF = @@ -6386,7 +6398,8 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan, if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None) RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx); for (VPValue *Op : ExtendedOp.ExtendsUser->operands()) - if (auto *Extend = dyn_cast(Op)) + if (auto *Extend = dyn_cast(Op); + Extend && Instruction::isCast(Extend->getOpcode())) RegularCost += Extend->computeCost(VF, CostCtx); } return PartialCost.isValid() && PartialCost < RegularCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index a60b490a69ce6..364e172c423c2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -86,8 +86,10 @@ bool 
vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { /// Returns true if \p R propagates poison from any operand to its result. static bool propagatesPoisonFromRecipeOp(const VPRecipeBase *R) { return TypeSwitch(R) - .Case( - [](const VPRecipeBase *) { return true; }) + .Case([](const VPRecipeBase *) { return true; }) + .Case([](const VPInstructionWithType *R) { + return Instruction::isCast(R->getOpcode()); + }) .Case([](const VPReplicateRecipe *Rep) { // GEP and casts propagate poison from all operands. unsigned Opcode = Rep->getOpcode(); @@ -380,10 +382,15 @@ bool vputils::isSingleScalar(const VPValue *VPV) { return preservesUniformity(WidenR->getOpcode()) && all_of(WidenR->operands(), isSingleScalar); } - if (auto *VPI = dyn_cast(VPV)) + if (auto *VPI = dyn_cast(VPV)) { + // VPInstructionWithType carries an explicit IsSingleScalar flag that + // takes precedence over uniformity-based inference. + if (auto *VPIT = dyn_cast(VPI)) + return VPIT->isSingleScalar(); return VPI->isSingleScalar() || VPI->isVectorToScalar() || (preservesUniformity(VPI->getOpcode()) && all_of(VPI->operands(), isSingleScalar)); + } if (auto *RR = dyn_cast(VPV)) return !RR->isPartialReduction(); if (isa( @@ -432,10 +439,6 @@ bool vputils::isUniformAcrossVFsAndUFs(const VPValue *V) { return preservesUniformity(VPI->getOpcode()) && all_of(VPI->operands(), isUniformAcrossVFsAndUFs); }) - .Case([](const VPWidenCastRecipe *R) { - // A cast is uniform according to its operand. - return isUniformAcrossVFsAndUFs(R->getOperand(0)); - }) .Default([](const VPRecipeBase *) { // A value is considered non-uniform // unless proven otherwise. 
return false; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index 31a1d95dedd3c..4f470a48ad9d8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -731,22 +731,22 @@ define void @force_branch_cost(ptr readonly %src, ptr %dst) { ; COST1-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 2 ; COST1-NEXT: [[TMP40:%.*]] = insertelement <4 x i8> [[TMP39]], i8 [[TMP36]], i32 3 ; COST1-NEXT: [[TMP41:%.*]] = zext <4 x i8> [[TMP32]] to <4 x i32> -; COST1-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP41]], i64 0 -; COST1-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP41]], i64 1 -; COST1-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP41]], i64 2 -; COST1-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP41]], i64 3 ; COST1-NEXT: [[TMP46:%.*]] = zext <4 x i8> [[TMP40]] to <4 x i32> -; COST1-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP46]], i64 0 -; COST1-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[TMP46]], i64 1 -; COST1-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP46]], i64 2 -; COST1-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP46]], i64 3 +; COST1-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP41]], i64 0 ; COST1-NEXT: store i32 [[TMP44]], ptr [[NEXT_GEP]], align 4, !alias.scope [[META22:![0-9]+]], !noalias [[META19]] +; COST1-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP41]], i64 1 ; COST1-NEXT: store i32 [[TMP49]], ptr [[NEXT_GEP2]], align 4, !alias.scope [[META22]], !noalias [[META19]] +; COST1-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP41]], i64 2 ; COST1-NEXT: store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4, !alias.scope [[META22]], !noalias [[META19]] +; COST1-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP41]], i64 3 ; COST1-NEXT: store 
i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4, !alias.scope [[META22]], !noalias [[META19]] +; COST1-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP46]], i64 0 ; COST1-NEXT: store i32 [[TMP45]], ptr [[NEXT_GEP5]], align 4, !alias.scope [[META22]], !noalias [[META19]] +; COST1-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[TMP46]], i64 1 ; COST1-NEXT: store i32 [[TMP50]], ptr [[NEXT_GEP6]], align 4, !alias.scope [[META22]], !noalias [[META19]] +; COST1-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP46]], i64 2 ; COST1-NEXT: store i32 [[TMP47]], ptr [[NEXT_GEP7]], align 4, !alias.scope [[META22]], !noalias [[META19]] +; COST1-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP46]], i64 3 ; COST1-NEXT: store i32 [[TMP48]], ptr [[NEXT_GEP8]], align 4, !alias.scope [[META22]], !noalias [[META19]] ; COST1-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 24 ; COST1-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 24 @@ -844,12 +844,12 @@ define void @force_branch_cost(ptr readonly %src, ptr %dst) { ; COST10-NEXT: [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP16]], i32 3 ; COST10-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32> ; COST10-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP21]], i64 0 -; COST10-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i64 1 -; COST10-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i64 2 -; COST10-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i64 3 ; COST10-NEXT: store i32 [[TMP24]], ptr [[NEXT_GEP]], align 4, !alias.scope [[META22:![0-9]+]], !noalias [[META19]] +; COST10-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i64 1 ; COST10-NEXT: store i32 [[TMP25]], ptr [[NEXT_GEP2]], align 4, !alias.scope [[META22]], !noalias [[META19]] +; COST10-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i64 2 ; COST10-NEXT: store i32 [[TMP22]], ptr [[NEXT_GEP3]], align 4, !alias.scope [[META22]], !noalias [[META19]] +; COST10-NEXT: 
[[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i64 3 ; COST10-NEXT: store i32 [[TMP23]], ptr [[NEXT_GEP4]], align 4, !alias.scope [[META22]], !noalias [[META19]] ; COST10-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 24 ; COST10-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 24 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 9e8a95b6b9a47..3c3d808342cc2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -608,24 +608,24 @@ define void@sext_sub_nsw_for_address(ptr %base, i64 %n, ptr %src) #0 { ; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <2 x i32> zeroinitializer, [[STEP_ADD_2]] ; CHECK-NEXT: [[TMP42:%.*]] = sub nsw <2 x i32> zeroinitializer, [[STEP_ADD_3]] ; CHECK-NEXT: [[TMP43:%.*]] = sext <2 x i32> [[TMP39]] to <2 x i64> -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i64> [[TMP43]], i64 0 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i64> [[TMP43]], i64 1 ; CHECK-NEXT: [[TMP46:%.*]] = sext <2 x i32> [[TMP40]] to <2 x i64> -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i64> [[TMP46]], i64 0 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i64> [[TMP46]], i64 1 ; CHECK-NEXT: [[TMP49:%.*]] = sext <2 x i32> [[TMP41]] to <2 x i64> -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x i64> [[TMP49]], i64 0 -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x i64> [[TMP49]], i64 1 ; CHECK-NEXT: [[TMP52:%.*]] = sext <2 x i32> [[TMP42]] to <2 x i64> -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <2 x i64> [[TMP52]], i64 0 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i64> [[TMP52]], i64 1 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i64> [[TMP43]], i64 0 ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i64> [[TMP43]], i64 1 ; CHECK-NEXT: [[TMP56:%.*]] = 
getelementptr double, ptr [[BASE]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i64> [[TMP46]], i64 0 ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP47]] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i64> [[TMP46]], i64 1 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x i64> [[TMP49]], i64 0 ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x i64> [[TMP49]], i64 1 ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP51]] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <2 x i64> [[TMP52]], i64 0 ; CHECK-NEXT: [[TMP61:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP53]] +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i64> [[TMP52]], i64 1 ; CHECK-NEXT: [[TMP62:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP54]] ; CHECK-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP55]], align 8, !alias.scope [[META17:![0-9]+]] ; CHECK-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP56]], align 8, !alias.scope [[META17]] @@ -687,8 +687,8 @@ define void@sext_sub_nsw_for_address(ptr %base, i64 %n, ptr %src) #0 { ; CHECK-NEXT: [[TMP89:%.*]] = sub nsw <2 x i32> zeroinitializer, [[VEC_IND24]] ; CHECK-NEXT: [[TMP90:%.*]] = sext <2 x i32> [[TMP89]] to <2 x i64> ; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i64> [[TMP90]], i64 0 -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i64> [[TMP90]], i64 1 ; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP91]] +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i64> [[TMP90]], i64 1 ; CHECK-NEXT: [[TMP94:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP92]] ; CHECK-NEXT: [[TMP95:%.*]] = load double, ptr [[TMP93]], align 8, !alias.scope [[META17]] ; CHECK-NEXT: [[TMP96:%.*]] = load double, ptr [[TMP94]], align 8, !alias.scope [[META17]] diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index cba9cdaa66770..9ba7efa2660ea 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -181,17 +181,17 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i64 0 ; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i64 1 ; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i64 2 ; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i64 3 ; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: store i64 0, ptr [[B]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll index d751d39446023..d3d9de99073fa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu" ;; registers required for a when trying to maximize ;; vector bandwidth with SVE. -; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext ir<%load.in> to fp128 +; CHECK: Cost of Invalid for VF vscale x 2: EMIT ir<%load.ext> = fpext ir<%load.in> to fp128 define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) { ; CHECK-LABEL: define void @load_ext_trunc_store( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll index 79566fb165bc6..843c70f35ddf9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll @@ -9,14 +9,14 @@ target triple = "aarch64-unknown-linux-gnu" define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 { ; CHECK-COST-LABEL: LV: Checking a loop in 'zext_i8_i16' ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = zext i8 %0 to i32 -; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext 
ir<%0> to i16 -; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 2: EMIT ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 4: EMIT ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 8: EMIT ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 2 for VF 16: EMIT ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 1: EMIT ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 2: EMIT ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 4: EMIT ir<%conv> = zext ir<%0> to i16 +; CHECK-COST: Cost of 0 for VF vscale x 8: EMIT ir<%conv> = zext ir<%0> to i16 ; CHECK-LABEL: define void @zext_i8_i16 ; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -89,14 +89,14 @@ exit: define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 { ; CHECK-COST-LABEL: LV: Checking a loop in 'sext_i8_i16' -; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 -; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 2: EMIT ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 4: EMIT ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF 8: EMIT ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 2 for VF 16: EMIT 
ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 1: EMIT ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 2: EMIT ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 1 for VF vscale x 4: EMIT ir<%conv> = sext ir<%0> to i16 +; CHECK-COST: Cost of 0 for VF vscale x 8: EMIT ir<%conv> = sext ir<%0> to i16 ; CHECK-LABEL: define void @sext_i8_i16 ; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll index 668096feaf639..d3d6379ce8683 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll @@ -26,7 +26,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = vector-pointer inbounds ir<%arrayidx> ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<{{.+}}> -; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 +; CHECK: Cost of 4 for VF 2: EMIT ir<%conv> = sext ir<%1> to i32 ; CHECK: Cost of 20 for VF 2: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1> ; CHECK: Cost of 26 for VF 2: WIDEN ir<%conv6> = add ir<%1>, ir<%0> ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<{{.+}}> @@ -42,7 +42,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = vector-pointer inbounds ir<%arrayidx> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<{{.+}}> -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 +; CHECK: Cost of 0 for VF 4: EMIT ir<%conv> = sext ir<%1> to i32 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%cmp2> = icmp sgt 
ir<%conv>, ir<%conv1> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%conv6> = add ir<%1>, ir<%0> ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<{{.+}}> @@ -58,7 +58,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = vector-pointer inbounds ir<%arrayidx> ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<{{.+}}> -; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%1> to i32 +; CHECK: Cost of 2 for VF 8: EMIT ir<%conv> = sext ir<%1> to i32 ; CHECK: Cost of 36 for VF 8: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1> ; CHECK: Cost of 2 for VF 8: WIDEN ir<%conv6> = add ir<%1>, ir<%0> ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<{{.+}}> @@ -140,15 +140,15 @@ for.inc: ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]> ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> ; CHECK: Cost of 18 for VF 2: WIDEN ir<%0> = load vp<[[VEC_PTR]]> -; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 +; CHECK: Cost of 4 for VF 2: EMIT ir<%conv1> = sext ir<%0> to i32 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<[[VEC_PTR2]]> -; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 +; CHECK: Cost of 4 for VF 2: EMIT ir<%conv3> = sext ir<%1> to i32 ; CHECK: Cost of 26 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1> ; CHECK: Cost of 18 for VF 2: WIDEN ir<%shr> = ashr ir<%mul>, ir<7> ; CHECK: Cost of 0 for VF 2: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127> ; CHECK: Cost of 22 for VF 2: WIDEN ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127> -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv4> = trunc ir<%spec.select.i> 
to i8 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> @@ -171,15 +171,15 @@ for.inc: ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]> ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load vp<[[VEC_PTR1]]> -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 +; CHECK: Cost of 0 for VF 4: EMIT ir<%conv1> = sext ir<%0> to i32 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<[[VEC_PTR2]]> -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 +; CHECK: Cost of 0 for VF 4: EMIT ir<%conv3> = sext ir<%1> to i32 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%shr> = ashr ir<%mul>, ir<7> ; CHECK: Cost of 0 for VF 4: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127> -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 +; CHECK: Cost of 0 for VF 4: EMIT ir<%conv4> = trunc ir<%spec.select.i> to i8 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> @@ -202,15 +202,15 @@ for.inc: ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]> ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep> ; CHECK: Cost of 2 for VF 8: WIDEN ir<%0> = load vp<[[VEC_PTR1]]> -; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 +; CHECK: Cost 
of 2 for VF 8: EMIT ir<%conv1> = sext ir<%0> to i32 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<[[VEC_PTR2]]> -; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 +; CHECK: Cost of 2 for VF 8: EMIT ir<%conv3> = sext ir<%1> to i32 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1> ; CHECK: Cost of 4 for VF 8: WIDEN ir<%shr> = ashr ir<%mul>, ir<7> ; CHECK: Cost of 0 for VF 8: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127> ; CHECK: Cost of 4 for VF 8: WIDEN ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127> -; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 +; CHECK: Cost of 2 for VF 8: EMIT ir<%conv4> = trunc ir<%spec.select.i> to i8 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}} @@ -233,15 +233,15 @@ for.inc: ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]> ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> ; CHECK: Cost of 2 for VF 16: WIDEN ir<%0> = load vp<[[VEC_PTR]]> -; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32 +; CHECK: Cost of 6 for VF 16: EMIT ir<%conv1> = sext ir<%0> to i32 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>.2 ; CHECK: Cost of 2 for VF 16: WIDEN ir<%1> = load vp<[[VEC_PTR1]]> -; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32 +; CHECK: Cost of 6 for VF 16: EMIT ir<%conv3> = sext ir<%1> to i32 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1> ; CHECK: Cost of 8 for VF 16: WIDEN ir<%shr> = ashr ir<%mul>, ir<7> ; CHECK: Cost of 0 for VF 16: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127> ; CHECK: Cost of 8 for VF 16: 
WIDEN ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127> -; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8 +; CHECK: Cost of 6 for VF 16: EMIT ir<%conv4> = trunc ir<%spec.select.i> to i8 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/sve2-histcnt-vplan.ll index 00d74e27918ac..e17236ee5a02a 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/sve2-histcnt-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/sve2-histcnt-vplan.ll @@ -29,7 +29,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: [[STEPS:vp.*]] = SCALAR-STEPS [[IV]], ir<1>, [[VF]] ; CHECK-NEXT: CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]] ; CHECK-NEXT: CLONE [[IDX:.*]] = load [[GEP_IDX]] -; CHECK-NEXT: CLONE [[EXT_IDX:.*]] = zext [[IDX]] +; CHECK-NEXT: EMIT-SCALAR [[EXT_IDX:.*]] = zext [[IDX]] ; CHECK-NEXT: CLONE [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]] ; CHECK-NEXT: CLONE [[HISTVAL:.*]] = load [[GEP_BUCKET]] ; CHECK-NEXT: CLONE [[UPDATE:.*]] = add nsw [[HISTVAL]], ir<1> @@ -79,7 +79,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]] ; CHECK-NEXT: [[VECP_IDX:vp.*]] = vector-pointer inbounds [[GEP_IDX]] ; CHECK-NEXT: WIDEN [[IDX:.*]] = load [[VECP_IDX]] -; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64 +; CHECK-NEXT: EMIT [[EXT_IDX:.*]] = zext [[IDX]] to i64 ; CHECK-NEXT: WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]] ; CHECK-NEXT: WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1> ; CHECK-NEXT: EMIT [[IV_NEXT:.*]] = add nuw 
[[IV]], [[VFxUF]] diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing.ll index d1a2c1acf77ff..d0c7636eeb56f 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing.ll @@ -86,8 +86,8 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+do ; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a> ; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index> ; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b> -; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 -; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 +; CHECK-NEXT: EMIT ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: EMIT ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> ; CHECK-NEXT: PARTIAL-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<%mul>) ; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/widen-call-with-intrinsic-or-libfunc.ll index a672984426f53..357a163febc3e 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -26,7 +26,7 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer inbounds ir<%gep.src> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double +; CHECK-NEXT: EMIT ir<%conv> = fpext ir<%l> to double ; CHECK-NEXT: WIDEN-CALL ir<%s> = call fast @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64) ; CHECK-NEXT: 
REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> @@ -74,7 +74,7 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer inbounds ir<%gep.src> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double +; CHECK-NEXT: EMIT ir<%conv> = fpext ir<%l> to double ; CHECK-NEXT: WIDEN-INTRINSIC ir<%s> = call fast llvm.sin(ir<%conv>) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll index 32765c53efdc8..acdcfafed372e 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll @@ -31,7 +31,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[EVL_PHI]]> * ir<-1> ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>, vp<[[EVL]]> ; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX_PROM:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: EMIT-SCALAR ir<[[IDX_PROM:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_B:%.+]]> = getelementptr inbounds ir<[[B:%.+]]>, ir<[[IDX_PROM]]> ; CHECK-NEXT: vp<[[VEC_END_PTR_B:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_B]]>, vp<[[EVL]]> ; CHECK-NEXT: WIDEN ir<[[LOAD_B:%.+]]> = vp.load vp<[[VEC_END_PTR_B]]>, vp<[[EVL]]> diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-chains-vplan.ll 
b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-chains-vplan.ll index 1993a275228f7..23b477d15bc68 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-chains-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-chains-vplan.ll @@ -210,9 +210,9 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) { ; CHECK-NEXT: vp<[[VP5:%[0-9]+]]> = SCALAR-STEPS vp<[[VP4]]>, ir<1>, vp<[[VP0]]> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%base>, vp<[[VP5]]> ; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = first-order splice ir<%for.x>, vp<[[VP3]]> -; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[VP6]]> to i32 +; CHECK-NEXT: EMIT ir<%for.x.prev> = trunc vp<[[VP6]]> to i32 ; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = first-order splice ir<%for.y>, ir<%for.x.prev> -; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[VP7]]> to i64 +; CHECK-NEXT: EMIT ir<%for.y.i64> = sext vp<[[VP7]]> to i64 ; CHECK-NEXT: vp<[[VP8:%[0-9]+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VP8]]>, ir<%for.y.i64> ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]> @@ -299,9 +299,9 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) { ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VP5]]> ; CHECK-NEXT: EMIT vp<[[VP6]]> = shl ir<%l>, ir<1> ; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = first-order splice ir<%for.x>, vp<[[VP6]]> -; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[VP7]]> to i32 +; CHECK-NEXT: EMIT ir<%for.x.prev> = trunc vp<[[VP7]]> to i32 ; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = first-order splice ir<%for.y>, ir<%for.x.prev> -; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[VP8]]> to i64 +; CHECK-NEXT: EMIT ir<%for.y.i64> = sext vp<[[VP8]]> to i64 ; CHECK-NEXT: vp<[[VP9:%[0-9]+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN store vp<[[VP9]]>, ir<%for.y.i64> ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]> diff --git 
a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll index c2c1cfe18fadc..539443ebf7059 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll @@ -48,7 +48,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: Successor(s): loop.0 ; CHECK-EMPTY: ; CHECK-NEXT: loop.0: -; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[VP7]]> to i32 +; CHECK-NEXT: EMIT ir<%conv> = sext vp<[[VP7]]> to i32 ; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = first-order splice ir<%0>, ir<%conv> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: @@ -134,7 +134,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr, i32 %z) optsize { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32 +; CHECK-NEXT: EMIT ir<%recur.next> = sext ir<%y> to i32 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -230,7 +230,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = reduction-start-vector ir<1234>, ir<-1>, ir<1> -; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32 +; CHECK-NEXT: EMIT ir<%recur.next> = sext ir<%y> to i32 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -357,7 +357,7 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-NEXT: Successor(s): loop.0 ; CHECK-EMPTY: ; CHECK-NEXT: loop.0: -; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[VP7]]> to i32 +; CHECK-NEXT: EMIT ir<%conv> = sext vp<[[VP7]]> to i32 ; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = 
first-order splice ir<%0>, ir<%conv> ; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: @@ -380,7 +380,7 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-EMPTY: ; CHECK-NEXT: loop.2: ; CHECK-NEXT: WIDEN ir<%add.1> = add ir<%conv>, vp<[[VP9]]> -; CHECK-NEXT: WIDEN-CAST ir<%conv.lv.2> = sext vp<[[VP10]]> to i32 +; CHECK-NEXT: EMIT ir<%conv.lv.2> = sext vp<[[VP10]]> to i32 ; CHECK-NEXT: WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: @@ -472,7 +472,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32 +; CHECK-NEXT: EMIT ir<%recur.next> = sext ir<%y> to i32 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-metadata.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-metadata.ll index a0ad178b2851e..3084a32aff6d8 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-metadata.ll @@ -24,9 +24,9 @@ define void @test_widen_metadata(ptr noalias %A, ptr noalias %B, i32 %n) { ; CHECK-NEXT: CLONE ir<%gep.A> = getelementptr inbounds ir<%A>, vp<[[VP4]]> ; CHECK-NEXT: vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%gep.A> ; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VP5]]> (!tbaa !0) -; CHECK-NEXT: WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float (!fpmath !4) +; CHECK-NEXT: EMIT ir<%conv> = sitofp ir<%lv> to float (!fpmath !4) ; CHECK-NEXT: WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00> (!fpmath !4) -; CHECK-NEXT: WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32 +; CHECK-NEXT: EMIT ir<%conv.back> = fptosi ir<%mul> to i32 ; CHECK-NEXT: CLONE ir<%gep.B> = getelementptr inbounds ir<%B>, vp<[[VP4]]> ; 
CHECK-NEXT: vp<[[VP6:%[0-9]+]]> = vector-pointer inbounds ir<%gep.B> ; CHECK-NEXT: WIDEN store vp<[[VP6]]>, ir<%conv.back> (!tbaa !0) @@ -179,9 +179,9 @@ define void @test_widen_with_multiple_metadata(ptr noalias %A, ptr noalias %B, i ; CHECK-NEXT: CLONE ir<%gep.A> = getelementptr inbounds ir<%A>, vp<[[VP4]]> ; CHECK-NEXT: vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%gep.A> ; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VP5]]> (!tbaa !0) -; CHECK-NEXT: WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float +; CHECK-NEXT: EMIT ir<%conv> = sitofp ir<%lv> to float ; CHECK-NEXT: WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00> -; CHECK-NEXT: WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32 +; CHECK-NEXT: EMIT ir<%conv.back> = fptosi ir<%mul> to i32 ; CHECK-NEXT: CLONE ir<%gep.B> = getelementptr inbounds ir<%B>, vp<[[VP4]]> ; CHECK-NEXT: vp<[[VP6:%[0-9]+]]> = vector-pointer inbounds ir<%gep.B> ; CHECK-NEXT: WIDEN store vp<[[VP6]]>, ir<%conv.back> (!tbaa !0) diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll index 598874e282682..a0a5f17a4c37e 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll @@ -1095,7 +1095,7 @@ define i32 @print_mulacc_extended_const_lhs(ptr %start, ptr %end) { ; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[VP7]]> ; CHECK-NEXT: vp<[[VP8:%[0-9]+]]> = vector-pointer vp<%next.gep> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VP8]]> -; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32 +; CHECK-NEXT: EMIT ir<%l.ext> = zext ir<%l> to i32 ; CHECK-NEXT: EXPRESSION vp<[[VP9]]> = ir<%red> + reduce.add (mul ir<63>, ir<%l.ext>) ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]> ; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> @@ -1175,7 +1175,7 @@ define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) { ; 
CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[VP7]]> ; CHECK-NEXT: vp<[[VP8:%[0-9]+]]> = vector-pointer vp<%next.gep> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VP8]]> -; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: EMIT ir<%l.ext> = sext ir<%l> to i32 ; CHECK-NEXT: EXPRESSION vp<[[VP9]]> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>) ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]> ; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> @@ -1336,7 +1336,7 @@ define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[VP7]]> ; CHECK-NEXT: vp<[[VP8:%[0-9]+]]> = vector-pointer vp<%next.gep> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VP8]]> -; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: EMIT ir<%l.ext> = sext ir<%l> to i32 ; CHECK-NEXT: EMIT vp<[[VP9:%[0-9]+]]> = shl ir<%l.ext>, ir<7> ; CHECK-NEXT: EXPRESSION vp<[[VP10]]> = ir<%red> + reduce.add (vp<[[VP9]]> sext to i64) ; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]> @@ -1426,8 +1426,8 @@ define i64 @print_ext_mul_two_uses(i64 %n, ptr %a, i16 %b, i32 %c) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: WIDEN-CAST ir<%load.ext> = sext ir<%load> to i32 -; CHECK-NEXT: WIDEN-CAST ir<%load.ext.ext> = sext ir<%load.ext> to i64 +; CHECK-NEXT: EMIT ir<%load.ext> = sext ir<%load> to i32 +; CHECK-NEXT: EMIT ir<%load.ext.ext> = sext ir<%load.ext> to i64 ; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = compute-reduction-result (add, in-loop) vp<[[VP5]]> ; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = extract-last-part ir<%load.ext.ext> ; CHECK-NEXT: EMIT vp<%vector.recur.extract> = extract-last-lane vp<[[VP8]]> diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll index d9eaab8d9a000..d08f357ab0d6e 100644 --- 
a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll @@ -54,7 +54,7 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) { ; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<%bc.resume.val> from scalar.ph) ; CHECK-NEXT: IR %arrayidx = getelementptr inbounds float, ptr %y, i64 %iv ; CHECK-NEXT: IR %lv = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: IR %call = tail call float @llvm.sqrt.f32(float %lv) +; CHECK-NEXT: IR %call = tail call float @llvm.sqrt.f32(float %lv) #2 ; CHECK-NEXT: IR %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %iv ; CHECK-NEXT: IR store float %call, ptr %arrayidx2, align 4 ; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 @@ -982,7 +982,7 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%p>, vp<[[VP4]]> ; CHECK-NEXT: vp<[[VP5:%[0-9]+]]> = vector-pointer ir<%idx> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VP5]]> -; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l> to i64 +; CHECK-NEXT: EMIT ir<%zext> = zext nneg ir<%l> to i64 ; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = extract-last-part ir<%zext> ; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = extract-last-lane vp<[[VP6]]> ; CHECK-NEXT: CLONE store vp<[[VP7]]>, ir<%p1> diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll index 6580a3dacc21c..ef9f7fb52e0cd 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll @@ -12,15 +12,15 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nsw i32 %conv2, %conv ; CHECK: Cost of 3 for VF 2: WIDEN ir<%0> = load -; CHECK: Cost of 0 for 
VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = sext ir<%0> to i32 ; CHECK: Cost of 3 for VF 2: WIDEN ir<%1> = load -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv2> = sext ir<%1> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv2> = sext ir<%1> to i32 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load -; CHECK: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i32 +; CHECK: Cost of 1 for VF 4: EMIT ir<%conv> = sext ir<%0> to i32 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load -; CHECK: Cost of 1 for VF 4: WIDEN-CAST ir<%conv2> = sext ir<%1> to i32 +; CHECK: Cost of 1 for VF 4: EMIT ir<%conv2> = sext ir<%1> to i32 ; CHECK: Cost of 1 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv> ; CHECK: LV: Selecting VF: 4. entry: @@ -56,15 +56,15 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nsw i32 %conv2, %conv ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = sext ir<%0> to i32 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv2> = sext ir<%1> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv2> = sext ir<%1> to i32 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i32 +; CHECK: Cost of 0 for VF 4: EMIT ir<%conv> = sext ir<%0> to i32 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv2> = sext ir<%1> to i32 +; CHECK: Cost of 0 for VF 4: EMIT ir<%conv2> = sext ir<%1> to i32 ; CHECK: Cost of 1 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv> ; CHECK: LV: Selecting VF: 
4. entry: @@ -100,9 +100,9 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nsw i64 %conv2, %conv ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load -; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i64 +; CHECK: Cost of 1 for VF 2: EMIT ir<%conv> = sext ir<%0> to i64 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load -; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv2> = sext ir<%1> to i64 +; CHECK: Cost of 1 for VF 2: EMIT ir<%conv2> = sext ir<%1> to i64 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv> ; CHECK: LV: Selecting VF: 2. entry: @@ -139,7 +139,7 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul ir<%1>, ir<%0> -; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%mul> to i64 +; CHECK: Cost of 1 for VF 2: EMIT ir<%conv> = sext ir<%mul> to i64 ; CHECK: LV: Selecting VF: 2. 
entry: %cmp6.not = icmp eq i32 %N, 0 @@ -173,15 +173,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nuw nsw i32 %conv2, %conv ; CHECK: Cost of 3 for VF 2: WIDEN ir<%0> = load -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = zext ir<%0> to i32 ; CHECK: Cost of 3 for VF 2: WIDEN ir<%1> = load -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv2> = zext ir<%1> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv2> = zext ir<%1> to i32 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load -; CHECK: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i32 +; CHECK: Cost of 1 for VF 4: EMIT ir<%conv> = zext ir<%0> to i32 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load -; CHECK: Cost of 1 for VF 4: WIDEN-CAST ir<%conv2> = zext ir<%1> to i32 +; CHECK: Cost of 1 for VF 4: EMIT ir<%conv2> = zext ir<%1> to i32 ; CHECK: Cost of 1 for VF 4: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv> ; CHECK: LV: Selecting VF: 4. 
entry: @@ -217,15 +217,15 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nuw nsw i32 %conv2, %conv ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = zext ir<%0> to i32 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv2> = zext ir<%1> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv2> = zext ir<%1> to i32 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i32 +; CHECK: Cost of 0 for VF 4: EMIT ir<%conv> = zext ir<%0> to i32 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load -; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv2> = zext ir<%1> to i32 +; CHECK: Cost of 0 for VF 4: EMIT ir<%conv2> = zext ir<%1> to i32 ; CHECK: Cost of 1 for VF 4: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv> ; CHECK: LV: Selecting VF: 4. entry: @@ -261,9 +261,9 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nuw nsw i64 %conv2, %conv ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load -; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i64 +; CHECK: Cost of 1 for VF 2: EMIT ir<%conv> = zext ir<%0> to i64 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load -; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv2> = zext ir<%1> to i64 +; CHECK: Cost of 1 for VF 2: EMIT ir<%conv2> = zext ir<%1> to i64 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv> ; CHECK: LV: Selecting VF: 2. 
entry: @@ -300,7 +300,7 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul ir<%1>, ir<%0> -; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%mul> to i64 +; CHECK: Cost of 1 for VF 2: EMIT ir<%conv> = zext ir<%mul> to i64 ; CHECK: LV: Selecting VF: 2. entry: %cmp6.not = icmp eq i32 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll index 0e754d7ef5c44..c1d6c6fef2545 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -425,8 +425,11 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[BROADCAST_SPLAT]] to <4 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP0]], <4 x double> poison) diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index ad675f6efe0a0..72ba0845e244a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -295,36 +295,36 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[TMP24:%.*]] = lshr <16 x i32> [[TMP23]], splat (i32 1) ; CHECK-NEXT: [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8> ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i64 1 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i64 2 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i64 3 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i64 4 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i64 5 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i64 6 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i64 7 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i64 8 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i64 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i64 10 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i64 11 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i64 12 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i64 13 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i64 14 -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i64 15 ; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META18:![0-9]+]], !noalias [[META15]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i64 1 ; CHECK-NEXT: store i8 [[TMP41]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; 
CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i64 2 ; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i64 3 ; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i64 4 ; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i64 5 ; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i64 6 ; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i64 7 ; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i64 8 ; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i64 9 ; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i64 10 ; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i64 11 ; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i64 12 ; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; 
CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i64 13 ; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i64 14 ; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i64 15 ; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967184 diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll index 7019f37449c32..697bd799c8ecc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll @@ -37,16 +37,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK: Cost of 0 for VF 2: vp<[[VECP1:%.+]]> = vector-pointer inbounds ir<%arrayidx> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<[[VECP1]]> -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = zext ir<%0> to i32 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> ; CHECK: Cost of 0 for VF 2: vp<[[VECP2:%.+]]> = vector-pointer inbounds ir<%arrayidx2> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%1> = load vp<[[VECP2]]> -; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext ir<%1> to i32 +; CHECK: Cost of 0 for VF 2: EMIT ir<%conv3> = zext ir<%1> to i32 ; CHECK: Cost of 0 for VF 2: WIDEN ir<%conv4> = and ir<%sum.013>, ir<255> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add> = add ir<%conv>, ir<%conv4> ; CHECK: 
Cost of 1 for VF 2: WIDEN ir<%add5> = add ir<%add>, ir<%conv3> -; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[TRUNC:%.+]]> = trunc ir<%add5> to i8 -; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[EXT]]> = zext vp<[[TRUNC]]> to i32 +; CHECK: Cost of 0 for VF 2: EMIT vp<[[TRUNC:%.+]]> = trunc ir<%add5> to i8 +; CHECK: Cost of 0 for VF 2: EMIT vp<[[EXT]]> = zext vp<[[TRUNC]]> to i32 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> ; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll index 3063207e47b25..a7c76208e5084 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -44,72 +44,72 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 14 ; I64-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 15 ; I64-NEXT: [[TMP19:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I64-NEXT: [[TMP24:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I64-NEXT: [[TMP29:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I64-NEXT: [[TMP34:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I64-NEXT: [[TMP54:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX]] +; I64-NEXT: [[TMP55:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] +; I64-NEXT: [[TMP56:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] +; I64-NEXT: [[TMP57:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] +; I64-NEXT: [[TMP58:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]] +; I64-NEXT: [[TMP59:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] +; I64-NEXT: [[TMP60:%.*]] = getelementptr nusw { ptr, ptr, 
ptr }, ptr null, i32 [[TMP9]] +; I64-NEXT: [[TMP61:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I64-NEXT: [[TMP62:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] +; I64-NEXT: [[TMP63:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] +; I64-NEXT: [[TMP64:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] +; I64-NEXT: [[TMP65:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] +; I64-NEXT: [[TMP66:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]] +; I64-NEXT: [[TMP67:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]] +; I64-NEXT: [[TMP68:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]] +; I64-NEXT: [[TMP69:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP18]] +; I64-NEXT: [[TMP39:%.*]] = load ptr, ptr [[TMP54]], align 4 +; I64-NEXT: [[TMP40:%.*]] = load ptr, ptr [[TMP55]], align 4 +; I64-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP56]], align 4 +; I64-NEXT: [[TMP42:%.*]] = load ptr, ptr [[TMP57]], align 4 +; I64-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP58]], align 4 +; I64-NEXT: [[TMP44:%.*]] = load ptr, ptr [[TMP59]], align 4 +; I64-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP60]], align 4 +; I64-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP61]], align 4 +; I64-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP62]], align 4 +; I64-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP63]], align 4 +; I64-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP64]], align 4 +; I64-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP65]], align 4 +; I64-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP66]], align 4 +; I64-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP67]], align 4 +; I64-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP68]], align 4 +; I64-NEXT: [[TMP70:%.*]] = load ptr, ptr [[TMP69]], align 4 ; I64-NEXT: [[TMP20:%.*]] = extractelement <4 x double> [[TMP19]], i64 0 +; I64-NEXT: store double [[TMP20]], ptr [[TMP39]], align 4 ; I64-NEXT: [[TMP21:%.*]] 
= extractelement <4 x double> [[TMP19]], i64 1 +; I64-NEXT: store double [[TMP21]], ptr [[TMP40]], align 4 ; I64-NEXT: [[TMP22:%.*]] = extractelement <4 x double> [[TMP19]], i64 2 +; I64-NEXT: store double [[TMP22]], ptr [[TMP41]], align 4 ; I64-NEXT: [[TMP23:%.*]] = extractelement <4 x double> [[TMP19]], i64 3 -; I64-NEXT: [[TMP24:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I64-NEXT: store double [[TMP23]], ptr [[TMP42]], align 4 ; I64-NEXT: [[TMP25:%.*]] = extractelement <4 x double> [[TMP24]], i64 0 +; I64-NEXT: store double [[TMP25]], ptr [[TMP43]], align 4 ; I64-NEXT: [[TMP26:%.*]] = extractelement <4 x double> [[TMP24]], i64 1 +; I64-NEXT: store double [[TMP26]], ptr [[TMP44]], align 4 ; I64-NEXT: [[TMP27:%.*]] = extractelement <4 x double> [[TMP24]], i64 2 +; I64-NEXT: store double [[TMP27]], ptr [[TMP45]], align 4 ; I64-NEXT: [[TMP28:%.*]] = extractelement <4 x double> [[TMP24]], i64 3 -; I64-NEXT: [[TMP29:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I64-NEXT: store double [[TMP28]], ptr [[TMP46]], align 4 ; I64-NEXT: [[TMP30:%.*]] = extractelement <4 x double> [[TMP29]], i64 0 +; I64-NEXT: store double [[TMP30]], ptr [[TMP47]], align 4 ; I64-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP29]], i64 1 +; I64-NEXT: store double [[TMP31]], ptr [[TMP48]], align 4 ; I64-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP29]], i64 2 +; I64-NEXT: store double [[TMP32]], ptr [[TMP49]], align 4 ; I64-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP29]], i64 3 -; I64-NEXT: [[TMP34:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I64-NEXT: store double [[TMP33]], ptr [[TMP50]], align 4 ; I64-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP34]], i64 0 +; I64-NEXT: store double [[TMP35]], ptr [[TMP51]], align 4 ; I64-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP34]], i64 1 +; I64-NEXT: store double [[TMP36]], ptr [[TMP52]], align 4 ; I64-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP34]], 
i64 2 +; I64-NEXT: store double [[TMP37]], ptr [[TMP53]], align 4 ; I64-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP34]], i64 3 -; I64-NEXT: [[TMP39:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX]] -; I64-NEXT: [[TMP40:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] -; I64-NEXT: [[TMP41:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] -; I64-NEXT: [[TMP42:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] -; I64-NEXT: [[TMP43:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]] -; I64-NEXT: [[TMP44:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] -; I64-NEXT: [[TMP45:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] -; I64-NEXT: [[TMP46:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] -; I64-NEXT: [[TMP47:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] -; I64-NEXT: [[TMP48:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] -; I64-NEXT: [[TMP49:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] -; I64-NEXT: [[TMP50:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] -; I64-NEXT: [[TMP51:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]] -; I64-NEXT: [[TMP52:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]] -; I64-NEXT: [[TMP53:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]] -; I64-NEXT: [[TMP54:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP18]] -; I64-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4 -; I64-NEXT: [[TMP56:%.*]] = load ptr, ptr [[TMP40]], align 4 -; I64-NEXT: [[TMP57:%.*]] = load ptr, ptr [[TMP41]], align 4 -; I64-NEXT: [[TMP58:%.*]] = load ptr, ptr [[TMP42]], align 4 -; I64-NEXT: [[TMP59:%.*]] = load ptr, ptr [[TMP43]], align 4 -; I64-NEXT: [[TMP60:%.*]] = load ptr, ptr [[TMP44]], align 4 -; I64-NEXT: [[TMP61:%.*]] = load ptr, 
ptr [[TMP45]], align 4 -; I64-NEXT: [[TMP62:%.*]] = load ptr, ptr [[TMP46]], align 4 -; I64-NEXT: [[TMP63:%.*]] = load ptr, ptr [[TMP47]], align 4 -; I64-NEXT: [[TMP64:%.*]] = load ptr, ptr [[TMP48]], align 4 -; I64-NEXT: [[TMP65:%.*]] = load ptr, ptr [[TMP49]], align 4 -; I64-NEXT: [[TMP66:%.*]] = load ptr, ptr [[TMP50]], align 4 -; I64-NEXT: [[TMP67:%.*]] = load ptr, ptr [[TMP51]], align 4 -; I64-NEXT: [[TMP68:%.*]] = load ptr, ptr [[TMP52]], align 4 -; I64-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP53]], align 4 -; I64-NEXT: [[TMP70:%.*]] = load ptr, ptr [[TMP54]], align 4 -; I64-NEXT: store double [[TMP20]], ptr [[TMP55]], align 4 -; I64-NEXT: store double [[TMP21]], ptr [[TMP56]], align 4 -; I64-NEXT: store double [[TMP22]], ptr [[TMP57]], align 4 -; I64-NEXT: store double [[TMP23]], ptr [[TMP58]], align 4 -; I64-NEXT: store double [[TMP25]], ptr [[TMP59]], align 4 -; I64-NEXT: store double [[TMP26]], ptr [[TMP60]], align 4 -; I64-NEXT: store double [[TMP27]], ptr [[TMP61]], align 4 -; I64-NEXT: store double [[TMP28]], ptr [[TMP62]], align 4 -; I64-NEXT: store double [[TMP30]], ptr [[TMP63]], align 4 -; I64-NEXT: store double [[TMP31]], ptr [[TMP64]], align 4 -; I64-NEXT: store double [[TMP32]], ptr [[TMP65]], align 4 -; I64-NEXT: store double [[TMP33]], ptr [[TMP66]], align 4 -; I64-NEXT: store double [[TMP35]], ptr [[TMP67]], align 4 -; I64-NEXT: store double [[TMP36]], ptr [[TMP68]], align 4 -; I64-NEXT: store double [[TMP37]], ptr [[TMP69]], align 4 ; I64-NEXT: store double [[TMP38]], ptr [[TMP70]], align 4 ; I64-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; I64-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) @@ -137,21 +137,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 2 ; I64-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 3 ; I64-NEXT: [[TMP77:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> +; I64-NEXT: [[TMP85:%.*]] = getelementptr nusw { ptr, 
ptr, ptr }, ptr null, i32 [[INDEX4]] +; I64-NEXT: [[TMP86:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] +; I64-NEXT: [[TMP87:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I64-NEXT: [[TMP88:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I64-NEXT: [[TMP82:%.*]] = load ptr, ptr [[TMP85]], align 4 +; I64-NEXT: [[TMP83:%.*]] = load ptr, ptr [[TMP86]], align 4 +; I64-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP87]], align 4 +; I64-NEXT: [[TMP89:%.*]] = load ptr, ptr [[TMP88]], align 4 ; I64-NEXT: [[TMP78:%.*]] = extractelement <4 x double> [[TMP77]], i64 0 +; I64-NEXT: store double [[TMP78]], ptr [[TMP82]], align 4 ; I64-NEXT: [[TMP79:%.*]] = extractelement <4 x double> [[TMP77]], i64 1 +; I64-NEXT: store double [[TMP79]], ptr [[TMP83]], align 4 ; I64-NEXT: [[TMP80:%.*]] = extractelement <4 x double> [[TMP77]], i64 2 +; I64-NEXT: store double [[TMP80]], ptr [[TMP84]], align 4 ; I64-NEXT: [[TMP81:%.*]] = extractelement <4 x double> [[TMP77]], i64 3 -; I64-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX4]] -; I64-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] -; I64-NEXT: [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] -; I64-NEXT: [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] -; I64-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 -; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4 -; I64-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP84]], align 4 -; I64-NEXT: [[TMP89:%.*]] = load ptr, ptr [[TMP85]], align 4 -; I64-NEXT: store double [[TMP78]], ptr [[TMP86]], align 4 -; I64-NEXT: store double [[TMP79]], ptr [[TMP87]], align 4 -; I64-NEXT: store double [[TMP80]], ptr [[TMP88]], align 4 ; I64-NEXT: store double [[TMP81]], ptr [[TMP89]], align 4 ; I64-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 ; I64-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], 
splat (i32 4) @@ -198,72 +198,72 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 14 ; I32-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 15 ; I32-NEXT: [[TMP18:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I32-NEXT: [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I32-NEXT: [[TMP28:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I32-NEXT: [[TMP33:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I32-NEXT: [[TMP53:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX]] +; I32-NEXT: [[TMP54:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]] +; I32-NEXT: [[TMP55:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] +; I32-NEXT: [[TMP56:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] +; I32-NEXT: [[TMP57:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] +; I32-NEXT: [[TMP58:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]] +; I32-NEXT: [[TMP59:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] +; I32-NEXT: [[TMP60:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] +; I32-NEXT: [[TMP61:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I32-NEXT: [[TMP62:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] +; I32-NEXT: [[TMP63:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] +; I32-NEXT: [[TMP64:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] +; I32-NEXT: [[TMP65:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] +; I32-NEXT: [[TMP66:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]] +; I32-NEXT: [[TMP67:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]] +; I32-NEXT: [[TMP68:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]] +; I32-NEXT: [[TMP38:%.*]] = load 
ptr, ptr [[TMP53]], align 4 +; I32-NEXT: [[TMP39:%.*]] = load ptr, ptr [[TMP54]], align 4 +; I32-NEXT: [[TMP40:%.*]] = load ptr, ptr [[TMP55]], align 4 +; I32-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP56]], align 4 +; I32-NEXT: [[TMP42:%.*]] = load ptr, ptr [[TMP57]], align 4 +; I32-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP58]], align 4 +; I32-NEXT: [[TMP44:%.*]] = load ptr, ptr [[TMP59]], align 4 +; I32-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP60]], align 4 +; I32-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP61]], align 4 +; I32-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP62]], align 4 +; I32-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP63]], align 4 +; I32-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP64]], align 4 +; I32-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP65]], align 4 +; I32-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP66]], align 4 +; I32-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP67]], align 4 +; I32-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 4 ; I32-NEXT: [[TMP19:%.*]] = extractelement <4 x double> [[TMP18]], i64 0 +; I32-NEXT: store double [[TMP19]], ptr [[TMP38]], align 4 ; I32-NEXT: [[TMP20:%.*]] = extractelement <4 x double> [[TMP18]], i64 1 +; I32-NEXT: store double [[TMP20]], ptr [[TMP39]], align 4 ; I32-NEXT: [[TMP21:%.*]] = extractelement <4 x double> [[TMP18]], i64 2 +; I32-NEXT: store double [[TMP21]], ptr [[TMP40]], align 4 ; I32-NEXT: [[TMP22:%.*]] = extractelement <4 x double> [[TMP18]], i64 3 -; I32-NEXT: [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I32-NEXT: store double [[TMP22]], ptr [[TMP41]], align 4 ; I32-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[TMP23]], i64 0 +; I32-NEXT: store double [[TMP24]], ptr [[TMP42]], align 4 ; I32-NEXT: [[TMP25:%.*]] = extractelement <4 x double> [[TMP23]], i64 1 +; I32-NEXT: store double [[TMP25]], ptr [[TMP43]], align 4 ; I32-NEXT: [[TMP26:%.*]] = extractelement <4 x double> [[TMP23]], i64 2 +; I32-NEXT: store double [[TMP26]], ptr [[TMP44]], align 4 ; I32-NEXT: [[TMP27:%.*]] = 
extractelement <4 x double> [[TMP23]], i64 3 -; I32-NEXT: [[TMP28:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I32-NEXT: store double [[TMP27]], ptr [[TMP45]], align 4 ; I32-NEXT: [[TMP29:%.*]] = extractelement <4 x double> [[TMP28]], i64 0 +; I32-NEXT: store double [[TMP29]], ptr [[TMP46]], align 4 ; I32-NEXT: [[TMP30:%.*]] = extractelement <4 x double> [[TMP28]], i64 1 +; I32-NEXT: store double [[TMP30]], ptr [[TMP47]], align 4 ; I32-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP28]], i64 2 +; I32-NEXT: store double [[TMP31]], ptr [[TMP48]], align 4 ; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP28]], i64 3 -; I32-NEXT: [[TMP33:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I32-NEXT: store double [[TMP32]], ptr [[TMP49]], align 4 ; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP33]], i64 0 +; I32-NEXT: store double [[TMP34]], ptr [[TMP50]], align 4 ; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP33]], i64 1 +; I32-NEXT: store double [[TMP35]], ptr [[TMP51]], align 4 ; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP33]], i64 2 +; I32-NEXT: store double [[TMP36]], ptr [[TMP52]], align 4 ; I32-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP33]], i64 3 -; I32-NEXT: [[TMP38:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX]] -; I32-NEXT: [[TMP39:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]] -; I32-NEXT: [[TMP40:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] -; I32-NEXT: [[TMP41:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] -; I32-NEXT: [[TMP42:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] -; I32-NEXT: [[TMP43:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]] -; I32-NEXT: [[TMP44:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] -; I32-NEXT: [[TMP45:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] -; I32-NEXT: 
[[TMP46:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] -; I32-NEXT: [[TMP47:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] -; I32-NEXT: [[TMP48:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] -; I32-NEXT: [[TMP49:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] -; I32-NEXT: [[TMP50:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] -; I32-NEXT: [[TMP51:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]] -; I32-NEXT: [[TMP52:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]] -; I32-NEXT: [[TMP53:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]] -; I32-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP38]], align 4 -; I32-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4 -; I32-NEXT: [[TMP56:%.*]] = load ptr, ptr [[TMP40]], align 4 -; I32-NEXT: [[TMP57:%.*]] = load ptr, ptr [[TMP41]], align 4 -; I32-NEXT: [[TMP58:%.*]] = load ptr, ptr [[TMP42]], align 4 -; I32-NEXT: [[TMP59:%.*]] = load ptr, ptr [[TMP43]], align 4 -; I32-NEXT: [[TMP60:%.*]] = load ptr, ptr [[TMP44]], align 4 -; I32-NEXT: [[TMP61:%.*]] = load ptr, ptr [[TMP45]], align 4 -; I32-NEXT: [[TMP62:%.*]] = load ptr, ptr [[TMP46]], align 4 -; I32-NEXT: [[TMP63:%.*]] = load ptr, ptr [[TMP47]], align 4 -; I32-NEXT: [[TMP64:%.*]] = load ptr, ptr [[TMP48]], align 4 -; I32-NEXT: [[TMP65:%.*]] = load ptr, ptr [[TMP49]], align 4 -; I32-NEXT: [[TMP66:%.*]] = load ptr, ptr [[TMP50]], align 4 -; I32-NEXT: [[TMP67:%.*]] = load ptr, ptr [[TMP51]], align 4 -; I32-NEXT: [[TMP68:%.*]] = load ptr, ptr [[TMP52]], align 4 -; I32-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP53]], align 4 -; I32-NEXT: store double [[TMP19]], ptr [[TMP54]], align 4 -; I32-NEXT: store double [[TMP20]], ptr [[TMP55]], align 4 -; I32-NEXT: store double [[TMP21]], ptr [[TMP56]], align 4 -; I32-NEXT: store double [[TMP22]], ptr [[TMP57]], align 4 -; I32-NEXT: store double [[TMP24]], ptr [[TMP58]], 
align 4 -; I32-NEXT: store double [[TMP25]], ptr [[TMP59]], align 4 -; I32-NEXT: store double [[TMP26]], ptr [[TMP60]], align 4 -; I32-NEXT: store double [[TMP27]], ptr [[TMP61]], align 4 -; I32-NEXT: store double [[TMP29]], ptr [[TMP62]], align 4 -; I32-NEXT: store double [[TMP30]], ptr [[TMP63]], align 4 -; I32-NEXT: store double [[TMP31]], ptr [[TMP64]], align 4 -; I32-NEXT: store double [[TMP32]], ptr [[TMP65]], align 4 -; I32-NEXT: store double [[TMP34]], ptr [[TMP66]], align 4 -; I32-NEXT: store double [[TMP35]], ptr [[TMP67]], align 4 -; I32-NEXT: store double [[TMP36]], ptr [[TMP68]], align 4 ; I32-NEXT: store double [[TMP37]], ptr [[TMP69]], align 4 ; I32-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; I32-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) @@ -291,21 +291,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP74:%.*]] = add i32 [[INDEX4]], 2 ; I32-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 3 ; I32-NEXT: [[TMP76:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> +; I32-NEXT: [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX4]] +; I32-NEXT: [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP73]] +; I32-NEXT: [[TMP86:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] +; I32-NEXT: [[TMP87:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I32-NEXT: [[TMP81:%.*]] = load ptr, ptr [[TMP84]], align 4 +; I32-NEXT: [[TMP82:%.*]] = load ptr, ptr [[TMP85]], align 4 +; I32-NEXT: [[TMP83:%.*]] = load ptr, ptr [[TMP86]], align 4 +; I32-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP87]], align 4 ; I32-NEXT: [[TMP77:%.*]] = extractelement <4 x double> [[TMP76]], i64 0 +; I32-NEXT: store double [[TMP77]], ptr [[TMP81]], align 4 ; I32-NEXT: [[TMP78:%.*]] = extractelement <4 x double> [[TMP76]], i64 1 +; I32-NEXT: store double [[TMP78]], ptr [[TMP82]], align 4 ; I32-NEXT: [[TMP79:%.*]] = extractelement <4 x 
double> [[TMP76]], i64 2 +; I32-NEXT: store double [[TMP79]], ptr [[TMP83]], align 4 ; I32-NEXT: [[TMP80:%.*]] = extractelement <4 x double> [[TMP76]], i64 3 -; I32-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX4]] -; I32-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP73]] -; I32-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] -; I32-NEXT: [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] -; I32-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 -; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 -; I32-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4 -; I32-NEXT: [[TMP88:%.*]] = load ptr, ptr [[TMP84]], align 4 -; I32-NEXT: store double [[TMP77]], ptr [[TMP85]], align 4 -; I32-NEXT: store double [[TMP78]], ptr [[TMP86]], align 4 -; I32-NEXT: store double [[TMP79]], ptr [[TMP87]], align 4 ; I32-NEXT: store double [[TMP80]], ptr [[TMP88]], align 4 ; I32-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 ; I32-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index 394e972d79a86..31e87e772d935 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -515,14 +515,6 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i64 1 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i64 2 -; CHECK-NEXT: 
[[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i64 3 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i64 4 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i64 5 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i64 6 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i64 7 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP2]] @@ -531,13 +523,21 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP5]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP6]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP7]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i64 0 ; CHECK-NEXT: store i8 [[TMP28]], ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i64 1 ; CHECK-NEXT: store i8 [[TMP29]], ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i64 2 ; CHECK-NEXT: store i8 [[TMP30]], ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i64 3 ; CHECK-NEXT: store i8 [[TMP31]], ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i64 4 ; CHECK-NEXT: store i8 [[TMP32]], ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i64 5 ; CHECK-NEXT: store i8 [[TMP33]], ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i64 6 ; CHECK-NEXT: store i8 [[TMP34]], ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i64 7 ; CHECK-NEXT: store i8 
[[TMP35]], ptr [[TMP27]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 @@ -577,22 +577,6 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> ; MAX-BW-NEXT: [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; MAX-BW-NEXT: [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8> -; MAX-BW-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i64 0 -; MAX-BW-NEXT: [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i64 1 -; MAX-BW-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i64 2 -; MAX-BW-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i64 3 -; MAX-BW-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i64 4 -; MAX-BW-NEXT: [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i64 5 -; MAX-BW-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i64 6 -; MAX-BW-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i64 7 -; MAX-BW-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i64 8 -; MAX-BW-NEXT: [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i64 9 -; MAX-BW-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i64 10 -; MAX-BW-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i64 11 -; MAX-BW-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i64 12 -; MAX-BW-NEXT: [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i64 13 -; MAX-BW-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i64 14 -; MAX-BW-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i64 15 ; MAX-BW-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[OFFSET_IDX]] ; MAX-BW-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP1]] ; MAX-BW-NEXT: [[TMP71:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP2]] @@ -609,21 +593,37 @@ define 
void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: [[TMP82:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]] ; MAX-BW-NEXT: [[TMP83:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]] ; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]] +; MAX-BW-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i64 0 ; MAX-BW-NEXT: store i8 [[TMP52]], ptr [[TMP69]], align 1 +; MAX-BW-NEXT: [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i64 1 ; MAX-BW-NEXT: store i8 [[TMP53]], ptr [[TMP70]], align 1 +; MAX-BW-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i64 2 ; MAX-BW-NEXT: store i8 [[TMP54]], ptr [[TMP71]], align 1 +; MAX-BW-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i64 3 ; MAX-BW-NEXT: store i8 [[TMP55]], ptr [[TMP72]], align 1 +; MAX-BW-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i64 4 ; MAX-BW-NEXT: store i8 [[TMP56]], ptr [[TMP73]], align 1 +; MAX-BW-NEXT: [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i64 5 ; MAX-BW-NEXT: store i8 [[TMP57]], ptr [[TMP74]], align 1 +; MAX-BW-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i64 6 ; MAX-BW-NEXT: store i8 [[TMP58]], ptr [[TMP75]], align 1 +; MAX-BW-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i64 7 ; MAX-BW-NEXT: store i8 [[TMP59]], ptr [[TMP76]], align 1 +; MAX-BW-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i64 8 ; MAX-BW-NEXT: store i8 [[TMP60]], ptr [[TMP77]], align 1 +; MAX-BW-NEXT: [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i64 9 ; MAX-BW-NEXT: store i8 [[TMP61]], ptr [[TMP78]], align 1 +; MAX-BW-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i64 10 ; MAX-BW-NEXT: store i8 [[TMP62]], ptr [[TMP79]], align 1 +; MAX-BW-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i64 11 ; MAX-BW-NEXT: store i8 [[TMP63]], ptr [[TMP80]], align 1 +; MAX-BW-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i64 12 ; 
MAX-BW-NEXT: store i8 [[TMP64]], ptr [[TMP81]], align 1 +; MAX-BW-NEXT: [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i64 13 ; MAX-BW-NEXT: store i8 [[TMP65]], ptr [[TMP82]], align 1 +; MAX-BW-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i64 14 ; MAX-BW-NEXT: store i8 [[TMP66]], ptr [[TMP83]], align 1 +; MAX-BW-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i64 15 ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll index 0edb89af5bc54..5ab7ad0014170 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll @@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0" ; CHECK: cost of 4 for VF 1 For instruction: %conv = uitofp i64 %tmp to double -; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double -; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double +; CHECK: Cost of 5 for VF 2: EMIT ir<%conv> = uitofp ir<%tmp> to double +; CHECK: Cost of 10 for VF 4: EMIT ir<%conv> = uitofp ir<%tmp> to double define void @uint64_to_double_cost(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll index a80fd0bf6ddd7..54466b8e5c5c1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll @@ -32,11 +32,11 @@ define void @example() { ; FORCED-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; FORCED-NEXT: [[TMP1:%.*]] = 
add i64 [[INDEX]], 1 ; FORCED-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[VEC_IND]] to <2 x x86_fp80> -; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i64 0 -; FORCED-NEXT: [[TMP6:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i64 1 ; FORCED-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 [[INDEX]] ; FORCED-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 [[TMP1]] +; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i64 0 ; FORCED-NEXT: store x86_fp80 [[TMP5]], ptr [[TMP3]], align 16 +; FORCED-NEXT: [[TMP6:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i64 1 ; FORCED-NEXT: store x86_fp80 [[TMP6]], ptr [[TMP4]], align 16 ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; FORCED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) @@ -100,8 +100,8 @@ define void @test_replicating_store_x86_fp80_cost(i32 %n, ptr %dst) #0 { ; FORCED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; FORCED-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[VEC_IND]] to <2 x i64> ; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i64 0 -; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i64 1 ; FORCED-NEXT: [[TMP6:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP5]] +; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i64 1 ; FORCED-NEXT: [[TMP8:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP7]] ; FORCED-NEXT: store x86_fp80 0xK00000000000000000000, ptr [[TMP6]], align 16 ; FORCED-NEXT: store x86_fp80 0xK00000000000000000000, ptr [[TMP8]], align 16 diff --git a/llvm/test/Transforms/LoopVectorize/as_cast.ll b/llvm/test/Transforms/LoopVectorize/as_cast.ll index 31ed496de0ccf..7d93501da4411 100644 --- a/llvm/test/Transforms/LoopVectorize/as_cast.ll +++ b/llvm/test/Transforms/LoopVectorize/as_cast.ll @@ -7,6 +7,7 @@ define void @loop_invariant_as_cast(ptr addrspace(1) %in) { 
; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[IN]] to ptr ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] @@ -16,7 +17,6 @@ define void @loop_invariant_as_cast(ptr addrspace(1) %in) { ; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[IN]] to ptr ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 [[TMP3]] ; CHECK-NEXT: store i64 [[TMP3]], ptr [[TMP5]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] @@ -24,8 +24,7 @@ define void @loop_invariant_as_cast(ptr addrspace(1) %in) { ; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_IF1]]: ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[IN]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 [[TMP6]] ; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: @@ -65,19 +64,19 @@ define void @loop_varying_as_cast(ptr addrspace(1) %in) { ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[TMP0]], 6 -; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr 
inbounds i64, ptr addrspace(1) [[IN]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[IN]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[TMP4]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(1) [[TMP7]] to ptr +; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: store i64 [[TMP3]], ptr [[TMP5]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: ; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_IF1]]: -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[IN]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(1) [[TMP7]] to ptr ; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: diff --git a/llvm/test/Transforms/LoopVectorize/cast-induction.ll b/llvm/test/Transforms/LoopVectorize/cast-induction.ll index 979b4ff3c0e7a..46792854f2826 100644 --- a/llvm/test/Transforms/LoopVectorize/cast-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/cast-induction.ll @@ -276,16 +276,16 @@ define void @cast_induction_tail_folding(ptr %A) { ; IC2-NEXT: [[INDEX1:%.*]] = add i32 [[INDEX]], 1 ; IC2-NEXT: [[TMP2:%.*]] = icmp ule i32 [[INDEX]], 2 ; IC2-NEXT: [[TMP3:%.*]] = icmp ule i32 [[INDEX1]], 2 +; IC2-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64 +; IC2-NEXT: [[TMP6:%.*]] = sext i32 [[INDEX1]] to i64 ; IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; IC2: [[PRED_STORE_IF]]: -; IC2-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64 ; IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] ; IC2-NEXT: store i32 [[INDEX]], ptr [[TMP5]], align 4 ; IC2-NEXT: br label 
%[[PRED_STORE_CONTINUE]] ; IC2: [[PRED_STORE_CONTINUE]]: ; IC2-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] ; IC2: [[PRED_STORE_IF1]]: -; IC2-NEXT: [[TMP6:%.*]] = sext i32 [[INDEX1]] to i64 ; IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] ; IC2-NEXT: store i32 [[INDEX1]], ptr [[TMP7]], align 4 ; IC2-NEXT: br label %[[PRED_STORE_CONTINUE2]] diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 3d55cbdda44bb..05fbbd735a7ad 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -1782,15 +1782,15 @@ define void @fp_iv_used_in_gep_fadd(float %init, ptr noalias nocapture %A, float ; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP11]] ; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = fptoui <4 x float> [[VEC_IND]] to <4 x i32> ; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i64 0 -; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1 -; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2 -; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3 ; VEC4_INTERL1-NEXT: [[TMP18:%.*]] = sext i32 [[TMP14]] to i64 ; VEC4_INTERL1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP18]] +; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1 ; VEC4_INTERL1-NEXT: [[TMP20:%.*]] = sext i32 [[TMP15]] to i64 ; VEC4_INTERL1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP20]] +; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2 ; VEC4_INTERL1-NEXT: [[TMP22:%.*]] = sext i32 [[TMP16]] to i64 ; VEC4_INTERL1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP22]] +; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3 ; 
VEC4_INTERL1-NEXT: [[TMP24:%.*]] = sext i32 [[TMP17]] to i64 ; VEC4_INTERL1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP24]] ; VEC4_INTERL1-NEXT: store float [[OFFSET_IDX]], ptr [[TMP19]], align 4 @@ -1865,29 +1865,29 @@ define void @fp_iv_used_in_gep_fadd(float %init, ptr noalias nocapture %A, float ; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fmul fast float [[FPINC]], 7.000000e+00 ; VEC4_INTERL2-NEXT: [[TMP20:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP19]] ; VEC4_INTERL2-NEXT: [[TMP21:%.*]] = fptoui <4 x float> [[VEC_IND]] to <4 x i32> -; VEC4_INTERL2-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i64 0 -; VEC4_INTERL2-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i64 1 -; VEC4_INTERL2-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP21]], i64 2 -; VEC4_INTERL2-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i64 3 ; VEC4_INTERL2-NEXT: [[TMP26:%.*]] = fptoui <4 x float> [[STEP_ADD]] to <4 x i32> -; VEC4_INTERL2-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP26]], i64 0 -; VEC4_INTERL2-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP26]], i64 1 -; VEC4_INTERL2-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP26]], i64 2 -; VEC4_INTERL2-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP26]], i64 3 +; VEC4_INTERL2-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i64 0 ; VEC4_INTERL2-NEXT: [[TMP31:%.*]] = sext i32 [[TMP22]] to i64 ; VEC4_INTERL2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP31]] +; VEC4_INTERL2-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i64 1 ; VEC4_INTERL2-NEXT: [[TMP33:%.*]] = sext i32 [[TMP23]] to i64 ; VEC4_INTERL2-NEXT: [[TMP34:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP33]] +; VEC4_INTERL2-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP21]], i64 2 ; VEC4_INTERL2-NEXT: [[TMP35:%.*]] = sext i32 [[TMP24]] to i64 ; VEC4_INTERL2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 
[[TMP35]] +; VEC4_INTERL2-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i64 3 ; VEC4_INTERL2-NEXT: [[TMP37:%.*]] = sext i32 [[TMP25]] to i64 ; VEC4_INTERL2-NEXT: [[TMP38:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP37]] +; VEC4_INTERL2-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP26]], i64 0 ; VEC4_INTERL2-NEXT: [[TMP39:%.*]] = sext i32 [[TMP27]] to i64 ; VEC4_INTERL2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP39]] +; VEC4_INTERL2-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP26]], i64 1 ; VEC4_INTERL2-NEXT: [[TMP41:%.*]] = sext i32 [[TMP28]] to i64 ; VEC4_INTERL2-NEXT: [[TMP42:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP41]] +; VEC4_INTERL2-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP26]], i64 2 ; VEC4_INTERL2-NEXT: [[TMP43:%.*]] = sext i32 [[TMP29]] to i64 ; VEC4_INTERL2-NEXT: [[TMP44:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP43]] +; VEC4_INTERL2-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP26]], i64 3 ; VEC4_INTERL2-NEXT: [[TMP45:%.*]] = sext i32 [[TMP30]] to i64 ; VEC4_INTERL2-NEXT: [[TMP46:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP45]] ; VEC4_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP32]], align 4 @@ -2009,9 +2009,9 @@ define void @fp_iv_used_in_gep_fadd(float %init, ptr noalias nocapture %A, float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[FPINC]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = fptoui <2 x float> [[VEC_IND]] to <2 x i32> ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP9]], i64 0 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP12:%.*]] = sext i32 [[TMP10]] to i64 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP12]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], 
i64 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP14:%.*]] = sext i32 [[TMP11]] to i64 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP14]] ; VEC2_INTERL1_PRED_STORE-NEXT: store float [[OFFSET_IDX]], ptr [[TMP13]], align 4 @@ -2094,15 +2094,15 @@ define void @fp_iv_used_in_gep_fsub(float %init, ptr noalias nocapture %A, float ; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP11]] ; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = fptoui <4 x float> [[VEC_IND]] to <4 x i32> ; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i64 0 -; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1 -; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2 -; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3 ; VEC4_INTERL1-NEXT: [[TMP18:%.*]] = sext i32 [[TMP14]] to i64 ; VEC4_INTERL1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP18]] +; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1 ; VEC4_INTERL1-NEXT: [[TMP20:%.*]] = sext i32 [[TMP15]] to i64 ; VEC4_INTERL1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP20]] +; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2 ; VEC4_INTERL1-NEXT: [[TMP22:%.*]] = sext i32 [[TMP16]] to i64 ; VEC4_INTERL1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP22]] +; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3 ; VEC4_INTERL1-NEXT: [[TMP24:%.*]] = sext i32 [[TMP17]] to i64 ; VEC4_INTERL1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP24]] ; VEC4_INTERL1-NEXT: store float [[OFFSET_IDX]], ptr [[TMP19]], align 4 @@ -2176,29 +2176,29 @@ define void @fp_iv_used_in_gep_fsub(float %init, ptr noalias nocapture %A, float ; VEC4_INTERL2-NEXT: [[TMP18:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP17]] ; 
VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fsub fast float [[OFFSET_IDX]], [[FPINC]] ; VEC4_INTERL2-NEXT: [[TMP20:%.*]] = fptoui <4 x float> [[VEC_IND]] to <4 x i32> -; VEC4_INTERL2-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0 -; VEC4_INTERL2-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i64 1 -; VEC4_INTERL2-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i64 2 -; VEC4_INTERL2-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i64 3 ; VEC4_INTERL2-NEXT: [[TMP25:%.*]] = fptoui <4 x float> [[STEP_ADD]] to <4 x i32> -; VEC4_INTERL2-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP25]], i64 0 -; VEC4_INTERL2-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP25]], i64 1 -; VEC4_INTERL2-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP25]], i64 2 -; VEC4_INTERL2-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP25]], i64 3 +; VEC4_INTERL2-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0 ; VEC4_INTERL2-NEXT: [[TMP30:%.*]] = sext i32 [[TMP21]] to i64 ; VEC4_INTERL2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP30]] +; VEC4_INTERL2-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i64 1 ; VEC4_INTERL2-NEXT: [[TMP32:%.*]] = sext i32 [[TMP22]] to i64 ; VEC4_INTERL2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP32]] +; VEC4_INTERL2-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i64 2 ; VEC4_INTERL2-NEXT: [[TMP34:%.*]] = sext i32 [[TMP23]] to i64 ; VEC4_INTERL2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP34]] +; VEC4_INTERL2-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i64 3 ; VEC4_INTERL2-NEXT: [[TMP36:%.*]] = sext i32 [[TMP24]] to i64 ; VEC4_INTERL2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP36]] +; VEC4_INTERL2-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP25]], i64 0 ; VEC4_INTERL2-NEXT: [[TMP38:%.*]] = sext i32 [[TMP26]] to i64 ; VEC4_INTERL2-NEXT: 
[[TMP39:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP38]] +; VEC4_INTERL2-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP25]], i64 1 ; VEC4_INTERL2-NEXT: [[TMP40:%.*]] = sext i32 [[TMP27]] to i64 ; VEC4_INTERL2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP40]] +; VEC4_INTERL2-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP25]], i64 2 ; VEC4_INTERL2-NEXT: [[TMP42:%.*]] = sext i32 [[TMP28]] to i64 ; VEC4_INTERL2-NEXT: [[TMP43:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP42]] +; VEC4_INTERL2-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP25]], i64 3 ; VEC4_INTERL2-NEXT: [[TMP44:%.*]] = sext i32 [[TMP29]] to i64 ; VEC4_INTERL2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP44]] ; VEC4_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP31]], align 4 @@ -2320,9 +2320,9 @@ define void @fp_iv_used_in_gep_fsub(float %init, ptr noalias nocapture %A, float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[FPINC]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = fptoui <2 x float> [[VEC_IND]] to <2 x i32> ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP9]], i64 0 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP12:%.*]] = sext i32 [[TMP10]] to i64 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP12]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP14:%.*]] = sext i32 [[TMP11]] to i64 ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP14]] ; VEC2_INTERL1_PRED_STORE-NEXT: store float [[OFFSET_IDX]], ptr [[TMP13]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/histograms.ll b/llvm/test/Transforms/LoopVectorize/histograms.ll index 
5850ac3195c39..f4988bb01dfa4 100644 --- a/llvm/test/Transforms/LoopVectorize/histograms.ll +++ b/llvm/test/Transforms/LoopVectorize/histograms.ll @@ -16,8 +16,8 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[TMP5]], i64 1 diff --git a/llvm/test/Transforms/LoopVectorize/induction-wrapflags.ll b/llvm/test/Transforms/LoopVectorize/induction-wrapflags.ll index 65571e8b35a3f..eacdd4fb5ea22 100644 --- a/llvm/test/Transforms/LoopVectorize/induction-wrapflags.ll +++ b/llvm/test/Transforms/LoopVectorize/induction-wrapflags.ll @@ -14,12 +14,12 @@ define void @induction_with_multiple_instructions_in_chain(ptr %p, ptr noalias % ; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[VEC_IND]] to <4 x i64> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = 
getelementptr i8, ptr [[P]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i64 2 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i64 3 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]] ; CHECK-NEXT: store i8 0, ptr [[TMP5]], align 1 ; CHECK-NEXT: store i8 0, ptr [[TMP6]], align 1 @@ -27,12 +27,12 @@ define void @induction_with_multiple_instructions_in_chain(ptr %p, ptr noalias % ; CHECK-NEXT: store i8 0, ptr [[TMP8]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i32> [[VEC_IND1]] to <4 x i64> ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]] ; CHECK-NEXT: store i8 0, ptr [[TMP14]], align 1 ; CHECK-NEXT: store i8 0, ptr [[TMP15]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 3b44b99b1ddeb..d73bb389b03d3 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -2423,11 +2423,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], 
[[VEC_IND]] ; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i64 0 ; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i64 1 ; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) @@ -2470,13 +2470,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; IND-NEXT: [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16> -; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 -; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 ; IND-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [4 x i8], ptr [[P:%.*]], i64 [[INDEX]] ; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2 ; IND-NEXT: [[TMP16:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]] ; IND-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP16]], i64 6 +; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 ; IND-NEXT: store i16 [[TMP8]], ptr [[TMP6]], align 2 +; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 ; IND-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) @@ -2522,11 +2522,7 @@ 
define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NEXT: [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; UNROLL-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16> -; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0 -; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1 ; UNROLL-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> -; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 -; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [4 x i8], ptr [[P:%.*]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2 ; UNROLL-NEXT: [[TMP24:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]] @@ -2535,9 +2531,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP25]], i64 10 ; UNROLL-NEXT: [[TMP26:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP26]], i64 14 +; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0 ; UNROLL-NEXT: store i16 [[TMP14]], ptr [[TMP10]], align 2 +; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1 ; UNROLL-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2 +; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 ; UNROLL-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 +; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4) @@ -2587,18 +2587,18 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NO-IC-NEXT: 
[[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16> -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i64 0 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i64 1 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 ; UNROLL-NO-IC-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2 +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NO-IC-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i64 0 ; UNROLL-NO-IC-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i64 1 ; UNROLL-NO-IC-NEXT: store i16 [[TMP18]], ptr [[TMP14]], align 2 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2) @@ -2643,15 +2643,7 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; INTERLEAVE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; INTERLEAVE-NEXT: [[TMP11:%.*]] = add <4 x i32> 
[[BROADCAST_SPLAT]], [[STEP_ADD]] ; INTERLEAVE-NEXT: [[TMP12:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16> -; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1 -; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2 -; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3 ; INTERLEAVE-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i16> -; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0 -; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2 -; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [4 x i8], ptr [[P:%.*]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]] @@ -2668,13 +2660,21 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP40]], i64 26 ; INTERLEAVE-NEXT: [[TMP41:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP41]], i64 30 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0 ; INTERLEAVE-NEXT: store i16 [[TMP22]], ptr [[TMP14]], align 2 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1 ; INTERLEAVE-NEXT: store i16 [[TMP23]], ptr [[TMP15]], align 2 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2 ; INTERLEAVE-NEXT: store i16 [[TMP24]], ptr [[TMP16]], align 2 +; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3 ; INTERLEAVE-NEXT: store i16 [[TMP25]], ptr [[TMP17]], align 2 +; INTERLEAVE-NEXT: 
[[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0 ; INTERLEAVE-NEXT: store i16 [[TMP26]], ptr [[TMP18]], align 2 +; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1 ; INTERLEAVE-NEXT: store i16 [[TMP27]], ptr [[TMP19]], align 2 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2 ; INTERLEAVE-NEXT: store i16 [[TMP28]], ptr [[TMP20]], align 2 +; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: store i16 [[TMP29]], ptr [[TMP21]], align 2 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8) diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll index 72294b64ffbee..6a9bdaa6a5380 100644 --- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll +++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll @@ -321,10 +321,10 @@ define void @narrow_scatter_with_uniform_addr_to_scalar_unroll(ptr noalias %src, ; VF4IC1-NEXT: [[TMP11:%.*]] = extractelement <4 x i8> [[TMP10]], i64 3 ; VF4IC1-NEXT: store i8 [[TMP11]], ptr [[DST2]], align 4 ; VF4IC1-NEXT: [[TMP12:%.*]] = extractelement <4 x i8> [[TMP10]], i64 0 -; VF4IC1-NEXT: [[TMP13:%.*]] = extractelement <4 x i8> [[TMP10]], i64 1 -; VF4IC1-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP10]], i64 2 ; VF4IC1-NEXT: store i8 [[TMP12]], ptr [[TMP5]], align 4 +; VF4IC1-NEXT: [[TMP13:%.*]] = extractelement <4 x i8> [[TMP10]], i64 1 ; VF4IC1-NEXT: store i8 [[TMP13]], ptr [[TMP6]], align 4 +; VF4IC1-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP10]], i64 2 ; VF4IC1-NEXT: store i8 [[TMP14]], ptr [[TMP7]], align 4 ; VF4IC1-NEXT: store i8 [[TMP11]], ptr [[TMP8]], align 4 ; VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 
2afcff0d21f20..5f0b2d4283a2c 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -263,8 +263,8 @@ define void @pr43371(i16 %val) optsize { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 ; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 @@ -291,8 +291,8 @@ define void @pr43371(i16 %val) optsize { ; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0 -; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 ; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 @@ -319,8 +319,8 @@ define void @pr43371(i16 %val) optsize { ; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0 -; NPGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] +; NPGSO-NEXT: [[TMP3:%.*]] = 
extractelement <2 x i32> [[TMP1]], i64 1 ; NPGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 ; NPGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 @@ -370,8 +370,8 @@ define void @pr43371_pgso(i16 %val) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 ; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 @@ -398,8 +398,8 @@ define void @pr43371_pgso(i16 %val) !prof !14 { ; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0 -; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 ; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 ; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll b/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll index 0886d040c2688..a30fc1f8702f7 100644 --- a/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll +++ b/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll @@ -3,7 +3,7 @@ %struct.foo = type { i32, i64 } -; CHECK: Cost of 0 for 
VF 2: WIDEN-CAST ir<%0> = bitcast ir<%b> to ptr +; CHECK: Cost of 0 for VF 2: EMIT ir<%0> = bitcast ir<%b> to ptr ; The bitcast below will be scalarized due to the predication in the loop. Bitcasts ; between pointer types should be treated as free, despite the scalarization. diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll index b5ca609e48398..81d84a59cdaa5 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll @@ -29,12 +29,12 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64> ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP14]], align 8 ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP15]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll index d5de7948b18cd..46bb46bab5910 100644 --- 
a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll @@ -918,10 +918,10 @@ define void @test_step_is_not_invariant(ptr %A) { ; CHECK-NEXT: [[TMP4:%.*]] = udiv <2 x i16> [[TMP3]], splat (i16 6) ; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: store i16 [[TMP0]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: store i16 [[TMP0]], ptr [[TMP7]], align 2 ; CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP9]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2) diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll index 65a055399daaa..c7b169f6bb39a 100644 --- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll @@ -13,12 +13,12 @@ define void @pr63340(ptr %A, ptr %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[OFFSET_IDX]] -; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[TMP1]] +; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: @@ -133,15 +133,15 @@ define void @pr173761(i8 %c, ptr %p, ptr noalias %q, ptr noalias %r) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[P]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[P]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i8> [[BROADCAST_SPLAT]] to <4 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x ptr> 
[[DOTSPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x ptr> [[DOTSPLAT]], <4 x ptr> [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index a1ddda7eda969..9b9c9acf7df04 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1132,10 +1132,12 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) { IntegerType *Int64 = IntegerType::get(C, 64); auto *Cast = CastInst::CreateZExtOrBitCast(PoisonValue::get(Int32), Int64); VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, Cast, - VPIRFlags::getDefaultFlags(Instruction::ZExt)); + VPInstructionWithType *Recipe = VPInstructionWithType::createWide( + Instruction::ZExt, Op1, Int64, Cast, + VPIRFlags::getDefaultFlags(Instruction::ZExt)); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(Recipe); + delete Recipe; delete Cast; }