diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 814a414a52ac8..91a0bda2973a7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -415,9 +415,9 @@ class VPBuilder {
     return createScalarCast(CastOp, Op, ResultTy, DL);
   }
 
-  VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
-                                     Type *ResultTy) {
-    return tryInsertInstruction(new VPWidenCastRecipe(
+  VPInstructionWithType *createWidenCast(Instruction::CastOps Opcode,
+                                         VPValue *Op, Type *ResultTy) {
+    return tryInsertInstruction(VPInstructionWithType::createWide(
         Opcode, Op, ResultTy, nullptr, VPIRFlags::getDefaultFlags(Opcode)));
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 78163b5fe35d5..0b6aaed866c43 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3245,8 +3245,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
             .Case([](const VPWidenLoadRecipe *R) { return Instruction::Load; })
             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
                 [](const auto *R) { return Instruction::Call; })
-            .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
-                  VPWidenCastRecipe>(
+            .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe>(
                 [](const auto *R) { return R->getOpcode(); })
             .Case([](const VPInterleaveRecipe *R) {
               return R->getStoredValues().empty() ? Instruction::Load
@@ -3317,7 +3316,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPRecipeBase::VPDerivedIVSC:
       case VPRecipeBase::VPScalarIVStepsSC:
       case VPRecipeBase::VPReplicateSC:
-      case VPRecipeBase::VPInstructionSC:
       case VPRecipeBase::VPCurrentIterationPHISC:
       case VPRecipeBase::VPVectorPointerSC:
       case VPRecipeBase::VPVectorEndPointerSC:
@@ -3325,11 +3323,21 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPRecipeBase::VPPredInstPHISC:
       case VPRecipeBase::VPBranchOnMaskSC:
         continue;
+      case VPRecipeBase::VPInstructionSC: {
+        // A VPInstructionWithType wide cast (a cast that is not single-scalar)
+        // still produces a vector result, so it must not be skipped here. All
+        // other VPInstructions are skipped, i.e. treated as not producing
+        // vectors, to preserve existing behavior.
+        if (auto *VPIT = dyn_cast<VPInstructionWithType>(&R)) {
+          if (Instruction::isCast(VPIT->getOpcode()) && !VPIT->isSingleScalar())
+            break;
+        }
+        continue;
+      }
       case VPRecipeBase::VPReductionSC:
       case VPRecipeBase::VPActiveLaneMaskPHISC:
       case VPRecipeBase::VPWidenCallSC:
       case VPRecipeBase::VPWidenCanonicalIVSC:
-      case VPRecipeBase::VPWidenCastSC:
       case VPRecipeBase::VPWidenGEPSC:
       case VPRecipeBase::VPWidenIntrinsicSC:
       case VPRecipeBase::VPWidenSC:
@@ -6708,7 +6716,7 @@ bool VPRecipeBuilder::replaceWithFinalIfReductionStore(
   return false;
 }
 
-VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
+VPSingleDefRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
                                                       VFRange &Range) {
   auto *I = VPI->getUnderlyingInstr();
   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -6766,6 +6774,15 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
          "Should not predicate a uniform recipe");
+  if (IsUniform && Instruction::isCast(VPI->getOpcode())) {
+    assert(!IsPredicated && "IsUniform implies unpredicated");
+    auto *CastR = cast<VPInstructionWithType>(VPI);
+    auto *Recipe = new VPInstructionWithType(
+        VPI->getOpcode(), VPI->operandsWithoutMask(), CastR->getResultType(),
+        *VPI, *VPI, VPI->getDebugLoc(), I->getName());
+    Recipe->setUnderlyingValue(I);
+    return Recipe;
+  }
   auto *Recipe =
       new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
                             BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
@@ -6809,9 +6826,9 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
   if (Instruction::isCast(VPI->getOpcode())) {
     auto *CI = cast<CastInst>(Instr);
     auto *CastR = cast<VPInstructionWithType>(VPI);
-    return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
-                                 CastR->getResultType(), CI, *VPI, *VPI,
-                                 VPI->getDebugLoc());
+    return VPInstructionWithType::createWide(
+        CI->getOpcode(), VPI->getOperand(0), CastR->getResultType(), CI, *VPI,
+        *VPI, VPI->getDebugLoc());
   }
 
   return tryToWiden(VPI);
@@ -7297,10 +7314,10 @@ void LoopVectorizationPlanner::addReductionResultComputation(
       assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
              "Unexpected truncated min-max recurrence!");
       Type *RdxTy = RdxDesc.getRecurrenceType();
-      VPWidenCastRecipe *Trunc;
+      VPInstructionWithType *Trunc;
       Instruction::CastOps ExtendOpc =
           RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
-      VPWidenCastRecipe *Extnd;
+      VPInstructionWithType *Extnd;
       {
         VPBuilder::InsertPointGuard Guard(Builder);
         Builder.setInsertPoint(
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index a84c77d614673..37114a1414a39 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -88,10 +88,10 @@ class VPRecipeBuilder {
   bool replaceWithFinalIfReductionStore(VPInstruction *VPI,
                                         VPBuilder &FinalRedStoresBuilder);
 
-  /// Build a VPReplicationRecipe for \p VPI. If it is predicated, add the mask
-  /// as last operand. Range.End may be decreased to ensure same recipe behavior
-  /// from \p Range.Start to \p Range.End.
-  VPReplicateRecipe *handleReplication(VPInstruction *VPI, VFRange &Range);
+  /// Build a VPReplicateRecipe or VPInstructionWithType for \p VPI. If it is
+  /// predicated, add the mask as last operand. Range.End may be decreased to
+  /// ensure same recipe behavior from \p Range.Start to \p Range.End.
+  VPSingleDefRecipe *handleReplication(VPInstruction *VPI, VFRange &Range);
 };
 } // end namespace llvm
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4a5420185224b..cdd77928efdaf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -437,7 +437,6 @@ class LLVM_ABI_FOR_TEST VPRecipeBase
     VPVectorEndPointerSC,
     VPWidenCallSC,
     VPWidenCanonicalIVSC,
-    VPWidenCastSC,
     VPWidenGEPSC,
     VPWidenIntrinsicSC,
     VPWidenLoadEVLSC,
@@ -626,7 +625,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPRecipeValue {
     case VPRecipeBase::VPVectorEndPointerSC:
     case VPRecipeBase::VPWidenCallSC:
     case VPRecipeBase::VPWidenCanonicalIVSC:
-    case VPRecipeBase::VPWidenCastSC:
     case VPRecipeBase::VPWidenGEPSC:
     case VPRecipeBase::VPWidenIntrinsicSC:
     case VPRecipeBase::VPWidenSC:
@@ -1118,7 +1116,6 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
            R->getVPRecipeID() == VPRecipeBase::VPWidenSC ||
            R->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC ||
            R->getVPRecipeID() == VPRecipeBase::VPWidenCallSC ||
-           R->getVPRecipeID() == VPRecipeBase::VPWidenCastSC ||
            R->getVPRecipeID() == VPRecipeBase::VPWidenIntrinsicSC ||
            R->getVPRecipeID() == VPRecipeBase::VPReductionSC ||
            R->getVPRecipeID() == VPRecipeBase::VPReductionEVLSC ||
@@ -1512,6 +1509,17 @@ class VPInstructionWithType : public VPInstruction {
   /// Scalar result type produced by the recipe.
   Type *ResultTy;
 
+  /// Whether the recipe produces a single scalar result (as opposed to a
+  /// vector/wide result with one lane per VF).
+  bool IsSingleScalar;
+
+  /// Returns the initial IsSingleScalar value for \p Opcode: true for cast
+  /// opcodes, Instruction::Load and VPInstruction::VScale; false otherwise.
+  static bool defaultIsSingleScalar(unsigned Opcode) {
+    return Instruction::isCast(Opcode) || Opcode == Instruction::Load ||
+           Opcode == VPInstruction::VScale;
+  }
+
 public:
   VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
                         Type *ResultTy, const VPIRFlags &Flags = {},
@@ -1519,7 +1527,22 @@ class VPInstructionWithType : public VPInstruction {
                         DebugLoc DL = DebugLoc::getUnknown(),
                         const Twine &Name = "")
       : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name),
-        ResultTy(ResultTy) {}
+        ResultTy(ResultTy), IsSingleScalar(defaultIsSingleScalar(Opcode)) {}
+
+  /// Create a wide (vector-producing) cast of \p Op to \p ResultTy, with \p CI
+  /// as its underlying value (may be null). \p Opcode must be a cast opcode.
+  static VPInstructionWithType *
+  createWide(unsigned Opcode, VPValue *Op, Type *ResultTy,
+             CastInst *CI = nullptr, const VPIRFlags &Flags = {},
+             const VPIRMetadata &Metadata = {},
+             DebugLoc DL = DebugLoc::getUnknown()) {
+    assert(Instruction::isCast(Opcode) && "Expected a cast opcode");
+    auto *VPI =
+        new VPInstructionWithType(Opcode, {Op}, ResultTy, Flags, Metadata, DL);
+    VPI->IsSingleScalar = false; // Casts default to single-scalar.
+    VPI->setUnderlyingValue(CI);
+    return VPI;
+  }
 
   static inline bool classof(const VPRecipeBase *R) {
     // VPInstructionWithType are VPInstructions with specific opcodes requiring
@@ -1544,10 +1567,20 @@ class VPInstructionWithType : public VPInstruction {
     return isa<VPInstructionWithType>(cast<VPRecipeBase>(R));
   }
 
+  static inline bool classof(const VPValue *V) {
+    auto *R = V->getDefiningRecipe();
+    return R && classof(R);
+  }
+
+  static inline bool classof(const VPSingleDefRecipe *R) {
+    return classof(static_cast<const VPRecipeBase *>(R));
+  }
+
   VPInstruction *clone() override {
     auto *New =
         new VPInstructionWithType(getOpcode(), operands(), getResultType(),
                                   *this, *this, getDebugLoc(), getName());
+    New->IsSingleScalar = IsSingleScalar;
     New->setUnderlyingValue(getUnderlyingValue());
     return New;
   }
@@ -1556,13 +1589,20 @@ class VPInstructionWithType : public VPInstruction {
 
   /// Return the cost of this VPInstruction.
   InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override {
-    // TODO: Compute accurate cost after retiring the legacy cost model.
-    return 0;
-  }
+                              VPCostContext &Ctx) const override;
 
   Type *getResultType() const { return ResultTy; }
 
+  /// Returns the cast opcode of this recipe; the opcode must be a cast.
+  Instruction::CastOps getCastOpcode() const {
+    assert(Instruction::isCast(getOpcode()) && "not a cast opcode");
+    return static_cast<Instruction::CastOps>(getOpcode());
+  }
+
+  /// Returns true if this recipe produces a single scalar result (rather than
+  /// a vector with VF lanes).
+  bool isSingleScalar() const { return IsSingleScalar; }
+
 protected:
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -1828,58 +1868,6 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags,
   }
 };
 
-/// VPWidenCastRecipe is a recipe to create vector cast instructions.
-class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
-  /// Cast instruction opcode.
-  Instruction::CastOps Opcode;
-
-  /// Result type for the cast.
-  Type *ResultTy;
-
-public:
-  VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
-                    CastInst *CI = nullptr, const VPIRFlags &Flags = {},
-                    const VPIRMetadata &Metadata = {},
-                    DebugLoc DL = DebugLoc::getUnknown())
-      : VPRecipeWithIRFlags(VPRecipeBase::VPWidenCastSC, Op, Flags, DL),
-        VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) {
-    assert(flagsValidForOpcode(Opcode) &&
-           "Set flags not supported for the provided opcode");
-    assert(hasRequiredFlagsForOpcode(Opcode) &&
-           "Opcode requires specific flags to be set");
-    setUnderlyingValue(CI);
-  }
-
-  ~VPWidenCastRecipe() override = default;
-
-  VPWidenCastRecipe *clone() override {
-    return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy,
-                                 cast_or_null<CastInst>(getUnderlyingValue()),
-                                 *this, *this, getDebugLoc());
-  }
-
-  VP_CLASSOF_IMPL(VPRecipeBase::VPWidenCastSC)
-
-  /// Produce widened copies of the cast.
-  LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override;
-
-  /// Return the cost of this VPWidenCastRecipe.
-  LLVM_ABI_FOR_TEST InstructionCost
-  computeCost(ElementCount VF, VPCostContext &Ctx) const override;
-
-  Instruction::CastOps getOpcode() const { return Opcode; }
-
-  /// Returns the result type of the cast.
-  Type *getResultType() const { return ResultTy; }
-
-protected:
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  /// Print the recipe.
-  LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent,
-                                     VPSlotTracker &SlotTracker) const override;
-#endif
-};
-
 /// A recipe for widening vector intrinsics.
 class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
   /// ID of the vector intrinsic to widen.
@@ -3349,15 +3337,15 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
                      ArrayRef<VPSingleDefRecipe *> ExpressionRecipes);
 
 public:
-  VPExpressionRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
+  VPExpressionRecipe(VPInstructionWithType *Ext, VPReductionRecipe *Red)
       : VPExpressionRecipe(ExpressionTypes::ExtendedReduction, {Ext, Red}) {}
   VPExpressionRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red)
       : VPExpressionRecipe(ExpressionTypes::MulAccReduction, {Mul, Red}) {}
-  VPExpressionRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+  VPExpressionRecipe(VPInstructionWithType *Ext0, VPInstructionWithType *Ext1,
                      VPWidenRecipe *Mul, VPReductionRecipe *Red)
       : VPExpressionRecipe(ExpressionTypes::ExtMulAccReduction,
                            {Ext0, Ext1, Mul, Red}) {}
-  VPExpressionRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+  VPExpressionRecipe(VPInstructionWithType *Ext0, VPInstructionWithType *Ext1,
                      VPWidenRecipe *Mul, VPWidenRecipe *Sub,
                      VPReductionRecipe *Red)
       : VPExpressionRecipe(ExpressionTypes::ExtNegatedMulAccReduction,
@@ -4094,11 +4082,10 @@ struct CastInfo<VPIRMetadata, VPRecipeBase *>
   /// Used by isa.
   static inline bool isPossible(VPRecipeBase *R) {
     // NOTE: Each recipe inheriting from VPIRMetadata must be listed here.
-    return isa<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
-               VPWidenIntrinsicRecipe, VPWidenCallRecipe, VPReplicateRecipe,
-               VPInterleaveRecipe, VPInterleaveEVLRecipe, VPWidenLoadRecipe,
-               VPWidenLoadEVLRecipe, VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(
-        R);
+    return isa<VPInstruction, VPWidenRecipe, VPWidenIntrinsicRecipe,
+               VPWidenCallRecipe, VPReplicateRecipe, VPInterleaveRecipe,
+               VPInterleaveEVLRecipe, VPWidenLoadRecipe, VPWidenLoadEVLRecipe,
+               VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(R);
   }
 
   /// Used by cast.
@@ -4108,8 +4095,6 @@ struct CastInfo<VPIRMetadata, VPRecipeBase *>
       return cast<VPInstruction>(R);
     case VPRecipeBase::VPWidenSC:
       return cast<VPWidenRecipe>(R);
-    case VPRecipeBase::VPWidenCastSC:
-      return cast<VPWidenCastRecipe>(R);
     case VPRecipeBase::VPWidenIntrinsicSC:
       return cast<VPWidenIntrinsicRecipe>(R);
     case VPRecipeBase::VPWidenCallSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 103dff1889a6a..837d060830448 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -308,8 +308,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
                 return inferScalarType(R->getOperand(0));
               })
           // VPInstructionWithType must be handled before VPInstruction.
-          .Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
-                VPWidenCastRecipe>(
+          .Case<VPInstructionWithType, VPWidenIntrinsicRecipe>(
               [](const auto *R) { return R->getResultType(); })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                 VPWidenCallRecipe, VPWidenMemoryRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 3cafeae7c4aea..c5b6e0d863632 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -303,8 +303,7 @@ struct Recipe_match {
 template <unsigned Opcode, typename... OpTys>
 using AllRecipe_match =
     Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ false,
-                 VPWidenRecipe, VPReplicateRecipe, VPWidenCastRecipe,
-                 VPInstruction>;
+                 VPWidenRecipe, VPReplicateRecipe, VPInstruction>;
 
 template <unsigned Opcode, typename... OpTys>
 using AllRecipe_commutative_match =
@@ -545,7 +544,8 @@ m_ZExtOrSExt(const Op0_t &Op0) {
 }
 
 template <typename Op0_t> inline auto m_WidenAnyExtend(const Op0_t &Op0) {
-  return m_Isa<VPWidenCastRecipe>(m_CombineOr(m_ZExtOrSExt(Op0), m_FPExt(Op0)));
+  return m_Isa<VPInstructionWithType>(
+      m_CombineOr(m_ZExtOrSExt(Op0), m_FPExt(Op0)));
 }
 
 template <typename Op0_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2225dfa310c6c..b0b85ecefdb5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -87,7 +87,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPReductionSC:
   case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
   case VPWidenLoadEVLSC:
@@ -140,7 +139,6 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPReductionSC:
   case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
   case VPWidenPHISC:
@@ -190,7 +188,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPScalarIVStepsSC:
   case VPVectorPointerSC:
   case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
   case VPWidenPHISC:
@@ -1306,7 +1303,9 @@ bool VPInstruction::isSingleScalar() const {
   case VPInstruction::VScale:
     return true;
   default:
-    return isScalarCast();
+    if (auto *VPI = dyn_cast<VPInstructionWithType>(this))
+      return VPI->isSingleScalar();
+    return false;
   }
 }
 
@@ -1589,11 +1588,20 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
 
 void VPInstructionWithType::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
-  if (isScalarCast()) {
-    Value *Op = State.get(getOperand(0), VPLane(0));
-    Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
-                                           Op, ResultTy);
-    State.set(this, Cast, VPLane(0));
+  if (Instruction::isCast(getOpcode())) {
+    Type *DestTy =
+        isSingleScalar() ? ResultTy : VectorType::get(ResultTy, State.VF);
+    Value *Op = isSingleScalar() ? State.get(getOperand(0), VPLane(0))
+                                 : State.get(getOperand(0));
+    Value *Cast = State.Builder.CreateCast(getCastOpcode(), Op, DestTy);
+    if (auto *I = dyn_cast<Instruction>(Cast)) {
+      applyFlags(*I);
+      applyMetadata(*I);
+    }
+    if (isSingleScalar())
+      State.set(this, Cast, VPLane(0));
+    else
+      State.set(this, Cast);
     return;
   }
   switch (getOpcode()) {
@@ -1614,6 +1622,15 @@ void VPInstructionWithType::execute(VPTransformState &State) {
   }
 }
 
+InstructionCost VPInstructionWithType::computeCost(ElementCount VF,
+                                                   VPCostContext &Ctx) const {
+  // TODO: Compute cost for VPInstructions without underlying values.
+  if (!getUnderlyingValue())
+    return 0;
+  return getCostForRecipeWithOpcode(
+      getOpcode(), isSingleScalar() ? ElementCount::getFixed(1) : VF, Ctx);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
@@ -1638,7 +1655,8 @@ void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
     break;
   default:
     assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
-    O << Instruction::getOpcodeName(getOpcode()) << " ";
+    O << Instruction::getOpcodeName(getOpcode());
+    printFlags(O);
     printOperands(O, SlotTracker);
     O << " to " << *ResultTy;
   }
@@ -2479,43 +2497,6 @@ void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-void VPWidenCastRecipe::execute(VPTransformState &State) {
-  auto &Builder = State.Builder;
-  /// Vectorize casts.
-  assert(State.VF.isVector() && "Not vectorizing?");
-  Type *DestTy = VectorType::get(getResultType(), State.VF);
-  VPValue *Op = getOperand(0);
-  Value *A = State.get(Op);
-  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
-  State.set(this, Cast);
-  if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
-    applyFlags(*CastOp);
-    applyMetadata(*CastOp);
-  }
-}
-
-InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
-                                               VPCostContext &Ctx) const {
-  // TODO: In some cases, VPWidenCastRecipes are created but not considered in
-  // the legacy cost model, including truncates/extends when evaluating a
-  // reduction in a smaller type.
-  if (!getUnderlyingValue())
-    return 0;
-  return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                    VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN-CAST ";
-  printAsOperand(O, SlotTracker);
-  O << " = " << Instruction::getOpcodeName(Opcode);
-  printFlags(O);
-  printOperands(O, SlotTracker);
-  O << " to " << *getResultType();
-}
-#endif
-
 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
@@ -3032,13 +3013,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
   case ExpressionTypes::ExtendedReduction: {
     unsigned Opcode = RecurrenceDescriptor::getOpcode(
         cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
-    auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+    auto *ExtR = cast<VPInstructionWithType>(ExpressionRecipes[0]);
     auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
 
     if (RedR->isPartialReduction())
       return Ctx.TTI.getPartialReductionCost(
           Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
-          TargetTransformInfo::getPartialReductionExtendKind(ExtR->getOpcode()),
+          TargetTransformInfo::getPartialReductionExtendKind(
+              ExtR->getCastOpcode()),
           TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
           RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                      : std::nullopt);
@@ -3061,22 +3043,22 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
   case ExpressionTypes::ExtMulAccReduction: {
     auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
     if (RedR->isPartialReduction()) {
-      auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
-      auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+      auto *Ext0R = cast<VPInstructionWithType>(ExpressionRecipes[0]);
+      auto *Ext1R = cast<VPInstructionWithType>(ExpressionRecipes[1]);
       auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
       return Ctx.TTI.getPartialReductionCost(
           Opcode, Ctx.Types.inferScalarType(getOperand(0)),
           Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
           TargetTransformInfo::getPartialReductionExtendKind(
-              Ext0R->getOpcode()),
+              Ext0R->getCastOpcode()),
           TargetTransformInfo::getPartialReductionExtendKind(
-              Ext1R->getOpcode()),
+              Ext1R->getCastOpcode()),
           Mul->getOpcode(), Ctx.CostKind,
           RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                      : std::nullopt);
     }
     return Ctx.TTI.getMulAccReductionCost(
-        cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
+        cast<VPInstructionWithType>(ExpressionRecipes.front())->getOpcode() ==
             Instruction::ZExt,
         Opcode, RedTy, SrcVecTy, Ctx.CostKind);
   }
@@ -3123,7 +3105,7 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
     getOperand(0)->printAsOperand(O, SlotTracker);
     Red->printFlags(O);
 
-    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+    auto *Ext0 = cast<VPInstructionWithType>(ExpressionRecipes[0]);
     O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
       << *Ext0->getResultType();
     if (Red->isConditional()) {
@@ -3143,11 +3125,11 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
     Mul->printFlags(O);
     O << "(";
     getOperand(0)->printAsOperand(O, SlotTracker);
-    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+    auto *Ext0 = cast<VPInstructionWithType>(ExpressionRecipes[0]);
     O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
       << *Ext0->getResultType() << "), (";
     getOperand(1)->printAsOperand(O, SlotTracker);
-    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+    auto *Ext1 = cast<VPInstructionWithType>(ExpressionRecipes[1]);
     O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
       << *Ext1->getResultType() << ")";
     if (Red->isConditional()) {
@@ -3173,7 +3155,7 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
       O << "(";
     getOperand(0)->printAsOperand(O, SlotTracker);
     if (IsExtended) {
-      auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+      auto *Ext0 = cast<VPInstructionWithType>(ExpressionRecipes[0]);
       O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
         << *Ext0->getResultType() << "), (";
     } else {
@@ -3181,7 +3163,7 @@ void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
     }
     getOperand(1)->printAsOperand(O, SlotTracker);
     if (IsExtended) {
-      auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+      auto *Ext1 = cast<VPInstructionWithType>(ExpressionRecipes[1]);
       O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
         << *Ext1->getResultType() << ")";
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 262f4798b3d63..f43e45b14bc74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -125,7 +125,7 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
                 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
           }
         } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
-          NewRecipe = new VPWidenCastRecipe(
+          NewRecipe = VPInstructionWithType::createWide(
               CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
               VPIRFlags(*CI), VPIRMetadata(*CI));
         } else {
@@ -1260,8 +1260,7 @@ static std::optional<std::pair<bool, unsigned>>
 getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
   return TypeSwitch<const VPSingleDefRecipe *,
                     std::optional<std::pair<bool, unsigned>>>(R)
-      .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, VPWidenGEPRecipe,
-            VPReplicateRecipe>(
+      .Case<VPInstruction, VPWidenRecipe, VPWidenGEPRecipe, VPReplicateRecipe>(
           [](auto *I) { return std::make_pair(false, I->getOpcode()); })
       .Case([](const VPWidenIntrinsicRecipe *I) {
         return std::make_pair(true, I->getVectorIntrinsicID());
@@ -1390,15 +1389,25 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
       Def->replaceAllUsesWith(A);
     } else {
       // Don't replace a non-widened cast recipe with a widened cast.
-      if (!isa<VPWidenCastRecipe>(Def))
+      auto *VPIT = dyn_cast<VPInstructionWithType>(Def);
+      if (!VPIT || VPIT->isSingleScalar())
         return;
       if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
 
         unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
-                                            TruncTy);
+        VPSingleDefRecipe *Ext;
+        if (vputils::isSingleScalar(Def)) {
+          Ext = new VPInstructionWithType(
+              Instruction::CastOps(ExtOpcode), {A}, TruncTy,
+              VPIRFlags::getDefaultFlags(ExtOpcode), {}, Def->getDebugLoc());
+          Builder.getInsertBlock()->insert(Ext, Builder.getInsertPoint());
+        } else {
+          Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
+                                        TruncTy);
+        }
+
         if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
           // UnderlyingExt has distinct return type, used to retain legacy cost.
           Ext->setUnderlyingValue(UnderlyingExt);
@@ -2081,7 +2090,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
     auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
     WideIV->setStepValue(NewStep);
 
-    auto *NewBTC = new VPWidenCastRecipe(
+    auto *NewBTC = VPInstructionWithType::createWide(
         Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
         nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
     Plan.getVectorPreheader()->appendRecipe(NewBTC);
@@ -2571,13 +2580,13 @@ void VPlanTransforms::truncateToMinimalBitwidths(
   // cannot use RAUW after creating a new truncate, as this would could make
   // other uses have different types for their operands, making them invalidly
   // typed.
-  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
+  DenseMap<VPValue *, VPInstructionWithType *> ProcessedTruncs;
   VPTypeAnalysis TypeInfo(Plan);
   VPBasicBlock *PH = Plan.getVectorPreheader();
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+      if (!isa<VPWidenRecipe, VPInstructionWithType, VPReplicateRecipe,
                VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(&R))
         continue;
 
@@ -2591,7 +2600,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
       // type. Skip those here, after incrementing NumProcessedRecipes. Also
       // skip casts which do not need to be handled explicitly here, as
       // redundant casts will be removed during recipe simplification.
-      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))
+      if (isa<VPReplicateRecipe, VPInstructionWithType>(&R))
         continue;
 
       Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
@@ -2610,7 +2619,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
       if (OldResSizeInBits != NewResSizeInBits &&
           !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
         // Extend result to original width.
-        auto *Ext = new VPWidenCastRecipe(
+        auto *Ext = VPInstructionWithType::createWide(
             Instruction::ZExt, ResultVPV, OldResTy, nullptr,
             VPIRFlags::getDefaultFlags(Instruction::ZExt));
         Ext->insertAfter(&R);
@@ -2647,7 +2656,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
           Builder.setInsertPoint(PH);
         else
           Builder.setInsertPoint(&R);
-        VPWidenCastRecipe *NewOp =
+        VPInstructionWithType *NewOp =
             Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
         ProcessedIter->second = NewOp;
         R.setOperand(Idx, NewOp);
@@ -4309,7 +4318,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
 
           InstructionCost ExtRedCost = InstructionCost::getInvalid();
           InstructionCost ExtCost =
-              cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
+              cast<VPInstructionWithType>(VecOp)->computeCost(VF, Ctx);
           InstructionCost RedCost = Red->computeCost(VF, Ctx);
 
           assert(!RedTy->isFloatingPointTy() &&
@@ -4324,12 +4333,12 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
 
   VPValue *A;
   // Match reduce(ext)).
-  if (match(VecOp, m_Isa<VPWidenCastRecipe>(m_ZExtOrSExt(m_VPValue(A)))) &&
+  if (match(VecOp, m_Isa<VPInstructionWithType>(m_ZExtOrSExt(m_VPValue(A)))) &&
       IsExtendedRedValidAndClampRange(
           RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
-          cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
+          cast<VPInstructionWithType>(VecOp)->getCastOpcode(),
           Ctx.Types.inferScalarType(A)))
-    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
+    return new VPExpressionRecipe(cast<VPInstructionWithType>(VecOp), Red);
 
   return nullptr;
 }
@@ -4357,8 +4366,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
 
   // Clamp the range if using multiply-accumulate-reduction is profitable.
   auto IsMulAccValidAndClampRange =
-      [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
-          VPWidenCastRecipe *OuterExt) -> bool {
+      [&](VPWidenRecipe *Mul, VPInstructionWithType *Ext0,
+          VPInstructionWithType *Ext1,
+          VPInstructionWithType *OuterExt) -> bool {
     return LoopVectorizationPlanner::getDecisionAndClampRange(
         [&](ElementCount VF) {
           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -4414,13 +4424,13 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   // creates two uniform extends that can more easily be matched by the rest of
   // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
   // replaced with the new extend of the constant.
-  auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
-                                           VPWidenCastRecipe *&ExtB,
+  auto ExtendAndReplaceConstantOp = [&Ctx](VPInstructionWithType *ExtA,
+                                           VPInstructionWithType *&ExtB,
                                            VPValue *&ValB, VPWidenRecipe *Mul) {
     if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
       return;
     Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
-    Instruction::CastOps ExtOpc = ExtA->getOpcode();
+    Instruction::CastOps ExtOpc = ExtA->getCastOpcode();
     const APInt *Const;
     if (!match(ValB, m_APInt(Const)) ||
         !llvm::canConstantBeExtended(
@@ -4441,8 +4451,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
 
   // Try to match reduce.add(mul(...)).
   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
-    auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
-    auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
+    auto *RecipeA = dyn_cast<VPInstructionWithType>(A);
+    auto *RecipeB = dyn_cast<VPInstructionWithType>(B);
     auto *Mul = cast<VPWidenRecipe>(VecOp);
 
     // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
@@ -4468,10 +4478,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
 
   // Match reduce.add(ext(mul(A, B))).
   if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
-    auto *Ext = cast<VPWidenCastRecipe>(VecOp);
+    auto *Ext = cast<VPInstructionWithType>(VecOp);
     auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
-    auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
-    auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
+    auto *Ext0 = dyn_cast<VPInstructionWithType>(A);
+    auto *Ext1 = dyn_cast<VPInstructionWithType>(B);
 
     // reduce.add(ext(mul(ext, const)))
     // -> reduce.add(ext(mul(ext, ext(const))))
@@ -4487,16 +4497,16 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
         Ext0->getOpcode() == Ext1->getOpcode() &&
         IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
-      auto *NewExt0 = new VPWidenCastRecipe(
+      auto *NewExt0 = VPInstructionWithType::createWide(
           Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
           *Ext0, *Ext0, Ext0->getDebugLoc());
       NewExt0->insertBefore(Ext0);
 
-      VPWidenCastRecipe *NewExt1 = NewExt0;
+      VPInstructionWithType *NewExt1 = NewExt0;
       if (Ext0 != Ext1) {
-        NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
-                                        Ext->getResultType(), nullptr, *Ext1,
-                                        *Ext1, Ext1->getDebugLoc());
+        NewExt1 = VPInstructionWithType::createWide(
+            Ext1->getOpcode(), Ext1->getOperand(0), Ext->getResultType(),
+            nullptr, *Ext1, *Ext1, Ext1->getDebugLoc());
         NewExt1->insertBefore(Ext1);
       }
       Mul->setOperand(0, NewExt0);
@@ -5157,7 +5167,7 @@ static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
   if (!WideMember0)
     return false;
   for (VPValue *V : Ops) {
-    if (!isa<VPWidenRecipe, VPWidenCastRecipe>(V))
+    if (!isa<VPWidenRecipe, VPInstructionWithType>(V))
       return false;
     auto *R = cast<VPSingleDefRecipe>(V);
     if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
@@ -5250,7 +5260,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
   if (isAlreadyNarrow(V))
     return V;
 
-  if (isa<VPWidenRecipe, VPWidenCastRecipe>(R)) {
+  if (isa<VPWidenRecipe, VPInstructionWithType>(R)) {
     auto *WideMember0 = cast<VPSingleDefRecipe>(R);
     for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
       WideMember0->setOperand(
@@ -5894,8 +5904,8 @@ optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
   // -> reduce.add(mul(ext(A), ext(trunc(C))))
   const APInt *Const;
   if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
-    auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
-    Instruction::CastOps ExtOpc = ExtA->getOpcode();
+    auto *ExtA = cast<VPInstructionWithType>(Op->getOperand(0));
+    Instruction::CastOps ExtOpc = ExtA->getCastOpcode();
     Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
     if (!Op->hasOneUse() ||
         !llvm::canConstantBeExtended(
@@ -5916,9 +5926,9 @@ optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
   if (match(Op, m_WidenIntrinsic<Intrinsic::abs>(m_Sub(
                     m_ZExtOrSExt(m_VPValue(X)), m_ZExtOrSExt(m_VPValue(Y)))))) {
     auto *Sub = Op->getOperand(0)->getDefiningRecipe();
-    auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
+    auto *Ext = cast<VPInstructionWithType>(Sub->getOperand(0));
     assert(Ext->getOpcode() ==
-               cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
+               cast<VPInstructionWithType>(Sub->getOperand(1))->getOpcode() &&
            "Expected both the LHS and RHS extends to be the same");
     bool IsSigned = Ext->getOpcode() == Instruction::SExt;
     VPBuilder Builder(Op);
@@ -5942,21 +5952,21 @@ optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
   // TODO: Support this optimization for float types.
   if (match(Op, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
                                    m_ZExtOrSExt(m_VPValue()))))) {
-    auto *Ext = cast<VPWidenCastRecipe>(Op);
+    auto *Ext = cast<VPInstructionWithType>(Op);
     auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
-    auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
-    auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
+    auto *MulLHS = cast<VPInstructionWithType>(Mul->getOperand(0));
+    auto *MulRHS = cast<VPInstructionWithType>(Mul->getOperand(1));
     if (!Mul->hasOneUse() ||
         (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
         MulLHS->getOpcode() != MulRHS->getOpcode())
       return Op;
     VPBuilder Builder(Mul);
-    Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
+    Mul->setOperand(0, Builder.createWidenCast(MulLHS->getCastOpcode(),
                                                MulLHS->getOperand(0),
                                                Ext->getResultType()));
     Mul->setOperand(1, MulLHS == MulRHS
                            ? Mul->getOperand(0)
-                           : Builder.createWidenCast(MulRHS->getOpcode(),
+                           : Builder.createWidenCast(MulRHS->getCastOpcode(),
                                                      MulRHS->getOperand(0),
                                                      Ext->getResultType()));
     return Mul;
@@ -5972,7 +5982,7 @@ createPartialReductionExpression(VPReductionRecipe *Red) {
   // reduce.[f]add(ext(op))
   //  -> VPExpressionRecipe(op, red)
   if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
-    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
+    return new VPExpressionRecipe(cast<VPInstructionWithType>(VecOp), Red);
 
   // reduce.[f]add([f]mul(ext(a), ext(b)))
   //  -> VPExpressionRecipe(a, b, mul, red)
@@ -5980,8 +5990,8 @@ createPartialReductionExpression(VPReductionRecipe *Red) {
       match(VecOp,
             m_Mul(m_ZExtOrSExt(m_VPValue()), m_ZExtOrSExt(m_VPValue())))) {
     auto *Mul = cast<VPWidenRecipe>(VecOp);
-    auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
-    auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
+    auto *ExtA = cast<VPInstructionWithType>(Mul->getOperand(0));
+    auto *ExtB = cast<VPInstructionWithType>(Mul->getOperand(1));
     return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
   }
 
@@ -5991,8 +6001,8 @@ createPartialReductionExpression(VPReductionRecipe *Red) {
                                             m_ZExtOrSExt(m_VPValue()))))) {
     auto *Sub = cast<VPWidenRecipe>(VecOp);
     auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
-    auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
-    auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
+    auto *ExtA = cast<VPInstructionWithType>(Mul->getOperand(0));
+    auto *ExtB = cast<VPInstructionWithType>(Mul->getOperand(1));
     return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
   }
 
@@ -6133,8 +6143,8 @@ getPartialReductionLinkCost(VPCostContext &CostCtx,
       CostCtx.CostKind, Flags);
 }
 
-static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
-  return TTI::getPartialReductionExtendKind(Cast->getOpcode());
+static ExtendKind getPartialReductionExtendKind(VPInstructionWithType *Cast) {
+  return TTI::getPartialReductionExtendKind(Cast->getCastOpcode());
 }
 
 /// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
@@ -6170,8 +6180,8 @@ matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
                                    m_WidenAnyExtend(m_VPValue(Y))))))) {
     auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
     auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
-    auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
-    auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
+    auto *LHSExt = cast<VPInstructionWithType>(Sub->getOperand(0));
+    auto *RHSExt = cast<VPInstructionWithType>(Sub->getOperand(1));
     Type *LHSInputType = TypeInfo.inferScalarType(X);
     Type *RHSInputType = TypeInfo.inferScalarType(Y);
     if (LHSInputType != RHSInputType ||
@@ -6187,7 +6197,7 @@ matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
 
   std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
   if (match(Op, m_WidenAnyExtend(m_VPValue()))) {
-    auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
+    auto *CastRecipe = cast<VPInstructionWithType>(Op);
     VPValue *CastSource = CastRecipe->getOperand(0);
     OuterExtKind = getPartialReductionExtendKind(CastRecipe);
     if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
@@ -6231,21 +6241,21 @@ matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
   if (!match(LHS, m_WidenAnyExtend(m_VPValue())))
     return std::nullopt;
 
-  auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
+  auto *LHSCast = cast<VPInstructionWithType>(LHS);
   Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
   ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
 
   // The RHS of the operation can be an extend or a constant integer.
   const APInt *RHSConst = nullptr;
-  VPWidenCastRecipe *RHSCast = nullptr;
+  VPInstructionWithType *RHSCast = nullptr;
   if (match(RHS, m_WidenAnyExtend(m_VPValue())))
-    RHSCast = cast<VPWidenCastRecipe>(RHS);
+    RHSCast = cast<VPInstructionWithType>(RHS);
   else if (!match(RHS, m_APInt(RHSConst)) ||
            !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
     return std::nullopt;
 
   // The outer extend kind must match the inner extends for folding.
-  for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
+  for (VPInstructionWithType *Cast : {LHSCast, RHSCast})
     if (Cast && OuterExtKind &&
         getPartialReductionExtendKind(Cast) != OuterExtKind)
       return std::nullopt;
@@ -6362,9 +6372,11 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan,
   // something that isn't another partial reduction. This is because the
   // extends are intended to be lowered along with the reduction itself.
   auto ExtendUsersValid = [&](VPValue *Ext) {
-    return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
-      return PartialReductionOps.contains(cast<VPRecipeBase>(U));
-    });
+    auto *VPI = dyn_cast<VPInstructionWithType>(Ext);
+    return !VPI || !Instruction::isCast(VPI->getOpcode()) ||
+           all_of(Ext->users(), [&](VPUser *U) {
+             return PartialReductionOps.contains(cast<VPRecipeBase>(U));
+           });
   };
 
   auto IsProfitablePartialReductionChainForVF =
@@ -6386,7 +6398,8 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan,
       if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
         RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
       for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
-        if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
+        if (auto *Extend = dyn_cast<VPInstructionWithType>(Op);
+            Extend && Instruction::isCast(Extend->getOpcode()))
           RegularCost += Extend->computeCost(VF, CostCtx);
     }
     return PartialCost.isValid() && PartialCost < RegularCost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index a60b490a69ce6..364e172c423c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -86,8 +86,10 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
 /// Returns true if \p R propagates poison from any operand to its result.
 static bool propagatesPoisonFromRecipeOp(const VPRecipeBase *R) {
   return TypeSwitch<const VPRecipeBase *, bool>(R)
-      .Case<VPWidenGEPRecipe, VPWidenCastRecipe>(
-          [](const VPRecipeBase *) { return true; })
+      .Case<VPWidenGEPRecipe>([](const VPRecipeBase *) { return true; })
+      .Case([](const VPInstructionWithType *R) {
+        return Instruction::isCast(R->getOpcode());
+      })
       .Case([](const VPReplicateRecipe *Rep) {
         // GEP and casts propagate poison from all operands.
         unsigned Opcode = Rep->getOpcode();
@@ -380,10 +382,15 @@ bool vputils::isSingleScalar(const VPValue *VPV) {
     return preservesUniformity(WidenR->getOpcode()) &&
            all_of(WidenR->operands(), isSingleScalar);
   }
-  if (auto *VPI = dyn_cast<VPInstruction>(VPV))
+  if (auto *VPI = dyn_cast<VPInstruction>(VPV)) {
+    // VPInstructionWithType carries an explicit IsSingleScalar flag that
+    // takes precedence over uniformity-based inference.
+    if (auto *VPIT = dyn_cast<VPInstructionWithType>(VPI))
+      return VPIT->isSingleScalar();
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
            (preservesUniformity(VPI->getOpcode()) &&
             all_of(VPI->operands(), isSingleScalar));
+  }
   if (auto *RR = dyn_cast<VPReductionRecipe>(VPV))
     return !RR->isPartialReduction();
   if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe, VPDerivedIVRecipe>(
@@ -432,10 +439,6 @@ bool vputils::isUniformAcrossVFsAndUFs(const VPValue *V) {
         return preservesUniformity(VPI->getOpcode()) &&
                all_of(VPI->operands(), isUniformAcrossVFsAndUFs);
       })
-      .Case([](const VPWidenCastRecipe *R) {
-        // A cast is uniform according to its operand.
-        return isUniformAcrossVFsAndUFs(R->getOperand(0));
-      })
       .Default([](const VPRecipeBase *) { // A value is considered non-uniform
                                           // unless proven otherwise.
         return false;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 31a1d95dedd3c..4f470a48ad9d8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -731,22 +731,22 @@ define void @force_branch_cost(ptr readonly %src, ptr %dst) {
 ; COST1-NEXT:    [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 2
 ; COST1-NEXT:    [[TMP40:%.*]] = insertelement <4 x i8> [[TMP39]], i8 [[TMP36]], i32 3
 ; COST1-NEXT:    [[TMP41:%.*]] = zext <4 x i8> [[TMP32]] to <4 x i32>
-; COST1-NEXT:    [[TMP44:%.*]] = extractelement <4 x i32> [[TMP41]], i64 0
-; COST1-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP41]], i64 1
-; COST1-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[TMP41]], i64 2
-; COST1-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[TMP41]], i64 3
 ; COST1-NEXT:    [[TMP46:%.*]] = zext <4 x i8> [[TMP40]] to <4 x i32>
-; COST1-NEXT:    [[TMP45:%.*]] = extractelement <4 x i32> [[TMP46]], i64 0
-; COST1-NEXT:    [[TMP50:%.*]] = extractelement <4 x i32> [[TMP46]], i64 1
-; COST1-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP46]], i64 2
-; COST1-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP46]], i64 3
+; COST1-NEXT:    [[TMP44:%.*]] = extractelement <4 x i32> [[TMP41]], i64 0
 ; COST1-NEXT:    store i32 [[TMP44]], ptr [[NEXT_GEP]], align 4, !alias.scope [[META22:![0-9]+]], !noalias [[META19]]
+; COST1-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP41]], i64 1
 ; COST1-NEXT:    store i32 [[TMP49]], ptr [[NEXT_GEP2]], align 4, !alias.scope [[META22]], !noalias [[META19]]
+; COST1-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[TMP41]], i64 2
 ; COST1-NEXT:    store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4, !alias.scope [[META22]], !noalias [[META19]]
+; COST1-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[TMP41]], i64 3
 ; COST1-NEXT:    store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4, !alias.scope [[META22]], !noalias [[META19]]
+; COST1-NEXT:    [[TMP45:%.*]] = extractelement <4 x i32> [[TMP46]], i64 0
 ; COST1-NEXT:    store i32 [[TMP45]], ptr [[NEXT_GEP5]], align 4, !alias.scope [[META22]], !noalias [[META19]]
+; COST1-NEXT:    [[TMP50:%.*]] = extractelement <4 x i32> [[TMP46]], i64 1
 ; COST1-NEXT:    store i32 [[TMP50]], ptr [[NEXT_GEP6]], align 4, !alias.scope [[META22]], !noalias [[META19]]
+; COST1-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP46]], i64 2
 ; COST1-NEXT:    store i32 [[TMP47]], ptr [[NEXT_GEP7]], align 4, !alias.scope [[META22]], !noalias [[META19]]
+; COST1-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP46]], i64 3
 ; COST1-NEXT:    store i32 [[TMP48]], ptr [[NEXT_GEP8]], align 4, !alias.scope [[META22]], !noalias [[META19]]
 ; COST1-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 24
 ; COST1-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 24
@@ -844,12 +844,12 @@ define void @force_branch_cost(ptr readonly %src, ptr %dst) {
 ; COST10-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP16]], i32 3
 ; COST10-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
 ; COST10-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP21]], i64 0
-; COST10-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i64 1
-; COST10-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i64 2
-; COST10-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i64 3
 ; COST10-NEXT:    store i32 [[TMP24]], ptr [[NEXT_GEP]], align 4, !alias.scope [[META22:![0-9]+]], !noalias [[META19]]
+; COST10-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i64 1
 ; COST10-NEXT:    store i32 [[TMP25]], ptr [[NEXT_GEP2]], align 4, !alias.scope [[META22]], !noalias [[META19]]
+; COST10-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i64 2
 ; COST10-NEXT:    store i32 [[TMP22]], ptr [[NEXT_GEP3]], align 4, !alias.scope [[META22]], !noalias [[META19]]
+; COST10-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i64 3
 ; COST10-NEXT:    store i32 [[TMP23]], ptr [[NEXT_GEP4]], align 4, !alias.scope [[META22]], !noalias [[META19]]
 ; COST10-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 24
 ; COST10-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 24
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 9e8a95b6b9a47..3c3d808342cc2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -608,24 +608,24 @@ define void@sext_sub_nsw_for_address(ptr %base, i64 %n, ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <2 x i32> zeroinitializer, [[STEP_ADD_2]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = sub nsw <2 x i32> zeroinitializer, [[STEP_ADD_3]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = sext <2 x i32> [[TMP39]] to <2 x i64>
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i64> [[TMP43]], i64 0
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i64> [[TMP43]], i64 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = sext <2 x i32> [[TMP40]] to <2 x i64>
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i64> [[TMP46]], i64 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i64> [[TMP46]], i64 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = sext <2 x i32> [[TMP41]] to <2 x i64>
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <2 x i64> [[TMP49]], i64 0
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <2 x i64> [[TMP49]], i64 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = sext <2 x i32> [[TMP42]] to <2 x i64>
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <2 x i64> [[TMP52]], i64 0
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <2 x i64> [[TMP52]], i64 1
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i64> [[TMP43]], i64 0
 ; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i64> [[TMP43]], i64 1
 ; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i64> [[TMP46]], i64 0
 ; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP47]]
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i64> [[TMP46]], i64 1
 ; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <2 x i64> [[TMP49]], i64 0
 ; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP50]]
+; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <2 x i64> [[TMP49]], i64 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP51]]
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <2 x i64> [[TMP52]], i64 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP53]]
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <2 x i64> [[TMP52]], i64 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP54]]
 ; CHECK-NEXT:    [[TMP63:%.*]] = load double, ptr [[TMP55]], align 8, !alias.scope [[META17:![0-9]+]]
 ; CHECK-NEXT:    [[TMP64:%.*]] = load double, ptr [[TMP56]], align 8, !alias.scope [[META17]]
@@ -687,8 +687,8 @@ define void@sext_sub_nsw_for_address(ptr %base, i64 %n, ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP89:%.*]] = sub nsw <2 x i32> zeroinitializer, [[VEC_IND24]]
 ; CHECK-NEXT:    [[TMP90:%.*]] = sext <2 x i32> [[TMP89]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <2 x i64> [[TMP90]], i64 0
-; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <2 x i64> [[TMP90]], i64 1
 ; CHECK-NEXT:    [[TMP93:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP91]]
+; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <2 x i64> [[TMP90]], i64 1
 ; CHECK-NEXT:    [[TMP94:%.*]] = getelementptr double, ptr [[BASE]], i64 [[TMP92]]
 ; CHECK-NEXT:    [[TMP95:%.*]] = load double, ptr [[TMP93]], align 8, !alias.scope [[META17]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = load double, ptr [[TMP94]], align 8, !alias.scope [[META17]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index cba9cdaa66770..9ba7efa2660ea 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -181,17 +181,17 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i64 2
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i64 0
 ; CHECK-NEXT:    store i16 [[TMP10]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i64 1
 ; CHECK-NEXT:    store i16 [[TMP11]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i64 2
 ; CHECK-NEXT:    store i16 [[TMP12]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i64 3
 ; CHECK-NEXT:    store i16 [[TMP13]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    store i64 0, ptr [[A]], align 8
 ; CHECK-NEXT:    store i64 0, ptr [[B]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
index d751d39446023..d3d9de99073fa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
@@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;; registers required for a <vscale x 4 x fp128> when trying to maximize
 ;; vector bandwidth with SVE.
 
-; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext  ir<%load.in> to fp128
+; CHECK: Cost of Invalid for VF vscale x 2: EMIT ir<%load.ext> = fpext ir<%load.in> to fp128
 
 define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
 ; CHECK-LABEL: define void @load_ext_trunc_store(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
index 79566fb165bc6..843c70f35ddf9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -9,14 +9,14 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'zext_i8_i16'
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
-; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
-; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 2: EMIT ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: EMIT ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: EMIT ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: EMIT ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: EMIT ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: EMIT ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: EMIT ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: EMIT ir<%conv> = zext ir<%0> to i16
 ; CHECK-LABEL: define void @zext_i8_i16
 ; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
@@ -89,14 +89,14 @@ exit:
 
 define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'sext_i8_i16'
-; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
-; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
-; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 2: EMIT ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: EMIT ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: EMIT ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: EMIT ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: EMIT ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: EMIT ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: EMIT ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: EMIT ir<%conv> = sext ir<%0> to i16
 ; CHECK-LABEL: define void @sext_i8_i16
 ; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index 668096feaf639..d3d6379ce8683 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -26,7 +26,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
 ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = vector-pointer inbounds ir<%arrayidx>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<{{.+}}>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
+; CHECK: Cost of 4 for VF 2: EMIT ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 20 for VF 2: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 26 for VF 2: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<{{.+}}>
@@ -42,7 +42,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
 ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = vector-pointer inbounds ir<%arrayidx>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<{{.+}}>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
+; CHECK: Cost of 0 for VF 4: EMIT ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<{{.+}}>
@@ -58,7 +58,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
 ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = vector-pointer inbounds ir<%arrayidx>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<{{.+}}>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
+; CHECK: Cost of 2 for VF 8: EMIT ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 36 for VF 8: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<{{.+}}>
@@ -140,15 +140,15 @@ for.inc:
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
+; CHECK: Cost of 4 for VF 2: EMIT ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
+; CHECK: Cost of 4 for VF 2: EMIT ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 26 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 2: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 22 for VF 2: WIDEN ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
@@ -171,15 +171,15 @@ for.inc:
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
+; CHECK: Cost of 0 for VF 4: EMIT ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
+; CHECK: Cost of 0 for VF 4: EMIT ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 4: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 4: EMIT ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
@@ -202,15 +202,15 @@ for.inc:
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%0> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
+; CHECK: Cost of 2 for VF 8: EMIT ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
+; CHECK: Cost of 2 for VF 8: EMIT ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 8: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
+; CHECK: Cost of 2 for VF 8: EMIT ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}
@@ -233,15 +233,15 @@ for.inc:
 ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 16: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
+; CHECK: Cost of 6 for VF 16: EMIT ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 16: WIDEN ir<%1> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
+; CHECK: Cost of 6 for VF 16: EMIT ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 16: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
+; CHECK: Cost of 6 for VF 16: EMIT ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/sve2-histcnt-vplan.ll
index 00d74e27918ac..e17236ee5a02a 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/sve2-histcnt-vplan.ll
@@ -29,7 +29,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT:     [[STEPS:vp.*]] = SCALAR-STEPS [[IV]], ir<1>, [[VF]]
 ; CHECK-NEXT:     CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]]
 ; CHECK-NEXT:     CLONE [[IDX:.*]] = load [[GEP_IDX]]
-; CHECK-NEXT:     CLONE [[EXT_IDX:.*]] = zext [[IDX]]
+; CHECK-NEXT:     EMIT-SCALAR [[EXT_IDX:.*]] = zext [[IDX]]
 ; CHECK-NEXT:     CLONE [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]]
 ; CHECK-NEXT:     CLONE [[HISTVAL:.*]] = load [[GEP_BUCKET]]
 ; CHECK-NEXT:     CLONE [[UPDATE:.*]] = add nsw [[HISTVAL]], ir<1>
@@ -79,7 +79,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT:     CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]]
 ; CHECK-NEXT:     [[VECP_IDX:vp.*]] = vector-pointer inbounds [[GEP_IDX]]
 ; CHECK-NEXT:     WIDEN [[IDX:.*]] = load [[VECP_IDX]]
-; CHECK-NEXT:     WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64
+; CHECK-NEXT:     EMIT [[EXT_IDX:.*]] = zext [[IDX]] to i64
 ; CHECK-NEXT:     WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]]
 ; CHECK-NEXT:     WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1>
 ; CHECK-NEXT:     EMIT [[IV_NEXT:.*]] = add nuw [[IV]], [[VFxUF]]
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing.ll
index d1a2c1acf77ff..d0c7636eeb56f 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing.ll
@@ -86,8 +86,8 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+do
 ; CHECK-NEXT:   WIDEN ir<%load.a> = load ir<%gep.a>
 ; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index>
 ; CHECK-NEXT:   WIDEN ir<%load.b> = load ir<%gep.b>
-; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
-; CHECK-NEXT:   WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT:   EMIT ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT:   EMIT ir<%ext.a> = zext ir<%load.a> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
 ; CHECK-NEXT:   PARTIAL-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<%mul>)
 ; CHECK-NEXT:   EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index a672984426f53..357a163febc3e 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -26,7 +26,7 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT:     CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer inbounds ir<%gep.src>
 ; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:     WIDEN-CAST ir<%conv> = fpext ir<%l> to double
+; CHECK-NEXT:     EMIT ir<%conv> = fpext ir<%l> to double
 ; CHECK-NEXT:     WIDEN-CALL ir<%s> = call fast @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64)
 ; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]>
 ; CHECK-NEXT:     REPLICATE store ir<%s>, ir<%gep.dst>
@@ -74,7 +74,7 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT:     CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer inbounds ir<%gep.src>
 ; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:     WIDEN-CAST ir<%conv> = fpext ir<%l> to double
+; CHECK-NEXT:     EMIT ir<%conv> = fpext ir<%l> to double
 ; CHECK-NEXT:     WIDEN-INTRINSIC ir<%s> = call fast llvm.sin(ir<%conv>)
 ; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]>
 ; CHECK-NEXT:     REPLICATE store ir<%s>, ir<%gep.dst>
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll
index 32765c53efdc8..acdcfafed372e 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll
@@ -31,7 +31,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:     vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[EVL_PHI]]> * ir<-1>
 ; CHECK-NEXT:     vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>, vp<[[EVL]]>
 ; CHECK-NEXT:     CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEPS]]>, ir<-1>
-; CHECK-NEXT:     CLONE ir<[[IDX_PROM:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT:     EMIT-SCALAR ir<[[IDX_PROM:%.+]]> = zext ir<[[IDX]]> to i64
 ; CHECK-NEXT:     CLONE ir<[[ARRAY_IDX_B:%.+]]> = getelementptr inbounds ir<[[B:%.+]]>, ir<[[IDX_PROM]]>
 ; CHECK-NEXT:     vp<[[VEC_END_PTR_B:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_B]]>, vp<[[EVL]]>
 ; CHECK-NEXT:     WIDEN ir<[[LOAD_B:%.+]]> = vp.load vp<[[VEC_END_PTR_B]]>, vp<[[EVL]]>
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-chains-vplan.ll
index 1993a275228f7..23b477d15bc68 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-chains-vplan.ll
@@ -210,9 +210,9 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-NEXT:      vp<[[VP5:%[0-9]+]]> = SCALAR-STEPS vp<[[VP4]]>, ir<1>, vp<[[VP0]]>
 ; CHECK-NEXT:      CLONE ir<%gep> = getelementptr ir<%base>, vp<[[VP5]]>
 ; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = first-order splice ir<%for.x>, vp<[[VP3]]>
-; CHECK-NEXT:      WIDEN-CAST ir<%for.x.prev> = trunc vp<[[VP6]]> to i32
+; CHECK-NEXT:      EMIT ir<%for.x.prev> = trunc vp<[[VP6]]> to i32
 ; CHECK-NEXT:      EMIT vp<[[VP7:%[0-9]+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
-; CHECK-NEXT:      WIDEN-CAST ir<%for.y.i64> = sext vp<[[VP7]]> to i64
+; CHECK-NEXT:      EMIT ir<%for.y.i64> = sext vp<[[VP7]]> to i64
 ; CHECK-NEXT:      vp<[[VP8:%[0-9]+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:      WIDEN store vp<[[VP8]]>, ir<%for.y.i64>
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
@@ -299,9 +299,9 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VP5]]>
 ; CHECK-NEXT:      EMIT vp<[[VP6]]> = shl ir<%l>, ir<1>
 ; CHECK-NEXT:      EMIT vp<[[VP7:%[0-9]+]]> = first-order splice ir<%for.x>, vp<[[VP6]]>
-; CHECK-NEXT:      WIDEN-CAST ir<%for.x.prev> = trunc vp<[[VP7]]> to i32
+; CHECK-NEXT:      EMIT ir<%for.x.prev> = trunc vp<[[VP7]]> to i32
 ; CHECK-NEXT:      EMIT vp<[[VP8:%[0-9]+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
-; CHECK-NEXT:      WIDEN-CAST ir<%for.y.i64> = sext vp<[[VP8]]> to i64
+; CHECK-NEXT:      EMIT ir<%for.y.i64> = sext vp<[[VP8]]> to i64
 ; CHECK-NEXT:      vp<[[VP9:%[0-9]+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:      WIDEN store vp<[[VP9]]>, ir<%for.y.i64>
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll
index c2c1cfe18fadc..539443ebf7059 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll
@@ -48,7 +48,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-NEXT:    Successor(s): loop.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    loop.0:
-; CHECK-NEXT:      WIDEN-CAST ir<%conv> = sext vp<[[VP7]]> to i32
+; CHECK-NEXT:      EMIT ir<%conv> = sext vp<[[VP7]]> to i32
 ; CHECK-NEXT:      EMIT vp<[[VP8:%[0-9]+]]> = first-order splice ir<%0>, ir<%conv>
 ; CHECK-NEXT:    Successor(s): pred.store
 ; CHECK-EMPTY:
@@ -134,7 +134,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr, i32 %z) optsize {
 ; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
+; CHECK-NEXT:    EMIT ir<%recur.next> = sext ir<%y> to i32
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -230,7 +230,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
 ; CHECK-NEXT:    EMIT vp<[[VP3:%[0-9]+]]> = reduction-start-vector ir<1234>, ir<-1>, ir<1>
-; CHECK-NEXT:    WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
+; CHECK-NEXT:    EMIT ir<%recur.next> = sext ir<%y> to i32
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -357,7 +357,7 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-NEXT:    Successor(s): loop.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    loop.0:
-; CHECK-NEXT:      WIDEN-CAST ir<%conv> = sext vp<[[VP7]]> to i32
+; CHECK-NEXT:      EMIT ir<%conv> = sext vp<[[VP7]]> to i32
 ; CHECK-NEXT:      EMIT vp<[[VP8:%[0-9]+]]> = first-order splice ir<%0>, ir<%conv>
 ; CHECK-NEXT:    Successor(s): pred.load
 ; CHECK-EMPTY:
@@ -380,7 +380,7 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    loop.2:
 ; CHECK-NEXT:      WIDEN ir<%add.1> = add ir<%conv>, vp<[[VP9]]>
-; CHECK-NEXT:      WIDEN-CAST ir<%conv.lv.2> = sext vp<[[VP10]]> to i32
+; CHECK-NEXT:      EMIT ir<%conv.lv.2> = sext vp<[[VP10]]> to i32
 ; CHECK-NEXT:      WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2>
 ; CHECK-NEXT:    Successor(s): pred.store
 ; CHECK-EMPTY:
@@ -472,7 +472,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
+; CHECK-NEXT:    EMIT ir<%recur.next> = sext ir<%y> to i32
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-metadata.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-metadata.ll
index a0ad178b2851e..3084a32aff6d8 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-metadata.ll
@@ -24,9 +24,9 @@ define void @test_widen_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
 ; CHECK-NEXT:      CLONE ir<%gep.A> = getelementptr inbounds ir<%A>, vp<[[VP4]]>
 ; CHECK-NEXT:      vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%gep.A>
 ; CHECK-NEXT:      WIDEN ir<%lv> = load vp<[[VP5]]> (!tbaa !0)
-; CHECK-NEXT:      WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float (!fpmath !4)
+; CHECK-NEXT:      EMIT ir<%conv> = sitofp ir<%lv> to float (!fpmath !4)
 ; CHECK-NEXT:      WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00> (!fpmath !4)
-; CHECK-NEXT:      WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32
+; CHECK-NEXT:      EMIT ir<%conv.back> = fptosi ir<%mul> to i32
 ; CHECK-NEXT:      CLONE ir<%gep.B> = getelementptr inbounds ir<%B>, vp<[[VP4]]>
 ; CHECK-NEXT:      vp<[[VP6:%[0-9]+]]> = vector-pointer inbounds ir<%gep.B>
 ; CHECK-NEXT:      WIDEN store vp<[[VP6]]>, ir<%conv.back> (!tbaa !0)
@@ -179,9 +179,9 @@ define void @test_widen_with_multiple_metadata(ptr noalias %A, ptr noalias %B, i
 ; CHECK-NEXT:      CLONE ir<%gep.A> = getelementptr inbounds ir<%A>, vp<[[VP4]]>
 ; CHECK-NEXT:      vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%gep.A>
 ; CHECK-NEXT:      WIDEN ir<%lv> = load vp<[[VP5]]> (!tbaa !0)
-; CHECK-NEXT:      WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float
+; CHECK-NEXT:      EMIT ir<%conv> = sitofp ir<%lv> to float
 ; CHECK-NEXT:      WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00>
-; CHECK-NEXT:      WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32
+; CHECK-NEXT:      EMIT ir<%conv.back> = fptosi ir<%mul> to i32
 ; CHECK-NEXT:      CLONE ir<%gep.B> = getelementptr inbounds ir<%B>, vp<[[VP4]]>
 ; CHECK-NEXT:      vp<[[VP6:%[0-9]+]]> = vector-pointer inbounds ir<%gep.B>
 ; CHECK-NEXT:      WIDEN store vp<[[VP6]]>, ir<%conv.back> (!tbaa !0)
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll
index 598874e282682..a0a5f17a4c37e 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll
@@ -1095,7 +1095,7 @@ define i32 @print_mulacc_extended_const_lhs(ptr %start, ptr %end) {
 ; CHECK-NEXT:      EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[VP7]]>
 ; CHECK-NEXT:      vp<[[VP8:%[0-9]+]]> = vector-pointer vp<%next.gep>
 ; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VP8]]>
-; CHECK-NEXT:      WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32
+; CHECK-NEXT:      EMIT ir<%l.ext> = zext ir<%l> to i32
 ; CHECK-NEXT:      EXPRESSION vp<[[VP9]]> = ir<%red> + reduce.add (mul ir<63>, ir<%l.ext>)
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
@@ -1175,7 +1175,7 @@ define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) {
 ; CHECK-NEXT:      EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[VP7]]>
 ; CHECK-NEXT:      vp<[[VP8:%[0-9]+]]> = vector-pointer vp<%next.gep>
 ; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VP8]]>
-; CHECK-NEXT:      WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
+; CHECK-NEXT:      EMIT ir<%l.ext> = sext ir<%l> to i32
 ; CHECK-NEXT:      EXPRESSION vp<[[VP9]]> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>)
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
@@ -1336,7 +1336,7 @@ define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) {
 ; CHECK-NEXT:      EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[VP7]]>
 ; CHECK-NEXT:      vp<[[VP8:%[0-9]+]]> = vector-pointer vp<%next.gep>
 ; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VP8]]>
-; CHECK-NEXT:      WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
+; CHECK-NEXT:      EMIT ir<%l.ext> = sext ir<%l> to i32
 ; CHECK-NEXT:      EMIT vp<[[VP9:%[0-9]+]]> = shl ir<%l.ext>, ir<7>
 ; CHECK-NEXT:      EXPRESSION vp<[[VP10]]> = ir<%red> + reduce.add (vp<[[VP9]]> sext to i64)
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]>
@@ -1426,8 +1426,8 @@ define i64 @print_ext_mul_two_uses(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    WIDEN-CAST ir<%load.ext> = sext ir<%load> to i32
-; CHECK-NEXT:    WIDEN-CAST ir<%load.ext.ext> = sext ir<%load.ext> to i64
+; CHECK-NEXT:    EMIT ir<%load.ext> = sext ir<%load> to i32
+; CHECK-NEXT:    EMIT ir<%load.ext.ext> = sext ir<%load.ext> to i64
 ; CHECK-NEXT:    EMIT vp<[[VP7:%[0-9]+]]> = compute-reduction-result (add, in-loop) vp<[[VP5]]>
 ; CHECK-NEXT:    EMIT vp<[[VP8:%[0-9]+]]> = extract-last-part ir<%load.ext.ext>
 ; CHECK-NEXT:    EMIT vp<%vector.recur.extract> = extract-last-lane vp<[[VP8]]>
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll
index d9eaab8d9a000..d08f357ab0d6e 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll
@@ -54,7 +54,7 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) {
 ; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<%bc.resume.val> from scalar.ph)
 ; CHECK-NEXT:    IR   %arrayidx = getelementptr inbounds float, ptr %y, i64 %iv
 ; CHECK-NEXT:    IR   %lv = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:    IR   %call = tail call float @llvm.sqrt.f32(float %lv)
+; CHECK-NEXT:    IR   %call = tail call float @llvm.sqrt.f32(float %lv) #2
 ; CHECK-NEXT:    IR   %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %iv
 ; CHECK-NEXT:    IR   store float %call, ptr %arrayidx2, align 4
 ; CHECK-NEXT:    IR   %iv.next = add i64 %iv, 1
@@ -982,7 +982,7 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) {
 ; CHECK-NEXT:      CLONE ir<%idx> = getelementptr ir<%p>, vp<[[VP4]]>
 ; CHECK-NEXT:      vp<[[VP5:%[0-9]+]]> = vector-pointer ir<%idx>
 ; CHECK-NEXT:      WIDEN ir<%l> = load vp<[[VP5]]>
-; CHECK-NEXT:      WIDEN-CAST ir<%zext> = zext nneg ir<%l> to i64
+; CHECK-NEXT:      EMIT ir<%zext> = zext nneg ir<%l> to i64
 ; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = extract-last-part ir<%zext>
 ; CHECK-NEXT:      EMIT vp<[[VP7:%[0-9]+]]> = extract-last-lane vp<[[VP6]]>
 ; CHECK-NEXT:      CLONE store vp<[[VP7]]>, ir<%p1>
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll
index 6580a3dacc21c..ef9f7fb52e0cd 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll
@@ -12,15 +12,15 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %mul = mul nsw i32 %conv2, %conv
 
 ; CHECK: Cost of 3 for VF 2: WIDEN ir<%0> = load
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = sext ir<%0> to i32
 ; CHECK: Cost of 3 for VF 2: WIDEN ir<%1> = load
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv2> = sext ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv2> = sext ir<%1> to i32
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv>
 
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load
-; CHECK: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i32
+; CHECK: Cost of 1 for VF 4: EMIT ir<%conv> = sext ir<%0> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load
-; CHECK: Cost of 1 for VF 4: WIDEN-CAST ir<%conv2> = sext ir<%1> to i32
+; CHECK: Cost of 1 for VF 4: EMIT ir<%conv2> = sext ir<%1> to i32
 ; CHECK: Cost of 1 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv>
 ; CHECK: LV: Selecting VF: 4.
 entry:
@@ -56,15 +56,15 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %mul = mul nsw i32 %conv2, %conv
 
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = sext ir<%0> to i32
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv2> = sext ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv2> = sext ir<%1> to i32
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv>
 
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i32
+; CHECK: Cost of 0 for VF 4: EMIT ir<%conv> = sext ir<%0> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv2> = sext ir<%1> to i32
+; CHECK: Cost of 0 for VF 4: EMIT ir<%conv2> = sext ir<%1> to i32
 ; CHECK: Cost of 1 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv>
 ; CHECK: LV: Selecting VF: 4.
 entry:
@@ -100,9 +100,9 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %mul = mul nsw i64 %conv2, %conv
 
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load
-; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i64
+; CHECK: Cost of 1 for VF 2: EMIT ir<%conv> = sext ir<%0> to i64
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load
-; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv2> = sext ir<%1> to i64
+; CHECK: Cost of 1 for VF 2: EMIT ir<%conv2> = sext ir<%1> to i64
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv2>, ir<%conv>
 ; CHECK: LV: Selecting VF: 2.
 entry:
@@ -139,7 +139,7 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul ir<%1>, ir<%0>
-; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%mul> to i64
+; CHECK: Cost of 1 for VF 2: EMIT ir<%conv> = sext ir<%mul> to i64
 ; CHECK: LV: Selecting VF: 2.
 entry:
   %cmp6.not = icmp eq i32 %N, 0
@@ -173,15 +173,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %mul = mul nuw nsw i32 %conv2, %conv
 
 ; CHECK: Cost of 3 for VF 2: WIDEN ir<%0> = load
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = zext ir<%0> to i32
 ; CHECK: Cost of 3 for VF 2: WIDEN ir<%1> = load
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv2> = zext ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv2> = zext ir<%1> to i32
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv>
 
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load
-; CHECK: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i32
+; CHECK: Cost of 1 for VF 4: EMIT ir<%conv> = zext ir<%0> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load
-; CHECK: Cost of 1 for VF 4: WIDEN-CAST ir<%conv2> = zext ir<%1> to i32
+; CHECK: Cost of 1 for VF 4: EMIT ir<%conv2> = zext ir<%1> to i32
 ; CHECK: Cost of 1 for VF 4: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv>
 ; CHECK: LV: Selecting VF: 4.
 entry:
@@ -217,15 +217,15 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %mul = mul nuw nsw i32 %conv2, %conv
 
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = zext ir<%0> to i32
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv2> = zext ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv2> = zext ir<%1> to i32
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv>
 
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i32
+; CHECK: Cost of 0 for VF 4: EMIT ir<%conv> = zext ir<%0> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv2> = zext ir<%1> to i32
+; CHECK: Cost of 0 for VF 4: EMIT ir<%conv2> = zext ir<%1> to i32
 ; CHECK: Cost of 1 for VF 4: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv>
 ; CHECK: LV: Selecting VF: 4.
 entry:
@@ -261,9 +261,9 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %mul = mul nuw nsw i64 %conv2, %conv
 
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load
-; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i64
+; CHECK: Cost of 1 for VF 2: EMIT ir<%conv> = zext ir<%0> to i64
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load
-; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv2> = zext ir<%1> to i64
+; CHECK: Cost of 1 for VF 2: EMIT ir<%conv2> = zext ir<%1> to i64
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul nuw nsw ir<%conv2>, ir<%conv>
 ; CHECK: LV: Selecting VF: 2.
 entry:
@@ -300,7 +300,7 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%0> = load
 ; CHECK: Cost of 2 for VF 2: WIDEN ir<%1> = load
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%mul> = mul ir<%1>, ir<%0>
-; CHECK: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%mul> to i64
+; CHECK: Cost of 1 for VF 2: EMIT ir<%conv> = zext ir<%mul> to i64
 ; CHECK: LV: Selecting VF: 2.
 entry:
   %cmp6.not = icmp eq i32 %N, 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index 0e754d7ef5c44..c1d6c6fef2545 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -425,8 +425,11 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 {
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> [[BROADCAST_SPLAT]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i64 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP0]], <4 x double> poison)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index ad675f6efe0a0..72ba0845e244a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -295,36 +295,36 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
 ; CHECK-NEXT:    [[TMP24:%.*]] = lshr <16 x i32> [[TMP23]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i8> [[TMP25]], i64 0
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i64 1
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i64 2
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i64 3
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i64 4
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i64 5
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i64 6
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i64 7
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i64 8
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i64 9
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i64 10
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i64 11
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i64 12
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i64 13
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i64 14
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i64 15
 ; CHECK-NEXT:    store i8 [[TMP40]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META18:![0-9]+]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i64 1
 ; CHECK-NEXT:    store i8 [[TMP41]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i64 2
 ; CHECK-NEXT:    store i8 [[TMP26]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i64 3
 ; CHECK-NEXT:    store i8 [[TMP27]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i64 4
 ; CHECK-NEXT:    store i8 [[TMP28]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i64 5
 ; CHECK-NEXT:    store i8 [[TMP29]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i64 6
 ; CHECK-NEXT:    store i8 [[TMP30]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i64 7
 ; CHECK-NEXT:    store i8 [[TMP31]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i64 8
 ; CHECK-NEXT:    store i8 [[TMP32]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i64 9
 ; CHECK-NEXT:    store i8 [[TMP33]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i64 10
 ; CHECK-NEXT:    store i8 [[TMP34]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i64 11
 ; CHECK-NEXT:    store i8 [[TMP35]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i64 12
 ; CHECK-NEXT:    store i8 [[TMP36]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i64 13
 ; CHECK-NEXT:    store i8 [[TMP37]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i64 14
 ; CHECK-NEXT:    store i8 [[TMP38]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META18]], !noalias [[META15]]
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i64 15
 ; CHECK-NEXT:    store i8 [[TMP39]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META18]], !noalias [[META15]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967184
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 7019f37449c32..697bd799c8ecc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -37,16 +37,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
 ; CHECK: Cost of 0 for VF 2: vp<[[VECP1:%.+]]> = vector-pointer inbounds ir<%arrayidx>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<[[VECP1]]>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv> = zext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
 ; CHECK: Cost of 0 for VF 2: vp<[[VECP2:%.+]]> = vector-pointer inbounds ir<%arrayidx2>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%1> = load vp<[[VECP2]]>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: EMIT ir<%conv3> = zext ir<%1> to i32
 ; CHECK: Cost of 0 for VF 2: WIDEN ir<%conv4> = and ir<%sum.013>, ir<255>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add> = add ir<%conv>, ir<%conv4>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add5> = add ir<%add>, ir<%conv3>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[TRUNC:%.+]]> = trunc ir<%add5> to i8
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[EXT]]> = zext vp<[[TRUNC]]> to i32
+; CHECK: Cost of 0 for VF 2: EMIT vp<[[TRUNC:%.+]]> = trunc ir<%add5> to i8
+; CHECK: Cost of 0 for VF 2: EMIT vp<[[EXT]]> = zext vp<[[TRUNC]]> to i32
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
 ; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
index 3063207e47b25..a7c76208e5084 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -44,72 +44,72 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 {
 ; I64-NEXT:    [[TMP17:%.*]] = add i32 [[INDEX]], 14
 ; I64-NEXT:    [[TMP18:%.*]] = add i32 [[INDEX]], 15
 ; I64-NEXT:    [[TMP19:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double>
+; I64-NEXT:    [[TMP24:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double>
+; I64-NEXT:    [[TMP29:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double>
+; I64-NEXT:    [[TMP34:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double>
+; I64-NEXT:    [[TMP54:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX]]
+; I64-NEXT:    [[TMP55:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]]
+; I64-NEXT:    [[TMP56:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]]
+; I64-NEXT:    [[TMP57:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]]
+; I64-NEXT:    [[TMP58:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]]
+; I64-NEXT:    [[TMP59:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]]
+; I64-NEXT:    [[TMP60:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]]
+; I64-NEXT:    [[TMP61:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]]
+; I64-NEXT:    [[TMP62:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]]
+; I64-NEXT:    [[TMP63:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]]
+; I64-NEXT:    [[TMP64:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]]
+; I64-NEXT:    [[TMP65:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]]
+; I64-NEXT:    [[TMP66:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]]
+; I64-NEXT:    [[TMP67:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]]
+; I64-NEXT:    [[TMP68:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]]
+; I64-NEXT:    [[TMP69:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP18]]
+; I64-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[TMP54]], align 4
+; I64-NEXT:    [[TMP40:%.*]] = load ptr, ptr [[TMP55]], align 4
+; I64-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP56]], align 4
+; I64-NEXT:    [[TMP42:%.*]] = load ptr, ptr [[TMP57]], align 4
+; I64-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP58]], align 4
+; I64-NEXT:    [[TMP44:%.*]] = load ptr, ptr [[TMP59]], align 4
+; I64-NEXT:    [[TMP45:%.*]] = load ptr, ptr [[TMP60]], align 4
+; I64-NEXT:    [[TMP46:%.*]] = load ptr, ptr [[TMP61]], align 4
+; I64-NEXT:    [[TMP47:%.*]] = load ptr, ptr [[TMP62]], align 4
+; I64-NEXT:    [[TMP48:%.*]] = load ptr, ptr [[TMP63]], align 4
+; I64-NEXT:    [[TMP49:%.*]] = load ptr, ptr [[TMP64]], align 4
+; I64-NEXT:    [[TMP50:%.*]] = load ptr, ptr [[TMP65]], align 4
+; I64-NEXT:    [[TMP51:%.*]] = load ptr, ptr [[TMP66]], align 4
+; I64-NEXT:    [[TMP52:%.*]] = load ptr, ptr [[TMP67]], align 4
+; I64-NEXT:    [[TMP53:%.*]] = load ptr, ptr [[TMP68]], align 4
+; I64-NEXT:    [[TMP70:%.*]] = load ptr, ptr [[TMP69]], align 4
 ; I64-NEXT:    [[TMP20:%.*]] = extractelement <4 x double> [[TMP19]], i64 0
+; I64-NEXT:    store double [[TMP20]], ptr [[TMP39]], align 4
 ; I64-NEXT:    [[TMP21:%.*]] = extractelement <4 x double> [[TMP19]], i64 1
+; I64-NEXT:    store double [[TMP21]], ptr [[TMP40]], align 4
 ; I64-NEXT:    [[TMP22:%.*]] = extractelement <4 x double> [[TMP19]], i64 2
+; I64-NEXT:    store double [[TMP22]], ptr [[TMP41]], align 4
 ; I64-NEXT:    [[TMP23:%.*]] = extractelement <4 x double> [[TMP19]], i64 3
-; I64-NEXT:    [[TMP24:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double>
+; I64-NEXT:    store double [[TMP23]], ptr [[TMP42]], align 4
 ; I64-NEXT:    [[TMP25:%.*]] = extractelement <4 x double> [[TMP24]], i64 0
+; I64-NEXT:    store double [[TMP25]], ptr [[TMP43]], align 4
 ; I64-NEXT:    [[TMP26:%.*]] = extractelement <4 x double> [[TMP24]], i64 1
+; I64-NEXT:    store double [[TMP26]], ptr [[TMP44]], align 4
 ; I64-NEXT:    [[TMP27:%.*]] = extractelement <4 x double> [[TMP24]], i64 2
+; I64-NEXT:    store double [[TMP27]], ptr [[TMP45]], align 4
 ; I64-NEXT:    [[TMP28:%.*]] = extractelement <4 x double> [[TMP24]], i64 3
-; I64-NEXT:    [[TMP29:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double>
+; I64-NEXT:    store double [[TMP28]], ptr [[TMP46]], align 4
 ; I64-NEXT:    [[TMP30:%.*]] = extractelement <4 x double> [[TMP29]], i64 0
+; I64-NEXT:    store double [[TMP30]], ptr [[TMP47]], align 4
 ; I64-NEXT:    [[TMP31:%.*]] = extractelement <4 x double> [[TMP29]], i64 1
+; I64-NEXT:    store double [[TMP31]], ptr [[TMP48]], align 4
 ; I64-NEXT:    [[TMP32:%.*]] = extractelement <4 x double> [[TMP29]], i64 2
+; I64-NEXT:    store double [[TMP32]], ptr [[TMP49]], align 4
 ; I64-NEXT:    [[TMP33:%.*]] = extractelement <4 x double> [[TMP29]], i64 3
-; I64-NEXT:    [[TMP34:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double>
+; I64-NEXT:    store double [[TMP33]], ptr [[TMP50]], align 4
 ; I64-NEXT:    [[TMP35:%.*]] = extractelement <4 x double> [[TMP34]], i64 0
+; I64-NEXT:    store double [[TMP35]], ptr [[TMP51]], align 4
 ; I64-NEXT:    [[TMP36:%.*]] = extractelement <4 x double> [[TMP34]], i64 1
+; I64-NEXT:    store double [[TMP36]], ptr [[TMP52]], align 4
 ; I64-NEXT:    [[TMP37:%.*]] = extractelement <4 x double> [[TMP34]], i64 2
+; I64-NEXT:    store double [[TMP37]], ptr [[TMP53]], align 4
 ; I64-NEXT:    [[TMP38:%.*]] = extractelement <4 x double> [[TMP34]], i64 3
-; I64-NEXT:    [[TMP39:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX]]
-; I64-NEXT:    [[TMP40:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]]
-; I64-NEXT:    [[TMP41:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]]
-; I64-NEXT:    [[TMP42:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]]
-; I64-NEXT:    [[TMP43:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]]
-; I64-NEXT:    [[TMP44:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]]
-; I64-NEXT:    [[TMP45:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]]
-; I64-NEXT:    [[TMP46:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]]
-; I64-NEXT:    [[TMP47:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]]
-; I64-NEXT:    [[TMP48:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]]
-; I64-NEXT:    [[TMP49:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]]
-; I64-NEXT:    [[TMP50:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]]
-; I64-NEXT:    [[TMP51:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]]
-; I64-NEXT:    [[TMP52:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]]
-; I64-NEXT:    [[TMP53:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]]
-; I64-NEXT:    [[TMP54:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP18]]
-; I64-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4
-; I64-NEXT:    [[TMP56:%.*]] = load ptr, ptr [[TMP40]], align 4
-; I64-NEXT:    [[TMP57:%.*]] = load ptr, ptr [[TMP41]], align 4
-; I64-NEXT:    [[TMP58:%.*]] = load ptr, ptr [[TMP42]], align 4
-; I64-NEXT:    [[TMP59:%.*]] = load ptr, ptr [[TMP43]], align 4
-; I64-NEXT:    [[TMP60:%.*]] = load ptr, ptr [[TMP44]], align 4
-; I64-NEXT:    [[TMP61:%.*]] = load ptr, ptr [[TMP45]], align 4
-; I64-NEXT:    [[TMP62:%.*]] = load ptr, ptr [[TMP46]], align 4
-; I64-NEXT:    [[TMP63:%.*]] = load ptr, ptr [[TMP47]], align 4
-; I64-NEXT:    [[TMP64:%.*]] = load ptr, ptr [[TMP48]], align 4
-; I64-NEXT:    [[TMP65:%.*]] = load ptr, ptr [[TMP49]], align 4
-; I64-NEXT:    [[TMP66:%.*]] = load ptr, ptr [[TMP50]], align 4
-; I64-NEXT:    [[TMP67:%.*]] = load ptr, ptr [[TMP51]], align 4
-; I64-NEXT:    [[TMP68:%.*]] = load ptr, ptr [[TMP52]], align 4
-; I64-NEXT:    [[TMP69:%.*]] = load ptr, ptr [[TMP53]], align 4
-; I64-NEXT:    [[TMP70:%.*]] = load ptr, ptr [[TMP54]], align 4
-; I64-NEXT:    store double [[TMP20]], ptr [[TMP55]], align 4
-; I64-NEXT:    store double [[TMP21]], ptr [[TMP56]], align 4
-; I64-NEXT:    store double [[TMP22]], ptr [[TMP57]], align 4
-; I64-NEXT:    store double [[TMP23]], ptr [[TMP58]], align 4
-; I64-NEXT:    store double [[TMP25]], ptr [[TMP59]], align 4
-; I64-NEXT:    store double [[TMP26]], ptr [[TMP60]], align 4
-; I64-NEXT:    store double [[TMP27]], ptr [[TMP61]], align 4
-; I64-NEXT:    store double [[TMP28]], ptr [[TMP62]], align 4
-; I64-NEXT:    store double [[TMP30]], ptr [[TMP63]], align 4
-; I64-NEXT:    store double [[TMP31]], ptr [[TMP64]], align 4
-; I64-NEXT:    store double [[TMP32]], ptr [[TMP65]], align 4
-; I64-NEXT:    store double [[TMP33]], ptr [[TMP66]], align 4
-; I64-NEXT:    store double [[TMP35]], ptr [[TMP67]], align 4
-; I64-NEXT:    store double [[TMP36]], ptr [[TMP68]], align 4
-; I64-NEXT:    store double [[TMP37]], ptr [[TMP69]], align 4
 ; I64-NEXT:    store double [[TMP38]], ptr [[TMP70]], align 4
 ; I64-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; I64-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
@@ -137,21 +137,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 {
 ; I64-NEXT:    [[TMP75:%.*]] = add i32 [[INDEX4]], 2
 ; I64-NEXT:    [[TMP76:%.*]] = add i32 [[INDEX4]], 3
 ; I64-NEXT:    [[TMP77:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double>
+; I64-NEXT:    [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX4]]
+; I64-NEXT:    [[TMP86:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]]
+; I64-NEXT:    [[TMP87:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]]
+; I64-NEXT:    [[TMP88:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]]
+; I64-NEXT:    [[TMP82:%.*]] = load ptr, ptr [[TMP85]], align 4
+; I64-NEXT:    [[TMP83:%.*]] = load ptr, ptr [[TMP86]], align 4
+; I64-NEXT:    [[TMP84:%.*]] = load ptr, ptr [[TMP87]], align 4
+; I64-NEXT:    [[TMP89:%.*]] = load ptr, ptr [[TMP88]], align 4
 ; I64-NEXT:    [[TMP78:%.*]] = extractelement <4 x double> [[TMP77]], i64 0
+; I64-NEXT:    store double [[TMP78]], ptr [[TMP82]], align 4
 ; I64-NEXT:    [[TMP79:%.*]] = extractelement <4 x double> [[TMP77]], i64 1
+; I64-NEXT:    store double [[TMP79]], ptr [[TMP83]], align 4
 ; I64-NEXT:    [[TMP80:%.*]] = extractelement <4 x double> [[TMP77]], i64 2
+; I64-NEXT:    store double [[TMP80]], ptr [[TMP84]], align 4
 ; I64-NEXT:    [[TMP81:%.*]] = extractelement <4 x double> [[TMP77]], i64 3
-; I64-NEXT:    [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX4]]
-; I64-NEXT:    [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]]
-; I64-NEXT:    [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]]
-; I64-NEXT:    [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]]
-; I64-NEXT:    [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4
-; I64-NEXT:    [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4
-; I64-NEXT:    [[TMP88:%.*]] = load ptr, ptr [[TMP84]], align 4
-; I64-NEXT:    [[TMP89:%.*]] = load ptr, ptr [[TMP85]], align 4
-; I64-NEXT:    store double [[TMP78]], ptr [[TMP86]], align 4
-; I64-NEXT:    store double [[TMP79]], ptr [[TMP87]], align 4
-; I64-NEXT:    store double [[TMP80]], ptr [[TMP88]], align 4
 ; I64-NEXT:    store double [[TMP81]], ptr [[TMP89]], align 4
 ; I64-NEXT:    [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4
 ; I64-NEXT:    [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4)
@@ -198,72 +198,72 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 {
 ; I32-NEXT:    [[TMP16:%.*]] = add i32 [[INDEX]], 14
 ; I32-NEXT:    [[TMP17:%.*]] = add i32 [[INDEX]], 15
 ; I32-NEXT:    [[TMP18:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double>
+; I32-NEXT:    [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double>
+; I32-NEXT:    [[TMP28:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double>
+; I32-NEXT:    [[TMP33:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double>
+; I32-NEXT:    [[TMP53:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX]]
+; I32-NEXT:    [[TMP54:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]]
+; I32-NEXT:    [[TMP55:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]]
+; I32-NEXT:    [[TMP56:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]]
+; I32-NEXT:    [[TMP57:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]]
+; I32-NEXT:    [[TMP58:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]]
+; I32-NEXT:    [[TMP59:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]]
+; I32-NEXT:    [[TMP60:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]]
+; I32-NEXT:    [[TMP61:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]]
+; I32-NEXT:    [[TMP62:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]]
+; I32-NEXT:    [[TMP63:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]]
+; I32-NEXT:    [[TMP64:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]]
+; I32-NEXT:    [[TMP65:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]]
+; I32-NEXT:    [[TMP66:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]]
+; I32-NEXT:    [[TMP67:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]]
+; I32-NEXT:    [[TMP68:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]]
+; I32-NEXT:    [[TMP38:%.*]] = load ptr, ptr [[TMP53]], align 4
+; I32-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[TMP54]], align 4
+; I32-NEXT:    [[TMP40:%.*]] = load ptr, ptr [[TMP55]], align 4
+; I32-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP56]], align 4
+; I32-NEXT:    [[TMP42:%.*]] = load ptr, ptr [[TMP57]], align 4
+; I32-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP58]], align 4
+; I32-NEXT:    [[TMP44:%.*]] = load ptr, ptr [[TMP59]], align 4
+; I32-NEXT:    [[TMP45:%.*]] = load ptr, ptr [[TMP60]], align 4
+; I32-NEXT:    [[TMP46:%.*]] = load ptr, ptr [[TMP61]], align 4
+; I32-NEXT:    [[TMP47:%.*]] = load ptr, ptr [[TMP62]], align 4
+; I32-NEXT:    [[TMP48:%.*]] = load ptr, ptr [[TMP63]], align 4
+; I32-NEXT:    [[TMP49:%.*]] = load ptr, ptr [[TMP64]], align 4
+; I32-NEXT:    [[TMP50:%.*]] = load ptr, ptr [[TMP65]], align 4
+; I32-NEXT:    [[TMP51:%.*]] = load ptr, ptr [[TMP66]], align 4
+; I32-NEXT:    [[TMP52:%.*]] = load ptr, ptr [[TMP67]], align 4
+; I32-NEXT:    [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 4
 ; I32-NEXT:    [[TMP19:%.*]] = extractelement <4 x double> [[TMP18]], i64 0
+; I32-NEXT:    store double [[TMP19]], ptr [[TMP38]], align 4
 ; I32-NEXT:    [[TMP20:%.*]] = extractelement <4 x double> [[TMP18]], i64 1
+; I32-NEXT:    store double [[TMP20]], ptr [[TMP39]], align 4
 ; I32-NEXT:    [[TMP21:%.*]] = extractelement <4 x double> [[TMP18]], i64 2
+; I32-NEXT:    store double [[TMP21]], ptr [[TMP40]], align 4
 ; I32-NEXT:    [[TMP22:%.*]] = extractelement <4 x double> [[TMP18]], i64 3
-; I32-NEXT:    [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double>
+; I32-NEXT:    store double [[TMP22]], ptr [[TMP41]], align 4
 ; I32-NEXT:    [[TMP24:%.*]] = extractelement <4 x double> [[TMP23]], i64 0
+; I32-NEXT:    store double [[TMP24]], ptr [[TMP42]], align 4
 ; I32-NEXT:    [[TMP25:%.*]] = extractelement <4 x double> [[TMP23]], i64 1
+; I32-NEXT:    store double [[TMP25]], ptr [[TMP43]], align 4
 ; I32-NEXT:    [[TMP26:%.*]] = extractelement <4 x double> [[TMP23]], i64 2
+; I32-NEXT:    store double [[TMP26]], ptr [[TMP44]], align 4
 ; I32-NEXT:    [[TMP27:%.*]] = extractelement <4 x double> [[TMP23]], i64 3
-; I32-NEXT:    [[TMP28:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double>
+; I32-NEXT:    store double [[TMP27]], ptr [[TMP45]], align 4
 ; I32-NEXT:    [[TMP29:%.*]] = extractelement <4 x double> [[TMP28]], i64 0
+; I32-NEXT:    store double [[TMP29]], ptr [[TMP46]], align 4
 ; I32-NEXT:    [[TMP30:%.*]] = extractelement <4 x double> [[TMP28]], i64 1
+; I32-NEXT:    store double [[TMP30]], ptr [[TMP47]], align 4
 ; I32-NEXT:    [[TMP31:%.*]] = extractelement <4 x double> [[TMP28]], i64 2
+; I32-NEXT:    store double [[TMP31]], ptr [[TMP48]], align 4
 ; I32-NEXT:    [[TMP32:%.*]] = extractelement <4 x double> [[TMP28]], i64 3
-; I32-NEXT:    [[TMP33:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double>
+; I32-NEXT:    store double [[TMP32]], ptr [[TMP49]], align 4
 ; I32-NEXT:    [[TMP34:%.*]] = extractelement <4 x double> [[TMP33]], i64 0
+; I32-NEXT:    store double [[TMP34]], ptr [[TMP50]], align 4
 ; I32-NEXT:    [[TMP35:%.*]] = extractelement <4 x double> [[TMP33]], i64 1
+; I32-NEXT:    store double [[TMP35]], ptr [[TMP51]], align 4
 ; I32-NEXT:    [[TMP36:%.*]] = extractelement <4 x double> [[TMP33]], i64 2
+; I32-NEXT:    store double [[TMP36]], ptr [[TMP52]], align 4
 ; I32-NEXT:    [[TMP37:%.*]] = extractelement <4 x double> [[TMP33]], i64 3
-; I32-NEXT:    [[TMP38:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX]]
-; I32-NEXT:    [[TMP39:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]]
-; I32-NEXT:    [[TMP40:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]]
-; I32-NEXT:    [[TMP41:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]]
-; I32-NEXT:    [[TMP42:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]]
-; I32-NEXT:    [[TMP43:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]]
-; I32-NEXT:    [[TMP44:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]]
-; I32-NEXT:    [[TMP45:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]]
-; I32-NEXT:    [[TMP46:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]]
-; I32-NEXT:    [[TMP47:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]]
-; I32-NEXT:    [[TMP48:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]]
-; I32-NEXT:    [[TMP49:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]]
-; I32-NEXT:    [[TMP50:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]]
-; I32-NEXT:    [[TMP51:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]]
-; I32-NEXT:    [[TMP52:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]]
-; I32-NEXT:    [[TMP53:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]]
-; I32-NEXT:    [[TMP54:%.*]] = load ptr, ptr [[TMP38]], align 4
-; I32-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4
-; I32-NEXT:    [[TMP56:%.*]] = load ptr, ptr [[TMP40]], align 4
-; I32-NEXT:    [[TMP57:%.*]] = load ptr, ptr [[TMP41]], align 4
-; I32-NEXT:    [[TMP58:%.*]] = load ptr, ptr [[TMP42]], align 4
-; I32-NEXT:    [[TMP59:%.*]] = load ptr, ptr [[TMP43]], align 4
-; I32-NEXT:    [[TMP60:%.*]] = load ptr, ptr [[TMP44]], align 4
-; I32-NEXT:    [[TMP61:%.*]] = load ptr, ptr [[TMP45]], align 4
-; I32-NEXT:    [[TMP62:%.*]] = load ptr, ptr [[TMP46]], align 4
-; I32-NEXT:    [[TMP63:%.*]] = load ptr, ptr [[TMP47]], align 4
-; I32-NEXT:    [[TMP64:%.*]] = load ptr, ptr [[TMP48]], align 4
-; I32-NEXT:    [[TMP65:%.*]] = load ptr, ptr [[TMP49]], align 4
-; I32-NEXT:    [[TMP66:%.*]] = load ptr, ptr [[TMP50]], align 4
-; I32-NEXT:    [[TMP67:%.*]] = load ptr, ptr [[TMP51]], align 4
-; I32-NEXT:    [[TMP68:%.*]] = load ptr, ptr [[TMP52]], align 4
-; I32-NEXT:    [[TMP69:%.*]] = load ptr, ptr [[TMP53]], align 4
-; I32-NEXT:    store double [[TMP19]], ptr [[TMP54]], align 4
-; I32-NEXT:    store double [[TMP20]], ptr [[TMP55]], align 4
-; I32-NEXT:    store double [[TMP21]], ptr [[TMP56]], align 4
-; I32-NEXT:    store double [[TMP22]], ptr [[TMP57]], align 4
-; I32-NEXT:    store double [[TMP24]], ptr [[TMP58]], align 4
-; I32-NEXT:    store double [[TMP25]], ptr [[TMP59]], align 4
-; I32-NEXT:    store double [[TMP26]], ptr [[TMP60]], align 4
-; I32-NEXT:    store double [[TMP27]], ptr [[TMP61]], align 4
-; I32-NEXT:    store double [[TMP29]], ptr [[TMP62]], align 4
-; I32-NEXT:    store double [[TMP30]], ptr [[TMP63]], align 4
-; I32-NEXT:    store double [[TMP31]], ptr [[TMP64]], align 4
-; I32-NEXT:    store double [[TMP32]], ptr [[TMP65]], align 4
-; I32-NEXT:    store double [[TMP34]], ptr [[TMP66]], align 4
-; I32-NEXT:    store double [[TMP35]], ptr [[TMP67]], align 4
-; I32-NEXT:    store double [[TMP36]], ptr [[TMP68]], align 4
 ; I32-NEXT:    store double [[TMP37]], ptr [[TMP69]], align 4
 ; I32-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; I32-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
@@ -291,21 +291,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 {
 ; I32-NEXT:    [[TMP74:%.*]] = add i32 [[INDEX4]], 2
 ; I32-NEXT:    [[TMP75:%.*]] = add i32 [[INDEX4]], 3
 ; I32-NEXT:    [[TMP76:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double>
+; I32-NEXT:    [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX4]]
+; I32-NEXT:    [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP73]]
+; I32-NEXT:    [[TMP86:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]]
+; I32-NEXT:    [[TMP87:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]]
+; I32-NEXT:    [[TMP81:%.*]] = load ptr, ptr [[TMP84]], align 4
+; I32-NEXT:    [[TMP82:%.*]] = load ptr, ptr [[TMP85]], align 4
+; I32-NEXT:    [[TMP83:%.*]] = load ptr, ptr [[TMP86]], align 4
+; I32-NEXT:    [[TMP88:%.*]] = load ptr, ptr [[TMP87]], align 4
 ; I32-NEXT:    [[TMP77:%.*]] = extractelement <4 x double> [[TMP76]], i64 0
+; I32-NEXT:    store double [[TMP77]], ptr [[TMP81]], align 4
 ; I32-NEXT:    [[TMP78:%.*]] = extractelement <4 x double> [[TMP76]], i64 1
+; I32-NEXT:    store double [[TMP78]], ptr [[TMP82]], align 4
 ; I32-NEXT:    [[TMP79:%.*]] = extractelement <4 x double> [[TMP76]], i64 2
+; I32-NEXT:    store double [[TMP79]], ptr [[TMP83]], align 4
 ; I32-NEXT:    [[TMP80:%.*]] = extractelement <4 x double> [[TMP76]], i64 3
-; I32-NEXT:    [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[INDEX4]]
-; I32-NEXT:    [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP73]]
-; I32-NEXT:    [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]]
-; I32-NEXT:    [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]]
-; I32-NEXT:    [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4
-; I32-NEXT:    [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4
-; I32-NEXT:    [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4
-; I32-NEXT:    [[TMP88:%.*]] = load ptr, ptr [[TMP84]], align 4
-; I32-NEXT:    store double [[TMP77]], ptr [[TMP85]], align 4
-; I32-NEXT:    store double [[TMP78]], ptr [[TMP86]], align 4
-; I32-NEXT:    store double [[TMP79]], ptr [[TMP87]], align 4
 ; I32-NEXT:    store double [[TMP80]], ptr [[TMP88]], align 4
 ; I32-NEXT:    [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4
 ; I32-NEXT:    [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index 394e972d79a86..31e87e772d935 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -515,14 +515,6 @@ define void @test(ptr %A, ptr noalias %B) #0 {
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:    [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8>
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i64 0
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i64 1
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i64 2
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i64 3
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i64 4
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i64 5
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i64 6
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i64 7
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP2]]
@@ -531,13 +523,21 @@ define void @test(ptr %A, ptr noalias %B) #0 {
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i64 0
 ; CHECK-NEXT:    store i8 [[TMP28]], ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i64 1
 ; CHECK-NEXT:    store i8 [[TMP29]], ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i64 2
 ; CHECK-NEXT:    store i8 [[TMP30]], ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i64 3
 ; CHECK-NEXT:    store i8 [[TMP31]], ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i64 4
 ; CHECK-NEXT:    store i8 [[TMP32]], ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i64 5
 ; CHECK-NEXT:    store i8 [[TMP33]], ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i64 6
 ; CHECK-NEXT:    store i8 [[TMP34]], ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i64 7
 ; CHECK-NEXT:    store i8 [[TMP35]], ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -577,22 +577,6 @@ define void @test(ptr %A, ptr noalias %B) #0 {
 ; MAX-BW-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; MAX-BW-NEXT:    [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
 ; MAX-BW-NEXT:    [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8>
-; MAX-BW-NEXT:    [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i64 0
-; MAX-BW-NEXT:    [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i64 1
-; MAX-BW-NEXT:    [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i64 2
-; MAX-BW-NEXT:    [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i64 3
-; MAX-BW-NEXT:    [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i64 4
-; MAX-BW-NEXT:    [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i64 5
-; MAX-BW-NEXT:    [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i64 6
-; MAX-BW-NEXT:    [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i64 7
-; MAX-BW-NEXT:    [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i64 8
-; MAX-BW-NEXT:    [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i64 9
-; MAX-BW-NEXT:    [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i64 10
-; MAX-BW-NEXT:    [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i64 11
-; MAX-BW-NEXT:    [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i64 12
-; MAX-BW-NEXT:    [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i64 13
-; MAX-BW-NEXT:    [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i64 14
-; MAX-BW-NEXT:    [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i64 15
 ; MAX-BW-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[OFFSET_IDX]]
 ; MAX-BW-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP1]]
 ; MAX-BW-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP2]]
@@ -609,21 +593,37 @@ define void @test(ptr %A, ptr noalias %B) #0 {
 ; MAX-BW-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]]
 ; MAX-BW-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]]
 ; MAX-BW-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]]
+; MAX-BW-NEXT:    [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i64 0
 ; MAX-BW-NEXT:    store i8 [[TMP52]], ptr [[TMP69]], align 1
+; MAX-BW-NEXT:    [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i64 1
 ; MAX-BW-NEXT:    store i8 [[TMP53]], ptr [[TMP70]], align 1
+; MAX-BW-NEXT:    [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i64 2
 ; MAX-BW-NEXT:    store i8 [[TMP54]], ptr [[TMP71]], align 1
+; MAX-BW-NEXT:    [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i64 3
 ; MAX-BW-NEXT:    store i8 [[TMP55]], ptr [[TMP72]], align 1
+; MAX-BW-NEXT:    [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i64 4
 ; MAX-BW-NEXT:    store i8 [[TMP56]], ptr [[TMP73]], align 1
+; MAX-BW-NEXT:    [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i64 5
 ; MAX-BW-NEXT:    store i8 [[TMP57]], ptr [[TMP74]], align 1
+; MAX-BW-NEXT:    [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i64 6
 ; MAX-BW-NEXT:    store i8 [[TMP58]], ptr [[TMP75]], align 1
+; MAX-BW-NEXT:    [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i64 7
 ; MAX-BW-NEXT:    store i8 [[TMP59]], ptr [[TMP76]], align 1
+; MAX-BW-NEXT:    [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i64 8
 ; MAX-BW-NEXT:    store i8 [[TMP60]], ptr [[TMP77]], align 1
+; MAX-BW-NEXT:    [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i64 9
 ; MAX-BW-NEXT:    store i8 [[TMP61]], ptr [[TMP78]], align 1
+; MAX-BW-NEXT:    [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i64 10
 ; MAX-BW-NEXT:    store i8 [[TMP62]], ptr [[TMP79]], align 1
+; MAX-BW-NEXT:    [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i64 11
 ; MAX-BW-NEXT:    store i8 [[TMP63]], ptr [[TMP80]], align 1
+; MAX-BW-NEXT:    [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i64 12
 ; MAX-BW-NEXT:    store i8 [[TMP64]], ptr [[TMP81]], align 1
+; MAX-BW-NEXT:    [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i64 13
 ; MAX-BW-NEXT:    store i8 [[TMP65]], ptr [[TMP82]], align 1
+; MAX-BW-NEXT:    [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i64 14
 ; MAX-BW-NEXT:    store i8 [[TMP66]], ptr [[TMP83]], align 1
+; MAX-BW-NEXT:    [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i64 15
 ; MAX-BW-NEXT:    store i8 [[TMP67]], ptr [[TMP51]], align 1
 ; MAX-BW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; MAX-BW-NEXT:    [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index 0edb89af5bc54..5ab7ad0014170 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 
 ; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double
-; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double
+; CHECK: Cost of 5 for VF 2: EMIT ir<%conv> = uitofp ir<%tmp> to double
+; CHECK: Cost of 10 for VF 4: EMIT ir<%conv> = uitofp ir<%tmp> to double
 define void @uint64_to_double_cost(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
index a80fd0bf6ddd7..54466b8e5c5c1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
@@ -32,11 +32,11 @@ define void @example() {
 ; FORCED-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; FORCED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; FORCED-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[VEC_IND]] to <2 x x86_fp80>
-; FORCED-NEXT:    [[TMP5:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i64 0
-; FORCED-NEXT:    [[TMP6:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i64 1
 ; FORCED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 [[INDEX]]
 ; FORCED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 [[TMP1]]
+; FORCED-NEXT:    [[TMP5:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i64 0
 ; FORCED-NEXT:    store x86_fp80 [[TMP5]], ptr [[TMP3]], align 16
+; FORCED-NEXT:    [[TMP6:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i64 1
 ; FORCED-NEXT:    store x86_fp80 [[TMP6]], ptr [[TMP4]], align 16
 ; FORCED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; FORCED-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
@@ -100,8 +100,8 @@ define void @test_replicating_store_x86_fp80_cost(i32 %n, ptr %dst) #0 {
 ; FORCED-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; FORCED-NEXT:    [[TMP4:%.*]] = zext <2 x i32> [[VEC_IND]] to <2 x i64>
 ; FORCED-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i64 0
-; FORCED-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i64 1
 ; FORCED-NEXT:    [[TMP6:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP5]]
+; FORCED-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i64 1
 ; FORCED-NEXT:    [[TMP8:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP7]]
 ; FORCED-NEXT:    store x86_fp80 0xK00000000000000000000, ptr [[TMP6]], align 16
 ; FORCED-NEXT:    store x86_fp80 0xK00000000000000000000, ptr [[TMP8]], align 16
diff --git a/llvm/test/Transforms/LoopVectorize/as_cast.ll b/llvm/test/Transforms/LoopVectorize/as_cast.ll
index 31ed496de0ccf..7d93501da4411 100644
--- a/llvm/test/Transforms/LoopVectorize/as_cast.ll
+++ b/llvm/test/Transforms/LoopVectorize/as_cast.ll
@@ -7,6 +7,7 @@ define void @loop_invariant_as_cast(ptr addrspace(1) %in) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[IN]] to ptr
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
@@ -16,7 +17,6 @@ define void @loop_invariant_as_cast(ptr addrspace(1) %in) {
 ; CHECK-NEXT:    br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[IN]] to ptr
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 [[TMP3]]
 ; CHECK-NEXT:    store i64 [[TMP3]], ptr [[TMP5]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
@@ -24,8 +24,7 @@ define void @loop_invariant_as_cast(ptr addrspace(1) %in) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_IF1]]:
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[IN]] to ptr
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 [[TMP6]]
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
@@ -65,19 +64,19 @@ define void @loop_varying_as_cast(ptr addrspace(1) %in) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule i64 [[TMP0]], 6
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; CHECK:       [[PRED_STORE_IF]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[IN]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[IN]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[TMP4]] to ptr
+; CHECK-NEXT:    [[TMP8:%.*]] = addrspacecast ptr addrspace(1) [[TMP7]] to ptr
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:       [[PRED_STORE_IF]]:
 ; CHECK-NEXT:    store i64 [[TMP3]], ptr [[TMP5]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_IF1]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[IN]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = addrspacecast ptr addrspace(1) [[TMP7]] to ptr
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
diff --git a/llvm/test/Transforms/LoopVectorize/cast-induction.ll b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
index 979b4ff3c0e7a..46792854f2826 100644
--- a/llvm/test/Transforms/LoopVectorize/cast-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
@@ -276,16 +276,16 @@ define void @cast_induction_tail_folding(ptr %A) {
 ; IC2-NEXT:    [[INDEX1:%.*]] = add i32 [[INDEX]], 1
 ; IC2-NEXT:    [[TMP2:%.*]] = icmp ule i32 [[INDEX]], 2
 ; IC2-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[INDEX1]], 2
+; IC2-NEXT:    [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
+; IC2-NEXT:    [[TMP6:%.*]] = sext i32 [[INDEX1]] to i64
 ; IC2-NEXT:    br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; IC2:       [[PRED_STORE_IF]]:
-; IC2-NEXT:    [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
 ; IC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
 ; IC2-NEXT:    store i32 [[INDEX]], ptr [[TMP5]], align 4
 ; IC2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; IC2:       [[PRED_STORE_CONTINUE]]:
 ; IC2-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
 ; IC2:       [[PRED_STORE_IF1]]:
-; IC2-NEXT:    [[TMP6:%.*]] = sext i32 [[INDEX1]] to i64
 ; IC2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; IC2-NEXT:    store i32 [[INDEX1]], ptr [[TMP7]], align 4
 ; IC2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index 3d55cbdda44bb..05fbbd735a7ad 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -1782,15 +1782,15 @@ define void @fp_iv_used_in_gep_fadd(float %init, ptr noalias nocapture %A, float
 ; VEC4_INTERL1-NEXT:    [[TMP12:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP11]]
 ; VEC4_INTERL1-NEXT:    [[TMP13:%.*]] = fptoui <4 x float> [[VEC_IND]] to <4 x i32>
 ; VEC4_INTERL1-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i64 0
-; VEC4_INTERL1-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1
-; VEC4_INTERL1-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2
-; VEC4_INTERL1-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3
 ; VEC4_INTERL1-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP14]] to i64
 ; VEC4_INTERL1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP18]]
+; VEC4_INTERL1-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1
 ; VEC4_INTERL1-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP15]] to i64
 ; VEC4_INTERL1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP20]]
+; VEC4_INTERL1-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2
 ; VEC4_INTERL1-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP16]] to i64
 ; VEC4_INTERL1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP22]]
+; VEC4_INTERL1-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3
 ; VEC4_INTERL1-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP17]] to i64
 ; VEC4_INTERL1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP24]]
 ; VEC4_INTERL1-NEXT:    store float [[OFFSET_IDX]], ptr [[TMP19]], align 4
@@ -1865,29 +1865,29 @@ define void @fp_iv_used_in_gep_fadd(float %init, ptr noalias nocapture %A, float
 ; VEC4_INTERL2-NEXT:    [[TMP19:%.*]] = fmul fast float [[FPINC]], 7.000000e+00
 ; VEC4_INTERL2-NEXT:    [[TMP20:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP19]]
 ; VEC4_INTERL2-NEXT:    [[TMP21:%.*]] = fptoui <4 x float> [[VEC_IND]] to <4 x i32>
-; VEC4_INTERL2-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i64 0
-; VEC4_INTERL2-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i64 1
-; VEC4_INTERL2-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP21]], i64 2
-; VEC4_INTERL2-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i64 3
 ; VEC4_INTERL2-NEXT:    [[TMP26:%.*]] = fptoui <4 x float> [[STEP_ADD]] to <4 x i32>
-; VEC4_INTERL2-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP26]], i64 0
-; VEC4_INTERL2-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP26]], i64 1
-; VEC4_INTERL2-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP26]], i64 2
-; VEC4_INTERL2-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP26]], i64 3
+; VEC4_INTERL2-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i64 0
 ; VEC4_INTERL2-NEXT:    [[TMP31:%.*]] = sext i32 [[TMP22]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP31]]
+; VEC4_INTERL2-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i64 1
 ; VEC4_INTERL2-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP23]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP33]]
+; VEC4_INTERL2-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP21]], i64 2
 ; VEC4_INTERL2-NEXT:    [[TMP35:%.*]] = sext i32 [[TMP24]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP35]]
+; VEC4_INTERL2-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i64 3
 ; VEC4_INTERL2-NEXT:    [[TMP37:%.*]] = sext i32 [[TMP25]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP37]]
+; VEC4_INTERL2-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP26]], i64 0
 ; VEC4_INTERL2-NEXT:    [[TMP39:%.*]] = sext i32 [[TMP27]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP39]]
+; VEC4_INTERL2-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP26]], i64 1
 ; VEC4_INTERL2-NEXT:    [[TMP41:%.*]] = sext i32 [[TMP28]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP41]]
+; VEC4_INTERL2-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP26]], i64 2
 ; VEC4_INTERL2-NEXT:    [[TMP43:%.*]] = sext i32 [[TMP29]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP43]]
+; VEC4_INTERL2-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP26]], i64 3
 ; VEC4_INTERL2-NEXT:    [[TMP45:%.*]] = sext i32 [[TMP30]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP45]]
 ; VEC4_INTERL2-NEXT:    store float [[OFFSET_IDX]], ptr [[TMP32]], align 4
@@ -2009,9 +2009,9 @@ define void @fp_iv_used_in_gep_fadd(float %init, ptr noalias nocapture %A, float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[FPINC]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP9:%.*]] = fptoui <2 x float> [[VEC_IND]] to <2 x i32>
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP9]], i64 0
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP10]] to i64
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP12]]
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP11]] to i64
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP14]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[OFFSET_IDX]], ptr [[TMP13]], align 4
@@ -2094,15 +2094,15 @@ define void @fp_iv_used_in_gep_fsub(float %init, ptr noalias nocapture %A, float
 ; VEC4_INTERL1-NEXT:    [[TMP12:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP11]]
 ; VEC4_INTERL1-NEXT:    [[TMP13:%.*]] = fptoui <4 x float> [[VEC_IND]] to <4 x i32>
 ; VEC4_INTERL1-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i64 0
-; VEC4_INTERL1-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1
-; VEC4_INTERL1-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2
-; VEC4_INTERL1-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3
 ; VEC4_INTERL1-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP14]] to i64
 ; VEC4_INTERL1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP18]]
+; VEC4_INTERL1-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i64 1
 ; VEC4_INTERL1-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP15]] to i64
 ; VEC4_INTERL1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP20]]
+; VEC4_INTERL1-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i64 2
 ; VEC4_INTERL1-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP16]] to i64
 ; VEC4_INTERL1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP22]]
+; VEC4_INTERL1-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3
 ; VEC4_INTERL1-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP17]] to i64
 ; VEC4_INTERL1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP24]]
 ; VEC4_INTERL1-NEXT:    store float [[OFFSET_IDX]], ptr [[TMP19]], align 4
@@ -2176,29 +2176,29 @@ define void @fp_iv_used_in_gep_fsub(float %init, ptr noalias nocapture %A, float
 ; VEC4_INTERL2-NEXT:    [[TMP18:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP17]]
 ; VEC4_INTERL2-NEXT:    [[TMP19:%.*]] = fsub fast float [[OFFSET_IDX]], [[FPINC]]
 ; VEC4_INTERL2-NEXT:    [[TMP20:%.*]] = fptoui <4 x float> [[VEC_IND]] to <4 x i32>
-; VEC4_INTERL2-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0
-; VEC4_INTERL2-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i64 1
-; VEC4_INTERL2-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i64 2
-; VEC4_INTERL2-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i64 3
 ; VEC4_INTERL2-NEXT:    [[TMP25:%.*]] = fptoui <4 x float> [[STEP_ADD]] to <4 x i32>
-; VEC4_INTERL2-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP25]], i64 0
-; VEC4_INTERL2-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP25]], i64 1
-; VEC4_INTERL2-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP25]], i64 2
-; VEC4_INTERL2-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP25]], i64 3
+; VEC4_INTERL2-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0
 ; VEC4_INTERL2-NEXT:    [[TMP30:%.*]] = sext i32 [[TMP21]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP30]]
+; VEC4_INTERL2-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i64 1
 ; VEC4_INTERL2-NEXT:    [[TMP32:%.*]] = sext i32 [[TMP22]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP32]]
+; VEC4_INTERL2-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i64 2
 ; VEC4_INTERL2-NEXT:    [[TMP34:%.*]] = sext i32 [[TMP23]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP34]]
+; VEC4_INTERL2-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i64 3
 ; VEC4_INTERL2-NEXT:    [[TMP36:%.*]] = sext i32 [[TMP24]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP36]]
+; VEC4_INTERL2-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP25]], i64 0
 ; VEC4_INTERL2-NEXT:    [[TMP38:%.*]] = sext i32 [[TMP26]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP38]]
+; VEC4_INTERL2-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP25]], i64 1
 ; VEC4_INTERL2-NEXT:    [[TMP40:%.*]] = sext i32 [[TMP27]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP40]]
+; VEC4_INTERL2-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP25]], i64 2
 ; VEC4_INTERL2-NEXT:    [[TMP42:%.*]] = sext i32 [[TMP28]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP42]]
+; VEC4_INTERL2-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP25]], i64 3
 ; VEC4_INTERL2-NEXT:    [[TMP44:%.*]] = sext i32 [[TMP29]] to i64
 ; VEC4_INTERL2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP44]]
 ; VEC4_INTERL2-NEXT:    store float [[OFFSET_IDX]], ptr [[TMP31]], align 4
@@ -2320,9 +2320,9 @@ define void @fp_iv_used_in_gep_fsub(float %init, ptr noalias nocapture %A, float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[FPINC]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP9:%.*]] = fptoui <2 x float> [[VEC_IND]] to <2 x i32>
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP9]], i64 0
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP10]] to i64
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP12]]
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP11]] to i64
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[TMP14]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[OFFSET_IDX]], ptr [[TMP13]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/histograms.ll b/llvm/test/Transforms/LoopVectorize/histograms.ll
index 5850ac3195c39..f4988bb01dfa4 100644
--- a/llvm/test/Transforms/LoopVectorize/histograms.ll
+++ b/llvm/test/Transforms/LoopVectorize/histograms.ll
@@ -16,8 +16,8 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i64 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[TMP5]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/induction-wrapflags.ll b/llvm/test/Transforms/LoopVectorize/induction-wrapflags.ll
index 65571e8b35a3f..eacdd4fb5ea22 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-wrapflags.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-wrapflags.ll
@@ -14,12 +14,12 @@ define void @induction_with_multiple_instructions_in_chain(ptr %p, ptr noalias %
 ; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 3, i32 6, i32 9>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[VEC_IND]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i64 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i64 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i64 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i64 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    store i8 0, ptr [[TMP5]], align 1
 ; CHECK-NEXT:    store i8 0, ptr [[TMP6]], align 1
@@ -27,12 +27,12 @@ define void @induction_with_multiple_instructions_in_chain(ptr %p, ptr noalias %
 ; CHECK-NEXT:    store i8 0, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i32> [[VEC_IND1]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]]
 ; CHECK-NEXT:    store i8 0, ptr [[TMP14]], align 1
 ; CHECK-NEXT:    store i8 0, ptr [[TMP15]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 3b44b99b1ddeb..d73bb389b03d3 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -2423,11 +2423,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i64 0
 ; CHECK-NEXT:    store i16 [[TMP9]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i64 1
 ; CHECK-NEXT:    store i16 [[TMP10]], ptr [[TMP8]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
@@ -2470,13 +2470,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; IND-NEXT:    [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16>
-; IND-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0
-; IND-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1
 ; IND-NEXT:    [[DOTSPLIT:%.*]] = getelementptr inbounds [4 x i8], ptr [[P:%.*]], i64 [[INDEX]]
 ; IND-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2
 ; IND-NEXT:    [[TMP16:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]]
 ; IND-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP16]], i64 6
+; IND-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0
 ; IND-NEXT:    store i16 [[TMP8]], ptr [[TMP6]], align 2
+; IND-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1
 ; IND-NEXT:    store i16 [[TMP9]], ptr [[TMP7]], align 2
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
@@ -2522,11 +2522,7 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; UNROLL-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; UNROLL-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
 ; UNROLL-NEXT:    [[TMP8:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16>
-; UNROLL-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0
-; UNROLL-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1
 ; UNROLL-NEXT:    [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16>
-; UNROLL-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0
-; UNROLL-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1
 ; UNROLL-NEXT:    [[DOTSPLIT:%.*]] = getelementptr inbounds [4 x i8], ptr [[P:%.*]], i64 [[INDEX]]
 ; UNROLL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2
 ; UNROLL-NEXT:    [[TMP24:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]]
@@ -2535,9 +2531,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP25]], i64 10
 ; UNROLL-NEXT:    [[TMP26:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]]
 ; UNROLL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP26]], i64 14
+; UNROLL-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0
 ; UNROLL-NEXT:    store i16 [[TMP14]], ptr [[TMP10]], align 2
+; UNROLL-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1
 ; UNROLL-NEXT:    store i16 [[TMP15]], ptr [[TMP11]], align 2
+; UNROLL-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0
 ; UNROLL-NEXT:    store i16 [[TMP16]], ptr [[TMP12]], align 2
+; UNROLL-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1
 ; UNROLL-NEXT:    store i16 [[TMP17]], ptr [[TMP13]], align 2
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
@@ -2587,18 +2587,18 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16>
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16>
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i64 0
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i64 1
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP15]], ptr [[TMP11]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP16]], ptr [[TMP12]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i64 0
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP17]], ptr [[TMP13]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i64 1
 ; UNROLL-NO-IC-NEXT:    store i16 [[TMP18]], ptr [[TMP14]], align 2
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
@@ -2643,15 +2643,7 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; INTERLEAVE-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
 ; INTERLEAVE-NEXT:    [[TMP12:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16>
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2
-; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3
 ; INTERLEAVE-NEXT:    [[TMP13:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i16>
-; INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0
-; INTERLEAVE-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1
-; INTERLEAVE-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2
-; INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3
 ; INTERLEAVE-NEXT:    [[DOTSPLIT:%.*]] = getelementptr inbounds [4 x i8], ptr [[P:%.*]], i64 [[INDEX]]
 ; INTERLEAVE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2
 ; INTERLEAVE-NEXT:    [[TMP8:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]]
@@ -2668,13 +2660,21 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP40]], i64 26
 ; INTERLEAVE-NEXT:    [[TMP41:%.*]] = getelementptr [4 x i8], ptr [[P]], i64 [[INDEX]]
 ; INTERLEAVE-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP41]], i64 30
+; INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0
 ; INTERLEAVE-NEXT:    store i16 [[TMP22]], ptr [[TMP14]], align 2
+; INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1
 ; INTERLEAVE-NEXT:    store i16 [[TMP23]], ptr [[TMP15]], align 2
+; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2
 ; INTERLEAVE-NEXT:    store i16 [[TMP24]], ptr [[TMP16]], align 2
+; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3
 ; INTERLEAVE-NEXT:    store i16 [[TMP25]], ptr [[TMP17]], align 2
+; INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0
 ; INTERLEAVE-NEXT:    store i16 [[TMP26]], ptr [[TMP18]], align 2
+; INTERLEAVE-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1
 ; INTERLEAVE-NEXT:    store i16 [[TMP27]], ptr [[TMP19]], align 2
+; INTERLEAVE-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2
 ; INTERLEAVE-NEXT:    store i16 [[TMP28]], ptr [[TMP20]], align 2
+; INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3
 ; INTERLEAVE-NEXT:    store i16 [[TMP29]], ptr [[TMP21]], align 2
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8)
diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
index 72294b64ffbee..6a9bdaa6a5380 100644
--- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
@@ -321,10 +321,10 @@ define void @narrow_scatter_with_uniform_addr_to_scalar_unroll(ptr noalias %src,
 ; VF4IC1-NEXT:    [[TMP11:%.*]] = extractelement <4 x i8> [[TMP10]], i64 3
 ; VF4IC1-NEXT:    store i8 [[TMP11]], ptr [[DST2]], align 4
 ; VF4IC1-NEXT:    [[TMP12:%.*]] = extractelement <4 x i8> [[TMP10]], i64 0
-; VF4IC1-NEXT:    [[TMP13:%.*]] = extractelement <4 x i8> [[TMP10]], i64 1
-; VF4IC1-NEXT:    [[TMP14:%.*]] = extractelement <4 x i8> [[TMP10]], i64 2
 ; VF4IC1-NEXT:    store i8 [[TMP12]], ptr [[TMP5]], align 4
+; VF4IC1-NEXT:    [[TMP13:%.*]] = extractelement <4 x i8> [[TMP10]], i64 1
 ; VF4IC1-NEXT:    store i8 [[TMP13]], ptr [[TMP6]], align 4
+; VF4IC1-NEXT:    [[TMP14:%.*]] = extractelement <4 x i8> [[TMP10]], i64 2
 ; VF4IC1-NEXT:    store i8 [[TMP14]], ptr [[TMP7]], align 4
 ; VF4IC1-NEXT:    store i8 [[TMP11]], ptr [[TMP8]], align 4
 ; VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index 2afcff0d21f20..5f0b2d4283a2c 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -263,8 +263,8 @@ define void @pr43371(i16 %val) optsize {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
 ; CHECK-NEXT:    store i16 0, ptr [[TMP5]], align 1
 ; CHECK-NEXT:    store i16 0, ptr [[TMP7]], align 1
@@ -291,8 +291,8 @@ define void @pr43371(i16 %val) optsize {
 ; PGSO-NEXT:    [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; PGSO-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; PGSO-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0
-; PGSO-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; PGSO-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
+; PGSO-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; PGSO-NEXT:    [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
 ; PGSO-NEXT:    store i16 0, ptr [[TMP5]], align 1
 ; PGSO-NEXT:    store i16 0, ptr [[TMP7]], align 1
@@ -319,8 +319,8 @@ define void @pr43371(i16 %val) optsize {
 ; NPGSO-NEXT:    [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; NPGSO-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; NPGSO-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0
-; NPGSO-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; NPGSO-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
+; NPGSO-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; NPGSO-NEXT:    [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
 ; NPGSO-NEXT:    store i16 0, ptr [[TMP5]], align 1
 ; NPGSO-NEXT:    store i16 0, ptr [[TMP7]], align 1
@@ -370,8 +370,8 @@ define void @pr43371_pgso(i16 %val) !prof !14 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
 ; CHECK-NEXT:    store i16 0, ptr [[TMP5]], align 1
 ; CHECK-NEXT:    store i16 0, ptr [[TMP7]], align 1
@@ -398,8 +398,8 @@ define void @pr43371_pgso(i16 %val) !prof !14 {
 ; PGSO-NEXT:    [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; PGSO-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
 ; PGSO-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0
-; PGSO-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; PGSO-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
+; PGSO-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 ; PGSO-NEXT:    [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
 ; PGSO-NEXT:    store i16 0, ptr [[TMP5]], align 1
 ; PGSO-NEXT:    store i16 0, ptr [[TMP7]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll b/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
index 0886d040c2688..a30fc1f8702f7 100644
--- a/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
@@ -3,7 +3,7 @@
 
 %struct.foo = type { i32, i64 }
 
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%0> = bitcast ir<%b> to ptr
+; CHECK: Cost of 0 for VF 2: EMIT ir<%0> = bitcast ir<%b> to ptr
 
 ; The bitcast below will be scalarized due to the predication in the loop. Bitcasts
 ; between pointer types should be treated as free, despite the scalarization.
diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
index b5ca609e48398..81d84a59cdaa5 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
@@ -29,12 +29,12 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]]
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP14]], align 8
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[TMP15]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index d5de7948b18cd..46bb46bab5910 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -918,10 +918,10 @@ define void @test_step_is_not_invariant(ptr %A) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv <2 x i16> [[TMP3]], splat (i16 6)
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    store i16 [[TMP0]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    store i16 [[TMP0]], ptr [[TMP7]], align 2
 ; CHECK-NEXT:    store i16 [[TMP1]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
index 65a055399daaa..c7b169f6bb39a 100644
--- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
+++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
@@ -13,12 +13,12 @@ define void @pr63340(ptr %A, ptr %B) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[OFFSET_IDX]]
-; CHECK-NEXT:    store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[INDEX]] to i8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[TMP1]]
+; CHECK-NEXT:    store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit:
@@ -133,15 +133,15 @@ define void @pr173761(i8 %c, ptr %p, ptr noalias %q, ptr noalias %r) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[C]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[P]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[P]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i8> [[BROADCAST_SPLAT]] to <4 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x ptr> [[DOTSPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[C]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x ptr> [[DOTSPLAT]], <4 x ptr> [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index a1ddda7eda969..9b9c9acf7df04 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1132,10 +1132,12 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) {
   IntegerType *Int64 = IntegerType::get(C, 64);
   auto *Cast = CastInst::CreateZExtOrBitCast(PoisonValue::get(Int32), Int64);
   VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
-  VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, Cast,
-                           VPIRFlags::getDefaultFlags(Instruction::ZExt));
+  VPInstructionWithType *Recipe = VPInstructionWithType::createWide(
+      Instruction::ZExt, Op1, Int64, Cast,
+      VPIRFlags::getDefaultFlags(Instruction::ZExt));
 
-  checkVPRecipeCastImpl<VPWidenCastRecipe, VPUser, VPIRMetadata>(&Recipe);
+  checkVPRecipeCastImpl<VPInstructionWithType, VPUser, VPIRMetadata>(Recipe);
+  delete Recipe;
   delete Cast;
 }