diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4dc2e1c03b7f3..df6f11da4cc0c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -24352,6 +24352,18 @@ SDValue RISCVTargetLowering::LowerFormalArguments( EVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT XLenVT = Subtarget.getXLenVT(); unsigned XLenInBytes = Subtarget.getXLen() / 8; + + // Check if this function has any musttail calls. If so, incoming indirect + // arg pointers must be saved in virtual registers so they survive across + // basic blocks (the SelectionDAG is cleared between BBs). Only do this + // when needed to avoid adding register pressure to non-musttail functions. + bool HasMusttail = llvm::any_of(Func, [](const BasicBlock &BB) { + return llvm::any_of(BB, [](const Instruction &I) { + if (const auto *CI = dyn_cast<CallInst>(&I)) + return CI->isMustTailCall(); + return false; + }); + }); // Used with vargs to accumulate store chains. std::vector<SDValue> OutChains; @@ -24383,6 +24395,13 @@ SDValue RISCVTargetLowering::LowerFormalArguments( InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo())); unsigned ArgIndex = Ins[InsIdx].OrigArgIndex; + if (HasMusttail) { + RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); + Register VReg = + MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass); + Chain = DAG.getCopyToReg(Chain, DL, VReg, ArgValue); + RVFI->setIncomingIndirectArg(ArgIndex, VReg); + } unsigned ArgPartOffset = Ins[InsIdx].PartOffset; assert(VA.getValVT().isVector() || ArgPartOffset == 0); while (i + 1 != e && Ins[InsIdx + 1].OrigArgIndex == ArgIndex) { @@ -24491,18 +24510,36 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( if (Caller.hasFnAttribute("interrupt")) return false; + bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible + // but less efficient and uglier in LowerCall. For musttail, there is no + // workaround today: a byval arg requires a local copy that becomes invalid + // after the tail call deallocates the caller's frame, so rejecting here + // (and triggering reportFatalInternalError in LowerCall) is safer than + // miscompiling. + for (auto &Arg : Outs) + if (Arg.Flags.isByVal()) + return false; + + // musttail bypasses the remaining checks: the checks either reject cases + // we handle specially (indirect args are forwarded via incoming pointers, + // stack-passed args reuse the matching incoming layout, sret is forwarded + // like any other pointer arg) or are optimizations not applicable to + // mandatory tail calls. + if (IsMustTail) + return true; + // Do not tail call opt if the stack is used to pass parameters. if (CCInfo.getStackSize() != 0) return false; // Do not tail call opt if any parameters need to be passed indirectly. // Since long doubles (fp128) and i128 are larger than 2*XLEN, they are - passed indirectly. So the address of the value will be passed in a - register, or if not available, then the address is put on the stack. In - order to pass indirectly, space on the stack often needs to be allocated - in order to store the value. In this case the CCInfo.getNextStackOffset() - != 0 check is not enough and we need to check if any CCValAssign ArgsLocs - are passed CCValAssign::Indirect. + passed indirectly. 
The caller allocates stack space for the value and + passes a pointer. On a tail call the caller's frame is deallocated + before the callee executes, leaving the pointer dangling. for (auto &VA : ArgLocs) if (VA.getLocInfo() == CCValAssign::Indirect) return false; @@ -24523,13 +24560,6 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( return false; } - // Byval parameters hand the function a pointer directly into the stack area - // we want to reuse during a tail call. Working around this *is* possible - // but less efficient and uglier in LowerCall. - for (auto &Arg : Outs) - if (Arg.Flags.isByVal()) - return false; - return true; } @@ -24665,51 +24695,158 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Promote the value if needed. // For now, only handle fully promoted and indirect arguments. if (VA.getLocInfo() == CCValAssign::Indirect) { - // Store the argument in a stack slot and pass its address. - Align StackAlign = - std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG), - getPrefTypeAlign(ArgValue.getValueType(), DAG)); - TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); - // If the original argument was split (e.g. i128), we need - // to store the required parts of it here (and pass just one address). - // Vectors may be partly split to registers and partly to the stack, in - // which case the base address is partly offset and subsequent stores are - // relative to that. - unsigned ArgIndex = Outs[OutIdx].OrigArgIndex; - unsigned ArgPartOffset = Outs[OutIdx].PartOffset; - assert(VA.getValVT().isVector() || ArgPartOffset == 0); - // Calculate the total size to store. We don't have access to what we're - // actually storing other than performing the loop and collecting the - // info. - SmallVector<std::pair<SDValue, SDValue>> Parts; - while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) { - SDValue PartValue = OutVals[OutIdx + 1]; - unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; - SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); - EVT PartVT = PartValue.getValueType(); - if (PartVT.isScalableVector()) - Offset = DAG.getNode(ISD::VSCALE, DL, XLenVT, Offset); - StoredSize += PartVT.getStoreSize(); - StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); - Parts.push_back(std::make_pair(PartValue, Offset)); - ++i; - ++OutIdx; - } - SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); - int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - MemOpChains.push_back( - DAG.getStore(Chain, DL, ArgValue, SpillSlot, - MachinePointerInfo::getFixedStack(MF, FI))); - for (const auto &Part : Parts) { - SDValue PartValue = Part.first; - SDValue PartOffset = Part.second; - SDValue Address = - DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset); + // For musttail calls, reuse incoming indirect pointers instead of + // creating new stack temporaries. The incoming pointers point to the + // caller's caller's frame, which remains valid after a tail call. + if (IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) { + RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); + unsigned CallArgIdx = Outs[OutIdx].OrigArgIndex; + + // Resolve which formal parameter is being passed at this call + // position. + // + // FIXME: Ins[].OrigArgIndex is Argument::getArgNo() (unfiltered), + // but Outs[].OrigArgIndex is an index into a filtered arg list + // (empty types removed, via CallLoweringInfo in the target- + // independent layer). 
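+ // As an illustrative example (not from the patch): for a caller + // 'define i32 @f(i32 %x, {} %e, fp128 %a)', the fp128 %a has + // Outs[].OrigArgIndex == 1 (the empty {} is filtered out) but + // Argument::getArgNo() == 2. + //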
IncomingIndirectArgs is keyed by the + // caller's unfiltered Argument::getArgNo(), so we have to walk + // the caller's formals (same filter) to translate the index. + // This target-independent asymmetry should be normalized so + // backends do not need to re-derive the mapping. + // + // Steps: + // 1. Find the call operand at filtered position CallArgIdx. + // 2. If it is an Argument, use getArgNo() directly (same filter + // for caller formals and call operands). + // 3. Otherwise (computed value), walk the caller's formals and + // skip empty types to map the filtered index to getArgNo(). + const Argument *FormalArg = nullptr; + unsigned FilteredIdx = 0; + for (const auto &CallArg : CLI.CB->args()) { + if (CallArg->getType()->isEmptyTy()) + continue; + if (FilteredIdx == CallArgIdx) { + FormalArg = dyn_cast<Argument>(CallArg); + break; + } + ++FilteredIdx; + } + + // For forwarded args, getArgNo() gives the unfiltered index directly. + // For computed args, walk the caller's formals to resolve it. + unsigned FormalArgIdx = CallArgIdx; + if (FormalArg) { + FormalArgIdx = FormalArg->getArgNo(); + } else { + FilteredIdx = 0; + for (const auto &Arg : MF.getFunction().args()) { + if (Arg.getType()->isEmptyTy()) + continue; + if (FilteredIdx == CallArgIdx) { + FormalArgIdx = Arg.getArgNo(); + break; + } + ++FilteredIdx; + } + } + + Register VReg = RVFI->getIncomingIndirectArg(FormalArgIdx); + SDValue CopyOp = DAG.getCopyFromReg(Chain, DL, VReg, PtrVT); + // Thread the CopyFromReg output chain through MemOpChains so the + // TokenFactor below sequences the copy with any stores we emit + // for this argument. + MemOpChains.push_back(CopyOp.getValue(1)); + SDValue IncomingPtr = CopyOp; + + if (!FormalArg) { + // Computed value: store into the incoming indirect pointer for the + // same-position formal parameter (musttail guarantees matching + // prototypes, so types match). The pointer survives the tail call + // since it points to the caller's caller's frame. + // + // The data-flow edge through IncomingPtr already prevents the + // store from being scheduled before the CopyFromReg. Threading + // CopyOp.getValue(1) (the copy's output chain) into the store + // makes that ordering explicit on the chain edge as well, which + // is the convention for memory ops chaining off their producers. + MemOpChains.push_back( + DAG.getStore(CopyOp.getValue(1), DL, ArgValue, IncomingPtr, + MachinePointerInfo::getUnknownStack(MF))); + // Store any split parts at their respective offsets. Scalable + // vectors need their part offsets multiplied by VSCALE, matching + // the non-musttail spill path below. + unsigned ArgPartOffset = Outs[OutIdx].PartOffset; + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) { + SDValue PartValue = OutVals[OutIdx + 1]; + unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; + SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); + EVT PartVT = PartValue.getValueType(); + if (PartVT.isScalableVector()) + Offset = DAG.getNode(ISD::VSCALE, DL, XLenVT, Offset); + SDValue Addr = + DAG.getNode(ISD::ADD, DL, PtrVT, IncomingPtr, Offset); + MemOpChains.push_back( + DAG.getStore(CopyOp.getValue(1), DL, PartValue, Addr, + MachinePointerInfo::getUnknownStack(MF))); + ++i; + ++OutIdx; + } + } + ArgValue = IncomingPtr; + + // Skip any remaining split parts (for forwarded args, they are + // covered by the forwarded pointer). 
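+ // (For instance, an i128 on RV32 is split into four i32 parts that + // all share one OrigArgIndex; the loop below simply advances past + // the extra parts, since the callee reads the value through the + // forwarded pointer.)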
+ while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) { + ++i; + ++OutIdx; + } + } else { + // Store the argument in a stack slot and pass its address. + Align StackAlign = + std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG), + getPrefTypeAlign(ArgValue.getValueType(), DAG)); + TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); + // If the original argument was split (e.g. i128), we need + // to store the required parts of it here (and pass just one address). + // Vectors may be partly split to registers and partly to the stack, in + // which case the base address is partly offset and subsequent stores + // are relative to that. + unsigned ArgIndex = Outs[OutIdx].OrigArgIndex; + unsigned ArgPartOffset = Outs[OutIdx].PartOffset; + assert(VA.getValVT().isVector() || ArgPartOffset == 0); + // Calculate the total size to store. We don't have access to what + // we're actually storing other than performing the loop and collecting + // the info. + SmallVector<std::pair<SDValue, SDValue>> Parts; + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) { + SDValue PartValue = OutVals[OutIdx + 1]; + unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; + SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); + EVT PartVT = PartValue.getValueType(); + if (PartVT.isScalableVector()) + Offset = DAG.getNode(ISD::VSCALE, DL, XLenVT, Offset); + StoredSize += PartVT.getStoreSize(); + StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); + Parts.push_back(std::make_pair(PartValue, Offset)); + ++i; + ++OutIdx; + } + SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); + int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); MemOpChains.push_back( - DAG.getStore(Chain, DL, PartValue, Address, + DAG.getStore(Chain, DL, ArgValue, SpillSlot, MachinePointerInfo::getFixedStack(MF, FI))); + for (const auto &Part : Parts) { + SDValue PartValue = Part.first; + SDValue PartOffset = Part.second; + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset); + MemOpChains.push_back( + DAG.getStore(Chain, DL, PartValue, Address, + MachinePointerInfo::getFixedStack(MF, FI))); + } + ArgValue = SpillSlot; } - ArgValue = SpillSlot; } else { ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL, Subtarget); } @@ -24727,8 +24864,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i); } else { assert(VA.isMemLoc() && "Argument not register or memory"); - assert(!IsTailCall && "Tail call not allowed if stack is used " "for passing parameters"); + assert((!IsTailCall || (CLI.CB && CLI.CB->isMustTailCall())) && + "Tail call not allowed if stack is used for passing parameters"); // Work out the address of the stack slot. 
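+ // For a musttail call these slots are expected to line up with the + // caller's own incoming argument area; the matching prototypes that + // musttail requires imply the callee expects the same stack layout.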
if (!StackPtr.getNode()) diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index 854f2714d9599..06ef7661385c2 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H #include "RISCVSubtarget.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -65,6 +66,14 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { uint64_t RVVPadding = 0; /// Size of stack frame to save callee saved registers unsigned CalleeSavedStackSize = 0; + + /// Incoming indirect argument pointers saved as virtual registers, keyed by + /// formal parameter index. Used for musttail forwarding of indirect args. + /// Virtual registers (not SDValues) are used because the SelectionDAG is + /// cleared between basic blocks, and musttail calls may be in non-entry + /// blocks. + DenseMap<unsigned, Register> IncomingIndirectArgs; + /// Is there any vector argument or return? bool IsVectorCall = false; @@ -145,6 +154,15 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + void setIncomingIndirectArg(unsigned ArgIndex, Register Reg) { + IncomingIndirectArgs[ArgIndex] = Reg; + } + Register getIncomingIndirectArg(unsigned ArgIndex) const { + auto It = IncomingIndirectArgs.find(ArgIndex); + assert(It != IncomingIndirectArgs.end() && "No incoming indirect arg"); + return It->second; + } + enum class PushPopKind { None = 0, StdExtZcmp, VendorXqccmp }; PushPopKind getPushPopKind(const MachineFunction &MF) const; diff --git a/llvm/test/CodeGen/RISCV/musttail-call.ll b/llvm/test/CodeGen/RISCV/musttail-call.ll index f6ec5307b8bad..ee58a1943c655 100644 --- a/llvm/test/CodeGen/RISCV/musttail-call.ll +++ b/llvm/test/CodeGen/RISCV/musttail-call.ll @@ -1,19 +1,16 @@ -; Check that we error out if tail is not possible but call is marked as mustail. +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Check that musttail with sret generates a tail call (not an error). 
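+; The sret pointer is an ordinary pointer argument in a0; with matching +; prototypes the musttail callee simply inherits it, so no copy is needed.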
-; RUN: not --crash llc -mtriple riscv32-unknown-linux-gnu -o - %s \ -; RUN: 2>&1 | FileCheck %s -; RUN: not --crash llc -mtriple riscv32-unknown-elf -o - %s \ -; RUN: 2>&1 | FileCheck %s -; RUN: not --crash llc -mtriple riscv64-unknown-linux-gnu -o - %s \ -; RUN: 2>&1 | FileCheck %s -; RUN: not --crash llc -mtriple riscv64-unknown-elf -o - %s \ -; RUN: 2>&1 | FileCheck %s +; RUN: llc -mtriple riscv32-unknown-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple riscv64-unknown-linux-gnu -o - %s | FileCheck %s %struct.A = type { i32 } declare void @callee_musttail(ptr sret(%struct.A) %a) define void @caller_musttail(ptr sret(%struct.A) %a) { -; CHECK: LLVM ERROR: failed to perform tail call elimination on a call site marked musttail +; CHECK-LABEL: caller_musttail: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: tail callee_musttail entry: musttail call void @callee_musttail(ptr sret(%struct.A) %a) ret void diff --git a/llvm/test/CodeGen/RISCV/musttail-indirect-args.ll b/llvm/test/CodeGen/RISCV/musttail-indirect-args.ll new file mode 100644 index 0000000000000..50289eca58e28 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/musttail-indirect-args.ll @@ -0,0 +1,914 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 %s -o - | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s --check-prefix=RV64 + +; Test that musttail with indirect args (fp128 on RV32) forwards the incoming +; pointer instead of creating a new stack temporary. Without this fix, the +; pointer would dangle after the tail call deallocates the caller's frame. + +declare i32 @callee_musttail_indirect(fp128 %a) + +; fp128 is indirect on RV32 (too large for registers), direct on RV64. +; On RV32, musttail must forward the incoming indirect pointer (a0) directly. +define i32 @caller_musttail_indirect(fp128 %a) nounwind { +; RV32-LABEL: caller_musttail_indirect: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_indirect +; +; RV64-LABEL: caller_musttail_indirect: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_indirect + %call = musttail call i32 @callee_musttail_indirect(fp128 %a) + ret i32 %call +} + +; Verify that non-musttail tail call with indirect args does NOT tail call +; (this is the PR #184972 fix - indirect args are unsafe for regular tail calls). +define void @caller_no_musttail_indirect() nounwind { +; RV32-LABEL: caller_no_musttail_indirect: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a1, 262128 +; RV32-NEXT: mv a0, sp +; RV32-NEXT: sw zero, 0(sp) +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: sw zero, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: call callee_musttail_indirect +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_no_musttail_indirect: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 16383 +; RV64-NEXT: slli a1, a1, 36 +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail callee_musttail_indirect + %call = tail call i32 @callee_musttail_indirect(fp128 0xL00000000000000003FFF000000000000) + ret void +} + +; Verify that non-musttail tail call forwarding an indirect arg from the +; caller's own parameters also does NOT tail call (the arg lives on the +; caller's frame, which would be deallocated). 
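+; (On RV64, fp128 fits in a register pair and is not indirect, which is +; why the RV64 check lines below still show a tail call.)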
+define i32 @caller_no_musttail_forward_indirect(fp128 %a) nounwind { +; RV32-LABEL: caller_no_musttail_forward_indirect: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a2, 4(a0) +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a4, 12(a0) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a2, 4(sp) +; RV32-NEXT: sw a3, 8(sp) +; RV32-NEXT: sw a4, 12(sp) +; RV32-NEXT: call callee_musttail_indirect +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_no_musttail_forward_indirect: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_indirect + %call = tail call i32 @callee_musttail_indirect(fp128 %a) + ret i32 %call +} + +; Test musttail with two indirect fp128 args on RV32. Both pointers must be +; forwarded. Exercises the DenseMap with two distinct OrigArgIndex values. +declare i32 @callee_musttail_two_indirect(fp128 %a, fp128 %b) + +define i32 @caller_musttail_two_indirect(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_two_indirect: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_two_indirect: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_two_indirect + %call = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %b) + ret i32 %call +} + +; Test musttail with mixed direct (i32 in register) + indirect (fp128) args. +; Confirms OrigArgIndex lookup works when not all args are indirect. +declare i32 @callee_musttail_mixed(i32 %x, fp128 %a) + +define i32 @caller_musttail_mixed(i32 %x, fp128 %a) nounwind { +; RV32-LABEL: caller_musttail_mixed: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_mixed +; +; RV64-LABEL: caller_musttail_mixed: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_mixed + %call = musttail call i32 @callee_musttail_mixed(i32 %x, fp128 %a) + ret i32 %call +} + +; Test musttail with i128 on RV32 (indirect, split into 4 x i32 parts). +declare i64 @callee_musttail_i128(i128 %a) + +define i64 @caller_musttail_i128(i128 %a) nounwind { +; RV32-LABEL: caller_musttail_i128: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_i128 +; +; RV64-LABEL: caller_musttail_i128: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_i128 + %call = musttail call i64 @callee_musttail_i128(i128 %a) + ret i64 %call +} + +; Test musttail with i128 (indirect+split on RV32) plus a trailing i32 direct arg. +; Exercises the split-skip logic followed by a normal register arg. +declare i64 @callee_musttail_i128_and_i32(i128 %a, i32 %x) + +define i64 @caller_musttail_i128_and_i32(i128 %a, i32 %x) nounwind { +; RV32-LABEL: caller_musttail_i128_and_i32: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_i128_and_i32 +; +; RV64-LABEL: caller_musttail_i128_and_i32: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_i128_and_i32 + %call = musttail call i64 @callee_musttail_i128_and_i32(i128 %a, i32 %x) + ret i64 %call +} + +; Test musttail with two indirect args SWAPPED. The pointers must be exchanged +; before the tail call. This exercises the OrigArgIndex -> Argument::getArgNo() +; resolution in LowerCall. 
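+; On RV32 the swap below is a plain pointer exchange through a scratch +; register: mv a2, a0; mv a0, a1; mv a1, a2.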
+define i32 @caller_musttail_two_indirect_swapped(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_two_indirect_swapped: +; RV32: # %bb.0: +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_two_indirect_swapped: +; RV64: # %bb.0: +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: mv a5, a0 +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a3 +; RV64-NEXT: mv a2, a5 +; RV64-NEXT: mv a3, a4 +; RV64-NEXT: tail callee_musttail_two_indirect + %call = musttail call i32 @callee_musttail_two_indirect(fp128 %b, fp128 %a) + ret i32 %call +} + +; Test musttail with three indirect args rotated: call @f(%c, %a, %b). +; All three pointers need to be shuffled. +declare i32 @callee_musttail_three_indirect(fp128 %a, fp128 %b, fp128 %c) + +define i32 @caller_musttail_three_indirect_rotated(fp128 %a, fp128 %b, fp128 %c) nounwind { +; RV32-LABEL: caller_musttail_three_indirect_rotated: +; RV32: # %bb.0: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: tail callee_musttail_three_indirect +; +; RV64-LABEL: caller_musttail_three_indirect_rotated: +; RV64: # %bb.0: +; RV64-NEXT: mv a6, a3 +; RV64-NEXT: mv a7, a2 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a0, a4 +; RV64-NEXT: mv a1, a5 +; RV64-NEXT: mv a4, a7 +; RV64-NEXT: mv a5, a6 +; RV64-NEXT: tail callee_musttail_three_indirect + %call = musttail call i32 @callee_musttail_three_indirect(fp128 %c, fp128 %a, fp128 %b) + ret i32 %call +} + +; Test musttail with mixed direct + indirect args where the indirect args +; are swapped but the direct arg stays in place. +declare i32 @callee_musttail_mixed_two_indirect(i32 %x, fp128 %a, fp128 %b) + +define i32 @caller_musttail_mixed_swap_indirect(i32 %x, fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_mixed_swap_indirect: +; RV32: # %bb.0: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: tail callee_musttail_mixed_two_indirect +; +; RV64-LABEL: caller_musttail_mixed_swap_indirect: +; RV64: # %bb.0: +; RV64-NEXT: mv a5, a2 +; RV64-NEXT: mv a6, a1 +; RV64-NEXT: mv a1, a3 +; RV64-NEXT: mv a2, a4 +; RV64-NEXT: mv a3, a6 +; RV64-NEXT: mv a4, a5 +; RV64-NEXT: tail callee_musttail_mixed_two_indirect + %call = musttail call i32 @callee_musttail_mixed_two_indirect(i32 %x, fp128 %b, fp128 %a) + ret i32 %call +} + +; Test musttail with swapped i128 on RV32 (split indirect args). +declare i64 @callee_musttail_two_i128(i128 %a, i128 %b) + +define i64 @caller_musttail_two_i128_swapped(i128 %a, i128 %b) nounwind { +; RV32-LABEL: caller_musttail_two_i128_swapped: +; RV32: # %bb.0: +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: tail callee_musttail_two_i128 +; +; RV64-LABEL: caller_musttail_two_i128_swapped: +; RV64: # %bb.0: +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: mv a5, a0 +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a3 +; RV64-NEXT: mv a2, a5 +; RV64-NEXT: mv a3, a4 +; RV64-NEXT: tail callee_musttail_two_i128 + %call = musttail call i64 @callee_musttail_two_i128(i128 %b, i128 %a) + ret i64 %call +} + +; Test musttail passing the same indirect arg to both positions. 
+define i32 @caller_musttail_two_indirect_dup(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_two_indirect_dup: +; RV32: # %bb.0: +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_two_indirect_dup: +; RV64: # %bb.0: +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: tail callee_musttail_two_indirect + %call = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %a) + ret i32 %call +} + +; Test musttail with enough indirect args to spill to the stack (9 fp128 on +; RV32 uses a0-a7 for the first 8 pointers, 9th goes on the stack). +declare void @callee_musttail_nine_indirect(fp128, fp128, fp128, fp128, fp128, fp128, fp128, fp128, fp128) + +define void @caller_musttail_nine_indirect(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i) nounwind { +; RV32-LABEL: caller_musttail_nine_indirect: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: tail callee_musttail_nine_indirect +; +; RV64-LABEL: caller_musttail_nine_indirect: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: ld t0, 32(sp) +; RV64-NEXT: ld t1, 40(sp) +; RV64-NEXT: ld t2, 48(sp) +; RV64-NEXT: ld t3, 56(sp) +; RV64-NEXT: ld t4, 64(sp) +; RV64-NEXT: ld t5, 72(sp) +; RV64-NEXT: ld t6, 96(sp) +; RV64-NEXT: ld s0, 104(sp) +; RV64-NEXT: ld s1, 80(sp) +; RV64-NEXT: ld s2, 88(sp) +; RV64-NEXT: sd t6, 64(sp) +; RV64-NEXT: sd s0, 72(sp) +; RV64-NEXT: sd t4, 32(sp) +; RV64-NEXT: sd t5, 40(sp) +; RV64-NEXT: sd s1, 48(sp) +; RV64-NEXT: sd s2, 56(sp) +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd t1, 8(sp) +; RV64-NEXT: sd t2, 16(sp) +; RV64-NEXT: sd t3, 24(sp) +; RV64-NEXT: ld s0, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_nine_indirect + musttail call void @callee_musttail_nine_indirect(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i) + ret void +} + +; Test musttail swapping the first (register) and last (stack-spilled) args. 
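+; On RV32 the ninth pointer sits in the caller's incoming stack slot: the +; check lines below reload it into t0, store a0 over that slot, and move +; t0 into a0.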
+define void @caller_musttail_nine_indirect_swap_first_last(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i) nounwind { +; RV32-LABEL: caller_musttail_nine_indirect_swap_first_last: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: mv a0, t0 +; RV32-NEXT: tail callee_musttail_nine_indirect +; +; RV64-LABEL: caller_musttail_nine_indirect_swap_first_last: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: ld t0, 96(sp) +; RV64-NEXT: ld t1, 104(sp) +; RV64-NEXT: ld t2, 32(sp) +; RV64-NEXT: ld t3, 40(sp) +; RV64-NEXT: ld t4, 48(sp) +; RV64-NEXT: ld t5, 56(sp) +; RV64-NEXT: ld t6, 64(sp) +; RV64-NEXT: ld s0, 72(sp) +; RV64-NEXT: ld s1, 80(sp) +; RV64-NEXT: ld s2, 88(sp) +; RV64-NEXT: sd a0, 64(sp) +; RV64-NEXT: sd a1, 72(sp) +; RV64-NEXT: sd t6, 32(sp) +; RV64-NEXT: sd s0, 40(sp) +; RV64-NEXT: sd s1, 48(sp) +; RV64-NEXT: sd s2, 56(sp) +; RV64-NEXT: sd t2, 0(sp) +; RV64-NEXT: sd t3, 8(sp) +; RV64-NEXT: sd t4, 16(sp) +; RV64-NEXT: sd t5, 24(sp) +; RV64-NEXT: mv a0, t0 +; RV64-NEXT: mv a1, t1 +; RV64-NEXT: ld s0, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_nine_indirect + musttail call void @callee_musttail_nine_indirect(fp128 %i, fp128 %b, fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %a) + ret void +} + +; Test musttail where the indirect arg is a computed value, not a forwarded +; formal parameter. The computed value must be stored into the incoming +; indirect pointer before tail calling. +define i32 @caller_musttail_computed(fp128 %a) nounwind { +; RV32-LABEL: caller_musttail_computed: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a3, 4(s0) +; RV32-NEXT: lw a4, 8(s0) +; RV32-NEXT: lw a5, 12(s0) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a0, 24(sp) +; RV32-NEXT: sw a3, 12(sp) +; RV32-NEXT: sw a4, 16(sp) +; RV32-NEXT: sw a5, 20(sp) +; RV32-NEXT: addi a0, sp, 40 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: sw a3, 28(sp) +; RV32-NEXT: sw a4, 32(sp) +; RV32-NEXT: sw a5, 36(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: lw a0, 40(sp) +; RV32-NEXT: lw a1, 44(sp) +; RV32-NEXT: lw a2, 48(sp) +; RV32-NEXT: lw a3, 52(sp) +; RV32-NEXT: sw a0, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: sw a2, 8(s0) +; RV32-NEXT: sw a3, 12(s0) +; RV32-NEXT: mv a0, s0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: tail callee_musttail_indirect +; +; RV64-LABEL: caller_musttail_computed: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: tail callee_musttail_indirect + %sum = fadd fp128 %a, %a + %r = musttail call i32 @callee_musttail_indirect(fp128 %sum) + ret i32 %r +} + +; Test musttail with a computed i128 on RV32 (split indirect). The add result +; must be stored back into the incoming pointer. 
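+; In effect the caller increments the value in place through the incoming +; pointer (a0 on RV32) and then forwards that pointer unchanged.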
+define i64 @caller_musttail_computed_i128(i128 %a) nounwind { +; RV32-LABEL: caller_musttail_computed_i128: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a2, 4(a0) +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a4, 12(a0) +; RV32-NEXT: addi a1, a1, 1 +; RV32-NEXT: seqz a5, a1 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: or a5, a1, a2 +; RV32-NEXT: seqz a5, a5 +; RV32-NEXT: add a5, a3, a5 +; RV32-NEXT: sltu a3, a5, a3 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: sw a5, 8(a0) +; RV32-NEXT: sw a3, 12(a0) +; RV32-NEXT: tail callee_musttail_i128 +; +; RV64-LABEL: caller_musttail_computed_i128: +; RV64: # %bb.0: +; RV64-NEXT: addi a0, a0, 1 +; RV64-NEXT: seqz a2, a0 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: tail callee_musttail_i128 + %sum = add i128 %a, 1 + %r = musttail call i64 @callee_musttail_i128(i128 %sum) + ret i64 %r +} + +; Test musttail with one computed and one forwarded indirect arg. +; Position 0 gets the fadd result (stored into %a's incoming pointer), +; position 1 gets %b's incoming pointer forwarded directly. +define i32 @caller_musttail_computed_and_forwarded(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_computed_and_forwarded: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a5, 8(a1) +; RV32-NEXT: lw a6, 12(a1) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a1, 4(s1) +; RV32-NEXT: lw a2, 8(s1) +; RV32-NEXT: lw a7, 12(s1) +; RV32-NEXT: sw a0, 16(sp) +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a2, 24(sp) +; RV32-NEXT: sw a7, 28(sp) +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: sw a3, 0(sp) +; RV32-NEXT: sw a4, 4(sp) +; RV32-NEXT: sw a5, 8(sp) +; RV32-NEXT: sw a6, 12(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: lw a0, 32(sp) +; RV32-NEXT: lw a1, 36(sp) +; RV32-NEXT: lw a2, 40(sp) +; RV32-NEXT: lw a3, 44(sp) +; RV32-NEXT: sw a0, 0(s1) +; RV32-NEXT: sw a1, 4(s1) +; RV32-NEXT: sw a2, 8(s1) +; RV32-NEXT: sw a3, 12(s1) +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_computed_and_forwarded: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a3 +; RV64-NEXT: mv s1, a2 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: mv a2, s1 +; RV64-NEXT: mv a3, s0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_two_indirect + %sum = fadd fp128 %a, %b + %r = musttail call i32 @callee_musttail_two_indirect(fp128 %sum, fp128 %b) + ret i32 %r +} + +; Test musttail with one forwarded and one computed indirect arg (reversed). +; Position 0 forwards %a, position 1 gets the computed value. 
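+; In the RV32 check lines below, %a's pointer (saved in s1) passes through +; untouched while the sum is stored back through %b's pointer (saved in s0).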
+define i32 @caller_musttail_forwarded_and_computed(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_forwarded_and_computed: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a5, 8(a1) +; RV32-NEXT: lw a6, 12(a1) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a1, 4(s1) +; RV32-NEXT: lw a2, 8(s1) +; RV32-NEXT: lw a7, 12(s1) +; RV32-NEXT: sw a0, 16(sp) +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a2, 24(sp) +; RV32-NEXT: sw a7, 28(sp) +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: sw a3, 0(sp) +; RV32-NEXT: sw a4, 4(sp) +; RV32-NEXT: sw a5, 8(sp) +; RV32-NEXT: sw a6, 12(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: lw a0, 32(sp) +; RV32-NEXT: lw a1, 36(sp) +; RV32-NEXT: lw a2, 40(sp) +; RV32-NEXT: lw a3, 44(sp) +; RV32-NEXT: sw a0, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: sw a2, 8(s0) +; RV32-NEXT: sw a3, 12(s0) +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_forwarded_and_computed: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a1 +; RV64-NEXT: mv s1, a0 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: mv a0, s1 +; RV64-NEXT: mv a1, s0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_two_indirect + %sum = fadd fp128 %a, %b + %r = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %sum) + ret i32 %r +} + +; Test musttail with both args computed. Neither can be zero-copy forwarded. 
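+; Both incoming pointers are reused as in-place storage: the sum is stored +; through %a's pointer and the difference through %b's before both pointers +; are forwarded.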
+define i32 @caller_musttail_both_computed(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_both_computed: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -160 +; RV32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: lw s5, 0(a1) +; RV32-NEXT: lw s2, 4(a1) +; RV32-NEXT: lw s3, 8(a1) +; RV32-NEXT: lw s4, 12(a1) +; RV32-NEXT: lw s6, 0(a0) +; RV32-NEXT: lw s7, 4(a0) +; RV32-NEXT: lw s8, 8(a0) +; RV32-NEXT: lw s9, 12(a0) +; RV32-NEXT: sw s6, 72(sp) +; RV32-NEXT: sw s7, 76(sp) +; RV32-NEXT: sw s8, 80(sp) +; RV32-NEXT: sw s9, 84(sp) +; RV32-NEXT: addi a0, sp, 88 +; RV32-NEXT: addi a1, sp, 72 +; RV32-NEXT: addi a2, sp, 56 +; RV32-NEXT: sw s5, 56(sp) +; RV32-NEXT: sw s2, 60(sp) +; RV32-NEXT: sw s3, 64(sp) +; RV32-NEXT: sw s4, 68(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: lw s10, 88(sp) +; RV32-NEXT: lw s11, 92(sp) +; RV32-NEXT: lw a0, 96(sp) +; RV32-NEXT: sw a0, 0(sp) # 4-byte Folded Spill +; RV32-NEXT: lw a0, 100(sp) +; RV32-NEXT: sw a0, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 24(sp) +; RV32-NEXT: sw s7, 28(sp) +; RV32-NEXT: sw s8, 32(sp) +; RV32-NEXT: sw s9, 36(sp) +; RV32-NEXT: addi a0, sp, 40 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: sw s5, 8(sp) +; RV32-NEXT: sw s2, 12(sp) +; RV32-NEXT: sw s3, 16(sp) +; RV32-NEXT: sw s4, 20(sp) +; RV32-NEXT: call __subtf3 +; RV32-NEXT: lw a0, 40(sp) +; RV32-NEXT: lw a1, 44(sp) +; RV32-NEXT: lw a2, 48(sp) +; RV32-NEXT: lw a3, 52(sp) +; RV32-NEXT: sw s10, 0(s1) +; RV32-NEXT: sw s11, 4(s1) +; RV32-NEXT: lw a4, 0(sp) # 4-byte Folded Reload +; RV32-NEXT: sw a4, 8(s1) +; RV32-NEXT: lw a4, 4(sp) # 4-byte Folded Reload +; RV32-NEXT: sw a4, 12(s1) +; RV32-NEXT: sw a0, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: sw a2, 8(s0) +; RV32-NEXT: sw a3, 12(s0) +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 160 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_both_computed: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -64 +; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 
24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a3 +; RV64-NEXT: mv s1, a2 +; RV64-NEXT: mv s2, a1 +; RV64-NEXT: mv s3, a0 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: mv s4, a0 +; RV64-NEXT: mv s5, a1 +; RV64-NEXT: mv a0, s3 +; RV64-NEXT: mv a1, s2 +; RV64-NEXT: mv a2, s1 +; RV64-NEXT: mv a3, s0 +; RV64-NEXT: call __subtf3 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: mv a0, s4 +; RV64-NEXT: mv a1, s5 +; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: tail callee_musttail_two_indirect + %sum = fadd fp128 %a, %b + %diff = fsub fp128 %a, %b + %r = musttail call i32 @callee_musttail_two_indirect(fp128 %sum, fp128 %diff) + ret i32 %r +} + +; Test musttail in a non-entry basic block. The indirect pointer must survive +; across basic blocks (the SelectionDAG is cleared between BBs, so the pointer +; must be preserved in a virtual register, not as a raw SDValue). +declare i32 @callee_musttail_cross_bb(fp128 %a, i1 %c) + +define i32 @caller_musttail_cross_bb(fp128 %a, i1 %cond) nounwind { +; RV32-LABEL: caller_musttail_cross_bb: +; RV32: # %bb.0: # %entry +; RV32-NEXT: andi a2, a1, 1 +; RV32-NEXT: beqz a2, .LBB19_2 +; RV32-NEXT: # %bb.1: # %then +; RV32-NEXT: tail callee_musttail_cross_bb +; RV32-NEXT: .LBB19_2: # %else +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_musttail_cross_bb: +; RV64: # %bb.0: # %entry +; RV64-NEXT: andi a3, a2, 1 +; RV64-NEXT: beqz a3, .LBB19_2 +; RV64-NEXT: # %bb.1: # %then +; RV64-NEXT: tail callee_musttail_cross_bb +; RV64-NEXT: .LBB19_2: # %else +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +entry: + br i1 %cond, label %then, label %else +then: + %r = musttail call i32 @callee_musttail_cross_bb(fp128 %a, i1 %cond) + ret i32 %r +else: + ret i32 0 +} + +; Test musttail with control flow and a computed indirect arg in a non-entry BB. 
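+; The pointer captured in LowerFormalArguments reaches %then via a virtual +; register (s0 in the RV32 check lines below), where the computed sum is +; stored through it before the tail call.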
+declare i32 @callee_musttail_cross_bb_computed(fp128 %a, i1 %c) + +define i32 @caller_musttail_cross_bb_computed(fp128 %a, i1 %cond) nounwind { +; RV32-LABEL: caller_musttail_cross_bb_computed: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a3, 4(s0) +; RV32-NEXT: lw a4, 8(s0) +; RV32-NEXT: lw a5, 12(s0) +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: andi s2, a1, 1 +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sw a0, 16(sp) +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sw a5, 12(sp) +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a4, 24(sp) +; RV32-NEXT: sw a5, 28(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: beqz s2, .LBB20_2 +; RV32-NEXT: # %bb.1: # %then +; RV32-NEXT: lw a0, 32(sp) +; RV32-NEXT: lw a1, 36(sp) +; RV32-NEXT: lw a2, 40(sp) +; RV32-NEXT: lw a3, 44(sp) +; RV32-NEXT: sw a0, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: sw a2, 8(s0) +; RV32-NEXT: sw a3, 12(s0) +; RV32-NEXT: mv a0, s0 +; RV32-NEXT: mv a1, s1 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: tail callee_musttail_cross_bb_computed +; RV32-NEXT: .LBB20_2: # %else +; RV32-NEXT: li a0, 0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_musttail_cross_bb_computed: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a2 +; RV64-NEXT: andi s1, a2, 1 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: beqz s1, .LBB20_2 +; RV64-NEXT: # %bb.1: # %then +; RV64-NEXT: mv a2, s0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_cross_bb_computed +; RV64-NEXT: .LBB20_2: # %else +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ret +entry: + %sum = fadd fp128 %a, %a + br i1 %cond, label %then, label %else +then: + %r = musttail call i32 @callee_musttail_cross_bb_computed(fp128 %sum, i1 %cond) + ret i32 %r +else: + ret i32 0 +} + +; Non-indirect args that spill to the stack (exercises the +; isEligibleForTailCallOptimization stack-size bypass for musttail). Both +; RV32 and RV64 use a0..a7 for the first 8 args and spill from the 9th. The +; spilled args live in the caller's incoming stack slots, which musttail can +; re-use because matching prototypes imply a matching layout. 
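+; The check lines below make the reuse visible: the caller reloads the +; spilled args from its own incoming slots and stores them back to the +; matching outgoing offsets.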
+declare void @callee_musttail_stack_spill(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define void @caller_musttail_stack_spill(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) nounwind { +; RV32-LABEL: caller_musttail_stack_spill: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: lw t1, 4(sp) +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: sw t1, 4(sp) +; RV32-NEXT: tail callee_musttail_stack_spill +; +; RV64-LABEL: caller_musttail_stack_spill: +; RV64: # %bb.0: +; RV64-NEXT: ld t0, 0(sp) +; RV64-NEXT: ld t1, 8(sp) +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd t1, 8(sp) +; RV64-NEXT: tail callee_musttail_stack_spill + musttail call void @callee_musttail_stack_spill(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) + ret void +} + +; sret + musttail: the sret pointer is just a regular pointer arg in a0. +; Tail call forwards it unchanged. +%struct.Large = type { i64, i64, i64, i64 } +declare void @callee_musttail_sret(ptr sret(%struct.Large), i32) + +define void @caller_musttail_sret(ptr sret(%struct.Large) %out, i32 %x) nounwind { +; RV32-LABEL: caller_musttail_sret: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_sret +; +; RV64-LABEL: caller_musttail_sret: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_sret + musttail call void @callee_musttail_sret(ptr sret(%struct.Large) %out, i32 %x) + ret void +} + +; Mix of indirect (fp128) and many i32 args spilled to the stack. +declare void @callee_musttail_indirect_and_spill(fp128, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define void @caller_musttail_indirect_and_spill(fp128 %a, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8) nounwind { +; RV32-LABEL: caller_musttail_indirect_and_spill: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: lw t1, 4(sp) +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: sw t1, 4(sp) +; RV32-NEXT: tail callee_musttail_indirect_and_spill +; +; RV64-LABEL: caller_musttail_indirect_and_spill: +; RV64: # %bb.0: +; RV64-NEXT: ld t0, 0(sp) +; RV64-NEXT: ld t1, 8(sp) +; RV64-NEXT: ld t2, 16(sp) +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd t1, 8(sp) +; RV64-NEXT: sd t2, 16(sp) +; RV64-NEXT: tail callee_musttail_indirect_and_spill + musttail call void @callee_musttail_indirect_and_spill(fp128 %a, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8) + ret void +} + +; Note: byval + musttail is intentionally NOT tested here. +; isEligibleForTailCallOptimization rejects byval outright, which causes the +; musttail site to hit reportFatalInternalError. Tail-call support for byval +; was reverted in 501417baa60f (RISC-V/LoongArch) pending a vreg-based +; re-implementation; once that lands, musttail + byval can be tested as +; well. diff --git a/llvm/test/CodeGen/RISCV/rvv/musttail-indirect-args.ll b/llvm/test/CodeGen/RISCV/rvv/musttail-indirect-args.ll new file mode 100644 index 0000000000000..58463ec1b3274 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/musttail-indirect-args.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v %s -o - | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v %s -o - | FileCheck %s --check-prefixes=CHECK,RV64 + +; Companion to musttail-indirect-args.ll for scalable vector args. 
+; <vscale x 32 x i32> is too large to fit in v8-v23, so it is split into +; multiple parts and the callee receives a pointer to the parts via +; CCValAssign::Indirect. The musttail computed-arg path must scale per-part +; offsets by VSCALE -- mirrors the non-musttail spill path validated by +; calling-conv.ll's caller_scalable_vector_split_indirect. + +declare <vscale x 32 x i32> @callee_musttail_scalable(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) + +; Forwarded: both args are caller formals passed straight through. The +; incoming indirect pointer is forwarded zero-copy; no per-part offset +; computation happens in the caller, so VSCALE does not enter here. +define <vscale x 32 x i32> @caller_musttail_scalable_forwarded(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) nounwind { +; CHECK-LABEL: caller_musttail_scalable_forwarded: +; CHECK: # %bb.0: +; CHECK-NEXT: tail callee_musttail_scalable + %r = musttail call <vscale x 32 x i32> @callee_musttail_scalable(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) + ret <vscale x 32 x i32> %r +} + +; Computed: both args are arithmetic on the formals, so dyn_cast<Argument> +; fails and we go through the store-into-incoming-pointer path. The +; second part's store must use VSCALE * PartOffset relative to +; IncomingPtr, otherwise the part lands at a fixed byte offset that +; differs from what the callee expects (vlenb * 8 in the check lines +; below). +define <vscale x 32 x i32> @caller_musttail_scalable_computed(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) nounwind { +; CHECK-LABEL: caller_musttail_scalable_computed: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vl8re32.v v0, (a1) +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: tail callee_musttail_scalable + %s = add <vscale x 32 x i32> %x, %y + %r = musttail call <vscale x 32 x i32> @callee_musttail_scalable(<vscale x 32 x i32> %s, <vscale x 32 x i32> %s) + ret <vscale x 32 x i32> %r +} + +; Mixed: the first arg is forwarded (it stays in vector registers, so the +; indirect path is not involved), while the second arg is computed and +; stored through the incoming pointer. +define <vscale x 32 x i32> @caller_musttail_scalable_mixed(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) nounwind { +; CHECK-LABEL: caller_musttail_scalable_mixed: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vl8re32.v v0, (a1) +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vv v24, v8, v24 +; CHECK-NEXT: vs8r.v v24, (a0) +; CHECK-NEXT: vadd.vv v24, v16, v0 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: tail callee_musttail_scalable + %s = add <vscale x 32 x i32> %x, %y + %r = musttail call <vscale x 32 x i32> @callee_musttail_scalable(<vscale x 32 x i32> %x, <vscale x 32 x i32> %s) + ret <vscale x 32 x i32> %r +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}}