diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4dc2e1c03b7f3..df6f11da4cc0c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -24352,6 +24352,18 @@ SDValue RISCVTargetLowering::LowerFormalArguments( EVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT XLenVT = Subtarget.getXLenVT(); unsigned XLenInBytes = Subtarget.getXLen() / 8; + + // Check if this function has any musttail calls. If so, incoming indirect + // arg pointers must be saved in virtual registers so they survive across + // basic blocks (the SelectionDAG is cleared between BBs). Only do this + // when needed to avoid adding register pressure to non-musttail functions. + bool HasMusttail = llvm::any_of(Func, [](const BasicBlock &BB) { + return llvm::any_of(BB, [](const Instruction &I) { + if (const auto *CI = dyn_cast<CallInst>(&I)) + return CI->isMustTailCall(); + return false; + }); + }); // Used with vargs to accumulate store chains. std::vector<SDValue> OutChains; @@ -24383,6 +24395,13 @@ SDValue RISCVTargetLowering::LowerFormalArguments( InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo())); unsigned ArgIndex = Ins[InsIdx].OrigArgIndex; + if (HasMusttail) { + RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); + Register VReg = + MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass); + Chain = DAG.getCopyToReg(Chain, DL, VReg, ArgValue); + RVFI->setIncomingIndirectArg(ArgIndex, VReg); + } unsigned ArgPartOffset = Ins[InsIdx].PartOffset; assert(VA.getValVT().isVector() || ArgPartOffset == 0); while (i + 1 != e && Ins[InsIdx + 1].OrigArgIndex == ArgIndex) { @@ -24491,18 +24510,36 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( if (Caller.hasFnAttribute("interrupt")) return false; + bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible + // but less efficient and uglier in LowerCall. For musttail, there is no + // workaround today: a byval arg requires a local copy that becomes invalid + // after the tail call deallocates the caller's frame, so rejecting here + // (and triggering reportFatalInternalError in LowerCall) is safer than + // miscompiling. + for (auto &Arg : Outs) + if (Arg.Flags.isByVal()) + return false; + + // musttail bypasses the remaining checks: the checks either reject cases + // we handle specially (indirect args are forwarded via incoming pointers, + // stack-passed args reuse the matching incoming layout, sret is forwarded + // like any other pointer arg) or are optimizations not applicable to + // mandatory tail calls. + if (IsMustTail) + return true; + // Do not tail call opt if the stack is used to pass parameters. if (CCInfo.getStackSize() != 0) return false; // Do not tail call opt if any parameters need to be passed indirectly. // Since long doubles (fp128) and i128 are larger than 2*XLEN, they are - passed indirectly. So the address of the value will be passed in a - register, or if not available, then the address is put on the stack. In - order to pass indirectly, space on the stack often needs to be allocated - in order to store the value. In this case the CCInfo.getNextStackOffset() - != 0 check is not enough and we need to check if any CCValAssign ArgsLocs - are passed CCValAssign::Indirect. + passed indirectly. 
The caller allocates stack space for the value and + passes a pointer. On a tail call the caller's frame is deallocated + before the callee executes, leaving the pointer dangling. for (auto &VA : ArgLocs) if (VA.getLocInfo() == CCValAssign::Indirect) return false; @@ -24523,13 +24560,6 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( return false; } - // Byval parameters hand the function a pointer directly into the stack area - // we want to reuse during a tail call. Working around this *is* possible - // but less efficient and uglier in LowerCall. - for (auto &Arg : Outs) - if (Arg.Flags.isByVal()) - return false; - return true; } @@ -24665,51 +24695,158 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Promote the value if needed. // For now, only handle fully promoted and indirect arguments. if (VA.getLocInfo() == CCValAssign::Indirect) { - // Store the argument in a stack slot and pass its address. - Align StackAlign = - std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG), - getPrefTypeAlign(ArgValue.getValueType(), DAG)); - TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); - // If the original argument was split (e.g. i128), we need - // to store the required parts of it here (and pass just one address). - // Vectors may be partly split to registers and partly to the stack, in - // which case the base address is partly offset and subsequent stores are - // relative to that. - unsigned ArgIndex = Outs[OutIdx].OrigArgIndex; - unsigned ArgPartOffset = Outs[OutIdx].PartOffset; - assert(VA.getValVT().isVector() || ArgPartOffset == 0); - // Calculate the total size to store. We don't have access to what we're - // actually storing other than performing the loop and collecting the - // info. - SmallVector<std::pair<SDValue, SDValue>> Parts; - while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) { - SDValue PartValue = OutVals[OutIdx + 1]; - unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; - SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); - EVT PartVT = PartValue.getValueType(); - if (PartVT.isScalableVector()) - Offset = DAG.getNode(ISD::VSCALE, DL, XLenVT, Offset); - StoredSize += PartVT.getStoreSize(); - StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); - Parts.push_back(std::make_pair(PartValue, Offset)); - ++i; - ++OutIdx; - } - SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); - int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - MemOpChains.push_back( - DAG.getStore(Chain, DL, ArgValue, SpillSlot, - MachinePointerInfo::getFixedStack(MF, FI))); - for (const auto &Part : Parts) { - SDValue PartValue = Part.first; - SDValue PartOffset = Part.second; - SDValue Address = - DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset); + // For musttail calls, reuse incoming indirect pointers instead of + // creating new stack temporaries. The incoming pointers point to the + // caller's caller's frame, which remains valid after a tail call. + if (IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) { + RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>(); + unsigned CallArgIdx = Outs[OutIdx].OrigArgIndex; + + // Resolve which formal parameter is being passed at this call + // position. + // + // FIXME: Ins[].OrigArgIndex is Argument::getArgNo() (unfiltered), + // but Outs[].OrigArgIndex is an index into a filtered arg list + // (empty types removed, via CallLoweringInfo in the target- + // independent layer). 
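+ // As an illustrative example (not from the patch): for a caller + // 'define i32 @f(i32 %x, {} %e, fp128 %a)', the fp128 %a has + // Outs[].OrigArgIndex == 1 (the empty {} is filtered out) but + // Argument::getArgNo() == 2. + //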
IncomingIndirectArgs is keyed by the + // caller's unfiltered Argument::getArgNo(), so we have to walk + // the caller's formals (same filter) to translate the index. + // This target-independent asymmetry should be normalized so + // backends do not need to re-derive the mapping. + // + // Steps: + // 1. Find the call operand at filtered position CallArgIdx. + // 2. If it is an Argument, use getArgNo() directly (same filter + // for caller formals and call operands). + // 3. Otherwise (computed value), walk the caller's formals and + // skip empty types to map the filtered index to getArgNo(). + const Argument *FormalArg = nullptr; + unsigned FilteredIdx = 0; + for (const auto &CallArg : CLI.CB->args()) { + if (CallArg->getType()->isEmptyTy()) + continue; + if (FilteredIdx == CallArgIdx) { + FormalArg = dyn_cast<Argument>(CallArg); + break; + } + ++FilteredIdx; + } + + // For forwarded args, getArgNo() gives the unfiltered index directly. + // For computed args, walk the caller's formals to resolve it. + unsigned FormalArgIdx = CallArgIdx; + if (FormalArg) { + FormalArgIdx = FormalArg->getArgNo(); + } else { + FilteredIdx = 0; + for (const auto &Arg : MF.getFunction().args()) { + if (Arg.getType()->isEmptyTy()) + continue; + if (FilteredIdx == CallArgIdx) { + FormalArgIdx = Arg.getArgNo(); + break; + } + ++FilteredIdx; + } + } + + Register VReg = RVFI->getIncomingIndirectArg(FormalArgIdx); + SDValue CopyOp = DAG.getCopyFromReg(Chain, DL, VReg, PtrVT); + // Thread the CopyFromReg output chain through MemOpChains so the + // TokenFactor below sequences the copy with any stores we emit + // for this argument. + MemOpChains.push_back(CopyOp.getValue(1)); + SDValue IncomingPtr = CopyOp; + + if (!FormalArg) { + // Computed value: store into the incoming indirect pointer for the + // same-position formal parameter (musttail guarantees matching + // prototypes, so types match). The pointer survives the tail call + // since it points to the caller's caller's frame. + // + // The data-flow edge through IncomingPtr already prevents the + // store from being scheduled before the CopyFromReg. Threading + // CopyOp.getValue(1) (the copy's output chain) into the store + // makes that ordering explicit on the chain edge as well, which + // is the convention for memory ops chaining off their producers. + MemOpChains.push_back( + DAG.getStore(CopyOp.getValue(1), DL, ArgValue, IncomingPtr, + MachinePointerInfo::getUnknownStack(MF))); + // Store any split parts at their respective offsets. Scalable + // vectors need their part offsets multiplied by VSCALE, matching + // the non-musttail spill path below. + unsigned ArgPartOffset = Outs[OutIdx].PartOffset; + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) { + SDValue PartValue = OutVals[OutIdx + 1]; + unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; + SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); + EVT PartVT = PartValue.getValueType(); + if (PartVT.isScalableVector()) + Offset = DAG.getNode(ISD::VSCALE, DL, XLenVT, Offset); + SDValue Addr = + DAG.getNode(ISD::ADD, DL, PtrVT, IncomingPtr, Offset); + MemOpChains.push_back( + DAG.getStore(CopyOp.getValue(1), DL, PartValue, Addr, + MachinePointerInfo::getUnknownStack(MF))); + ++i; + ++OutIdx; + } + } + ArgValue = IncomingPtr; + + // Skip any remaining split parts (for forwarded args, they are + // covered by the forwarded pointer). 
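+ // (For instance, an i128 on RV32 is split into four i32 parts that + // all share one OrigArgIndex; the loop below simply advances past + // the extra parts, since the callee reads the value through the + // forwarded pointer.)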
+ while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) { + ++i; + ++OutIdx; + } + } else { + // Store the argument in a stack slot and pass its address. + Align StackAlign = + std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG), + getPrefTypeAlign(ArgValue.getValueType(), DAG)); + TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); + // If the original argument was split (e.g. i128), we need + // to store the required parts of it here (and pass just one address). + // Vectors may be partly split to registers and partly to the stack, in + // which case the base address is partly offset and subsequent stores + // are relative to that. + unsigned ArgIndex = Outs[OutIdx].OrigArgIndex; + unsigned ArgPartOffset = Outs[OutIdx].PartOffset; + assert(VA.getValVT().isVector() || ArgPartOffset == 0); + // Calculate the total size to store. We don't have access to what + // we're actually storing other than performing the loop and collecting + // the info. + SmallVector<std::pair<SDValue, SDValue>> Parts; + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) { + SDValue PartValue = OutVals[OutIdx + 1]; + unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; + SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); + EVT PartVT = PartValue.getValueType(); + if (PartVT.isScalableVector()) + Offset = DAG.getNode(ISD::VSCALE, DL, XLenVT, Offset); + StoredSize += PartVT.getStoreSize(); + StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); + Parts.push_back(std::make_pair(PartValue, Offset)); + ++i; + ++OutIdx; + } + SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); + int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); MemOpChains.push_back( - DAG.getStore(Chain, DL, PartValue, Address, + DAG.getStore(Chain, DL, ArgValue, SpillSlot, MachinePointerInfo::getFixedStack(MF, FI))); + for (const auto &Part : Parts) { + SDValue PartValue = Part.first; + SDValue PartOffset = Part.second; + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset); + MemOpChains.push_back( + DAG.getStore(Chain, DL, PartValue, Address, + MachinePointerInfo::getFixedStack(MF, FI))); + } + ArgValue = SpillSlot; } - ArgValue = SpillSlot; } else { ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL, Subtarget); } @@ -24727,8 +24864,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i); } else { assert(VA.isMemLoc() && "Argument not register or memory"); - assert(!IsTailCall && "Tail call not allowed if stack is used " "for passing parameters"); + assert((!IsTailCall || (CLI.CB && CLI.CB->isMustTailCall())) && + "Tail call not allowed if stack is used for passing parameters"); // Work out the address of the stack slot. 
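+ // For a musttail call these slots are expected to line up with the + // caller's own incoming argument area; the matching prototypes that + // musttail requires imply the callee expects the same stack layout.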
if (!StackPtr.getNode()) diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index 854f2714d9599..06ef7661385c2 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H #include "RISCVSubtarget.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -65,6 +66,14 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { uint64_t RVVPadding = 0; /// Size of stack frame to save callee saved registers unsigned CalleeSavedStackSize = 0; + + /// Incoming indirect argument pointers saved as virtual registers, keyed by + /// formal parameter index. Used for musttail forwarding of indirect args. + /// Virtual registers (not SDValues) are used because the SelectionDAG is + /// cleared between basic blocks, and musttail calls may be in non-entry + /// blocks. + DenseMap<unsigned, Register> IncomingIndirectArgs; + /// Is there any vector argument or return? bool IsVectorCall = false; @@ -145,6 +154,15 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + void setIncomingIndirectArg(unsigned ArgIndex, Register Reg) { + IncomingIndirectArgs[ArgIndex] = Reg; + } + Register getIncomingIndirectArg(unsigned ArgIndex) const { + auto It = IncomingIndirectArgs.find(ArgIndex); + assert(It != IncomingIndirectArgs.end() && "No incoming indirect arg"); + return It->second; + } + enum class PushPopKind { None = 0, StdExtZcmp, VendorXqccmp }; PushPopKind getPushPopKind(const MachineFunction &MF) const; diff --git a/llvm/test/CodeGen/RISCV/musttail-call.ll b/llvm/test/CodeGen/RISCV/musttail-call.ll index f6ec5307b8bad..ee58a1943c655 100644 --- a/llvm/test/CodeGen/RISCV/musttail-call.ll +++ b/llvm/test/CodeGen/RISCV/musttail-call.ll @@ -1,19 +1,16 @@ -; Check that we error out if tail is not possible but call is marked as mustail. +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Check that musttail with sret generates a tail call (not an error). 
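+; The sret pointer is an ordinary pointer argument in a0; with matching +; prototypes the musttail callee simply inherits it, so no copy is needed.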
-; RUN: not --crash llc -mtriple riscv32-unknown-linux-gnu -o - %s \ -; RUN: 2>&1 | FileCheck %s -; RUN: not --crash llc -mtriple riscv32-unknown-elf -o - %s \ -; RUN: 2>&1 | FileCheck %s -; RUN: not --crash llc -mtriple riscv64-unknown-linux-gnu -o - %s \ -; RUN: 2>&1 | FileCheck %s -; RUN: not --crash llc -mtriple riscv64-unknown-elf -o - %s \ -; RUN: 2>&1 | FileCheck %s +; RUN: llc -mtriple riscv32-unknown-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple riscv64-unknown-linux-gnu -o - %s | FileCheck %s %struct.A = type { i32 } declare void @callee_musttail(ptr sret(%struct.A) %a) define void @caller_musttail(ptr sret(%struct.A) %a) { -; CHECK: LLVM ERROR: failed to perform tail call elimination on a call site marked musttail +; CHECK-LABEL: caller_musttail: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: tail callee_musttail entry: musttail call void @callee_musttail(ptr sret(%struct.A) %a) ret void diff --git a/llvm/test/CodeGen/RISCV/musttail-indirect-args.ll b/llvm/test/CodeGen/RISCV/musttail-indirect-args.ll new file mode 100644 index 0000000000000..50289eca58e28 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/musttail-indirect-args.ll @@ -0,0 +1,914 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 %s -o - | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s --check-prefix=RV64 + +; Test that musttail with indirect args (fp128 on RV32) forwards the incoming +; pointer instead of creating a new stack temporary. Without this fix, the +; pointer would dangle after the tail call deallocates the caller's frame. + +declare i32 @callee_musttail_indirect(fp128 %a) + +; fp128 is indirect on RV32 (too large for registers), direct on RV64. +; On RV32, musttail must forward the incoming indirect pointer (a0) directly. +define i32 @caller_musttail_indirect(fp128 %a) nounwind { +; RV32-LABEL: caller_musttail_indirect: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_indirect +; +; RV64-LABEL: caller_musttail_indirect: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_indirect + %call = musttail call i32 @callee_musttail_indirect(fp128 %a) + ret i32 %call +} + +; Verify that non-musttail tail call with indirect args does NOT tail call +; (this is the PR #184972 fix - indirect args are unsafe for regular tail calls). +define void @caller_no_musttail_indirect() nounwind { +; RV32-LABEL: caller_no_musttail_indirect: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a1, 262128 +; RV32-NEXT: mv a0, sp +; RV32-NEXT: sw zero, 0(sp) +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: sw zero, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: call callee_musttail_indirect +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_no_musttail_indirect: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 16383 +; RV64-NEXT: slli a1, a1, 36 +; RV64-NEXT: li a0, 0 +; RV64-NEXT: tail callee_musttail_indirect + %call = tail call i32 @callee_musttail_indirect(fp128 0xL00000000000000003FFF000000000000) + ret void +} + +; Verify that non-musttail tail call forwarding an indirect arg from the +; caller's own parameters also does NOT tail call (the arg lives on the +; caller's frame, which would be deallocated). 
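+; (On RV64, fp128 fits in a register pair and is not indirect, which is +; why the RV64 check lines below still show a tail call.)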
+define i32 @caller_no_musttail_forward_indirect(fp128 %a) nounwind { +; RV32-LABEL: caller_no_musttail_forward_indirect: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a2, 4(a0) +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a4, 12(a0) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a2, 4(sp) +; RV32-NEXT: sw a3, 8(sp) +; RV32-NEXT: sw a4, 12(sp) +; RV32-NEXT: call callee_musttail_indirect +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_no_musttail_forward_indirect: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_indirect + %call = tail call i32 @callee_musttail_indirect(fp128 %a) + ret i32 %call +} + +; Test musttail with two indirect fp128 args on RV32. Both pointers must be +; forwarded. Exercises the DenseMap with two distinct OrigArgIndex values. +declare i32 @callee_musttail_two_indirect(fp128 %a, fp128 %b) + +define i32 @caller_musttail_two_indirect(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_two_indirect: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_two_indirect: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_two_indirect + %call = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %b) + ret i32 %call +} + +; Test musttail with mixed direct (i32 in register) + indirect (fp128) args. +; Confirms OrigArgIndex lookup works when not all args are indirect. +declare i32 @callee_musttail_mixed(i32 %x, fp128 %a) + +define i32 @caller_musttail_mixed(i32 %x, fp128 %a) nounwind { +; RV32-LABEL: caller_musttail_mixed: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_mixed +; +; RV64-LABEL: caller_musttail_mixed: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_mixed + %call = musttail call i32 @callee_musttail_mixed(i32 %x, fp128 %a) + ret i32 %call +} + +; Test musttail with i128 on RV32 (indirect, split into 4 x i32 parts). +declare i64 @callee_musttail_i128(i128 %a) + +define i64 @caller_musttail_i128(i128 %a) nounwind { +; RV32-LABEL: caller_musttail_i128: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_i128 +; +; RV64-LABEL: caller_musttail_i128: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_i128 + %call = musttail call i64 @callee_musttail_i128(i128 %a) + ret i64 %call +} + +; Test musttail with i128 (indirect+split on RV32) plus a trailing i32 direct arg. +; Exercises the split-skip logic followed by a normal register arg. +declare i64 @callee_musttail_i128_and_i32(i128 %a, i32 %x) + +define i64 @caller_musttail_i128_and_i32(i128 %a, i32 %x) nounwind { +; RV32-LABEL: caller_musttail_i128_and_i32: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_i128_and_i32 +; +; RV64-LABEL: caller_musttail_i128_and_i32: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_i128_and_i32 + %call = musttail call i64 @callee_musttail_i128_and_i32(i128 %a, i32 %x) + ret i64 %call +} + +; Test musttail with two indirect args SWAPPED. The pointers must be exchanged +; before the tail call. This exercises the OrigArgIndex -> Argument::getArgNo() +; resolution in LowerCall. 
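+; On RV32 the swap below is a plain pointer exchange through a scratch +; register: mv a2, a0; mv a0, a1; mv a1, a2.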
+define i32 @caller_musttail_two_indirect_swapped(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_two_indirect_swapped: +; RV32: # %bb.0: +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_two_indirect_swapped: +; RV64: # %bb.0: +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: mv a5, a0 +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a3 +; RV64-NEXT: mv a2, a5 +; RV64-NEXT: mv a3, a4 +; RV64-NEXT: tail callee_musttail_two_indirect + %call = musttail call i32 @callee_musttail_two_indirect(fp128 %b, fp128 %a) + ret i32 %call +} + +; Test musttail with three indirect args rotated: call @f(%c, %a, %b). +; All three pointers need to be shuffled. +declare i32 @callee_musttail_three_indirect(fp128 %a, fp128 %b, fp128 %c) + +define i32 @caller_musttail_three_indirect_rotated(fp128 %a, fp128 %b, fp128 %c) nounwind { +; RV32-LABEL: caller_musttail_three_indirect_rotated: +; RV32: # %bb.0: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: tail callee_musttail_three_indirect +; +; RV64-LABEL: caller_musttail_three_indirect_rotated: +; RV64: # %bb.0: +; RV64-NEXT: mv a6, a3 +; RV64-NEXT: mv a7, a2 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a0, a4 +; RV64-NEXT: mv a1, a5 +; RV64-NEXT: mv a4, a7 +; RV64-NEXT: mv a5, a6 +; RV64-NEXT: tail callee_musttail_three_indirect + %call = musttail call i32 @callee_musttail_three_indirect(fp128 %c, fp128 %a, fp128 %b) + ret i32 %call +} + +; Test musttail with mixed direct + indirect args where the indirect args +; are swapped but the direct arg stays in place. +declare i32 @callee_musttail_mixed_two_indirect(i32 %x, fp128 %a, fp128 %b) + +define i32 @caller_musttail_mixed_swap_indirect(i32 %x, fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_mixed_swap_indirect: +; RV32: # %bb.0: +; RV32-NEXT: mv a3, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: tail callee_musttail_mixed_two_indirect +; +; RV64-LABEL: caller_musttail_mixed_swap_indirect: +; RV64: # %bb.0: +; RV64-NEXT: mv a5, a2 +; RV64-NEXT: mv a6, a1 +; RV64-NEXT: mv a1, a3 +; RV64-NEXT: mv a2, a4 +; RV64-NEXT: mv a3, a6 +; RV64-NEXT: mv a4, a5 +; RV64-NEXT: tail callee_musttail_mixed_two_indirect + %call = musttail call i32 @callee_musttail_mixed_two_indirect(i32 %x, fp128 %b, fp128 %a) + ret i32 %call +} + +; Test musttail with swapped i128 on RV32 (split indirect args). +declare i64 @callee_musttail_two_i128(i128 %a, i128 %b) + +define i64 @caller_musttail_two_i128_swapped(i128 %a, i128 %b) nounwind { +; RV32-LABEL: caller_musttail_two_i128_swapped: +; RV32: # %bb.0: +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: tail callee_musttail_two_i128 +; +; RV64-LABEL: caller_musttail_two_i128_swapped: +; RV64: # %bb.0: +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: mv a5, a0 +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a3 +; RV64-NEXT: mv a2, a5 +; RV64-NEXT: mv a3, a4 +; RV64-NEXT: tail callee_musttail_two_i128 + %call = musttail call i64 @callee_musttail_two_i128(i128 %b, i128 %a) + ret i64 %call +} + +; Test musttail passing the same indirect arg to both positions. 
+define i32 @caller_musttail_two_indirect_dup(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_two_indirect_dup: +; RV32: # %bb.0: +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_two_indirect_dup: +; RV64: # %bb.0: +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: tail callee_musttail_two_indirect + %call = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %a) + ret i32 %call +} + +; Test musttail with enough indirect args to spill to the stack (9 fp128 on +; RV32 uses a0-a7 for the first 8 pointers, 9th goes on the stack). +declare void @callee_musttail_nine_indirect(fp128, fp128, fp128, fp128, fp128, fp128, fp128, fp128, fp128) + +define void @caller_musttail_nine_indirect(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i) nounwind { +; RV32-LABEL: caller_musttail_nine_indirect: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: tail callee_musttail_nine_indirect +; +; RV64-LABEL: caller_musttail_nine_indirect: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: ld t0, 32(sp) +; RV64-NEXT: ld t1, 40(sp) +; RV64-NEXT: ld t2, 48(sp) +; RV64-NEXT: ld t3, 56(sp) +; RV64-NEXT: ld t4, 64(sp) +; RV64-NEXT: ld t5, 72(sp) +; RV64-NEXT: ld t6, 96(sp) +; RV64-NEXT: ld s0, 104(sp) +; RV64-NEXT: ld s1, 80(sp) +; RV64-NEXT: ld s2, 88(sp) +; RV64-NEXT: sd t6, 64(sp) +; RV64-NEXT: sd s0, 72(sp) +; RV64-NEXT: sd t4, 32(sp) +; RV64-NEXT: sd t5, 40(sp) +; RV64-NEXT: sd s1, 48(sp) +; RV64-NEXT: sd s2, 56(sp) +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd t1, 8(sp) +; RV64-NEXT: sd t2, 16(sp) +; RV64-NEXT: sd t3, 24(sp) +; RV64-NEXT: ld s0, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_nine_indirect + musttail call void @callee_musttail_nine_indirect(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i) + ret void +} + +; Test musttail swapping the first (register) and last (stack-spilled) args. 
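+; On RV32 the ninth pointer sits in the caller's incoming stack slot: the +; check lines below reload it into t0, store a0 over that slot, and move +; t0 into a0.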
+define void @caller_musttail_nine_indirect_swap_first_last(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i) nounwind { +; RV32-LABEL: caller_musttail_nine_indirect_swap_first_last: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: mv a0, t0 +; RV32-NEXT: tail callee_musttail_nine_indirect +; +; RV64-LABEL: caller_musttail_nine_indirect_swap_first_last: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: ld t0, 96(sp) +; RV64-NEXT: ld t1, 104(sp) +; RV64-NEXT: ld t2, 32(sp) +; RV64-NEXT: ld t3, 40(sp) +; RV64-NEXT: ld t4, 48(sp) +; RV64-NEXT: ld t5, 56(sp) +; RV64-NEXT: ld t6, 64(sp) +; RV64-NEXT: ld s0, 72(sp) +; RV64-NEXT: ld s1, 80(sp) +; RV64-NEXT: ld s2, 88(sp) +; RV64-NEXT: sd a0, 64(sp) +; RV64-NEXT: sd a1, 72(sp) +; RV64-NEXT: sd t6, 32(sp) +; RV64-NEXT: sd s0, 40(sp) +; RV64-NEXT: sd s1, 48(sp) +; RV64-NEXT: sd s2, 56(sp) +; RV64-NEXT: sd t2, 0(sp) +; RV64-NEXT: sd t3, 8(sp) +; RV64-NEXT: sd t4, 16(sp) +; RV64-NEXT: sd t5, 24(sp) +; RV64-NEXT: mv a0, t0 +; RV64-NEXT: mv a1, t1 +; RV64-NEXT: ld s0, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_nine_indirect + musttail call void @callee_musttail_nine_indirect(fp128 %i, fp128 %b, fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %a) + ret void +} + +; Test musttail where the indirect arg is a computed value, not a forwarded +; formal parameter. The computed value must be stored into the incoming +; indirect pointer before tail calling. +define i32 @caller_musttail_computed(fp128 %a) nounwind { +; RV32-LABEL: caller_musttail_computed: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a3, 4(s0) +; RV32-NEXT: lw a4, 8(s0) +; RV32-NEXT: lw a5, 12(s0) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a0, 24(sp) +; RV32-NEXT: sw a3, 12(sp) +; RV32-NEXT: sw a4, 16(sp) +; RV32-NEXT: sw a5, 20(sp) +; RV32-NEXT: addi a0, sp, 40 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: sw a3, 28(sp) +; RV32-NEXT: sw a4, 32(sp) +; RV32-NEXT: sw a5, 36(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: lw a0, 40(sp) +; RV32-NEXT: lw a1, 44(sp) +; RV32-NEXT: lw a2, 48(sp) +; RV32-NEXT: lw a3, 52(sp) +; RV32-NEXT: sw a0, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: sw a2, 8(s0) +; RV32-NEXT: sw a3, 12(s0) +; RV32-NEXT: mv a0, s0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: tail callee_musttail_indirect +; +; RV64-LABEL: caller_musttail_computed: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: tail callee_musttail_indirect + %sum = fadd fp128 %a, %a + %r = musttail call i32 @callee_musttail_indirect(fp128 %sum) + ret i32 %r +} + +; Test musttail with a computed i128 on RV32 (split indirect). The add result +; must be stored back into the incoming pointer. 
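+; In effect the caller increments the value in place through the incoming +; pointer (a0 on RV32) and then forwards that pointer unchanged.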
+define i64 @caller_musttail_computed_i128(i128 %a) nounwind { +; RV32-LABEL: caller_musttail_computed_i128: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a2, 4(a0) +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a4, 12(a0) +; RV32-NEXT: addi a1, a1, 1 +; RV32-NEXT: seqz a5, a1 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: or a5, a1, a2 +; RV32-NEXT: seqz a5, a5 +; RV32-NEXT: add a5, a3, a5 +; RV32-NEXT: sltu a3, a5, a3 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: sw a5, 8(a0) +; RV32-NEXT: sw a3, 12(a0) +; RV32-NEXT: tail callee_musttail_i128 +; +; RV64-LABEL: caller_musttail_computed_i128: +; RV64: # %bb.0: +; RV64-NEXT: addi a0, a0, 1 +; RV64-NEXT: seqz a2, a0 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: tail callee_musttail_i128 + %sum = add i128 %a, 1 + %r = musttail call i64 @callee_musttail_i128(i128 %sum) + ret i64 %r +} + +; Test musttail with one computed and one forwarded indirect arg. +; Position 0 gets the fadd result (stored into %a's incoming pointer), +; position 1 gets %b's incoming pointer forwarded directly. +define i32 @caller_musttail_computed_and_forwarded(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_computed_and_forwarded: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a5, 8(a1) +; RV32-NEXT: lw a6, 12(a1) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a1, 4(s1) +; RV32-NEXT: lw a2, 8(s1) +; RV32-NEXT: lw a7, 12(s1) +; RV32-NEXT: sw a0, 16(sp) +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a2, 24(sp) +; RV32-NEXT: sw a7, 28(sp) +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: sw a3, 0(sp) +; RV32-NEXT: sw a4, 4(sp) +; RV32-NEXT: sw a5, 8(sp) +; RV32-NEXT: sw a6, 12(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: lw a0, 32(sp) +; RV32-NEXT: lw a1, 36(sp) +; RV32-NEXT: lw a2, 40(sp) +; RV32-NEXT: lw a3, 44(sp) +; RV32-NEXT: sw a0, 0(s1) +; RV32-NEXT: sw a1, 4(s1) +; RV32-NEXT: sw a2, 8(s1) +; RV32-NEXT: sw a3, 12(s1) +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_computed_and_forwarded: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a3 +; RV64-NEXT: mv s1, a2 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: mv a2, s1 +; RV64-NEXT: mv a3, s0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_two_indirect + %sum = fadd fp128 %a, %b + %r = musttail call i32 @callee_musttail_two_indirect(fp128 %sum, fp128 %b) + ret i32 %r +} + +; Test musttail with one forwarded and one computed indirect arg (reversed). +; Position 0 forwards %a, position 1 gets the computed value. 
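+; In the RV32 check lines below, %a's pointer (saved in s1) passes through +; untouched while the sum is stored back through %b's pointer (saved in s0).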
+define i32 @caller_musttail_forwarded_and_computed(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_forwarded_and_computed: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a5, 8(a1) +; RV32-NEXT: lw a6, 12(a1) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a1, 4(s1) +; RV32-NEXT: lw a2, 8(s1) +; RV32-NEXT: lw a7, 12(s1) +; RV32-NEXT: sw a0, 16(sp) +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a2, 24(sp) +; RV32-NEXT: sw a7, 28(sp) +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: sw a3, 0(sp) +; RV32-NEXT: sw a4, 4(sp) +; RV32-NEXT: sw a5, 8(sp) +; RV32-NEXT: sw a6, 12(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: lw a0, 32(sp) +; RV32-NEXT: lw a1, 36(sp) +; RV32-NEXT: lw a2, 40(sp) +; RV32-NEXT: lw a3, 44(sp) +; RV32-NEXT: sw a0, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: sw a2, 8(s0) +; RV32-NEXT: sw a3, 12(s0) +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_forwarded_and_computed: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a1 +; RV64-NEXT: mv s1, a0 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: mv a0, s1 +; RV64-NEXT: mv a1, s0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_two_indirect + %sum = fadd fp128 %a, %b + %r = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %sum) + ret i32 %r +} + +; Test musttail with both args computed. Neither can be zero-copy forwarded. 
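+; Both incoming pointers are reused as in-place storage: the sum is stored +; through %a's pointer and the difference through %b's before both pointers +; are forwarded.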
+define i32 @caller_musttail_both_computed(fp128 %a, fp128 %b) nounwind { +; RV32-LABEL: caller_musttail_both_computed: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -160 +; RV32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: lw s5, 0(a1) +; RV32-NEXT: lw s2, 4(a1) +; RV32-NEXT: lw s3, 8(a1) +; RV32-NEXT: lw s4, 12(a1) +; RV32-NEXT: lw s6, 0(a0) +; RV32-NEXT: lw s7, 4(a0) +; RV32-NEXT: lw s8, 8(a0) +; RV32-NEXT: lw s9, 12(a0) +; RV32-NEXT: sw s6, 72(sp) +; RV32-NEXT: sw s7, 76(sp) +; RV32-NEXT: sw s8, 80(sp) +; RV32-NEXT: sw s9, 84(sp) +; RV32-NEXT: addi a0, sp, 88 +; RV32-NEXT: addi a1, sp, 72 +; RV32-NEXT: addi a2, sp, 56 +; RV32-NEXT: sw s5, 56(sp) +; RV32-NEXT: sw s2, 60(sp) +; RV32-NEXT: sw s3, 64(sp) +; RV32-NEXT: sw s4, 68(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: lw s10, 88(sp) +; RV32-NEXT: lw s11, 92(sp) +; RV32-NEXT: lw a0, 96(sp) +; RV32-NEXT: sw a0, 0(sp) # 4-byte Folded Spill +; RV32-NEXT: lw a0, 100(sp) +; RV32-NEXT: sw a0, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 24(sp) +; RV32-NEXT: sw s7, 28(sp) +; RV32-NEXT: sw s8, 32(sp) +; RV32-NEXT: sw s9, 36(sp) +; RV32-NEXT: addi a0, sp, 40 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: sw s5, 8(sp) +; RV32-NEXT: sw s2, 12(sp) +; RV32-NEXT: sw s3, 16(sp) +; RV32-NEXT: sw s4, 20(sp) +; RV32-NEXT: call __subtf3 +; RV32-NEXT: lw a0, 40(sp) +; RV32-NEXT: lw a1, 44(sp) +; RV32-NEXT: lw a2, 48(sp) +; RV32-NEXT: lw a3, 52(sp) +; RV32-NEXT: sw s10, 0(s1) +; RV32-NEXT: sw s11, 4(s1) +; RV32-NEXT: lw a4, 0(sp) # 4-byte Folded Reload +; RV32-NEXT: sw a4, 8(s1) +; RV32-NEXT: lw a4, 4(sp) # 4-byte Folded Reload +; RV32-NEXT: sw a4, 12(s1) +; RV32-NEXT: sw a0, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: sw a2, 8(s0) +; RV32-NEXT: sw a3, 12(s0) +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 160 +; RV32-NEXT: tail callee_musttail_two_indirect +; +; RV64-LABEL: caller_musttail_both_computed: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -64 +; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 
24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a3 +; RV64-NEXT: mv s1, a2 +; RV64-NEXT: mv s2, a1 +; RV64-NEXT: mv s3, a0 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: mv s4, a0 +; RV64-NEXT: mv s5, a1 +; RV64-NEXT: mv a0, s3 +; RV64-NEXT: mv a1, s2 +; RV64-NEXT: mv a2, s1 +; RV64-NEXT: mv a3, s0 +; RV64-NEXT: call __subtf3 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: mv a0, s4 +; RV64-NEXT: mv a1, s5 +; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: tail callee_musttail_two_indirect + %sum = fadd fp128 %a, %b + %diff = fsub fp128 %a, %b + %r = musttail call i32 @callee_musttail_two_indirect(fp128 %sum, fp128 %diff) + ret i32 %r +} + +; Test musttail in a non-entry basic block. The indirect pointer must survive +; across basic blocks (the SelectionDAG is cleared between BBs, so the pointer +; must be preserved in a virtual register, not as a raw SDValue). +declare i32 @callee_musttail_cross_bb(fp128 %a, i1 %c) + +define i32 @caller_musttail_cross_bb(fp128 %a, i1 %cond) nounwind { +; RV32-LABEL: caller_musttail_cross_bb: +; RV32: # %bb.0: # %entry +; RV32-NEXT: andi a2, a1, 1 +; RV32-NEXT: beqz a2, .LBB19_2 +; RV32-NEXT: # %bb.1: # %then +; RV32-NEXT: tail callee_musttail_cross_bb +; RV32-NEXT: .LBB19_2: # %else +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_musttail_cross_bb: +; RV64: # %bb.0: # %entry +; RV64-NEXT: andi a3, a2, 1 +; RV64-NEXT: beqz a3, .LBB19_2 +; RV64-NEXT: # %bb.1: # %then +; RV64-NEXT: tail callee_musttail_cross_bb +; RV64-NEXT: .LBB19_2: # %else +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +entry: + br i1 %cond, label %then, label %else +then: + %r = musttail call i32 @callee_musttail_cross_bb(fp128 %a, i1 %cond) + ret i32 %r +else: + ret i32 0 +} + +; Test musttail with control flow and a computed indirect arg in a non-entry BB. 
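+; The pointer captured in LowerFormalArguments reaches %then via a virtual +; register (s0 in the RV32 check lines below), where the computed sum is +; stored through it before the tail call.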
+declare i32 @callee_musttail_cross_bb_computed(fp128 %a, i1 %c) + +define i32 @caller_musttail_cross_bb_computed(fp128 %a, i1 %cond) nounwind { +; RV32-LABEL: caller_musttail_cross_bb_computed: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a3, 4(s0) +; RV32-NEXT: lw a4, 8(s0) +; RV32-NEXT: lw a5, 12(s0) +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: andi s2, a1, 1 +; RV32-NEXT: sw a0, 0(sp) +; RV32-NEXT: sw a0, 16(sp) +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sw a5, 12(sp) +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: sw a3, 20(sp) +; RV32-NEXT: sw a4, 24(sp) +; RV32-NEXT: sw a5, 28(sp) +; RV32-NEXT: call __addtf3 +; RV32-NEXT: beqz s2, .LBB20_2 +; RV32-NEXT: # %bb.1: # %then +; RV32-NEXT: lw a0, 32(sp) +; RV32-NEXT: lw a1, 36(sp) +; RV32-NEXT: lw a2, 40(sp) +; RV32-NEXT: lw a3, 44(sp) +; RV32-NEXT: sw a0, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: sw a2, 8(s0) +; RV32-NEXT: sw a3, 12(s0) +; RV32-NEXT: mv a0, s0 +; RV32-NEXT: mv a1, s1 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: tail callee_musttail_cross_bb_computed +; RV32-NEXT: .LBB20_2: # %else +; RV32-NEXT: li a0, 0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_musttail_cross_bb_computed: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a2 +; RV64-NEXT: andi s1, a2, 1 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: mv a3, a1 +; RV64-NEXT: call __addtf3 +; RV64-NEXT: beqz s1, .LBB20_2 +; RV64-NEXT: # %bb.1: # %then +; RV64-NEXT: mv a2, s0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: tail callee_musttail_cross_bb_computed +; RV64-NEXT: .LBB20_2: # %else +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ret +entry: + %sum = fadd fp128 %a, %a + br i1 %cond, label %then, label %else +then: + %r = musttail call i32 @callee_musttail_cross_bb_computed(fp128 %sum, i1 %cond) + ret i32 %r +else: + ret i32 0 +} + +; Non-indirect args that spill to the stack (exercises the +; isEligibleForTailCallOptimization stack-size bypass for musttail). Both +; RV32 and RV64 use a0..a7 for the first 8 args and spill from the 9th. The +; spilled args live in the caller's incoming stack slots, which musttail can +; re-use because matching prototypes imply a matching layout. 
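+; The check lines below make the reuse visible: the caller reloads the +; spilled args from its own incoming slots and stores them back to the +; matching outgoing offsets.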
+declare void @callee_musttail_stack_spill(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define void @caller_musttail_stack_spill(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) nounwind { +; RV32-LABEL: caller_musttail_stack_spill: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: lw t1, 4(sp) +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: sw t1, 4(sp) +; RV32-NEXT: tail callee_musttail_stack_spill +; +; RV64-LABEL: caller_musttail_stack_spill: +; RV64: # %bb.0: +; RV64-NEXT: ld t0, 0(sp) +; RV64-NEXT: ld t1, 8(sp) +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd t1, 8(sp) +; RV64-NEXT: tail callee_musttail_stack_spill + musttail call void @callee_musttail_stack_spill(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) + ret void +} + +; sret + musttail: the sret pointer is just a regular pointer arg in a0. +; Tail call forwards it unchanged. +%struct.Large = type { i64, i64, i64, i64 } +declare void @callee_musttail_sret(ptr sret(%struct.Large), i32) + +define void @caller_musttail_sret(ptr sret(%struct.Large) %out, i32 %x) nounwind { +; RV32-LABEL: caller_musttail_sret: +; RV32: # %bb.0: +; RV32-NEXT: tail callee_musttail_sret +; +; RV64-LABEL: caller_musttail_sret: +; RV64: # %bb.0: +; RV64-NEXT: tail callee_musttail_sret + musttail call void @callee_musttail_sret(ptr sret(%struct.Large) %out, i32 %x) + ret void +} + +; Mix of indirect (fp128) and many i32 args spilled to the stack. +declare void @callee_musttail_indirect_and_spill(fp128, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define void @caller_musttail_indirect_and_spill(fp128 %a, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8) nounwind { +; RV32-LABEL: caller_musttail_indirect_and_spill: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: lw t1, 4(sp) +; RV32-NEXT: sw t0, 0(sp) +; RV32-NEXT: sw t1, 4(sp) +; RV32-NEXT: tail callee_musttail_indirect_and_spill +; +; RV64-LABEL: caller_musttail_indirect_and_spill: +; RV64: # %bb.0: +; RV64-NEXT: ld t0, 0(sp) +; RV64-NEXT: ld t1, 8(sp) +; RV64-NEXT: ld t2, 16(sp) +; RV64-NEXT: sd t0, 0(sp) +; RV64-NEXT: sd t1, 8(sp) +; RV64-NEXT: sd t2, 16(sp) +; RV64-NEXT: tail callee_musttail_indirect_and_spill + musttail call void @callee_musttail_indirect_and_spill(fp128 %a, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8) + ret void +} + +; Note: byval + musttail is intentionally NOT tested here. +; isEligibleForTailCallOptimization rejects byval outright, which causes the +; musttail site to hit reportFatalInternalError. Tail-call support for byval +; was reverted in 501417baa60f (RISC-V/LoongArch) pending a vreg-based +; re-implementation; once that lands, musttail + byval can be tested as +; well. diff --git a/llvm/test/CodeGen/RISCV/rvv/musttail-indirect-args.ll b/llvm/test/CodeGen/RISCV/rvv/musttail-indirect-args.ll new file mode 100644 index 0000000000000..58463ec1b3274 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/musttail-indirect-args.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v %s -o - | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v %s -o - | FileCheck %s --check-prefixes=CHECK,RV64 + +; Companion to musttail-indirect-args.ll for scalable vector args. 
+; <vscale x 32 x i32> is too large to fit in v8-v23, so it is split into +; multiple parts and the callee receives a pointer to the parts via +; CCValAssign::Indirect. The musttail computed-arg path must scale per-part +; offsets by VSCALE -- mirrors the non-musttail spill path validated by +; calling-conv.ll's caller_scalable_vector_split_indirect. + +declare <vscale x 32 x i32> @callee_musttail_scalable(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) + +; Forwarded: both args are caller formals passed straight through. The +; incoming indirect pointer is forwarded zero-copy; no per-part offset +; computation happens in the caller, so VSCALE does not enter here. +define <vscale x 32 x i32> @caller_musttail_scalable_forwarded(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) nounwind { +; CHECK-LABEL: caller_musttail_scalable_forwarded: +; CHECK: # %bb.0: +; CHECK-NEXT: tail callee_musttail_scalable + %r = musttail call <vscale x 32 x i32> @callee_musttail_scalable(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) + ret <vscale x 32 x i32> %r +} + +; Computed: both args are arithmetic on the formals, so dyn_cast<Argument> +; fails and we go through the store-into-incoming-pointer path. The +; second part's store must use VSCALE * PartOffset relative to +; IncomingPtr, otherwise the part lands at a fixed byte offset that +; differs from what the callee expects (vlenb * 8 in the check lines +; below). +define <vscale x 32 x i32> @caller_musttail_scalable_computed(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) nounwind { +; CHECK-LABEL: caller_musttail_scalable_computed: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vl8re32.v v0, (a1) +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: tail callee_musttail_scalable + %s = add <vscale x 32 x i32> %x, %y + %r = musttail call <vscale x 32 x i32> @callee_musttail_scalable(<vscale x 32 x i32> %s, <vscale x 32 x i32> %s) + ret <vscale x 32 x i32> %r +} + +; Mixed: the first arg is forwarded (it stays in vector registers, so the +; indirect path is not involved), while the second arg is computed and +; stored through the incoming pointer. +define <vscale x 32 x i32> @caller_musttail_scalable_mixed(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y) nounwind { +; CHECK-LABEL: caller_musttail_scalable_mixed: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vl8re32.v v0, (a1) +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vv v24, v8, v24 +; CHECK-NEXT: vs8r.v v24, (a0) +; CHECK-NEXT: vadd.vv v24, v16, v0 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: tail callee_musttail_scalable + %s = add <vscale x 32 x i32> %x, %y + %r = musttail call <vscale x 32 x i32> @callee_musttail_scalable(<vscale x 32 x i32> %x, <vscale x 32 x i32> %s) + ret <vscale x 32 x i32> %r +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}}