I am seeing segfaults (with various errors detailed in the above issue) when running a workload that stacks many small kernels together. The runs eventually segfault in rocr::core::Runtime::AsyncEventsLoop.
Here are the MD5 sums of the failing libhsa-runtime64.so (using the one loaded by torch, found through python3 -c "import torch; torch.cuda.init(); import os; os.system(f'cat /proc/{os.getpid()}/maps | grep libhsa-runtime64')"):
Thread 66 "python" received signal SIGSEGV, Segmentation fault.
[Switching to thread 66 (Thread 0x7ffdc2dff640 (LWP 654565))]
0x00007ffec3a76921 in rocr::core::Runtime::AsyncEventsLoop(void*) ()
from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
(gdb) thread apply all bt
Thread 67 (Thread 0x7ffdc25fe640 (LWP 654566) "python"):
#0 0x00007ffff7d709cf in ioctl () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007ffec3b222b0 in hsakmt_ioctl () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#2 0x00007ffec3b17f53 in hsaKmtWaitOnMultipleEvents_ExtCtx () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#3 0x00007ffec3a9b999 in rocr::core::Signal::WaitAnyExceptions(unsigned int, hsa_signal_s const*, hsa_signal_condition_t const*, long const*, long*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#4 0x00007ffec3a77248 in rocr::core::Runtime::AsyncEventsLoop(void*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#5 0x00007ffec3ad34dd in rocr::os::ThreadTrampoline(void*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#6 0x00007ffff7ceaac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#7 0x00007ffff7d7c8d0 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 66 (Thread 0x7ffdc2dff640 (LWP 654565) "python"):
#0 0x00007ffec3a76921 in rocr::core::Runtime::AsyncEventsLoop(void*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#1 0x00007ffec3ad34dd in rocr::os::ThreadTrampoline(void*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#2 0x00007ffff7ceaac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#3 0x00007ffff7d7c8d0 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 65 (Thread 0x7ffdc81ff640 (LWP 654561) "python"):
#0 0x00007ffff7ce7117 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007ffff7ce9a41 in pthread_cond_wait () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007ffe6d89eb8b in blas_thread_server () from /usr/local/lib/python3.12/dist-packages/numpy/_core/../../numpy.libs/libscipy_openblas64_-ff651d7f.so
#3 0x00007ffff7ceaac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#4 0x00007ffff7d7c8d0 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
...
Thread 3 (Thread 0x7ffe6cbff640 (LWP 654494) "python"):
#0 0x00007ffff7ce7117 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007ffff7ce9a41 in pthread_cond_wait () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007ffe6d89eb8b in blas_thread_server () from /usr/local/lib/python3.12/dist-packages/numpy/_core/../../numpy.libs/libscipy_openblas64_-ff651d7f.so
#3 0x00007ffff7ceaac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#4 0x00007ffff7d7c8d0 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 2 (Thread 0x7ffe6fbff640 (LWP 654493) "python"):
#0 0x00007ffff7ce7117 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007ffff7ce9a41 in pthread_cond_wait () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007fff49888747 in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from /lib/x86_64-linux-gnu/libstdc++.so.6
#3 0x00007fff4965353f in PTL::ThreadPool::execute_thread(PTL::VUserTaskQueue*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/librocprofiler-sdk.so
#4 0x00007fff49653b3c in PTL::ThreadPool::start_thread(PTL::ThreadPool*, std::vector<std::shared_ptr<PTL::ThreadData>, std::allocator<std::shared_ptr<PTL::ThreadData> > >*, long, std::shared_ptr<std::promise<void> >) () from /usr/local/lib/python3.12/dist-packages/torch/lib/librocprofiler-sdk.so
#5 0x00007fff4965610b in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(PTL::ThreadPool*, std::vector<std::shared_ptr<PTL::ThreadData>, std::allocator<std::shared_ptr<PTL::ThreadData> > >*, long, std::shared_ptr<std::promise<void> >), PTL::ThreadPool*, std::vector<std::shared_ptr<PTL::ThreadData>, std::allocator<std::shared_ptr<PTL::ThreadData> > >*, unsigned long, std::shared_ptr<std::promise<void> > > > >::_M_run() () from /usr/local/lib/python3.12/dist-packages/torch/lib/librocprofiler-sdk.so
#6 0x00007fff498b8253 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#7 0x00007ffff7ceaac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#8 0x00007ffff7d7c8d0 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
Thread 1 (Thread 0x7ffff7c55000 (LWP 654460) "python"):
#0 0x00007ffec3a4d7d9 in rocr::core::BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t, long, unsigned long, hsa_wait_state_t) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#1 0x00007ffec3a4d56a in rocr::core::BusyWaitSignal::WaitAcquire(hsa_signal_condition_t, long, unsigned long, hsa_wait_state_t) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#2 0x00007ffec3a510b1 in rocr::HSA::hsa_signal_wait_scacquire(hsa_signal_s, hsa_signal_condition_t, long, unsigned long, hsa_wait_state_t) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so
#3 0x00007fff49defd4a in amd::roc::WaitForSignal(hsa_signal_s, bool, bool) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#4 0x00007fff49e00c57 in amd::roc::VirtualGPU::ManagedBuffer::Acquire(unsigned int, unsigned int) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#5 0x00007fff49e029bd in amd::roc::VirtualGPU::submitKernelInternal(amd::NDRangeContainer const&, amd::Kernel const&, unsigned char const*, void*, unsigned int, amd::NDRangeKernelCommand*, hsa_kernel_dispatch_packet_s*, bool) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#6 0x00007fff49e03558 in amd::roc::VirtualGPU::submitKernel(amd::NDRangeKernelCommand&) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#7 0x00007fff49dccd45 in amd::Command::enqueue() () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#8 0x00007fff49c6e2ef in hip::ihipModuleLaunchKernel(ihipModuleSymbol_t*, amd::LaunchParams&, ihipStream_t*, void**, void**, ihipEvent_t*, ihipEvent_t*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned long, unsigned long, unsigned int) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#9 0x00007fff49cbbfa9 in hip::ihipLaunchKernel(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*, ihipEvent_t*, ihipEvent_t*, int) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#10 0x00007fff49c6e91a in hip::hipLaunchKernel_common(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#11 0x00007fff49c89c11 in hip::hipLaunchKernel(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libamdhip64.so
#12 0x00007fff4921386a in auto rocprofiler::hip::hip_api_impl<1ul, 214ul>::exec<hipError_t (*&)(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*), void const*, dim3, dim3, void**, unsigned long, ihipStream_t*>(hipError_t (*&)(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*), void const*&&, dim3&&, dim3&&, void**&&, unsigned long&&, ihipStream_t*&&) [clone .isra.0] () from /usr/local/lib/python3.12/dist-packages/torch/lib/librocprofiler-sdk.so
#13 0x00007fff49453231 in hipError_t rocprofiler::hip::hip_api_impl<1ul, 214ul>::functor<hipError_t, void const*, dim3, dim3, void**, unsigned long, ihipStream_t*>(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/librocprofiler-sdk.so
#14 0x00007fff494e8030 in rocprofiler::hip::stream::create_read_functor<1ul, 214ul, hipError_t, void const*, dim3, dim3, void**, unsigned long, ihipStream_t*, hipError_t (*)(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*)>(hipError_t (*)(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*))::{lambda(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*)#1}::operator()(void const*, dim3, dim3, void**, unsigned long, ihipStream_t*) const [clone .isra.0] () from /usr/local/lib/python3.12/dist-packages/torch/lib/librocprofiler-sdk.so
#15 0x00007fffc10f8c77 in void at::native::(anonymous namespace)::parallel_cat<at::native::(anonymous namespace)::OpaqueType<1u>, 128, 1>(at::Tensor const&, std::vector<std::reference_wrapper<at::Tensor const>, std::allocator<std::reference_wrapper<at::Tensor const> > > const&, long, int, c10::MemoryFormat) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_hip.so
#16 0x00007fffc10f6556 in at::native::structured_cat_out_cuda::impl(c10::IListRef<at::Tensor> const&, long, long, bool, bool, bool, c10::MemoryFormat, at::Tensor const&) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_hip.so
#17 0x00007fffc242ab21 in c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::IListRef<at::Tensor> const&, long), &at::(anonymous namespace)::wrapper_CUDA_cat>, at::Tensor, c10::guts::typelist::typelist<c10::IListRef<at::Tensor> const&, long> >, at::Tensor (c10::IListRef<at::Tensor> const&, long)>::call(c10::OperatorKernel*, c10::DispatchKeySet, c10::IListRef<at::Tensor> const&, long) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_hip.so
#18 0x00007fffe21fbe30 in at::_ops::cat::call(c10::IListRef<at::Tensor> const&, long) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so
#19 0x00007fffe1c64a14 in at::native::stack(c10::ArrayRef<at::Tensor>, long) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so
#20 0x00007fffe2bdda92 in c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::ArrayRef<at::Tensor>, long), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeExplicitAutograd__stack>, at::Tensor, c10::guts::typelist::typelist<c10::ArrayRef<at::Tensor>, long> >, at::Tensor (c10::ArrayRef<at::Tensor>, long)>::call(c10::OperatorKernel*, c10::DispatchKeySet, c10::ArrayRef<at::Tensor>, long) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so
#21 0x00007fffe2322444 in at::_ops::stack::redispatch(c10::DispatchKeySet, c10::ArrayRef<at::Tensor>, long) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so
#22 0x00007fffe4e0e934 in torch::autograd::VariableType::(anonymous namespace)::stack(c10::DispatchKeySet, c10::ArrayRef<at::Tensor>, long) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so
#23 0x00007fffe4e0ef65 in c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::DispatchKeySet, c10::ArrayRef<at::Tensor>, long), &torch::autograd::VariableType::(anonymous namespace)::stack>, at::Tensor, c10::guts::typelist::typelist<c10::DispatchKeySet, c10::ArrayRef<at::Tensor>, long> >, at::Tensor (c10::DispatchKeySet, c10::ArrayRef<at::Tensor>, long)>::call(c10::OperatorKernel*, c10::DispatchKeySet, c10::ArrayRef<at::Tensor>, long) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so
#24 0x00007fffe23cafc5 in at::_ops::stack::call(c10::ArrayRef<at::Tensor>, long) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so
#25 0x00007ffff5f0649e in torch::autograd::THPVariable_stack(_object*, _object*, _object*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so
#26 0x000000000056cc2d in ?? ()
#27 0x000000000053f2ab in _PyObject_MakeTpCall ()
#28 0x0000000000549fe0 in _PyEval_EvalFrameDefault ()
#29 0x000000000061f469 in PyEval_EvalCode ()
#30 0x000000000065b0eb in ?? ()
#31 0x0000000000656186 in ?? ()
#32 0x0000000000652df5 in ?? ()
#33 0x0000000000652ac5 in _PyRun_SimpleFileObject ()
#34 0x00000000006525d7 in _PyRun_AnyFileObject ()
#35 0x000000000064f285 in Py_RunMain ()
#36 0x00000000006082cd in Py_BytesMain ()
#37 0x00007ffff7c7fd90 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#38 0x00007ffff7c7fe40 in __libc_start_main () from /lib/x86_64-linux-gnu/libc.so.6
#39 0x0000000000608145 in _start ()
Problem Description
Hi,
This is ported from pytorch/pytorch#180341 as it looks to be neither a torch nor vLLM issue.
I am seeing segfaults (with various errors detailed in the above issue) when running a workload that stacks many small kernels together. The runs eventually segfault in `rocr::core::Runtime::AsyncEventsLoop`.
This issue does not occur on Nvidia devices, nor does it occur when using `HIP_LAUNCH_BLOCKING=1` or `AMD_SERIALIZE_KERNEL=3`.
This issue can be reproduced with:
- `torch==2.12.0.dev20260413+rocm7.2` (torch nightly), in a docker container with `ROCm version: 7.2.1 | amdgpu version: 6.16.13`
- `rocm/vllm-dev:nightly_main_20260413`, which uses `torch==2.10.0+git8514f05`, `ROCm version: 7.2.1 | amdgpu version: 6.16.13`
This issue can be reproduced as well with:
- `torch==2.12.0.dev20260413+rocm7.2` and `ROCm version: 7.2.2 | amdgpu version: 6.16.13`.
This issue does NOT occur in:
- `rocm/pytorch:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0`, which uses `torch==2.10.0+rocm7.2.2.lw.git40d237bf`, `ROCm version: 7.2.2 | amdgpu version: 6.16.13`
- `rocm/pytorch:rocm7.2.1_ubuntu24.04_py3.14_pytorch_2.10.0`, which uses `torch==2.10.0+rocm7.2.1.lw.gitb6dec82d` and `ROCm version: 7.2.1 | amdgpu version: 6.16.13`
- `rocm/vllm-dev:nightly_main_20260323`, which uses `torch==2.9.1+git8907517` and `ROCm version: 7.0.0 | amdgpu version: 6.16.13`
Here are the MD5 sums of the failing libhsa-runtime64.so (using the one loaded by torch, found through `python3 -c "import torch; torch.cuda.init(); import os; os.system(f'cat /proc/{os.getpid()}/maps | grep libhsa-runtime64')"`):
Failing images:
- `rocm/vllm-dev:nightly_main_20260413`: `277b086ff9ec634b7e2f1ff5bfb1d704 /opt/rocm-7.2.1/lib/libhsa-runtime64.so.1.18.70201` and `ROCR BUILD ID: "1.18.0-rocm-rel-7.2-81-e1a6bc5663"`.
- `rocm/vllm-dev:nightly_main_20260413` + using torch nightly: `0f814039f3236bcdb42c70522d0308c2 /usr/local/lib/python3.12/dist-packages/torch/lib/libhsa-runtime64.so` and `ROCR BUILD ID: "1.18.0-rocm-rel-7.2-81-e1a6bc5663"`.
- `rocm/dev-ubuntu-24.04:7.2.2-complete` + torch nightly: `0f814039f3236bcdb42c70522d0308c2 /root/miniforge3/lib/python3.13/site-packages/torch/lib/libhsa-runtime64.so` and `ROCR BUILD ID: "1.18.0-rocm-rel-7.2-81-e1a6bc5663"`.
Working images:
- `rocm/pytorch:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0`: `e0b6b6159d80406e2e2e092647923576 /opt/rocm-7.2.2/lib/libhsa-runtime64.so.1.18.70202` and `ROCR BUILD ID: "1.18.0-rocm-rel-7.2-86-671d39a71e"`.
- `rocm/pytorch:rocm7.2.1_ubuntu24.04_py3.14_pytorch_2.10.0`: `0935cd27736ad16a1df92e48098c9e9f /opt/rocm-7.2.1/lib/libhsa-runtime64.so.1.18.70201` and `ROCR BUILD ID: "1.18.0-rocm-rel-7.2-81-e1a6bc5663"`.
- `rocm/vllm-dev:nightly_main_20260323`: `29e569e5503447f20f9f9c55b8b7b3cb /opt/rocm-7.0.0/lib/libhsa-runtime64.so.1.18.70000` and `ROCR BUILD ID: "1.18.0-rocm-rel-7.0-38-737ba1dc"`.
Operating System
Ubuntu 22.04.5 LTS (Jammy Jellyfish)
CPU
AMD EPYC 9555 64-Core Processor
GPU
MI325X
ROCm Version
ROCm 7.2.1
ROCm Component
ROCR-Runtime
Steps to Reproduce
with backtrace:
(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
Additional Information
No response