-
Notifications
You must be signed in to change notification settings - Fork 269
"Alternate stack already in use." -- any ideas how to debug? #3395
Description
Describe the issue
I am trying to get large system under the shadow to shake the tree and see where fruits fall. I've wrapped the parts of system under https://bitbucket.org/sivann/restarter.git to get things auto-restarted and some also auto-killed. That working well, until recently I've got to the condition, when shadow stuck.
Digging into stuck state shown shadow waiting for zombie, and the zombie has the "Alternate stack already in use." panic in shimlog; the syscall: tgkill(self, self, 15).
To Reproduce
I can't reliable reproduce it under reduced setup (yet). :(
Operating System (please complete the following information):
- OS: Ubuntu 22.04.4 LTS
- Kernel: 5.15.153.1-microsoft-standard-WSL2
- Shadow: Some local patches over freshly pulled origin/main daa5b47
- The failing one is -- some golang binary under restarter mentioned above
Additional context
The panic itself:
thread '<unnamed>' panicked at lib/shim/src/signals.rs:195:17:
Alternate stack already in use.
stack backtrace:
0: 0x7ffff7f58c95 - std::backtrace_rs::backtrace::libunwind::trace::h1a07e5dba0da0cd2
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/libunwind.rs:105:5
1: 0x7ffff7f58c95 - std::backtrace_rs::backtrace::trace_unsynchronized::h61b9b8394328c0bc
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
2: 0x7ffff7f58c95 - std::sys_common::backtrace::_print_fmt::h1c5e18b460934cff
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:68:5
3: 0x7ffff7f58c95 - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h1e1a1972118942ad
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:44:22
4: 0x7ffff7f7566b - core::fmt::rt::Argument::fmt::h07af2b4071d536cd
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/fmt/rt.rs:165:63
5: 0x7ffff7f7566b - core::fmt::write::hc090a2ffd6b28c4a
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/fmt/mod.rs:1157:21
6: 0x7ffff7f56f1f - std::io::Write::write_fmt::h8898bac6ff039a23
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/io/mod.rs:1832:15
7: 0x7ffff7f58a6e - std::sys_common::backtrace::_print::h4e80c5803d4ee35b
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:47:5
8: 0x7ffff7f58a6e - std::sys_common::backtrace::print::ha96650907276675e
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:34:9
9: 0x7ffff7f59c79 - std::panicking::default_hook::{{closure}}::h215c2a0a8346e0e0
10: 0x7ffff7f599bd - std::panicking::default_hook::h207342be97478370
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:298:9
11: 0x7ffff7f5a113 - std::panicking::rust_panic_with_hook::hac8bdceee1e4fe2c
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:795:13
12: 0x7ffff7f59fbb - std::panicking::begin_panic_handler::{{closure}}::h00d785e82757ce3c
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:656:13
13: 0x7ffff7f59159 - std::sys_common::backtrace::__rust_end_short_backtrace::h1628d957bcd06996
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:171:18
14: 0x7ffff7f59d27 - rust_begin_unwind
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:652:5
15: 0x7ffff7f2ce93 - core::panicking::panic_fmt::hdc63834ffaaefae5
at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/panicking.rs:72:14
16: 0x7ffff7f33ca0 - shadow_shim::signals::process_signals::h5f2c9fec9f740685
17: 0x7ffff7f35fbb - shim_emulated_syscallv
18: 0x7ffff7f3a45c - shim_syscallv
at /home/datacompboy/work/shadow/src/lib/shim/shim_syscall.c:53:14
19: 0x7ffff7f3a53a - shim_syscall
at /home/datacompboy/work/shadow/src/lib/shim/shim_syscall.c:68:15
20: 0x7ffff7f397a8 - _shim_seccomp_handle_sigsys
at /home/datacompboy/work/shadow/src/lib/shim/shim_seccomp.c:63:15
21: 0x7ffff7a19520 - <unknown>
22: 0x473b21 - runtime.raise
at /usr/lib/go-1.19/src/runtime/sys_linux_amd64.s:158
fatal runtime error: failed to initiate panic, error 5
Last syscalls (1000 -- main restarter process, 1001 -- restarer's child spawner & monitor, 1002 -- target process):
00:01:41.864028050 [tid 1000] clone(CloneFlags(CLONE_CHILD_CLEARTID | CLONE_CHILD_SETTID | 0x11), 0x0, 0x0 <null>, 1299 (0x7ffff7a3e6d0), 0x0) = 1001
00:01:41.864028050 [tid 1001] set_robust_list(0x7ffff7a3e6e0, 24) = -38 (ENOSYS)
00:01:41.864028050 [tid 1001] rt_sigaction(17, 0x7fffffefe010, 0x0, 8) = 0
00:01:41.864030050 [tid 1001] getrlimit(...) = <native>
00:01:41.864030050 [tid 1001] ^^^ = SyscallReg { as_i64: 0, as_u64: 0, as_ptr: 0x0 }
00:01:41.864030050 [tid 1001] setrlimit(...) = <native>
00:01:41.864030050 [tid 1000] rt_sigprocmask(1, 0x7fffffefe1e0, 0x0, 8) = 0
00:01:41.864030050 [tid 1000] nanosleep(linux_timespec { tv_sec: 1, tv_nsec: 0 } (0x7fffffffe790), 0x7fffffffe790) = <blocked>
00:01:41.864032050 [tid 1001] ^^^ = SyscallReg { as_i64: 0, as_u64: 0, as_ptr: 0x0 }
00:01:41.864032050 [tid 1001] getpid() = 1001
00:01:41.864032050 [tid 1001] pipe([3, 4] (0x7fffffefe1d8)) = 0
00:01:41.864034050 [tid 1001] rt_sigaction(17, 0x7fffffefe010, 0x0, 8) = 0
00:01:41.864034050 [tid 1001] rt_sigprocmask(0, 0x7fffffefe1e0, 0x0, 8) = 0
00:01:41.864036050 [tid 1001] clone(CloneFlags(CLONE_CHILD_CLEARTID | CLONE_CHILD_SETTID | 0x11), 0x0, 0x0 <null>, 1001 (0x7ffff7a3e6d0), 0x0) = 1002
00:01:41.864036050 [tid 1001] rt_sigprocmask(1, 0x7fffffefe1e0, 0x0, 8) = 0
00:01:41.864036050 [tid 1002] set_robust_list(0x7ffff7a3e6e0, 24) = -38 (ENOSYS)
00:01:41.864036050 [tid 1002] rt_sigaction(17, 0x7fffffefe010, 0x0, 8) = 0
00:01:41.864038050 [tid 1002] getpid() = 1002
00:01:41.864038050 [tid 1002] shadow_yield() = 0
00:01:41.864038050 [tid 1001] rt_sigaction(14, 0x7fffffefdec0, 0x7fffffefdf60, 8) = 0
00:01:41.864038050 [tid 1001] setitimer(ITIMER_REAL, 0x7fffffefe1b0, 0x0) = 0
000000101864039060 [tid 1002] clock_gettime(...) = 0
00:01:41.864039060 [tid 1002] getpid() = 1002
00:01:41.864039060 [tid 1002] writev(2, 0x7fffffefdfe0, 2) = 62
00:01:41.864040050 [tid 1001] close(4) = 0
00:01:41.864040050 [tid 1001] read(3, 0x7fffffefe540, 511) = <blocked>
<<<< are 1001 doing stdout/stderr pumping via read(3)
00:01:41.864275400 [tid 1001] read(3, 0x7fffffefe540, 511) = 143
00:01:41.864277400 [tid 1001] read(3, 0x7fffffefe540, 511) = 144
00:01:41.864277400 [tid 1001] read(3, 0x7fffffefe540, 511) = <blocked>
<<<< since this moment 1001 is dormant
<<<< other threads of app-under-restarter was in waiting, their latest syscalls:
00:02:04.477354060 [tid 1002] epoll_pwait(5, 0x7fffffffe1d8, 128, 548, 0x0, 140737488348032) = <blocked>
00:02:04.477372990 [tid 1003] futex(0 (0x2013e58), 128, 0, 0x7fffce5ed878, 0x0 <null>, 0) = <blocked>
<<<< timeout in restarter reached, and the previously blocked read() get interrupted and restarted?! or what?
00:02:04.654038050 [tid 1001] read(3, 0x7fffffefe540, 511) = <blocked>
00:02:04.654038050 [tid 1001] kill(1002, 15) = 0
<<<< I believe the kill is executed from alarm timer, and read() should be restarted after, not before?
00:02:04.654038050 [tid 1002] epoll_pwait(5, 0x7fffffffe1d8, 128, 548, 0x0, 140737488348032) = -4 (EINTR)
00:02:04.654038050 [tid 1002] rt_sigprocmask(1, 0xc00000f7a8, 0x0, 8) = 0
00:02:04.654040050 [tid 1002] getpid() = 1002
00:02:04.654040050 [tid 1002] gettid() = 1002
00:02:04.654040050 [tid 1001] nanosleep(linux_timespec { tv_sec: 3, tv_nsec: 0 } (0x7fffffefd120), 0x7fffffefd120) = <blocked>
00:02:04.654042050 [tid 1002] tgkill(1002, 1002, 15) = 0
<<<< the golang tried to self-shutdown (?) and that triggers alt-stack-in-use panic.