Skip to content

"Alternate stack already in use." -- any ideas how to debug? #3395

@datacompboy

Description

@datacompboy

Describe the issue
I am trying to get large system under the shadow to shake the tree and see where fruits fall. I've wrapped the parts of system under https://bitbucket.org/sivann/restarter.git to get things auto-restarted and some also auto-killed. That working well, until recently I've got to the condition, when shadow stuck.

Digging into stuck state shown shadow waiting for zombie, and the zombie has the "Alternate stack already in use." panic in shimlog; the syscall: tgkill(self, self, 15).

To Reproduce
I can't reliable reproduce it under reduced setup (yet). :(

Operating System (please complete the following information):

  • OS: Ubuntu 22.04.4 LTS
  • Kernel: 5.15.153.1-microsoft-standard-WSL2
  • Shadow: Some local patches over freshly pulled origin/main daa5b47
  • The failing one is -- some golang binary under restarter mentioned above

Additional context

The panic itself:

thread '<unnamed>' panicked at lib/shim/src/signals.rs:195:17:
Alternate stack already in use.
stack backtrace:
   0:     0x7ffff7f58c95 - std::backtrace_rs::backtrace::libunwind::trace::h1a07e5dba0da0cd2
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/libunwind.rs:105:5
   1:     0x7ffff7f58c95 - std::backtrace_rs::backtrace::trace_unsynchronized::h61b9b8394328c0bc
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
   2:     0x7ffff7f58c95 - std::sys_common::backtrace::_print_fmt::h1c5e18b460934cff
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:68:5
   3:     0x7ffff7f58c95 - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h1e1a1972118942ad
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:44:22
   4:     0x7ffff7f7566b - core::fmt::rt::Argument::fmt::h07af2b4071d536cd
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/fmt/rt.rs:165:63
   5:     0x7ffff7f7566b - core::fmt::write::hc090a2ffd6b28c4a
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/fmt/mod.rs:1157:21
   6:     0x7ffff7f56f1f - std::io::Write::write_fmt::h8898bac6ff039a23
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/io/mod.rs:1832:15
   7:     0x7ffff7f58a6e - std::sys_common::backtrace::_print::h4e80c5803d4ee35b
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:47:5
   8:     0x7ffff7f58a6e - std::sys_common::backtrace::print::ha96650907276675e
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:34:9
   9:     0x7ffff7f59c79 - std::panicking::default_hook::{{closure}}::h215c2a0a8346e0e0
  10:     0x7ffff7f599bd - std::panicking::default_hook::h207342be97478370
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:298:9
  11:     0x7ffff7f5a113 - std::panicking::rust_panic_with_hook::hac8bdceee1e4fe2c
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:795:13
  12:     0x7ffff7f59fbb - std::panicking::begin_panic_handler::{{closure}}::h00d785e82757ce3c
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:656:13
  13:     0x7ffff7f59159 - std::sys_common::backtrace::__rust_end_short_backtrace::h1628d957bcd06996
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:171:18
  14:     0x7ffff7f59d27 - rust_begin_unwind
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:652:5
  15:     0x7ffff7f2ce93 - core::panicking::panic_fmt::hdc63834ffaaefae5
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/panicking.rs:72:14
  16:     0x7ffff7f33ca0 - shadow_shim::signals::process_signals::h5f2c9fec9f740685
  17:     0x7ffff7f35fbb - shim_emulated_syscallv
  18:     0x7ffff7f3a45c - shim_syscallv
                               at /home/datacompboy/work/shadow/src/lib/shim/shim_syscall.c:53:14
  19:     0x7ffff7f3a53a - shim_syscall
                               at /home/datacompboy/work/shadow/src/lib/shim/shim_syscall.c:68:15
  20:     0x7ffff7f397a8 - _shim_seccomp_handle_sigsys
                               at /home/datacompboy/work/shadow/src/lib/shim/shim_seccomp.c:63:15
  21:     0x7ffff7a19520 - <unknown>
  22:           0x473b21 - runtime.raise
                               at /usr/lib/go-1.19/src/runtime/sys_linux_amd64.s:158
fatal runtime error: failed to initiate panic, error 5

Last syscalls (1000 -- main restarter process, 1001 -- restarer's child spawner & monitor, 1002 -- target process):

00:01:41.864028050 [tid 1000] clone(CloneFlags(CLONE_CHILD_CLEARTID | CLONE_CHILD_SETTID | 0x11), 0x0, 0x0 <null>, 1299 (0x7ffff7a3e6d0), 0x0) = 1001
00:01:41.864028050 [tid 1001] set_robust_list(0x7ffff7a3e6e0, 24) = -38 (ENOSYS)
00:01:41.864028050 [tid 1001] rt_sigaction(17, 0x7fffffefe010, 0x0, 8) = 0
00:01:41.864030050 [tid 1001] getrlimit(...) = <native>
00:01:41.864030050 [tid 1001] ^^^ = SyscallReg { as_i64: 0, as_u64: 0, as_ptr: 0x0 }
00:01:41.864030050 [tid 1001] setrlimit(...) = <native>
00:01:41.864030050 [tid 1000] rt_sigprocmask(1, 0x7fffffefe1e0, 0x0, 8) = 0
00:01:41.864030050 [tid 1000] nanosleep(linux_timespec { tv_sec: 1, tv_nsec: 0 } (0x7fffffffe790), 0x7fffffffe790) = <blocked>
00:01:41.864032050 [tid 1001] ^^^ = SyscallReg { as_i64: 0, as_u64: 0, as_ptr: 0x0 }
00:01:41.864032050 [tid 1001] getpid() = 1001
00:01:41.864032050 [tid 1001] pipe([3, 4] (0x7fffffefe1d8)) = 0
00:01:41.864034050 [tid 1001] rt_sigaction(17, 0x7fffffefe010, 0x0, 8) = 0
00:01:41.864034050 [tid 1001] rt_sigprocmask(0, 0x7fffffefe1e0, 0x0, 8) = 0
00:01:41.864036050 [tid 1001] clone(CloneFlags(CLONE_CHILD_CLEARTID | CLONE_CHILD_SETTID | 0x11), 0x0, 0x0 <null>, 1001 (0x7ffff7a3e6d0), 0x0) = 1002
00:01:41.864036050 [tid 1001] rt_sigprocmask(1, 0x7fffffefe1e0, 0x0, 8) = 0
00:01:41.864036050 [tid 1002] set_robust_list(0x7ffff7a3e6e0, 24) = -38 (ENOSYS)
00:01:41.864036050 [tid 1002] rt_sigaction(17, 0x7fffffefe010, 0x0, 8) = 0
00:01:41.864038050 [tid 1002] getpid() = 1002
00:01:41.864038050 [tid 1002] shadow_yield() = 0
00:01:41.864038050 [tid 1001] rt_sigaction(14, 0x7fffffefdec0, 0x7fffffefdf60, 8) = 0
00:01:41.864038050 [tid 1001] setitimer(ITIMER_REAL, 0x7fffffefe1b0, 0x0) = 0
000000101864039060 [tid 1002] clock_gettime(...) = 0
00:01:41.864039060 [tid 1002] getpid() = 1002
00:01:41.864039060 [tid 1002] writev(2, 0x7fffffefdfe0, 2) = 62
00:01:41.864040050 [tid 1001] close(4) = 0
00:01:41.864040050 [tid 1001] read(3, 0x7fffffefe540, 511) = <blocked>

<<<< are 1001 doing stdout/stderr pumping via read(3)
00:01:41.864275400 [tid 1001] read(3, 0x7fffffefe540, 511) = 143
00:01:41.864277400 [tid 1001] read(3, 0x7fffffefe540, 511) = 144
00:01:41.864277400 [tid 1001] read(3, 0x7fffffefe540, 511) = <blocked>
<<<< since this moment 1001 is dormant

<<<< other threads of app-under-restarter was in waiting, their latest syscalls:
00:02:04.477354060 [tid 1002] epoll_pwait(5, 0x7fffffffe1d8, 128, 548, 0x0, 140737488348032) = <blocked>
00:02:04.477372990 [tid 1003] futex(0 (0x2013e58), 128, 0, 0x7fffce5ed878, 0x0 <null>, 0) = <blocked>

<<<< timeout in restarter reached, and the previously blocked read() get interrupted and restarted?! or what?
00:02:04.654038050 [tid 1001] read(3, 0x7fffffefe540, 511) = <blocked>
00:02:04.654038050 [tid 1001] kill(1002, 15) = 0
<<<< I believe the kill is executed from alarm timer, and read() should be restarted after, not before?

00:02:04.654038050 [tid 1002] epoll_pwait(5, 0x7fffffffe1d8, 128, 548, 0x0, 140737488348032) = -4 (EINTR)
00:02:04.654038050 [tid 1002] rt_sigprocmask(1, 0xc00000f7a8, 0x0, 8) = 0
00:02:04.654040050 [tid 1002] getpid() = 1002
00:02:04.654040050 [tid 1002] gettid() = 1002
00:02:04.654040050 [tid 1001] nanosleep(linux_timespec { tv_sec: 3, tv_nsec: 0 } (0x7fffffefd120), 0x7fffffefd120) = <blocked>
00:02:04.654042050 [tid 1002] tgkill(1002, 1002, 15) = 0
<<<< the golang tried to self-shutdown (?) and that triggers alt-stack-in-use panic.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type: BugError or flaw producing unexpected results

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions