Skip to content

Commit 3e95727

Browse files
committed
Make killing shims more resilient
Currently, we send a single SIGKILL to the shim process and then spin in a loop, using kill(pid, 0) to detect when the pid has disappeared completely. Unfortunately, this is racy: pids can be reused, and when that happens kill(pid, 0) keeps succeeding and we spin forever. This change adds a timeout to the loop (driven by the context): when it expires, we log a warning and exit the loop instead of spinning indefinitely.

Signed-off-by: Ashray Jain <ashrayj@palantir.com>
1 parent e094d36 commit 3e95727

1 file changed

Lines changed: 15 additions & 5 deletions

File tree

runtime/v1/shim/client/client.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -324,21 +324,31 @@ func (c *Client) signalShim(ctx context.Context, sig syscall.Signal) error {
324324
select {
325325
case <-ctx.Done():
326326
return ctx.Err()
327-
case <-c.waitForExit(pid):
327+
case <-c.waitForExit(ctx, pid):
328328
return nil
329329
}
330330
}
331331

332-
func (c *Client) waitForExit(pid int) <-chan struct{} {
333-
c.exitOnce.Do(func() {
332+
func (c *Client) waitForExit(ctx context.Context, pid int) <-chan struct{} {
333+
go c.exitOnce.Do(func() {
334+
defer close(c.exitCh)
335+
336+
ticker := time.NewTicker(10 * time.Millisecond)
337+
defer ticker.Stop()
338+
334339
for {
335340
// use kill(pid, 0) here because the shim could have been reparented
336341
// and we are no longer able to waitpid(pid, ...) on the shim
337342
if err := unix.Kill(pid, 0); err == unix.ESRCH {
338-
close(c.exitCh)
339343
return
340344
}
341-
time.Sleep(10 * time.Millisecond)
345+
346+
select {
347+
case <-ticker.C:
348+
case <-ctx.Done():
349+
log.G(ctx).WithField("pid", pid).Warn("timed out while waiting for shim to exit")
350+
return
351+
}
342352
}
343353
})
344354
return c.exitCh

0 commit comments

Comments
 (0)