Skip to content

Commit 672e52e

Browse files
authored
Merge pull request #571 from Microsoft/shim_mitigations
Handle Process/Container not found errors and force exits
2 parents 2de31e6 + d26c179 commit 672e52e

File tree

4 files changed

+44
-4
lines changed

4 files changed

+44
-4
lines changed

cmd/containerd-shim-runhcs-v1/exec_hcs.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -426,9 +426,28 @@ func (he *hcsExec) Kill(ctx context.Context, signal uint32) error {
426426
err = he.p.Kill()
427427
}
428428
if err != nil {
429-
if hcs.IsNotExist(err) {
429+
if hcs.IsNotExist(err) && signal == 0x9 || hcs.IsOperationInvalidState(err) {
430+
// If we issued a SIGKILL (or terminate) and get ERROR_NOT_FOUND
431+
// or ERROR_VMCOMPUTE_INVALID_STATE, `he.waitForExit` is either:
432+
//
433+
// 1. About to transition the state when it is signaled by the
434+
// HCS and everything is fine. This was just a simple race where
435+
// the SIGKILL came in before the previous signal completed.
436+
//
437+
// OR
438+
//
439+
// 2. We are stuck in `he.waitForExit` and the notification is
440+
// not going to be delivered. In this case we force the exit by
441+
// closing `he.p` and unblocking all waiters.
442+
go func() {
443+
// Give the HCS 1 second to finish and deliver the notification.
444+
time.Sleep(1 * time.Second)
445+
// Force the close. This is safe to call if `he.waitForExit` already called it.
446+
he.p.Close()
447+
}()
430448
return errors.Wrapf(errdefs.ErrNotFound, "exec: '%s' in task: '%s' not found", he.id, he.tid)
431449
}
450+
// Unknown. Return the err from Signal/Kill
432451
return err
433452
}
434453
return nil

cmd/containerd-shim-runhcs-v1/task_hcs.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,20 @@ func (ht *hcsTask) KillExec(ctx context.Context, eid string, signal uint32, all
349349
return errors.Wrap(errdefs.ErrFailedPrecondition, "cannot signal init exec with un-exited additional exec's")
350350
}
351351
}
352+
if signal == 0x9 && eid == "" && ht.ownsHost && ht.host != nil {
353+
go func() {
354+
// The caller has issued a SIGKILL to the init process that owns the
355+
// host.
356+
//
357+
// To mitigate failures that can cause the HCS to never deliver the
358+
// exit notification, give everything 30 seconds and then terminate the
359+
// UVM to force all exits.
360+
time.Sleep(30 * time.Second)
361+
// Safe to call multiple times if called previously on successful
362+
// shutdown.
363+
ht.host.Close()
364+
}()
365+
}
352366
eg.Go(func() error {
353367
return e.Kill(ctx, signal)
354368
})
@@ -506,7 +520,7 @@ func (ht *hcsTask) close() {
506520
if ht.c != nil {
507521
// Do our best attempt to tear down the container.
508522
if err := ht.c.Shutdown(); err != nil {
509-
if hcs.IsAlreadyClosed(err) || hcs.IsAlreadyStopped(err) {
523+
if hcs.IsAlreadyClosed(err) || hcs.IsNotExist(err) || hcs.IsAlreadyStopped(err) {
510524
// This is the state we want. Do nothing.
511525
} else if !hcs.IsPending(err) {
512526
logrus.WithFields(logrus.Fields{
@@ -523,7 +537,7 @@ func (ht *hcsTask) close() {
523537
}
524538
}
525539
if err := ht.c.Terminate(); err != nil {
526-
if hcs.IsAlreadyClosed(err) || hcs.IsAlreadyStopped(err) {
540+
if hcs.IsAlreadyClosed(err) || hcs.IsNotExist(err) || hcs.IsAlreadyStopped(err) {
527541
// This is the state we want. Do nothing.
528542
} else if !hcs.IsPending(err) {
529543
logrus.WithFields(logrus.Fields{

internal/hcs/errors.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,13 @@ func IsNotSupported(err error) bool {
272272
err == ErrVmcomputeUnknownMessage
273273
}
274274

275+
// IsOperationInvalidState reports whether err, once unwrapped by
276+
// getInnerError, is equal to `ErrVmcomputeOperationInvalidState`.
277+
func IsOperationInvalidState(err error) bool {
278+
err = getInnerError(err)
279+
return err == ErrVmcomputeOperationInvalidState
280+
}
281+
275282
func getInnerError(err error) error {
276283
switch pe := err.(type) {
277284
case nil:

internal/hcs/process.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ func (process *Process) Wait() (err error) {
192192

193193
<-process.waitBlock
194194
if process.waitError != nil {
195-
return makeProcessError(process, operation, err, nil)
195+
return makeProcessError(process, operation, process.waitError, nil)
196196
}
197197
return nil
198198
}

0 commit comments

Comments
 (0)