Skip to content

containerd: failed to stop container #9240

@CharlieR-o-o-t

Description

@CharlieR-o-o-t

Description

Containerd is unable to stop a container under a specific condition.

What exactly occurred:

  1. The container process (PID 12412) exited with exit status 0.
  2. A containerd event was triggered to handle the container exit.

Note

  • The event handler has a 10-second timeout per event.
  3. The pod status was updated:
  "status": {
    "id": "d102f495d90c786957cd58cc877de75b9da599da9cba0273d8f754adb8fb1525",
    "metadata": {
      "attempt": 0,
      "name": "test-job-28285990-gthsg",
      "namespace": "app-namespace",
      "uid": "229f7a47-5a39-41f4-8f25-48fccff83740"
    },
    "state": "SANDBOX_NOTREADY",
    "createdAt": "2023-10-13T01:10:01.378923372Z",
    "network": {
      "additionalIps": [],
      "ip": ""
    }
    }
  4. The handleContainerExit call ended with an error:
"failed to stop container: context deadline exceeded: unknown"
  5. After that, the container state can't be changed because the PID no longer exists and the exit event was not handled properly.
    Error from the log:
Oct 13 16:12:08 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:12:08.020634516Z" level=error msg="Failed to handle backOff event &TaskExit{ContainerID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,ID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,Pid:12412,ExitStatus:0,ExitedAt:2023-10-13 01:10:17.977124706 +0000 UTC,XXX_unrecognized:[],} for 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd" error="failed to handle container TaskExit event: failed to cleanup container 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd in task-service: container must be created: failed precondition"

Full log:

Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.243451096Z" level=info msg="CreateContainer within sandbox \"d102f495d90c786957cd58cc877de75b9da599da9cba0273d8f754adb8fb1525\" for container &ContainerMetadata{Name:cache-job,Attempt:0,}"
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.307650802Z" level=info msg="CreateContainer within sandbox \"d102f495d90c786957cd58cc877de75b9da599da9cba0273d8f754adb8fb1525\" for &ContainerMetadata{Name:cache-job,Attempt:0,} returns container id \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\""
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.308485075Z" level=info msg="StartContainer for \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\""
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.310733930Z" level=info msg="shim containerd-shim started" address="unix:///run/containerd/s/531c24254735587221a2ea954cdeda9d083e903d10112a9b57b424f45bf0fc6d" debug=false pid=12393
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.446857438Z" level=info msg="StartContainer for \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\" returns successfully"
Oct 13 01:10:18 hostname containerd[1548]: time="2023-10-13T01:10:18.510862521Z" level=error msg="collecting metrics for 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd" error="cgroups: cgroup deleted"
Oct 13 01:10:27 hostname containerd[1548]: time="2023-10-13T01:10:27.980083052Z" level=error msg="failed to handle container TaskExit event &TaskExit{ContainerID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,ID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,Pid:12412,ExitStatus:0,ExitedAt:2023-10-13 01:10:17.977124706 +0000 UTC,XXX_unrecognized:[],}" error="failed to stop container: context deadline exceeded: unknown"



Oct 13 16:16:58 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:16:58.645935223Z" level=info msg="StopContainer for \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\" with timeout 30 (s)"
Oct 13 16:16:58 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:16:58.646613873Z" level=info msg="StopContainer for \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\" returns successfully"
Oct 13 16:17:02 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:17:02.704244442Z" level=error msg="collecting metrics for 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd" error="cgroups: cgroup deleted"
Oct 13 16:17:09 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:17:09.019842710Z" level=info msg="TaskExit event &TaskExit{ContainerID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,ID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,Pid:12412,ExitStatus:0,ExitedAt:2023-10-13 01:10:17.977124706 +0000 UTC,XXX_unrecognized:[],}"
Oct 13 16:17:09 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:17:09.020717601Z" level=error msg="Failed to handle backOff event &TaskExit{ContainerID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,ID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,Pid:12412,ExitStatus:0,ExitedAt:2023-10-13 01:10:17.977124706 +0000 UTC,XXX_unrecognized:[],} for 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd" error="failed to handle container TaskExit event: failed to cleanup container 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd in task-service:  : failed precondition"

containerd version: 1.6.24 61f9fd8

Steps to reproduce the issue

No steps to reproduce.

Describe the results you received and expected

The container was expected to be cleaned up after it exited.

What version of containerd are you using?

1.6.24 61f9fd8

Any other relevant information

No response

Show configuration if it is related to CRI plugin.

version = 2
root = "/var/lib/containerd"
state = "/run/containerd"
plugin_dir = ""
disabled_plugins = []
required_plugins = []
oom_score = 0

[grpc]
  address = "/run/containerd/containerd.sock"
  tcp_address = ""
  tcp_tls_cert = ""
  tcp_tls_key = ""
  uid = 0
  gid = 0
  max_recv_message_size = 16777216
  max_send_message_size = 16777216

[ttrpc]
  address = ""
  uid = 0
  gid = 0

[debug]
  address = ""
  uid = 0
  gid = 0
  level = ""

[metrics]
  address = ""
  grpc_histogram = false

[cgroup]
  path = ""

[timeouts]
  "io.containerd.timeout.shim.cleanup" = "5s"
  "io.containerd.timeout.shim.load" = "5s"
  "io.containerd.timeout.shim.shutdown" = "3s"
  "io.containerd.timeout.task.state" = "2s"


[plugins]

  [plugins."io.containerd.gc.v1.scheduler"]
    pause_threshold = 0.02
    deletion_threshold = 0
    mutation_threshold = 100
    schedule_delay = "0s"
    startup_delay = "100ms"

  [plugins."io.containerd.grpc.v1.cri"]
    disable_tcp_service = true
    stream_server_address = "127.0.0.1"
    stream_server_port = "0"
    stream_idle_timeout = "4h0m0s"
    enable_selinux = true
    selinux_category_range = 1024
    sandbox_image = "k8s.gcr.io/pause:3.1"
    stats_collect_period = 10
    systemd_cgroup = true
    enable_tls_streaming = false
    tolerate_missing_hugetlb_controller = true
    ignore_image_defined_volumes = false
    netns_mounts_under_state_dir = false
    max_container_log_line_size = 16384
    disable_cgroup = false
    disable_apparmor = true
    restrict_oom_score_adj = false
    max_concurrent_downloads = 3
    disable_proc_mount = false
    unset_seccomp_profile = ""
    disable_hugetlb_controller = true
    enable_unprivileged_ports = false
    enable_unprivileged_icmp = false

    [plugins."io.containerd.grpc.v1.cri".containerd]
      snapshotter = "overlayfs"
      default_runtime_name = "runc"
      no_pivot = false
      disable_snapshot_annotations = true
      discard_unpacked_layers = false

      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
        runtime_type = ""
        runtime_engine = ""
        runtime_root = ""
        privileged_without_host_devices = false
        base_runtime_spec = ""

      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
        runtime_type = ""
        runtime_engine = ""
        runtime_root = ""
        privileged_without_host_devices = false
        base_runtime_spec = ""

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          runtime_type = "io.containerd.runtime.v1.linux"
          runtime_engine = ""
          runtime_root = ""
          pod_annotations = []
          container_annotations = []
          privileged_without_host_devices = false
          base_runtime_spec = ""
          cni_conf_dir = "/etc/cni/net.d"
          cni_max_conf_num = 1

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
            NoPivotRoot = false
            NoNewKeyring = false
            ShimCgroup = ""
            IoUid = 0
            IoGid = 0
            BinaryName = ""
            Root = ""
            CriuPath = ""
            SystemdCgroup = true
            CriuImagePath = ""
            CriuWorkPath = ""

    [plugins."io.containerd.grpc.v1.cri".cni]
      bin_dir = "/opt/cni/bin"
      conf_dir = "/etc/cni/net.d"
      max_conf_num = 1
      conf_template = ""
      ip_pref = "ipv4"

    [plugins."io.containerd.grpc.v1.cri".registry]

      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]

        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
          endpoint = ["https://artifactory.wgdp.io/v2/docker-mirror/", "https://registry-1.docker.io"]

    [plugins."io.containerd.grpc.v1.cri".image_decryption]
      key_model = "node"

    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
      tls_cert_file = ""
      tls_key_file = ""

  [plugins."io.containerd.internal.v1.opt"]
    path = "/opt/containerd"

  [plugins."io.containerd.internal.v1.restart"]
    interval = "10s"

  [plugins."io.containerd.metadata.v1.bolt"]
    content_sharing_policy = "shared"

  [plugins."io.containerd.monitor.v1.cgroups"]
    no_prometheus = false

  [plugins."io.containerd.runtime.v1.linux"]
    shim = "containerd-shim"
    runtime = "runc"
    runtime_root = ""
    no_shim = false
    shim_debug = false

  [plugins."io.containerd.runtime.v2.task"]
    platforms = ["linux/amd64"]

  [plugins."io.containerd.service.v1.diff-service"]
    default = ["walking"]

  [plugins."io.containerd.snapshotter.v1.devmapper"]
    root_path = ""
    pool_name = ""
    base_image_size = ""
    async_remove = false

Metadata

Metadata

Assignees

No one assigned

    Type

    Projects

    Status

    Todo

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions