-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Open
Description
Description
containerd is unable to stop a container under a specific condition.
What exactly occurred:
- The container process (PID 12412) exited with exit status 0.
- A containerd event was triggered to handle the container exit.
Note
- The event handler has a 10-second timeout for each event.
- Pod status has been updated:
"status": {
"id": "d102f495d90c786957cd58cc877de75b9da599da9cba0273d8f754adb8fb1525",
"metadata": {
"attempt": 0,
"name": "test-job-28285990-gthsg",
"namespace": "app-namespace",
"uid": "229f7a47-5a39-41f4-8f25-48fccff83740"
},
"state": "SANDBOX_NOTREADY",
"createdAt": "2023-10-13T01:10:01.378923372Z",
"network": {
"additionalIps": [],
"ip": ""
}
}
- handleContainerExit call ended with error
"failed to stop container: context deadline exceeded: unknown"
- After that, the container state can no longer be changed, because the PID no longer exists and the exit event was not handled properly.
Error from log:
Oct 13 16:12:08 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:12:08.020634516Z" level=error msg="Failed to handle backOff event &TaskExit{ContainerID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,ID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,Pid:12412,ExitStatus:0,ExitedAt:2023-10-13 01:10:17.977124706 +0000 UTC,XXX_unrecognized:[],} for 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd" error="failed to handle container TaskExit event: failed to cleanup container 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd in task-service: container must be created: failed precondition"
full log:
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.243451096Z" level=info msg="CreateContainer within sandbox \"d102f495d90c786957cd58cc877de75b9da599da9cba0273d8f754adb8fb1525\" for container &ContainerMetadata{Name:cache-job,Attempt:0,}"
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.307650802Z" level=info msg="CreateContainer within sandbox \"d102f495d90c786957cd58cc877de75b9da599da9cba0273d8f754adb8fb1525\" for &ContainerMetadata{Name:cache-job,Attempt:0,} returns container id \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\""
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.308485075Z" level=info msg="StartContainer for \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\""
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.310733930Z" level=info msg="shim containerd-shim started" address="unix:///run/containerd/s/531c24254735587221a2ea954cdeda9d083e903d10112a9b57b424f45bf0fc6d" debug=false pid=12393
Oct 13 01:10:15 hostname containerd[1548]: time="2023-10-13T01:10:15.446857438Z" level=info msg="StartContainer for \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\" returns successfully"
Oct 13 01:10:18 hostname containerd[1548]: time="2023-10-13T01:10:18.510862521Z" level=error msg="collecting metrics for 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd" error="cgroups: cgroup deleted"
Oct 13 01:10:27 hostname containerd[1548]: time="2023-10-13T01:10:27.980083052Z" level=error msg="failed to handle container TaskExit event &TaskExit{ContainerID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,ID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,Pid:12412,ExitStatus:0,ExitedAt:2023-10-13 01:10:17.977124706 +0000 UTC,XXX_unrecognized:[],}" error="failed to stop container: context deadline exceeded: unknown"
Oct 13 16:16:58 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:16:58.645935223Z" level=info msg="StopContainer for \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\" with timeout 30 (s)"
Oct 13 16:16:58 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:16:58.646613873Z" level=info msg="StopContainer for \"680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd\" returns successfully"
Oct 13 16:17:02 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:17:02.704244442Z" level=error msg="collecting metrics for 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd" error="cgroups: cgroup deleted"
Oct 13 16:17:09 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:17:09.019842710Z" level=info msg="TaskExit event &TaskExit{ContainerID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,ID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,Pid:12412,ExitStatus:0,ExitedAt:2023-10-13 01:10:17.977124706 +0000 UTC,XXX_unrecognized:[],}"
Oct 13 16:17:09 ed-c16-208-221-76 containerd[1548]: time="2023-10-13T16:17:09.020717601Z" level=error msg="Failed to handle backOff event &TaskExit{ContainerID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,ID:680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd,Pid:12412,ExitStatus:0,ExitedAt:2023-10-13 01:10:17.977124706 +0000 UTC,XXX_unrecognized:[],} for 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd" error="failed to handle container TaskExit event: failed to cleanup container 680d7356661f9f87a602b589bd5cec535bf3a674a65bd38966072b75a779a8fd in task-service: : failed precondition"
containerd version: 1.6.24 61f9fd8
Steps to reproduce the issue
No steps to reproduce.
Describe the results you received and expected
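Expected: the container is cleaned up after its process exits. Received: handling of the TaskExit event keeps failing (see the log above) and the container is never cleaned up.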
What version of containerd are you using?
1.6.24 61f9fd8
Any other relevant information
No response
Show configuration if it is related to CRI plugin.
# Top-level daemon settings from the reporter's containerd config
# (containerd 1.6.24, per the version stated in this report).
version = 2
# Directories the daemon is pointed at.
root = "/var/lib/containerd"
state = "/run/containerd"
plugin_dir = ""
# No plugins are explicitly disabled or required.
disabled_plugins = []
required_plugins = []
oom_score = 0
# gRPC endpoint: address is a socket path under /run/containerd; the TCP
# address and its TLS cert/key are left as empty strings.
[grpc]
address = "/run/containerd/containerd.sock"
tcp_address = ""
tcp_tls_cert = ""
tcp_tls_key = ""
uid = 0
gid = 0
# 16777216 = 16 * 1024 * 1024 (presumably bytes, i.e. 16 MiB — confirm).
max_recv_message_size = 16777216
max_send_message_size = 16777216
# ttrpc endpoint: address is an empty string here.
# NOTE(review): presumably a built-in default address is used — confirm.
[ttrpc]
address = ""
uid = 0
gid = 0
# Debug endpoint and log level are both empty strings.
# NOTE(review): the logs in this report only contain info/error entries;
# presumably the default level applies — confirm, and consider reproducing
# with a more verbose level.
[debug]
address = ""
uid = 0
gid = 0
level = ""
# Metrics endpoint: address is an empty string.
# NOTE(review): presumably this means no metrics listener is exposed — confirm.
[metrics]
address = ""
grpc_histogram = false
# Daemon cgroup path is left empty.
[cgroup]
path = ""
# Shim/task timeouts configured by the reporter (2s-5s).
# NOTE(review): the report describes a 10-second event-handler timeout; no
# key in this table is set to 10s, so that timeout presumably is not
# controlled from here — confirm against the CRI event-handling code.
[timeouts]
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"
# Per-plugin configuration starts here; the parent table carries no keys.
[plugins]
# GC scheduler plugin thresholds and delays.
[plugins."io.containerd.gc.v1.scheduler"]
pause_threshold = 0.02
deletion_threshold = 0
mutation_threshold = 100
schedule_delay = "0s"
startup_delay = "100ms"
# CRI plugin configuration (the plugin that logged the StopContainer and
# TaskExit messages quoted in this report).
[plugins."io.containerd.grpc.v1.cri"]
disable_tcp_service = true
stream_server_address = "127.0.0.1"
stream_server_port = "0"
stream_idle_timeout = "4h0m0s"
enable_selinux = true
selinux_category_range = 1024
sandbox_image = "k8s.gcr.io/pause:3.1"
# NOTE(review): presumably seconds — confirm.
stats_collect_period = 10
# systemd cgroup handling is enabled here and again via SystemdCgroup in the
# runc options further down.
systemd_cgroup = true
enable_tls_streaming = false
tolerate_missing_hugetlb_controller = true
ignore_image_defined_volumes = false
netns_mounts_under_state_dir = false
max_container_log_line_size = 16384
disable_cgroup = false
disable_apparmor = true
restrict_oom_score_adj = false
max_concurrent_downloads = 3
disable_proc_mount = false
unset_seccomp_profile = ""
disable_hugetlb_controller = true
enable_unprivileged_ports = false
enable_unprivileged_icmp = false
# Container runtime selection: overlayfs snapshotter, default runtime "runc".
[plugins."io.containerd.grpc.v1.cri".containerd]
snapshotter = "overlayfs"
default_runtime_name = "runc"
no_pivot = false
disable_snapshot_annotations = true
discard_unpacked_layers = false
# default_runtime and untrusted_workload_runtime are present but every string
# field in them is empty.
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
base_runtime_spec = ""
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
base_runtime_spec = ""
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
# The "runc" runtime is mapped to the v1 linux runtime type; this matches the
# "shim containerd-shim started" line in the quoted log.
# NOTE(review): to my knowledge the v1 linux runtime is deprecated upstream —
# confirm, and check whether the problem reproduces with a v2 runc shim.
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runtime.v1.linux"
runtime_engine = ""
runtime_root = ""
pod_annotations = []
container_annotations = []
privileged_without_host_devices = false
base_runtime_spec = ""
cni_conf_dir = "/etc/cni/net.d"
cni_max_conf_num = 1
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
NoPivotRoot = false
NoNewKeyring = false
ShimCgroup = ""
IoUid = 0
IoGid = 0
BinaryName = ""
Root = ""
CriuPath = ""
SystemdCgroup = true
CriuImagePath = ""
CriuWorkPath = ""
# CNI plugin binary and configuration directories.
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/opt/cni/bin"
conf_dir = "/etc/cni/net.d"
max_conf_num = 1
conf_template = ""
ip_pref = "ipv4"
# Registry mirrors: docker.io has two endpoints listed, a private mirror
# first and then registry-1.docker.io.
[plugins."io.containerd.grpc.v1.cri".registry]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
endpoint = ["https://artifactory.wgdp.io/v2/docker-mirror/", "https://registry-1.docker.io"]
[plugins."io.containerd.grpc.v1.cri".image_decryption]
key_model = "node"
# Streaming TLS key pair is unset (enable_tls_streaming is false above).
[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
tls_cert_file = ""
tls_key_file = ""
# Remaining plugin tables from the reporter's config.
[plugins."io.containerd.internal.v1.opt"]
path = "/opt/containerd"
[plugins."io.containerd.internal.v1.restart"]
interval = "10s"
[plugins."io.containerd.metadata.v1.bolt"]
content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"]
no_prometheus = false
# v1 linux runtime plugin: this is the runtime type the CRI "runc" runtime
# above points at; it launches the "containerd-shim" binary with "runc".
[plugins."io.containerd.runtime.v1.linux"]
shim = "containerd-shim"
runtime = "runc"
runtime_root = ""
no_shim = false
shim_debug = false
[plugins."io.containerd.runtime.v2.task"]
platforms = ["linux/amd64"]
[plugins."io.containerd.service.v1.diff-service"]
default = ["walking"]
# devmapper snapshotter is left unconfigured (all string fields empty); the
# CRI plugin above selects the overlayfs snapshotter instead.
[plugins."io.containerd.snapshotter.v1.devmapper"]
root_path = ""
pool_name = ""
base_image_size = ""
async_remove = false
Metadata
Metadata
Assignees
Type
Projects
Status
Todo