Skip to content

supposedly running container actually dead #10589

@clementnuss

Description

@clementnuss

Description

containerd shows a container as running, although the process (PID) for that container no longer exists. Furthermore, when inspecting the pod cgroup (v2), we only find the /pause process, and when restarting containerd, the container is indeed marked as exited.

Steps to reproduce the issue

We haven't found a way to reliably reproduce the bug. We have noticed that it occasionally happens with redis containers when they get OOM-killed. We would be happy to get some tips on how to reproduce it.

Describe the results you received and expected

The container is shown as RUNNING even though the container's process is no longer running. We expected containerd to be aware that the container process had stopped.

What version of containerd are you using?

containerd github.com/containerd/containerd v1.7.20 8fc6bcf

Any other relevant information

Kubernetes v1.29 (kubeadm-deployed), containerd v1.7.20, Debian 12.

runc --version
runc --version
runc version 1.1.13
commit: v1.1.13-0-g58aa9203-dirty
spec: 1.0.2-dev
go: go1.21.11
libseccomp: 2.5.5
crictl info
crictl info
{
  "status": {
    "conditions": [
      {
        "type": "RuntimeReady",
        "status": true,
        "reason": "",
        "message": ""
      },
      {
        "type": "NetworkReady",
        "status": true,
        "reason": "",
        "message": ""
      },
      {
        "type": "ContainerdHasNoDeprecationWarnings",
        "status": true,
        "reason": "",
        "message": ""
      }
    ]
  },
  "cniconfig": {
    "PluginDirs": [
      "/opt/cni/bin"
    ],
    "PluginConfDir": "/etc/cni/net.d",
    "PluginMaxConfNum": 1,
    "Prefix": "eth",
    "Networks": [
      {
        "Config": {
          "Name": "cni-loopback",
          "CNIVersion": "0.3.1",
          "Plugins": [
            {
              "Network": {
                "type": "loopback",
                "ipam": {},
                "dns": {}
              },
              "Source": "{\"type\":\"loopback\"}"
            }
          ],
          "Source": "{\n\"cniVersion\": \"0.3.1\",\n\"name\": \"cni-loopback\",\n\"plugins\": [{\n  \"type\": \"loopback\"\n}]\n}"
        },
        "IFName": "lo"
      },
      {
        "Config": {
          "Name": "cilium",
          "CNIVersion": "0.3.1",
          "Plugins": [
            {
              "Network": {
                "type": "cilium-cni",
                "ipam": {},
                "dns": {}
              },
              "Source": "{\"enable-debug\":false,\"log-file\":\"/var/run/cilium/cilium-cni.log\",\"type\":\"cilium-cni\"}"
            }
          ],
          "Source": "\n{\n  \"cniVersion\": \"0.3.1\",\n  \"name\": \"cilium\",\n  \"plugins\": [\n    {\n       \"type\": \"cilium-cni\",\n       \"enable-debug\": false,\n       \"log-file\": \"/var/run/cilium/cilium-cni.log\"\n    }\n  ]\n}"
        },
        "IFName": "eth0"
      }
    ]
  },
  "config": {
    "containerd": {
      "snapshotter": "overlayfs",
      "defaultRuntimeName": "runc",
      "defaultRuntime": {
        "runtimeType": "",
        "runtimePath": "",
        "runtimeEngine": "",
        "PodAnnotations": null,
        "ContainerAnnotations": null,
        "runtimeRoot": "",
        "options": null,
        "privileged_without_host_devices": false,
        "privileged_without_host_devices_all_devices_allowed": false,
        "baseRuntimeSpec": "",
        "cniConfDir": "",
        "cniMaxConfNum": 0,
        "snapshotter": "",
        "sandboxMode": ""
      },
      "untrustedWorkloadRuntime": {
        "runtimeType": "",
        "runtimePath": "",
        "runtimeEngine": "",
        "PodAnnotations": null,
        "ContainerAnnotations": null,
        "runtimeRoot": "",
        "options": null,
        "privileged_without_host_devices": false,
        "privileged_without_host_devices_all_devices_allowed": false,
        "baseRuntimeSpec": "",
        "cniConfDir": "",
        "cniMaxConfNum": 0,
        "snapshotter": "",
        "sandboxMode": ""
      },
      "runtimes": {
        "runc": {
          "runtimeType": "io.containerd.runc.v2",
          "runtimePath": "",
          "runtimeEngine": "",
          "PodAnnotations": null,
          "ContainerAnnotations": null,
          "runtimeRoot": "",
          "options": {
            "SystemdCgroup": true
          },
          "privileged_without_host_devices": false,
          "privileged_without_host_devices_all_devices_allowed": false,
          "baseRuntimeSpec": "",
          "cniConfDir": "",
          "cniMaxConfNum": 0,
          "snapshotter": "",
          "sandboxMode": "podsandbox"
        }
      },
      "noPivot": false,
      "disableSnapshotAnnotations": true,
      "discardUnpackedLayers": false,
      "ignoreBlockIONotEnabledErrors": false,
      "ignoreRdtNotEnabledErrors": false
    },
    "cni": {
      "binDir": "/opt/cni/bin",
      "confDir": "/etc/cni/net.d",
      "maxConfNum": 1,
      "setupSerially": false,
      "confTemplate": "",
      "ipPref": ""
    },
    "registry": {
      "configPath": "/etc/containerd/certs.d",
      "mirrors": null,
      "configs": null,
      "auths": null,
      "headers": null
    },
    "imageDecryption": {
      "keyModel": ""
    },
    "disableTCPService": true,
    "streamServerAddress": "127.0.0.1",
    "streamServerPort": "0",
    "streamIdleTimeout": "4h0m0s",
    "enableSelinux": false,
    "selinuxCategoryRange": 1024,
    "sandboxImage": "registry.k8s.io/pause:3.8",
    "statsCollectPeriod": 10,
    "systemdCgroup": false,
    "enableTLSStreaming": false,
    "x509KeyPairStreaming": {
      "tlsCertFile": "",
      "tlsKeyFile": ""
    },
    "maxContainerLogSize": 10000,
    "disableCgroup": false,
    "disableApparmor": false,
    "restrictOOMScoreAdj": false,
    "maxConcurrentDownloads": 3,
    "disableProcMount": false,
    "unsetSeccompProfile": "",
    "tolerateMissingHugetlbController": true,
    "disableHugetlbController": true,
    "device_ownership_from_security_context": false,
    "ignoreImageDefinedVolumes": false,
    "netnsMountsUnderStateDir": false,
    "enableUnprivilegedPorts": false,
    "enableUnprivilegedICMP": false,
    "enableCDI": true,
    "cdiSpecDirs": [
      "/etc/cdi",
      "/var/run/cdi"
    ],
    "imagePullProgressTimeout": "5m0s",
    "drainExecSyncIOTimeout": "0s",
    "imagePullWithSyncFs": false,
    "ignoreDeprecationWarnings": null,
    "containerdRootDir": "/var/lib/containerd",
    "containerdEndpoint": "/run/containerd/containerd.sock",
    "rootDir": "/var/lib/containerd/io.containerd.grpc.v1.cri",
    "stateDir": "/run/containerd/io.containerd.grpc.v1.cri"
  },
  "golang": "go1.21.12",
  "lastCNILoadStatus": "OK",
  "lastCNILoadStatus.default": "OK"
}
uname -a
Linux t1-k8s-alsu025 6.1.0-21-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.1.90-1 (2024-05-03) x86_64 GNU/Linux
crictl inspect
{
  "status": {
    "id": "2594a5faa42199dfcefb468de17fc8b2aceee27e31519e505aef73a2cb8eaee2",
    "metadata": {
      "attempt": 6,
      "name": "redis"
    },
    "state": "CONTAINER_RUNNING",
    "createdAt": "2024-08-05T20:29:52.3390638+02:00",
    "startedAt": "2024-08-05T20:29:52.441284768+02:00",
    "finishedAt": "0001-01-01T00:00:00Z",
    "exitCode": 0,
    "image": {
      "annotations": {},
      "image": ".../redis-cluster:6.2.12-debian-11-r23",
      "userSpecifiedImage": ""
    },
    "reason": "OOMKilled",
    "message": "",
    "resources": {
      "linux": {
        "cpuPeriod": "100000",
        "cpuQuota": "0",
        "cpuShares": "51",
        "cpusetCpus": "",
        "cpusetMems": "",
        "hugepageLimits": [],
        "memoryLimitInBytes": "268435456",
        "memorySwapLimitInBytes": "268435456",
        "oomScoreAdj": "999",
        "unified": {
          "memory.oom.group": "1",
          "memory.swap.max": "0"
        }
      },
      "windows": null
    }
  },
  "info": {
    "sandboxID": "4db3bfdad71867f3edfe3146571b74b7beced902a0f5f0f6b908e0e43bdce2d2",
    "pid": 2631444,
    "removing": false,
    "snapshotKey": "2594a5faa42199dfcefb468de17fc8b2aceee27e31519e505aef73a2cb8eaee2",
    "snapshotter": "overlayfs",
    "runtimeType": "io.containerd.runc.v2",
    "runtimeOptions": {
      "systemd_cgroup": true
    },
    "config": {
      "metadata": {
        "name": "redis",
        "attempt": 6
      },
      "linux": {
        "resources": {
          "cpu_period": 100000,
          "cpu_shares": 51,
          "memory_limit_in_bytes": 268435456,
          "oom_score_adj": 999,
          "hugepage_limits": [
            {
              "page_size": "2MB"
            },
            {
              "page_size": "1GB"
            }
          ],
          "unified": {
            "memory.oom.group": "1",
            "memory.swap.max": "0"
          }
        },
        "security_context": {
          "namespace_options": {
            "pid": 1
          },
          "run_as_user": {
            "value": 1001
          },
          "supplemental_groups": [
            1001
          ],
          "masked_paths": [
            "/proc/asound",
            "/proc/acpi",
            "/proc/kcore",
            "/proc/keys",
            "/proc/latency_stats",
            "/proc/timer_list",
            "/proc/timer_stats",
            "/proc/sched_debug",
            "/proc/scsi",
            "/sys/firmware"
          ],
          "readonly_paths": [
            "/proc/bus",
            "/proc/fs",
            "/proc/irq",
            "/proc/sys",
            "/proc/sysrq-trigger"
          ],
          "seccomp": {
            "profile_type": 1
          }
        }
      }
    },
    "runtimeSpec": {
      "ociVersion": "1.1.0",
      "process": {
        "user": {
          "uid": 1001,
          "gid": 0,
          "additionalGids": [
            0,
            1001
          ]
        },
        "oomScoreAdj": 999
      },
      "root": {
        "path": "rootfs"
      },
      "linux": {
        "resources": {
          "devices": [
            {
              "allow": false,
              "access": "rwm"
            }
          ],
          "memory": {
            "limit": 268435456,
            "swap": 268435456
          },
          "cpu": {
            "shares": 51,
            "period": 100000
          },
          "unified": {
            "memory.oom.group": "1",
            "memory.swap.max": "0"
          }
        },
        "cgroupsPath": "kubepods-burstable-pod3bbf7d7a_a8d4_41bc_9cea_3e0e96321f65.slice:cri-containerd:2594a5faa42199dfcefb468de17fc8b2aceee27e31519e505aef73a2cb8eaee2",
        "namespaces": [
          {
            "type": "pid"
          },
          {
            "type": "ipc",
            "path": "/proc/4056906/ns/ipc"
          },
          {
            "type": "uts",
            "path": "/proc/4056906/ns/uts"
          },
          {
            "type": "mount"
          },
          {
            "type": "network",
            "path": "/proc/4056906/ns/net"
          },
          {
            "type": "cgroup"
          }
        ],
        "maskedPaths": [
          "/proc/asound",
          "/proc/acpi",
          "/proc/kcore",
          "/proc/keys",
          "/proc/latency_stats",
          "/proc/timer_list",
          "/proc/timer_stats",
          "/proc/sched_debug",
          "/proc/scsi",
          "/sys/firmware"
        ],
        "readonlyPaths": [
          "/proc/bus",
          "/proc/fs",
          "/proc/irq",
          "/proc/sys",
          "/proc/sysrq-trigger"
        ]
      }
    }
  }
}
systemd-cgls within the supposedly running pod
systemd-cgls
Working directory /sys/fs/cgroup/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3bbf7d7a_a8d4_41bc_9cea_3e0e96321f65.slice:
└─cri-containerd-4db3bfdad71867f3edfe3146571b74b7beced902a0f5f0f6b908e0e43bdce2d2.scope … (#9418267)
  → user.invocation_id: 7acd8b7f51304a8c8bf691483b192621
  → trusted.invocation_id: 7acd8b7f51304a8c8bf691483b192621
  → user.delegate: 1
  → trusted.delegate: 1
  └─4056906 /pause

/cc @LeTT00r

Show configuration if it is related to CRI plugin.

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    area/cri (Container Runtime Interface (CRI)), kind/bug

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions