Skip to content

runc has problems due to leaked mount information #2404

@chrischdi

Description

@chrischdi

I'm coming over her by debugging kubernetes/kubernetes#91023 together with containerd v1.3.4 which ships runc:

runc version 1.0.0-rc10
commit: dc9208a3303feef5b3839f4323d9beb36df0a9dd-dirty
spec: 1.0.1-dev

We have identified that there is some kind of leak of cgroup mounts which result in e.g. the following lines in /proc/self/mountinfo:

root@kube-node01:~# cat /proc/self/mountinfo | grep cgroup
30 21 0:26 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:9 - tmpfs tmpfs ro,mode=755
31 30 0:27 / /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime shared:10 - cgroup2 cgroup2 rw
32 30 0:28 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,xattr,name=systemd
35 30 0:31 / /sys/fs/cgroup/rdma rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,rdma
36 30 0:32 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,cpu,cpuacct
37 30 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,freezer
38 30 0:34 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,blkio
39 30 0:35 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids
40 30 0:36 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:20 - cgroup cgroup rw,cpuset
41 30 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,memory
42 30 0:38 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:22 - cgroup cgroup rw,net_cls,net_prio
43 30 0:39 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:23 - cgroup cgroup rw,hugetlb
44 30 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:24 - cgroup cgroup rw,perf_event
45 30 0:41 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:25 - cgroup cgroup rw,devices
945 25 0:159 / /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:606 - tmpfs tmpfs rw,mode=755
979 945 0:28 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,xattr,name=systemd
1471 945 0:31 / /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/rdma rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,rdma
1696 945 0:32 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,cpu,cpuacct
1714 945 0:33 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,freezer
2099 945 0:34 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,blkio
2345 945 0:35 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids
2390 945 0:36 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:20 - cgroup cgroup rw,cpuset
2409 945 0:37 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,memory
2428 945 0:38 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:22 - cgroup cgroup rw,net_cls,net_prio
2447 945 0:39 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:23 - cgroup cgroup rw,hugetlb
2466 945 0:40 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:24 - cgroup cgroup rw,perf_event
2485 945 0:41 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:25 - cgroup cgroup rw,devices

When such a leak does exist, runc tries to use use a wrong cgroup during libcontainer/rootfs_linux's prepareRootfs.

May 14 07:10:35 c04pc003-kube-node03 containerd[28410]: time="2020-05-14T07:10:35.468879890Z" level=error msg="StartContainer for \"b19c361eda463efba332d6a6a365ea4c48ec461823b4ae01456033c95f663372\" failed" error="failed to create containerd task: OCI runtime create failed: container
_linux.go:349: starting container process caused \"process_linux.go:449: container init caused \\\"rootfs_linux.go:58: mounting \\\\\\\"cgroup\\\\\\\" to rootfs \\\\\\\"/run/containerd/io.containerd.runtime.v1.linux/k8s.io/b19c361eda463efba332d6a6a365ea4c48ec461823b4ae01456033c95f663
372/rootfs\\\\\\\" at \\\\\\\"/sys/fs/cgroup\\\\\\\" caused \\\\\\\"stat /run/foo/rootfs/sys/fs/pod81924e98-98f5-45b8-9bb7-0e8ee2f77d12/b19c361eda463efba332d6a6a365ea4c48ec461823b4ae01456033c95f663372: no such file or directory\\\\\\\"\\\"\": unknown"

I was able to reproduce the bug by:

  • Adding the patch mentioned here to the kubelet
  • Replacing and restarting the kubelet
  • Adding a broken mount manually by:
    # create target dummy dir
    mkdir -p /run/foo/rootfs/sys/fs/cgroup/systemd
    # get a container cgroup dir in /sys/fs/cgroup/systemd/kubepods
    POD=$(find /sys/fs/cgroup/systemd/kubepods/ -maxdepth 1 -name "pod*" | head -n 1)
    SRCDIR="$POD/$(ls -1 $POD | grep -v -E -e '^cgroup|^notify|^tasks' | head -n 1)"
    mount -o bind "$SRCDIR" "/run/foo/rootfs/sys/fs/cgroup/systemd"
    
  • creating pods

This results in the above output.

I was able to debug a bit into runc here and found the following
The function GetCgroupMounts(false) returns in this case the wrong mountpoint for the systemd cgroup (/run/foo/rootfs/sys/fs/cgroup/systemd insetad of /sys/fs/cgroup/systemd).

This is because in /proc/self/mountinfo the mount /run/foo/rootfs/sys/fs/cgroup/systemd occured before /sys/fs/cgroup/systemd (which seems weird for me, because having a look myself to /proc/self/mountinfo and processing it would order them the other way around).

As a POC I added the following patch to runc which fixed it for my test case:

diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go
index dbcc58f5..25d57efe 100644
--- a/libcontainer/cgroups/utils.go
+++ b/libcontainer/cgroups/utils.go
@@ -208,6 +208,9 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
                        Mountpoint: fields[4],
                        Root:       fields[3],
                }
+               if strings.HasPrefix(m.Mountpoint, "/run/foo") {
+                       continue
+               }
                for _, opt := range strings.Split(fields[len(fields)-1], ",") {
                        seen, known := ss[opt]
                        if !known || (!all && seen) {

of course this does not work for upstream, at least to fix the original leak I would need to match on something like /run/containerd/.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions