We have identified that there is some kind of leak of cgroup mounts, which results in e.g. the following lines in /proc/self/mountinfo:
root@kube-node01:~# cat /proc/self/mountinfo | grep cgroup
30 21 0:26 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:9 - tmpfs tmpfs ro,mode=755
31 30 0:27 / /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime shared:10 - cgroup2 cgroup2 rw
32 30 0:28 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,xattr,name=systemd
35 30 0:31 / /sys/fs/cgroup/rdma rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,rdma
36 30 0:32 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,cpu,cpuacct
37 30 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,freezer
38 30 0:34 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,blkio
39 30 0:35 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids
40 30 0:36 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:20 - cgroup cgroup rw,cpuset
41 30 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,memory
42 30 0:38 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:22 - cgroup cgroup rw,net_cls,net_prio
43 30 0:39 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:23 - cgroup cgroup rw,hugetlb
44 30 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:24 - cgroup cgroup rw,perf_event
45 30 0:41 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:25 - cgroup cgroup rw,devices
945 25 0:159 / /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:606 - tmpfs tmpfs rw,mode=755
979 945 0:28 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,xattr,name=systemd
1471 945 0:31 / /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/rdma rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,rdma
1696 945 0:32 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,cpu,cpuacct
1714 945 0:33 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,freezer
2099 945 0:34 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,blkio
2345 945 0:35 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids
2390 945 0:36 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:20 - cgroup cgroup rw,cpuset
2409 945 0:37 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,memory
2428 945 0:38 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:22 - cgroup cgroup rw,net_cls,net_prio
2447 945 0:39 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:23 - cgroup cgroup rw,hugetlb
2466 945 0:40 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:24 - cgroup cgroup rw,perf_event
2485 945 0:41 /kubepods/pod2eb4976a-5001-4c2b-b5cf-df562549e3d4/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c /run/containerd/io.containerd.runtime.v1.linux/k8s.io/2e75a956e9db372ddf40ef4c32d148100010386140238c6173ad6e4d45fd8e1c/rootfs/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:25 - cgroup cgroup rw,devices
This results in the above output.
diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go
index dbcc58f5..25d57efe 100644
--- a/libcontainer/cgroups/utils.go
+++ b/libcontainer/cgroups/utils.go
@@ -208,6 +208,9 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
Mountpoint: fields[4],
Root: fields[3],
}
+ if strings.HasPrefix(m.Mountpoint, "/run/foo") {
+ continue
+ }
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
seen, known := ss[opt]
if !known || (!all && seen) {
Of course, this does not work for upstream; to fix the original leak I would at least need to match on something like /run/containerd/.
I came across this while debugging kubernetes/kubernetes#91023 together with containerd v1.3.4, which ships runc:
We have identified that there is some kind of leak of cgroup mounts, which results in e.g. the following lines in
`/proc/self/mountinfo`:
When such a leak does exist, runc tries to use a wrong cgroup during
`libcontainer/rootfs_linux`'s `prepareRootfs`.
I was able to reproduce the bug by:
This results in the above output.
I was able to debug a bit into runc here and found the following
The function
`GetCgroupMounts(false)` returns in this case the wrong mountpoint for the systemd cgroup (`/run/foo/rootfs/sys/fs/cgroup/systemd` instead of `/sys/fs/cgroup/systemd`).
This is because in
`/proc/self/mountinfo` the mount `/run/foo/rootfs/sys/fs/cgroup/systemd` occurred before `/sys/fs/cgroup/systemd` (which seems weird to me, because looking at `/proc/self/mountinfo` myself and processing it would order them the other way around).
As a POC I added the following patch to runc, which fixed it for my test case:
Of course, this does not work for upstream; to fix the original leak I would at least need to match on something like
`/run/containerd/`.