Skip to content

Commit 8a1280b

Browse files
committed
metrics/cgroups: fix deadlock issue in Add during Collect
The Collector.Collect will be the field ns'Collect's callback, which be invoked periodically with internal lock. And Collector.Add also runs with ns.Lock in Collector.Lock, which is easy to cause deadlock. Goroutine X: ns.Collect ns.Lock Collector.Collect Collector.RLock Goroutine Y: Collector.Add Collector.Lock ns.Lock We should use ns.Lock without Collector.Lock in Add. Fix: #6772 Signed-off-by: Wei Fu <fuweid89@gmail.com>
1 parent f033f6f commit 8a1280b

5 files changed

Lines changed: 266 additions & 36 deletions

File tree

Vagrantfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ EOF
228228
set -eux -o pipefail
229229
rm -rf /var/lib/containerd-test /run/containerd-test
230230
cd ${GOPATH}/src/github.com/containerd/containerd
231+
go test -v -count=1 -race ./metrics/cgroups
231232
make integration EXTRA_TESTFLAGS="-timeout 15m -no-criu -test.v" TEST_RUNTIME=io.containerd.runc.v2 RUNC_FLAVOR=$RUNC_FLAVOR
232233
SHELL
233234
end

metrics/cgroups/common/type.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
//go:build linux
2+
// +build linux
3+
4+
/*
5+
Copyright The containerd Authors.
6+
7+
Licensed under the Apache License, Version 2.0 (the "License");
8+
you may not use this file except in compliance with the License.
9+
You may obtain a copy of the License at
10+
11+
http://www.apache.org/licenses/LICENSE-2.0
12+
13+
Unless required by applicable law or agreed to in writing, software
14+
distributed under the License is distributed on an "AS IS" BASIS,
15+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
See the License for the specific language governing permissions and
17+
limitations under the License.
18+
*/
19+
20+
package common
21+
22+
import (
23+
"context"
24+
25+
"github.com/gogo/protobuf/types"
26+
)
27+
28+
// Statable type that returns cgroup metrics
29+
type Statable interface {
30+
ID() string
31+
Namespace() string
32+
Stats(context.Context) (*types.Any, error)
33+
}

metrics/cgroups/metrics_test.go

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
//go:build linux
2+
// +build linux
3+
4+
/*
5+
Copyright The containerd Authors.
6+
7+
Licensed under the Apache License, Version 2.0 (the "License");
8+
you may not use this file except in compliance with the License.
9+
You may obtain a copy of the License at
10+
11+
http://www.apache.org/licenses/LICENSE-2.0
12+
13+
Unless required by applicable law or agreed to in writing, software
14+
distributed under the License is distributed on an "AS IS" BASIS,
15+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
See the License for the specific language governing permissions and
17+
limitations under the License.
18+
*/
19+
20+
package cgroups
21+
22+
import (
23+
"context"
24+
"strconv"
25+
"sync"
26+
"testing"
27+
"time"
28+
29+
"github.com/containerd/cgroups"
30+
"github.com/containerd/containerd/metrics/cgroups/common"
31+
v1 "github.com/containerd/containerd/metrics/cgroups/v1"
32+
v2 "github.com/containerd/containerd/metrics/cgroups/v2"
33+
v1types "github.com/containerd/containerd/metrics/types/v1"
34+
v2types "github.com/containerd/containerd/metrics/types/v2"
35+
"github.com/containerd/containerd/protobuf"
36+
"github.com/prometheus/client_golang/prometheus"
37+
38+
metrics "github.com/docker/go-metrics"
39+
"github.com/gogo/protobuf/types"
40+
)
41+
42+
// TestRegressionIssue6772 should not have dead-lock when Collect and Add run
43+
// in the same time.
44+
//
45+
// Issue: https://github.com/containerd/containerd/issues/6772.
46+
func TestRegressionIssue6772(t *testing.T) {
47+
ns := metrics.NewNamespace("test-container", "", nil)
48+
isV1 := true
49+
50+
var collecter Collecter
51+
if cgroups.Mode() == cgroups.Unified {
52+
isV1 = false
53+
collecter = v2.NewCollector(ns)
54+
} else {
55+
collecter = v1.NewCollector(ns)
56+
}
57+
58+
doneCh := make(chan struct{})
59+
defer close(doneCh)
60+
61+
maxItem := 100
62+
startCh := make(chan struct{})
63+
64+
metricCh := make(chan prometheus.Metric, maxItem)
65+
66+
go func() {
67+
for {
68+
select {
69+
case <-doneCh:
70+
return
71+
case <-metricCh:
72+
}
73+
}
74+
}()
75+
76+
go func() {
77+
// pulling the metrics to trigger dead-lock
78+
ns.Collect(metricCh)
79+
close(startCh)
80+
81+
for {
82+
select {
83+
case <-doneCh:
84+
return
85+
default:
86+
}
87+
88+
ns.Collect(metricCh)
89+
}
90+
}()
91+
<-startCh
92+
93+
labels := map[string]string{"issue": "6772"}
94+
errCh := make(chan error, 1)
95+
96+
var wg sync.WaitGroup
97+
for i := 0; i < maxItem; i++ {
98+
id := i
99+
wg.Add(1)
100+
101+
go func() {
102+
defer wg.Done()
103+
104+
err := collecter.Add(
105+
&mockStatT{
106+
id: strconv.Itoa(id),
107+
namespace: "issue6772",
108+
isV1: isV1,
109+
},
110+
labels,
111+
)
112+
if err != nil {
113+
errCh <- err
114+
}
115+
}()
116+
}
117+
118+
finishedCh := make(chan struct{})
119+
go func() {
120+
defer close(finishedCh)
121+
122+
wg.Wait()
123+
}()
124+
125+
select {
126+
case err := <-errCh:
127+
t.Fatalf("unexpected error: %v", err)
128+
case <-finishedCh:
129+
case <-time.After(30 * time.Second):
130+
t.Fatal("should finish the Add in time")
131+
}
132+
}
133+
134+
type Collecter interface {
135+
Collect(ch chan<- prometheus.Metric)
136+
137+
Add(t common.Statable, labels map[string]string) error
138+
}
139+
140+
type mockStatT struct {
141+
id, namespace string
142+
isV1 bool
143+
}
144+
145+
func (t *mockStatT) ID() string {
146+
return t.id
147+
}
148+
149+
func (t *mockStatT) Namespace() string {
150+
return t.namespace
151+
}
152+
153+
func (t *mockStatT) Stats(context.Context) (*types.Any, error) {
154+
if t.isV1 {
155+
return protobuf.MarshalAnyToProto(&v1types.Metrics{})
156+
}
157+
return protobuf.MarshalAnyToProto(&v2types.Metrics{})
158+
}

metrics/cgroups/v1/metrics.go

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -26,21 +26,14 @@ import (
2626

2727
"github.com/containerd/cgroups"
2828
"github.com/containerd/containerd/log"
29+
"github.com/containerd/containerd/metrics/cgroups/common"
2930
v1 "github.com/containerd/containerd/metrics/types/v1"
3031
"github.com/containerd/containerd/namespaces"
3132
"github.com/containerd/typeurl"
3233
"github.com/docker/go-metrics"
33-
"github.com/gogo/protobuf/types"
3434
"github.com/prometheus/client_golang/prometheus"
3535
)
3636

37-
// Statable type that returns cgroup metrics
38-
type Statable interface {
39-
ID() string
40-
Namespace() string
41-
Stats(context.Context) (*types.Any, error)
42-
}
43-
4437
// Trigger will be called when an event happens and provides the cgroup
4538
// where the event originated from
4639
type Trigger func(string, string, cgroups.Cgroup)
@@ -71,7 +64,7 @@ func taskID(id, namespace string) string {
7164
}
7265

7366
type entry struct {
74-
task Statable
67+
task common.Statable
7568
// ns is an optional child namespace that contains additional to parent labels.
7669
// This can be used to append task specific labels to be able to differentiate the different containerd metrics.
7770
ns *metrics.Namespace
@@ -80,12 +73,34 @@ type entry struct {
8073
// Collector provides the ability to collect container stats and export
8174
// them in the prometheus format
8275
type Collector struct {
83-
mu sync.RWMutex
84-
85-
tasks map[string]entry
8676
ns *metrics.Namespace
87-
metrics []*metric
8877
storedMetrics chan prometheus.Metric
78+
79+
// TODO(fuweid):
80+
//
81+
// The Collector.Collect will be the field ns'Collect's callback,
82+
// which be invoked periodically with internal lock. And Collector.Add
83+
// might also invoke ns.Lock if the labels is not nil, which is easy to
84+
// cause dead-lock.
85+
//
86+
// Goroutine X:
87+
//
88+
// ns.Collect
89+
// ns.Lock
90+
// Collector.Collect
91+
// Collector.RLock
92+
//
93+
//
94+
// Goroutine Y:
95+
//
96+
// Collector.Add
97+
// ...(RLock/Lock)
98+
// ns.Lock
99+
//
100+
// I think we should seek the way to decouple ns from Collector.
101+
mu sync.RWMutex
102+
tasks map[string]entry
103+
metrics []*metric
89104
}
90105

91106
// Describe prometheus metrics
@@ -148,26 +163,31 @@ func (c *Collector) collect(entry entry, ch chan<- prometheus.Metric, block bool
148163
}
149164

150165
// Add adds the provided cgroup and id so that metrics are collected and exported
151-
func (c *Collector) Add(t Statable, labels map[string]string) error {
166+
func (c *Collector) Add(t common.Statable, labels map[string]string) error {
152167
if c.ns == nil {
153168
return nil
154169
}
155-
c.mu.Lock()
156-
defer c.mu.Unlock()
170+
c.mu.RLock()
157171
id := taskID(t.ID(), t.Namespace())
158-
if _, ok := c.tasks[id]; ok {
172+
_, ok := c.tasks[id]
173+
c.mu.RUnlock()
174+
if ok {
159175
return nil // requests to collect metrics should be idempotent
160176
}
177+
161178
entry := entry{task: t}
162179
if labels != nil {
163180
entry.ns = c.ns.WithConstLabels(labels)
164181
}
182+
183+
c.mu.Lock()
165184
c.tasks[id] = entry
185+
c.mu.Unlock()
166186
return nil
167187
}
168188

169189
// Remove removes the provided cgroup by id from the collector
170-
func (c *Collector) Remove(t Statable) {
190+
func (c *Collector) Remove(t common.Statable) {
171191
if c.ns == nil {
172192
return
173193
}

0 commit comments

Comments
 (0)