Skip to content

Commit 5b7b3bb

Browse files
tommyp1cklesaditighag
authored andcommitted
ipam: when a CiliumNode is removed, delete node label from metrics.
Affects: * operator_ipam_available_ips * operator_ipam_used_ips * operator_ipam_needed_ips Which have the label "target_name", previously when a Node was deleted the metric continued to be emitted by the Prometheus exporter, leading to confusing sum() values across a cluster. Fixes changes in cilium#24776 Signed-off-by: Tom Hadlaw <tom.hadlaw@isovalent.com>
1 parent fa3cf34 commit 5b7b3bb

4 files changed

Lines changed: 68 additions & 2 deletions

File tree

pkg/ipam/metrics/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,15 @@ func (p *prometheusMetrics) SetIPNeeded(node string, usage int) {
264264
p.NeededIPs.WithLabelValues(node).Set(float64(usage))
265265
}
266266

267+
// DeleteNode removes all per-node metrics for a particular node (i.e. those labeled with "target_node").
268+
// This is to ensure that when a Node/CiliumNode delete event happens that the operator will no longer report
269+
// metrics for that node.
270+
func (p *prometheusMetrics) DeleteNode(node string) {
271+
p.AvailableIPs.DeleteLabelValues(node)
272+
p.UsedIPs.DeleteLabelValues(node)
273+
p.NeededIPs.DeleteLabelValues(node)
274+
}
275+
267276
type triggerMetrics struct {
268277
total prometheus.Counter
269278
folds prometheus.Gauge
@@ -349,6 +358,7 @@ func (m *NoOpMetrics) SetIPNeeded(node string, n int)
349358
func (m *NoOpMetrics) PoolMaintainerTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} }
350359
func (m *NoOpMetrics) K8sSyncTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} }
351360
func (m *NoOpMetrics) ResyncTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} }
361+
func (m *NoOpMetrics) DeleteNode(n string) {}
352362

353363
func merge(slices ...[]float64) []float64 {
354364
result := make([]float64, 1)

pkg/ipam/metrics/mock/mock.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,30 @@ func (m *mockMetrics) SetIPNeeded(s string, n int) {
197197
m.mutex.Unlock()
198198
}
199199

200+
func (m *mockMetrics) GetPerNodeMetrics(n string) (*int, *int, *int) {
201+
m.mutex.Lock()
202+
defer m.mutex.Unlock()
203+
var avail, used, needed *int
204+
if c, ok := m.nodeIPAvailable[n]; ok {
205+
avail = &c
206+
}
207+
if c, ok := m.nodeIPUsed[n]; ok {
208+
used = &c
209+
}
210+
if c, ok := m.nodeIPNeeded[n]; ok {
211+
needed = &c
212+
}
213+
return avail, used, needed
214+
}
215+
216+
func (m *mockMetrics) DeleteNode(n string) {
217+
m.mutex.Lock()
218+
defer m.mutex.Unlock()
219+
delete(m.nodeIPAvailable, n)
220+
delete(m.nodeIPUsed, n)
221+
delete(m.nodeIPNeeded, n)
222+
}
223+
200224
func (m *mockMetrics) PoolMaintainerTrigger() trigger.MetricsObserver {
201225
return nil
202226
}

pkg/ipam/node_manager.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ type MetricsNodeAPI interface {
157157
SetIPAvailable(node string, cap int)
158158
SetIPUsed(node string, used int)
159159
SetIPNeeded(node string, needed int)
160+
DeleteNode(node string)
160161
}
161162

162163
// nodeMap is a mapping of node names to ENI nodes
@@ -383,6 +384,9 @@ func (n *NodeManager) Delete(resource *v2.CiliumNode) {
383384
n.mutex.Lock()
384385

385386
if node, ok := n.nodes[resource.Name]; ok {
387+
// Stop target_node metrics related to this node being emitted.
388+
n.metricsAPI.DeleteNode(node.name)
389+
386390
if node.poolMaintainer != nil {
387391
node.poolMaintainer.Shutdown()
388392
}

pkg/ipam/node_manager_test.go

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,6 @@ func (e *IPAMSuite) TestNodeManagerGet(c *check.C) {
207207
c.Assert(err, check.IsNil)
208208
c.Assert(mngr, check.Not(check.IsNil))
209209

210-
// instances.Resync(context.TODO())
211-
212210
node1 := newCiliumNode("node1", 0, 0, 0)
213211
mngr.Upsert(node1)
214212

@@ -220,6 +218,36 @@ func (e *IPAMSuite) TestNodeManagerGet(c *check.C) {
220218
c.Assert(mngr.Get("node2"), check.IsNil)
221219
}
222220

221+
func (e *IPAMSuite) TestNodeManagerDelete(c *check.C) {
222+
am := newAllocationImplementationMock()
223+
c.Assert(am, check.Not(check.IsNil))
224+
metrics := metricsmock.NewMockMetrics()
225+
mngr, err := NewNodeManager(am, k8sapi, metrics, 10, false, false)
226+
c.Assert(err, check.IsNil)
227+
c.Assert(mngr, check.Not(check.IsNil))
228+
229+
node1 := newCiliumNode("node-foo", 0, 0, 0)
230+
mngr.Upsert(node1)
231+
232+
c.Assert(mngr.Get("node-foo"), check.Not(check.IsNil))
233+
c.Assert(mngr.Get("node2"), check.IsNil)
234+
235+
mngr.Resync(context.Background(), time.Now())
236+
avail, used, needed := metrics.GetPerNodeMetrics("node-foo")
237+
c.Assert(avail, check.Not(check.IsNil))
238+
c.Assert(used, check.Not(check.IsNil))
239+
c.Assert(needed, check.Not(check.IsNil))
240+
mngr.Delete(node1)
241+
// Following a node Delete, we expect the per-node metrics for that Node to be
242+
// deleted.
243+
avail, used, needed = metrics.GetPerNodeMetrics("node-foo")
244+
c.Assert(avail, check.IsNil)
245+
c.Assert(used, check.IsNil)
246+
c.Assert(needed, check.IsNil)
247+
c.Assert(mngr.Get("node-foo"), check.IsNil)
248+
c.Assert(mngr.Get("node2"), check.IsNil)
249+
}
250+
223251
type k8sMock struct{}
224252

225253
func (k *k8sMock) Update(origNode, orig *v2.CiliumNode) (*v2.CiliumNode, error) {

0 commit comments

Comments
 (0)