Skip to content

Commit b855b25

Browse files
bimmlerdaanm
authored and committed
node/manager: synthesize node deletion events
When the cilium agent is down (due to a crash or an upgrade), it can miss node events. Upon startup, live nodes are upserted, but when deletions are missed, the agent fails to clean up node-related system state. Examples of such state include bpf map entries, xfrm states or routes. In particular, the agent fails to clean up node IP to nodeID mappings in the nodeid bpf map. Since K8s will happily recycle such IPs, this can lead to breakage, as the agent associates the wrong nodeID with IPs. To avoid leaking this state, the node manager now dumps its view of the current set of nodes to a file in the runtime state directory, which can be read on restart of an agent. This is similar to how we restore other state upon restart. When reading this file, it's important to avoid resurrecting long-gone nodes (as we don't know for how long the agent was down) - instead, we merely take note of which nodes we knew of in the past, compare that to the nodes we consider live (once synced to k8s), and delete the ones which seem to have disappeared. The motivation to build this reconciliation based on full state dumps to disk is that downstream code generally assumes it has access to a full node object in the deletion callbacks. This makes it infeasible to base the pruning on just the information available in bpf maps. In an alternative design, downstream subsystems would be responsible for cleaning up their own state based on just a node identifier, but current code doesn't allow for this. Signed-off-by: David Bimmler <david.bimmler@isovalent.com>
1 parent 545fbc8 commit b855b25

7 files changed

Lines changed: 405 additions & 51 deletions

File tree

pkg/clustermesh/cell.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,6 @@ var Cell = cell.Module(
3636
metrics.Metric(NewMetrics),
3737
metrics.Metric(common.MetricsProvider(subsystem)),
3838

39-
cell.Invoke(ipsetSyncer),
39+
cell.Invoke(ipsetNotifier),
40+
cell.Invoke(nodeManagerNotifier),
4041
)

pkg/clustermesh/ipsetsyncer.go

Lines changed: 0 additions & 39 deletions
This file was deleted.

pkg/clustermesh/notifier.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// Copyright Authors of Cilium
3+
4+
package clustermesh
5+
6+
import (
7+
"context"
8+
9+
"github.com/cilium/hive/cell"
10+
"github.com/cilium/hive/job"
11+
12+
"github.com/cilium/cilium/pkg/datapath/iptables/ipset"
13+
nodeManager "github.com/cilium/cilium/pkg/node/manager"
14+
)
15+
16+
func ipsetNotifier(
17+
jg job.Group,
18+
cm *ClusterMesh,
19+
ipsetMgr ipset.Manager,
20+
) {
21+
if cm == nil {
22+
return
23+
}
24+
25+
initializer := ipsetMgr.NewInitializer()
26+
27+
jg.Add(job.OneShot("clustermesh-ipset-notifier", func(ctx context.Context, _ cell.Health) error {
28+
// wait for initial nodes listing from all remote clusters
29+
// before allowing stale ipset entries deletion
30+
if err := cm.NodesSynced(ctx); err != nil {
31+
return err
32+
}
33+
initializer.InitDone()
34+
return nil
35+
}))
36+
}
37+
38+
func nodeManagerNotifier(
39+
jg job.Group,
40+
cm *ClusterMesh,
41+
nodeMgr nodeManager.NodeManager,
42+
) {
43+
if cm == nil {
44+
return
45+
}
46+
47+
jg.Add(job.OneShot("clustermesh-nodemanager-notifier", func(ctx context.Context, _ cell.Health) error {
48+
// wait for initial nodes listing from all remote clusters
49+
// before allowing stale node deletion
50+
if err := cm.NodesSynced(ctx); err != nil {
51+
return err
52+
}
53+
nodeMgr.MeshNodeSync()
54+
return nil
55+
}))
56+
}

pkg/datapath/linux/node.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
"github.com/cilium/cilium/pkg/node/manager"
4141
nodeTypes "github.com/cilium/cilium/pkg/node/types"
4242
"github.com/cilium/cilium/pkg/option"
43+
"github.com/cilium/cilium/pkg/source"
4344
"github.com/cilium/cilium/pkg/time"
4445
)
4546

@@ -1086,9 +1087,13 @@ func (n *linuxNodeHandler) NodeDelete(oldNode nodeTypes.Node) error {
10861087
defer n.mutex.Unlock()
10871088

10881089
nodeIdentity := oldNode.Identity()
1089-
if oldCachedNode, nodeExists := n.nodes[nodeIdentity]; nodeExists {
1090+
if oldCachedNode, nodeExists := n.nodes[nodeIdentity]; nodeExists || oldNode.Source == source.Restored {
10901091
delete(n.nodes, nodeIdentity)
10911092

1093+
if oldNode.Source == source.Restored {
1094+
oldCachedNode = &oldNode
1095+
}
1096+
10921097
if n.isInitialized {
10931098
return n.nodeDelete(oldCachedNode)
10941099
}

pkg/node/manager/cell.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ type NodeManager interface {
5959

6060
// NodeSync is called when the store completes the initial nodes listing
6161
NodeSync()
62+
// MeshNodeSync is called when the store completes the initial nodes listing including meshed nodes
63+
MeshNodeSync()
6264

6365
// ClusterSizeDependantInterval returns a time.Duration that is dependent on
6466
// the cluster size, i.e. the number of nodes that have been discovered. This

0 commit comments

Comments
 (0)