Skip to content

Deadlock between ipcache GC and endpoint regeneration injecting ipcache entry via FQDN #11946

@tgraf

Description

@tgraf

Deadlock description

Routine 1:

The following go routine holds the ipcache mutex while garbage collecting:

goroutine 779 [semacquire, 48 minutes]:
sync.runtime_SemacquireMutex(0xc0008f4f68, 0xc007663600, 0x0)
        /usr/local/go/src/runtime/sema.go:71 +0x47
sync.(*RWMutex).Lock(0xc0008f4f60)
        /usr/local/go/src/sync/rwmutex.go:103 +0x88
github.com/cilium/cilium/pkg/datapath/loader.(*Loader).Reinitialize(0xc0003d70a0, 0x2759d20, 0xc000468bc0, 0x2771e60, 0xc000515680, 0x2329, 0x7f87768eac80, 0xc00022ff10, 0x26faf60, 0xc0004a18b0, ...)
        /go/src/github.com/cilium/cilium/pkg/datapath/loader/base.go:124 +0xcd
main.(*Daemon).TriggerReloadWithoutCompile(0xc000515680, 0x2332316, 0x10, 0x12, 0xc008f75060, 0x16)
        /go/src/github.com/cilium/cilium/daemon/daemon.go:588 +0x202
github.com/cilium/cilium/pkg/datapath/ipcache.(*BPFListener).garbageCollect(0xc0006f8960, 0x2759d20, 0xc003748000, 0x0, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/datapath/ipcache/listener.go:306 +0xadc
github.com/cilium/cilium/pkg/datapath/ipcache.(*BPFListener).OnIPIdentityCacheGC.func1(0x2759d20, 0xc003748000, 0x3a5bf00, 0x2)
        /go/src/github.com/cilium/cilium/pkg/datapath/ipcache/listener.go:340 +0x3e
github.com/cilium/cilium/pkg/controller.(*Controller).runController(0xc001e1e000)
        /go/src/github.com/cilium/cilium/pkg/controller/controller.go:205 +0xa2a
created by github.com/cilium/cilium/pkg/controller.(*Manager).updateController
        /go/src/github.com/cilium/cilium/pkg/controller/manager.go:120 +0xb09

As part of this, Reinitialize() is called which will require the compilation mutex to be acquired.

Routine 2

The following ongoing endpoint regeneration is holding the compilation lock and thus blocks Routine 1 from completing. It is itself blocked on the FQDN NameManager mutex.

goroutine 7227352 [semacquire, 48 minutes]:
sync.runtime_SemacquireMutex(0xc0005c3744, 0xc009bf4c00, 0x1)
        /usr/local/go/src/runtime/sema.go:71 +0x47
sync.(*Mutex).lockSlow(0xc0005c3740)
        /usr/local/go/src/sync/mutex.go:138 +0xfc
sync.(*Mutex).Lock(...)
        /usr/local/go/src/sync/mutex.go:81
github.com/cilium/cilium/pkg/fqdn.(*NameManager).Lock(0xc0005c3740)
        /go/src/github.com/cilium/cilium/pkg/fqdn/name_manager.go:93 +0x47
github.com/cilium/cilium/pkg/policy.(*SelectorCache).AddFQDNSelector(0xc0004691c0, 0x26faf00, 0xc00acd6e70, 0x0, 0x0, 0xc004106590, 0xb, 0x0, 0x0, 0x40ca00)
        /go/src/github.com/cilium/cilium/pkg/policy/selectorcache.go:580 +0x13f
github.com/cilium/cilium/pkg/policy.(*L4Filter).cacheFQDNSelector(...)
        /go/src/github.com/cilium/cilium/pkg/policy/l4.go:392
github.com/cilium/cilium/pkg/policy.(*L4Filter).cacheFQDNSelectors(0xc00acd6e70, 0xc008c7d100, 0x38, 0x38, 0xc0004691c0)
        /go/src/github.com/cilium/cilium/pkg/policy/l4.go:387 +0xad
github.com/cilium/cilium/pkg/policy.createL4Filter(0x2739660, 0xc001c49680, 0xc006beab60, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, ...)
        /go/src/github.com/cilium/cilium/pkg/policy/l4.go:489 +0x7ce
github.com/cilium/cilium/pkg/policy.createL4EgressFilter(...)
        /go/src/github.com/cilium/cilium/pkg/policy/l4.go:599
github.com/cilium/cilium/pkg/policy.mergeEgressPortProto(0x2739660, 0xc001c49680, 0xc0052087d0, 0xc006beab60, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, ...)
        /go/src/github.com/cilium/cilium/pkg/policy/rule.go:637 +0x187
github.com/cilium/cilium/pkg/policy.mergeEgress(0x2739660, 0xc001c49680, 0xc0052087d0, 0xc006beab60, 0x1, 0x1, 0x0, 0x0, 0x0, 0xc00b91f2c0, ...)
        /go/src/github.com/cilium/cilium/pkg/policy/rule.go:572 +0xdbd
github.com/cilium/cilium/pkg/policy.(*rule).resolveEgressPolicy(0xc0079a0400, 0x2739660, 0xc001c49680, 0xc0052087d0, 0xc005208418, 0xc0090738f0, 0x0, 0x0, 0x0, 0xc0090738f0, ...)
        /go/src/github.com/cilium/cilium/pkg/policy/rule.go:675 +0x265
github.com/cilium/cilium/pkg/policy.ruleSlice.resolveL4EgressPolicy(0xc008d0a9c0, 0x5, 0x8, 0x2739660, 0xc001c49680, 0xc0052087d0, 0xc008d6c300, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/policy/rules.go:106 +0x70b
github.com/cilium/cilium/pkg/policy.(*Repository).resolvePolicyLocked(0xc000246460, 0xc0059fe820, 0xc0000066c3, 0xc0030b0960, 0xc006176201)
        /go/src/github.com/cilium/cilium/pkg/policy/repository.go:665 +0x3f9
github.com/cilium/cilium/pkg/policy.(*PolicyCache).updateSelectorPolicy(0xc000342b20, 0xc0059fe820, 0x3a5bf00, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/policy/distillery.go:135 +0x148
github.com/cilium/cilium/pkg/policy.(*PolicyCache).UpdatePolicy(...)
        /go/src/github.com/cilium/cilium/pkg/policy/distillery.go:170
github.com/cilium/cilium/pkg/endpoint.(*Endpoint).regeneratePolicy(0xc001427080, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/endpoint/policy.go:175 +0x255
github.com/cilium/cilium/pkg/endpoint.(*Endpoint).runPreCompilationSteps(0xc001427080, 0xc005d3a500, 0xc008d0a800, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/endpoint/bpf.go:736 +0x96a
github.com/cilium/cilium/pkg/endpoint.(*Endpoint).regenerateBPF(0xc001427080, 0xc005d3a500, 0x0, 0x0, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/endpoint/bpf.go:508 +0x1e0
github.com/cilium/cilium/pkg/endpoint.(*Endpoint).regenerate(0xc001427080, 0xc005d3a500, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/endpoint/policy.go:349 +0x841
github.com/cilium/cilium/pkg/endpoint.(*EndpointRegenerationEvent).Handle(0xc003530210, 0xc0068115c0)
        /go/src/github.com/cilium/cilium/pkg/endpoint/events.go:58 +0x459
github.com/cilium/cilium/pkg/eventqueue.(*EventQueue).Run.func1()
        /go/src/github.com/cilium/cilium/pkg/eventqueue/eventqueue.go:260 +0x187
sync.(*Once).doSlow(0xc005f60e58, 0xc009bbb330)
        /usr/local/go/src/sync/once.go:66 +0xe3
sync.(*Once).Do(0xc005f60e58, 0xc009bbb330)
        /usr/local/go/src/sync/once.go:57 +0x45
created by github.com/cilium/cilium/pkg/eventqueue.(*EventQueue).Run
        /go/src/github.com/cilium/cilium/pkg/eventqueue/eventqueue.go:248 +0xa9

Routine 3

This third routine is holding the FQDN NameManager mutex while itself waiting on the ipcache mutex which is held by routine 1.

goroutine 7229843 [select, 48 minutes]:
golang.org/x/sync/semaphore.(*Weighted).Acquire(0xc000464230, 0x2759d60, 0xc000058018, 0x40000000, 0xc0008f9740, 0x0)
        /go/src/github.com/cilium/cilium/vendor/golang.org/x/sync/semaphore/semaphore.go:60 +0x28e
github.com/cilium/cilium/pkg/lock.(*SemaphoredMutex).Lock(...)
        /go/src/github.com/cilium/cilium/pkg/lock/semaphored_mutex.go:41
github.com/cilium/cilium/pkg/ipcache.(*IPCache).Upsert(0xc000464280, 0xc008858dd0, 0xd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1000596, 0x231acf1, ...)
        /go/src/github.com/cilium/cilium/pkg/ipcache/ipcache.go:184 +0x93
github.com/cilium/cilium/pkg/ipcache.allocateCIDRs(0xc004bb2a00, 0x98, 0x98, 0x0, 0x0, 0x0, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/ipcache/cidr.go:97 +0x7aa
github.com/cilium/cilium/pkg/ipcache.AllocateCIDRsForIPs(0xc0046a2000, 0x98, 0x200, 0x1, 0x1, 0x0, 0x1f4a440, 0xc00a768900)
        /go/src/github.com/cilium/cilium/pkg/ipcache/cidr.go:52 +0x65
main.identitiesForFQDNSelectorIPs(0xc00a7687e0, 0xc0008f9740, 0x0, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/daemon/fqdn.go:87 +0x38e
main.(*Daemon).updateSelectors(0xc000515680, 0x2759da0, 0xc000e09a40, 0xc00a7687e0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/daemon/fqdn.go:320 +0x40
github.com/cilium/cilium/pkg/fqdn.(*NameManager).UpdateGenerateDNS(0xc0005c3740, 0x2759da0, 0xc000e09a40, 0xbfae6eb0118dc6cd, 0x131b26c90d12, 0x3a5bf00, 0xc004cf9910, 0x0, 0x0, 0x0)
        /go/src/github.com/cilium/cilium/pkg/fqdn/name_manager.go:225 +0x48f
main.(*Daemon).notifyOnDNSMsg(0xc000515680, 0xbfae6eb0118dc6cd, 0x131b26c90d12, 0x3a5bf00, 0xc001427080, 0xc00a2bdec0, 0x11, 0xc00ae52340, 0x10, 0xc003564f30, ...)
        /go/src/github.com/cilium/cilium/daemon/fqdn.go:584 +0x12cd
github.com/cilium/cilium/pkg/fqdn/dnsproxy.(*DNSProxy).ServeDNS(0xc0008c8e80, 0x27818e0, 0xc00302e1e0, 0xc0072a4f30)
        /go/src/github.com/cilium/cilium/pkg/fqdn/dnsproxy/proxy.go:475 +0x238c
github.com/miekg/dns.(*Server).serveDNS(0xc000563a40, 0xc001f8be00, 0x6b, 0x200, 0xc00302e1e0)
        /go/src/github.com/cilium/cilium/vendor/github.com/miekg/dns/server.go:597 +0x20b
github.com/miekg/dns.(*Server).serveUDPPacket(0xc000563a40, 0xc00047d060, 0xc001f8be00, 0x6b, 0x200, 0xc000df0380, 0xc0092cf2b0)
        /go/src/github.com/cilium/cilium/vendor/github.com/miekg/dns/server.go:552 +0xb2
github.com/miekg/dns.(*Server).serveUDP.func2(0xc000563a40, 0xc00047d060, 0xc001f8be00, 0x6b, 0x200, 0xc000df0380, 0xc0092cf2b0)
        /go/src/github.com/cilium/cilium/vendor/github.com/miekg/dns/server.go:478 +0x67
created by github.com/miekg/dns.(*Server).serveUDP
        /go/src/github.com/cilium/cilium/vendor/github.com/miekg/dns/server.go:477 +0x272

Metadata

Metadata

Assignees

Labels

kind/bugThis is a bug in the Cilium logic.priority/highThis is considered vital to an upcoming release.

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions