Skip to content

Commit e5aca71

Browse files
olavst-spksmira
authored and committed
fix: fix healthcheck timeout
Removes the 5-minute timeouts in the cluster health checks and relies only on the global timeout set by the --wait-timeout flag.

Signed-off-by: Olav Thoresen <Olav.Sortland.Thoresen@spk.no>
Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
1 parent 634b71e commit e5aca71

File tree

3 files changed

+31
-32
lines changed

3 files changed

+31
-32
lines changed

pkg/cluster/check/default.go

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ func DefaultClusterChecks() []ClusterCheck {
2323
func(cluster ClusterInfo) conditions.Condition {
2424
return conditions.PollingCondition("all k8s nodes to report ready", func(ctx context.Context) error {
2525
return K8sAllNodesReadyAssertion(ctx, cluster)
26-
}, 10*time.Minute, 5*time.Second)
26+
}, 5*time.Second)
2727
},
2828

2929
// wait for kube-proxy to report ready
@@ -39,7 +39,7 @@ func DefaultClusterChecks() []ClusterCheck {
3939
}
4040

4141
return K8sPodReadyAssertion(ctx, cluster, replicas, "kube-system", "k8s-app=kube-proxy")
42-
}, 5*time.Minute, 5*time.Second)
42+
}, 5*time.Second)
4343
},
4444

4545
// wait for coredns to report ready
@@ -55,14 +55,14 @@ func DefaultClusterChecks() []ClusterCheck {
5555
}
5656

5757
return K8sPodReadyAssertion(ctx, cluster, replicas, "kube-system", "k8s-app=kube-dns")
58-
}, 5*time.Minute, 5*time.Second)
58+
}, 5*time.Second)
5959
},
6060

6161
// wait for all the nodes to be schedulable
6262
func(cluster ClusterInfo) conditions.Condition {
6363
return conditions.PollingCondition("all k8s nodes to report schedulable", func(ctx context.Context) error {
6464
return K8sAllNodesSchedulableAssertion(ctx, cluster)
65-
}, 5*time.Minute, 5*time.Second)
65+
}, 5*time.Second)
6666
},
6767
},
6868
)
@@ -77,21 +77,21 @@ func K8sComponentsReadinessChecks() []ClusterCheck {
7777
func(cluster ClusterInfo) conditions.Condition {
7878
return conditions.PollingCondition("all k8s nodes to report", func(ctx context.Context) error {
7979
return K8sAllNodesReportedAssertion(ctx, cluster)
80-
}, 5*time.Minute, 30*time.Second) // give more time per each attempt, as this check is going to build and cache kubeconfig
80+
}, 30*time.Second) // give more time per each attempt, as this check is going to build and cache kubeconfig
8181
},
8282

8383
// wait for k8s control plane static pods
8484
func(cluster ClusterInfo) conditions.Condition {
8585
return conditions.PollingCondition("all control plane static pods to be running", func(ctx context.Context) error {
8686
return K8sControlPlaneStaticPods(ctx, cluster)
87-
}, 5*time.Minute, 5*time.Second)
87+
}, 5*time.Second)
8888
},
8989

9090
// wait for HA k8s control plane
9191
func(cluster ClusterInfo) conditions.Condition {
9292
return conditions.PollingCondition("all control plane components to be ready", func(ctx context.Context) error {
9393
return K8sFullControlPlaneAssertion(ctx, cluster)
94-
}, 5*time.Minute, 5*time.Second)
94+
}, 5*time.Second)
9595
},
9696
}
9797
}
@@ -110,63 +110,63 @@ func PreBootSequenceChecks() []ClusterCheck {
110110
func(cluster ClusterInfo) conditions.Condition {
111111
return conditions.PollingCondition("etcd to be healthy", func(ctx context.Context) error {
112112
return ServiceHealthAssertion(ctx, cluster, "etcd", WithNodeTypes(machine.TypeInit, machine.TypeControlPlane))
113-
}, 5*time.Minute, 5*time.Second)
113+
}, 5*time.Second)
114114
},
115115

116116
// wait for etcd members to be consistent across nodes
117117
func(cluster ClusterInfo) conditions.Condition {
118118
return conditions.PollingCondition("etcd members to be consistent across nodes", func(ctx context.Context) error {
119119
return EtcdConsistentAssertion(ctx, cluster)
120-
}, 5*time.Minute, 5*time.Second)
120+
}, 5*time.Second)
121121
},
122122

123123
// wait for etcd members to be the control plane nodes
124124
func(cluster ClusterInfo) conditions.Condition {
125125
return conditions.PollingCondition("etcd members to be control plane nodes", func(ctx context.Context) error {
126126
return EtcdControlPlaneNodesAssertion(ctx, cluster)
127-
}, 5*time.Minute, 5*time.Second)
127+
}, 5*time.Second)
128128
},
129129

130130
// wait for apid to be ready on all the nodes
131131
func(cluster ClusterInfo) conditions.Condition {
132132
return conditions.PollingCondition("apid to be ready", func(ctx context.Context) error {
133133
return ApidReadyAssertion(ctx, cluster)
134-
}, 5*time.Minute, 5*time.Second)
134+
}, 5*time.Second)
135135
},
136136

137137
// wait for all nodes to report their memory size
138138
func(cluster ClusterInfo) conditions.Condition {
139139
return conditions.PollingCondition("all nodes memory sizes", func(ctx context.Context) error {
140140
return AllNodesMemorySizes(ctx, cluster)
141-
}, 5*time.Minute, 5*time.Second)
141+
}, 5*time.Second)
142142
},
143143

144144
// wait for all nodes to report their disk size
145145
func(cluster ClusterInfo) conditions.Condition {
146146
return conditions.PollingCondition("all nodes disk sizes", func(ctx context.Context) error {
147147
return AllNodesDiskSizes(ctx, cluster)
148-
}, 5*time.Minute, 5*time.Second)
148+
}, 5*time.Second)
149149
},
150150

151151
// check diagnostics
152152
func(cluster ClusterInfo) conditions.Condition {
153153
return conditions.PollingCondition("no diagnostics", func(ctx context.Context) error {
154154
return NoDiagnostics(ctx, cluster)
155-
}, time.Minute, 5*time.Second)
155+
}, 5*time.Second)
156156
},
157157

158158
// wait for kubelet to be healthy on all
159159
func(cluster ClusterInfo) conditions.Condition {
160160
return conditions.PollingCondition("kubelet to be healthy", func(ctx context.Context) error {
161161
return ServiceHealthAssertion(ctx, cluster, "kubelet", WithNodeTypes(machine.TypeInit, machine.TypeControlPlane))
162-
}, 5*time.Minute, 5*time.Second)
162+
}, 5*time.Second)
163163
},
164164

165165
// wait for all nodes to finish booting
166166
func(cluster ClusterInfo) conditions.Condition {
167167
return conditions.PollingCondition("all nodes to finish boot sequence", func(ctx context.Context) error {
168168
return AllNodesBootedAssertion(ctx, cluster)
169-
}, 5*time.Minute, 5*time.Second)
169+
}, 5*time.Second)
170170
},
171171
}
172172
}

pkg/conditions/poll.go

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ type pollingCondition struct {
2626
lastErr error
2727
lastErrSet bool
2828

29-
assertion AssertionFunc
30-
description string
31-
timeout, interval time.Duration
29+
assertion AssertionFunc
30+
description string
31+
interval time.Duration
3232
}
3333

3434
func (p *pollingCondition) String() string {
@@ -53,9 +53,6 @@ func (p *pollingCondition) Wait(ctx context.Context) error {
5353
ticker := time.NewTicker(p.interval)
5454
defer ticker.Stop()
5555

56-
timeoutCtx, timeoutCtxCancel := context.WithTimeout(ctx, p.timeout)
57-
defer timeoutCtxCancel()
58-
5956
for {
6057
err := func() error {
6158
runCtx, runCtxCancel := context.WithTimeout(ctx, p.interval)
@@ -75,20 +72,19 @@ func (p *pollingCondition) Wait(ctx context.Context) error {
7572
}
7673

7774
select {
78-
case <-timeoutCtx.Done():
79-
return timeoutCtx.Err()
75+
case <-ctx.Done():
76+
return ctx.Err()
8077
case <-ticker.C:
8178
}
8279
}
8380
}
8481

85-
// PollingCondition converts AssertionFunc into Condition by calling it every interval until timeout
86-
// is reached.
87-
func PollingCondition(description string, assertion AssertionFunc, timeout, interval time.Duration) Condition {
82+
// PollingCondition converts AssertionFunc into Condition by calling it every interval until
83+
// it completes or the context is canceled.
84+
func PollingCondition(description string, assertion AssertionFunc, interval time.Duration) Condition {
8885
return &pollingCondition{
8986
assertion: assertion,
9087
description: description,
91-
timeout: timeout,
9288
interval: interval,
9389
}
9490
}

pkg/conditions/poll_test.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ func TestPollingCondition(t *testing.T) {
3131
}
3232

3333
return nil
34-
}, time.Second, time.Millisecond)
34+
}, time.Millisecond)
3535

3636
err := cond.Wait(t.Context())
3737
assert.NoError(t, err)
@@ -52,7 +52,7 @@ func TestPollingCondition(t *testing.T) {
5252
}
5353

5454
return conditions.ErrSkipAssertion
55-
}, time.Second, time.Millisecond)
55+
}, time.Millisecond)
5656

5757
err := cond.Wait(t.Context())
5858
assert.NoError(t, err)
@@ -69,9 +69,12 @@ func TestPollingCondition(t *testing.T) {
6969
calls++
7070

7171
return errors.New("failed")
72-
}, time.Second, 750*time.Millisecond)
72+
}, 750*time.Millisecond)
7373

74-
err := cond.Wait(t.Context())
74+
ctx, cancel := context.WithTimeout(t.Context(), 1400*time.Millisecond)
75+
defer cancel()
76+
77+
err := cond.Wait(ctx)
7578
assert.Equal(t, context.DeadlineExceeded, err)
7679
assert.Equal(t, "Test condition: failed", cond.String())
7780
assert.Equal(t, 2, calls)

0 commit comments

Comments (0)