Skip to content

Commit 75ba839

Browse files
committed
fix(gateway): expose event loop health in readiness
1 parent 9f7932f commit 75ba839

6 files changed

Lines changed: 167 additions & 4 deletions

File tree

docs/cli/gateway.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ When you set `--url`, the CLI does not fall back to config or environment creden
145145
openclaw gateway health --url ws://127.0.0.1:18789
146146
```
147147

148-
The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling.
148+
The HTTP `/healthz` endpoint is a liveness probe: it returns once the server can answer HTTP. The HTTP `/readyz` endpoint is stricter and stays red while startup sidecars, channels, or configured hooks are still settling. Local or authenticated detailed readiness responses include an `eventLoop` diagnostic block with event-loop delay, event-loop utilization, CPU core ratio, and a `degraded` flag.
149149

150150
### `gateway usage-cost`
151151

src/gateway/server-close.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ export async function runGatewayClosePrelude(params: {
112112
disposeBrowserAuthRateLimiter: () => void;
113113
stopModelPricingRefresh?: () => void;
114114
stopChannelHealthMonitor?: () => void;
115+
stopReadinessEventLoopHealth?: () => void;
115116
clearSecretsRuntimeSnapshot?: () => void;
116117
closeMcpServer?: () => Promise<void>;
117118
}): Promise<void> {
@@ -122,6 +123,7 @@ export async function runGatewayClosePrelude(params: {
122123
params.disposeBrowserAuthRateLimiter();
123124
params.stopModelPricingRefresh?.();
124125
params.stopChannelHealthMonitor?.();
126+
params.stopReadinessEventLoopHealth?.();
125127
params.clearSecretsRuntimeSnapshot?.();
126128
await params.closeMcpServer?.().catch(() => {});
127129
}

src/gateway/server.impl.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ import { STARTUP_UNAVAILABLE_GATEWAY_METHODS } from "./server-startup-unavailabl
8686
import { startGatewayEarlyRuntime, startGatewayPostAttachRuntime } from "./server-startup.js";
8787
import { createWizardSessionTracker } from "./server-wizard-sessions.js";
8888
import { attachGatewayWsHandlers } from "./server-ws-runtime.js";
89+
import { createGatewayEventLoopHealthMonitor } from "./server/event-loop-health.js";
8990
import {
9091
getHealthCache,
9192
getHealthVersion,
@@ -565,6 +566,7 @@ export async function startGatewayServer(
565566
throw new Error(gatewayTls.error ?? "gateway tls: failed to enable");
566567
}
567568
const serverStartedAt = Date.now();
569+
const readinessEventLoopHealth = createGatewayEventLoopHealthMonitor();
568570
let startupSidecarsReady = minimalTestGateway;
569571
const channelManager = createChannelManager({
570572
getRuntimeConfig: () =>
@@ -582,6 +584,7 @@ export async function startGatewayServer(
582584
channelManager,
583585
startedAt: serverStartedAt,
584586
getStartupPending: () => !startupSidecarsReady,
587+
getEventLoopHealth: readinessEventLoopHealth.snapshot,
585588
});
586589
log.info("starting HTTP server...");
587590
const {
@@ -682,6 +685,7 @@ export async function startGatewayServer(
682685
disposeBrowserAuthRateLimiter: () => browserAuthRateLimiter.dispose(),
683686
stopModelPricingRefresh: runtimeState.stopModelPricingRefresh,
684687
stopChannelHealthMonitor: () => runtimeState?.channelHealthMonitor?.stop(),
688+
stopReadinessEventLoopHealth: readinessEventLoopHealth.stop,
685689
clearSecretsRuntimeSnapshot,
686690
closeMcpServer: closeMcpLoopbackServerOnDemand,
687691
});
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import { monitorEventLoopDelay, performance } from "node:perf_hooks";
2+
3+
// Sampling resolution passed as `resolution` to monitorEventLoopDelay().
const EVENT_LOOP_MONITOR_RESOLUTION_MS = 20;
4+
// A p99 or max event-loop delay at or above this many milliseconds adds the
// "event_loop_delay" reason to a snapshot.
const EVENT_LOOP_DELAY_WARN_MS = 1_000;
5+
// An event-loop utilization at or above this value adds the
// "event_loop_utilization" reason to a snapshot.
const EVENT_LOOP_UTILIZATION_WARN = 0.95;
6+
// A CPU-time / wall-time ratio at or above this value adds the "cpu" reason
// to a snapshot.
const CPU_CORE_RATIO_WARN = 0.9;
7+
8+
type EventLoopDelayMonitor = ReturnType<typeof monitorEventLoopDelay>;
9+
type EventLoopUtilization = ReturnType<typeof performance.eventLoopUtilization>;
10+
type CpuUsage = ReturnType<typeof process.cpuUsage>;
11+
12+
export type GatewayEventLoopHealthReason = "event_loop_delay" | "event_loop_utilization" | "cpu";
13+
14+
/**
 * One event-loop health sample as produced by the monitor's `snapshot()`.
 * All numeric fields are rounded before being reported.
 */
export type GatewayEventLoopHealth = {
15+
/** True when `reasons` is non-empty, i.e. at least one warn threshold was reached. */
degraded: boolean;
16+
/** Which thresholds were reached for this sample. */
reasons: GatewayEventLoopHealthReason[];
17+
/** Wall-clock length of the measured window in milliseconds (at least 1). */
intervalMs: number;
18+
/** 99th percentile event-loop delay over the window, in milliseconds. */
delayP99Ms: number;
19+
/** Maximum event-loop delay over the window, in milliseconds. */
delayMaxMs: number;
20+
/** Event-loop utilization over the window, as reported by `performance.eventLoopUtilization`. */
utilization: number;
21+
/** Process CPU time (user + system) in milliseconds divided by `intervalMs`. */
cpuCoreRatio: number;
22+
};
23+
24+
/** Handle returned by `createGatewayEventLoopHealthMonitor`. */
export type GatewayEventLoopHealthMonitor = {
25+
/**
 * Returns the metrics for the current measurement window, or `undefined`
 * when the delay monitor is unavailable or the monitor has been stopped.
 */
snapshot: () => GatewayEventLoopHealth | undefined;
26+
/** Disables the delay monitor and clears the baselines; later snapshots return `undefined`. */
stop: () => void;
27+
};
28+
29+
/**
 * Rounds a metric to a fixed number of decimal places.
 *
 * @param value - The raw metric. Non-finite input (NaN, ±Infinity) yields 0.
 * @param digits - Number of decimal places to keep (default 3).
 * @returns A finite number. If scaling by `10 ** digits` is not representable
 *   (the intermediate overflows or the factor is 0/Infinity), the finite input
 *   is returned unrounded instead of leaking Infinity or NaN to callers.
 */
function roundMetric(value: number, digits = 3): number {
  if (!Number.isFinite(value)) {
    return 0;
  }
  const factor = 10 ** digits;
  const rounded = Math.round(value * factor) / factor;
  // value * factor can overflow for very large finite values; never report a
  // non-finite metric when the input itself was finite.
  return Number.isFinite(rounded) ? rounded : value;
}
36+
37+
/**
 * Converts a duration in nanoseconds to milliseconds, rounded to one decimal
 * place via `roundMetric`.
 */
function nanosecondsToMilliseconds(value: number): number {
  const nanosecondsPerMillisecond = 1_000_000;
  const milliseconds = value / nanosecondsPerMillisecond;
  return roundMetric(milliseconds, 1);
}
40+
41+
/**
 * Minimum wall-clock window a sample must cover before the baselines are
 * reset. Without it, two snapshots taken back to back would measure a window
 * of ~1 ms: the delay histogram would be empty and the CPU ratio would be
 * dominated by the cost of serving the probe itself.
 */
const EVENT_LOOP_MIN_SAMPLE_INTERVAL_MS = 1_000;

/**
 * Creates a monitor that reports event-loop delay, event-loop utilization and
 * process CPU usage for the window since the previous completed sample.
 *
 * The delay histogram is started immediately. If it cannot be created, the
 * monitor stays inert and `snapshot()` always returns `undefined`.
 *
 * @returns An object with:
 *   - `snapshot()`: the current health sample, or `undefined` when the delay
 *     monitor is unavailable or `stop()` has been called. Calls made less than
 *     `EVENT_LOOP_MIN_SAMPLE_INTERVAL_MS` after the last completed sample
 *     return a copy of that sample instead of starting a new window.
 *   - `stop()`: disables the histogram and clears all baselines.
 */
export function createGatewayEventLoopHealthMonitor(): GatewayEventLoopHealthMonitor {
  let monitor: EventLoopDelayMonitor | null = null;
  let lastWallAt = Date.now();
  let lastCpuUsage: CpuUsage | null = process.cpuUsage();
  let lastEventLoopUtilization: EventLoopUtilization | null = performance.eventLoopUtilization();
  let lastSnapshot: GatewayEventLoopHealth | null = null;

  try {
    monitor = monitorEventLoopDelay({ resolution: EVENT_LOOP_MONITOR_RESOLUTION_MS });
    monitor.enable();
    monitor.reset();
  } catch {
    // Best effort: without the histogram the monitor reports nothing.
    monitor = null;
  }

  // Hand out copies so callers cannot mutate the cached sample.
  const copyOf = (health: GatewayEventLoopHealth): GatewayEventLoopHealth => ({
    ...health,
    reasons: [...health.reasons],
  });

  return {
    snapshot: () => {
      if (!monitor || !lastCpuUsage || !lastEventLoopUtilization || lastWallAt <= 0) {
        return undefined;
      }

      const now = Date.now();
      const elapsedMs = now - lastWallAt;
      // A negative elapsed time means the wall clock moved backwards; treat the
      // window as complete so the baselines are re-anchored to the new clock.
      const windowComplete = elapsedMs < 0 || elapsedMs >= EVENT_LOOP_MIN_SAMPLE_INTERVAL_MS;
      if (!windowComplete && lastSnapshot) {
        return copyOf(lastSnapshot);
      }

      const intervalMs = Math.max(1, elapsedMs);
      // Read CPU usage once and diff manually so no CPU time is lost between
      // computing the delta and storing the next baseline.
      const currentCpuUsage = process.cpuUsage();
      const cpuMicros =
        currentCpuUsage.user - lastCpuUsage.user + (currentCpuUsage.system - lastCpuUsage.system);
      const currentEventLoopUtilization = performance.eventLoopUtilization();
      const utilization = roundMetric(
        performance.eventLoopUtilization(currentEventLoopUtilization, lastEventLoopUtilization)
          .utilization,
      );
      const delayP99Ms = nanosecondsToMilliseconds(monitor.percentile(99));
      const delayMaxMs = nanosecondsToMilliseconds(monitor.max);
      const cpuTotalMs = roundMetric(cpuMicros / 1_000, 1);
      const cpuCoreRatio = roundMetric(cpuTotalMs / intervalMs);
      const reasons: GatewayEventLoopHealthReason[] = [];

      if (delayP99Ms >= EVENT_LOOP_DELAY_WARN_MS || delayMaxMs >= EVENT_LOOP_DELAY_WARN_MS) {
        reasons.push("event_loop_delay");
      }
      if (utilization >= EVENT_LOOP_UTILIZATION_WARN) {
        reasons.push("event_loop_utilization");
      }
      if (cpuCoreRatio >= CPU_CORE_RATIO_WARN) {
        reasons.push("cpu");
      }

      const health: GatewayEventLoopHealth = {
        degraded: reasons.length > 0,
        reasons,
        intervalMs,
        delayP99Ms,
        delayMaxMs,
        utilization,
        cpuCoreRatio,
      };

      if (!windowComplete) {
        // No completed sample exists yet: report what has accumulated since
        // creation but keep the window open so it can grow to a useful size.
        return health;
      }

      monitor.reset();
      lastWallAt = now;
      lastCpuUsage = currentCpuUsage;
      lastEventLoopUtilization = currentEventLoopUtilization;
      lastSnapshot = health;

      return copyOf(health);
    },
    stop: () => {
      monitor?.disable();
      monitor = null;
      lastWallAt = 0;
      lastCpuUsage = null;
      lastEventLoopUtilization = null;
      lastSnapshot = null;
    },
  };
}

src/gateway/server/readiness.test.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ function createReadinessHarness(params: {
6464
startedAgoMs: number;
6565
accounts: Record<string, Partial<ChannelAccountSnapshot>>;
6666
getStartupPending?: () => boolean;
67+
getEventLoopHealth?: Parameters<typeof createReadinessChecker>[0]["getEventLoopHealth"];
6768
cacheTtlMs?: number;
6869
}) {
6970
const startedAt = Date.now() - params.startedAgoMs;
@@ -74,6 +75,7 @@ function createReadinessHarness(params: {
7475
channelManager: manager,
7576
startedAt,
7677
getStartupPending: params.getStartupPending,
78+
getEventLoopHealth: params.getEventLoopHealth,
7779
cacheTtlMs: params.cacheTtlMs,
7880
}),
7981
};
@@ -273,4 +275,37 @@ describe("createReadinessChecker", () => {
273275
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2);
274276
});
275277
});
278+
279+
it("adds event-loop health to detailed readiness without changing readiness state", () => {
280+
withReadinessClock(() => {
281+
const { readiness } = createReadinessHarness({
282+
startedAgoMs: 5 * 60_000,
283+
accounts: {},
284+
getEventLoopHealth: () => ({
285+
degraded: true,
286+
reasons: ["cpu", "event_loop_utilization"],
287+
intervalMs: 2_000,
288+
delayP99Ms: 42.1,
289+
delayMaxMs: 88.7,
290+
utilization: 0.991,
291+
cpuCoreRatio: 0.973,
292+
}),
293+
});
294+
295+
expect(readiness()).toEqual({
296+
ready: true,
297+
failing: [],
298+
uptimeMs: 300_000,
299+
eventLoop: {
300+
degraded: true,
301+
reasons: ["cpu", "event_loop_utilization"],
302+
intervalMs: 2_000,
303+
delayP99Ms: 42.1,
304+
delayMaxMs: 88.7,
305+
utilization: 0.991,
306+
cpuCoreRatio: 0.973,
307+
},
308+
});
309+
});
310+
});
276311
});

src/gateway/server/readiness.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@ import {
77
type ChannelHealthEvaluation,
88
} from "../channel-health-policy.js";
99
import type { ChannelManager } from "../server-channels.js";
10+
import type { GatewayEventLoopHealth } from "./event-loop-health.js";
1011

1112
export type ReadinessResult = {
1213
ready: boolean;
1314
failing: string[];
1415
uptimeMs: number;
16+
eventLoop?: GatewayEventLoopHealth;
1517
};
1618

1719
export type ReadinessChecker = () => ReadinessResult;
@@ -35,6 +37,7 @@ export function createReadinessChecker(deps: {
3537
channelManager: ChannelManager;
3638
startedAt: number;
3739
getStartupPending?: () => boolean;
40+
getEventLoopHealth?: () => GatewayEventLoopHealth | undefined;
3841
cacheTtlMs?: number;
3942
}): ReadinessChecker {
4043
const { channelManager, startedAt } = deps;
@@ -46,10 +49,13 @@ export function createReadinessChecker(deps: {
4649
const now = Date.now();
4750
const uptimeMs = now - startedAt;
4851
if (deps.getStartupPending?.()) {
49-
return { ready: false, failing: ["startup-sidecars"], uptimeMs };
52+
return withEventLoopHealth(
53+
{ ready: false, failing: ["startup-sidecars"], uptimeMs },
54+
deps.getEventLoopHealth,
55+
);
5056
}
5157
if (cachedState && now - cachedAt < cacheTtlMs) {
52-
return { ...cachedState, uptimeMs };
58+
return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth);
5359
}
5460

5561
const snapshot = channelManager.getRuntimeSnapshot();
@@ -79,6 +85,14 @@ export function createReadinessChecker(deps: {
7985

8086
cachedAt = now;
8187
cachedState = { ready: failing.length === 0, failing };
82-
return { ...cachedState, uptimeMs };
88+
return withEventLoopHealth({ ...cachedState, uptimeMs }, deps.getEventLoopHealth);
8389
};
8490
}
91+
92+
/**
 * Attaches an `eventLoop` block to a readiness result when a health provider
 * is configured and currently yields a sample; otherwise returns the result
 * unchanged.
 */
function withEventLoopHealth(
  result: ReadinessResult,
  getEventLoopHealth?: () => GatewayEventLoopHealth | undefined,
): ReadinessResult {
  if (!getEventLoopHealth) {
    return result;
  }
  const sample = getEventLoopHealth();
  if (!sample) {
    return result;
  }
  return { ...result, eventLoop: sample };
}

0 commit comments

Comments
 (0)