Skip to content

Commit 097eed8

Browse files
committed
feat: emit diagnostics timeline
1 parent db69510 commit 097eed8

7 files changed

Lines changed: 604 additions & 18 deletions

File tree

docs/cli/gateway.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ Inline `--password` can be exposed in local process listings. Prefer `--password
112112
### Startup profiling
113113

114114
- Set `OPENCLAW_GATEWAY_STARTUP_TRACE=1` to log phase timings during Gateway startup, including per-phase `eventLoopMax` delay and plugin lookup-table timings for installed-index, manifest registry, startup planning, and owner-map work.
115+
- Set `OPENCLAW_DIAGNOSTICS=1` with `OPENCLAW_DIAGNOSTICS_TIMELINE_PATH=<path>` to write a best-effort JSONL startup diagnostics timeline for external QA harnesses. Add `OPENCLAW_DIAGNOSTICS_EVENT_LOOP=1` to include event-loop samples.
115116
- Run `pnpm test:startup:gateway -- --runs 5 --warmup 1` to benchmark Gateway startup. The benchmark records first process output, `/healthz`, `/readyz`, startup trace timings, event-loop delay, and plugin lookup-table timing details.
116117

117118
## Query a running Gateway

src/gateway/server-startup-plugins.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { initSubagentRegistry } from "../agents/subagent-registry.js";
33
import { runChannelPluginStartupMaintenance } from "../channels/plugins/lifecycle-startup.js";
44
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
55
import type { OpenClawConfig } from "../config/types.openclaw.js";
6+
import { measureDiagnosticsTimelineSpan } from "../infra/diagnostics-timeline.js";
67
import { resolveOpenClawPackageRootSync } from "../infra/openclaw-root.js";
78
import {
89
pruneUnknownBundledRuntimeDepsRoots,
@@ -43,6 +44,23 @@ async function prestageGatewayBundledRuntimeDeps(params: {
4344
cfg: OpenClawConfig;
4445
pluginIds: readonly string[];
4546
log: GatewayPluginBootstrapLog;
47+
}): Promise<void> {
48+
await measureDiagnosticsTimelineSpan(
49+
"runtimeDeps.stage",
50+
() => prestageGatewayBundledRuntimeDepsImpl(params),
51+
{
52+
phase: "startup",
53+
attributes: {
54+
pluginCount: params.pluginIds.length,
55+
},
56+
},
57+
);
58+
}
59+
60+
async function prestageGatewayBundledRuntimeDepsImpl(params: {
61+
cfg: OpenClawConfig;
62+
pluginIds: readonly string[];
63+
log: GatewayPluginBootstrapLog;
4664
}): Promise<void> {
4765
if (params.pluginIds.length === 0) {
4866
return;

src/gateway/server.impl.ts

Lines changed: 122 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { monitorEventLoopDelay } from "node:perf_hooks";
1+
import { monitorEventLoopDelay, performance } from "node:perf_hooks";
22
import { getActiveEmbeddedRunCount } from "../agents/pi-embedded-runner/run-state.js";
33
import { getTotalPendingReplies } from "../auto-reply/reply/dispatcher-registry.js";
44
import type { CanvasHostServer } from "../canvas-host/server.js";
@@ -24,6 +24,10 @@ import {
2424
isDiagnosticsEnabled,
2525
setDiagnosticsEnabledForProcess,
2626
} from "../infra/diagnostic-events.js";
27+
import {
28+
emitDiagnosticsTimelineEvent,
29+
isDiagnosticsTimelineEnabled,
30+
} from "../infra/diagnostics-timeline.js";
2731
import { isTruthyEnvValue, isVitestRuntimeEnv, logAcceptedEnvOption } from "../infra/env.js";
2832
import { ensureOpenClawCliOnPath } from "../infra/path-env.js";
2933
import { setGatewaySigusr1RestartPolicy, setPreRestartDeferralCheck } from "../infra/restart.js";
@@ -164,30 +168,80 @@ const gatewayRuntime = runtimeForLogger(log);
164168
const canvasRuntime = runtimeForLogger(logCanvas);
165169

166170
function createGatewayStartupTrace() {
167-
const enabled = isTruthyEnvValue(process.env.OPENCLAW_GATEWAY_STARTUP_TRACE);
168-
const eventLoopDelay = enabled ? monitorEventLoopDelay({ resolution: 10 }) : undefined;
171+
const logEnabled = isTruthyEnvValue(process.env.OPENCLAW_GATEWAY_STARTUP_TRACE);
172+
const timelineEnabled = isDiagnosticsTimelineEnabled();
173+
const eventLoopTimelineEnabled =
174+
timelineEnabled && isTruthyEnvValue(process.env.OPENCLAW_DIAGNOSTICS_EVENT_LOOP);
175+
const eventLoopDelay =
176+
logEnabled || eventLoopTimelineEnabled ? monitorEventLoopDelay({ resolution: 10 }) : undefined;
169177
eventLoopDelay?.enable();
170178
const started = performance.now();
171179
let last = started;
180+
let spanSequence = 0;
172181
const formatMetric = (key: string, value: number | string) =>
173182
`${key}=${typeof value === "number" ? value.toFixed(1) : value}`;
174-
const readEventLoopMaxMs = () => {
183+
const mapTimelineName = (name: string) => {
184+
switch (name) {
185+
case "config.snapshot":
186+
return "config.load";
187+
case "config.auth":
188+
case "config.final-snapshot":
189+
case "runtime.config":
190+
return "config.normalize";
191+
case "plugins.bootstrap":
192+
return "plugins.load";
193+
case "runtime.post-attach":
194+
case "ready":
195+
return "gateway.ready";
196+
default:
197+
return name;
198+
}
199+
};
200+
const takeEventLoopSample = () => {
175201
if (!eventLoopDelay) {
176-
return 0;
202+
return undefined;
177203
}
178-
const maxMs = eventLoopDelay.max / 1_000_000;
204+
const sample = {
205+
p50Ms: eventLoopDelay.percentile(50) / 1_000_000,
206+
p95Ms: eventLoopDelay.percentile(95) / 1_000_000,
207+
p99Ms: eventLoopDelay.percentile(99) / 1_000_000,
208+
maxMs: eventLoopDelay.max / 1_000_000,
209+
};
179210
eventLoopDelay.reset();
180-
return maxMs;
211+
return sample;
212+
};
213+
const emitEventLoopTimelineSample = (
214+
activeSpanName: string,
215+
sample: ReturnType<typeof takeEventLoopSample>,
216+
) => {
217+
if (!eventLoopTimelineEnabled) {
218+
return;
219+
}
220+
if (!sample) {
221+
return;
222+
}
223+
emitDiagnosticsTimelineEvent({
224+
type: "eventLoop.sample",
225+
name: "eventLoop",
226+
phase: "startup",
227+
activeSpanName: mapTimelineName(activeSpanName),
228+
attributes:
229+
activeSpanName === mapTimelineName(activeSpanName)
230+
? undefined
231+
: { traceName: activeSpanName },
232+
...sample,
233+
});
181234
};
182235
const emit = (
183236
name: string,
184237
durationMs: number,
185238
totalMs: number,
239+
eventLoopSample: ReturnType<typeof takeEventLoopSample>,
186240
extras: ReadonlyArray<readonly [string, number | string]> = [],
187241
) => {
188-
if (enabled) {
242+
if (logEnabled) {
189243
const metrics = [
190-
`eventLoopMax=${readEventLoopMaxMs().toFixed(1)}ms`,
244+
`eventLoopMax=${(eventLoopSample?.maxMs ?? 0).toFixed(1)}ms`,
191245
...extras.map(([key, value]) => formatMetric(key, value)),
192246
].join(" ");
193247
log.info(
@@ -198,27 +252,78 @@ function createGatewayStartupTrace() {
198252
return {
199253
mark(name: string) {
200254
const now = performance.now();
201-
emit(name, now - last, now - started);
255+
const eventLoopSample = takeEventLoopSample();
256+
emit(name, now - last, now - started, eventLoopSample);
257+
emitDiagnosticsTimelineEvent({
258+
type: "mark",
259+
name: mapTimelineName(name),
260+
phase: "startup",
261+
durationMs: now - started,
262+
attributes: name === mapTimelineName(name) ? undefined : { traceName: name },
263+
});
264+
emitEventLoopTimelineSample(name, eventLoopSample);
202265
last = now;
203266
if (name === "ready") {
204267
eventLoopDelay?.disable();
205268
}
206269
},
207270
detail(name: string, metrics: ReadonlyArray<readonly [string, number | string]>) {
208-
if (!enabled) {
209-
return;
271+
const attributes = Object.fromEntries(metrics);
272+
if (logEnabled) {
273+
log.info(
274+
`startup trace: ${name} ${metrics.map(([key, value]) => formatMetric(key, value)).join(" ")}`,
275+
);
210276
}
211-
log.info(
212-
`startup trace: ${name} ${metrics.map(([key, value]) => formatMetric(key, value)).join(" ")}`,
213-
);
277+
emitDiagnosticsTimelineEvent({
278+
type: "mark",
279+
name: mapTimelineName(name),
280+
phase: "startup",
281+
attributes: {
282+
traceName: name,
283+
...attributes,
284+
},
285+
});
214286
},
215287
async measure<T>(name: string, run: () => Promise<T> | T): Promise<T> {
216288
const before = performance.now();
289+
const spanId = `gateway-startup-${++spanSequence}`;
290+
emitDiagnosticsTimelineEvent({
291+
type: "span.start",
292+
name: mapTimelineName(name),
293+
phase: "startup",
294+
spanId,
295+
attributes: name === mapTimelineName(name) ? undefined : { traceName: name },
296+
});
217297
try {
218-
return await run();
298+
const result = await run();
299+
const now = performance.now();
300+
emitDiagnosticsTimelineEvent({
301+
type: "span.end",
302+
name: mapTimelineName(name),
303+
phase: "startup",
304+
spanId,
305+
durationMs: now - before,
306+
attributes: name === mapTimelineName(name) ? undefined : { traceName: name },
307+
});
308+
return result;
309+
} catch (error) {
310+
const now = performance.now();
311+
emitDiagnosticsTimelineEvent({
312+
type: "span.error",
313+
name: mapTimelineName(name),
314+
phase: "startup",
315+
spanId,
316+
durationMs: now - before,
317+
attributes: name === mapTimelineName(name) ? undefined : { traceName: name },
318+
errorName: error instanceof Error ? error.name : typeof error,
319+
errorMessage: error instanceof Error ? error.message : String(error),
320+
});
321+
throw error;
219322
} finally {
220323
const now = performance.now();
221-
emit(name, now - before, now - started);
324+
const eventLoopSample = takeEventLoopSample();
325+
emit(name, now - before, now - started, eventLoopSample);
326+
emitEventLoopTimelineSample(name, eventLoopSample);
222327
last = now;
223328
}
224329
},
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
import { mkdtemp, readFile, rm } from "node:fs/promises";
2+
import { tmpdir } from "node:os";
3+
import { join } from "node:path";
4+
import { afterEach, describe, expect, it } from "vitest";
5+
import {
6+
emitDiagnosticsTimelineEvent,
7+
flushDiagnosticsTimelineForTest,
8+
isDiagnosticsTimelineEnabled,
9+
measureDiagnosticsTimelineSpan,
10+
measureDiagnosticsTimelineSpanSync,
11+
} from "./diagnostics-timeline.js";
12+
13+
const tempDirs: string[] = [];
14+
15+
async function createTimelineEnv() {
16+
const dir = await mkdtemp(join(tmpdir(), "openclaw-diagnostics-timeline-"));
17+
tempDirs.push(dir);
18+
return {
19+
env: {
20+
OPENCLAW_DIAGNOSTICS: "1",
21+
OPENCLAW_DIAGNOSTICS_RUN_ID: "run-1",
22+
OPENCLAW_DIAGNOSTICS_ENV: "env-1",
23+
OPENCLAW_DIAGNOSTICS_TIMELINE_PATH: join(dir, "nested", "timeline.jsonl"),
24+
} as NodeJS.ProcessEnv,
25+
path: join(dir, "nested", "timeline.jsonl"),
26+
};
27+
}
28+
29+
async function readTimeline(path: string) {
30+
await flushDiagnosticsTimelineForTest();
31+
return (await readFile(path, "utf8"))
32+
.trim()
33+
.split("\n")
34+
.map((line) => JSON.parse(line) as Record<string, unknown>);
35+
}
36+
37+
afterEach(async () => {
38+
await Promise.all(tempDirs.splice(0).map((dir) => rm(dir, { recursive: true, force: true })));
39+
});
40+
41+
describe("diagnostics timeline", () => {
42+
it("detects when timeline output is enabled", async () => {
43+
const { env } = await createTimelineEnv();
44+
45+
expect(isDiagnosticsTimelineEnabled(env)).toBe(true);
46+
expect(isDiagnosticsTimelineEnabled({ ...env, OPENCLAW_DIAGNOSTICS: "0" })).toBe(false);
47+
expect(isDiagnosticsTimelineEnabled({ ...env, OPENCLAW_DIAGNOSTICS_TIMELINE_PATH: "" })).toBe(
48+
false,
49+
);
50+
});
51+
52+
it("writes JSONL diagnostic events with the stable envelope", async () => {
53+
const { env, path } = await createTimelineEnv();
54+
55+
emitDiagnosticsTimelineEvent(
56+
{
57+
type: "mark",
58+
name: "gateway.ready",
59+
phase: "startup",
60+
attributes: {
61+
ok: true,
62+
count: 2,
63+
ignored: Number.NaN,
64+
},
65+
},
66+
env,
67+
);
68+
69+
const [event] = await readTimeline(path);
70+
expect(event).toMatchObject({
71+
schemaVersion: "openclaw.diagnostics.v1",
72+
type: "mark",
73+
name: "gateway.ready",
74+
runId: "run-1",
75+
envName: "env-1",
76+
phase: "startup",
77+
attributes: {
78+
ok: true,
79+
count: 2,
80+
},
81+
});
82+
expect(event?.timestamp).toEqual(expect.any(String));
83+
expect(event?.pid).toEqual(expect.any(Number));
84+
expect((event?.attributes as Record<string, unknown>).ignored).toBeUndefined();
85+
});
86+
87+
it("records span start and end events around successful work", async () => {
88+
const { env, path } = await createTimelineEnv();
89+
90+
await expect(
91+
measureDiagnosticsTimelineSpan("runtimeDeps.stage", () => "ok", {
92+
env,
93+
phase: "startup",
94+
attributes: { pluginCount: 3 },
95+
}),
96+
).resolves.toBe("ok");
97+
98+
const events = await readTimeline(path);
99+
expect(events).toHaveLength(2);
100+
expect(events[0]).toMatchObject({
101+
type: "span.start",
102+
name: "runtimeDeps.stage",
103+
phase: "startup",
104+
attributes: { pluginCount: 3 },
105+
});
106+
expect(events[1]).toMatchObject({
107+
type: "span.end",
108+
name: "runtimeDeps.stage",
109+
phase: "startup",
110+
attributes: { pluginCount: 3 },
111+
});
112+
expect(events[1]?.spanId).toBe(events[0]?.spanId);
113+
expect(events[1]?.durationMs).toEqual(expect.any(Number));
114+
});
115+
116+
it("records span error events and rethrows failures", async () => {
117+
const { env, path } = await createTimelineEnv();
118+
119+
await expect(
120+
measureDiagnosticsTimelineSpan(
121+
"plugins.load",
122+
() => {
123+
throw new TypeError("bad plugin");
124+
},
125+
{ env, phase: "startup" },
126+
),
127+
).rejects.toThrow("bad plugin");
128+
129+
const events = await readTimeline(path);
130+
expect(events).toHaveLength(2);
131+
expect(events[1]).toMatchObject({
132+
type: "span.error",
133+
name: "plugins.load",
134+
phase: "startup",
135+
errorName: "TypeError",
136+
errorMessage: "bad plugin",
137+
});
138+
});
139+
140+
it("records synchronous spans", async () => {
141+
const { env, path } = await createTimelineEnv();
142+
143+
const result = measureDiagnosticsTimelineSpanSync("plugins.metadata.scan", () => 42, {
144+
env,
145+
phase: "startup",
146+
});
147+
148+
expect(result).toBe(42);
149+
const events = await readTimeline(path);
150+
expect(events).toHaveLength(2);
151+
expect(events[0]).toMatchObject({
152+
type: "span.start",
153+
name: "plugins.metadata.scan",
154+
});
155+
expect(events[1]).toMatchObject({
156+
type: "span.end",
157+
name: "plugins.metadata.scan",
158+
});
159+
});
160+
});

0 commit comments

Comments
 (0)