Skip to content

Commit bcdacfa

Browse files
authored
feat(diagnostics): carry trace context through hooks
Pass immutable diagnostic trace contexts through agent and tool hook surfaces, emit model usage with the run trace, and parent OTEL spans/logs from validated trace context without retained global state. Thanks @vincentkoc.
1 parent 33c0cd1 commit bcdacfa

18 files changed

Lines changed: 229 additions & 35 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
88

99
- Diagnostics/OTEL: add a lightweight diagnostic trace-context carrier for future span correlation without adding OTEL SDK state to core. Thanks @vincentkoc.
1010
- Diagnostics/OTEL: attach diagnostic trace context to exported OTEL logs so log records can correlate with future spans without adding retained process state. Thanks @vincentkoc.
11+
- Diagnostics/OTEL: pass immutable per-run diagnostic trace context through agent and tool hook contexts, and parent exported diagnostic spans from validated context without retaining global trace state. Thanks @vincentkoc.
1112
- Control UI/chat: add a Steer action on queued messages so a browser follow-up can be injected into the active run without retyping it.
1213
- Control UI/Talk: add browser WebRTC realtime voice sessions backed by OpenAI Realtime, with Gateway-minted ephemeral client secrets and `openclaw_agent_consult` handoff to the full OpenClaw agent.
1314
- Agents/tools: add optional per-call `timeoutMs` support for image, video, music, and TTS generation tools so agents can extend provider request timeouts only when a specific generation needs it.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
ad7ec565b1702a76a87b1a08904445c9838e10d4d41fb1c58909af886b702d80 plugin-sdk-api-baseline.json
2-
907a07c206dd52ebd910793fab7bca8640c37cf82ff7e7cca88ab1b12b4fbdfe plugin-sdk-api-baseline.jsonl
1+
c0f788d1895ced2ffdad9f82e6afc592171e6651c61c0fc5083f0040437cda6d plugin-sdk-api-baseline.json
2+
70e320157331080b98f9c2acae58e89ad1dc70b48adad265225a7eb76b6ac29f plugin-sdk-api-baseline.jsonl

docs/automation/hooks.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ const handler = async (event) => {
106106
export default handler;
107107
```
108108

109-
Each event includes: `type`, `action`, `sessionKey`, `timestamp`, `messages` (push to send to user), and `context` (event-specific data).
109+
Each event includes: `type`, `action`, `sessionKey`, `timestamp`, `messages` (push to send to user), and `context` (event-specific data). Agent and tool plugin hook contexts can also include `trace`, a read-only W3C-compatible diagnostic trace context that plugins may pass into structured logs for OTEL correlation.
110110

111111
### Event context highlights
112112

extensions/diagnostics-otel/src/service.test.ts

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ const telemetryState = vi.hoisted(() => {
66
const counters = new Map<string, { add: ReturnType<typeof vi.fn> }>();
77
const histograms = new Map<string, { record: ReturnType<typeof vi.fn> }>();
88
const tracer = {
9-
startSpan: vi.fn((_name: string, _opts?: unknown) => ({
9+
startSpan: vi.fn((_name: string, _opts?: unknown, _ctx?: unknown) => ({
1010
end: vi.fn(),
1111
setStatus: vi.fn(),
1212
})),
@@ -384,6 +384,64 @@ describe("diagnostics-otel service", () => {
384384
});
385385
});
386386

387+
test("parents diagnostic event spans from trace context", async () => {
388+
const service = createDiagnosticsOtelService();
389+
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
390+
await service.start(ctx);
391+
392+
emitDiagnosticEvent({
393+
type: "model.usage",
394+
trace: {
395+
traceId: TRACE_ID,
396+
spanId: SPAN_ID,
397+
traceFlags: "01",
398+
},
399+
provider: "openai",
400+
model: "gpt-5.4",
401+
usage: { total: 4 },
402+
durationMs: 12,
403+
});
404+
405+
const modelUsageCall = telemetryState.tracer.startSpan.mock.calls.find(
406+
(call) => call[0] === "openclaw.model.usage",
407+
);
408+
expect(modelUsageCall?.[2]).toEqual({
409+
spanContext: expect.objectContaining({
410+
traceId: TRACE_ID,
411+
spanId: SPAN_ID,
412+
traceFlags: 1,
413+
isRemote: true,
414+
}),
415+
});
416+
await service.stop?.(ctx);
417+
});
418+
419+
test("ignores invalid diagnostic event trace parents", async () => {
420+
const service = createDiagnosticsOtelService();
421+
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true });
422+
await service.start(ctx);
423+
424+
emitDiagnosticEvent({
425+
type: "model.usage",
426+
trace: {
427+
traceId: "0".repeat(32),
428+
spanId: "not-a-span",
429+
traceFlags: "zz",
430+
},
431+
provider: "openai",
432+
model: "gpt-5.4",
433+
usage: { total: 4 },
434+
durationMs: 12,
435+
});
436+
437+
const modelUsageCall = telemetryState.tracer.startSpan.mock.calls.find(
438+
(call) => call[0] === "openclaw.model.usage",
439+
);
440+
expect(telemetryState.tracer.setSpanContext).not.toHaveBeenCalled();
441+
expect(modelUsageCall?.[2]).toBeUndefined();
442+
await service.stop?.(ctx);
443+
});
444+
387445
test("redacts sensitive reason in session.state metric attributes", async () => {
388446
const service = createDiagnosticsOtelService();
389447
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true });

extensions/diagnostics-otel/src/service.ts

Lines changed: 60 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -137,22 +137,36 @@ function traceFlagsToOtel(traceFlags: string | undefined): TraceFlags {
137137
return (parsed & TraceFlags.SAMPLED) !== 0 ? TraceFlags.SAMPLED : TraceFlags.NONE;
138138
}
139139

140+
function contextForTraceContext(traceContext: DiagnosticTraceContext | undefined) {
141+
const normalized = normalizeTraceContext(traceContext);
142+
if (!normalized?.spanId) {
143+
return undefined;
144+
}
145+
return trace.setSpanContext(otelContextApi.active(), {
146+
traceId: normalized.traceId,
147+
spanId: normalized.spanId,
148+
traceFlags: traceFlagsToOtel(normalized.traceFlags),
149+
isRemote: true,
150+
});
151+
}
152+
140153
function addTraceAttributes(
141154
attributes: Record<string, string | number | boolean>,
142155
traceContext: DiagnosticTraceContext | undefined,
143156
): void {
144-
if (!traceContext) {
157+
const normalized = normalizeTraceContext(traceContext);
158+
if (!normalized) {
145159
return;
146160
}
147-
attributes["openclaw.traceId"] = traceContext.traceId;
148-
if (traceContext.spanId) {
149-
attributes["openclaw.spanId"] = traceContext.spanId;
161+
attributes["openclaw.traceId"] = normalized.traceId;
162+
if (normalized.spanId) {
163+
attributes["openclaw.spanId"] = normalized.spanId;
150164
}
151-
if (traceContext.parentSpanId) {
152-
attributes["openclaw.parentSpanId"] = traceContext.parentSpanId;
165+
if (normalized.parentSpanId) {
166+
attributes["openclaw.parentSpanId"] = normalized.parentSpanId;
153167
}
154-
if (traceContext.traceFlags) {
155-
attributes["openclaw.traceFlags"] = traceContext.traceFlags;
168+
if (normalized.traceFlags) {
169+
attributes["openclaw.traceFlags"] = normalized.traceFlags;
156170
}
157171
}
158172

@@ -448,13 +462,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
448462
attributes: redactOtelAttributes(attributes),
449463
timestamp: meta?.date ?? new Date(),
450464
};
451-
if (traceContext?.spanId) {
452-
logRecord.context = trace.setSpanContext(otelContextApi.active(), {
453-
traceId: traceContext.traceId,
454-
spanId: traceContext.spanId,
455-
traceFlags: traceFlagsToOtel(traceContext.traceFlags),
456-
isRemote: true,
457-
});
465+
const logContext = contextForTraceContext(traceContext);
466+
if (logContext) {
467+
logRecord.context = logContext;
458468
}
459469
otelLogger.emit(logRecord);
460470
} catch (err) {
@@ -467,13 +477,19 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
467477
name: string,
468478
attributes: Record<string, string | number>,
469479
durationMs?: number,
480+
traceContext?: DiagnosticTraceContext,
470481
) => {
471482
const startTime =
472483
typeof durationMs === "number" ? Date.now() - Math.max(0, durationMs) : undefined;
473-
const span = tracer.startSpan(name, {
474-
attributes,
475-
...(startTime ? { startTime } : {}),
476-
});
484+
const parentContext = contextForTraceContext(traceContext);
485+
const span = tracer.startSpan(
486+
name,
487+
{
488+
attributes,
489+
...(startTime ? { startTime } : {}),
490+
},
491+
parentContext,
492+
);
477493
return span;
478494
};
479495

@@ -537,7 +553,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
537553
"openclaw.tokens.total": usage.total ?? 0,
538554
};
539555

540-
const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs);
556+
const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs, evt.trace);
541557
span.end();
542558
};
543559

@@ -568,7 +584,12 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
568584
if (evt.chatId !== undefined) {
569585
spanAttrs["openclaw.chatId"] = String(evt.chatId);
570586
}
571-
const span = spanWithDuration("openclaw.webhook.processed", spanAttrs, evt.durationMs);
587+
const span = spanWithDuration(
588+
"openclaw.webhook.processed",
589+
spanAttrs,
590+
evt.durationMs,
591+
evt.trace,
592+
);
572593
span.end();
573594
};
574595

@@ -591,9 +612,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
591612
if (evt.chatId !== undefined) {
592613
spanAttrs["openclaw.chatId"] = String(evt.chatId);
593614
}
594-
const span = tracer.startSpan("openclaw.webhook.error", {
595-
attributes: spanAttrs,
596-
});
615+
const span = tracer.startSpan(
616+
"openclaw.webhook.error",
617+
{
618+
attributes: spanAttrs,
619+
},
620+
contextForTraceContext(evt.trace),
621+
);
597622
span.setStatus({ code: SpanStatusCode.ERROR, message: redactedError });
598623
span.end();
599624
};
@@ -648,7 +673,12 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
648673
if (evt.reason) {
649674
spanAttrs["openclaw.reason"] = redactSensitiveText(evt.reason);
650675
}
651-
const span = spanWithDuration("openclaw.message.processed", spanAttrs, evt.durationMs);
676+
const span = spanWithDuration(
677+
"openclaw.message.processed",
678+
spanAttrs,
679+
evt.durationMs,
680+
evt.trace,
681+
);
652682
if (evt.outcome === "error" && evt.error) {
653683
span.setStatus({ code: SpanStatusCode.ERROR, message: redactSensitiveText(evt.error) });
654684
}
@@ -699,7 +729,11 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
699729
addSessionIdentityAttrs(spanAttrs, evt);
700730
spanAttrs["openclaw.queueDepth"] = evt.queueDepth ?? 0;
701731
spanAttrs["openclaw.ageMs"] = evt.ageMs;
702-
const span = tracer.startSpan("openclaw.session.stuck", { attributes: spanAttrs });
732+
const span = tracer.startSpan(
733+
"openclaw.session.stuck",
734+
{ attributes: spanAttrs },
735+
contextForTraceContext(evt.trace),
736+
);
703737
span.setStatus({ code: SpanStatusCode.ERROR, message: "session stuck" });
704738
span.end();
705739
};

src/agents/pi-embedded-runner/run.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { ensureContextEnginesInitialized } from "../../context-engine/init.js";
77
import { resolveContextEngine } from "../../context-engine/registry.js";
88
import { emitAgentPlanEvent } from "../../infra/agent-events.js";
99
import { sleepWithAbort } from "../../infra/backoff.js";
10+
import { freezeDiagnosticTraceContext } from "../../infra/diagnostic-trace-context.js";
1011
import { formatErrorMessage } from "../../infra/errors.js";
1112
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
1213
import { enqueueCommandInLane } from "../../process/command-queue.js";
@@ -2134,6 +2135,9 @@ export async function runEmbeddedPiAgent(
21342135
});
21352136
return {
21362137
payloads: payloadsWithToolMedia?.length ? payloadsWithToolMedia : undefined,
2138+
...(attempt.diagnosticTrace
2139+
? { diagnosticTrace: freezeDiagnosticTraceContext(attempt.diagnosticTrace) }
2140+
: {}),
21372141
meta: {
21382142
durationMs: Date.now() - started,
21392143
agentMeta,
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
1+
import {
2+
freezeDiagnosticTraceContext,
3+
type DiagnosticTraceContext,
4+
} from "../../../infra/diagnostic-trace-context.js";
15
import type { EmbeddedRunTrigger } from "./params.js";
26

37
export function buildEmbeddedAttemptToolRunContext(params: {
48
trigger?: EmbeddedRunTrigger;
59
memoryFlushWritePath?: string;
10+
trace?: DiagnosticTraceContext;
611
}): {
712
trigger?: EmbeddedRunTrigger;
813
memoryFlushWritePath?: string;
14+
trace?: DiagnosticTraceContext;
915
} {
1016
return {
1117
trigger: params.trigger,
1218
memoryFlushWritePath: params.memoryFlushWritePath,
19+
...(params.trace ? { trace: freezeDiagnosticTraceContext(params.trace) } : {}),
1320
};
1421
}

src/agents/pi-embedded-runner/run/attempt.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ import {
99
} from "@mariozechner/pi-coding-agent";
1010
import { filterHeartbeatPairs } from "../../../auto-reply/heartbeat-filter.js";
1111
import { resolveChannelCapabilities } from "../../../config/channel-capabilities.js";
12+
import {
13+
createDiagnosticTraceContext,
14+
freezeDiagnosticTraceContext,
15+
} from "../../../infra/diagnostic-trace-context.js";
1216
import { isEmbeddedMode } from "../../../infra/embedded-mode.js";
1317
import { formatErrorMessage } from "../../../infra/errors.js";
1418
import { resolveHeartbeatSummaryForAgent } from "../../../infra/heartbeat-summary.js";
@@ -504,12 +508,13 @@ export async function runEmbeddedAttempt(
504508
const sessionLabel = params.sessionKey ?? params.sessionId;
505509
const contextInjectionMode = resolveContextInjectionMode(params.config);
506510
const agentDir = params.agentDir ?? resolveOpenClawAgentDir();
511+
const diagnosticTrace = freezeDiagnosticTraceContext(createDiagnosticTraceContext());
507512
const toolsRaw = params.disableTools
508513
? []
509514
: (() => {
510515
const allTools = createOpenClawCodingTools({
511516
agentId: sessionAgentId,
512-
...buildEmbeddedAttemptToolRunContext(params),
517+
...buildEmbeddedAttemptToolRunContext({ ...params, trace: diagnosticTrace }),
513518
exec: {
514519
...params.execOverrides,
515520
elevated: params.bashElevated,
@@ -1942,6 +1947,7 @@ export async function runEmbeddedAttempt(
19421947
}
19431948
const hookCtx = {
19441949
runId: params.runId,
1950+
trace: freezeDiagnosticTraceContext(diagnosticTrace),
19451951
agentId: hookAgentId,
19461952
sessionKey: params.sessionKey,
19471953
sessionId: params.sessionId,
@@ -2173,6 +2179,7 @@ export async function runEmbeddedAttempt(
21732179
},
21742180
{
21752181
runId: params.runId,
2182+
trace: freezeDiagnosticTraceContext(diagnosticTrace),
21762183
agentId: hookAgentId,
21772184
sessionKey: params.sessionKey,
21782185
sessionId: params.sessionId,
@@ -2580,6 +2587,7 @@ export async function runEmbeddedAttempt(
25802587
},
25812588
{
25822589
runId: params.runId,
2590+
trace: freezeDiagnosticTraceContext(diagnosticTrace),
25832591
agentId: hookAgentId,
25842592
sessionKey: params.sessionKey,
25852593
sessionId: params.sessionId,
@@ -2681,6 +2689,7 @@ export async function runEmbeddedAttempt(
26812689
},
26822690
{
26832691
runId: params.runId,
2692+
trace: freezeDiagnosticTraceContext(diagnosticTrace),
26842693
agentId: hookAgentId,
26852694
sessionKey: params.sessionKey,
26862695
sessionId: params.sessionId,
@@ -2768,6 +2777,7 @@ export async function runEmbeddedAttempt(
27682777
promptErrorSource,
27692778
preflightRecovery,
27702779
sessionIdUsed,
2780+
diagnosticTrace,
27712781
bootstrapPromptWarningSignaturesSeen: bootstrapPromptWarning.warningSignaturesSeen,
27722782
bootstrapPromptWarningSignature: bootstrapPromptWarning.signature,
27732783
systemPromptReport,

src/agents/pi-embedded-runner/run/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import type { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent";
44
import type { ThinkLevel } from "../../../auto-reply/thinking.js";
55
import type { SessionSystemPromptReport } from "../../../config/sessions/types.js";
66
import type { ContextEngine, ContextEnginePromptCacheInfo } from "../../../context-engine/types.js";
7+
import type { DiagnosticTraceContext } from "../../../infra/diagnostic-trace-context.js";
78
import type { PluginHookBeforeAgentStartResult } from "../../../plugins/hook-before-agent-start.types.js";
89
import type { MessagingToolSend } from "../../pi-embedded-messaging.types.js";
910
import type { ToolErrorSummary } from "../../tool-error-summary.js";
@@ -72,6 +73,7 @@ export type EmbeddedRunAttemptResult = {
7273
handled?: false;
7374
};
7475
sessionIdUsed: string;
76+
diagnosticTrace?: DiagnosticTraceContext;
7577
agentHarnessId?: string;
7678
bootstrapPromptWarningSignaturesSeen?: string[];
7779
bootstrapPromptWarningSignature?: string;

src/agents/pi-embedded-runner/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import type { CliSessionBinding, SessionSystemPromptReport } from "../../config/sessions/types.js";
2+
import type { DiagnosticTraceContext } from "../../infra/diagnostic-trace-context.js";
23
import type { MessagingToolSend } from "../pi-embedded-messaging.types.js";
34

45
export type EmbeddedPiAgentMeta = {
@@ -141,6 +142,7 @@ export type EmbeddedPiRunResult = {
141142
audioAsVoice?: boolean;
142143
}>;
143144
meta: EmbeddedPiRunMeta;
145+
diagnosticTrace?: DiagnosticTraceContext;
144146
// True if a messaging tool successfully sent a message.
145147
// Used to suppress agent's confirmation text.
146148
didSendViaMessagingTool?: boolean;

0 commit comments

Comments
 (0)