Skip to content

Commit 5d81c29

Browse files
authored
fix: reconcile subagent wait timeouts
Fixes #82787 by keeping session-backed parent subagent runs active when agent.wait only hits a poll timeout before the child session settles. Refactors terminal session-store reconciliation into a shared helper and rejects stale terminal rows from reused child sessions.

Verification:
- CodexReview clean
- pnpm test src/agents/subagent-registry.test.ts src/agents/subagent-registry.lifecycle-retry-grace.e2e.test.ts src/agents/openclaw-tools.subagents.sessions-spawn.lifecycle.test.ts -- --reporter=dot
- git diff --check
- pnpm check:changed via Blacksmith Testbox tbx_01krt1rxpkb7vj53mkaqwfserq
- GitHub CI/CodeQL/OpenGrep/Workflow Sanity green; proof gate covered by the maintainer `proof: override` label
1 parent 06e85d5 commit 5d81c29

7 files changed

Lines changed: 425 additions & 117 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
88

99
- Agents/diagnostics: split slow embedded-run `attempt-dispatch` startup summaries into workspace, prompt, runtime-plan, and final dispatch subspans so traces identify the delayed setup phase. Fixes #82782. (#82783) Thanks @galiniliev.
1010
- CLI/media: accept HTTP(S) URLs in `openclaw infer image describe --file`, fetching remote images through the guarded media path instead of treating URLs as local files. Fixes #82837. (#82854) Thanks @neeravmakwana.
11+
- Agents/subagents: keep session-backed parent runs active when the child wait call times out before the child session has actually settled, so late subagent completions are reconciled instead of being lost. Fixes #82787. Thanks @ramitrkar-hash.
1112
- Agents/subagents: route group/channel subagent completions through message-tool-only handoffs when required and keep active-requester wake failures from dropping completion delivery. Fixes #82803. Thanks @galiniliev, @yozakura-ava, and @moeedahmed.
1213
- Memory-core: scan persisted memory source sessions on startup, comparing on-disk transcripts against the index and marking only missing/newer/resized files dirty for incremental sync. Fixes #82341. (#82341) Thanks @giodl73-repo.
1314
- Telegram: keep the top-level default account in the account list when named accounts or bindings are added alongside top-level credentials, preserving default polling while still letting named-only configs resolve to a single account. Fixes #82794. (#82794) Thanks @giodl73-repo.

src/agents/openclaw-tools.subagents.sessions-spawn.lifecycle.test.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,11 +481,12 @@ describe("openclaw-tools: subagents (sessions_spawn lifecycle)", () => {
481481
expect(deletedKey?.startsWith("agent:main:subagent:")).toBe(true);
482482
});
483483

484-
it("sessions_spawn records timeout when agent.wait returns timeout", async () => {
484+
it("sessions_spawn records timeout when agent.wait returns timeout and child session is terminal", async () => {
485485
const ctx = setupSessionsSpawnGatewayMock({
486486
includeChatHistory: true,
487487
chatHistoryText: "still working",
488488
agentWaitResult: { status: "timeout", startedAt: 6000, endedAt: 7000 },
489+
subagentSessionEntryPatch: { status: "timeout", endedAt: 7000 },
489490
});
490491

491492
const tool = await getDiscordGroupSpawnTool();

src/agents/openclaw-tools.subagents.sessions-spawn.test-harness.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@ type SubagentSpawnTesting = (typeof import("./subagent-spawn.js"))["__testing"];
1616
type CreateOpenClawToolsOpts = Parameters<CreateSessionsSpawnTool>[0];
1717
type GatewayRequest = { method?: string; params?: unknown; timeoutMs?: number };
1818
type AgentWaitCall = { runId?: string; timeoutMs?: number };
19+
type TestSessionEntry = {
20+
sessionId: string;
21+
updatedAt: number;
22+
startedAt?: number;
23+
endedAt?: number;
24+
status?: "running" | "done" | "failed" | "killed" | "timeout";
25+
};
1926
type SessionsSpawnGatewayMockOptions = {
2027
includeSessionsList?: boolean;
2128
includeChatHistory?: boolean;
@@ -24,6 +31,7 @@ type SessionsSpawnGatewayMockOptions = {
2431
onSessionsPatch?: (params: unknown) => void;
2532
onSessionsDelete?: (params: unknown) => void;
2633
agentWaitResult?: { status: "ok" | "timeout"; startedAt: number; endedAt: number };
34+
subagentSessionEntryPatch?: Partial<TestSessionEntry>;
2735
};
2836
type EventWaiter = {
2937
label: string;
@@ -35,7 +43,7 @@ type EventWaiter = {
3543

3644
const hoisted = vi.hoisted(() => {
3745
const callGatewayMock = vi.fn();
38-
const sessionStore: Record<string, { sessionId: string; updatedAt: number }> = {};
46+
const sessionStore: Record<string, TestSessionEntry> = {};
3947
let nextRunId = 0;
4048
const defaultConfigOverride = {
4149
session: {
@@ -269,6 +277,7 @@ export function setupSessionsSpawnGatewayMock(setupOpts: SessionsSpawnGatewayMoc
269277
hoisted.sessionStore[childSessionKey] = {
270278
sessionId: `sess-${childSessionKey}`,
271279
updatedAt: Date.now(),
280+
...setupOpts.subagentSessionEntryPatch,
272281
};
273282
}
274283
setupOpts.onAgentSubagentSpawn?.(params);
@@ -347,6 +356,8 @@ vi.mock("../config/sessions.js", () => ({
347356
...existing,
348357
...patch,
349358
}),
359+
resolveAgentIdFromSessionKey: (sessionKey: string) =>
360+
sessionKey.match(/^agent:([^:]+)/)?.[1] ?? "main",
350361
resolveAgentMainSessionKey: (params: {
351362
cfg?: { session?: { mainKey?: string } };
352363
agentId: string;

src/agents/subagent-registry-run-manager.ts

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import {
2828
safeRemoveAttachmentsDir,
2929
} from "./subagent-registry-helpers.js";
3030
import type { SubagentRunRecord } from "./subagent-registry.types.js";
31+
import type { SubagentSessionCompletion } from "./subagent-session-reconciliation.js";
3132

3233
const log = createSubsystemLogger("agents/subagent-registry");
3334
const RECOVERABLE_WAIT_RETRY_DELAY_MS = process.env.OPENCLAW_TEST_FAST === "1" ? 25 : 5_000;
@@ -123,6 +124,11 @@ export function createSubagentRunManager(params: {
123124
clearPendingLifecycleError(runId: string): void;
124125
resolveSubagentWaitTimeoutMs(cfg: OpenClawConfig, runTimeoutSeconds?: number): number;
125126
scheduleOrphanRecovery(args?: { delayMs?: number; maxRetries?: number }): void;
127+
resolveSubagentSessionCompletion(args: {
128+
childSessionKey: string;
129+
fallbackEndedAt: number;
130+
notBeforeMs?: number;
131+
}): SubagentSessionCompletion | null;
126132
notifyContextEngineSubagentEnded(args: {
127133
childSessionKey: string;
128134
reason: "completed" | "deleted" | "released";
@@ -151,6 +157,22 @@ export function createSubagentRunManager(params: {
151157
expectedEntry?: SubagentRunRecord,
152158
) => {
153159
let completionForRetry: Parameters<typeof params.completeSubagentRun>[0] | undefined;
160+
const scheduleWaitRetry = (entry: SubagentRunRecord, reason: string, error?: string) => {
161+
params.scheduleOrphanRecovery({ delayMs: 1_000 });
162+
const scheduledEntry = entry;
163+
setTimeout(() => {
164+
const current = params.runs.get(runId);
165+
if (!current || current !== scheduledEntry || typeof current.endedAt === "number") {
166+
return;
167+
}
168+
void waitForSubagentCompletion(runId, waitTimeoutMs, scheduledEntry);
169+
}, RECOVERABLE_WAIT_RETRY_DELAY_MS).unref?.();
170+
log.info(reason, {
171+
runId,
172+
childSessionKey: entry.childSessionKey,
173+
...(error ? { error } : {}),
174+
});
175+
};
154176
try {
155177
const wait = await waitForAgentRun({
156178
runId,
@@ -177,23 +199,49 @@ export function createSubagentRunManager(params: {
177199
return;
178200
}
179201
if (wait.status === "error" && isRecoverableAgentWaitError(wait.error)) {
180-
log.info("subagent wait interrupted; scheduling recovery", {
181-
runId,
182-
childSessionKey: expectedEntry?.childSessionKey ?? entry?.childSessionKey,
183-
error: wait.error,
202+
scheduleWaitRetry(entry, "subagent wait interrupted; scheduling recovery", wait.error);
203+
return;
204+
}
205+
if (wait.status === "timeout") {
206+
const isTerminalWaitTimeout =
207+
typeof wait.endedAt === "number" ||
208+
typeof wait.stopReason === "string" ||
209+
typeof wait.livenessState === "string";
210+
const completion = params.resolveSubagentSessionCompletion({
211+
childSessionKey: entry.childSessionKey,
212+
fallbackEndedAt: typeof wait.endedAt === "number" ? wait.endedAt : Date.now(),
213+
notBeforeMs: entry.startedAt ?? entry.createdAt,
184214
});
185-
params.scheduleOrphanRecovery({ delayMs: 1_000 });
186-
const scheduledEntry = entry;
187-
setTimeout(() => {
188-
if (!scheduledEntry) {
189-
return;
190-
}
191-
const current = params.runs.get(runId);
192-
if (!current || current !== scheduledEntry || typeof current.endedAt === "number") {
193-
return;
194-
}
195-
void waitForSubagentCompletion(runId, waitTimeoutMs, scheduledEntry);
196-
}, RECOVERABLE_WAIT_RETRY_DELAY_MS).unref?.();
215+
if (completion) {
216+
completionForRetry = {
217+
runId,
218+
endedAt: completion.endedAt,
219+
outcome: completion.outcome,
220+
reason: completion.reason,
221+
sendFarewell: true,
222+
accountId: entry.requesterOrigin?.accountId,
223+
triggerCleanup: true,
224+
};
225+
await params.completeSubagentRun(completionForRetry);
226+
return;
227+
}
228+
if (isTerminalWaitTimeout) {
229+
completionForRetry = {
230+
runId,
231+
endedAt: wait.endedAt,
232+
outcome: { status: "timeout" },
233+
reason: SUBAGENT_ENDED_REASON_COMPLETE,
234+
sendFarewell: true,
235+
accountId: entry.requesterOrigin?.accountId,
236+
triggerCleanup: true,
237+
};
238+
await params.completeSubagentRun(completionForRetry);
239+
return;
240+
}
241+
scheduleWaitRetry(
242+
entry,
243+
"subagent wait timed out; deferring terminal state until session reconciliation",
244+
);
197245
return;
198246
}
199247
let mutated = false;
@@ -214,11 +262,7 @@ export function createSubagentRunManager(params: {
214262
}
215263
const waitError = typeof wait.error === "string" ? wait.error : undefined;
216264
const baseOutcome: SubagentRunOutcome =
217-
wait.status === "error"
218-
? { status: "error", error: waitError }
219-
: wait.status === "timeout"
220-
? { status: "timeout" }
221-
: { status: "ok" };
265+
wait.status === "error" ? { status: "error", error: waitError } : { status: "ok" };
222266
const outcome = withSubagentOutcomeTiming(baseOutcome, {
223267
startedAt: entry.startedAt,
224268
endedAt: entry.endedAt,

src/agents/subagent-registry.test.ts

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,177 @@ describe("subagent registry seam flow", () => {
329329
expect(run?.outcome).toBeUndefined();
330330
});
331331

332+
it("keeps parent run active when agent.wait times out before child session settles", async () => {
333+
let waitAttempts = 0;
334+
let resolveSecondWait: (value: {
335+
status: "ok";
336+
startedAt: number;
337+
endedAt: number;
338+
}) => void = () => {};
339+
const secondWait = new Promise<{ status: "ok"; startedAt: number; endedAt: number }>(
340+
(resolve) => {
341+
resolveSecondWait = resolve;
342+
},
343+
);
344+
mocks.callGateway.mockImplementation(async (request: { method?: string }) => {
345+
if (request.method === "agent.wait") {
346+
waitAttempts += 1;
347+
if (waitAttempts === 1) {
348+
return { status: "timeout" };
349+
}
350+
return secondWait;
351+
}
352+
return {};
353+
});
354+
mocks.loadSessionStore.mockReturnValue({
355+
"agent:main:subagent:child": {
356+
sessionId: "sess-child",
357+
updatedAt: 1,
358+
status: "running",
359+
},
360+
});
361+
362+
mod.registerSubagentRun({
363+
runId: "run-waiter-timeout",
364+
childSessionKey: "agent:main:subagent:child",
365+
requesterSessionKey: "agent:main:main",
366+
requesterDisplayKey: "main",
367+
task: "eventually complete",
368+
cleanup: "keep",
369+
});
370+
371+
await waitForFast(() => {
372+
expect(waitAttempts).toBeGreaterThanOrEqual(1);
373+
});
374+
await waitForFast(() => {
375+
expect(waitAttempts).toBeGreaterThanOrEqual(2);
376+
});
377+
const activeRun = mod
378+
.listSubagentRunsForRequester("agent:main:main")
379+
.find((entry) => entry.runId === "run-waiter-timeout");
380+
expect(activeRun?.endedAt).toBeUndefined();
381+
expect(activeRun?.outcome).toBeUndefined();
382+
383+
resolveSecondWait({
384+
status: "ok",
385+
startedAt: 111,
386+
endedAt: 222,
387+
});
388+
await waitForFast(() => {
389+
const completedRun = mod
390+
.listSubagentRunsForRequester("agent:main:main")
391+
.find((entry) => entry.runId === "run-waiter-timeout");
392+
expect(waitAttempts).toBeGreaterThanOrEqual(2);
393+
expect(completedRun?.endedAt).toBe(222);
394+
expectRecordFields(completedRun?.outcome, { status: "ok" }, "completed run outcome");
395+
});
396+
expect(mocks.runSubagentAnnounceFlow).toHaveBeenCalledTimes(1);
397+
});
398+
399+
it("records terminal agent.wait timeouts even before session store timing is persisted", async () => {
400+
mocks.callGateway.mockImplementation(async (request: { method?: string }) => {
401+
if (request.method === "agent.wait") {
402+
return {
403+
status: "timeout",
404+
startedAt: 111,
405+
endedAt: 222,
406+
stopReason: "rpc",
407+
};
408+
}
409+
return {};
410+
});
411+
mocks.loadSessionStore.mockReturnValue({
412+
"agent:main:subagent:child": {
413+
sessionId: "sess-child",
414+
updatedAt: 1,
415+
status: "running",
416+
},
417+
});
418+
419+
mod.registerSubagentRun({
420+
runId: "run-terminal-timeout",
421+
childSessionKey: "agent:main:subagent:child",
422+
requesterSessionKey: "agent:main:main",
423+
requesterDisplayKey: "main",
424+
task: "time out terminally",
425+
cleanup: "keep",
426+
});
427+
428+
await waitForFast(() => {
429+
const run = mod
430+
.listSubagentRunsForRequester("agent:main:main")
431+
.find((entry) => entry.runId === "run-terminal-timeout");
432+
expect(run?.endedAt).toBe(222);
433+
expectRecordFields(run?.outcome, { status: "timeout" }, "terminal timeout outcome");
434+
});
435+
expect(mocks.runSubagentAnnounceFlow).toHaveBeenCalledTimes(1);
436+
});
437+
438+
it("ignores stale terminal session-store rows from older child runs", async () => {
439+
let waitAttempts = 0;
440+
let resolveSecondWait: (value: {
441+
status: "ok";
442+
startedAt: number;
443+
endedAt: number;
444+
}) => void = () => {};
445+
const secondWait = new Promise<{ status: "ok"; startedAt: number; endedAt: number }>(
446+
(resolve) => {
447+
resolveSecondWait = resolve;
448+
},
449+
);
450+
mocks.callGateway.mockImplementation(async (request: { method?: string }) => {
451+
if (request.method === "agent.wait") {
452+
waitAttempts += 1;
453+
if (waitAttempts === 1) {
454+
return { status: "timeout" };
455+
}
456+
return secondWait;
457+
}
458+
return {};
459+
});
460+
const staleEndedAt = Date.parse("2026-03-24T11:59:00Z");
461+
mocks.loadSessionStore.mockReturnValue({
462+
"agent:main:subagent:child": {
463+
sessionId: "sess-child",
464+
updatedAt: staleEndedAt,
465+
status: "done",
466+
startedAt: staleEndedAt - 100,
467+
endedAt: staleEndedAt,
468+
},
469+
});
470+
471+
mod.registerSubagentRun({
472+
runId: "run-reactivated-timeout",
473+
childSessionKey: "agent:main:subagent:child",
474+
requesterSessionKey: "agent:main:main",
475+
requesterDisplayKey: "main",
476+
task: "new run after stale terminal row",
477+
cleanup: "keep",
478+
});
479+
480+
await waitForFast(() => {
481+
expect(waitAttempts).toBeGreaterThanOrEqual(2);
482+
});
483+
const activeRun = mod
484+
.listSubagentRunsForRequester("agent:main:main")
485+
.find((entry) => entry.runId === "run-reactivated-timeout");
486+
expect(activeRun?.endedAt).toBeUndefined();
487+
expect(activeRun?.outcome).toBeUndefined();
488+
expect(mocks.runSubagentAnnounceFlow).not.toHaveBeenCalled();
489+
490+
resolveSecondWait({
491+
status: "ok",
492+
startedAt: Date.parse("2026-03-24T12:00:01Z"),
493+
endedAt: Date.parse("2026-03-24T12:00:02Z"),
494+
});
495+
await waitForFast(() => {
496+
const completedRun = mod
497+
.listSubagentRunsForRequester("agent:main:main")
498+
.find((entry) => entry.runId === "run-reactivated-timeout");
499+
expectRecordFields(completedRun?.outcome, { status: "ok" }, "reactivated run outcome");
500+
});
501+
});
502+
332503
it("keeps sessions_yield-ended subagent runs paused instead of announcing no output", async () => {
333504
mocks.callGateway.mockImplementation(async (request: { method?: string }) => {
334505
if (request.method === "agent.wait") {
@@ -397,6 +568,7 @@ describe("subagent registry seam flow", () => {
397568
},
398569
});
399570

571+
vi.setSystemTime(persistedStartedAt - 1);
400572
mod.registerSubagentRun({
401573
runId: "run-stale-terminal",
402574
childSessionKey: "agent:main:subagent:child",

0 commit comments

Comments
 (0)