Skip to content

Commit 44d5330

Browse files
authored
fix: recover stuck Codex compaction
- Restart the shared Codex app-server client when native server-side compaction times out.
- Retry native compaction once on the fresh app-server while preserving stale-thread cleanup only for `thread not found`.
- Add regression coverage and a changelog entry for the preflight compaction recovery path.

Verification:
- `pnpm test extensions/codex/src/app-server/compact.test.ts`
- `env -u OPENCLAW_TESTBOX -u OPENCLAW_TESTBOX_REMOTE_RUN pnpm check:changed`
- `.agents/skills/autoreview/scripts/autoreview --mode local`

CI note: `build-artifacts` is red due to inherited latest-main workflow/test drift, which was reproduced locally outside this PR diff and is tracked in the pre-merge PR comment.
1 parent 8174bfc commit 44d5330

3 files changed

Lines changed: 178 additions & 27 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Docs: https://docs.openclaw.ai
4343

4444
- Gateway/agents: preserve fresh session overrides and metadata when stale cached agent-session entries race with store updates, so subagent model/provider overrides and routing policy survive concurrent writes. (#19328) Thanks @CodeReclaimers.
4545
- Control UI/chat: keep chat session search inline with the session selector so the header no longer shows a duplicate standalone search row.
46+
- Codex app-server: restart the native app-server and retry once when server-side compaction times out, so preflight compaction stalls recover instead of failing every dispatch. (#85500)
4647
- Restore Control UI gateway token pairing [AI]. (#85459) Thanks @pgondhi987.
4748
- CLI/update: repair managed npm plugin `openclaw` peer links during post-core convergence and reject stale or wrong-target peer links before restart. (#83794) Thanks @fuller-stack-dev.
4849
- CLI/agents: default new omitted-account bindings to all accounts when the channel has multiple configured accounts, and clarify account-scope docs. (#49769) Thanks @Gcaufy.

extensions/codex/src/app-server/compact.test.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,89 @@ describe("maybeCompactCodexAppServerSession", () => {
330330
expect(result.result).toBeUndefined();
331331
});
332332

333+
it("restarts the Codex app-server and retries when native compaction times out", async () => {
334+
const previousTimeout = process.env.OPENCLAW_CODEX_COMPACTION_WAIT_TIMEOUT_MS;
335+
process.env.OPENCLAW_CODEX_COMPACTION_WAIT_TIMEOUT_MS = "100";
336+
const warn = vi.spyOn(embeddedAgentLog, "warn").mockImplementation(() => undefined);
337+
try {
338+
const first = createFakeCodexClient();
339+
const second = createFakeCodexClient();
340+
let factoryCalls = 0;
341+
const factory = vi.fn(async () => {
342+
factoryCalls += 1;
343+
if (factoryCalls === 1) {
344+
return first.client;
345+
}
346+
return second.client;
347+
});
348+
setCodexAppServerClientFactoryForTest(factory);
349+
const sessionFile = await writeTestBinding();
350+
351+
const pendingResult = startCompaction(sessionFile, { currentTokenCount: 456 });
352+
await vi.waitFor(() => {
353+
expect(first.request).toHaveBeenCalledWith("thread/compact/start", {
354+
threadId: "thread-1",
355+
});
356+
});
357+
358+
await vi.waitFor(() => {
359+
expect(first.close).toHaveBeenCalledTimes(1);
360+
expect(second.request).toHaveBeenCalledWith("thread/compact/start", {
361+
threadId: "thread-1",
362+
});
363+
});
364+
second.emit({
365+
method: "thread/tokenUsage/updated",
366+
params: {
367+
threadId: "thread-1",
368+
tokenUsage: {
369+
last_token_usage: {
370+
total_tokens: 12_345,
371+
},
372+
},
373+
},
374+
});
375+
second.emit({
376+
method: "item/completed",
377+
params: {
378+
threadId: "thread-1",
379+
turnId: "turn-2",
380+
item: { type: "contextCompaction", id: "compact-2" },
381+
},
382+
});
383+
384+
const result = requireCompactResult(await pendingResult);
385+
expect(result.ok).toBe(true);
386+
expect(result.compacted).toBe(true);
387+
expect(result.result?.tokensAfter).toBe(12_345);
388+
expect(factory).toHaveBeenCalledTimes(2);
389+
expect(second.close).not.toHaveBeenCalled();
390+
expect(await readCodexAppServerBinding(sessionFile)).toBeDefined();
391+
const details = compactDetails(result);
392+
expect(details.signal).toBe("item/completed");
393+
expect(details.itemId).toBe("compact-2");
394+
expect(details.compactionAttempts).toBe(2);
395+
expect(details.recoveredAfterAppServerRestart).toBe(true);
396+
expect(warn).toHaveBeenCalledWith(
397+
"codex app-server compaction timed out; restarting app-server",
398+
expect.objectContaining({
399+
sessionId: "session-1",
400+
sessionKey: "agent:main:session-1",
401+
threadId: "thread-1",
402+
attempt: 1,
403+
maxAttempts: 2,
404+
}),
405+
);
406+
} finally {
407+
if (previousTimeout === undefined) {
408+
delete process.env.OPENCLAW_CODEX_COMPACTION_WAIT_TIMEOUT_MS;
409+
} else {
410+
process.env.OPENCLAW_CODEX_COMPACTION_WAIT_TIMEOUT_MS = previousTimeout;
411+
}
412+
warn.mockRestore();
413+
}
414+
});
415+
333416
it("warns when stale OpenClaw compaction overrides are ignored", async () => {
334417
const warn = vi.spyOn(embeddedAgentLog, "warn").mockImplementation(() => undefined);
335418
const fake = createFakeCodexClient();
@@ -1007,19 +1090,23 @@ describe("maybeCompactCodexAppServerSession", () => {
10071090
function createFakeCodexClient(): {
10081091
client: CodexAppServerClient;
10091092
request: ReturnType<typeof vi.fn>;
1093+
close: ReturnType<typeof vi.fn>;
10101094
emit: (notification: CodexServerNotification) => void;
10111095
} {
10121096
const handlers = new Set<(notification: CodexServerNotification) => void>();
10131097
const request = vi.fn(async () => ({}));
1098+
const close = vi.fn();
10141099
return {
10151100
client: {
10161101
request,
1102+
close,
10171103
addNotificationHandler(handler: (notification: CodexServerNotification) => void) {
10181104
handlers.add(handler);
10191105
return () => handlers.delete(handler);
10201106
},
10211107
} as unknown as CodexAppServerClient,
10221108
request,
1109+
close,
10231110
emit(notification: CodexServerNotification): void {
10241111
for (const handler of handlers) {
10251112
handler(notification);

extensions/codex/src/app-server/compact.ts

Lines changed: 90 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,15 @@ type CodexNativeCompactionWaiter = {
3232

3333
const DEFAULT_CODEX_COMPACTION_WAIT_TIMEOUT_MS = 5 * 60 * 1000;
3434
const CODEX_COMPACTION_TOKEN_USAGE_GRACE_MS = 250;
35+
const MAX_CODEX_NATIVE_COMPACTION_ATTEMPTS = 2;
3536
const warnedIgnoredCompactionOverrides = new Set<string>();
3637

38+
/**
 * Error signalling that waiting for a Codex app-server compaction timed out.
 *
 * Using a dedicated subclass lets callers recognise the timeout with
 * `instanceof` instead of matching on the message text.
 */
class CodexNativeCompactionTimeoutError extends Error {
  /**
   * @param threadId - Identifier of the thread whose compaction wait timed out.
   *   Kept on the instance so callers can check the timeout belongs to the
   *   thread they are handling.
   */
  constructor(readonly threadId: string) {
    super(`timed out waiting for codex app-server compaction for ${threadId}`);
    // Without an explicit name the instance reports itself as a plain "Error"
    // in logs, `String(err)` output and stack trace headers.
    this.name = "CodexNativeCompactionTimeoutError";
  }
}
43+
3744
export async function maybeCompactCodexAppServerSession(
3845
params: CompactEmbeddedPiSessionParams,
3946
options: { pluginConfig?: unknown; clientFactory?: CodexAppServerClientFactory } = {},
@@ -354,38 +361,70 @@ async function compactCodexNativeThread(
354361
}
355362

356363
const clientFactory = options.clientFactory ?? defaultCodexAppServerClientFactory;
357-
const client = await clientFactory(
358-
appServer.start,
359-
requestedAuthProfileId ?? binding.authProfileId,
360-
params.agentDir,
361-
params.config,
362-
);
363-
const waiter = createCodexNativeCompactionWaiter(client, binding.threadId);
364-
let completion: CodexNativeCompactionCompletion;
365-
try {
366-
await client.request("thread/compact/start", {
367-
threadId: binding.threadId,
368-
});
369-
embeddedAgentLog.info("started codex app-server compaction", {
370-
sessionId: params.sessionId,
371-
threadId: binding.threadId,
372-
});
373-
waiter.startTimeout();
374-
completion = await waiter.promise;
375-
} catch (error) {
376-
waiter.cancel();
377-
if (isCodexThreadNotFoundError(error)) {
378-
await clearCodexAppServerBinding(params.sessionFile, { config: params.config });
379-
return failedCodexThreadBindingCompactionResult(params, {
364+
let completion: CodexNativeCompactionCompletion | undefined;
365+
let attempt = 0;
366+
for (attempt = 1; attempt <= MAX_CODEX_NATIVE_COMPACTION_ATTEMPTS; attempt += 1) {
367+
const client = await clientFactory(
368+
appServer.start,
369+
requestedAuthProfileId ?? binding.authProfileId,
370+
params.agentDir,
371+
params.config,
372+
);
373+
const waiter = createCodexNativeCompactionWaiter(client, binding.threadId);
374+
try {
375+
await client.request("thread/compact/start", {
380376
threadId: binding.threadId,
381-
reason: formatCompactionError(error),
382-
recovery: "stale_thread_binding",
383377
});
378+
embeddedAgentLog.info("started codex app-server compaction", {
379+
sessionId: params.sessionId,
380+
threadId: binding.threadId,
381+
attempt,
382+
});
383+
waiter.startTimeout();
384+
completion = await waiter.promise;
385+
break;
386+
} catch (error) {
387+
waiter.cancel();
388+
if (isCodexThreadNotFoundError(error)) {
389+
await clearCodexAppServerBinding(params.sessionFile, { config: params.config });
390+
return failedCodexThreadBindingCompactionResult(params, {
391+
threadId: binding.threadId,
392+
reason: formatCompactionError(error),
393+
recovery: "stale_thread_binding",
394+
});
395+
}
396+
if (
397+
isCodexNativeCompactionTimeoutError(error, binding.threadId) &&
398+
attempt < MAX_CODEX_NATIVE_COMPACTION_ATTEMPTS
399+
) {
400+
restartCodexAppServerAfterNativeCompactionTimeout(
401+
client,
402+
params,
403+
binding.threadId,
404+
attempt,
405+
);
406+
continue;
407+
}
408+
if (isCodexNativeCompactionTimeoutError(error, binding.threadId)) {
409+
restartCodexAppServerAfterNativeCompactionTimeout(
410+
client,
411+
params,
412+
binding.threadId,
413+
attempt,
414+
);
415+
}
416+
return {
417+
ok: false,
418+
compacted: false,
419+
reason: formatCompactionError(error),
420+
};
384421
}
422+
}
423+
if (!completion) {
385424
return {
386425
ok: false,
387426
compacted: false,
388-
reason: formatCompactionError(error),
427+
reason: `codex app-server compaction did not complete for ${binding.threadId}`,
389428
};
390429
}
391430
embeddedAgentLog.info("completed codex app-server compaction", {
@@ -410,6 +449,10 @@ async function compactCodexNativeThread(
410449
if (completion.tokensAfter !== undefined) {
411450
resultDetails.tokenUsageSource = "thread/tokenUsage/updated";
412451
}
452+
if (attempt > 1) {
453+
resultDetails.compactionAttempts = attempt;
454+
resultDetails.recoveredAfterAppServerRestart = true;
455+
}
413456
return {
414457
ok: true,
415458
compacted: true,
@@ -453,6 +496,26 @@ function isCodexThreadNotFoundError(error: unknown): boolean {
453496
return formatCompactionError(error).toLowerCase().includes("thread not found");
454497
}
455498

499+
/**
 * Reports whether `error` is a native compaction timeout raised for `threadId`.
 *
 * @param error - Value caught from a compaction attempt; may be anything.
 * @param threadId - Thread the caller is currently compacting.
 * @returns `true` only when `error` is a `CodexNativeCompactionTimeoutError`
 *   whose `threadId` equals the given one; `false` for every other value.
 */
function isCodexNativeCompactionTimeoutError(error: unknown, threadId: string): boolean {
  if (!(error instanceof CodexNativeCompactionTimeoutError)) {
    return false;
  }
  return error.threadId === threadId;
}
502+
503+
/**
 * Logs a warning that native compaction timed out, then closes the client.
 *
 * NOTE(review): this function only closes `client`; presumably the next client
 * factory call is what brings up a fresh app-server — confirm against the
 * factory implementation.
 *
 * @param client - App-server client used for the attempt that timed out.
 * @param params - Compaction parameters; only `sessionId` and `sessionKey` are
 *   read, for the log entry.
 * @param threadId - Thread whose compaction timed out.
 * @param attempt - Number of the attempt that timed out, as passed by the caller.
 */
function restartCodexAppServerAfterNativeCompactionTimeout(
  client: CodexAppServerClient,
  params: CompactEmbeddedPiSessionParams,
  threadId: string,
  attempt: number,
): void {
  const { sessionId, sessionKey } = params;
  embeddedAgentLog.warn("codex app-server compaction timed out; restarting app-server", {
    sessionId,
    sessionKey,
    threadId,
    attempt,
    maxAttempts: MAX_CODEX_NATIVE_COMPACTION_ATTEMPTS,
  });
  client.close();
}
518+
456519
function createCodexNativeCompactionWaiter(
457520
client: CodexAppServerClient,
458521
threadId: string,
@@ -544,7 +607,7 @@ function createCodexNativeCompactionWaiter(
544607
return;
545608
}
546609
timeout = setTimeout(() => {
547-
failWaiter(new Error(`timed out waiting for codex app-server compaction for ${threadId}`));
610+
failWaiter(new CodexNativeCompactionTimeoutError(threadId));
548611
}, resolveCompactionWaitTimeoutMs());
549612
timeout.unref?.();
550613
},

0 commit comments

Comments
 (0)