Skip to content

Commit e5dc3f7

Browse files
committed
fix: retry codex app-server startup closes
1 parent 8631cad commit e5dc3f7

3 files changed

Lines changed: 97 additions & 16 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ Docs: https://docs.openclaw.ai
4040
- Memory Wiki: accept relative Markdown links that include the `.md` suffix during broken-wikilink validation, avoiding false positives for native render-mode links. Thanks @Kenneth8128.
4141
- OpenAI Codex: show the device-pairing code in the interactive SSH/headless prompt while keeping the short-lived code out of persistent runtime logs. Fixes #74212. Thanks @da22le123.
4242
- QA Lab: stop gateway children when the suite parent disappears, so interrupted local QA runs cannot leave hot orphaned gateways behind.
43+
- Codex/app-server: tolerate a second connection close during startup recovery by retrying startup for up to three attempts, and include attempt counts plus stringified errors in the restart and retries-exhausted warnings, so concurrent lanes do not fail after one shared-client race.
4344
- Plugins/CLI: cache plugin CLI registration entries per command program so completion state generation does not repeat the full plugin sweep in one invocation. Thanks @ScientificProgrammer.
4445
- Plugins: reuse gateway-bindable plugin loader cache entries for later default-mode loads without serving default-built registries to gateway-bound requests, reducing repeated plugin registration during dispatch. Refs #61756. Thanks @DmitryPogodaev.
4546
- Gateway/secrets: include the caught error message in `secrets.reload` and `secrets.resolve` warning logs while keeping RPC errors generic, so operators can diagnose reload and permission failures. Thanks @davidangularme.

extensions/codex/src/app-server/run-attempt.test.ts

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2017,6 +2017,58 @@ describe("runCodexAppServerAttempt", () => {
20172017
expect(requests).toEqual([["thread/resume"], ["thread/resume", "turn/start"]]);
20182018
});
20192019

2020+
it("tolerates a second app-server close while retrying startup", async () => {
2021+
const sessionFile = path.join(tempDir, "session.jsonl");
2022+
const workspaceDir = path.join(tempDir, "workspace");
2023+
await writeExistingBinding(sessionFile, workspaceDir, { dynamicToolsFingerprint: "[]" });
2024+
const requests: string[][] = [];
2025+
let starts = 0;
2026+
let notify: (notification: CodexServerNotification) => Promise<void> = async () => undefined;
2027+
__testing.setCodexAppServerClientFactoryForTests(async () => {
2028+
const startIndex = starts++;
2029+
const methods: string[] = [];
2030+
requests.push(methods);
2031+
return {
2032+
request: vi.fn(async (method: string) => {
2033+
methods.push(method);
2034+
if (method === "thread/resume" && startIndex < 2) {
2035+
throw new Error("codex app-server client is closed");
2036+
}
2037+
if (method === "thread/resume") {
2038+
return threadStartResult("thread-existing");
2039+
}
2040+
if (method === "turn/start") {
2041+
return turnStartResult();
2042+
}
2043+
return {};
2044+
}),
2045+
addNotificationHandler: (handler: typeof notify) => {
2046+
notify = handler;
2047+
return () => undefined;
2048+
},
2049+
addRequestHandler: () => () => undefined,
2050+
} as never;
2051+
});
2052+
2053+
const run = runCodexAppServerAttempt(createParams(sessionFile, workspaceDir));
2054+
await vi.waitFor(() => expect(requests[2]).toContain("turn/start"), { interval: 1 });
2055+
await notify({
2056+
method: "turn/completed",
2057+
params: {
2058+
threadId: "thread-existing",
2059+
turnId: "turn-1",
2060+
turn: { id: "turn-1", status: "completed" },
2061+
},
2062+
});
2063+
2064+
await expect(run).resolves.toMatchObject({ aborted: false });
2065+
expect(requests).toEqual([
2066+
["thread/resume"],
2067+
["thread/resume"],
2068+
["thread/resume", "turn/start"],
2069+
]);
2070+
});
2071+
20202072
it("passes native hook relay config on thread start and resume", async () => {
20212073
const sessionFile = path.join(tempDir, "session.jsonl");
20222074
const workspaceDir = path.join(tempDir, "workspace");

extensions/codex/src/app-server/run-attempt.ts

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ import { createCodexUserInputBridge } from "./user-input-bridge.js";
9595
import { filterToolsForVisionInputs } from "./vision-tools.js";
9696

9797
const CODEX_DYNAMIC_TOOL_TIMEOUT_MS = 30_000;
98+
const CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS = 3;
9899
const CODEX_TURN_COMPLETION_IDLE_TIMEOUT_MS = 60_000;
99100
const CODEX_TURN_TERMINAL_IDLE_TIMEOUT_MS = 30 * 60_000;
100101
const CODEX_STEER_ALL_DEBOUNCE_MS = 500;
@@ -543,24 +544,51 @@ export async function runCodexAppServerAttempt(
543544
});
544545
return { client: startupClient, thread: startupThread };
545546
};
546-
try {
547-
return await startupAttempt();
548-
} catch (error) {
549-
if (runAbortController.signal.aborted || !isCodexAppServerConnectionClosedError(error)) {
550-
throw error;
547+
for (
548+
let attempt = 1;
549+
attempt <= CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS;
550+
attempt += 1
551+
) {
552+
try {
553+
return await startupAttempt();
554+
} catch (error) {
555+
if (
556+
runAbortController.signal.aborted ||
557+
!isCodexAppServerConnectionClosedError(error)
558+
) {
559+
throw error;
560+
}
561+
const failedClient = attemptedClient;
562+
const clearedSharedClient = clearSharedCodexAppServerClientIfCurrent(failedClient);
563+
if (startupClientForCleanup === failedClient) {
564+
startupClientForCleanup = undefined;
565+
}
566+
attemptedClient = undefined;
567+
if (attempt >= CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS) {
568+
embeddedAgentLog.warn(
569+
"codex app-server connection closed during startup; retries exhausted",
570+
{
571+
attempt,
572+
maxAttempts: CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS,
573+
clearedSharedClient,
574+
error: formatErrorMessage(error),
575+
},
576+
);
577+
throw error;
578+
}
579+
embeddedAgentLog.warn(
580+
"codex app-server connection closed during startup; restarting app-server and retrying",
581+
{
582+
attempt,
583+
nextAttempt: attempt + 1,
584+
maxAttempts: CODEX_APP_SERVER_STARTUP_CONNECTION_CLOSE_MAX_ATTEMPTS,
585+
clearedSharedClient,
586+
error: formatErrorMessage(error),
587+
},
588+
);
551589
}
552-
embeddedAgentLog.warn(
553-
"codex app-server connection closed during startup; restarting app-server and retrying",
554-
{ error },
555-
);
556-
const failedClient = attemptedClient;
557-
clearSharedCodexAppServerClientIfCurrent(failedClient);
558-
if (startupClientForCleanup === failedClient) {
559-
startupClientForCleanup = undefined;
560-
}
561-
attemptedClient = undefined;
562-
return await startupAttempt();
563590
}
591+
throw new Error("codex app-server startup retry loop exited unexpectedly");
564592
},
565593
}));
566594
startupClientForCleanup = undefined;

0 commit comments

Comments
 (0)