Skip to content

Commit 3a64dc7

Browse files
authored
fix(codex): keep turn timeouts inside Codex (#86476)
Keep Codex app-server turn timeouts within the Codex runtime boundary so they interrupt the active turn without retiring the shared app-server client, poisoning auth-profile cooldowns, or falling through to generic provider/model fallback. Preserve concrete non-timeout provider failures for auth-profile rotation and fallback, and add regression coverage for prompt-stage timeouts, assistant idle timeouts, auth-profile cooldowns, and app-server timeout handling. Thanks @pashpashpash.
1 parent f22c3a5 commit 3a64dc7

7 files changed

Lines changed: 158 additions & 37 deletions

File tree

extensions/codex/src/app-server/run-attempt.test.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4160,7 +4160,7 @@ describe("runCodexAppServerAttempt", () => {
41604160
});
41614161
});
41624162

4163-
it("closes the app-server client when the active turn goes idle past the attempt timeout", async () => {
4163+
it("interrupts but keeps the app-server client alive when a turn timeout fires", async () => {
41644164
const close = vi.fn();
41654165
const request = vi.fn(async (method: string) => {
41664166
if (method === "thread/start") {
@@ -4204,7 +4204,7 @@ describe("runCodexAppServerAttempt", () => {
42044204
},
42054205
{ timeoutMs: 5_000 },
42064206
);
4207-
expect(close).toHaveBeenCalledTimes(1);
4207+
expect(close).not.toHaveBeenCalled();
42084208
expect(queueActiveRunMessageForTest("session-1", "after timeout")).toBe(false);
42094209
});
42104210

@@ -5519,6 +5519,7 @@ describe("runCodexAppServerAttempt", () => {
55195519
});
55205520

55215521
it("does not treat global rate-limit notifications as turn progress", async () => {
5522+
const warn = vi.spyOn(embeddedAgentLog, "warn").mockImplementation(() => undefined);
55225523
const harness = createStartedThreadHarness();
55235524
const params = createParams(
55245525
path.join(tempDir, "session.jsonl"),
@@ -5561,6 +5562,10 @@ describe("runCodexAppServerAttempt", () => {
55615562
),
55625563
{ interval: 1 },
55635564
);
5565+
expect(warn).not.toHaveBeenCalledWith(
5566+
"codex app-server client retired after timed-out turn",
5567+
expect.anything(),
5568+
);
55645569
});
55655570

55665571
it("yields a macrotask before processing queued app-server notifications", async () => {

extensions/codex/src/app-server/run-attempt.ts

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3229,19 +3229,11 @@ export async function runCodexAppServerAttempt(
32293229
touchTurnCompletionActivity("turn:start", { attemptProgress: true });
32303230

32313231
const abortListener = () => {
3232-
const shouldRetireClient = timedOut;
32333232
interruptCodexTurnBestEffort(client, {
32343233
threadId: thread.threadId,
32353234
turnId: activeTurnId,
3236-
timeoutMs: shouldRetireClient ? CODEX_APP_SERVER_INTERRUPT_TIMEOUT_MS : undefined,
3235+
timeoutMs: timedOut ? CODEX_APP_SERVER_INTERRUPT_TIMEOUT_MS : undefined,
32373236
});
3238-
if (shouldRetireClient) {
3239-
retireCodexAppServerClientAfterTimedOutTurn(client, {
3240-
threadId: thread.threadId,
3241-
turnId: activeTurnId,
3242-
reason: String(runAbortController.signal.reason ?? "timeout"),
3243-
});
3244-
}
32453237
resolveCompletion?.();
32463238
};
32473239
runAbortController.signal.addEventListener("abort", abortListener, { once: true });
@@ -4057,29 +4049,6 @@ async function unsubscribeCodexThreadBestEffort(
40574049
}
40584050
}
40594051

4060-
function retireCodexAppServerClientAfterTimedOutTurn(
4061-
client: CodexAppServerClient,
4062-
params: {
4063-
threadId: string;
4064-
turnId: string;
4065-
reason: string;
4066-
},
4067-
): void {
4068-
const clearedSharedClient = clearSharedCodexAppServerClientIfCurrent(client);
4069-
if (!clearedSharedClient) {
4070-
const close = (client as { close?: () => void }).close;
4071-
if (typeof close === "function") {
4072-
close.call(client);
4073-
}
4074-
}
4075-
embeddedAgentLog.warn("codex app-server client retired after timed-out turn", {
4076-
threadId: params.threadId,
4077-
turnId: params.turnId,
4078-
reason: params.reason,
4079-
clearedSharedClient,
4080-
});
4081-
}
4082-
40834052
type DynamicToolBuildParams = {
40844053
params: EmbeddedRunAttemptParams;
40854054
resolvedWorkspace: string;

src/agents/pi-embedded-runner/run.codex-app-server-recovery.test.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import {
55
loadRunOverflowCompactionHarness,
66
MockedFailoverError,
77
mockedClassifyFailoverReason,
8+
mockedMarkAuthProfileFailure,
89
mockedRunEmbeddedAttempt,
910
overflowBaseRunParams,
1011
resetRunOverflowCompactionHarnessMocks,
@@ -203,6 +204,7 @@ describe("runEmbeddedPiAgent Codex app-server recovery", () => {
203204
"codex app-server turn idle timed out waiting for turn/completed",
204205
);
205206
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
207+
expect(mockedMarkAuthProfileFailure).not.toHaveBeenCalled();
206208
});
207209

208210
it("does not retry after visible assistant output", async () => {

src/agents/pi-embedded-runner/run.overflow-compaction.harness.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ export const mockedGetApiKeyForModel = vi.fn(
221221
mode: "api-key" as const,
222222
}),
223223
);
224+
export const mockedMarkAuthProfileFailure = vi.fn(async () => {});
224225
export const mockedEnsureAuthProfileStore = vi.fn(() => ({}));
225226
export const mockedEnsureAuthProfileStoreWithoutExternalProfiles = vi.fn(
226227
(_agentDir?: string, _options?: { allowKeychainPrompt?: boolean }) => ({}),
@@ -408,6 +409,8 @@ export function resetRunOverflowCompactionHarnessMocks(): void {
408409
mode: "api-key",
409410
}),
410411
);
412+
mockedMarkAuthProfileFailure.mockReset();
413+
mockedMarkAuthProfileFailure.mockResolvedValue(undefined);
411414
mockedEnsureAuthProfileStore.mockReset();
412415
mockedEnsureAuthProfileStore.mockReturnValue({});
413416
mockedEnsureAuthProfileStoreWithoutExternalProfiles.mockReset();
@@ -463,7 +466,7 @@ export async function loadRunOverflowCompactionHarness(): Promise<{
463466

464467
vi.doMock("../auth-profiles.js", () => ({
465468
isProfileInCooldown: vi.fn(() => false),
466-
markAuthProfileFailure: vi.fn(async () => {}),
469+
markAuthProfileFailure: mockedMarkAuthProfileFailure,
467470
markAuthProfileSuccess: mockedMarkAuthProfileSuccess,
468471
resolveProfilesUnavailableReason: vi.fn(() => undefined),
469472
}));

src/agents/pi-embedded-runner/run.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,11 @@ export async function runEmbeddedPiAgent(
11531153
if (!profileId || !reason) {
11541154
return;
11551155
}
1156+
if (pluginHarnessOwnsTransport && reason === "timeout") {
1157+
// Harness-owned transport timeouts are lifecycle failures, not
1158+
// credential evidence. Do not poison OpenClaw auth cooldowns.
1159+
return;
1160+
}
11561161
await markAuthProfileFailure({
11571162
store: profileFailureStore,
11581163
profileId,
@@ -2393,6 +2398,7 @@ export async function runEmbeddedPiAgent(
23932398
fallbackConfigured,
23942399
failoverFailure: promptFailoverFailure,
23952400
failoverReason: promptFailoverReason,
2401+
harnessOwnsTransport: pluginHarnessOwnsTransport,
23962402
profileRotated: false,
23972403
});
23982404
if (
@@ -2431,6 +2437,7 @@ export async function runEmbeddedPiAgent(
24312437
fallbackConfigured,
24322438
failoverFailure: promptFailoverFailure,
24332439
failoverReason: promptFailoverReason,
2440+
harnessOwnsTransport: pluginHarnessOwnsTransport,
24342441
profileRotated: true,
24352442
});
24362443
}
@@ -2597,6 +2604,7 @@ export async function runEmbeddedPiAgent(
25972604
idleTimedOut,
25982605
timedOutDuringCompaction,
25992606
timedOutDuringToolExecution,
2607+
harnessOwnsTransport: pluginHarnessOwnsTransport,
26002608
profileRotated: false,
26012609
});
26022610
const assistantFailoverOutcome = await handleAssistantFailover({

src/agents/pi-embedded-runner/run/failover-policy.test.ts

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,71 @@ describe("resolveRunFailoverDecision", () => {
298298
});
299299
});
300300

301+
it("does not rotate harness-owned assistant timeouts", () => {
302+
expect(
303+
resolveRunFailoverDecision({
304+
stage: "assistant",
305+
aborted: true,
306+
externalAbort: false,
307+
fallbackConfigured: true,
308+
failoverFailure: false,
309+
failoverReason: null,
310+
timedOut: true,
311+
idleTimedOut: false,
312+
timedOutDuringCompaction: false,
313+
timedOutDuringToolExecution: false,
314+
harnessOwnsTransport: true,
315+
profileRotated: false,
316+
}),
317+
).toEqual({
318+
action: "continue_normal",
319+
});
320+
});
321+
322+
it("rotates concrete assistant failover failures that accompany harness-owned timeouts", () => {
323+
expect(
324+
resolveRunFailoverDecision({
325+
stage: "assistant",
326+
aborted: false,
327+
externalAbort: false,
328+
fallbackConfigured: true,
329+
failoverFailure: true,
330+
failoverReason: "rate_limit",
331+
timedOut: true,
332+
idleTimedOut: false,
333+
timedOutDuringCompaction: false,
334+
timedOutDuringToolExecution: false,
335+
harnessOwnsTransport: true,
336+
profileRotated: false,
337+
}),
338+
).toEqual({
339+
action: "rotate_profile",
340+
reason: "rate_limit",
341+
});
342+
});
343+
344+
it("falls back with the concrete assistant failover reason after harness-owned timeout rotation is exhausted", () => {
345+
expect(
346+
resolveRunFailoverDecision({
347+
stage: "assistant",
348+
aborted: false,
349+
externalAbort: false,
350+
fallbackConfigured: true,
351+
failoverFailure: true,
352+
failoverReason: "billing",
353+
timedOut: true,
354+
idleTimedOut: false,
355+
timedOutDuringCompaction: false,
356+
timedOutDuringToolExecution: false,
357+
harnessOwnsTransport: true,
358+
profileRotated: true,
359+
}),
360+
).toEqual({
361+
action: "fallback_model",
362+
reason: "billing",
363+
});
364+
});
365+
301366
it("treats idle watchdog timeouts during tool execution as model silence", () => {
302367
expect(
303368
resolveRunFailoverDecision({
@@ -403,6 +468,45 @@ describe("resolveRunFailoverDecision", () => {
403468
});
404469
});
405470

471+
it("does not fallback harness-owned LLM idle timeouts after profile rotation is exhausted", () => {
472+
expect(
473+
resolveRunFailoverDecision({
474+
stage: "assistant",
475+
aborted: false,
476+
externalAbort: false,
477+
fallbackConfigured: true,
478+
failoverFailure: false,
479+
failoverReason: null,
480+
timedOut: false,
481+
idleTimedOut: true,
482+
timedOutDuringCompaction: false,
483+
timedOutDuringToolExecution: false,
484+
harnessOwnsTransport: true,
485+
profileRotated: true,
486+
}),
487+
).toEqual({
488+
action: "continue_normal",
489+
});
490+
});
491+
492+
it("surfaces harness-owned prompt timeouts instead of falling back", () => {
493+
expect(
494+
resolveRunFailoverDecision({
495+
stage: "prompt",
496+
aborted: false,
497+
externalAbort: false,
498+
fallbackConfigured: true,
499+
failoverFailure: true,
500+
failoverReason: "timeout",
501+
harnessOwnsTransport: true,
502+
profileRotated: true,
503+
}),
504+
).toEqual({
505+
action: "surface_error",
506+
reason: "timeout",
507+
});
508+
});
509+
406510
it("surfaces error on LLM idle timeout when no fallback is configured and rotation is exhausted", () => {
407511
expect(
408512
resolveRunFailoverDecision({

src/agents/pi-embedded-runner/run/failover-policy.ts

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ type PromptDecisionParams = {
4545
fallbackConfigured: boolean;
4646
failoverFailure: boolean;
4747
failoverReason: FailoverReason | null;
48+
harnessOwnsTransport?: boolean;
4849
profileRotated: boolean;
4950
};
5051

@@ -60,6 +61,7 @@ type AssistantDecisionParams = {
6061
idleTimedOut: boolean;
6162
timedOutDuringCompaction: boolean;
6263
timedOutDuringToolExecution: boolean;
64+
harnessOwnsTransport?: boolean;
6365
profileRotated: boolean;
6466
};
6567

@@ -103,11 +105,33 @@ function isAssistantTimeoutFailure(params: AssistantDecisionParams): boolean {
103105
);
104106
}
105107

108+
function isConcreteNonTimeoutAssistantFailure(params: AssistantDecisionParams): boolean {
109+
return (
110+
params.failoverFailure && Boolean(params.failoverReason) && params.failoverReason !== "timeout"
111+
);
112+
}
113+
106114
function shouldRotateAssistant(params: AssistantDecisionParams): boolean {
107115
if (isTerminalFormatFailure(params)) {
108116
return false;
109117
}
110-
return (!params.aborted && params.failoverFailure) || isAssistantTimeoutFailure(params);
118+
const timeoutFailure = isAssistantTimeoutFailure(params);
119+
if (
120+
timeoutFailure &&
121+
params.harnessOwnsTransport &&
122+
!isConcreteNonTimeoutAssistantFailure(params)
123+
) {
124+
return false;
125+
}
126+
return (!params.aborted && params.failoverFailure) || timeoutFailure;
127+
}
128+
129+
function assistantFallbackReason(params: AssistantDecisionParams): FailoverReason {
130+
const failoverReason = params.failoverReason;
131+
if (params.failoverFailure && failoverReason && failoverReason !== "timeout") {
132+
return failoverReason;
133+
}
134+
return isAssistantTimeoutFailure(params) ? "timeout" : (failoverReason ?? "unknown");
111135
}
112136

113137
export function mergeRetryFailoverReason(params: {
@@ -146,6 +170,12 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R
146170
reason: params.failoverReason,
147171
};
148172
}
173+
if (params.harnessOwnsTransport && params.failoverReason === "timeout") {
174+
return {
175+
action: "surface_error",
176+
reason: params.failoverReason,
177+
};
178+
}
149179
if (!params.profileRotated && shouldRotatePrompt(params)) {
150180
return {
151181
action: "rotate_profile",
@@ -186,7 +216,7 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R
186216
if (assistantShouldRotate && params.fallbackConfigured) {
187217
return {
188218
action: "fallback_model",
189-
reason: isAssistantTimeoutFailure(params) ? "timeout" : (params.failoverReason ?? "unknown"),
219+
reason: assistantFallbackReason(params),
190220
};
191221
}
192222
if (!assistantShouldRotate) {

0 commit comments

Comments
 (0)