Skip to content

Commit 3240d67

Browse files
committed
fix(embedded-runner): preserve takeover during fallback
1 parent 82fb236 commit 3240d67

5 files changed

Lines changed: 67 additions & 6 deletions

File tree

src/agents/failover-error.ts

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -274,6 +274,9 @@ export function isNonProviderRuntimeCoordinationError(err: unknown): boolean {
274274
if (isFailoverError(err)) {
275275
return false;
276276
}
277+
if (isEmbeddedAttemptSessionTakeover(err)) {
278+
return true;
279+
}
277280
return resolveFailoverClassificationFromError(err) === null;
278281
}
279282

src/agents/model-fallback.test.ts

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -797,6 +797,38 @@ describe("runWithModelFallback", () => {
797797
expect(run).toHaveBeenCalledTimes(1);
798798
});
799799

800+
it("aborts fallback when a provider prompt error carries cleanup session takeover", async () => {
801+
const cfg = makeCfg({
802+
agents: {
803+
defaults: {
804+
model: {
805+
primary: "openai/gpt-5.4",
806+
fallbacks: ["anthropic/claude-sonnet-4-6", "openai/gpt-4.1-mini"],
807+
},
808+
},
809+
},
810+
});
811+
const cleanupTakeover = new Error(
812+
"session file changed while embedded prompt lock was released: /tmp/session.jsonl",
813+
);
814+
cleanupTakeover.name = "EmbeddedAttemptSessionTakeoverError";
815+
const providerFacingError = new Error("provider rejected request: rate limit", {
816+
cause: cleanupTakeover,
817+
});
818+
providerFacingError.name = "EmbeddedAttemptSessionTakeoverError";
819+
const run = vi.fn().mockRejectedValue(providerFacingError);
820+
821+
await expect(
822+
runWithModelFallback({
823+
cfg,
824+
provider: "openai",
825+
model: "gpt-5.4",
826+
run,
827+
}),
828+
).rejects.toBe(providerFacingError);
829+
expect(run).toHaveBeenCalledTimes(1);
830+
});
831+
800832
it("aborts the fallback chain on session write-lock timeout instead of trying every model (#83510)", async () => {
801833
const cfg = makeCfg({
802834
agents: {

src/agents/model-fallback.ts

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -249,6 +249,9 @@ async function runFallbackCandidate<T>(params: {
249249
if (isCommandLaneTaskTimeoutError(err)) {
250250
throw err;
251251
}
252+
if (isNonProviderRuntimeCoordinationError(err)) {
253+
throw err;
254+
}
252255
// Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
253256
// so they become FailoverErrors and continue the fallback loop instead of aborting.
254257
const normalizedFailover = coerceToFailoverError(err, {

src/agents/pi-embedded-runner/run/attempt.spawn-workspace.context-engine.test.ts

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -998,31 +998,36 @@ describe("runEmbeddedAttempt context engine sessionKey forwarding", () => {
998998
expectInitialLockReleasedBeforePostTurnWrite(lockEvents);
999999
});
10001000

1001-
it("preserves provider prompt errors when cleanup detects session takeover", async () => {
1001+
it("preserves provider prompt errors while carrying cleanup session takeover", async () => {
10021002
const providerError = new Error("provider rejected request: HTTP 400");
10031003
let releasingCleanupLock = false;
1004+
let cleanupTakeover: EmbeddedAttemptSessionTakeoverError | undefined;
10041005
hoisted.flushPendingToolResultsAfterIdleMock.mockImplementation(async () => {
10051006
releasingCleanupLock = true;
10061007
});
10071008
hoisted.acquireSessionWriteLockMock.mockImplementation(async (params) => ({
10081009
release: async () => {
10091010
if (releasingCleanupLock) {
1010-
throw new EmbeddedAttemptSessionTakeoverError(params.sessionFile);
1011+
cleanupTakeover = new EmbeddedAttemptSessionTakeoverError(params.sessionFile);
1012+
throw cleanupTakeover;
10111013
}
10121014
},
10131015
}));
10141016

1015-
const result = await createContextEngineAttemptRunner({
1017+
const error = await createContextEngineAttemptRunner({
10161018
contextEngine: createContextEngineBootstrapAndAssemble(),
10171019
sessionKey,
10181020
tempPaths,
10191021
sessionPrompt: async () => {
10201022
throw providerError;
10211023
},
1022-
});
1024+
}).catch((err: unknown) => err);
10231025

1024-
expect(result.promptError).toBe(providerError);
1025-
expect(result.promptErrorSource).toBe("prompt");
1026+
expect(error).toBeInstanceOf(Error);
1027+
expect((error as Error).name).toBe("EmbeddedAttemptSessionTakeoverError");
1028+
expect((error as Error).message).toBe(providerError.message);
1029+
expect((error as Error).cause).toBe(cleanupTakeover);
1030+
expect((error as { promptError?: unknown }).promptError).toBe(providerError);
10261031
});
10271032

10281033
it("keeps cleanup session takeover fatal when no provider prompt error exists", async () => {

src/agents/pi-embedded-runner/run/attempt.ts

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1000,6 +1000,18 @@ function shouldPreservePromptErrorAfterCleanupError(params: {
10001000
);
10011001
}
10021002

1003+
class EmbeddedAttemptPromptErrorWithCleanupTakeoverError extends Error {
1004+
readonly promptError: unknown;
1005+
readonly cleanupError: EmbeddedAttemptSessionTakeoverError;
1006+
1007+
constructor(params: { promptError: unknown; cleanupError: EmbeddedAttemptSessionTakeoverError }) {
1008+
super(formatErrorMessage(params.promptError), { cause: params.cleanupError });
1009+
this.name = "EmbeddedAttemptSessionTakeoverError";
1010+
this.promptError = params.promptError;
1011+
this.cleanupError = params.cleanupError;
1012+
}
1013+
}
1014+
10031015
function hasVisiblePendingToolMediaReply(
10041016
reply: { mediaUrls?: string[]; audioAsVoice?: boolean } | null | undefined,
10051017
): boolean {
@@ -4873,6 +4885,12 @@ export async function runEmbeddedAttempt(
48734885
`runId=${params.runId} sessionId=${params.sessionId} ` +
48744886
`promptError=${formatErrorMessage(promptError)} cleanupError=${formatErrorMessage(cleanupError)}`,
48754887
);
4888+
await Promise.reject(
4889+
new EmbeddedAttemptPromptErrorWithCleanupTakeoverError({
4890+
promptError,
4891+
cleanupError: cleanupError as EmbeddedAttemptSessionTakeoverError,
4892+
}),
4893+
);
48764894
} else {
48774895
await Promise.reject(cleanupError);
48784896
}

0 commit comments

Comments
 (0)