Skip to content

Commit a958b6e

Browse files
authored
fix(runner): surface provider errors to webchat (#70848)
Surface non-retryable assistant provider failures from the embedded runner instead of letting surface_error fall through to continue_normal. - Preserve external abort and plain timeout fall-through paths. - Preserve raw provider error diagnostics on surfaced FailoverError. - Add regression coverage for billing/auth/rate-limit/null-reason/error fall-through cases. - Update changelog. Fixes #70124. Thanks @truffle-dev.
1 parent 7c18b76 commit a958b6e

3 files changed

Lines changed: 333 additions & 22 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai
2222

2323
### Fixes
2424

25+
- Agents/WebChat: surface non-retryable provider failures such as billing, auth, and rate-limit errors from the embedded runner instead of logging `surface_error` and leaving webchat with no rendered error. Fixes #70124. (#70848) Thanks @truffle-dev.
2526
- Memory/CLI: declare the built-in `local` embedding provider in the memory-core manifest, so standalone `openclaw memory status`, `index`, and `search` can resolve local embeddings just like the gateway runtime. Fixes #70836. (#70873) Thanks @mattznojassist.
2627
- Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.
2728
- Plugins/Google Meet: hang up delegated Twilio calls on leave, clean up Chrome realtime audio bridges when launch fails, and use a flat provider-safe tool schema.
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
import { describe, expect, it, vi } from "vitest";
2+
import { FailoverError } from "../../failover-error.js";
3+
import { formatBillingErrorMessage } from "../../pi-embedded-helpers.js";
4+
import { handleAssistantFailover } from "./assistant-failover.js";
5+
6+
type Params = Parameters<typeof handleAssistantFailover>[0];
7+
type Outcome = Awaited<ReturnType<typeof handleAssistantFailover>>;
8+
9+
function makeParams(overrides: Partial<Params> = {}): Params {
10+
const provider = "Anthropic";
11+
const model = "claude-haiku-4-5-20251001";
12+
const defaults: Params = {
13+
initialDecision: { action: "surface_error", reason: "billing" },
14+
aborted: false,
15+
externalAbort: false,
16+
fallbackConfigured: false,
17+
failoverFailure: true,
18+
failoverReason: "billing",
19+
timedOut: false,
20+
idleTimedOut: false,
21+
timedOutDuringCompaction: false,
22+
allowSameModelIdleTimeoutRetry: false,
23+
assistantProfileFailureReason: null,
24+
lastProfileId: undefined,
25+
modelId: model,
26+
provider,
27+
activeErrorContext: { provider, model },
28+
lastAssistant: undefined,
29+
config: undefined,
30+
sessionKey: undefined,
31+
authFailure: false,
32+
rateLimitFailure: false,
33+
billingFailure: true,
34+
cloudCodeAssistFormatError: false,
35+
isProbeSession: false,
36+
overloadProfileRotations: 0,
37+
overloadProfileRotationLimit: 3,
38+
previousRetryFailoverReason: null,
39+
logAssistantFailoverDecision: vi.fn(),
40+
warn: vi.fn(),
41+
maybeMarkAuthProfileFailure: vi.fn(async () => {}),
42+
maybeEscalateRateLimitProfileFallback: vi.fn(),
43+
maybeBackoffBeforeOverloadFailover: vi.fn(async () => {}),
44+
advanceAuthProfile: vi.fn(async () => false),
45+
};
46+
return { ...defaults, ...overrides };
47+
}
48+
49+
function expectThrownFailoverError(outcome: Outcome): FailoverError {
50+
expect(outcome.action).toBe("throw");
51+
if (outcome.action !== "throw") {
52+
throw new Error("expected throw outcome");
53+
}
54+
expect(outcome.error).toBeInstanceOf(FailoverError);
55+
return outcome.error;
56+
}
57+
58+
describe("handleAssistantFailover", () => {
59+
describe("surface_error branch (openclaw#70124)", () => {
60+
it("throws a billing FailoverError so the webchat can render the provider failure", async () => {
61+
const logDecision = vi.fn();
62+
const outcome = await handleAssistantFailover(
63+
makeParams({
64+
initialDecision: { action: "surface_error", reason: "billing" },
65+
failoverReason: "billing",
66+
billingFailure: true,
67+
logAssistantFailoverDecision: logDecision,
68+
}),
69+
);
70+
71+
const err = expectThrownFailoverError(outcome);
72+
expect(err.reason).toBe("billing");
73+
expect(err.message).toBe(formatBillingErrorMessage("Anthropic", "claude-haiku-4-5-20251001"));
74+
expect(err.status).toBe(402);
75+
expect(err.provider).toBe("Anthropic");
76+
expect(err.model).toBe("claude-haiku-4-5-20251001");
77+
expect(logDecision).toHaveBeenCalledWith("surface_error");
78+
});
79+
80+
it("throws an auth FailoverError for auth-classified surface errors", async () => {
81+
const outcome = await handleAssistantFailover(
82+
makeParams({
83+
initialDecision: { action: "surface_error", reason: "auth" },
84+
failoverReason: "auth",
85+
billingFailure: false,
86+
authFailure: true,
87+
}),
88+
);
89+
90+
const err = expectThrownFailoverError(outcome);
91+
expect(err.reason).toBe("auth");
92+
expect(err.message).toBe("LLM request unauthorized.");
93+
expect(err.status).toBe(401);
94+
});
95+
96+
it("throws a rate_limit FailoverError for rate-limited surface errors", async () => {
97+
const outcome = await handleAssistantFailover(
98+
makeParams({
99+
initialDecision: { action: "surface_error", reason: "rate_limit" },
100+
failoverReason: "rate_limit",
101+
billingFailure: false,
102+
rateLimitFailure: true,
103+
}),
104+
);
105+
106+
const err = expectThrownFailoverError(outcome);
107+
expect(err.reason).toBe("rate_limit");
108+
expect(err.message).toBe("LLM request rate limited.");
109+
expect(err.status).toBe(429);
110+
});
111+
112+
it("preserves the raw provider error on surfaced failures", async () => {
113+
const rawError = ' 400 {"error":{"message":"credit balance is too low"}} ';
114+
const outcome = await handleAssistantFailover(
115+
makeParams({
116+
initialDecision: { action: "surface_error", reason: "billing" },
117+
failoverReason: "billing",
118+
billingFailure: true,
119+
lastAssistant: {
120+
errorMessage: rawError,
121+
model: "claude-haiku-4-5-20251001",
122+
provider: "Anthropic",
123+
} as Params["lastAssistant"],
124+
}),
125+
);
126+
127+
const err = expectThrownFailoverError(outcome);
128+
expect(err.reason).toBe("billing");
129+
expect(err.rawError).toBe(rawError.trim());
130+
});
131+
132+
it("coerces a null decision reason onto the most specific non-timeout failure signal", async () => {
133+
// failover-policy can return `surface_error` with `reason: null`
134+
// when shouldRotateAssistant fires on `failoverFailure` without a
135+
// classified upstream reason. FailoverError requires a concrete
136+
// reason, so the throw path coerces null onto the most specific
137+
// signal the run observed.
138+
const outcome = await handleAssistantFailover(
139+
makeParams({
140+
initialDecision: { action: "surface_error", reason: null },
141+
failoverReason: null,
142+
timedOut: false,
143+
billingFailure: false,
144+
authFailure: true,
145+
}),
146+
);
147+
148+
const err = expectThrownFailoverError(outcome);
149+
expect(err.reason).toBe("auth");
150+
expect(err.message).toBe("LLM request unauthorized.");
151+
expect(err.status).toBe(401);
152+
});
153+
154+
it("leaves externally-aborted runs on the continue_normal path", async () => {
155+
// External aborts (user pressed stop) must never synthesize a
156+
// provider error; the partial assistant output carries the turn.
157+
const outcome = await handleAssistantFailover(
158+
makeParams({
159+
initialDecision: { action: "surface_error", reason: null },
160+
externalAbort: true,
161+
aborted: true,
162+
failoverReason: null,
163+
billingFailure: false,
164+
}),
165+
);
166+
167+
expect(outcome.action).toBe("continue_normal");
168+
});
169+
170+
it("leaves plain timeouts on the continue_normal path for the runner's timeout-payload synthesis", async () => {
171+
// `run.ts` already emits an explicit timeout payload when
172+
// `buildEmbeddedRunPayloads` produces no assistant content (see
173+
// the `timedOut && !timedOutDuringCompaction &&
174+
// !payloadsWithToolMedia.length` block). Throwing a FailoverError
175+
// here would short-circuit that synthesis and break
176+
// timeout-compaction retry coverage in
177+
// `run.timeout-triggered-compaction.test.ts`. The throw path is
178+
// reserved for concrete provider failures that have no other
179+
// downstream surface.
180+
const outcome = await handleAssistantFailover(
181+
makeParams({
182+
initialDecision: { action: "surface_error", reason: null },
183+
failoverReason: null,
184+
timedOut: true,
185+
billingFailure: false,
186+
}),
187+
);
188+
189+
expect(outcome.action).toBe("continue_normal");
190+
});
191+
192+
it("retries the same model when an idle-timeout retry is allowed", async () => {
193+
const outcome = await handleAssistantFailover(
194+
makeParams({
195+
initialDecision: { action: "surface_error", reason: null },
196+
failoverReason: null,
197+
timedOut: true,
198+
idleTimedOut: true,
199+
allowSameModelIdleTimeoutRetry: true,
200+
billingFailure: false,
201+
}),
202+
);
203+
204+
expect(outcome.action).toBe("retry");
205+
if (outcome.action !== "retry") {
206+
return;
207+
}
208+
expect(outcome.retryKind).toBe("same_model_idle_timeout");
209+
});
210+
});
211+
212+
describe("fallback_model branch", () => {
213+
it("still throws a FailoverError after the surface_error refactor", async () => {
214+
const logDecision = vi.fn();
215+
const outcome = await handleAssistantFailover(
216+
makeParams({
217+
initialDecision: { action: "fallback_model", reason: "billing" },
218+
fallbackConfigured: true,
219+
failoverReason: "billing",
220+
billingFailure: true,
221+
logAssistantFailoverDecision: logDecision,
222+
}),
223+
);
224+
225+
const err = expectThrownFailoverError(outcome);
226+
expect(err.reason).toBe("billing");
227+
expect(err.status).toBe(402);
228+
expect(err.message).toBe(formatBillingErrorMessage("Anthropic", "claude-haiku-4-5-20251001"));
229+
expect(logDecision).toHaveBeenCalledWith("fallback_model", { status: 402 });
230+
});
231+
});
232+
});

src/agents/pi-embedded-runner/run/assistant-failover.ts

Lines changed: 100 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -183,28 +183,7 @@ export async function handleAssistantFailover(params: {
183183

184184
if (decision.action === "fallback_model") {
185185
await params.maybeBackoffBeforeOverloadFailover(params.failoverReason);
186-
const message =
187-
(params.lastAssistant
188-
? formatAssistantErrorText(params.lastAssistant, {
189-
cfg: params.config,
190-
sessionKey: params.sessionKey,
191-
provider: params.activeErrorContext.provider,
192-
model: params.activeErrorContext.model,
193-
})
194-
: undefined) ||
195-
params.lastAssistant?.errorMessage?.trim() ||
196-
(params.timedOut
197-
? "LLM request timed out."
198-
: params.rateLimitFailure
199-
? "LLM request rate limited."
200-
: params.billingFailure
201-
? formatBillingErrorMessage(
202-
params.activeErrorContext.provider,
203-
params.activeErrorContext.model,
204-
)
205-
: params.authFailure
206-
? "LLM request unauthorized."
207-
: "LLM request failed.");
186+
const message = resolveAssistantFailoverErrorMessage(params);
208187
const status =
209188
resolveFailoverStatus(decision.reason) ?? (isTimeoutErrorMessage(message) ? 408 : undefined);
210189
params.logAssistantFailoverDecision("fallback_model", { status });
@@ -227,10 +206,109 @@ export async function handleAssistantFailover(params: {
227206
return sameModelIdleTimeoutRetry();
228207
}
229208
params.logAssistantFailoverDecision("surface_error");
209+
// Two surface_error shapes already have downstream synthesis and
210+
// must keep falling through to `continue_normal`:
211+
// 1. External abort (user pressed stop) — partial assistant
212+
// output carries the turn; no provider error to synthesize.
213+
// 2. Timeout without an idle-retry — run.ts emits a dedicated
214+
// timeout payload when buildEmbeddedRunPayloads produces no
215+
// assistant content (see the `timedOut &&
216+
// !timedOutDuringCompaction && !payloadsWithToolMedia.length`
217+
// block in run.ts). Throwing here would short-circuit that
218+
// synthesis and break timeout-compaction retry coverage.
219+
// Every other surface_error is a concrete provider failure that
220+
// continue_normal would silently drop before the payload builder
221+
// sees it (openclaw#70124: billing errors reached the gateway
222+
// but never the webchat because stopReason was not "error" and
223+
// no other synthesis path caught them). Throw a FailoverError so
224+
// the client surface can render it the same way it already
225+
// renders fallback_model failures.
226+
if (!params.externalAbort && !params.timedOut) {
227+
const message = resolveAssistantFailoverErrorMessage(params);
228+
const reason = resolveSurfaceErrorReason(decision.reason, params);
229+
const status =
230+
resolveFailoverStatus(reason) ?? (isTimeoutErrorMessage(message) ? 408 : undefined);
231+
return {
232+
action: "throw",
233+
overloadProfileRotations,
234+
error: new FailoverError(message, {
235+
reason,
236+
provider: params.activeErrorContext.provider,
237+
model: params.activeErrorContext.model,
238+
profileId: params.lastProfileId,
239+
status,
240+
rawError: params.lastAssistant?.errorMessage?.trim(),
241+
}),
242+
};
243+
}
230244
}
231245

232246
return {
233247
action: "continue_normal",
234248
overloadProfileRotations,
235249
};
236250
}
251+
252+
function resolveAssistantFailoverErrorMessage(params: {
253+
lastAssistant: AssistantMessage | undefined;
254+
config: OpenClawConfig | undefined;
255+
sessionKey?: string;
256+
activeErrorContext: { provider: string; model: string };
257+
timedOut: boolean;
258+
rateLimitFailure: boolean;
259+
billingFailure: boolean;
260+
authFailure: boolean;
261+
}): string {
262+
return (
263+
(params.lastAssistant
264+
? formatAssistantErrorText(params.lastAssistant, {
265+
cfg: params.config,
266+
sessionKey: params.sessionKey,
267+
provider: params.activeErrorContext.provider,
268+
model: params.activeErrorContext.model,
269+
})
270+
: undefined) ||
271+
params.lastAssistant?.errorMessage?.trim() ||
272+
(params.timedOut
273+
? "LLM request timed out."
274+
: params.rateLimitFailure
275+
? "LLM request rate limited."
276+
: params.billingFailure
277+
? formatBillingErrorMessage(
278+
params.activeErrorContext.provider,
279+
params.activeErrorContext.model,
280+
)
281+
: params.authFailure
282+
? "LLM request unauthorized."
283+
: "LLM request failed.")
284+
);
285+
}
286+
287+
// surface_error decisions can arrive with `reason: null` when
288+
// shouldRotateAssistant fired on `failoverFailure` without a classified
289+
// upstream reason. FailoverError requires a concrete reason, so map
290+
// null onto the most specific failure the run observed, falling back
291+
// to "unknown" when no signal is set. Callers only hit this helper on
292+
// the non-timeout throw branch, so timeouts don't need a case here.
293+
function resolveSurfaceErrorReason(
294+
declared: FailoverReason | null,
295+
params: {
296+
billingFailure: boolean;
297+
authFailure: boolean;
298+
rateLimitFailure: boolean;
299+
},
300+
): FailoverReason {
301+
if (declared) {
302+
return declared;
303+
}
304+
if (params.billingFailure) {
305+
return "billing";
306+
}
307+
if (params.authFailure) {
308+
return "auth";
309+
}
310+
if (params.rateLimitFailure) {
311+
return "rate_limit";
312+
}
313+
return "unknown";
314+
}

0 commit comments

Comments
 (0)