fix(runner): surface provider errors to webchat (#70848)

truffle-dev · web-flow · commit a958b6e72328 · 2026-04-24T03:28:38.000+01:00
Surface non-retryable assistant provider failures from the embedded runner instead of letting surface_error fall through to continue_normal. This change also preserves the external abort and plain timeout fall-through paths, preserves raw provider error diagnostics on the surfaced FailoverError, adds regression coverage for the billing/auth/rate-limit/null-reason/error fall-through cases, and updates the changelog. Fixes #70124. Thanks @truffle-dev.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Agents/WebChat: surface non-retryable provider failures such as billing, auth, and rate-limit errors from the embedded runner instead of logging `surface_error` and leaving webchat with no rendered error. Fixes #70124. (#70848) Thanks @truffle-dev.
 - Memory/CLI: declare the built-in `local` embedding provider in the memory-core manifest, so standalone `openclaw memory status`, `index`, and `search` can resolve local embeddings just like the gateway runtime. Fixes #70836. (#70873) Thanks @mattznojassist.
 - Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.
 - Plugins/Google Meet: hang up delegated Twilio calls on leave, clean up Chrome realtime audio bridges when launch fails, and use a flat provider-safe tool schema.
diff --git a/src/agents/pi-embedded-runner/run/assistant-failover.test.ts b/src/agents/pi-embedded-runner/run/assistant-failover.test.ts
@@ -0,0 +1,232 @@
+import { describe, expect, it, vi } from "vitest";
+import { FailoverError } from "../../failover-error.js";
+import { formatBillingErrorMessage } from "../../pi-embedded-helpers.js";
+import { handleAssistantFailover } from "./assistant-failover.js";
+
+type Params = Parameters<typeof handleAssistantFailover>[0];
+type Outcome = Awaited<ReturnType<typeof handleAssistantFailover>>;
+
+// Builds a full Params object for a billing `surface_error` run; `overrides` win per key.
+function makeParams(overrides: Partial<Params> = {}): Params {
+  const errorContext = { provider: "Anthropic", model: "claude-haiku-4-5-20251001" };
+  const baseline: Params = {
+    initialDecision: { action: "surface_error", reason: "billing" },
+    aborted: false,
+    externalAbort: false,
+    fallbackConfigured: false,
+    failoverFailure: true,
+    failoverReason: "billing",
+    timedOut: false,
+    idleTimedOut: false,
+    timedOutDuringCompaction: false,
+    allowSameModelIdleTimeoutRetry: false,
+    assistantProfileFailureReason: null,
+    lastProfileId: undefined,
+    modelId: errorContext.model,
+    provider: errorContext.provider,
+    activeErrorContext: errorContext,
+    lastAssistant: undefined,
+    config: undefined,
+    sessionKey: undefined,
+    authFailure: false,
+    rateLimitFailure: false,
+    billingFailure: true,
+    cloudCodeAssistFormatError: false,
+    isProbeSession: false,
+    overloadProfileRotations: 0,
+    overloadProfileRotationLimit: 3,
+    previousRetryFailoverReason: null,
+    logAssistantFailoverDecision: vi.fn(),
+    warn: vi.fn(),
+    maybeMarkAuthProfileFailure: vi.fn(async () => {}),
+    maybeEscalateRateLimitProfileFallback: vi.fn(),
+    maybeBackoffBeforeOverloadFailover: vi.fn(async () => {}),
+    advanceAuthProfile: vi.fn(async () => false),
+  };
+  return { ...baseline, ...overrides };
+}
+
+function expectThrownFailoverError(outcome: Outcome): FailoverError {
+  expect(outcome.action).toBe("throw");
+  if (outcome.action === "throw") {
+    expect(outcome.error).toBeInstanceOf(FailoverError);
+    return outcome.error;
+  }
+  throw new Error("expected throw outcome"); // any other action is a test failure
+}
+
+describe("handleAssistantFailover", () => {
+  describe("surface_error branch (openclaw#70124)", () => {
+    it("throws a billing FailoverError so the webchat can render the provider failure", async () => {
+      const logDecision = vi.fn();
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "surface_error", reason: "billing" },
+          failoverReason: "billing",
+          billingFailure: true,
+          logAssistantFailoverDecision: logDecision,
+        }),
+      );
+
+      const err = expectThrownFailoverError(outcome);
+      expect(err.reason).toBe("billing");
+      expect(err.message).toBe(formatBillingErrorMessage("Anthropic", "claude-haiku-4-5-20251001"));
+      expect(err.status).toBe(402);
+      expect(err.provider).toBe("Anthropic");
+      expect(err.model).toBe("claude-haiku-4-5-20251001");
+      expect(logDecision).toHaveBeenCalledWith("surface_error");
+    });
+
+    it("throws an auth FailoverError for auth-classified surface errors", async () => {
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "surface_error", reason: "auth" },
+          failoverReason: "auth",
+          billingFailure: false,
+          authFailure: true,
+        }),
+      );
+
+      const err = expectThrownFailoverError(outcome);
+      expect(err.reason).toBe("auth");
+      expect(err.message).toBe("LLM request unauthorized.");
+      expect(err.status).toBe(401);
+    });
+
+    it("throws a rate_limit FailoverError for rate-limited surface errors", async () => {
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "surface_error", reason: "rate_limit" },
+          failoverReason: "rate_limit",
+          billingFailure: false,
+          rateLimitFailure: true,
+        }),
+      );
+
+      const err = expectThrownFailoverError(outcome);
+      expect(err.reason).toBe("rate_limit");
+      expect(err.message).toBe("LLM request rate limited.");
+      expect(err.status).toBe(429);
+    });
+
+    it("preserves the raw provider error on surfaced failures", async () => {
+      const rawError = '  400 {"error":{"message":"credit balance is too low"}}  ';
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "surface_error", reason: "billing" },
+          failoverReason: "billing",
+          billingFailure: true,
+          lastAssistant: {
+            errorMessage: rawError,
+            model: "claude-haiku-4-5-20251001",
+            provider: "Anthropic",
+          } as Params["lastAssistant"],
+        }),
+      );
+
+      const err = expectThrownFailoverError(outcome);
+      expect(err.reason).toBe("billing");
+      expect(err.rawError).toBe(rawError.trim());
+    });
+
+    it("coerces a null decision reason onto the most specific non-timeout failure signal", async () => {
+      // failover-policy can return `surface_error` with `reason: null`
+      // when shouldRotateAssistant fires on `failoverFailure` without a
+      // classified upstream reason. FailoverError requires a concrete
+      // reason, so the throw path coerces null onto the failure flag the
+      // run observed; only authFailure is set here, hence "auth" / 401.
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "surface_error", reason: null },
+          failoverReason: null,
+          timedOut: false,
+          billingFailure: false,
+          authFailure: true,
+        }),
+      );
+
+      const err = expectThrownFailoverError(outcome);
+      expect(err.reason).toBe("auth");
+      expect(err.message).toBe("LLM request unauthorized.");
+      expect(err.status).toBe(401);
+    });
+
+    it("leaves externally-aborted runs on the continue_normal path", async () => {
+      // External aborts (user pressed stop) must never synthesize a
+      // provider error; the partial assistant output carries the turn.
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "surface_error", reason: null },
+          externalAbort: true,
+          aborted: true,
+          failoverReason: null,
+          billingFailure: false,
+        }),
+      );
+
+      expect(outcome.action).toBe("continue_normal");
+    });
+
+    it("leaves plain timeouts on the continue_normal path for the runner's timeout-payload synthesis", async () => {
+      // `run.ts` already emits an explicit timeout payload when
+      // `buildEmbeddedRunPayloads` produces no assistant content (see
+      // the `timedOut && !timedOutDuringCompaction &&
+      // !payloadsWithToolMedia.length` block). Throwing a FailoverError
+      // here would short-circuit that synthesis and break
+      // timeout-compaction retry coverage in
+      // `run.timeout-triggered-compaction.test.ts`. The throw path is
+      // reserved for concrete provider failures that have no other
+      // downstream surface.
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "surface_error", reason: null },
+          failoverReason: null,
+          timedOut: true,
+          billingFailure: false,
+        }),
+      );
+
+      expect(outcome.action).toBe("continue_normal");
+    });
+
+    it("retries the same model when an idle-timeout retry is allowed", async () => {
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "surface_error", reason: null },
+          failoverReason: null,
+          timedOut: true,
+          idleTimedOut: true,
+          allowSameModelIdleTimeoutRetry: true,
+          billingFailure: false,
+        }),
+      );
+
+      expect(outcome.action).toBe("retry");
+      if (outcome.action !== "retry") {
+        return; // type narrowing only; the expect above already checked "retry"
+      }
+      expect(outcome.retryKind).toBe("same_model_idle_timeout");
+    });
+  });
+
+  describe("fallback_model branch", () => {
+    it("still throws a FailoverError after the surface_error refactor", async () => {
+      const logDecision = vi.fn();
+      const outcome = await handleAssistantFailover(
+        makeParams({
+          initialDecision: { action: "fallback_model", reason: "billing" },
+          fallbackConfigured: true,
+          failoverReason: "billing",
+          billingFailure: true,
+          logAssistantFailoverDecision: logDecision,
+        }),
+      );
+
+      const err = expectThrownFailoverError(outcome);
+      expect(err.reason).toBe("billing");
+      expect(err.status).toBe(402);
+      expect(err.message).toBe(formatBillingErrorMessage("Anthropic", "claude-haiku-4-5-20251001"));
+      expect(logDecision).toHaveBeenCalledWith("fallback_model", { status: 402 });
+    });
+  });
+});
diff --git a/src/agents/pi-embedded-runner/run/assistant-failover.ts b/src/agents/pi-embedded-runner/run/assistant-failover.ts
@@ -183,28 +183,7 @@ export async function handleAssistantFailover(params: {
 
   if (decision.action === "fallback_model") {
     await params.maybeBackoffBeforeOverloadFailover(params.failoverReason);
-    const message =
-      (params.lastAssistant
-        ? formatAssistantErrorText(params.lastAssistant, {
-            cfg: params.config,
-            sessionKey: params.sessionKey,
-            provider: params.activeErrorContext.provider,
-            model: params.activeErrorContext.model,
-          })
-        : undefined) ||
-      params.lastAssistant?.errorMessage?.trim() ||
-      (params.timedOut
-        ? "LLM request timed out."
-        : params.rateLimitFailure
-          ? "LLM request rate limited."
-          : params.billingFailure
-            ? formatBillingErrorMessage(
-                params.activeErrorContext.provider,
-                params.activeErrorContext.model,
-              )
-            : params.authFailure
-              ? "LLM request unauthorized."
-              : "LLM request failed.");
+    const message = resolveAssistantFailoverErrorMessage(params);
     const status =
       resolveFailoverStatus(decision.reason) ?? (isTimeoutErrorMessage(message) ? 408 : undefined);
     params.logAssistantFailoverDecision("fallback_model", { status });
@@ -227,10 +206,109 @@ export async function handleAssistantFailover(params: {
       return sameModelIdleTimeoutRetry();
     }
     params.logAssistantFailoverDecision("surface_error");
+    // Two surface_error shapes already have downstream synthesis and
+    // must keep falling through to `continue_normal`:
+    //   1. External abort (user pressed stop) — partial assistant
+    //      output carries the turn; no provider error to synthesize.
+    //   2. Timeout without an idle-retry — run.ts emits a dedicated
+    //      timeout payload when buildEmbeddedRunPayloads produces no
+    //      assistant content (see the `timedOut &&
+    //      !timedOutDuringCompaction && !payloadsWithToolMedia.length`
+    //      block in run.ts). Throwing here would short-circuit that
+    //      synthesis and break timeout-compaction retry coverage.
+    // Every other surface_error is a concrete provider failure that
+    // continue_normal would silently drop before the payload builder
+    // sees it (openclaw#70124: billing errors reached the gateway
+    // but never the webchat because stopReason was not "error" and
+    // no other synthesis path caught them). Throw a FailoverError so
+    // the client surface can render it the same way it already
+    // renders fallback_model failures.
+    if (!params.externalAbort && !params.timedOut) {
+      const message = resolveAssistantFailoverErrorMessage(params);
+      const reason = resolveSurfaceErrorReason(decision.reason, params);
+      const status =
+        resolveFailoverStatus(reason) ?? (isTimeoutErrorMessage(message) ? 408 : undefined);
+      return {
+        action: "throw",
+        overloadProfileRotations,
+        error: new FailoverError(message, {
+          reason,
+          provider: params.activeErrorContext.provider,
+          model: params.activeErrorContext.model,
+          profileId: params.lastProfileId,
+          status,
+          rawError: params.lastAssistant?.errorMessage?.trim(),
+        }),
+      };
+    }
   }
 
   return {
     action: "continue_normal",
     overloadProfileRotations,
   };
 }
+
+// Picks the message for a surfaced/fallback FailoverError: formatted assistant
+// error text first, then the trimmed raw provider message, then a generic
+// sentence chosen by the timeout > rate-limit > billing > auth flags.
+function resolveAssistantFailoverErrorMessage(params: {
+  lastAssistant: AssistantMessage | undefined;
+  config: OpenClawConfig | undefined;
+  sessionKey?: string;
+  activeErrorContext: { provider: string; model: string };
+  timedOut: boolean;
+  rateLimitFailure: boolean;
+  billingFailure: boolean;
+  authFailure: boolean;
+}): string {
+  const { lastAssistant } = params;
+  const { provider, model } = params.activeErrorContext;
+  const formatOptions = { cfg: params.config, sessionKey: params.sessionKey, provider, model };
+  const formatted = lastAssistant
+    ? formatAssistantErrorText(lastAssistant, formatOptions)
+    : undefined;
+  const fromAssistant = formatted || lastAssistant?.errorMessage?.trim();
+  if (fromAssistant) {
+    return fromAssistant;
+  }
+  if (params.timedOut) {
+    return "LLM request timed out.";
+  }
+  if (params.rateLimitFailure) {
+    return "LLM request rate limited.";
+  }
+  if (params.billingFailure) {
+    return formatBillingErrorMessage(provider, model);
+  }
+  return params.authFailure ? "LLM request unauthorized." : "LLM request failed.";
+}
+
+// Chooses the `reason` attached to a surfaced FailoverError.
+//
+// surface_error decisions can arrive with `reason: null` when
+// shouldRotateAssistant fired on `failoverFailure` without a classified
+// upstream reason, yet FailoverError requires a concrete one. A declared
+// reason is returned untouched. Otherwise the first raised flag, checked
+// in billing -> auth -> rate_limit order, decides the reason, and
+// "unknown" is used when no flag is raised. Timeouts have no entry
+// because this helper is only reached from the non-timeout throw
+// branch.
+function resolveSurfaceErrorReason(
+  declared: FailoverReason | null,
+  params: {
+    billingFailure: boolean;
+    authFailure: boolean;
+    rateLimitFailure: boolean;
+  },
+): FailoverReason {
+  if (declared) {
+    return declared;
+  }
+  const observedSignals: Array<[boolean, FailoverReason]> = [
+    [params.billingFailure, "billing"],
+    [params.authFailure, "auth"],
+    [params.rateLimitFailure, "rate_limit"],
+  ];
+  return observedSignals.find(([raised]) => raised)?.[1] ?? "unknown";
+}