fix(agents): improve fallback failure observability

steipete · steipete · commit 0e586bb48a31 · 2026-04-27T13:10:12.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
 - Control UI: show loading, reload, and retry states when a lazy dashboard panel cannot load after an upgrade, so the Logs tab no longer appears blank on stale browser bundles. Fixes #72450. Thanks @sobergou.
 - Agents/reasoning: recover fully wrapped unclosed `<think>` replies that would otherwise sanitize to empty text while keeping strict stripping for closed reasoning blocks and unclosed tails after visible text. Fixes #37696; supersedes #51915. Thanks @druide67 and @okuyam2y.
 - Control UI/Gateway: bind WebChat handshakes to their active socket and reject post-close server registrations, so aborted connects no longer leave zombie clients or misleading duplicate WebSocket connection logs. Fixes #72753. Thanks @LumenFromTheFuture.
+- Agents/fallback: split ambiguous provider failures into `empty_response`, `no_error_details`, and `unclassified`, and add flat fallback-step fields to structured fallback logs so primary-model failures stay visible when later fallbacks also fail. Fixes #71922; refs #71744. Thanks @andyk-ms and @nikolaykazakovvs-ux.
 - Plugins/Windows: normalize Windows absolute paths before handing bundled plugin modules to Jiti, so Feishu/Lark message sending no longer fails with unsupported `c:` ESM loader URLs. Fixes #72783. Thanks @jackychen-png.
 - CLI/doctor: run bundled plugin runtime-dependency repairs through the async npm installer with spinner/line progress and heartbeat updates, so long `openclaw doctor --fix` installs no longer look hung in TTY or piped output. Fixes #72775. Thanks @dfpalhano.
 - Feishu/Windows: normalize bundled channel sidecar loads before Jiti evaluates them, so Feishu outbound sends no longer fail with raw `C:` ESM loader errors on Windows. Fixes #72783. Thanks @jackychen-png.
diff --git a/docs/concepts/model-failover.md b/docs/concepts/model-failover.md
@@ -203,7 +203,7 @@ Defaults:
 
 ## Model fallback
 
-If all profiles for a provider fail, OpenClaw moves to the next model in `agents.defaults.model.fallbacks`. This applies to auth failures, rate limits, and timeouts that exhausted profile rotation (other errors do not advance fallback).
+If all profiles for a provider fail, OpenClaw moves to the next model in `agents.defaults.model.fallbacks`. This applies to auth failures, rate limits, and timeouts that exhausted profile rotation (other errors do not advance fallback). Provider errors that do not expose enough detail are still labeled precisely in fallback state: `empty_response` means the provider returned no usable message or status, `no_error_details` means the provider explicitly returned `Unknown error (no error details in response)`, and `unclassified` means OpenClaw preserved the raw preview but no classifier matched it yet.
 
 Overloaded and rate-limit errors are handled more aggressively than billing cooldowns. By default, OpenClaw allows one same-provider auth-profile retry, then switches to the next configured model fallback without waiting. Provider-busy signals such as `ModelNotReadyException` land in that overloaded bucket. Tune this with `auth.cooldowns.overloadedProfileRotations`, `auth.cooldowns.overloadedBackoffMs`, and `auth.cooldowns.rateLimitedProfileRotations`.
 
@@ -302,6 +302,8 @@ The persisted fallback override closes that window, and the narrow rollback keep
 - optional status/code
 - human-readable error summary
 
+Structured `model_fallback_decision` logs also include flat `fallbackStep*` fields when a candidate fails or is skipped, or when a later fallback succeeds. These fields make the attempted transition explicit (`fallbackStepType`, `fallbackStepFromModel`, `fallbackStepToModel`, `fallbackStepFromFailureReason`, `fallbackStepFromFailureDetail`, `fallbackStepChainPosition`, `fallbackStepFinalOutcome`) so log and diagnostic exporters can reconstruct the primary failure even when the terminal fallback also fails. `fallbackStepFinalOutcome` is one of `next_fallback`, `succeeded`, or `chain_exhausted`.
+
 When every candidate fails, OpenClaw throws `FallbackSummaryError`. The outer reply runner can use that to build a more specific message such as "all models are temporarily rate-limited" and include the soonest cooldown expiry when one is known.
 
 That cooldown summary is model-aware:
diff --git a/src/agents/auth-profiles/types.ts b/src/agents/auth-profiles/types.ts
@@ -61,6 +61,9 @@ export type AuthProfileFailureReason =
   | "timeout"
   | "model_not_found"
   | "session_expired"
+  | "empty_response"
+  | "no_error_details"
+  | "unclassified"
   | "unknown";
 
 /** Per-profile usage statistics for round-robin and cooldown tracking */
diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts
@@ -45,6 +45,9 @@ const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [
   "overloaded",
   "timeout",
   "rate_limit",
+  "empty_response",
+  "no_error_details",
+  "unclassified",
   "unknown",
 ];
 const FAILURE_REASON_SET = new Set<AuthProfileFailureReason>(FAILURE_REASON_PRIORITY);
@@ -89,7 +92,11 @@ function shouldProbeWhamForFailure(
 ): boolean {
   return (
     normalizeProviderId(provider ?? "") === "openai-codex" &&
-    (reason === "rate_limit" || reason === "unknown")
+    (reason === "rate_limit" ||
+      reason === "empty_response" ||
+      reason === "no_error_details" ||
+      reason === "unclassified" ||
+      reason === "unknown")
   );
 }
 
diff --git a/src/agents/failover-policy.test.ts b/src/agents/failover-policy.test.ts
@@ -38,6 +38,24 @@ const CASES: ReasonCase[] = [
     useTransientProbeSlot: true,
     preserveTransientProbeSlot: false,
   },
+  {
+    reason: "empty_response",
+    allowCooldownProbe: true,
+    useTransientProbeSlot: true,
+    preserveTransientProbeSlot: false,
+  },
+  {
+    reason: "no_error_details",
+    allowCooldownProbe: true,
+    useTransientProbeSlot: true,
+    preserveTransientProbeSlot: false,
+  },
+  {
+    reason: "unclassified",
+    allowCooldownProbe: true,
+    useTransientProbeSlot: true,
+    preserveTransientProbeSlot: false,
+  },
   {
     reason: "model_not_found",
     allowCooldownProbe: false,
diff --git a/src/agents/failover-policy.ts b/src/agents/failover-policy.ts
@@ -8,6 +8,9 @@ export function shouldAllowCooldownProbeForReason(
     reason === "overloaded" ||
     reason === "billing" ||
     reason === "unknown" ||
+    reason === "empty_response" ||
+    reason === "no_error_details" ||
+    reason === "unclassified" ||
     reason === "timeout"
   );
 }
@@ -19,6 +22,9 @@ export function shouldUseTransientCooldownProbeSlot(
     reason === "rate_limit" ||
     reason === "overloaded" ||
     reason === "unknown" ||
+    reason === "empty_response" ||
+    reason === "no_error_details" ||
+    reason === "unclassified" ||
     reason === "timeout"
   );
 }
diff --git a/src/agents/model-fallback-observation.ts b/src/agents/model-fallback-observation.ts
@@ -27,6 +27,68 @@ function buildErrorObservationFields(error?: string): {
   };
 }
 
+type FallbackStepOutcome = "next_fallback" | "succeeded" | "chain_exhausted";
+
+function formatModelRef(candidate: ModelCandidate): string {
+  return `${candidate.provider}/${candidate.model}`;
+}
+
+function buildFallbackStepFields(params: {
+  decision: "skip_candidate" | "candidate_failed" | "candidate_succeeded";
+  candidate: ModelCandidate;
+  reason?: FailoverReason | null;
+  error?: string;
+  nextCandidate?: ModelCandidate;
+  attempt?: number;
+  previousAttempts?: FallbackAttempt[];
+}):
+  | {
+      fallbackStepType: "fallback_step";
+      fallbackStepFromModel: string;
+      fallbackStepToModel?: string;
+      fallbackStepFromFailureReason?: FailoverReason;
+      fallbackStepFromFailureDetail?: string;
+      fallbackStepChainPosition?: number;
+      fallbackStepFinalOutcome: FallbackStepOutcome;
+    }
+  | undefined {
+  const lastPreviousAttempt = params.previousAttempts?.at(-1);
+  if (params.decision === "candidate_succeeded") {
+    if (!lastPreviousAttempt) {
+      return undefined;
+    }
+    return {
+      fallbackStepType: "fallback_step",
+      fallbackStepFromModel: `${lastPreviousAttempt.provider}/${lastPreviousAttempt.model}`,
+      fallbackStepToModel: formatModelRef(params.candidate),
+      ...(lastPreviousAttempt.reason
+        ? { fallbackStepFromFailureReason: lastPreviousAttempt.reason }
+        : {}),
+      ...(lastPreviousAttempt.error
+        ? { fallbackStepFromFailureDetail: lastPreviousAttempt.error }
+        : {}),
+      ...(typeof params.attempt === "number" ? { fallbackStepChainPosition: params.attempt } : {}),
+      fallbackStepFinalOutcome: "succeeded",
+    };
+  }
+
+  const observed = buildErrorObservationFields(params.error);
+  return {
+    fallbackStepType: "fallback_step",
+    fallbackStepFromModel: formatModelRef(params.candidate),
+    ...(params.nextCandidate ? { fallbackStepToModel: formatModelRef(params.nextCandidate) } : {}),
+    ...(params.reason ? { fallbackStepFromFailureReason: params.reason } : {}),
+    ...((observed.providerErrorMessagePreview ?? observed.errorPreview)
+      ? {
+          fallbackStepFromFailureDetail:
+            observed.providerErrorMessagePreview ?? observed.errorPreview,
+        }
+      : {}),
+    ...(typeof params.attempt === "number" ? { fallbackStepChainPosition: params.attempt } : {}),
+    fallbackStepFinalOutcome: params.nextCandidate ? "next_fallback" : "chain_exhausted",
+  };
+}
+
 export function logModelFallbackDecision(params: {
   decision:
     | "skip_candidate"
@@ -57,6 +119,20 @@ export function logModelFallbackDecision(params: {
   const reasonText = params.reason ?? "unknown";
   const observedError = buildErrorObservationFields(params.error);
   const detailText = observedError.providerErrorMessagePreview ?? observedError.errorPreview;
+  const fallbackStepFields =
+    params.decision === "skip_candidate" ||
+    params.decision === "candidate_failed" ||
+    params.decision === "candidate_succeeded"
+      ? buildFallbackStepFields({
+          decision: params.decision,
+          candidate: params.candidate,
+          reason: params.reason,
+          error: params.error,
+          nextCandidate: params.nextCandidate,
+          attempt: params.attempt,
+          previousAttempts: params.previousAttempts,
+        })
+      : undefined;
   const providerErrorTypeSuffix = observedError.providerErrorType
     ? ` providerErrorType=${sanitizeForLog(observedError.providerErrorType)}`
     : "";
@@ -76,6 +152,7 @@ export function logModelFallbackDecision(params: {
     status: params.status,
     code: params.code,
     ...observedError,
+    ...fallbackStepFields,
     nextCandidateProvider: params.nextCandidate?.provider,
     nextCandidateModel: params.nextCandidate?.model,
     isPrimary: params.isPrimary,
diff --git a/src/agents/model-fallback.probe.test.ts b/src/agents/model-fallback.probe.test.ts
@@ -346,6 +346,12 @@ describe("runWithModelFallback – probe logic", () => {
           requestedModelMatched: true,
           nextCandidateProvider: "anthropic",
           nextCandidateModel: "claude-haiku-3-5",
+          fallbackStepType: "fallback_step",
+          fallbackStepFromModel: "openai/gpt-4.1-mini",
+          fallbackStepToModel: "anthropic/claude-haiku-3-5",
+          fallbackStepFromFailureReason: "rate_limit",
+          fallbackStepChainPosition: 1,
+          fallbackStepFinalOutcome: "next_fallback",
         }),
         expect.objectContaining({
           event: "model_fallback_decision",
@@ -354,6 +360,12 @@ describe("runWithModelFallback – probe logic", () => {
           candidateModel: "claude-haiku-3-5",
           isPrimary: false,
           requestedModelMatched: false,
+          fallbackStepType: "fallback_step",
+          fallbackStepFromModel: "openai/gpt-4.1-mini",
+          fallbackStepToModel: "anthropic/claude-haiku-3-5",
+          fallbackStepFromFailureReason: "rate_limit",
+          fallbackStepChainPosition: 2,
+          fallbackStepFinalOutcome: "succeeded",
         }),
       ]),
     );
diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
@@ -733,9 +733,9 @@ describe("classifyFailoverReason", () => {
     ).toBeNull();
   });
 
-  it("classifies OpenAI Responses unknown-no-details message as unknown", () => {
+  it("classifies OpenAI Responses unknown-no-details message distinctly", () => {
     const message = "Unknown error (no error details in response)";
-    expect(classifyFailoverReason(message)).toBe("unknown");
+    expect(classifyFailoverReason(message)).toBe("no_error_details");
     expect(isFailoverErrorMessage(message)).toBe(true);
   });
 
@@ -1376,6 +1376,16 @@ describe("classifyProviderRuntimeFailureKind", () => {
     ).toBe("replay_invalid");
   });
 
+  it("splits ambiguous provider runtime failures instead of collapsing to unknown", () => {
+    expect(classifyProviderRuntimeFailureKind({})).toBe("empty_response");
+    expect(classifyProviderRuntimeFailureKind("Unknown error (no error details in response)")).toBe(
+      "no_error_details",
+    );
+    expect(classifyProviderRuntimeFailureKind("provider sent a strange opaque failure")).toBe(
+      "unclassified",
+    );
+  });
+
   it("does not classify generic config errors that mention proxy settings as proxy failures", () => {
     expect(
       classifyProviderRuntimeFailureKind(
diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts
@@ -270,6 +270,9 @@ export type ProviderRuntimeFailureKind =
   | "schema"
   | "sandbox_blocked"
   | "replay_invalid"
+  | "empty_response"
+  | "no_error_details"
+  | "unclassified"
   | "unknown";
 
 const BILLING_402_HINTS = [
@@ -851,7 +854,7 @@ function classifyFailoverClassificationFromMessage(
     return toReasonClassification("format");
   }
   if (isExactUnknownNoDetailsError(raw)) {
-    return toReasonClassification("unknown");
+    return toReasonClassification("no_error_details");
   }
   if (isTimeoutErrorMessage(raw)) {
     return toReasonClassification("timeout");
@@ -900,7 +903,7 @@ export function classifyProviderRuntimeFailureKind(
   const status = inferSignalStatus(normalizedSignal);
 
   if (!message && typeof status !== "number") {
-    return "unknown";
+    return "empty_response";
   }
   if (normalizedSignal.code === "refresh_contention") {
     return "refresh_contention";
@@ -958,7 +961,10 @@ export function classifyProviderRuntimeFailureKind(
   if (message && isTimeoutTransportErrorMessage(message, status)) {
     return "timeout";
   }
-  return "unknown";
+  if (message && isExactUnknownNoDetailsError(message)) {
+    return "no_error_details";
+  }
+  return "unclassified";
 }
 
 export function formatAssistantErrorText(
diff --git a/src/agents/pi-embedded-helpers/types.ts b/src/agents/pi-embedded-helpers/types.ts
@@ -10,4 +10,7 @@ export type FailoverReason =
   | "timeout"
   | "model_not_found"
   | "session_expired"
+  | "empty_response"
+  | "no_error_details"
+  | "unclassified"
   | "unknown";
diff --git a/src/agents/runtime-plan/types.ts b/src/agents/runtime-plan/types.ts
@@ -25,6 +25,9 @@ export type AgentRuntimeFailoverReason =
   | "timeout"
   | "model_not_found"
   | "session_expired"
+  | "empty_response"
+  | "no_error_details"
+  | "unclassified"
   | "unknown";
 
 export type AgentRuntimeConfig = unknown;
diff --git a/src/auto-reply/fallback-state.ts b/src/auto-reply/fallback-state.ts
@@ -9,7 +9,14 @@ export {
 } from "../status/fallback-notice-state.js";
 
 const FALLBACK_REASON_PART_MAX = 80;
-const TRANSIENT_FALLBACK_REASONS = new Set(["rate_limit", "overloaded", "timeout"]);
+const TRANSIENT_FALLBACK_REASONS = new Set([
+  "rate_limit",
+  "overloaded",
+  "timeout",
+  "empty_response",
+  "no_error_details",
+  "unclassified",
+]);
 const TRANSIENT_ERROR_DETAIL_HINT_RE =
   /\b(?:429|5\d\d|too many requests|usage limit|quota|try again in|retry[- ]after|seconds?|minutes?|hours?|temporarily unavailable|overloaded|service unavailable|throttl)\b/i;
 
diff --git a/src/gateway/protocol/schema/cron.ts b/src/gateway/protocol/schema/cron.ts
@@ -65,6 +65,9 @@ const CronFailoverReasonSchema = Type.Union([
   Type.Literal("billing"),
   Type.Literal("timeout"),
   Type.Literal("model_not_found"),
+  Type.Literal("empty_response"),
+  Type.Literal("no_error_details"),
+  Type.Literal("unclassified"),
   Type.Literal("unknown"),
 ]);
 const CronCommonOptionalFields = {