fix: share cron preflight budget across fallbacks

Cleo Thornsburg · Cleo Thornsburg · commit a02407e1d1fd · 2026-06-08T09:19:19.000-05:00
diff --git a/docs/cli/cron.md b/docs/cli/cron.md
@@ -141,7 +141,7 @@ Recurring jobs use exponential retry backoff after consecutive errors: 30s, 1m,
 
 Skipped runs are tracked separately from execution errors. They do not affect retry backoff, but `openclaw cron edit <job-id> --failure-alert-include-skipped` can opt failure alerts into repeated skipped-run notifications.
 
-For isolated jobs that target a local configured model provider, cron runs a lightweight provider preflight before starting the agent turn. Loopback, private-network, and `.local` `api: "ollama"` providers are probed at `/api/tags`; local OpenAI-compatible providers such as vLLM, SGLang, and LM Studio are probed at `/models`. If an endpoint is unreachable after the configured attempts, cron advances to the next configured model fallback. The run is recorded as `skipped` and retried on a later schedule only when no candidate is reachable. Matching dead endpoints are cached for 5 minutes to avoid many jobs hammering the same local server. Tune `cron.modelPreflight.timeoutMs`, `cron.modelPreflight.maxAttempts`, and `cron.modelPreflight.retryDelayMs` when a sleeping local/LAN provider needs a short wake-up window before cron advances to a fallback or gives up. The worst-case preflight window is limited to 55s so it stays below cron's isolated-agent setup watchdog.
+For isolated jobs that target a local configured model provider, cron runs a lightweight provider preflight before starting the agent turn. Loopback, private-network, and `.local` `api: "ollama"` providers are probed at `/api/tags`; local OpenAI-compatible providers such as vLLM, SGLang, and LM Studio are probed at `/models`. If an endpoint is unreachable after the configured attempts, cron advances to the next configured model fallback. The run is recorded as `skipped` and retried on a later schedule only when no candidate is reachable. Matching dead endpoints are cached for 5 minutes to avoid many jobs hammering the same local server. Tune `cron.modelPreflight.timeoutMs`, `cron.modelPreflight.maxAttempts`, and `cron.modelPreflight.retryDelayMs` when a sleeping local/LAN provider needs a short wake-up window before cron advances to a fallback or gives up. The entire candidate-chain preflight shares one 55s budget: each probe or retry delay is clamped to the remaining budget, and a local candidate reached after the budget is exhausted is treated as unavailable without being probed, so setup stays below cron's isolated-agent setup watchdog.
 
 Note: cron jobs, pending runtime state, and run history live in the shared SQLite state database. Legacy `jobs.json`, `jobs-state.json`, and `runs/*.jsonl` files are imported once and renamed with a `.migrated` suffix. After import, edit schedules with `openclaw cron add|edit|remove` instead of editing JSON files.
 
diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md
@@ -1296,7 +1296,7 @@ Current builds no longer include the TCP bridge. Nodes connect over the Gateway
       timeoutMs: 2500, // default per-attempt timeout
       maxAttempts: 1, // default probe attempts before skipped
       retryDelayMs: 0, // default delay between attempts
-      // worst-case window is limited to 55s so preflight stays below cron's setup watchdog
+      // each endpoint's window and the full fallback candidate chain are limited to 55s so preflight stays below cron's setup watchdog
     },
   },
 }
@@ -1305,7 +1305,7 @@ Current builds no longer include the TCP bridge. Nodes connect over the Gateway
 - `sessionRetention`: how long to keep completed isolated cron run sessions before pruning from `sessions.json`. Also controls cleanup of archived deleted cron transcripts. Default: `24h`; set `false` to disable.
 - `runLog.maxBytes`: accepted for compatibility with older file-backed cron run logs. Default: `2_000_000` bytes.
 - `runLog.keepLines`: newest SQLite run-history rows retained per job. Default: `2000`.
-- `modelPreflight`: local model-provider preflight controls for isolated cron agent turns. Increase `maxAttempts`, `retryDelayMs`, or `timeoutMs` when a sleeping Ollama/vLLM/LM Studio host needs a short wake-up window before cron advances to a configured fallback or marks the run skipped. The worst-case window (`timeoutMs * maxAttempts + retryDelayMs * (maxAttempts - 1)`) must stay at or below 55s so preflight remains below cron's isolated-agent setup watchdog.
+- `modelPreflight`: local model-provider preflight controls for isolated cron agent turns. Increase `maxAttempts`, `retryDelayMs`, or `timeoutMs` when a sleeping Ollama/vLLM/LM Studio host needs a short wake-up window before cron advances to a configured fallback or marks the run skipped. The configured per-endpoint window (`timeoutMs * maxAttempts + retryDelayMs * (maxAttempts - 1)`) must stay at or below 55s. Cron also shares one 55s deadline across the complete candidate chain and clamps each probe or delay to the remaining budget.
 - `webhookToken`: bearer token used for cron webhook POST delivery (`delivery.mode = "webhook"`), if omitted no auth header is sent.
 - `webhook`: deprecated legacy fallback webhook URL (http/https) used by `openclaw doctor --fix` to migrate stored jobs that still have `notify: true`; runtime delivery uses per-job `delivery.mode="webhook"` plus `delivery.to`, or `delivery.completionDestination` when preserving announce delivery.
 
diff --git a/docs/providers/ollama.md b/docs/providers/ollama.md
@@ -270,8 +270,9 @@ configure
 `cron.modelPreflight.maxAttempts`, `cron.modelPreflight.retryDelayMs`, and/or
 `cron.modelPreflight.timeoutMs` to give it a short wake-up window before cron
 advances to a fallback or marks the run skipped. Keep the worst-case window at
-or below 55s; OpenClaw validates this so local-provider preflight stays below
-cron's isolated-agent setup watchdog.
+or below 55s; OpenClaw validates the configured per-endpoint window and enforces
+one shared 55s deadline across the full fallback candidate chain so
+local-provider preflight stays below cron's isolated-agent setup watchdog.
 
 Live-verify the local text path, native stream path, and embeddings against
 local Ollama with:
diff --git a/src/config/config-misc.test.ts b/src/config/config-misc.test.ts
@@ -1084,7 +1084,9 @@ describe("cron webhook schema", () => {
     if (res.success) {
       throw new Error("expected cron.modelPreflight retry window validation to fail");
     }
-    expect(res.error.issues[0]?.message).toContain("total retry window must be <= 55000ms");
+    expect(res.error.issues[0]?.message).toContain(
+      "total retry window must be <= 55000ms per endpoint",
+    );
   });
 });
 
diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts
@@ -1717,7 +1717,7 @@ export const FIELD_HELP: Record<string, string> = {
   "cron.runLog.keepLines":
     "How many trailing run-history rows to retain per cron job (default `2000`). Increase for longer forensic history or lower for smaller disks.",
   "cron.modelPreflight":
-    "Controls the lightweight local model-provider preflight used before isolated cron agent turns. Tune this when local or LAN providers such as Ollama need a few seconds to wake before /api/tags or /models responds. The total retry window is capped at 55s to stay below cron's setup watchdog.",
+    "Controls the lightweight local model-provider preflight used before isolated cron agent turns. Tune this when local or LAN providers such as Ollama need a few seconds to wake before /api/tags or /models responds. Each configured endpoint window and the complete fallback candidate chain are capped at 55s to stay below cron's setup watchdog.",
   "cron.modelPreflight.timeoutMs":
     "Per-attempt timeout in milliseconds for local model-provider preflight probes (default: 2500). Increase for slow LAN or cold-starting providers while keeping the total retry window <= 55s.",
   "cron.modelPreflight.maxAttempts":
diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts
@@ -865,7 +865,7 @@ export const OpenClawSchema = z
                 code: z.ZodIssueCode.custom,
                 message:
                   `cron.modelPreflight total retry window must be <= ${CRON_MODEL_PREFLIGHT_MAX_TOTAL_WINDOW_MS}ms ` +
-                  `so local-provider preflight stays below the cron agent setup watchdog; got ${totalWindowMs}ms.`,
+                  `per endpoint so it fits within the cron agent setup budget; got ${totalWindowMs}ms.`,
               });
             }
           })
diff --git a/src/cron/isolated-agent.model-preflight.test.ts b/src/cron/isolated-agent.model-preflight.test.ts
@@ -168,6 +168,89 @@ describe("runCronIsolatedAgentTurn model provider preflight", () => {
     expect(String(logWarnMock.mock.calls[0]?.[0] ?? "")).not.toContain("Skipping this cron run");
   });
 
+  it("shares one preflight deadline across multiple local fallback candidates", async () => {
+    mockRunCronFallbackPassthrough();
+    preflightCronModelProviderMock
+      .mockResolvedValueOnce({
+        status: "unavailable",
+        reason: "first local provider unavailable",
+        provider: "ollama",
+        model: "qwen3:32b",
+        baseUrl: "http://127.0.0.1:11434",
+        retryAfterMs: 300000,
+      })
+      .mockResolvedValueOnce({
+        status: "unavailable",
+        reason: "second local provider unavailable",
+        provider: "vllm",
+        model: "local-fallback",
+        baseUrl: "http://127.0.0.1:8000/v1",
+        retryAfterMs: 300000,
+      })
+      .mockResolvedValueOnce({ status: "available" });
+
+    const result = await runCronIsolatedAgentTurn({
+      cfg: {
+        agents: {
+          defaults: {
+            model: {
+              primary: "ollama/qwen3:32b",
+              fallbacks: ["vllm/local-fallback", "openrouter/cloud-fallback"],
+            },
+          },
+        },
+        models: {
+          providers: {
+            ollama: {
+              api: "ollama",
+              baseUrl: "http://127.0.0.1:11434",
+              models: [],
+            },
+            vllm: {
+              api: "openai-completions",
+              baseUrl: "http://127.0.0.1:8000/v1",
+              models: [],
+            },
+            openrouter: {
+              api: "openai-completions",
+              baseUrl: "https://openrouter.ai/api/v1",
+              models: [],
+            },
+          },
+        },
+      },
+      deps: {} as never,
+      job: {
+        id: "shared-preflight-budget",
+        name: "Shared Preflight Budget",
+        enabled: true,
+        createdAtMs: 0,
+        updatedAtMs: 0,
+        schedule: { kind: "cron", expr: "*/5 * * * *", tz: "UTC" },
+        sessionTarget: "isolated",
+        state: {},
+        wakeMode: "next-heartbeat",
+        payload: { kind: "agentTurn", message: "summarize" },
+        delivery: { mode: "none" },
+      },
+      message: "summarize",
+      sessionKey: "cron:shared-preflight-budget",
+      lane: "cron",
+    });
+
+    expect(result.status).toBe("ok");
+    expect(result.provider).toBe("openrouter");
+    const preflightCalls = preflightCronModelProviderMock.mock.calls.map((call) => call[0]);
+    expect(preflightCalls).toMatchObject([
+      { provider: "ollama", model: "qwen3:32b" },
+      { provider: "vllm", model: "local-fallback" },
+      { provider: "openrouter", model: "openrouter/cloud-fallback" },
+    ]);
+    const deadlines = preflightCalls.map((call) => call.deadlineMs);
+    expect(deadlines.every((deadline) => typeof deadline === "number")).toBe(true);
+    expect(new Set(deadlines).size).toBe(1);
+  });
+
   it("keeps explicit empty payload fallbacks strict when local primary preflight fails", async () => {
     preflightCronModelProviderMock.mockResolvedValueOnce({
       status: "unavailable",
diff --git a/src/cron/isolated-agent/model-preflight.runtime.test.ts b/src/cron/isolated-agent/model-preflight.runtime.test.ts
@@ -1,5 +1,5 @@
 // Runtime model preflight tests cover provider/model checks before cron execution.
-import { beforeEach, describe, expect, it, vi } from "vitest";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 
 const { fetchWithSsrFGuardMock } = vi.hoisted(() => ({
   fetchWithSsrFGuardMock: vi.fn(),
@@ -41,6 +41,10 @@ describe("preflightCronModelProvider", () => {
     resetCronModelProviderPreflightCacheForTest();
   });
 
+  afterEach(() => {
+    vi.useRealTimers();
+  });
+
   it("skips network checks for cloud provider URLs", async () => {
     const result = await preflightCronModelProvider({
       cfg: {
@@ -206,6 +210,55 @@ describe("preflightCronModelProvider", () => {
     expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(2);
   });
 
+  it("does not probe a second local candidate after the shared chain deadline expires", async () => {
+    vi.useFakeTimers();
+    vi.setSystemTime(1_000);
+    fetchWithSsrFGuardMock.mockRejectedValueOnce(new Error("first endpoint unavailable"));
+
+    const cfg = {
+      models: {
+        providers: {
+          first: {
+            api: "openai-completions" as const,
+            baseUrl: "http://127.0.0.1:18001/v1",
+            models: [],
+          },
+          second: {
+            api: "openai-completions" as const,
+            baseUrl: "http://127.0.0.1:18002/v1",
+            models: [],
+          },
+        },
+      },
+    };
+    const deadlineMs = 1_200;
+
+    const first = await preflightCronModelProvider({
+      cfg,
+      provider: "first",
+      model: "local-one",
+      deadlineMs,
+    });
+    expect(first.status).toBe("unavailable");
+    expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(1);
+    expect(requireFetchPreflightRequest().timeoutMs).toBe(200);
+
+    vi.setSystemTime(deadlineMs);
+    const second = await preflightCronModelProvider({
+      cfg,
+      provider: "second",
+      model: "local-two",
+      deadlineMs,
+    });
+
+    expect(second.status).toBe("unavailable");
+    if (second.status !== "unavailable") {
+      throw new Error(`expected second preflight unavailable, got ${second.status}`);
+    }
+    expect(second.reason).toContain("chain budget exhausted");
+    expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(1);
+  });
+
   it("retries an unavailable endpoint after the cache ttl", async () => {
     fetchWithSsrFGuardMock.mockRejectedValueOnce(new Error("ECONNREFUSED")).mockResolvedValueOnce({
       response: { status: 200 },
diff --git a/src/cron/isolated-agent/model-preflight.runtime.ts b/src/cron/isolated-agent/model-preflight.runtime.ts
@@ -181,7 +181,13 @@ function sleepMs(delayMs: number): Promise<void> {
   if (delayMs <= 0) {
     return Promise.resolve();
   }
-  return new Promise((resolve) => setTimeout(resolve, delayMs));
+  return new Promise((resolve) => {
+    setTimeout(resolve, delayMs);
+  });
+}
+
+function resolveRemainingBudgetMs(deadlineMs: number | undefined): number | undefined {
+  return deadlineMs === undefined ? undefined : Math.max(0, deadlineMs - Date.now());
 }
 
 async function probeLocalProviderEndpoint(params: {
@@ -212,6 +218,7 @@ export async function preflightCronModelProvider(params: {
   provider: string;
   model: string;
   nowMs?: number;
+  deadlineMs?: number;
 }): Promise<CronModelProviderPreflightResult> {
   const providerConfig = resolveProviderConfig(params.cfg, params.provider);
   if (!providerConfig) {
@@ -253,23 +260,48 @@ export async function preflightCronModelProvider(params: {
     });
   }
 
-  let result: EndpointPreflightResult;
   let lastError: unknown;
   let attempts = 0;
-  for (attempts = 1; attempts <= maxAttempts; attempts += 1) {
+  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
+    const remainingBudgetMs = resolveRemainingBudgetMs(params.deadlineMs);
+    if (remainingBudgetMs !== undefined && remainingBudgetMs <= 0) {
+      lastError = new Error("cron model preflight chain budget exhausted");
+      break;
+    }
+    attempts = attempt;
     try {
-      await probeLocalProviderEndpoint({ api, baseUrl, timeoutMs });
-      result = { status: "available" };
+      await probeLocalProviderEndpoint({
+        api,
+        baseUrl,
+        timeoutMs:
+          remainingBudgetMs === undefined ? timeoutMs : Math.min(timeoutMs, remainingBudgetMs),
+      });
+      const result: EndpointPreflightResult = { status: "available" };
       preflightCache.set(cacheKey, { checkedAtMs: nowMs, result });
       return { status: "available" };
     } catch (error) {
       lastError = error;
-      if (attempts < maxAttempts) {
-        await sleepMs(retryDelayMs);
+      if (attempt < maxAttempts) {
+        const remainingDelayBudgetMs = resolveRemainingBudgetMs(params.deadlineMs);
+        if (remainingDelayBudgetMs !== undefined && remainingDelayBudgetMs <= 0) {
+          lastError = new Error(
+            `cron model preflight chain budget exhausted after ${attempts} attempt${attempts === 1 ? "" : "s"}`,
+          );
+          break;
+        }
+        await sleepMs(
+          remainingDelayBudgetMs === undefined
+            ? retryDelayMs
+            : Math.min(retryDelayMs, remainingDelayBudgetMs),
+        );
       }
     }
   }
-  result = { status: "unavailable", error: lastError, attempts: maxAttempts };
+  const result: EndpointPreflightResult = {
+    status: "unavailable",
+    error: lastError ?? new Error("cron model preflight chain budget exhausted"),
+    attempts,
+  };
   preflightCache.set(cacheKey, { checkedAtMs: nowMs, result });
   return buildUnavailableResult({
     provider: params.provider,
diff --git a/src/cron/isolated-agent/run.ts b/src/cron/isolated-agent/run.ts
@@ -113,6 +113,7 @@ const cronDeliveryRuntimeLoader = createLazyImportLoader(() => import("./run-del
 const cronModelPreflightRuntimeLoader = createLazyImportLoader(
   () => import("./model-preflight.runtime.js"),
 );
+const CRON_MODEL_PREFLIGHT_CHAIN_BUDGET_MS = 55_000;
 const runtimePluginsLoader = createLazyImportLoader(
   () => import("../../plugins/runtime-plugins.runtime.js"),
 );
@@ -662,6 +663,7 @@ async function prepareCronRunContext(params: {
     model,
     useSubagentFallbacks,
   });
+  const preflightDeadlineMs = Date.now() + CRON_MODEL_PREFLIGHT_CHAIN_BUDGET_MS;
   let selectedPreflightCandidate: { provider: string; model: string } | undefined;
   let selectedPreflightCandidateIndex = -1;
   let firstUnavailablePreflight:
@@ -672,6 +674,7 @@ async function prepareCronRunContext(params: {
       cfg: cfgWithAgentDefaults,
       provider: candidate.provider,
       model: candidate.model,
+      deadlineMs: preflightDeadlineMs,
     });
     if (candidatePreflight.status === "available") {
       selectedPreflightCandidate = candidate;

Original file line number	Diff line number	Diff line change
`@@ -865,7 +865,7 @@ export const OpenClawSchema = z`
`865`	`865`	`code: z.ZodIssueCode.custom,`
`866`	`866`	`message:`
`867`	`867`	`` `cron.modelPreflight total retry window must be <= ${CRON_MODEL_PREFLIGHT_MAX_TOTAL_WINDOW_MS}ms ` + ``
`868`		- `` `so local-provider preflight stays below the cron agent setup watchdog; got ${totalWindowMs}ms.`, ``
	`868`	+ `` `per endpoint so it fits within the cron agent setup budget; got ${totalWindowMs}ms.`, ``
`869`	`869`	`});`
`870`	`870`	`}`
`871`	`871`	`})`