fix: recover stale cron task records

steipete · steipete · commit 1fae716a04e5 · 2026-04-26T07:23:39.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -80,6 +80,7 @@ Docs: https://docs.openclaw.ai
 - Installer: load nvm before Node.js detection so `curl | bash` installs respect nvm-managed Node instead of stale system Node. Fixes #49556. Thanks @heavenlxj.
 - CLI/Volta: respawn raw `openclaw` CLI runs through the named `node` shim when the current Node executable resolves to `volta-shim`, avoiding direct shim execution failures in non-interactive shells. Fixes #68672. Thanks @sanchezm86.
 - Installer: warn when multiple npm global roots contain OpenClaw installs, showing active Node/npm/openclaw plus each install path and version so stale version-manager installs are visible. Fixes #40839. Thanks @zhixianio.
+- Cron/tasks: recover completed cron task ledger records from durable run logs and job state before marking them `lost`, reducing false `backing session missing` audit errors for isolated cron runs and keeping offline CLI audit from treating its empty local cron active-job set as authoritative. Fixes #71963.
 - Docker: copy patched dependency files into runtime images so downstream `pnpm install` layers keep working. Fixes #69224. Thanks @gucasbrg.
 - Agents/runtime: submit heartbeat, cron, and exec wakeups as transient runtime context instead of visible user prompts, keeping synthetic system work out of chat transcripts. Fixes #66496 and #66814. Thanks @jeades and @mandomaker.
 - Telegram: include native quote excerpts automatically for threaded replies and reply tags when the original Telegram text is available, without adding another config knob. Fixes #6975. Thanks @rex05ai.
diff --git a/docs/automation/cron-jobs.md b/docs/automation/cron-jobs.md
@@ -51,7 +51,7 @@ Cron is the Gateway's built-in scheduler. It persists jobs, wakes the agent at t
 <a id="maintenance"></a>
 
 <Note>
-Task reconciliation for cron is runtime-owned: an active cron task stays live while the cron runtime still tracks that job as running, even if an old child session row still exists. Once the runtime stops owning the job and the 5-minute grace window expires, maintenance can mark the task `lost`.
+Task reconciliation for cron is runtime-owned first, durable-history-backed second: an active cron task stays live while the cron runtime still tracks that job as running, even if an old child session row still exists. Once the runtime stops owning the job and the 5-minute grace window expires, maintenance checks persisted run logs and job state for the matching `cron:<jobId>:<startedAt>` run. If that durable history shows a terminal result, the task ledger is finalized from it; otherwise Gateway-owned maintenance can mark the task `lost`. Offline CLI audit can recover from durable history, but it does not treat its own empty in-process active-job set as proof that a Gateway-owned cron run is gone.
 </Note>
 
 ## Schedule types
diff --git a/docs/automation/tasks.md b/docs/automation/tasks.md
@@ -25,8 +25,12 @@ Not every agent run creates a task. Heartbeat turns and normal interactive chat
 - Tasks are **records**, not schedulers — cron and heartbeat decide _when_ work runs, tasks track _what happened_.
 - ACP, subagents, all cron jobs, and CLI operations create tasks. Heartbeat turns do not.
 - Each task moves through `queued → running → terminal` (succeeded, failed, timed_out, cancelled, or lost).
-- Cron tasks stay live while the cron runtime still owns the job; chat-backed CLI tasks stay live only while their owning run context is still active.
-- Completion is push-driven: detached work can notify directly or wake the requester session/heartbeat when it finishes, so status polling loops are usually the wrong shape.
+- Cron tasks stay live while the cron runtime still owns the job; if the
+  in-memory runtime state is gone, task maintenance first checks durable cron
+  run history before marking a task lost.
+- Completion is push-driven: detached work can notify directly or wake the
+  requester session/heartbeat when it finishes, so status polling loops are
+  usually the wrong shape.
 - Isolated cron runs and subagent completions best-effort clean up tracked browser tabs/processes for their child session before final cleanup bookkeeping.
 - Isolated cron delivery suppresses stale interim parent replies while descendant subagent work is still draining, and it prefers final descendant output when that arrives before delivery.
 - Completion notifications are delivered directly to a channel or queued for the next heartbeat.
@@ -143,8 +147,14 @@ Agent run completion is authoritative for active task records. A successful deta
 
 - ACP tasks: backing ACP child session metadata disappeared.
 - Subagent tasks: backing child session disappeared from the target agent store.
-- Cron tasks: the cron runtime no longer tracks the job as active.
-- CLI tasks: isolated child-session tasks use the child session; chat-backed CLI tasks use the live run context instead, so lingering channel/group/direct session rows do not keep them alive. Gateway-backed `openclaw agent` runs also finalize from their run result, so completed runs do not sit active until the sweeper marks them `lost`.
+- Cron tasks: the cron runtime no longer tracks the job as active and durable
+  cron run history does not show a terminal result for that run. Offline CLI
+  audit does not treat its own empty in-process cron runtime state as authoritative.
+- CLI tasks: isolated child-session tasks use the child session; chat-backed
+  CLI tasks use the live run context instead, so lingering
+  channel/group/direct session rows do not keep them alive. Gateway-backed
+  `openclaw agent` runs also finalize from their run result, so completed runs
+  do not sit active until the sweeper marks them `lost`.
 
 ## Delivery and notifications
 
@@ -236,7 +246,7 @@ openclaw tasks notify <lookup> state_changes
     Reconciliation is runtime-aware:
 
     - ACP/subagent tasks check their backing child session.
-    - Cron tasks check whether the cron runtime still owns the job.
+    - Cron tasks check whether the cron runtime still owns the job, then recover terminal status from persisted cron run logs/job state before falling back to `lost`. Only the Gateway process is authoritative for the in-memory cron active-job set; offline CLI audit uses durable history but does not mark a cron task lost solely because that local Set is empty.
     - Chat-backed CLI tasks check the owning live run context, not just the chat session row.
 
     Completion cleanup is also runtime-aware:
diff --git a/docs/cli/tasks.md b/docs/cli/tasks.md
@@ -84,6 +84,10 @@ openclaw tasks maintenance [--apply] [--json]
 ```
 
 Previews or applies task and Task Flow reconciliation, cleanup stamping, and pruning.
+For cron tasks, reconciliation uses persisted run logs/job state before marking an
+old active task `lost`, so completed cron runs do not become false audit errors
+just because the in-memory Gateway runtime state is gone. Offline CLI audit is
+not authoritative for the Gateway's process-local cron active-job set.
 
 ### `flow`
 
diff --git a/src/commands/status.summary.test.ts b/src/commands/status.summary.test.ts
@@ -58,6 +58,7 @@ vi.mock("../infra/system-events.js", () => ({
 }));
 
 vi.mock("../tasks/task-registry.maintenance.js", () => ({
+  configureTaskRegistryMaintenance: vi.fn(),
   getInspectableTaskRegistrySummary: vi.fn(() => ({
     total: 0,
     active: 0,
diff --git a/src/commands/status.summary.ts b/src/commands/status.summary.ts
@@ -4,6 +4,7 @@ import { resolveStorePath } from "../config/sessions/paths.js";
 import { readSessionStoreReadOnly } from "../config/sessions/store-read.js";
 import { resolveSessionTotalTokens, type SessionEntry } from "../config/sessions/types.js";
 import type { OpenClawConfig } from "../config/types.js";
+import { resolveCronStorePath } from "../cron/store.js";
 import { listGatewayAgentsBasic } from "../gateway/agent-list.js";
 import { resolveHeartbeatSummaryForAgent } from "../infra/heartbeat-summary.js";
 import { peekSystemEvents } from "../infra/system-events.js";
@@ -151,6 +152,9 @@ export async function getStatusSummary(
   const mainSessionKey = resolveMainSessionKey(cfg);
   const queuedSystemEvents = peekSystemEvents(mainSessionKey);
   const taskMaintenanceModule = await loadTaskRegistryMaintenanceModule();
+  taskMaintenanceModule.configureTaskRegistryMaintenance({
+    cronStorePath: resolveCronStorePath(cfg.cron?.store),
+  });
   const tasks = taskMaintenanceModule.getInspectableTaskRegistrySummary();
   const taskAudit = taskMaintenanceModule.getInspectableTaskAuditSummary();
 
diff --git a/src/commands/tasks.ts b/src/commands/tasks.ts
@@ -1,3 +1,5 @@
+import { loadConfig } from "../config/config.js";
+import { resolveCronStorePath } from "../cron/store.js";
 import type { RuntimeEnv } from "../runtime.js";
 import { normalizeOptionalString } from "../shared/string-coerce.js";
 import { getTaskById, updateTaskNotifyPolicyById } from "../tasks/runtime-internal.js";
@@ -24,6 +26,7 @@ import { compareTaskAuditFindingSortKeys } from "../tasks/task-registry.audit.sh
 import {
   getInspectableTaskAuditSummary,
   getInspectableTaskRegistrySummary,
+  configureTaskRegistryMaintenance,
   previewTaskRegistryMaintenance,
   runTaskRegistryMaintenance,
 } from "../tasks/task-registry.maintenance.js";
@@ -44,10 +47,16 @@ const RUN_PAD = 10;
 const info = theme.info;
 
 async function loadTaskCancelConfig() {
-  const { loadConfig } = await import("../config/config.js");
   return loadConfig();
 }
 
+function configureTaskMaintenanceFromConfig(): void { // Points task maintenance at the cron store path resolved from the loaded config.
+  const cfg = loadConfig();
+  configureTaskRegistryMaintenance({
+    cronStorePath: resolveCronStorePath(cfg.cron?.store), // cronRuntimeAuthoritative is not passed from this CLI helper
+  });
+}
+
 function truncate(value: string, maxChars: number) {
   if (value.length <= maxChars) {
     return value;
@@ -417,6 +426,7 @@ export async function tasksAuditCommand(
   },
   runtime: RuntimeEnv,
 ) {
+  configureTaskMaintenanceFromConfig();
   const severityFilter = opts.severity?.trim() as TaskSystemAuditSeverity | undefined;
   const codeFilter = opts.code?.trim() as TaskSystemAuditCode | undefined;
   const { allFindings, filteredFindings, taskFindings, summary } = toSystemAuditFindings({
@@ -491,6 +501,7 @@ export async function tasksMaintenanceCommand(
   opts: { json?: boolean; apply?: boolean },
   runtime: RuntimeEnv,
 ) {
+  configureTaskMaintenanceFromConfig();
   const auditBefore = getInspectableTaskAuditSummary();
   const flowAuditBefore = getInspectableTaskFlowAuditSummary();
   const taskMaintenance = opts.apply
diff --git a/src/cron/run-log.test.ts b/src/cron/run-log.test.ts
@@ -9,6 +9,7 @@ import {
   getPendingCronRunLogWriteCountForTests,
   readCronRunLogEntries,
   readCronRunLogEntriesPage,
+  readCronRunLogEntriesSync,
   resolveCronRunLogPruneOptions,
   resolveCronRunLogPath,
 } from "./run-log.js";
@@ -96,6 +97,36 @@ describe("cron run log", () => {
     });
   });
 
+  it("reads run-log entries synchronously for task reconciliation", async () => {
+    await withRunLogDir("openclaw-cron-log-sync-", async (dir) => {
+      const logPath = path.join(dir, "runs", "job-1.jsonl");
+      await appendCronRunLog(logPath, { // first entry: the one the jobId filter below should keep
+        ts: 1000,
+        jobId: "job-1",
+        action: "finished",
+        status: "ok",
+        runAtMs: 900,
+        durationMs: 100,
+      });
+      await appendCronRunLog(logPath, { // second entry in the same file carries a different jobId
+        ts: 2000,
+        jobId: "job-2",
+        action: "finished",
+        status: "error",
+      });
+
+      expect(readCronRunLogEntriesSync(logPath, { jobId: "job-1" })).toEqual([ // exactly one match expected
+        expect.objectContaining({
+          jobId: "job-1",
+          status: "ok",
+          runAtMs: 900,
+          durationMs: 100,
+        }),
+      ]);
+      expect(readCronRunLogEntriesSync(path.join(dir, "runs", "missing.jsonl"))).toEqual([]); // missing file reads as empty
+    });
+  });
+
   it.skipIf(process.platform === "win32")(
     "writes run log files with secure permissions",
     async () => {
diff --git a/src/cron/run-log.ts b/src/cron/run-log.ts
@@ -1,4 +1,5 @@
 import { randomBytes } from "node:crypto";
+import fsSync from "node:fs";
 import fs from "node:fs/promises";
 import path from "node:path";
 import { parseByteSize } from "../cli/parse-bytes.js";
@@ -198,6 +199,23 @@ export async function readCronRunLogEntries(
   return page.entries.toReversed();
 }
 
+export function readCronRunLogEntriesSync( // Synchronous run-log read; returns at most `limit` trailing entries.
+  filePath: string,
+  opts?: { limit?: number; jobId?: string },
+): CronRunLogEntry[] {
+  const limit = Math.max(1, Math.min(5000, Math.floor(opts?.limit ?? 200))); // default 200, clamped to 1..5000. NOTE(review): NaN is not clamped
+  let raw: string;
+  try {
+    raw = fsSync.readFileSync(path.resolve(filePath), "utf-8");
+  } catch (error) {
+    if (typeof error === "object" && error !== null && "code" in error && error.code === "ENOENT") { // missing log file
+      return [];
+    }
+    throw error; // any other read failure propagates unchanged
+  }
+  return parseAllRunLogEntries(raw, { jobId: opts?.jobId }).slice(-limit); // jobId is forwarded to the parser; keep the last `limit` results
+}
+
 function normalizeRunStatusFilter(status?: string): CronRunLogStatusFilter {
   if (status === "ok" || status === "error" || status === "skipped" || status === "all") {
     return status;
diff --git a/src/cron/store.test.ts b/src/cron/store.test.ts
@@ -3,7 +3,7 @@ import os from "node:os";
 import path from "node:path";
 import { setTimeout as scheduleNativeTimeout } from "node:timers";
 import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
-import { loadCronStore, resolveCronStorePath, saveCronStore } from "./store.js";
+import { loadCronStore, loadCronStoreSync, resolveCronStorePath, saveCronStore } from "./store.js";
 import type { CronStoreFile } from "./types.js";
 
 let fixtureRoot = "";
@@ -125,6 +125,19 @@ describe("cron store", () => {
     });
   });
 
+  it("loads split cron state synchronously for task reconciliation", async () => {
+    const { storePath } = await makeStorePath();
+    await saveCronStore(storePath, makeStore("job-sync", true)); // written through the async save path
+
+    const loaded = loadCronStoreSync(storePath); // read back through the synchronous loader under test
+
+    expect(loaded.jobs[0]).toMatchObject({ // the loaded job must carry a state object and a numeric updatedAtMs
+      id: "job-sync",
+      state: expect.any(Object),
+      updatedAtMs: expect.any(Number),
+    });
+  });
+
   it("does not create a backup file when saving unchanged content", async () => {
     const store = await makeStorePath();
     const payload = makeStore("job-1", true);
diff --git a/src/cron/store.ts b/src/cron/store.ts
@@ -114,6 +114,39 @@ async function loadStateFile(statePath: string): Promise<CronStateFile | null> {
   }
 }
 
+function loadStateFileSync(statePath: string): CronStateFile | null { // Reads the cron state file synchronously; null means no usable state.
+  let raw: string;
+  try {
+    raw = fs.readFileSync(statePath, "utf-8");
+  } catch (err) {
+    if ((err as { code?: unknown })?.code === "ENOENT") { // a missing state file is not an error
+      return null;
+    }
+    throw new Error(`Failed to read cron state at ${statePath}: ${String(err)}`, { // other read failures are wrapped and thrown
+      cause: err,
+    });
+  }
+
+  try {
+    const parsed = parseJsonWithJson5Fallback(raw);
+    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { // top level must be a non-array object
+      return null;
+    }
+    const record = parsed as Record<string, unknown>;
+    if (
+      record.version !== 1 || // only version 1 with a non-null, non-array object `jobs` is accepted
+      typeof record.jobs !== "object" ||
+      record.jobs === null ||
+      Array.isArray(record.jobs)
+    ) {
+      return null;
+    }
+    return { version: 1, jobs: record.jobs as Record<string, CronStateFileEntry> }; // entries are cast here, not validated one by one
+  } catch { // a parse failure is treated the same as an unusable state file
+    return null;
+  }
+}
+
 function hasInlineState(jobs: Array<Record<string, unknown> | null | undefined>): boolean {
   return jobs.some(
     (job) =>
@@ -219,6 +252,60 @@ export async function loadCronStore(storePath: string): Promise<CronStoreFile> {
   }
 }
 
+export function loadCronStoreSync(storePath: string): CronStoreFile { // Synchronously loads the cron store and merges per-job state from the state file.
+  try {
+    const raw = fs.readFileSync(storePath, "utf-8");
+    let parsed: unknown;
+    try {
+      parsed = parseJsonWithJson5Fallback(raw);
+    } catch (err) {
+      throw new Error(`Failed to parse cron store at ${storePath}: ${String(err)}`, { // no `code` set, so the outer ENOENT check rethrows it
+        cause: err,
+      });
+    }
+    const parsedRecord =
+      parsed && typeof parsed === "object" && !Array.isArray(parsed)
+        ? (parsed as Record<string, unknown>)
+        : {}; // anything other than a non-array object is treated as an empty record
+    const jobs = Array.isArray(parsedRecord.jobs) ? (parsedRecord.jobs as never[]) : []; // non-array `jobs` becomes an empty list
+    const store = {
+      version: 1 as const,
+      jobs: jobs.filter(Boolean) as never as CronStoreFile["jobs"], // falsy entries are dropped
+    };
+
+    const stateFile = loadStateFileSync(resolveStatePath(storePath));
+    const hasLegacyInlineState =
+      !stateFile && hasInlineState(jobs as unknown as Array<Record<string, unknown>>); // inline state is only checked when no state file loaded
+
+    if (stateFile) {
+      for (const job of store.jobs) {
+        const entry = stateFile.jobs[job.id]; // state entries are keyed by job id
+        if (entry) {
+          job.updatedAtMs = resolveUpdatedAtMs(job, entry.updatedAtMs);
+          job.state = (entry.state ?? {}) as never; // state-file entry replaces the job's state, defaulting to {}
+        } else {
+          backfillMissingRuntimeFields(job); // job has no entry in the state file
+        }
+      }
+    } else if (!hasLegacyInlineState) { // neither a state file nor inline state was found
+      for (const job of store.jobs) {
+        backfillMissingRuntimeFields(job);
+      }
+    }
+
+    for (const job of store.jobs) {
+      ensureJobStateObject(job); // applied to every job regardless of which branch ran above
+    }
+
+    return store;
+  } catch (err) {
+    if ((err as { code?: unknown })?.code === "ENOENT") { // e.g. the store file does not exist: return an empty store
+      return { version: 1, jobs: [] };
+    }
+    throw err;
+  }
+}
+
 type SaveCronStoreOptions = {
   skipBackup?: boolean;
 };
diff --git a/src/gateway/server-startup-early.ts b/src/gateway/server-startup-early.ts
@@ -1,14 +1,18 @@
 import { registerSkillsChangeListener } from "../agents/skills/refresh.js";
 import type { GatewayTailscaleMode } from "../config/types.gateway.js";
 import type { OpenClawConfig } from "../config/types.openclaw.js";
+import { resolveCronStorePath } from "../cron/store.js";
 import { getMachineDisplayName } from "../infra/machine-name.js";
 import {
   primeRemoteSkillsCache,
   refreshRemoteBinsForConnectedNodes,
   setSkillsRemoteRegistry,
 } from "../infra/skills-remote.js";
 import type { PluginRegistry } from "../plugins/registry-types.js";
-import { startTaskRegistryMaintenance } from "../tasks/task-registry.maintenance.js";
+import {
+  configureTaskRegistryMaintenance,
+  startTaskRegistryMaintenance,
+} from "../tasks/task-registry.maintenance.js";
 import { startGatewayDiscovery } from "./server-discovery-runtime.js";
 import { startGatewayMaintenanceTimers } from "./server-maintenance.js";
 
@@ -77,6 +81,10 @@ export async function startGatewayEarlyRuntime(params: {
   if (!params.minimalTestGateway) {
     setSkillsRemoteRegistry(params.nodeRegistry);
     void primeRemoteSkillsCache();
+    configureTaskRegistryMaintenance({
+      cronStorePath: resolveCronStorePath(params.cfgAtStart.cron?.store),
+      cronRuntimeAuthoritative: true,
+    });
     startTaskRegistryMaintenance();
   }
 
diff --git a/src/tasks/task-registry.maintenance.issue-60299.test.ts b/src/tasks/task-registry.maintenance.issue-60299.test.ts
diff --git a/src/tasks/task-registry.maintenance.ts b/src/tasks/task-registry.maintenance.ts
diff --git a/src/tasks/task-registry.test.ts b/src/tasks/task-registry.test.ts