Skip to content

Commit c602824

Browse files
authored
fix(cron): stop unresolved next-run refire loops (#66083)
Merged via squash. Prepared head SHA: b86ba58
1 parent 114ff23 commit c602824

5 files changed

Lines changed: 285 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ Docs: https://docs.openclaw.ai
2020
- Browser/CDP: let managed local Chrome readiness, status probes, and managed loopback CDP control bypass browser SSRF policy for their own loopback control plane, so OpenClaw no longer misclassifies a healthy child browser as "not reachable after start". (#65695, #66043) Thanks @mbelinky.
2121
- Gateway/sessions: stop heartbeat, cron-event, and exec-event turns from overwriting shared-session routing and origin metadata, preventing synthetic `heartbeat` targets from poisoning later cron or user delivery. (#63733, #35300)
2222
- Browser/CDP: let local attach-only `manual-cdp` profiles reuse the local loopback CDP control plane under strict default policy and remote-class probe timeouts, so tabs/snapshot stop falsely reporting a live local browser session as not running. (#65611, #66080) Thanks @mbelinky.
23+
- Cron/scheduler: stop inventing short retries when cron next-run calculation returns no valid future slot, and keep a maintenance wake armed so enabled unscheduled jobs recover without entering a refire loop. (#66019, #66083) Thanks @mbelinky.
2324

2425
## 2026.4.12
2526

src/cron/service.armtimer-tight-loop.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,43 @@ describe("CronService - armTimer tight loop prevention", () => {
141141
timeoutSpy.mockRestore();
142142
});
143143

144+
it("keeps a maintenance wake armed when enabled jobs have no nextRunAtMs", () => {
145+
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
146+
const now = Date.parse("2026-02-28T12:32:00.000Z");
147+
148+
const state = createTimerState({
149+
storePath: "/tmp/test-cron/jobs.json",
150+
now,
151+
});
152+
state.store = {
153+
version: 1,
154+
jobs: [
155+
{
156+
id: "missing-next-run",
157+
name: "missing-next-run",
158+
enabled: true,
159+
deleteAfterRun: false,
160+
createdAtMs: now - 60_000,
161+
updatedAtMs: now - 60_000,
162+
schedule: { kind: "cron", expr: "*/15 * * * *" },
163+
sessionTarget: "isolated" as const,
164+
wakeMode: "next-heartbeat" as const,
165+
payload: { kind: "agentTurn" as const, message: "test" },
166+
delivery: { mode: "none" as const },
167+
state: {},
168+
},
169+
],
170+
};
171+
172+
armTimer(state);
173+
174+
expect(state.timer).not.toBeNull();
175+
const delays = extractTimeoutDelays(timeoutSpy);
176+
expect(delays).toContain(60_000);
177+
178+
timeoutSpy.mockRestore();
179+
});
180+
144181
it("breaks the onTimer→armTimer hot-loop with stuck runningAtMs", async () => {
145182
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
146183
const store = await makeStorePath();
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import { describe, expect, it, vi } from "vitest";
2+
import {
3+
createDefaultIsolatedRunner,
4+
createIsolatedRegressionJob,
5+
noopLogger,
6+
setupCronRegressionFixtures,
7+
writeCronJobs,
8+
} from "../../test/helpers/cron/service-regression-fixtures.js";
9+
import * as schedule from "./schedule.js";
10+
import { createCronServiceState } from "./service/state.js";
11+
import { onTimer } from "./service/timer.js";
12+
13+
const issue66019Fixtures = setupCronRegressionFixtures({ prefix: "cron-66019-" });
14+
15+
describe("#66019 unresolved next-run repro", () => {
16+
it("does not refire a recurring cron job 2s later when next-run resolution returns undefined", async () => {
17+
const store = issue66019Fixtures.makeStorePath();
18+
const scheduledAt = Date.parse("2026-04-13T15:40:00.000Z");
19+
let now = scheduledAt;
20+
21+
const cronJob = createIsolatedRegressionJob({
22+
id: "cron-66019-minimal-success",
23+
name: "cron-66019-minimal-success",
24+
scheduledAt,
25+
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
26+
payload: { kind: "agentTurn", message: "ping" },
27+
state: { nextRunAtMs: scheduledAt - 1_000 },
28+
});
29+
await writeCronJobs(store.storePath, [cronJob]);
30+
31+
const runIsolatedAgentJob = createDefaultIsolatedRunner();
32+
const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined);
33+
const state = createCronServiceState({
34+
cronEnabled: true,
35+
storePath: store.storePath,
36+
log: noopLogger,
37+
nowMs: () => now,
38+
enqueueSystemEvent: vi.fn(),
39+
requestHeartbeatNow: vi.fn(),
40+
runIsolatedAgentJob,
41+
});
42+
43+
try {
44+
await onTimer(state);
45+
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
46+
expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined();
47+
48+
// Before the fix, applyJobResult would synthesize endedAt + 2_000 here,
49+
// so a second tick a couple seconds later would refire the same job.
50+
now = scheduledAt + 2_001;
51+
await onTimer(state);
52+
53+
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
54+
expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined();
55+
} finally {
56+
nextRunSpy.mockRestore();
57+
if (state.timer) {
58+
clearTimeout(state.timer);
59+
state.timer = null;
60+
}
61+
}
62+
});
63+
64+
it("does not refire a recurring errored cron job after the first backoff window when next-run resolution returns undefined", async () => {
65+
const store = issue66019Fixtures.makeStorePath();
66+
const scheduledAt = Date.parse("2026-04-13T15:45:00.000Z");
67+
let now = scheduledAt;
68+
69+
const cronJob = createIsolatedRegressionJob({
70+
id: "cron-66019-minimal-error",
71+
name: "cron-66019-minimal-error",
72+
scheduledAt,
73+
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
74+
payload: { kind: "agentTurn", message: "ping" },
75+
state: { nextRunAtMs: scheduledAt - 1_000 },
76+
});
77+
await writeCronJobs(store.storePath, [cronJob]);
78+
79+
const runIsolatedAgentJob = vi.fn().mockResolvedValue({
80+
status: "error",
81+
error: "synthetic failure",
82+
});
83+
const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined);
84+
const state = createCronServiceState({
85+
cronEnabled: true,
86+
storePath: store.storePath,
87+
log: noopLogger,
88+
nowMs: () => now,
89+
enqueueSystemEvent: vi.fn(),
90+
requestHeartbeatNow: vi.fn(),
91+
runIsolatedAgentJob,
92+
});
93+
94+
try {
95+
await onTimer(state);
96+
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
97+
expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined();
98+
99+
// Before the fix, the error branch would synthesize the first backoff
100+
// retry (30s), so the next tick after that window would rerun the job.
101+
now = scheduledAt + 30_001;
102+
await onTimer(state);
103+
104+
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
105+
expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined();
106+
} finally {
107+
nextRunSpy.mockRestore();
108+
if (state.timer) {
109+
clearTimeout(state.timer);
110+
state.timer = null;
111+
}
112+
}
113+
});
114+
});

src/cron/service/timer.regression.test.ts

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,11 +1061,11 @@ describe("cron service timer regressions", () => {
10611061
expect(job.state.lastStatus).toBe("ok");
10621062
expect(job.state.scheduleErrorCount).toBe(1);
10631063
expect(job.state.lastError).toMatch(/^schedule error:/);
1064-
expect(job.state.nextRunAtMs).toBe(endedAt + 2_000);
1064+
expect(job.state.nextRunAtMs).toBeUndefined();
10651065
expect(job.enabled).toBe(true);
10661066
});
10671067

1068-
it("falls back to backoff schedule when cron next-run computation throws on error path (#30905)", () => {
1068+
it("keeps state updates when cron next-run computation throws on error path (#30905)", () => {
10691069
const startedAt = Date.parse("2026-03-02T12:05:00.000Z");
10701070
const endedAt = startedAt + 25;
10711071
const state = createCronServiceState({
@@ -1100,10 +1100,93 @@ describe("cron service timer regressions", () => {
11001100
expect(job.state.consecutiveErrors).toBe(1);
11011101
expect(job.state.scheduleErrorCount).toBe(1);
11021102
expect(job.state.lastError).toMatch(/^schedule error:/);
1103-
expect(job.state.nextRunAtMs).toBe(endedAt + 30_000);
1103+
expect(job.state.nextRunAtMs).toBeUndefined();
11041104
expect(job.enabled).toBe(true);
11051105
});
11061106

1107+
it("does not synthesize a 2s retry when cron schedule computation returns undefined (#66019)", () => {
1108+
const startedAt = Date.parse("2026-04-13T15:40:00.000Z");
1109+
const endedAt = startedAt + 50;
1110+
const state = createCronServiceState({
1111+
cronEnabled: true,
1112+
storePath: "/tmp/cron-66019-success.json",
1113+
log: noopLogger,
1114+
nowMs: () => endedAt,
1115+
enqueueSystemEvent: vi.fn(),
1116+
requestHeartbeatNow: vi.fn(),
1117+
runIsolatedAgentJob: createDefaultIsolatedRunner(),
1118+
});
1119+
const job = createIsolatedRegressionJob({
1120+
id: "cron-66019-success",
1121+
name: "cron-66019-success",
1122+
scheduledAt: startedAt,
1123+
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
1124+
payload: { kind: "agentTurn", message: "ping" },
1125+
state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 },
1126+
});
1127+
const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined);
1128+
1129+
try {
1130+
const shouldDelete = applyJobResult(state, job, {
1131+
status: "ok",
1132+
delivered: true,
1133+
startedAt,
1134+
endedAt,
1135+
});
1136+
1137+
expect(shouldDelete).toBe(false);
1138+
expect(job.state.runningAtMs).toBeUndefined();
1139+
expect(job.state.lastRunAtMs).toBe(startedAt);
1140+
expect(job.state.lastStatus).toBe("ok");
1141+
expect(job.state.nextRunAtMs).toBeUndefined();
1142+
expect(job.enabled).toBe(true);
1143+
} finally {
1144+
nextRunSpy.mockRestore();
1145+
}
1146+
});
1147+
1148+
it("does not synthesize backoff retries when cron schedule computation returns undefined (#66019)", () => {
1149+
const startedAt = Date.parse("2026-04-13T15:45:00.000Z");
1150+
const endedAt = startedAt + 25;
1151+
const state = createCronServiceState({
1152+
cronEnabled: true,
1153+
storePath: "/tmp/cron-66019-error.json",
1154+
log: noopLogger,
1155+
nowMs: () => endedAt,
1156+
enqueueSystemEvent: vi.fn(),
1157+
requestHeartbeatNow: vi.fn(),
1158+
runIsolatedAgentJob: createDefaultIsolatedRunner(),
1159+
});
1160+
const job = createIsolatedRegressionJob({
1161+
id: "cron-66019-error",
1162+
name: "cron-66019-error",
1163+
scheduledAt: startedAt,
1164+
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
1165+
payload: { kind: "agentTurn", message: "ping" },
1166+
state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 },
1167+
});
1168+
const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined);
1169+
1170+
try {
1171+
const shouldDelete = applyJobResult(state, job, {
1172+
status: "error",
1173+
error: "synthetic failure",
1174+
startedAt,
1175+
endedAt,
1176+
});
1177+
1178+
expect(shouldDelete).toBe(false);
1179+
expect(job.state.runningAtMs).toBeUndefined();
1180+
expect(job.state.lastRunAtMs).toBe(startedAt);
1181+
expect(job.state.lastStatus).toBe("error");
1182+
expect(job.state.consecutiveErrors).toBe(1);
1183+
expect(job.state.nextRunAtMs).toBeUndefined();
1184+
expect(job.enabled).toBe(true);
1185+
} finally {
1186+
nextRunSpy.mockRestore();
1187+
}
1188+
});
1189+
11071190
it("force run preserves 'every' anchor while recording manual lastRunAtMs", () => {
11081191
const nowMs = Date.now();
11091192
const everyMs = 24 * 60 * 60 * 1_000;

src/cron/service/timer.ts

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,27 @@ function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[]
240240
return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error));
241241
}
242242

243+
function resolveCronNextRunWithLowerBound(params: {
244+
state: CronServiceState;
245+
job: CronJob;
246+
naturalNext: number | undefined;
247+
lowerBoundMs: number;
248+
context: "completion" | "error_backoff";
249+
}): number | undefined {
250+
if (params.naturalNext === undefined) {
251+
params.state.deps.log.warn(
252+
{
253+
jobId: params.job.id,
254+
jobName: params.job.name,
255+
context: params.context,
256+
},
257+
"cron: next run unresolved; clearing schedule to avoid a refire loop",
258+
);
259+
return undefined;
260+
}
261+
return Math.max(params.naturalNext, params.lowerBoundMs);
262+
}
263+
243264
function resolveRetryConfig(cronConfig?: CronConfig) {
244265
const retry = cronConfig?.retry;
245266
return {
@@ -518,7 +539,17 @@ export function applyJobResult(
518539
const backoffNext = result.endedAt + backoff;
519540
// Use whichever is later: the natural next run or the backoff delay; a cron job whose natural next run is unresolved is left unscheduled.
520541
job.state.nextRunAtMs =
521-
normalNext !== undefined ? Math.max(normalNext, backoffNext) : backoffNext;
542+
job.schedule.kind === "cron"
543+
? resolveCronNextRunWithLowerBound({
544+
state,
545+
job,
546+
naturalNext: normalNext,
547+
lowerBoundMs: backoffNext,
548+
context: "error_backoff",
549+
})
550+
: normalNext !== undefined
551+
? Math.max(normalNext, backoffNext)
552+
: backoffNext;
522553
state.deps.log.info(
523554
{
524555
jobId: job.id,
@@ -547,8 +578,13 @@ export function applyJobResult(
547578
// schedule computation lands in the same second due to
548579
// timezone/croner edge cases (see #17821).
549580
const minNext = result.endedAt + MIN_REFIRE_GAP_MS;
550-
job.state.nextRunAtMs =
551-
naturalNext !== undefined ? Math.max(naturalNext, minNext) : minNext;
581+
job.state.nextRunAtMs = resolveCronNextRunWithLowerBound({
582+
state,
583+
job,
584+
naturalNext,
585+
lowerBoundMs: minNext,
586+
context: "completion",
587+
});
552588
} else {
553589
job.state.nextRunAtMs = naturalNext;
554590
}
@@ -609,6 +645,14 @@ export function armTimer(state: CronServiceState) {
609645
const withNextRun =
610646
state.store?.jobs.filter((j) => j.enabled && hasScheduledNextRunAtMs(j.state.nextRunAtMs))
611647
.length ?? 0;
648+
if (enabledCount > 0) {
649+
armRunningRecheckTimer(state);
650+
state.deps.log.debug(
651+
{ jobCount, enabledCount, withNextRun, delayMs: MAX_TIMER_DELAY_MS },
652+
"cron: timer armed for maintenance recheck",
653+
);
654+
return;
655+
}
612656
state.deps.log.debug(
613657
{ jobCount, enabledCount, withNextRun },
614658
"cron: armTimer skipped - no jobs with nextRunAtMs",

0 commit comments

Comments
 (0)