Skip to content

Commit 190a4b4

Browse files
authored
fix(cron): preserve unresolved next-run backoff (#66113)
Merged via squash. Prepared head SHA: a553daa Co-authored-by: mbelinky <132747814+mbelinky@users.noreply.github.com> Co-authored-by: mbelinky <132747814+mbelinky@users.noreply.github.com> Reviewed-by: @mbelinky
1 parent 31281bc commit 190a4b4

4 files changed

Lines changed: 97 additions & 22 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Docs: https://docs.openclaw.ai
2525
- Gateway/sessions: stop heartbeat, cron-event, and exec-event turns from overwriting shared-session routing and origin metadata, preventing synthetic `heartbeat` targets from poisoning later cron or user delivery. (#63733, #35300)
2626
- Browser/CDP: let local attach-only `manual-cdp` profiles reuse the local loopback CDP control plane under strict default policy and remote-class probe timeouts, so tabs/snapshot stop falsely reporting a live local browser session as not running. (#65611, #66080) Thanks @mbelinky.
2727
- Cron/scheduler: stop inventing short retries when cron next-run calculation returns no valid future slot, and keep a maintenance wake armed so enabled unscheduled jobs recover without entering a refire loop. (#66019, #66083) Thanks @mbelinky.
28+
- Cron/scheduler: preserve the active error-backoff floor when maintenance repair recomputes a missing cron next-run, so recurring errored jobs do not resume early after a transient next-run resolution failure. (#66019, #66083, #66113) Thanks @mbelinky.
2829

2930
## 2026.4.12
3031

src/cron/service.issue-66019-unresolved-next-run.test.ts

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,4 +111,61 @@ describe("#66019 unresolved next-run repro", () => {
111111
}
112112
}
113113
});
114+
115+
// Regression case: a cron job errors on its first run while the first next-run
// computation returns no slot. Later computations return a slot only 5 s after
// the scheduled time; the job must not run again until the 30 s backoff mark.
it("preserves the active error backoff floor when maintenance repair later finds a natural next run", async () => {
116+
const store = issue66019Fixtures.makeStorePath();
117+
const scheduledAt = Date.parse("2026-04-13T15:50:00.000Z");
118+
// Mutable clock: the service state below reads time through `nowMs: () => now`.
let now = scheduledAt;
119+
120+
const cronJob = createIsolatedRegressionJob({
121+
id: "cron-66019-error-backoff-floor",
122+
name: "cron-66019-error-backoff-floor",
123+
scheduledAt,
124+
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
125+
payload: { kind: "agentTurn", message: "ping" },
126+
// Stored next run is 1 s before `now`; the first onTimer call is expected to run the job.
state: { nextRunAtMs: scheduledAt - 1_000 },
127+
});
128+
await writeCronJobs(store.storePath, [cronJob]);
129+
130+
// Every run of the job resolves with an error result.
const runIsolatedAgentJob = vi.fn().mockResolvedValue({
131+
status: "error",
132+
error: "synthetic failure",
133+
});
134+
// Slot the mocked schedule reports once it starts resolving again.
const naturalNext = scheduledAt + 5_000;
135+
// Expected floor after one error; 30 s equals the first entry of DEFAULT_ERROR_BACKOFF_SCHEDULE_MS.
const backoffNext = scheduledAt + 30_000;
136+
// First computeNextRunAtMs call returns undefined (unresolved); all later calls return naturalNext.
const nextRunSpy = vi
137+
.spyOn(schedule, "computeNextRunAtMs")
138+
.mockReturnValueOnce(undefined)
139+
.mockReturnValueOnce(naturalNext)
140+
.mockReturnValue(naturalNext);
141+
const state = createCronServiceState({
142+
cronEnabled: true,
143+
storePath: store.storePath,
144+
log: noopLogger,
145+
nowMs: () => now,
146+
enqueueSystemEvent: vi.fn(),
147+
requestHeartbeatNow: vi.fn(),
148+
runIsolatedAgentJob,
149+
});
150+
151+
try {
// Phase 1: at the scheduled time the job runs once (and errors); the stored
// next run must be the backoff time, not the earlier natural slot.
152+
await onTimer(state);
153+
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
154+
expect(state.store?.jobs[0]?.state.nextRunAtMs).toBe(backoffNext);
155+
156+
// Phase 2: just past the natural slot but still inside the backoff window: no second run.
now = naturalNext + 1;
157+
await onTimer(state);
158+
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
159+
160+
// Phase 3: just past the backoff floor: the job runs a second time.
now = backoffNext + 1;
161+
await onTimer(state);
162+
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(2);
163+
} finally {
// Undo the spy and clear any timer left on the state object.
164+
nextRunSpy.mockRestore();
165+
if (state.timer) {
166+
clearTimeout(state.timer);
167+
state.timer = null;
168+
}
169+
}
170+
});
114171
});

src/cron/service/jobs.ts

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@ import type { CronServiceState } from "./state.js";
3838
const STUCK_RUN_MS = 2 * 60 * 60 * 1000;
3939
const STAGGER_OFFSET_CACHE_MAX = 4096;
4040
const staggerOffsetCache = new Map<string, number>();
41+
/**
 * Default error backoff delays (in ms), indexed by consecutive error count
 * (see errorBackoffMs, which reads entry `consecutiveErrors - 1`).
 * After the last entry the delay stays constant.
 */
export const DEFAULT_ERROR_BACKOFF_SCHEDULE_MS = [
42+
30_000, // 1st error -> 30 s
43+
60_000, // 2nd error -> 1 min
44+
5 * 60_000, // 3rd error -> 5 min
45+
15 * 60_000, // 4th error -> 15 min
46+
60 * 60_000, // 5th+ error -> 60 min
47+
];
4148

4249
function isFiniteTimestamp(value: unknown): value is number {
4350
return typeof value === "number" && Number.isFinite(value);
@@ -47,6 +54,14 @@ export function hasScheduledNextRunAtMs(value: unknown): value is number {
4754
return isFiniteTimestamp(value) && value > 0;
4855
}
4956

57+
/**
 * Returns the backoff delay in milliseconds for a given consecutive error count.
 *
 * @param consecutiveErrors Error count; entry `consecutiveErrors - 1` of the schedule is used.
 * @param scheduleMs Delay table in ms; defaults to DEFAULT_ERROR_BACKOFF_SCHEDULE_MS.
 * @returns The selected delay, or the first default entry when the lookup yields
 *   undefined (for example when `scheduleMs` is an empty array).
 */
export function errorBackoffMs(
58+
consecutiveErrors: number,
59+
scheduleMs = DEFAULT_ERROR_BACKOFF_SCHEDULE_MS,
60+
): number {
61+
// Clamp high: counts beyond the table length reuse the last entry.
const idx = Math.min(consecutiveErrors - 1, scheduleMs.length - 1);
62+
// Clamp low: counts of 0 or less (and an empty table, where idx is -1) read index 0.
return scheduleMs[Math.max(0, idx)] ?? DEFAULT_ERROR_BACKOFF_SCHEDULE_MS[0];
63+
}
64+
5065
function resolveStableCronOffsetMs(jobId: string, staggerMs: number) {
5166
if (staggerMs <= 1) {
5267
return 0;
@@ -421,7 +436,27 @@ function walkSchedulableJobs(
421436
function recomputeJobNextRunAtMs(params: { state: CronServiceState; job: CronJob; nowMs: number }) {
422437
let changed = false;
423438
try {
424-
const newNext = computeJobNextRunAtMs(params.job, params.nowMs);
439+
let newNext = computeJobNextRunAtMs(params.job, params.nowMs);
440+
if (
441+
params.job.schedule.kind !== "at" &&
442+
params.job.state.lastStatus === "error" &&
443+
isFiniteTimestamp(params.job.state.lastRunAtMs)
444+
) {
445+
const consecutiveErrorsRaw = params.job.state.consecutiveErrors;
446+
const consecutiveErrors =
447+
typeof consecutiveErrorsRaw === "number" && Number.isFinite(consecutiveErrorsRaw)
448+
? Math.max(1, Math.floor(consecutiveErrorsRaw))
449+
: 1;
450+
const backoffFloor =
451+
params.job.state.lastRunAtMs +
452+
errorBackoffMs(
453+
consecutiveErrors,
454+
params.state.deps.cronConfig?.retry?.backoffMs ?? DEFAULT_ERROR_BACKOFF_SCHEDULE_MS,
455+
);
456+
if (newNext !== undefined) {
457+
newNext = Math.max(newNext, backoffFloor);
458+
}
459+
}
425460
if (params.job.state.nextRunAtMs !== newNext) {
426461
params.job.state.nextRunAtMs = newNext;
427462
changed = true;

src/cron/service/timer.ts

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ import type {
2020
CronRunTelemetry,
2121
} from "../types.js";
2222
import {
23+
DEFAULT_ERROR_BACKOFF_SCHEDULE_MS,
2324
computeJobPreviousRunAtMs,
2425
computeJobNextRunAtMs,
26+
errorBackoffMs,
2527
hasScheduledNextRunAtMs,
2628
isJobEnabled,
2729
nextWakeAtMs,
@@ -199,26 +201,6 @@ function tryFinishCronTaskRun(
199201
);
200202
}
201203
}
202-
/**
203-
* Exponential backoff delays (in ms) indexed by consecutive error count.
204-
* After the last entry the delay stays constant.
205-
*/
206-
const DEFAULT_BACKOFF_SCHEDULE_MS = [
207-
30_000, // 1st error → 30 s
208-
60_000, // 2nd error → 1 min
209-
5 * 60_000, // 3rd error → 5 min
210-
15 * 60_000, // 4th error → 15 min
211-
60 * 60_000, // 5th+ error → 60 min
212-
];
213-
214-
function errorBackoffMs(
215-
consecutiveErrors: number,
216-
scheduleMs = DEFAULT_BACKOFF_SCHEDULE_MS,
217-
): number {
218-
const idx = Math.min(consecutiveErrors - 1, scheduleMs.length - 1);
219-
return scheduleMs[Math.max(0, idx)];
220-
}
221-
222204
/** Default max retries for one-shot jobs on transient errors (#24355). */
223205
const DEFAULT_MAX_TRANSIENT_RETRIES = 3;
224206

@@ -269,7 +251,7 @@ function resolveRetryConfig(cronConfig?: CronConfig) {
269251
backoffMs:
270252
Array.isArray(retry?.backoffMs) && retry.backoffMs.length > 0
271253
? retry.backoffMs
272-
: DEFAULT_BACKOFF_SCHEDULE_MS.slice(0, 3),
254+
: DEFAULT_ERROR_BACKOFF_SCHEDULE_MS.slice(0, 3),
273255
retryOn: Array.isArray(retry?.retryOn) && retry.retryOn.length > 0 ? retry.retryOn : undefined,
274256
};
275257
}

0 commit comments

Comments
 (0)