Skip to content

Commit ebfb834

Browse files
authored
fix(cron): classify network retry errors (#85344)
1 parent 3551e98 commit ebfb834

4 files changed

Lines changed: 86 additions & 26 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Docs: https://docs.openclaw.ai
3333

3434
### Fixes
3535

36+
- Cron: honor `cron.retry.retryOn: ["network"]` for common network error codes such as `EAI_AGAIN`, `EHOSTUNREACH`, and `ENETUNREACH`.
3637
- Agents/OpenAI: preserve structured provider error code, type, and redacted body metadata on boundary-aware transport failures.
3738
- CLI/agents: retry transient normal-close Gateway handshakes before falling back to embedded `openclaw agent` execution.
3839
- CLI/update: keep managed Gateway service stop/restart status lines out of `openclaw update --json` stdout so package-update automation can parse the JSON payload.

src/cron/retry-hint.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import { describe, expect, it } from "vitest";
2+
import { resolveCronExecutionRetryHint } from "./retry-hint.js";
3+
4+
// Unit tests for resolveCronExecutionRetryHint: each case passes an error message plus a
// retryOn list and checks the exact hint object that comes back.
describe("resolveCronExecutionRetryHint", () => {
5+
// Messages that match the pattern of the single requested category are reported as
// retryable under that category.
it("matches classified transient errors", () => {
6+
expect(resolveCronExecutionRetryHint("HTTP 529", ["overloaded"])).toEqual({
7+
retryable: true,
8+
category: "overloaded",
9+
});
10+
expect(resolveCronExecutionRetryHint("429 rate limit exceeded", ["rate_limit"])).toEqual({
11+
retryable: true,
12+
category: "rate_limit",
13+
});
14+
});
15+
16+
// With retryOn restricted to "network", each error code below must be enough on its own
// to classify the message as a network failure.
it("treats common network error codes as network when retryOn only includes network", () => {
17+
for (const code of [
18+
"EAI_AGAIN",
19+
"EHOSTUNREACH",
20+
"EHOSTDOWN",
21+
"ENETRESET",
22+
"ENETUNREACH",
23+
"EPIPE",
24+
]) {
25+
// The fixed "temporary DNS failure:" prefix contains none of the other network keywords,
// so the match can only come from the interpolated code.
expect(resolveCronExecutionRetryHint(`temporary DNS failure: ${code}`, ["network"])).toEqual({
26+
retryable: true,
27+
category: "network",
28+
});
29+
}
30+
});
31+
32+
// A message that matches no requested category yields a hint without a category field
// (toEqual against `{ retryable: false }`).
it("does not retry permanent errors", () => {
33+
expect(resolveCronExecutionRetryHint("invalid API key", ["network"])).toEqual({
34+
retryable: false,
35+
});
36+
});
37+
});

src/cron/retry-hint.ts

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import type { CronRetryOn } from "../config/types.cron.js";
2+
3+
/**
 * Result of classifying a cron execution error for retry purposes, as returned by
 * resolveCronExecutionRetryHint.
 */
export type CronRetryHint = {
4+
// True when the error fell into one of the retry categories that were considered.
retryable: boolean;
5+
// The category that matched. resolveCronExecutionRetryHint only sets it when retryable is true.
category?: CronRetryOn;
6+
};
7+
8+
/**
 * One regular expression per retry category, tested against the raw error message.
 * The key order is also the order in which categories are tried when the caller
 * supplies no retryOn list (see the Object.keys fallback in resolveCronExecutionRetryHint).
 */
const TRANSIENT_PATTERNS: Record<CronRetryOn, RegExp> = {
9+
// Rate limiting, case-insensitive.
// NOTE(review): `429` has no word boundary, so it also matches inside longer digit runs.
rate_limit:
10+
/(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare|tokens per day)/i,
11+
// Overload signals, case-insensitive; `529` must stand alone as a word here.
overloaded:
12+
/\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i,
13+
// Generic network wording plus errno-style codes, matched case-insensitively so that
// upper-case codes such as EAI_AGAIN or ENETUNREACH are covered.
network:
14+
/(network|fetch failed|socket|econnreset|econnrefused|eai_again|ehostunreach|ehostdown|enetreset|enetunreach|epipe)/i,
15+
// Timeout wording or the ETIMEDOUT code, case-insensitive.
timeout: /(timeout|etimedout)/i,
16+
// Any word-bounded three-digit number starting with 5. This also matches 529, so a message
// can satisfy both `overloaded` and `server_error`; whichever key is tried first wins.
server_error: /\b5\d{2}\b/,
17+
};
18+
19+
/**
 * Decides whether a cron execution error belongs to a retryable category.
 *
 * @param error - Error message to classify. A missing, empty, or non-string value is
 *   never retryable.
 * @param retryOn - Categories to consider. When omitted or empty, every key of
 *   TRANSIENT_PATTERNS is considered.
 * @param classifiedReason - Optional reason already determined by the caller. If it is one of
 *   the considered categories it is accepted as-is, without testing the message.
 * @returns `{ retryable: true, category }` for the first category that applies, otherwise
 *   `{ retryable: false }`.
 */
export function resolveCronExecutionRetryHint(
20+
error: string | undefined,
21+
retryOn?: CronRetryOn[],
22+
classifiedReason?: string | null,
23+
): CronRetryHint {
24+
// The typeof check guards against non-string values slipping past the type system at runtime.
if (!error || typeof error !== "string") {
25+
return { retryable: false };
26+
}
27+
// An empty retryOn list is treated the same as no list: fall back to all known categories.
const keys = retryOn?.length ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]);
28+
// Normalise null to undefined so a single truthiness check below covers both.
const classified = classifiedReason ?? undefined;
29+
// A caller-supplied classification takes precedence over pattern matching, but only when
// it names a category that is actually being considered.
if (classified && keys.includes(classified as CronRetryOn)) {
30+
return { retryable: true, category: classified as CronRetryOn };
31+
}
32+
// Categories are tried in the order given; a message matching several patterns is reported
// under the first one.
for (const key of keys) {
33+
// Optional call: a retryOn entry with no pattern in the table is skipped rather than throwing.
if (TRANSIENT_PATTERNS[key]?.test(error)) {
34+
return { retryable: true, category: key };
35+
}
36+
}
37+
return { retryable: false };
38+
}

src/cron/service/timer.ts

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { resolveFailoverReasonFromError } from "../../agents/failover-error.js";
22
import { formatEmbeddedAgentExecutionPhase } from "../../agents/pi-embedded-runner/execution-phase.js";
33
import { readSessionEntry } from "../../config/sessions/store-load.js";
44
import type { SessionEntry } from "../../config/sessions/types.js";
5-
import type { CronConfig, CronRetryOn } from "../../config/types.cron.js";
5+
import type { CronConfig } from "../../config/types.cron.js";
66
import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js";
77
import {
88
HEARTBEAT_SKIP_CRON_IN_PROGRESS,
@@ -25,6 +25,7 @@ import type { DeliveryContext } from "../../utils/delivery-context.types.js";
2525
import { clearCronJobActive, markCronJobActive } from "../active-jobs.js";
2626
import { resolveCronDeliveryPlan, resolveFailureDestination } from "../delivery-plan.js";
2727
import { resolveCronAgentSessionKey } from "../isolated-agent/session-key.js";
28+
import { resolveCronExecutionRetryHint } from "../retry-hint.js";
2829
import {
2930
createCronRunDiagnosticsFromError,
3031
normalizeCronRunDiagnostics,
@@ -560,28 +561,6 @@ function tryFinishCronTaskRun(
560561
/** Default max retries for one-shot jobs on transient errors (#24355). */
561562
const DEFAULT_MAX_TRANSIENT_RETRIES = 3;
562563

563-
const TRANSIENT_PATTERNS: Record<string, RegExp> = {
564-
rate_limit:
565-
/(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare|tokens per day)/i,
566-
overloaded:
567-
/\b529\b|\boverloaded(?:_error)?\b|high demand|temporar(?:ily|y) overloaded|capacity exceeded/i,
568-
network: /(network|econnreset|econnrefused|fetch failed|socket)/i,
569-
timeout: /(timeout|etimedout)/i,
570-
server_error: /\b5\d{2}\b/,
571-
};
572-
573-
function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[]): boolean {
574-
if (!error || typeof error !== "string") {
575-
return false;
576-
}
577-
const keys = retryOn?.length ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]);
578-
const classified = resolveFailoverReasonFromError(error);
579-
if (classified && keys.includes(classified as CronRetryOn)) {
580-
return true;
581-
}
582-
return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error));
583-
}
584-
585564
function resolveCronNextRunWithLowerBound(params: {
586565
state: CronServiceState;
587566
job: CronJob;
@@ -970,10 +949,14 @@ export function applyJobResult(
970949
job.state.nextRunAtMs = undefined;
971950
} else if (result.status === "error") {
972951
const retryConfig = resolveRetryConfig(state.deps.cronConfig);
973-
const transient = isTransientCronError(result.error, retryConfig.retryOn);
952+
const retryHint = resolveCronExecutionRetryHint(
953+
result.error,
954+
retryConfig.retryOn,
955+
job.state.lastErrorReason,
956+
);
974957
// consecutiveErrors is always set to ≥1 by the increment block above.
975958
const consecutive = job.state.consecutiveErrors;
976-
if (transient && consecutive <= retryConfig.maxAttempts) {
959+
if (retryHint.retryable && consecutive <= retryConfig.maxAttempts) {
977960
// Schedule retry with backoff (#24355).
978961
const backoff = errorBackoffMs(consecutive, retryConfig.backoffMs);
979962
job.state.nextRunAtMs = result.endedAt + backoff;
@@ -1000,7 +983,8 @@ export function applyJobResult(
1000983
jobName: job.name,
1001984
consecutiveErrors: consecutive,
1002985
error: result.error,
1003-
reason: transient ? "max retries exhausted" : "permanent error",
986+
reason: retryHint.retryable ? "max retries exhausted" : "permanent error",
987+
retryCategory: retryHint.category,
1004988
},
1005989
"cron: disabling one-shot job after error",
1006990
);

0 commit comments

Comments
 (0)