Skip to content

Commit 8cc38c1

Browse files
MonkeyLeeT and obviyus authored
fix: stop session lock failover (#68700) (thanks @MonkeyLeeT)
* fix(agents): stop treating session lock waits as timeout
* fix(agents): ignore abort-wrapped session lock waits
* fix(agents): keep explicit failover metadata authoritative
* fix(agents): respect inferred failover metadata
* fix(agents): ignore generic abort codes for lock waits
* fix(agents): suppress cause-based lock wait fallback
* fix(agents): type session lock timeout errors
* fix: stop session lock failover (#68700) (thanks @MonkeyLeeT)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
1 parent c03e5b3 commit 8cc38c1

6 files changed

Lines changed: 160 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ Docs: https://docs.openclaw.ai
231231
- Config/agents: accept `agents.list[].contextTokens` in strict config validation so per-agent overrides survive hot reload, letting `/status` reflect the configured model window instead of the 200k fallback. Fixes #70692. (#71247) Thanks @statxc.
232232
- Heartbeat: include async exec completion details in heartbeat prompts so command-finished notifications relay the actual output. (#71213) Thanks @GodsBoy.
233233
- Memory search: apply session visibility and agent-to-agent policy to session transcript hits, and keep `corpus=sessions` ranking scoped to session collections before result limiting. (#70761) Thanks @nefainl.
234+
- Agents/sessions: stop session write-lock timeouts from entering model failover, so local lock contention surfaces directly instead of cascading across providers. (#68700) Thanks @MonkeyLeeT.
234235

235236
## 2026.4.23
236237

src/agents/failover-error.test.ts

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
resolveFailoverStatus,
99
} from "./failover-error.js";
1010
import { classifyFailoverSignal } from "./pi-embedded-helpers/errors.js";
11+
import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js";
1112

1213
// OpenAI 429 example shape: https://help.openai.com/en/articles/5955604-how-can-i-solve-429-too-many-requests-errors
1314
const OPENAI_RATE_LIMIT_MESSAGE =
@@ -359,6 +360,87 @@ describe("failover-error", () => {
359360
).toBe("overloaded");
360361
});
361362

363+
it("does not classify session lock wait errors as model timeout failover", () => {
364+
const sessionLockError = new SessionWriteLockTimeoutError({
365+
timeoutMs: 10_000,
366+
owner: "pid=37121",
367+
lockPath: "/tmp/openclaw/session.jsonl.lock",
368+
});
369+
expect(resolveFailoverReasonFromError(sessionLockError)).toBeNull();
370+
expect(isTimeoutError(sessionLockError)).toBe(false);
371+
372+
const wrappedLockError = Object.assign(new Error("operation timed out"), {
373+
name: "AbortError",
374+
cause: sessionLockError,
375+
});
376+
expect(resolveFailoverReasonFromError(wrappedLockError)).toBeNull();
377+
expect(isTimeoutError(wrappedLockError)).toBe(false);
378+
379+
const abortWrappedLockError = Object.assign(new Error("request was aborted"), {
380+
name: "AbortError",
381+
cause: sessionLockError,
382+
});
383+
expect(resolveFailoverReasonFromError(abortWrappedLockError)).toBeNull();
384+
expect(isTimeoutError(abortWrappedLockError)).toBe(false);
385+
});
386+
387+
it("keeps explicit provider failover metadata authoritative over nested session lock text", () => {
388+
expect(
389+
resolveFailoverReasonFromError({
390+
status: 429,
391+
code: "RESOURCE_EXHAUSTED",
392+
message: "upstream quota pressure",
393+
cause: new SessionWriteLockTimeoutError({
394+
timeoutMs: 10_000,
395+
owner: "pid=37121",
396+
lockPath: "/tmp/openclaw/session.jsonl.lock",
397+
}),
398+
}),
399+
).toBe("rate_limit");
400+
});
401+
402+
it("keeps inferred HTTP failover metadata authoritative over nested session lock text", () => {
403+
expect(
404+
resolveFailoverReasonFromError({
405+
message: "HTTP 429: upstream quota pressure",
406+
cause: new SessionWriteLockTimeoutError({
407+
timeoutMs: 10_000,
408+
owner: "pid=37121",
409+
lockPath: "/tmp/openclaw/session.jsonl.lock",
410+
}),
411+
}),
412+
).toBe("rate_limit");
413+
});
414+
415+
it("does not treat generic abort codes as explicit failover metadata over nested session lock text", () => {
416+
expect(
417+
resolveFailoverReasonFromError({
418+
name: "AbortError",
419+
code: "ABORT_ERR",
420+
message: "The operation was aborted",
421+
cause: new SessionWriteLockTimeoutError({
422+
timeoutMs: 10_000,
423+
owner: "pid=37121",
424+
lockPath: "/tmp/openclaw/session.jsonl.lock",
425+
}),
426+
}),
427+
).toBeNull();
428+
});
429+
430+
it("does not let cause-based failover classification bypass wrapper session lock suppression", () => {
431+
expect(
432+
resolveFailoverReasonFromError({
433+
message: "wrapper",
434+
reason: new SessionWriteLockTimeoutError({
435+
timeoutMs: 10_000,
436+
owner: "pid=37121",
437+
lockPath: "/tmp/openclaw/session.jsonl.lock",
438+
}),
439+
cause: new Error("operation timed out"),
440+
}),
441+
).toBeNull();
442+
});
443+
362444
it("classifies provider-scoped generic upstream errors for failover", () => {
363445
expect(
364446
resolveFailoverReasonFromError({

src/agents/failover-error.ts

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
import { readErrorName } from "../infra/errors.js";
22
import {
33
classifyFailoverSignal,
4+
inferSignalStatus,
45
isUnclassifiedNoBodyHttpSignal,
56
type FailoverClassification,
67
type FailoverSignal,
78
} from "./pi-embedded-helpers/errors.js";
89
import { isTimeoutErrorMessage } from "./pi-embedded-helpers/errors.js";
910
import type { FailoverReason } from "./pi-embedded-helpers/types.js";
11+
import { isSessionWriteLockTimeoutError } from "./session-write-lock-error.js";
1012

1113
const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;
1214
const MAX_FAILOVER_CAUSE_DEPTH = 25;
@@ -198,10 +200,32 @@ function normalizeDirectErrorSignal(err: unknown): FailoverSignal {
198200
};
199201
}
200202

203+
/**
 * Reports whether `err`, or any value reachable from it through its `error`,
 * `cause`, or `reason` properties, is a session write-lock timeout error
 * (as decided by `isSessionWriteLockTimeoutError`).
 *
 * @param err - The value to inspect; non-object values that are not lock errors yield `false`.
 * @param seen - Objects already visited during this walk; used to stop on reference cycles.
 * @returns `true` if a session write-lock timeout error is found anywhere in the chain.
 */
function hasSessionWriteLockTimeout(err: unknown, seen: Set<object> = new Set()): boolean {
204+
// Direct hit: the value itself is (or is tagged as) a lock timeout error.
if (isSessionWriteLockTimeoutError(err)) {
205+
return true;
206+
}
207+
// Primitives and nullish values have no nested properties to follow.
if (!err || typeof err !== "object") {
208+
return false;
209+
}
210+
// Already visited: bail out so cyclic error/cause/reason links cannot recurse forever.
if (seen.has(err)) {
211+
return false;
212+
}
213+
seen.add(err);
214+
const candidate = err as { error?: unknown; cause?: unknown; reason?: unknown };
215+
// Follow each wrapper property in turn, sharing the same `seen` set across branches.
return (
216+
hasSessionWriteLockTimeout(candidate.error, seen) ||
217+
hasSessionWriteLockTimeout(candidate.cause, seen) ||
218+
hasSessionWriteLockTimeout(candidate.reason, seen)
219+
);
220+
}
221+
201222
function hasTimeoutHint(err: unknown): boolean {
202223
if (!err) {
203224
return false;
204225
}
226+
if (hasSessionWriteLockTimeout(err)) {
227+
return false;
228+
}
205229
if (readErrorName(err) === "TimeoutError") {
206230
return true;
207231
}
@@ -219,6 +243,9 @@ export function isTimeoutError(err: unknown): boolean {
219243
if (readErrorName(err) !== "AbortError") {
220244
return false;
221245
}
246+
if (hasSessionWriteLockTimeout(err)) {
247+
return false;
248+
}
222249
const message = getErrorMessage(err);
223250
if (message && ABORT_TIMEOUT_RE.test(message)) {
224251
return true;
@@ -316,8 +343,15 @@ function resolveFailoverClassificationFromErrorInternal(
316343
reason: err.reason,
317344
};
318345
}
319-
320346
const signal = normalizeErrorSignal(err);
347+
const codeReason = signal.code
348+
? failoverReasonFromClassification(classifyFailoverSignal({ code: signal.code }))
349+
: null;
350+
const hasExplicitFailoverMetadata =
351+
typeof inferSignalStatus(signal) === "number" ||
352+
(codeReason !== null && codeReason !== "timeout");
353+
const hasSessionLock = hasSessionWriteLockTimeout(err);
354+
321355
const classification = classifyFailoverSignal(signal);
322356
const nestedCandidates = getNestedErrorCandidates(err);
323357

@@ -329,6 +363,9 @@ function resolveFailoverClassificationFromErrorInternal(
329363
depth + 1,
330364
);
331365
if (nestedClassification) {
366+
if (hasSessionLock && !hasExplicitFailoverMetadata) {
367+
return null;
368+
}
332369
return nestedClassification;
333370
}
334371
}
@@ -352,9 +389,16 @@ function resolveFailoverClassificationFromErrorInternal(
352389
}
353390

354391
if (classification) {
392+
if (hasSessionLock && !hasExplicitFailoverMetadata) {
393+
return null;
394+
}
355395
return classification;
356396
}
357397

398+
if (hasSessionLock) {
399+
return null;
400+
}
401+
358402
if (isTimeoutError(err)) {
359403
return {
360404
kind: "reason",

src/agents/pi-embedded-helpers/errors.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ function stripErrorPrefix(raw: string): string {
337337
return raw.replace(/^error:\s*/i, "").trim();
338338
}
339339

340-
function inferSignalStatus(signal: FailoverSignal): number | undefined {
340+
export function inferSignalStatus(signal: FailoverSignal): number | undefined {
341341
if (typeof signal.status === "number" && Number.isFinite(signal.status)) {
342342
return signal.status;
343343
}
src/agents/session-write-lock-error.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
export const SESSION_WRITE_LOCK_TIMEOUT_CODE = "OPENCLAW_SESSION_WRITE_LOCK_TIMEOUT";
2+
3+
/**
 * Typed error for a session write-lock acquisition that timed out.
 *
 * Carries a fixed `code` (`SESSION_WRITE_LOCK_TIMEOUT_CODE`) so the error can be
 * recognised by `isSessionWriteLockTimeoutError` even without an `instanceof` match.
 */
export class SessionWriteLockTimeoutError extends Error {
4+
// Stable machine-readable marker; always equals SESSION_WRITE_LOCK_TIMEOUT_CODE.
readonly code = SESSION_WRITE_LOCK_TIMEOUT_CODE;
5+
// Timeout value, in milliseconds, as reported in the error message.
readonly timeoutMs: number;
6+
// Description of the current lock holder, supplied by the caller.
readonly owner: string;
7+
// Path of the lock file, supplied by the caller.
readonly lockPath: string;
8+
9+
/**
 * @param params - Timeout in milliseconds, lock owner description, and lock file path;
 *   all three are embedded in the error message and copied onto the instance.
 */
constructor(params: { timeoutMs: number; owner: string; lockPath: string }) {
10+
super(
11+
`session file locked (timeout ${params.timeoutMs}ms): ${params.owner} ${params.lockPath}`,
12+
);
13+
// Override the default "Error" name so the error is identifiable by name.
this.name = "SessionWriteLockTimeoutError";
14+
this.timeoutMs = params.timeoutMs;
15+
this.owner = params.owner;
16+
this.lockPath = params.lockPath;
17+
}
18+
}
19+
20+
/**
 * Checks whether `err` is a session write-lock timeout error.
 *
 * Matches either a real `SessionWriteLockTimeoutError` instance or any object whose
 * `code` property strictly equals `SESSION_WRITE_LOCK_TIMEOUT_CODE`.
 *
 * @param err - Any thrown or nested value.
 * @returns `true` when `err` matches either condition, otherwise `false`.
 */
export function isSessionWriteLockTimeoutError(err: unknown): boolean {
21+
return (
22+
err instanceof SessionWriteLockTimeoutError ||
23+
// Code-based fallback for objects that are not instances of the class.
Boolean(
24+
err &&
25+
typeof err === "object" &&
26+
(err as { code?: unknown }).code === SESSION_WRITE_LOCK_TIMEOUT_CODE,
27+
)
28+
);
29+
}

src/agents/session-write-lock.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import fs from "node:fs/promises";
33
import path from "node:path";
44
import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js";
55
import { resolveProcessScopedMap } from "../shared/process-scoped-map.js";
6+
import { SessionWriteLockTimeoutError } from "./session-write-lock-error.js";
67

78
type LockFilePayload = {
89
pid?: number;
@@ -584,7 +585,7 @@ export async function acquireSessionWriteLock(params: {
584585

585586
const payload = await readLockPayload(lockPath);
586587
const owner = typeof payload?.pid === "number" ? `pid=${payload.pid}` : "unknown";
587-
throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`);
588+
throw new SessionWriteLockTimeoutError({ timeoutMs, owner, lockPath });
588589
}
589590

590591
export const __testing = {

0 commit comments

Comments
 (0)