Skip to content

Commit 840cc2d

Browse files
committed
fix(agents): report stale session locks without cleanup
1 parent 1bfae9d commit 840cc2d

10 files changed

Lines changed: 240 additions & 36 deletions

extensions/qa-lab/src/suite-runtime-agent-session.test.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,28 @@ describe("qa suite runtime agent session helpers", () => {
7575
);
7676
});
7777

78+
it("retries transient session store stale locks while creating sessions", async () => {
79+
const lockStaleError = Object.assign(
80+
new Error("SessionWriteLockStaleError: session file lock stale"),
81+
{ code: "OPENCLAW_SESSION_WRITE_LOCK_STALE" },
82+
);
83+
gatewayCall.mockRejectedValueOnce(lockStaleError).mockResolvedValueOnce({ key: " session-3 " });
84+
85+
vi.useFakeTimers();
86+
const pending = createSession(env, "Retry Stale Session", "agent:qa:stale-retry");
87+
88+
await vi.advanceTimersByTimeAsync(1);
89+
90+
await expect(pending).resolves.toBe("session-3");
91+
expect(gatewayCall).toHaveBeenCalledTimes(2);
92+
expect(gatewayCall).toHaveBeenNthCalledWith(
93+
2,
94+
"sessions.create",
95+
{ label: "Retry Stale Session", key: "agent:qa:stale-retry" },
96+
expect.objectContaining({ timeoutMs: expect.any(Number) }),
97+
);
98+
});
99+
78100
it("reads effective tool ids once and drops blanks", async () => {
79101
gatewayCall.mockResolvedValueOnce({
80102
groups: [

extensions/qa-lab/src/suite-runtime-agent-session.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,11 @@ function isSessionStoreLockTimeout(error: unknown) {
3535
const text = formatErrorMessage(error);
3636
return (
3737
text.includes("OPENCLAW_SESSION_WRITE_LOCK_TIMEOUT") ||
38+
text.includes("OPENCLAW_SESSION_WRITE_LOCK_STALE") ||
3839
text.includes("SessionWriteLockTimeoutError") ||
39-
text.includes("session file locked")
40+
text.includes("SessionWriteLockStaleError") ||
41+
text.includes("session file locked") ||
42+
text.includes("session file lock stale")
4043
);
4144
}
4245

src/agents/embedded-agent-runner/google-prompt-cache.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import { streamWithPayloadPatch } from "../../llm/providers/stream-wrappers/stre
1111
import type { Model } from "../../llm/types.js";
1212
import { buildGuardedModelFetch } from "../provider-transport-fetch.js";
1313
import type { StreamFn } from "../runtime/index.js";
14-
import { isSessionWriteLockTimeoutError } from "../session-write-lock-error.js";
14+
import { isSessionWriteLockAcquireError } from "../session-write-lock-error.js";
1515
import { stableStringify } from "../stable-stringify.js";
1616
import { stripSystemPromptCacheBoundary } from "../system-prompt-cache-boundary.js";
1717
import { mergeTransportHeaders, sanitizeTransportPayloadText } from "../transport-stream-shared.js";
@@ -179,7 +179,7 @@ async function appendGooglePromptCacheEntry(
179179
try {
180180
await sessionManager.appendCustomEntry(GOOGLE_PROMPT_CACHE_CUSTOM_TYPE, entry);
181181
} catch (err) {
182-
if (err instanceof EmbeddedAttemptSessionTakeoverError || isSessionWriteLockTimeoutError(err)) {
182+
if (err instanceof EmbeddedAttemptSessionTakeoverError || isSessionWriteLockAcquireError(err)) {
183183
throw err;
184184
}
185185
// ignore persistence failures

src/agents/embedded-agent-runner/run/attempt.session-lock.test.ts

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ import {
88
runWithOwnedSessionTranscriptWritePublication,
99
withOwnedSessionTranscriptWrites,
1010
} from "../../../config/sessions/transcript-write-context.js";
11-
import { SessionWriteLockTimeoutError } from "../../session-write-lock-error.js";
11+
import {
12+
SessionWriteLockStaleError,
13+
SessionWriteLockTimeoutError,
14+
} from "../../session-write-lock-error.js";
1215
import {
1316
acquireSessionWriteLock,
1417
resetSessionWriteLockStateForTest,
@@ -1178,6 +1181,36 @@ describe("embedded attempt session lock lifecycle", () => {
11781181
expect(releases).toEqual(["prep"]);
11791182
});
11801183

1184+
it("skips cleanup lock reacquisition after a post-prompt stale lock", async () => {
1185+
const releases: string[] = [];
1186+
const acquireSessionWriteLock = vi
1187+
.fn()
1188+
.mockResolvedValueOnce({ release: vi.fn(async () => releases.push("prep")) })
1189+
.mockRejectedValueOnce(
1190+
new SessionWriteLockStaleError({
1191+
owner: "pid=789 alive=true ageMs=1800001",
1192+
lockPath: `${lockOptions.sessionFile}.lock`,
1193+
staleReasons: ["too-old"],
1194+
}),
1195+
);
1196+
1197+
const controller = await createEmbeddedAttemptSessionLockController({
1198+
acquireSessionWriteLock,
1199+
lockOptions,
1200+
});
1201+
1202+
await controller.releaseForPrompt();
1203+
await expect(controller.withSessionWriteLock(() => "late-write")).rejects.toBeInstanceOf(
1204+
SessionWriteLockStaleError,
1205+
);
1206+
const cleanupLock = await controller.acquireForCleanup();
1207+
await cleanupLock.release();
1208+
1209+
expect(acquireSessionWriteLock).toHaveBeenCalledTimes(2);
1210+
expect(controller.hasSessionTakeover()).toBe(true);
1211+
expect(releases).toEqual(["prep"]);
1212+
});
1213+
11811214
it("wraps provider stream submission with queued transcript drain and lock release", async () => {
11821215
const events: string[] = [];
11831216
const streamFn = vi.fn(async (..._args: unknown[]) => {

src/agents/embedded-agent-runner/run/attempt.session-lock.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { isDeepStrictEqual } from "node:util";
55
import { normalizeStringEntries } from "@openclaw/normalization-core/string-normalization";
66
import { withOwnedSessionTranscriptWrites } from "../../../config/sessions/transcript-write-context.js";
77
import { resolveGlobalSingleton } from "../../../shared/global-singleton.js";
8-
import { isSessionWriteLockTimeoutError } from "../../session-write-lock-error.js";
8+
import { isSessionWriteLockAcquireError } from "../../session-write-lock-error.js";
99
import type { acquireSessionWriteLock } from "../../session-write-lock.js";
1010
import { resolveEmbeddedSessionFileKey } from "../session-file-key.js";
1111

@@ -651,7 +651,7 @@ export async function createEmbeddedAttemptSessionLockController(params: {
651651
try {
652652
return { lock: await acquireLock(), owned: true };
653653
} catch (err) {
654-
if (isSessionWriteLockTimeoutError(err)) {
654+
if (isSessionWriteLockAcquireError(err)) {
655655
takeoverDetected = true;
656656
}
657657
throw err;
@@ -865,7 +865,7 @@ export async function createEmbeddedAttemptSessionLockController(params: {
865865
try {
866866
return await acquireLock();
867867
} catch (err) {
868-
if (isSessionWriteLockTimeoutError(err)) {
868+
if (isSessionWriteLockAcquireError(err)) {
869869
takeoverDetected = true;
870870
return undefined;
871871
}

src/agents/failover-error.ts

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import {
99
} from "./embedded-agent-helpers/errors.js";
1010
import { isTimeoutErrorMessage } from "./embedded-agent-helpers/errors.js";
1111
import type { FailoverReason } from "./embedded-agent-helpers/types.js";
12-
import { isSessionWriteLockTimeoutError } from "./session-write-lock-error.js";
12+
import { isSessionWriteLockAcquireError } from "./session-write-lock-error.js";
1313

1414
const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;
1515
const MAX_FAILOVER_CAUSE_DEPTH = 25;
@@ -262,8 +262,8 @@ function normalizeDirectErrorSignal(err: unknown): FailoverSignal {
262262
};
263263
}
264264

265-
function hasSessionWriteLockTimeout(err: unknown, seen: Set<object> = new Set()): boolean {
266-
if (isSessionWriteLockTimeoutError(err)) {
265+
function hasSessionWriteLockContention(err: unknown, seen: Set<object> = new Set()): boolean {
266+
if (isSessionWriteLockAcquireError(err)) {
267267
return true;
268268
}
269269
if (!err || typeof err !== "object") {
@@ -275,9 +275,9 @@ function hasSessionWriteLockTimeout(err: unknown, seen: Set<object> = new Set())
275275
seen.add(err);
276276
const candidate = err as { error?: unknown; cause?: unknown; reason?: unknown };
277277
return (
278-
hasSessionWriteLockTimeout(candidate.error, seen) ||
279-
hasSessionWriteLockTimeout(candidate.cause, seen) ||
280-
hasSessionWriteLockTimeout(candidate.reason, seen)
278+
hasSessionWriteLockContention(candidate.error, seen) ||
279+
hasSessionWriteLockContention(candidate.cause, seen) ||
280+
hasSessionWriteLockContention(candidate.reason, seen)
281281
);
282282
}
283283

@@ -315,7 +315,7 @@ function hasEmbeddedAttemptSessionTakeover(err: unknown, seen: Set<object> = new
315315
* See #83510.
316316
*/
317317
export function isNonProviderRuntimeCoordinationError(err: unknown): boolean {
318-
if (!hasSessionWriteLockTimeout(err) && !hasEmbeddedAttemptSessionTakeover(err)) {
318+
if (!hasSessionWriteLockContention(err) && !hasEmbeddedAttemptSessionTakeover(err)) {
319319
return false;
320320
}
321321
if (isFailoverError(err)) {
@@ -331,7 +331,7 @@ function hasTimeoutHint(err: unknown): boolean {
331331
if (!err) {
332332
return false;
333333
}
334-
if (hasSessionWriteLockTimeout(err)) {
334+
if (hasSessionWriteLockContention(err)) {
335335
return false;
336336
}
337337
if (readErrorName(err) === "TimeoutError") {
@@ -351,7 +351,7 @@ export function isTimeoutError(err: unknown): boolean {
351351
if (readErrorName(err) !== "AbortError") {
352352
return false;
353353
}
354-
if (hasSessionWriteLockTimeout(err)) {
354+
if (hasSessionWriteLockContention(err)) {
355355
return false;
356356
}
357357
const message = getErrorMessage(err);
@@ -460,7 +460,7 @@ function resolveFailoverClassificationFromErrorInternal(
460460
const hasExplicitFailoverMetadata =
461461
typeof inferSignalStatus(signal) === "number" ||
462462
(codeReason !== null && codeReason !== "timeout");
463-
const hasSessionLock = hasSessionWriteLockTimeout(err);
463+
const hasSessionLock = hasSessionWriteLockContention(err);
464464

465465
const classification = classifyFailoverSignal(signal);
466466
const nestedCandidates = getNestedErrorCandidates(err);

src/agents/session-write-lock-error.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
const SESSION_WRITE_LOCK_TIMEOUT_CODE = "OPENCLAW_SESSION_WRITE_LOCK_TIMEOUT";
2+
const SESSION_WRITE_LOCK_STALE_CODE = "OPENCLAW_SESSION_WRITE_LOCK_STALE";
23

34
export class SessionWriteLockTimeoutError extends Error {
45
readonly code = SESSION_WRITE_LOCK_TIMEOUT_CODE;
@@ -17,6 +18,24 @@ export class SessionWriteLockTimeoutError extends Error {
1718
}
1819
}
1920

21+
export class SessionWriteLockStaleError extends Error {
22+
readonly code = SESSION_WRITE_LOCK_STALE_CODE;
23+
readonly owner: string;
24+
readonly lockPath: string;
25+
readonly staleReasons: string[];
26+
27+
constructor(params: { owner: string; lockPath: string; staleReasons?: string[] }) {
28+
const staleReasons = params.staleReasons?.length ? params.staleReasons : ["unknown"];
29+
super(
30+
`session file lock stale (${staleReasons.join(", ")}): ${params.owner} ${params.lockPath}`,
31+
);
32+
this.name = "SessionWriteLockStaleError";
33+
this.owner = params.owner;
34+
this.lockPath = params.lockPath;
35+
this.staleReasons = staleReasons;
36+
}
37+
}
38+
2039
export function isSessionWriteLockTimeoutError(err: unknown): boolean {
2140
return (
2241
err instanceof SessionWriteLockTimeoutError ||
@@ -27,3 +46,18 @@ export function isSessionWriteLockTimeoutError(err: unknown): boolean {
2746
)
2847
);
2948
}
49+
50+
export function isSessionWriteLockStaleError(err: unknown): boolean {
51+
return (
52+
err instanceof SessionWriteLockStaleError ||
53+
Boolean(
54+
err &&
55+
typeof err === "object" &&
56+
(err as { code?: unknown }).code === SESSION_WRITE_LOCK_STALE_CODE,
57+
)
58+
);
59+
}
60+
61+
export function isSessionWriteLockAcquireError(err: unknown): boolean {
62+
return isSessionWriteLockTimeoutError(err) || isSessionWriteLockStaleError(err);
63+
}

src/agents/session-write-lock.test.ts

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
import { spawn } from "node:child_process";
12
import fsSync from "node:fs";
23
import fs from "node:fs/promises";
34
import os from "node:os";
45
import path from "node:path";
56
import { MAX_TIMER_TIMEOUT_MS } from "@openclaw/normalization-core/number-coercion";
67
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
8+
import { SessionWriteLockStaleError } from "./session-write-lock-error.js";
79

810
const FAKE_STARTTIME = 12345;
911
let testing: typeof import("./session-write-lock.js").testing;
@@ -119,12 +121,13 @@ async function withSymlinkedSessionPaths(
119121

120122
async function expectActiveInProcessLockIsNotReclaimed(params?: {
121123
legacyStarttime?: unknown;
124+
createdAt?: string;
122125
}): Promise<void> {
123126
await withTempSessionLockFile(async ({ sessionFile, lockPath }) => {
124127
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
125128
const lockPayload = {
126129
pid: process.pid,
127-
createdAt: new Date().toISOString(),
130+
createdAt: params?.createdAt ?? new Date().toISOString(),
128131
...(params && "legacyStarttime" in params ? { starttime: params.legacyStarttime } : {}),
129132
};
130133
await fs.writeFile(lockPath, JSON.stringify(lockPayload), "utf8");
@@ -410,6 +413,46 @@ describe("acquireSessionWriteLock", () => {
410413
});
411414
});
412415

416+
it("does not report or remove active in-process locks that pass staleMs", async () => {
417+
await expectActiveInProcessLockIsNotReclaimed({
418+
createdAt: new Date(Date.now() - 120_000).toISOString(),
419+
});
420+
});
421+
422+
it("reports live OpenClaw-owned stale locks without removing them", async () => {
423+
await withTempSessionLockFile(async ({ sessionFile, lockPath }) => {
424+
const owner = spawn(process.execPath, ["-e", "setInterval(() => {}, 1000)", "openclaw"], {
425+
stdio: "ignore",
426+
});
427+
if (!owner.pid) {
428+
throw new Error("missing lock owner pid");
429+
}
430+
await fs.writeFile(
431+
lockPath,
432+
JSON.stringify({
433+
pid: owner.pid,
434+
createdAt: new Date(Date.now() - 120_000).toISOString(),
435+
}),
436+
"utf8",
437+
);
438+
439+
try {
440+
await expect(
441+
acquireSessionWriteLock({ sessionFile, timeoutMs: 500, staleMs: 10 }),
442+
).rejects.toMatchObject({
443+
name: "SessionWriteLockStaleError",
444+
staleReasons: ["too-old"],
445+
});
446+
await expect(
447+
acquireSessionWriteLock({ sessionFile, timeoutMs: 500, staleMs: 10 }),
448+
).rejects.toBeInstanceOf(SessionWriteLockStaleError);
449+
await expect(fs.access(lockPath)).resolves.toBeUndefined();
450+
} finally {
451+
owner.kill("SIGTERM");
452+
}
453+
});
454+
});
455+
413456
it("watchdog releases stale in-process locks", async () => {
414457
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
415458
const stderrSpy = vi.spyOn(process.stderr, "write").mockImplementation(() => true);

0 commit comments

Comments
 (0)