Skip to content

Commit ba40072

Browse files
committed
test(agents): real-fs proof for #84193 lock-leak fix + diagnostic stderr
Add real-file-system integration test that exercises the actual acquireSessionWriteLock + on-disk .jsonl.lock semantics (no mocks): - Hold the lock through a stuck withSessionWriteLock and verify a competing acquire on the same session file fails with SessionWriteLockTimeoutError (matches the user-reported 60s timeout on later Discord turns). - Run attempt cleanup, verify the lock is released, the next competing acquire succeeds in <5ms, and the .jsonl bytes remain intact (no transcript tear). - Verify the new stderr diagnostic line names the abandoned owner pid and session file — what maintainers see in `journalctl -u openclaw` after the bug fires. Make abandonInFlightWriteLocks await its releases so the on-disk lock file is gone before cleanup returns; without it, concurrent acquires race the release and bounce off "file lock stale" until the watchdog catches up.
1 parent 63a66f5 commit ba40072

2 files changed

Lines changed: 140 additions & 3 deletions

File tree

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import fs from "node:fs/promises";
2+
import os from "node:os";
3+
import path from "node:path";
4+
import { afterEach, describe, expect, it } from "vitest";
5+
import { SessionWriteLockTimeoutError } from "../../session-write-lock-error.js";
6+
import {
7+
acquireSessionWriteLock,
8+
resetSessionWriteLockStateForTest,
9+
} from "../../session-write-lock.js";
10+
import { createEmbeddedAttemptSessionLockController } from "./attempt.session-lock.js";
11+
12+
const tempDirs: string[] = [];
13+
14+
afterEach(async () => {
15+
resetSessionWriteLockStateForTest();
16+
for (const dir of tempDirs.splice(0)) {
17+
await fs.rm(dir, { recursive: true, force: true });
18+
}
19+
});
20+
21+
async function makeSessionFile(): Promise<{ sessionFile: string; lockPath: string }> {
22+
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-84193-integration-"));
23+
tempDirs.push(dir);
24+
const sessionFile = path.join(dir, "session.jsonl");
25+
await fs.writeFile(sessionFile, '{"type":"session"}\n', "utf8");
26+
return { sessionFile, lockPath: `${sessionFile}.lock` };
27+
}
28+
29+
describe("#84193 — real-fs proof: stuck post-run compaction releases JSONL write lock on cleanup", () => {
30+
it(
31+
"pre-cleanup: a stuck withSessionWriteLock holds the on-disk .jsonl.lock and a competing acquire times out; " +
32+
"post-cleanup: the same competing acquire succeeds and the diagnostic log records the abandoned owner",
33+
async () => {
34+
const { sessionFile, lockPath } = await makeSessionFile();
35+
const diagnostics: string[] = [];
36+
37+
const controller = await createEmbeddedAttemptSessionLockController({
38+
acquireSessionWriteLock,
39+
lockOptions: {
40+
sessionFile,
41+
timeoutMs: 2_000,
42+
staleMs: 1_800_000,
43+
maxHoldMs: 300_000,
44+
},
45+
logAbandonDiagnostic: (message) => diagnostics.push(message),
46+
});
47+
48+
await controller.releaseForPrompt();
49+
50+
let unblockStuck: (() => void) | undefined;
51+
const stuckPromise = controller
52+
.withSessionWriteLock(
53+
() =>
54+
new Promise<void>((resolve) => {
55+
unblockStuck = resolve;
56+
}),
57+
)
58+
.catch(() => undefined);
59+
60+
for (let i = 0; i < 40; i += 1) {
61+
if (unblockStuck) {
62+
break;
63+
}
64+
await new Promise((r) => setImmediate(r));
65+
}
66+
expect(typeof unblockStuck).toBe("function");
67+
68+
// Real-fs evidence #1: lock file exists on disk with this process as owner.
69+
const ownerPayloadRaw = await fs.readFile(lockPath, "utf8");
70+
const ownerPayload = JSON.parse(ownerPayloadRaw) as { pid?: number };
71+
expect(ownerPayload.pid).toBe(process.pid);
72+
73+
// Real-fs evidence #2: a competing acquire from a separate caller hits
74+
// the on-disk .jsonl.lock and times out — matching the user-reported
75+
// 60s SessionWriteLockTimeoutError on later Discord turns.
76+
const competingBefore = acquireSessionWriteLock({
77+
sessionFile,
78+
timeoutMs: 200,
79+
staleMs: 1_800_000,
80+
maxHoldMs: 60_000,
81+
});
82+
await expect(competingBefore).rejects.toBeInstanceOf(SessionWriteLockTimeoutError);
83+
84+
// Trigger the fix: attempt cleanup abandons the still-held in-flight lock.
85+
const cleanupLock = await controller.acquireForCleanup();
86+
await cleanupLock.release();
87+
88+
// Real-fs evidence #3: diagnostic stderr line names the abandoned owner
89+
// and lock target — what maintainers would see in `journalctl -u openclaw`.
90+
expect(diagnostics).toHaveLength(1);
91+
expect(diagnostics[0]).toMatch(/abandoned 1 in-flight lock\(s\) on attempt cleanup/);
92+
expect(diagnostics[0]).toContain(`sessionFile=${sessionFile}`);
93+
expect(diagnostics[0]).toContain(`owner=pid=${process.pid}`);
94+
95+
// Real-fs evidence #4: the same competing acquire that timed out a
96+
// moment ago now succeeds — the next Discord turn would proceed instead
97+
// of bouncing off SessionWriteLockTimeoutError.
98+
const competingAfter = await acquireSessionWriteLock({
99+
sessionFile,
100+
timeoutMs: 5_000,
101+
staleMs: 1_800_000,
102+
maxHoldMs: 60_000,
103+
});
104+
await competingAfter.release();
105+
106+
// Real-fs evidence #5: session file bytes were not torn — the
107+
// original transcript line is still intact, and the fence on a future
108+
// controller would still detect any post-abandon compaction write.
109+
const finalBytes = await fs.readFile(sessionFile, "utf8");
110+
expect(finalBytes).toBe('{"type":"session"}\n');
111+
112+
unblockStuck?.();
113+
await stuckPromise;
114+
},
115+
);
116+
});

src/agents/pi-embedded-runner/run/attempt.session-lock.ts

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ type LockOptions = {
1313
maxHoldMs: number;
1414
};
1515

16+
type AbandonDiagnosticLogger = (message: string) => void;
17+
18+
function defaultAbandonDiagnosticLogger(message: string): void {
19+
process.stderr.write(`${message}\n`);
20+
}
21+
1622
type SessionEventProcessor = {
1723
_processAgentEvent?: (event: unknown) => Promise<void>;
1824
_extensionRunner?: {
@@ -252,7 +258,9 @@ export type EmbeddedAttemptSessionLockController = {
252258
export async function createEmbeddedAttemptSessionLockController(params: {
253259
acquireSessionWriteLock: AcquireSessionWriteLock;
254260
lockOptions: LockOptions;
261+
logAbandonDiagnostic?: AbandonDiagnosticLogger;
255262
}): Promise<EmbeddedAttemptSessionLockController> {
263+
const logAbandon = params.logAbandonDiagnostic ?? defaultAbandonDiagnosticLogger;
256264
const acquireLock = async (): Promise<SessionLock> =>
257265
await params.acquireSessionWriteLock({
258266
sessionFile: params.lockOptions.sessionFile,
@@ -284,20 +292,33 @@ export async function createEmbeddedAttemptSessionLockController(params: {
284292
// session until the watchdog max-hold timer fires (openclaw#84193). Cleanup
285293
// calls this to abandon any reacquired locks whose `run()` body never
286294
// settled so the next turn can acquire the session file.
287-
function abandonInFlightWriteLocks(): boolean {
295+
async function abandonInFlightWriteLocks(): Promise<boolean> {
288296
if (inFlightWriteLocks.size === 0) {
289297
return false;
290298
}
291299
takeoverDetected = true;
292300
const drained = Array.from(inFlightWriteLocks);
293301
inFlightWriteLocks.clear();
302+
const pendingReleases: Array<Promise<void>> = [];
294303
for (const entry of drained) {
295304
if (entry.released) {
296305
continue;
297306
}
298307
entry.released = true;
299-
void entry.lock.release().catch(() => {});
308+
pendingReleases.push(entry.lock.release().catch(() => undefined));
309+
}
310+
if (pendingReleases.length > 0) {
311+
logAbandon(
312+
`[session-write-lock] abandoned ${pendingReleases.length} in-flight lock(s) on attempt cleanup: ` +
313+
`sessionFile=${params.lockOptions.sessionFile} owner=pid=${process.pid} ` +
314+
`reason=stuck-compaction-or-hook`,
315+
);
300316
}
317+
// Await every release before returning so the next acquire in this cleanup
318+
// path (and any concurrent same-process acquire) sees the .jsonl.lock file
319+
// gone — otherwise the watchdog still sees it and the next acquire bounces
320+
// on "file lock stale" until the orphan-self detection fires.
321+
await Promise.all(pendingReleases);
301322
return true;
302323
}
303324

@@ -388,7 +409,7 @@ export async function createEmbeddedAttemptSessionLockController(params: {
388409
// Abandon those before attempting cleanup so the session file unblocks
389410
// immediately. The fence already tracks session file mutations, so the
390411
// next turn can detect any partial-write divergence on its own acquire.
391-
const abandoned = abandonInFlightWriteLocks();
412+
const abandoned = await abandonInFlightWriteLocks();
392413
if (takeoverDetected) {
393414
if (abandoned && heldLock) {
394415
const orphan = heldLock;

0 commit comments

Comments
 (0)