Skip to content

Commit 5a2200b

Browse files
fix(sessions): harden recycled PID lock recovery follow-up (#31320)
* fix: detect PID recycling in session write lock staleness check The session lock uses isPidAlive() to determine if a lock holder is still running. In containers, PID recycling can cause a different process to inherit the same PID, making the lock appear valid when the original holder is dead. Record the process start time (field 22 of /proc/pid/stat) in the lock file and compare it during staleness checks. If the PID is alive but its start time differs from the recorded value, the lock is treated as stale and reclaimed immediately. Backward compatible: lock files without starttime are handled with the existing PID-alive + age-based logic. Non-Linux platforms skip the starttime check entirely (getProcessStartTime returns null). * shared: harden pid starttime parsing * sessions: validate lock pid/starttime payloads * changelog: note recycled PID lock recovery fix * changelog: credit hiroki and vincent on lock recovery fix --------- Co-authored-by: HirokiKobayashi-R <hiroki@rhems-japan.co.jp>
1 parent 548a502 commit 5a2200b

5 files changed

Lines changed: 272 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ Docs: https://docs.openclaw.ai
7575
- Android/Gateway canvas capability refresh: send `node.canvas.capability.refresh` with object `params` (`{}`) from Android node runtime so gateway object-schema validation accepts refresh retries and A2UI host recovery works after scoped capability expiry. (#28413) Thanks @obviyus.
7676
- Gateway/Control UI origins: honor `gateway.controlUi.allowedOrigins: ["*"]` wildcard entries (including trimmed values) and lock behavior with regression tests. Landed from contributor PR #31058 by @byungsker. Thanks @byungsker.
7777
- Agents/Sessions list transcript paths: handle missing/non-string/relative `sessions.list.path` values and per-agent `{agentId}` templates when deriving `transcriptPath`, so cross-agent session listings resolve to concrete agent session files instead of workspace-relative paths. (#24775) Thanks @martinfrancois.
78+
- Sessions/Lock recovery: detect recycled Linux PIDs by comparing lock-file `starttime` with `/proc/<pid>/stat` starttime, so stale `.jsonl.lock` files are reclaimed immediately in containerized PID-reuse scenarios while preserving compatibility for older lock files. (#26443) Fixes #27252. Thanks @HirokiKobayashi-R and @vincentkoc.
7879
- Gateway/Control UI CSP: allow required Google Fonts origins in Control UI CSP. (#29279) Thanks @Glucksberg and @vincentkoc.
7980
- CLI/Install: add an npm-link fallback to fix CLI startup `Permission denied` failures (`exit 127`) on affected installs. (#17151) Thanks @sskyu and @vincentkoc.
8081
- Onboarding/Custom providers: improve verification reliability for slower local endpoints (for example Ollama) during setup. (#27380) Thanks @Sid-Qin.

src/agents/session-write-lock.test.ts

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,18 @@ import fs from "node:fs/promises";
22
import os from "node:os";
33
import path from "node:path";
44
import { describe, expect, it, vi } from "vitest";
5+
6+
// Mock getProcessStartTime so PID-recycling detection works on non-Linux
7+
// (macOS, CI runners). isPidAlive is left unmocked.
8+
const FAKE_STARTTIME = 12345;
9+
vi.mock("../shared/pid-alive.js", async (importOriginal) => {
10+
const original = await importOriginal<typeof import("../shared/pid-alive.js")>();
11+
return {
12+
...original,
13+
getProcessStartTime: (pid: number) => (pid === process.pid ? FAKE_STARTTIME : null),
14+
};
15+
});
16+
517
import {
618
__testing,
719
acquireSessionWriteLock,
@@ -255,6 +267,82 @@ describe("acquireSessionWriteLock", () => {
255267
}
256268
});
257269

270+
it("reclaims lock files with recycled PIDs", async () => {
271+
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
272+
try {
273+
const sessionFile = path.join(root, "sessions.json");
274+
const lockPath = `${sessionFile}.lock`;
275+
// Write a lock with a live PID (current process) but a wrong starttime,
276+
// simulating PID recycling: the PID is alive but belongs to a different
277+
// process than the one that created the lock.
278+
await fs.writeFile(
279+
lockPath,
280+
JSON.stringify({
281+
pid: process.pid,
282+
createdAt: new Date().toISOString(),
283+
starttime: 999_999_999,
284+
}),
285+
"utf8",
286+
);
287+
288+
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
289+
const raw = await fs.readFile(lockPath, "utf8");
290+
const payload = JSON.parse(raw) as { pid: number };
291+
292+
expect(payload.pid).toBe(process.pid);
293+
await lock.release();
294+
} finally {
295+
await fs.rm(root, { recursive: true, force: true });
296+
}
297+
});
298+
299+
it("does not reclaim lock files without starttime (backward compat)", async () => {
300+
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
301+
try {
302+
const sessionFile = path.join(root, "sessions.json");
303+
const lockPath = `${sessionFile}.lock`;
304+
// Old-format lock without starttime — should NOT be reclaimed just because
305+
// starttime is missing. The PID is alive, so the lock is valid.
306+
await fs.writeFile(
307+
lockPath,
308+
JSON.stringify({
309+
pid: process.pid,
310+
createdAt: new Date().toISOString(),
311+
}),
312+
"utf8",
313+
);
314+
315+
await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 50 })).rejects.toThrow(
316+
/session file locked/,
317+
);
318+
} finally {
319+
await fs.rm(root, { recursive: true, force: true });
320+
}
321+
});
322+
323+
it("does not treat malformed starttime as recycled", async () => {
324+
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
325+
try {
326+
const sessionFile = path.join(root, "sessions.json");
327+
const lockPath = `${sessionFile}.lock`;
328+
await fs.writeFile(
329+
lockPath,
330+
JSON.stringify({
331+
pid: process.pid,
332+
createdAt: new Date().toISOString(),
333+
starttime: 123.5,
334+
}),
335+
"utf8",
336+
);
337+
338+
await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 50 })).rejects.toThrow(
339+
/session file locked/,
340+
);
341+
} finally {
342+
await fs.rm(root, { recursive: true, force: true });
343+
}
344+
});
345+
258346
it("registers cleanup for SIGQUIT and SIGABRT", () => {
259347
expect(__testing.cleanupSignals).toContain("SIGQUIT");
260348
expect(__testing.cleanupSignals).toContain("SIGABRT");

src/agents/session-write-lock.ts

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,20 @@
11
import fsSync from "node:fs";
22
import fs from "node:fs/promises";
33
import path from "node:path";
4-
import { isPidAlive } from "../shared/pid-alive.js";
4+
import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js";
55
import { resolveProcessScopedMap } from "../shared/process-scoped-map.js";
66

77
type LockFilePayload = {
88
pid?: number;
99
createdAt?: string;
10+
/** Process start time in clock ticks (from /proc/pid/stat field 22). */
11+
starttime?: number;
1012
};
1113

14+
function isValidLockNumber(value: unknown): value is number {
15+
return typeof value === "number" && Number.isInteger(value) && value >= 0;
16+
}
17+
1218
type HeldLock = {
1319
count: number;
1420
handle: fs.FileHandle;
@@ -270,12 +276,15 @@ async function readLockPayload(lockPath: string): Promise<LockFilePayload | null
270276
const raw = await fs.readFile(lockPath, "utf8");
271277
const parsed = JSON.parse(raw) as Record<string, unknown>;
272278
const payload: LockFilePayload = {};
273-
if (typeof parsed.pid === "number") {
279+
if (isValidLockNumber(parsed.pid) && parsed.pid > 0) {
274280
payload.pid = parsed.pid;
275281
}
276282
if (typeof parsed.createdAt === "string") {
277283
payload.createdAt = parsed.createdAt;
278284
}
285+
if (isValidLockNumber(parsed.starttime)) {
286+
payload.starttime = parsed.starttime;
287+
}
279288
return payload;
280289
} catch {
281290
return null;
@@ -287,17 +296,31 @@ function inspectLockPayload(
287296
staleMs: number,
288297
nowMs: number,
289298
): LockInspectionDetails {
290-
const pid = typeof payload?.pid === "number" ? payload.pid : null;
299+
const pid = isValidLockNumber(payload?.pid) && payload.pid > 0 ? payload.pid : null;
291300
const pidAlive = pid !== null ? isPidAlive(pid) : false;
292301
const createdAt = typeof payload?.createdAt === "string" ? payload.createdAt : null;
293302
const createdAtMs = createdAt ? Date.parse(createdAt) : Number.NaN;
294303
const ageMs = Number.isFinite(createdAtMs) ? Math.max(0, nowMs - createdAtMs) : null;
295304

305+
// Detect PID recycling: if the PID is alive but its start time differs from
306+
// what was recorded in the lock file, the original process died and the OS
307+
// reassigned the same PID to a different process.
308+
const storedStarttime = isValidLockNumber(payload?.starttime) ? payload.starttime : null;
309+
const pidRecycled =
310+
pidAlive && pid !== null && storedStarttime !== null
311+
? (() => {
312+
const currentStarttime = getProcessStartTime(pid);
313+
return currentStarttime !== null && currentStarttime !== storedStarttime;
314+
})()
315+
: false;
316+
296317
const staleReasons: string[] = [];
297318
if (pid === null) {
298319
staleReasons.push("missing-pid");
299320
} else if (!pidAlive) {
300321
staleReasons.push("dead-pid");
322+
} else if (pidRecycled) {
323+
staleReasons.push("recycled-pid");
301324
}
302325
if (ageMs === null) {
303326
staleReasons.push("invalid-createdAt");
@@ -447,7 +470,12 @@ export async function acquireSessionWriteLock(params: {
447470
try {
448471
handle = await fs.open(lockPath, "wx");
449472
const createdAt = new Date().toISOString();
450-
await handle.writeFile(JSON.stringify({ pid: process.pid, createdAt }, null, 2), "utf8");
473+
const starttime = getProcessStartTime(process.pid);
474+
const lockPayload: LockFilePayload = { pid: process.pid, createdAt };
475+
if (starttime !== null) {
476+
lockPayload.starttime = starttime;
477+
}
478+
await handle.writeFile(JSON.stringify(lockPayload, null, 2), "utf8");
451479
const createdHeld: HeldLock = {
452480
count: 1,
453481
handle,

src/shared/pid-alive.test.ts

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import fsSync from "node:fs";
22
import { describe, expect, it, vi } from "vitest";
3-
import { isPidAlive } from "./pid-alive.js";
3+
import { getProcessStartTime, isPidAlive } from "./pid-alive.js";
44

55
describe("isPidAlive", () => {
66
it("returns true for the current running process", () => {
@@ -14,6 +14,7 @@ describe("isPidAlive", () => {
1414
it("returns false for invalid PIDs", () => {
1515
expect(isPidAlive(0)).toBe(false);
1616
expect(isPidAlive(-1)).toBe(false);
17+
expect(isPidAlive(1.5)).toBe(false);
1718
expect(isPidAlive(Number.NaN)).toBe(false);
1819
expect(isPidAlive(Number.POSITIVE_INFINITY)).toBe(false);
1920
});
@@ -51,3 +52,114 @@ describe("isPidAlive", () => {
5152
}
5253
});
5354
});
55+
56+
describe("getProcessStartTime", () => {
57+
it("returns a number on Linux for the current process", async () => {
58+
const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform");
59+
if (!originalPlatformDescriptor) {
60+
throw new Error("missing process.platform descriptor");
61+
}
62+
63+
const originalReadFileSync = fsSync.readFileSync;
64+
// Simulate a realistic /proc/<pid>/stat line
65+
const fakeStat = `${process.pid} (node) S 1 ${process.pid} ${process.pid} 0 -1 4194304 12345 0 0 0 100 50 0 0 20 0 8 0 98765 123456789 5000 18446744073709551615 0 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0`;
66+
vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
67+
if (filePath === `/proc/${process.pid}/stat`) {
68+
return fakeStat;
69+
}
70+
return originalReadFileSync(filePath as never, encoding as never) as never;
71+
});
72+
73+
Object.defineProperty(process, "platform", {
74+
...originalPlatformDescriptor,
75+
value: "linux",
76+
});
77+
78+
try {
79+
vi.resetModules();
80+
const { getProcessStartTime: fresh } = await import("./pid-alive.js");
81+
const starttime = fresh(process.pid);
82+
expect(starttime).toBe(98765);
83+
} finally {
84+
Object.defineProperty(process, "platform", originalPlatformDescriptor);
85+
vi.restoreAllMocks();
86+
}
87+
});
88+
89+
it("returns null on non-Linux platforms", () => {
90+
if (process.platform === "linux") {
91+
// On actual Linux, this test is trivially satisfied by the other tests.
92+
expect(true).toBe(true);
93+
return;
94+
}
95+
expect(getProcessStartTime(process.pid)).toBeNull();
96+
});
97+
98+
it("returns null for invalid PIDs", () => {
99+
expect(getProcessStartTime(0)).toBeNull();
100+
expect(getProcessStartTime(-1)).toBeNull();
101+
expect(getProcessStartTime(1.5)).toBeNull();
102+
expect(getProcessStartTime(Number.NaN)).toBeNull();
103+
expect(getProcessStartTime(Number.POSITIVE_INFINITY)).toBeNull();
104+
});
105+
106+
it("returns null for malformed /proc stat content", async () => {
107+
const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform");
108+
if (!originalPlatformDescriptor) {
109+
throw new Error("missing process.platform descriptor");
110+
}
111+
112+
const originalReadFileSync = fsSync.readFileSync;
113+
vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
114+
if (filePath === "/proc/42/stat") {
115+
return "42 node S malformed";
116+
}
117+
return originalReadFileSync(filePath as never, encoding as never) as never;
118+
});
119+
120+
Object.defineProperty(process, "platform", {
121+
...originalPlatformDescriptor,
122+
value: "linux",
123+
});
124+
125+
try {
126+
vi.resetModules();
127+
const { getProcessStartTime: fresh } = await import("./pid-alive.js");
128+
expect(fresh(42)).toBeNull();
129+
} finally {
130+
Object.defineProperty(process, "platform", originalPlatformDescriptor);
131+
vi.restoreAllMocks();
132+
}
133+
});
134+
135+
it("handles comm fields containing spaces and parentheses", async () => {
136+
const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform");
137+
if (!originalPlatformDescriptor) {
138+
throw new Error("missing process.platform descriptor");
139+
}
140+
141+
const originalReadFileSync = fsSync.readFileSync;
142+
// comm field with spaces and nested parens: "(My App (v2))"
143+
const fakeStat = `42 (My App (v2)) S 1 42 42 0 -1 4194304 0 0 0 0 0 0 0 0 20 0 1 0 55555 0 0 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0`;
144+
vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
145+
if (filePath === "/proc/42/stat") {
146+
return fakeStat;
147+
}
148+
return originalReadFileSync(filePath as never, encoding as never) as never;
149+
});
150+
151+
Object.defineProperty(process, "platform", {
152+
...originalPlatformDescriptor,
153+
value: "linux",
154+
});
155+
156+
try {
157+
vi.resetModules();
158+
const { getProcessStartTime: fresh } = await import("./pid-alive.js");
159+
expect(fresh(42)).toBe(55555);
160+
} finally {
161+
Object.defineProperty(process, "platform", originalPlatformDescriptor);
162+
vi.restoreAllMocks();
163+
}
164+
});
165+
});

src/shared/pid-alive.ts

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
import fsSync from "node:fs";
22

3+
function isValidPid(pid: number): boolean {
4+
return Number.isInteger(pid) && pid > 0;
5+
}
6+
37
/**
48
* Check if a process is a zombie on Linux by reading /proc/<pid>/status.
59
* Returns false on non-Linux platforms or if the proc file can't be read.
@@ -18,7 +22,7 @@ function isZombieProcess(pid: number): boolean {
1822
}
1923

2024
export function isPidAlive(pid: number): boolean {
21-
if (!Number.isFinite(pid) || pid <= 0) {
25+
if (!isValidPid(pid)) {
2226
return false;
2327
}
2428
try {
@@ -31,3 +35,36 @@ export function isPidAlive(pid: number): boolean {
3135
}
3236
return true;
3337
}
38+
39+
/**
40+
* Read the process start time (field 22 "starttime") from /proc/<pid>/stat.
41+
* Returns the value in clock ticks since system boot, or null on non-Linux
42+
* platforms or if the proc file can't be read.
43+
*
44+
* This is used to detect PID recycling: if two readings for the same PID
45+
* return different starttimes, the PID has been reused by a different process.
46+
*/
47+
export function getProcessStartTime(pid: number): number | null {
48+
if (process.platform !== "linux") {
49+
return null;
50+
}
51+
if (!isValidPid(pid)) {
52+
return null;
53+
}
54+
try {
55+
const stat = fsSync.readFileSync(`/proc/${pid}/stat`, "utf8");
56+
const commEndIndex = stat.lastIndexOf(")");
57+
if (commEndIndex < 0) {
58+
return null;
59+
}
60+
// The comm field (field 2) is wrapped in parens and can contain spaces,
61+
// so split after the last ")" to get fields 3..N reliably.
62+
const afterComm = stat.slice(commEndIndex + 1).trimStart();
63+
const fields = afterComm.split(/\s+/);
64+
// field 22 (starttime) = index 19 after the comm-split (field 3 is index 0).
65+
const starttime = Number(fields[19]);
66+
return Number.isInteger(starttime) && starttime >= 0 ? starttime : null;
67+
} catch {
68+
return null;
69+
}
70+
}

0 commit comments

Comments
 (0)