Skip to content

Commit d852041

Browse files
committed
fix(windows): stabilize gateway restart and avoid false stale cleanup
1 parent 1561c6a commit d852041

7 files changed

Lines changed: 272 additions & 85 deletions

File tree

src/cli/daemon-cli/lifecycle.test.ts

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@ import { afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vite
33
type RestartHealthSnapshot = {
44
healthy: boolean;
55
staleGatewayPids: number[];
6-
runtime: { status?: string };
6+
runtime: { status?: string; pid?: number };
77
portUsage: { port: number; status: string; listeners: []; hints: []; errors?: string[] };
8+
listenerKinds?: { gateway: number[]; unknown: number[]; other: number[] };
89
};
910

1011
type RestartPostCheckContext = {
@@ -21,6 +22,7 @@ type RestartParams = {
2122

2223
const service = {
2324
readCommand: vi.fn(),
25+
readRuntime: vi.fn(),
2426
restart: vi.fn(),
2527
};
2628

@@ -128,6 +130,7 @@ describe("runDaemonRestart health checks", () => {
128130

129131
beforeEach(() => {
130132
service.readCommand.mockReset();
133+
service.readRuntime.mockReset();
131134
service.restart.mockReset();
132135
runServiceRestart.mockReset();
133136
runServiceStop.mockReset();
@@ -148,6 +151,7 @@ describe("runDaemonRestart health checks", () => {
148151
programArguments: ["openclaw", "gateway", "--port", "18789"],
149152
environment: {},
150153
});
154+
service.readRuntime.mockResolvedValue({ status: "running", pid: 1500 });
151155
service.restart.mockResolvedValue({ outcome: "completed" });
152156

153157
runServiceRestart.mockImplementation(async (params: RestartParams) => {
@@ -194,6 +198,7 @@ describe("runDaemonRestart health checks", () => {
194198
staleGatewayPids: [],
195199
runtime: { status: "running" },
196200
portUsage: { port: 18789, status: "busy", listeners: [], hints: [] },
201+
listenerKinds: { gateway: [], unknown: [], other: [] },
197202
};
198203
waitForGatewayHealthyRestart.mockResolvedValueOnce(unhealthy).mockResolvedValueOnce(healthy);
199204
terminateStaleGatewayPids.mockResolvedValue([1993]);
@@ -204,6 +209,9 @@ describe("runDaemonRestart health checks", () => {
204209
expect(terminateStaleGatewayPids).toHaveBeenCalledWith([1993]);
205210
expect(service.restart).toHaveBeenCalledTimes(1);
206211
expect(waitForGatewayHealthyRestart).toHaveBeenCalledTimes(2);
212+
for (const [params] of waitForGatewayHealthyRestart.mock.calls) {
213+
expect(params).not.toHaveProperty("includeUnknownListenersAsStale");
214+
}
207215
});
208216

209217
it("skips stale-pid retry health checks when the retry restart is only scheduled", async () => {
@@ -242,6 +250,47 @@ describe("runDaemonRestart health checks", () => {
242250
expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1);
243251
});
244252

253+
it("warns on Windows when restart health recovers without a clear pid change", async () => {
254+
const originalPlatform = process.platform;
255+
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
256+
const healthy: RestartHealthSnapshot & {
257+
listenerKinds: { gateway: number[]; unknown: number[]; other: number[] };
258+
} = {
259+
healthy: true,
260+
staleGatewayPids: [],
261+
runtime: { status: "running", pid: 1500 },
262+
portUsage: { port: 18789, status: "busy", listeners: [], hints: [] },
263+
listenerKinds: { gateway: [1500], unknown: [], other: [] },
264+
};
265+
waitForGatewayHealthyRestart.mockResolvedValue(healthy);
266+
const capturedWarnings: string[][] = [];
267+
runServiceRestart.mockImplementationOnce(async (params: RestartParams) => {
268+
const warnings: string[] = [];
269+
capturedWarnings.push(warnings);
270+
await params.postRestartCheck?.({
271+
json: true,
272+
stdout: process.stdout,
273+
warnings,
274+
fail: (message: string, hints?: string[]) => {
275+
const err = new Error(message) as Error & { hints?: string[] };
276+
err.hints = hints;
277+
throw err;
278+
},
279+
});
280+
return true;
281+
});
282+
283+
try {
284+
await expect(runDaemonRestart({ json: true })).resolves.toBe(true);
285+
} finally {
286+
Object.defineProperty(process, "platform", { value: originalPlatform, configurable: true });
287+
}
288+
289+
expect(capturedWarnings[0]).toContain(
290+
"Gateway restart became healthy, but process identity did not clearly change from pid 1500; runtime may be lagging or the old process may still own the listener.",
291+
);
292+
});
293+
245294
it("signals an unmanaged gateway process on stop", async () => {
246295
findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200, 4200, 4300]);
247296
runServiceStop.mockImplementation(async (params: { onNotLoaded?: () => Promise<unknown> }) => {

src/cli/daemon-cli/lifecycle.ts

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -151,11 +151,18 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
151151
const json = Boolean(opts.json);
152152
const service = resolveGatewayService();
153153
let restartedWithoutServiceManager = false;
154+
let preRestartRuntimePid: number | undefined;
154155
const restartPort = await resolveGatewayLifecyclePort(service).catch(() =>
155156
resolveGatewayPortFallback(),
156157
);
157158
const restartWaitMs = POST_RESTART_HEALTH_ATTEMPTS * POST_RESTART_HEALTH_DELAY_MS;
158159
const restartWaitSeconds = Math.round(restartWaitMs / 1000);
160+
if (process.platform === "win32") {
161+
preRestartRuntimePid = await service
162+
.readRuntime(process.env)
163+
.then((runtime) => runtime.pid)
164+
.catch(() => undefined);
165+
}
159166

160167
return await runServiceRestart({
161168
serviceNoun: "Gateway",
@@ -204,14 +211,18 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
204211
port: restartPort,
205212
attempts: POST_RESTART_HEALTH_ATTEMPTS,
206213
delayMs: POST_RESTART_HEALTH_DELAY_MS,
207-
includeUnknownListenersAsStale: process.platform === "win32",
208214
});
209215

210216
if (!health.healthy && health.staleGatewayPids.length > 0) {
211-
const staleMsg = `Found stale gateway process(es): ${health.staleGatewayPids.join(", ")}.`;
217+
const staleMsg = `Found positively identified stale gateway process(es): ${health.staleGatewayPids.join(", ")}.`;
218+
const diagnostics = renderRestartDiagnostics(health);
212219
warnings.push(staleMsg);
220+
warnings.push(...diagnostics);
213221
if (!json) {
214222
defaultRuntime.log(theme.warn(staleMsg));
223+
for (const line of diagnostics) {
224+
defaultRuntime.log(theme.muted(line));
225+
}
215226
defaultRuntime.log(theme.muted("Stopping stale process(es) and retrying restart..."));
216227
}
217228

@@ -225,11 +236,23 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
225236
port: restartPort,
226237
attempts: POST_RESTART_HEALTH_ATTEMPTS,
227238
delayMs: POST_RESTART_HEALTH_DELAY_MS,
228-
includeUnknownListenersAsStale: process.platform === "win32",
229239
});
230240
}
231241

232242
if (health.healthy) {
243+
if (
244+
process.platform === "win32" &&
245+
preRestartRuntimePid != null &&
246+
(health.runtime.pid === preRestartRuntimePid ||
247+
health.listenerKinds.gateway.includes(preRestartRuntimePid))
248+
) {
249+
const identityWarning = `Gateway restart became healthy, but process identity did not clearly change from pid ${preRestartRuntimePid}; runtime may be lagging or the old process may still own the listener.`;
250+
if (!json) {
251+
defaultRuntime.log(theme.warn(identityWarning));
252+
} else {
253+
warnings.push(identityWarning);
254+
}
255+
}
233256
return;
234257
}
235258

src/cli/daemon-cli/restart-health.test.ts

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,30 @@ describe("inspectGatewayRestart", () => {
210210
expect(snapshot.staleGatewayPids).toEqual([]);
211211
});
212212

213+
it("treats a busy unknown Windows listener as healthy when the probe succeeds", async () => {
214+
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
215+
classifyPortListener.mockReturnValue("unknown");
216+
probeGateway.mockResolvedValue({
217+
ok: true,
218+
close: null,
219+
});
220+
221+
const snapshot = await inspectGatewayRestartWithSnapshot({
222+
runtime: { status: "stopped" },
223+
portUsage: {
224+
port: 18789,
225+
status: "busy",
226+
listeners: [{ pid: 9100, commandLine: "" }],
227+
hints: [],
228+
},
229+
includeUnknownListenersAsStale: true,
230+
});
231+
232+
expect(snapshot.healthy).toBe(true);
233+
expect(snapshot.staleGatewayPids).toEqual([]);
234+
expect(snapshot.probeResult).toBe("ok");
235+
});
236+
213237
it("treats auth-closed probe as healthy gateway reachability", async () => {
214238
const snapshot = await inspectAmbiguousOwnershipWithProbe({
215239
ok: false,
@@ -240,4 +264,35 @@ describe("inspectGatewayRestart", () => {
240264
expect(snapshot.healthy).toBe(true);
241265
expect(probeGateway).not.toHaveBeenCalled();
242266
});
267+
268+
it("reports probe result, listener classification, and failure reason in diagnostics", async () => {
269+
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
270+
classifyPortListener.mockImplementation((listener) =>
271+
"command" in (listener as Record<string, unknown>) ? "unknown" : "gateway",
272+
);
273+
probeGateway.mockResolvedValue({
274+
ok: false,
275+
close: null,
276+
});
277+
278+
const { inspectGatewayRestart, renderRestartDiagnostics } = await import("./restart-health.js");
279+
const snapshot = await inspectGatewayRestart({
280+
service: makeGatewayService({ status: "stopped" }),
281+
port: 18789,
282+
includeUnknownListenersAsStale: false,
283+
});
284+
285+
const lines = renderRestartDiagnostics({
286+
...snapshot,
287+
listenerKinds: { gateway: [9000], unknown: [9100], other: [] },
288+
staleGatewayPids: [9000],
289+
failureReason: "stale-gateway-listener",
290+
probeResult: "failed",
291+
});
292+
293+
expect(lines).toContain("Restart probe: failed.");
294+
expect(lines).toContain("Listener classification: gateway=9000; unknown=9100; other=none.");
295+
expect(lines).toContain("Stale gateway termination candidates: 9000.");
296+
expect(lines).toContain("Restart health failure reason: stale-gateway-listener.");
297+
});
243298
});

src/cli/daemon-cli/restart-health.ts

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@ export type GatewayRestartSnapshot = {
2121
portUsage: PortUsage;
2222
healthy: boolean;
2323
staleGatewayPids: number[];
24+
probeResult: "ok" | "failed" | "not-run";
25+
listenerKinds: {
26+
gateway: number[];
27+
unknown: number[];
28+
other: number[];
29+
};
30+
failureReason?: "runtime-lag" | "probe-failed" | "stale-gateway-listener";
2431
};
2532

2633
export type GatewayPortHealthSnapshot = {
@@ -124,35 +131,49 @@ export async function inspectGatewayRestart(params: {
124131
};
125132
}
126133

134+
let probeResult: GatewayRestartSnapshot["probeResult"] = "not-run";
127135
if (portUsage.status === "busy" && runtime.status !== "running") {
128136
try {
129137
const reachable = await confirmGatewayReachable(params.port);
138+
probeResult = reachable ? "ok" : "failed";
130139
if (reachable) {
131140
return {
132141
runtime,
133142
portUsage,
134143
healthy: true,
135144
staleGatewayPids: [],
145+
probeResult,
146+
listenerKinds: { gateway: [], unknown: [], other: [] },
136147
};
137148
}
138149
} catch {
150+
probeResult = "failed";
139151
// Probe is best-effort; keep the ownership-based diagnostics.
140152
}
141153
}
142154

143-
const gatewayListeners =
155+
const classifiedListeners =
144156
portUsage.status === "busy"
145-
? portUsage.listeners.filter(
146-
(listener) => classifyPortListener(listener, params.port) === "gateway",
147-
)
157+
? portUsage.listeners.map((listener) => ({
158+
listener,
159+
kind: classifyPortListener(listener, params.port),
160+
}))
148161
: [];
162+
const gatewayListeners = classifiedListeners
163+
.filter(({ kind }) => kind === "gateway")
164+
.map(({ listener }) => listener);
165+
const unknownListeners = classifiedListeners
166+
.filter(({ kind }) => kind === "unknown")
167+
.map(({ listener }) => listener);
168+
const otherListeners = classifiedListeners
169+
.filter(({ kind }) => kind !== "gateway" && kind !== "unknown")
170+
.map(({ listener }) => listener);
149171
const fallbackListenerPids =
150172
params.includeUnknownListenersAsStale &&
151173
process.platform === "win32" &&
152174
runtime.status !== "running" &&
153175
portUsage.status === "busy"
154-
? portUsage.listeners
155-
.filter((listener) => classifyPortListener(listener, params.port) === "unknown")
176+
? unknownListeners
156177
.map((listener) => listener.pid)
157178
.filter((pid): pid is number => Number.isFinite(pid))
158179
: [];
@@ -169,7 +190,9 @@ export async function inspectGatewayRestart(params: {
169190
if (!healthy && running && portUsage.status === "busy") {
170191
try {
171192
healthy = await confirmGatewayReachable(params.port);
193+
probeResult = healthy ? "ok" : "failed";
172194
} catch {
195+
probeResult = "failed";
173196
// best-effort probe
174197
}
175198
}
@@ -192,12 +215,34 @@ export async function inspectGatewayRestart(params: {
192215
),
193216
]),
194217
);
218+
const failureReason = healthy
219+
? undefined
220+
: staleGatewayPids.length > 0
221+
? "stale-gateway-listener"
222+
: portUsage.status === "busy" && probeResult === "failed" && runtime.status !== "running"
223+
? "probe-failed"
224+
: portUsage.status === "busy" && runtime.status !== "running"
225+
? "runtime-lag"
226+
: undefined;
195227

196228
return {
197229
runtime,
198230
portUsage,
199231
healthy,
200232
staleGatewayPids,
233+
probeResult,
234+
listenerKinds: {
235+
gateway: gatewayListeners
236+
.map((listener) => listener.pid)
237+
.filter((pid): pid is number => Number.isFinite(pid)),
238+
unknown: unknownListeners
239+
.map((listener) => listener.pid)
240+
.filter((pid): pid is number => Number.isFinite(pid)),
241+
other: otherListeners
242+
.map((listener) => listener.pid)
243+
.filter((pid): pid is number => Number.isFinite(pid)),
244+
},
245+
failureReason,
201246
};
202247
}
203248

@@ -290,6 +335,26 @@ export function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): stri
290335
lines.push(`Service runtime: ${runtimeSummary}`);
291336
}
292337

338+
lines.push(`Restart probe: ${snapshot.probeResult}.`);
339+
340+
const formatPidSummary = (label: string, pids: number[]) =>
341+
`${label}=${pids.length > 0 ? pids.join(", ") : "none"}`;
342+
lines.push(
343+
`Listener classification: ${[
344+
formatPidSummary("gateway", snapshot.listenerKinds.gateway),
345+
formatPidSummary("unknown", snapshot.listenerKinds.unknown),
346+
formatPidSummary("other", snapshot.listenerKinds.other),
347+
].join("; ")}.`,
348+
);
349+
350+
if (snapshot.staleGatewayPids.length > 0) {
351+
lines.push(`Stale gateway termination candidates: ${snapshot.staleGatewayPids.join(", ")}.`);
352+
}
353+
354+
if (snapshot.failureReason) {
355+
lines.push(`Restart health failure reason: ${snapshot.failureReason}.`);
356+
}
357+
293358
lines.push(...renderPortUsageDiagnostics(snapshot));
294359

295360
return lines;

src/cli/daemon-cli/status.gather.test.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ const inspectGatewayRestart = vi.fn<(opts?: unknown) => Promise<GatewayRestartSn
3131
portUsage: { port: 19001, status: "busy", listeners: [], hints: [] },
3232
healthy: true,
3333
staleGatewayPids: [],
34+
probeResult: "ok",
35+
listenerKinds: { gateway: [1234], unknown: [], other: [] },
3436
}),
3537
);
3638
const serviceReadCommand = vi.fn<
@@ -459,6 +461,8 @@ describe("gatherDaemonStatus", () => {
459461
},
460462
healthy: false,
461463
staleGatewayPids: [9000],
464+
probeResult: "failed",
465+
listenerKinds: { gateway: [9000], unknown: [], other: [] },
462466
});
463467

464468
const status = await gatherDaemonStatus({

0 commit comments

Comments
 (0)