Skip to content

Commit 680eff6

Browse files
committed
fix: land SIGUSR1 orphan recovery regressions (#47719) (thanks @joeykrug)
1 parent 98f6ec5 commit 680eff6

7 files changed

Lines changed: 64 additions & 11 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ Docs: https://docs.openclaw.ai
5858
- Agents/openai-compatible tool calls: deduplicate repeated tool call ids across live assistant messages and replayed history so OpenAI-compatible backends no longer reject duplicate `tool_call_id` values with HTTP 400. (#40996) Thanks @xaeon2026.
5959
- Security/device pairing: harden `device.token.rotate` deny handling by keeping public failures generic while logging internal deny reasons and preserving approved-baseline enforcement. (`GHSA-7jrw-x62h-64p8`)
6060
- Slack/interactive replies: preserve `channelData.slack.blocks` through live DM delivery and preview-finalized edits so Block Kit button and select directives render instead of falling back to raw text. (#45890) Thanks @vincentkoc.
61+
- Gateway/restart: defer externally signaled unmanaged restarts through the in-process idle drain, and preserve the restored subagent run as the remap fallback during orphan recovery so resumed sessions do not duplicate work. (#47719) Thanks @joeykrug.
6162
- Zalo/plugin runtime: export `resolveClientIp` from `openclaw/plugin-sdk/zalo` so installed builds no longer crash on startup when the webhook monitor loads from the packaged extension instead of the monorepo source tree. (#46549) Thanks @No898.
6263
- CI/channel test routing: move the built-in channel suites into `test:channels` and keep them out of `test:extensions`, so extension CI no longer fails after the channel migration while targeted test routing still sends Slack, Signal, and iMessage suites to the right lane. (#46066) Thanks @scoootscooob.
6364
- Browser/profiles: drop the auto-created `chrome-relay` browser profile; users who need the Chrome extension relay must now create their own profile via `openclaw browser create-profile`. (#45777) Thanks @odysseus0.

src/agents/subagent-orphan-recovery.test.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,9 @@ describe("subagent-orphan-recovery", () => {
6565
"agent:main:subagent:test-session-1": sessionEntry,
6666
});
6767

68+
const run = createTestRunRecord();
6869
const activeRuns = new Map<string, SubagentRunRecord>();
69-
activeRuns.set("run-1", createTestRunRecord());
70+
activeRuns.set("run-1", run);
7071

7172
const { recoverOrphanedSubagentSessions } = await import("./subagent-orphan-recovery.js");
7273

@@ -87,10 +88,13 @@ describe("subagent-orphan-recovery", () => {
8788
expect(params.sessionKey).toBe("agent:main:subagent:test-session-1");
8889
expect(params.message).toContain("gateway reload");
8990
expect(params.message).toContain("Test task: implement feature X");
90-
expect(subagentRegistry.replaceSubagentRunAfterSteer).toHaveBeenCalledWith({
91-
previousRunId: "run-1",
92-
nextRunId: "test-run-id",
93-
});
91+
expect(subagentRegistry.replaceSubagentRunAfterSteer).toHaveBeenCalledWith(
92+
expect.objectContaining({
93+
previousRunId: "run-1",
94+
nextRunId: "test-run-id",
95+
fallback: run,
96+
}),
97+
);
9498
});
9599

96100
it("skips sessions that are not aborted", async () => {

src/agents/subagent-orphan-recovery.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ async function resumeOrphanedSession(params: {
8282
lastHumanMessage?: string;
8383
configChangeHint?: string;
8484
originalRunId: string;
85+
originalRun: SubagentRunRecord;
8586
}): Promise<boolean> {
8687
let resumeMessage = buildResumeMessage(params.task, params.lastHumanMessage);
8788
if (params.configChangeHint) {
@@ -103,6 +104,7 @@ async function resumeOrphanedSession(params: {
103104
const remapped = replaceSubagentRunAfterSteer({
104105
previousRunId: params.originalRunId,
105106
nextRunId: result.runId,
107+
fallback: params.originalRun,
106108
});
107109
if (!remapped) {
108110
log.warn(
@@ -210,6 +212,7 @@ export async function recoverOrphanedSubagentSessions(params: {
210212
? "\n\n[config changes from your previous run were already applied — do not re-modify openclaw.json or restart the gateway]"
211213
: undefined,
212214
originalRunId: runId,
215+
originalRun: runRecord,
213216
});
214217

215218
if (resumed) {

src/cli/gateway-cli/run-loop.test.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@ const acquireGatewayLock = vi.fn(async (_opts?: { port?: number }) => ({
88
const consumeGatewaySigusr1RestartAuthorization = vi.fn(() => true);
99
const isGatewaySigusr1RestartExternallyAllowed = vi.fn(() => false);
1010
const markGatewaySigusr1RestartHandled = vi.fn();
11+
const scheduleGatewaySigusr1Restart = vi.fn((_opts?: { delayMs?: number; reason?: string }) => ({
12+
ok: true,
13+
pid: process.pid,
14+
signal: "SIGUSR1" as const,
15+
delayMs: 0,
16+
mode: "emit" as const,
17+
coalesced: false,
18+
cooldownMsApplied: 0,
19+
}));
1120
const getActiveTaskCount = vi.fn(() => 0);
1221
const markGatewayDraining = vi.fn();
1322
const waitForActiveTasks = vi.fn(async (_timeoutMs: number) => ({ drained: true }));
@@ -35,6 +44,8 @@ vi.mock("../../infra/restart.js", () => ({
3544
consumeGatewaySigusr1RestartAuthorization: () => consumeGatewaySigusr1RestartAuthorization(),
3645
isGatewaySigusr1RestartExternallyAllowed: () => isGatewaySigusr1RestartExternallyAllowed(),
3746
markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(),
47+
scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) =>
48+
scheduleGatewaySigusr1Restart(opts),
3849
}));
3950

4051
vi.mock("../../infra/process-respawn.js", () => ({
@@ -292,6 +303,28 @@ describe("runGatewayLoop", () => {
292303
});
293304
});
294305

306+
it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
307+
vi.clearAllMocks();
308+
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
309+
isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(true);
310+
311+
await withIsolatedSignals(async ({ captureSignal }) => {
312+
const { close, start } = await createSignaledLoopHarness();
313+
const sigusr1 = captureSignal("SIGUSR1");
314+
315+
sigusr1();
316+
await new Promise<void>((resolve) => setImmediate(resolve));
317+
318+
expect(scheduleGatewaySigusr1Restart).toHaveBeenCalledWith({
319+
delayMs: 0,
320+
reason: "SIGUSR1",
321+
});
322+
expect(close).not.toHaveBeenCalled();
323+
expect(start).toHaveBeenCalledTimes(1);
324+
expect(markGatewaySigusr1RestartHandled).not.toHaveBeenCalled();
325+
});
326+
});
327+
295328
it("releases the lock before exiting on spawned restart", async () => {
296329
vi.clearAllMocks();
297330

src/cli/gateway-cli/run-loop.ts

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import {
1010
consumeGatewaySigusr1RestartAuthorization,
1111
isGatewaySigusr1RestartExternallyAllowed,
1212
markGatewaySigusr1RestartHandled,
13+
scheduleGatewaySigusr1Restart,
1314
} from "../../infra/restart.js";
1415
import { createSubsystemLogger } from "../../logging/subsystem.js";
1516
import {
@@ -186,10 +187,20 @@ export async function runGatewayLoop(params: {
186187
const onSigusr1 = () => {
187188
gatewayLog.info("signal SIGUSR1 received");
188189
const authorized = consumeGatewaySigusr1RestartAuthorization();
189-
if (!authorized && !isGatewaySigusr1RestartExternallyAllowed()) {
190-
gatewayLog.warn(
191-
"SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).",
192-
);
190+
if (!authorized) {
191+
if (!isGatewaySigusr1RestartExternallyAllowed()) {
192+
gatewayLog.warn(
193+
"SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).",
194+
);
195+
return;
196+
}
197+
if (shuttingDown) {
198+
gatewayLog.info("received SIGUSR1 during shutdown; ignoring");
199+
return;
200+
}
201+
// External SIGUSR1 requests should still reuse the in-process restart
202+
// scheduler so idle drain and restart coalescing stay consistent.
203+
scheduleGatewaySigusr1Restart({ delayMs: 0, reason: "SIGUSR1" });
193204
return;
194205
}
195206
markGatewaySigusr1RestartHandled();

src/config/schema.labels.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ export const FIELD_LABELS: Record<string, string> = {
279279
"OpenAI Chat Completions Image Timeout (ms)",
280280
"gateway.reload.mode": "Config Reload Mode",
281281
"gateway.reload.debounceMs": "Config Reload Debounce (ms)",
282+
"gateway.reload.deferralTimeoutMs": "Restart Deferral Timeout (ms)",
282283
"gateway.nodes.browser.mode": "Gateway Node Browser Mode",
283284
"gateway.nodes.browser.node": "Gateway Node Browser Pin",
284285
"gateway.nodes.allowCommands": "Gateway Node Allowlist (Extra Commands)",

src/infra/infra-runtime.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,8 @@ describe("infra runtime", () => {
190190
await vi.advanceTimersByTimeAsync(0);
191191
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
192192

193-
// Advance past the 90s max deferral wait
194-
await vi.advanceTimersByTimeAsync(90_000);
193+
// Advance past the 5-minute max deferral wait
194+
await vi.advanceTimersByTimeAsync(300_000);
195195
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
196196
} finally {
197197
process.removeListener("SIGUSR1", handler);

0 commit comments

Comments
 (0)