Skip to content

Commit 1f41b8b

Browse files
committed
fix(gateway): bound default restart deferral
1 parent 7e5c375 commit 1f41b8b

11 files changed

Lines changed: 131 additions & 20 deletions
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
d4c98bce7b547349b9cbbe08ec1018eafce9900502d7794df993d07fdec0e2e0 config-baseline.json
2-
6ce74b2ab3544e5375009a435a2360a3095e6bd759bb7dd8114293fb8a0e2b25 config-baseline.core.json
3-
0e38bad86bdc96c38573f6d51ac9e6fc5306cc20fb4a454399c57c105a61ba87 config-baseline.channel.json
1+
7f9815a297504c75022c4db2df250ce4cc9ff5c3f69250c67ca253b89148b9f3 config-baseline.json
2+
8bc9fda7c1096472beaa416a61043ce51d691d4dcad9ed3e0be46e68bb70b0ce config-baseline.core.json
3+
45162ff84813be8a1fe561ed8d6245a248d5c6288ef9e9af51bdf4ec05ef65ad config-baseline.channel.json
44
0dd6583fafae6c9134e46c4cf9bddee9822d6436436dcb1a6dcba6d012962e51 config-baseline.plugin.json

docs/gateway/configuration-reference.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,7 @@ See [Multiple Gateways](/gateway/multiple-gateways).
515515
reload: {
516516
mode: "hybrid", // off | restart | hot | hybrid
517517
debounceMs: 500,
518-
deferralTimeoutMs: 0,
518+
deferralTimeoutMs: 300000,
519519
},
520520
},
521521
}
@@ -527,7 +527,7 @@ See [Multiple Gateways](/gateway/multiple-gateways).
527527
- `"hot"`: apply changes in-process without restarting.
528528
- `"hybrid"` (default): try hot reload first; fall back to restart if required.
529529
- `debounceMs`: debounce window in ms before config changes are applied (non-negative integer).
530-
- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it or set `0` to wait indefinitely and log periodic still-pending warnings.
530+
- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it to use the default bounded wait (`300000`); set `0` to wait indefinitely and log periodic still-pending warnings.
531531

532532
---
533533

src/cli/gateway-cli/run-loop.test.ts

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ const abortEmbeddedPiRun = vi.fn(
5050
const getActiveEmbeddedRunCount = vi.fn(() => 0);
5151
const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs?: number) => ({ drained: true }));
5252
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
53-
const loadConfig = vi.fn(() => ({
53+
const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000;
54+
const loadConfig = vi.fn<() => { gateway: { reload: { deferralTimeoutMs?: number } } }>(() => ({
5455
gateway: {
5556
reload: {
5657
deferralTimeoutMs: 90_000,
@@ -74,6 +75,15 @@ vi.mock("../../infra/restart.js", () => ({
7475
markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(),
7576
peekGatewaySigusr1RestartReason: () => peekGatewaySigusr1RestartReason(),
7677
resetGatewayRestartStateForInProcessRestart: () => resetGatewayRestartStateForInProcessRestart(),
78+
resolveGatewayRestartDeferralTimeoutMs: (timeoutMs: unknown) => {
79+
if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) {
80+
return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS;
81+
}
82+
if (timeoutMs <= 0) {
83+
return undefined;
84+
}
85+
return Math.floor(timeoutMs);
86+
},
7787
scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) =>
7888
scheduleGatewaySigusr1Restart(opts),
7989
}));
@@ -438,6 +448,31 @@ describe("runGatewayLoop", () => {
438448
});
439449
});
440450

451+
it("uses the default restart drain timeout when config omits deferralTimeoutMs", async () => {
452+
vi.clearAllMocks();
453+
loadConfig.mockReturnValue({ gateway: { reload: {} } });
454+
peekGatewaySigusr1RestartReason.mockReturnValue(undefined);
455+
respawnGatewayProcessForUpdate.mockReturnValue({
456+
mode: "disabled",
457+
detail: "OPENCLAW_NO_RESPAWN",
458+
});
459+
460+
await withIsolatedSignals(async ({ captureSignal }) => {
461+
getActiveTaskCount.mockReturnValueOnce(1).mockReturnValue(0);
462+
463+
const { start } = await createSignaledLoopHarness();
464+
const sigusr1 = captureSignal("SIGUSR1");
465+
466+
sigusr1();
467+
await new Promise<void>((resolve) => setImmediate(resolve));
468+
await new Promise<void>((resolve) => setImmediate(resolve));
469+
470+
expect(waitForActiveTasks).toHaveBeenCalledWith(DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS);
471+
expect(markGatewayDraining).toHaveBeenCalledOnce();
472+
expect(start).toHaveBeenCalledTimes(2);
473+
});
474+
});
475+
441476
it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
442477
vi.clearAllMocks();
443478
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);

src/cli/gateway-cli/run-loop.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,11 +279,12 @@ export async function runGatewayLoop(params: {
279279
const SHUTDOWN_TIMEOUT_MS = SUPERVISOR_STOP_TIMEOUT_MS - 5_000;
280280
const resolveRestartDrainTimeoutMs = async (): Promise<RestartDrainTimeoutMs> => {
281281
try {
282-
const { getRuntimeConfig } = await loadRuntimeConfigModule();
282+
const [{ getRuntimeConfig }, { resolveGatewayRestartDeferralTimeoutMs }] = await Promise.all([
283+
loadRuntimeConfigModule(),
284+
loadRestartModule(),
285+
]);
283286
const timeoutMs = getRuntimeConfig().gateway?.reload?.deferralTimeoutMs;
284-
return typeof timeoutMs === "number" && Number.isFinite(timeoutMs) && timeoutMs > 0
285-
? timeoutMs
286-
: undefined;
287+
return resolveGatewayRestartDeferralTimeoutMs(timeoutMs);
287288
} catch {
288289
return DEFAULT_RESTART_DRAIN_TIMEOUT_MS;
289290
}

src/config/schema.base.generated.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22632,7 +22632,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
2263222632
maximum: 9007199254740991,
2263322633
title: "Restart Deferral Timeout (ms)",
2263422634
description:
22635-
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
22635+
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
2263622636
},
2263722637
},
2263822638
additionalProperties: false,
@@ -25803,7 +25803,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
2580325803
},
2580425804
"gateway.reload.deferralTimeoutMs": {
2580525805
label: "Restart Deferral Timeout (ms)",
25806-
help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
25806+
help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
2580725807
tags: ["network", "reliability", "performance"],
2580825808
},
2580925809
"gateway.nodes.browser.mode": {

src/config/schema.help.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,7 @@ export const FIELD_HELP: Record<string, string> = {
492492
'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. Keep "hybrid" for safest routine updates.',
493493
"gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.",
494494
"gateway.reload.deferralTimeoutMs":
495-
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
495+
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
496496
"gateway.nodes.browser.mode":
497497
'Node browser routing ("auto" = pick single connected browser node, "manual" = require node param, "off" = disable).',
498498
"gateway.nodes.browser.node": "Pin browser routing to a specific node id or name (optional).",

src/config/types.gateway.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,8 @@ export type GatewayReloadConfig = {
215215
debounceMs?: number;
216216
/**
217217
* Optional maximum time (ms) to wait for in-flight operations to complete
218-
* before forcing a restart. Absent or 0 waits indefinitely and logs periodic
219-
* still-pending warnings.
218+
* before forcing a restart. Absent uses the gateway's default bounded wait;
219+
* 0 waits indefinitely and logs periodic still-pending warnings.
220220
* Lower positive values risk aborting active subagent LLM calls.
221221
* @see https://github.com/openclaw/openclaw/issues/65485
222222
*/

src/gateway/server-reload-handlers.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import { resetDirectoryCache } from "../infra/outbound/target-resolver.js";
1212
import {
1313
deferGatewayRestartUntilIdle,
1414
emitGatewayRestart,
15+
resolveGatewayRestartDeferralTimeoutMs,
1516
setGatewaySigusr1RestartPolicy,
1617
} from "../infra/restart.js";
1718
import { getTotalQueueSize } from "../process/command-queue.js";
@@ -363,7 +364,9 @@ export function createGatewayReloadHandlers(params: GatewayReloadHandlerParams)
363364

364365
deferGatewayRestartUntilIdle({
365366
getPendingCount: () => getActiveCounts().totalActive,
366-
maxWaitMs: nextConfig.gateway?.reload?.deferralTimeoutMs,
367+
maxWaitMs: resolveGatewayRestartDeferralTimeoutMs(
368+
nextConfig.gateway?.reload?.deferralTimeoutMs,
369+
),
367370
hooks: {
368371
onReady: () => {
369372
restartPending = false;

src/gateway/server.reload.test.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,50 @@ describe("gateway hot reload", () => {
727727
});
728728
});
729729

730+
it("uses the default restart deferral timeout when config omits deferralTimeoutMs", async () => {
731+
await withNonMinimalGatewayServer(async () => {
732+
const onRestart = hoisted.getOnRestart();
733+
expect(onRestart).toBeTypeOf("function");
734+
735+
const restartTesting = (await import("../infra/restart.js")).__testing;
736+
restartTesting.resetSigusr1State();
737+
hoisted.activeTaskCount.value = 1;
738+
const signalSpy = vi.fn();
739+
process.once("SIGUSR1", signalSpy);
740+
vi.useFakeTimers();
741+
742+
try {
743+
onRestart?.(
744+
{
745+
changedPaths: ["gateway.port"],
746+
restartGateway: true,
747+
restartReasons: ["gateway.port"],
748+
hotReasons: [],
749+
reloadHooks: false,
750+
restartGmailWatcher: false,
751+
restartCron: false,
752+
restartHeartbeat: false,
753+
restartChannels: new Set(),
754+
noopPaths: [],
755+
},
756+
{},
757+
);
758+
759+
await vi.advanceTimersByTimeAsync(299_500);
760+
expect(signalSpy).not.toHaveBeenCalled();
761+
762+
await vi.advanceTimersByTimeAsync(500);
763+
await Promise.resolve();
764+
expect(signalSpy).toHaveBeenCalledTimes(1);
765+
} finally {
766+
hoisted.activeTaskCount.value = 0;
767+
vi.useRealTimers();
768+
process.removeListener("SIGUSR1", signalSpy);
769+
restartTesting.resetSigusr1State();
770+
}
771+
});
772+
});
773+
730774
it("emits one-shot degraded and recovered system events during secret reload transitions", async () => {
731775
await writeEnvRefConfig();
732776
process.env.OPENAI_API_KEY = "sk-startup"; // pragma: allowlist secret

src/infra/infra-runtime.test.ts

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ describe("infra runtime", () => {
483483
}
484484
});
485485

486-
it("keeps SIGUSR1 deferred by default while work is still pending", async () => {
486+
it("emits SIGUSR1 after the default deferral timeout while work is still pending", async () => {
487487
const emitSpy = vi.spyOn(process, "emit");
488488
const handler = () => {};
489489
process.on("SIGUSR1", handler);
@@ -495,8 +495,25 @@ describe("infra runtime", () => {
495495
await vi.advanceTimersByTimeAsync(0);
496496
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
497497

498-
// No default max deferral wait; active turns should not be killed just
499-
// because a config-triggered restart has been pending for 5 minutes.
498+
await vi.advanceTimersByTimeAsync(300_000);
499+
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
500+
} finally {
501+
process.removeListener("SIGUSR1", handler);
502+
}
503+
});
504+
505+
it("keeps SIGUSR1 deferred when deferral timeout is explicitly disabled", async () => {
506+
const emitSpy = vi.spyOn(process, "emit");
507+
const handler = () => {};
508+
process.on("SIGUSR1", handler);
509+
try {
510+
setRuntimeConfigSnapshot({ gateway: { reload: { deferralTimeoutMs: 0 } } });
511+
setPreRestartDeferralCheck(() => 5); // always pending
512+
scheduleGatewaySigusr1Restart({ delayMs: 0 });
513+
514+
await vi.advanceTimersByTimeAsync(0);
515+
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
516+
500517
await vi.advanceTimersByTimeAsync(300_000);
501518
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
502519
} finally {

0 commit comments

Comments
 (0)