Skip to content

Commit 8d58ad4

Browse files
committed
fix(gateway): retry startup handshakes before surfacing failures
1 parent a98a4e6 commit 8d58ad4

14 files changed

Lines changed: 430 additions & 7 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ Docs: https://docs.openclaw.ai
6969
- Feishu: suppress distinct late `final` text deliveries after a streaming card has already closed, while keeping media attachments deliverable, so late-finals no longer reopen duplicate Feishu cards. Fixes #71977. (#72294) Thanks @MonkeyLeeT.
7070
- Gateway: expose `gateway.handshakeTimeoutMs` in config, schema, and docs while preserving `OPENCLAW_HANDSHAKE_TIMEOUT_MS` precedence, so loaded or low-powered hosts can tune local WebSocket pre-auth handshakes without patching dist files. Supersedes #51282; refs #73592 and #73652. Thanks @henry-the-frog.
7171
- Gateway/TUI/status: align configured and env-based WebSocket handshake budgets across local clients, probes, and fallback RPCs while preserving explicit status timeouts and paired-device auth fallback, so slow local gateways are not marked unreachable by a shorter client watchdog. Refs #73524, #73535, #73592, and #73602. Thanks @harshcatsystems-collab, @DJBlackhawk, and @Vksh07.
72+
- Gateway/startup: return retryable `UNAVAILABLE` during the sidecar startup window and keep CLI/TUI/status clients retrying inside their existing timeout budget, so early connects no longer surface as terminal handshake failures. Fixes #73652. Thanks @spenceryang1996-dot.
7273
- Agents/auth: scope external CLI credential discovery to configured providers during model auth status and startup prewarm, so opencode-only and other single-provider gateways do not block on unrelated Claude CLI Keychain probes. Fixes #73908. Thanks @Ailuras.
7374
- Agents/model selection: resolve slash-form aliases before provider/model parsing and keep alias-resolved primary models subject to transient provider cooldowns, so cron and persisted sessions do not retry cooled-down raw aliases. Fixes #73573 and #73657. Thanks @akai-shuuichi and @hashslingers.
7475
- Agents/Claude CLI: reuse already-cached macOS Keychain credentials for no-prompt Claude credential reads, so doctor/runtime checks do not miss fresh interactive Claude auth. Fixes #73682. Thanks @RyanSandoval.

docs/gateway/protocol.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ Gateway → Client:
9797
}
9898
```
9999

100+
While the Gateway is still finishing startup sidecars, the `connect` request can
101+
return a retryable `UNAVAILABLE` error with `details.reason` set to
102+
`"startup-sidecars"` and a `retryAfterMs` hint on the error object. Clients should retry that response
103+
within their overall connection budget instead of surfacing it as a terminal
104+
handshake failure.
105+
100106
`server`, `features`, `snapshot`, and `policy` are all required by the schema
101107
(`src/gateway/protocol/schema/frames.ts`). `auth` is also required and reports
102108
the negotiated role/scopes. `canvasHostUrl` is optional.

src/gateway/call.test.ts

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ let lastRequestOptions: {
4141
params?: unknown;
4242
opts?: { expectFinal?: boolean; timeoutMs?: number | null };
4343
} | null = null;
44-
type StartMode = "hello" | "close" | "silent";
44+
type StartMode = "hello" | "close" | "silent" | "startup-retry-then-hello";
4545
let startMode: StartMode = "hello";
4646
let closeCode = 1006;
4747
let closeReason = "";
@@ -87,6 +87,12 @@ vi.mock("./client.js", () => ({
8787
methods: helloMethods,
8888
},
8989
});
90+
} else if (startMode === "startup-retry-then-hello") {
91+
void lastClientOptions?.onHelloOk?.({
92+
features: {
93+
methods: helloMethods,
94+
},
95+
});
9096
} else if (startMode === "close") {
9197
lastClientOptions?.onClose?.(closeCode, closeReason);
9298
}
@@ -134,6 +140,12 @@ class StubGatewayClient {
134140
methods: helloMethods,
135141
},
136142
});
143+
} else if (startMode === "startup-retry-then-hello") {
144+
void lastClientOptions?.onHelloOk?.({
145+
features: {
146+
methods: helloMethods,
147+
},
148+
});
137149
} else if (startMode === "close") {
138150
lastClientOptions?.onClose?.(closeCode, closeReason);
139151
}
@@ -835,6 +847,15 @@ describe("callGateway error details", () => {
835847
});
836848
});
837849

850+
it("keeps the request alive through internally retried startup-unavailable handshakes", async () => {
851+
startMode = "startup-retry-then-hello";
852+
setLocalLoopbackGatewayConfig();
853+
854+
await expect(callGateway({ method: "health" })).resolves.toEqual({ ok: true });
855+
856+
expect(lastRequestOptions?.method).toBe("health");
857+
});
858+
838859
it("includes connection details on timeout", async () => {
839860
startMode = "silent";
840861
setLocalLoopbackGatewayConfig();

src/gateway/client.test.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class MockWebSocket {
2929
private errorHandlers: WsEventHandlers["error"][] = [];
3030
readonly sent: string[] = [];
3131
closeCalls = 0;
32+
lastClose: { code?: number; reason?: string } | null = null;
3233
terminateCalls = 0;
3334
autoCloseOnClose = true;
3435
readyState = MockWebSocket.CONNECTING;
@@ -62,6 +63,7 @@ class MockWebSocket {
6263

6364
close(code?: number, reason?: string): void {
6465
this.closeCalls += 1;
66+
this.lastClose = { code, reason };
6567
this.readyState = MockWebSocket.CLOSING;
6668
if (this.autoCloseOnClose) {
6769
this.emitClose(code ?? 1000, reason ?? "");
@@ -335,6 +337,73 @@ describe("GatewayClient request errors", () => {
335337

336338
client.stop();
337339
});
340+
341+
it("retries startup-unavailable connect failures without terminal callbacks", async () => {
342+
vi.useFakeTimers();
343+
wsInstances.length = 0;
344+
logDebugMock.mockClear();
345+
logErrorMock.mockClear();
346+
const onClose = vi.fn();
347+
const onConnectError = vi.fn();
348+
const client = new GatewayClient({
349+
url: "ws://127.0.0.1:18789",
350+
deviceIdentity: null,
351+
onClose,
352+
onConnectError,
353+
});
354+
try {
355+
client.start();
356+
const ws = getLatestWs();
357+
ws.emitOpen();
358+
ws.emitMessage(
359+
JSON.stringify({
360+
type: "event",
361+
event: "connect.challenge",
362+
payload: { nonce: "nonce-1" },
363+
}),
364+
);
365+
const connectFrame = JSON.parse(
366+
ws.sent.find((frame) => frame.includes('"method":"connect"')) ?? "{}",
367+
) as { id?: string };
368+
369+
ws.emitMessage(
370+
JSON.stringify({
371+
type: "res",
372+
id: connectFrame.id,
373+
ok: false,
374+
error: {
375+
code: "UNAVAILABLE",
376+
message: "gateway starting; retry shortly",
377+
details: { reason: "startup-sidecars" },
378+
retryable: true,
379+
retryAfterMs: 250,
380+
},
381+
}),
382+
);
383+
384+
await vi.advanceTimersByTimeAsync(0);
385+
for (let i = 0; i < 10; i += 1) {
386+
await Promise.resolve();
387+
}
388+
389+
expect(onConnectError).not.toHaveBeenCalled();
390+
expect(onClose).not.toHaveBeenCalled();
391+
expect(ws.lastClose).toEqual({ code: 1013, reason: "gateway starting" });
392+
expect(logDebugMock).toHaveBeenCalledWith(expect.stringContaining("gateway connect failed:"));
393+
expect(logErrorMock).not.toHaveBeenCalledWith(
394+
expect.stringContaining("gateway connect failed:"),
395+
);
396+
expect(wsInstances).toHaveLength(1);
397+
398+
await vi.advanceTimersByTimeAsync(249);
399+
expect(wsInstances).toHaveLength(1);
400+
await vi.advanceTimersByTimeAsync(1);
401+
expect(wsInstances).toHaveLength(2);
402+
} finally {
403+
client.stop();
404+
vi.useRealTimers();
405+
}
406+
});
338407
});
339408

340409
describe("GatewayClient close handling", () => {

src/gateway/client.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ import {
5050
validateRequestFrame,
5151
validateResponseFrame,
5252
} from "./protocol/index.js";
53+
import { resolveGatewayStartupRetryAfterMs } from "./protocol/startup-unavailable.js";
5354

5455
type Pending = {
5556
resolve: (value: unknown) => void;
@@ -168,6 +169,7 @@ export const GATEWAY_CLOSE_CODE_HINTS: Readonly<Record<number, string>> = {
168169
1006: "abnormal closure (no close frame)",
169170
1008: "policy violation",
170171
1012: "service restart",
172+
1013: "try again later",
171173
};
172174

173175
export function describeGatewayCloseCode(code: number): string | undefined {
@@ -227,6 +229,7 @@ export class GatewayClient {
227229
private reconnectTimer: NodeJS.Timeout | null = null;
228230
private pendingDeviceTokenRetry = false;
229231
private deviceTokenRetryBudgetUsed = false;
232+
private pendingStartupReconnectDelayMs: number | null = null;
230233
private pendingConnectErrorDetailCode: string | null = null;
231234
// Track last tick to detect silent stalls.
232235
private lastTick: number | null = null;
@@ -350,6 +353,10 @@ export class GatewayClient {
350353
}
351354
this.socketOpened = false;
352355
this.resolvePendingStop(ws);
356+
if (this.pendingStartupReconnectDelayMs !== null) {
357+
this.scheduleReconnect();
358+
return;
359+
}
353360
// Clear persisted device auth state only when device-token auth was active.
354361
// Shared token/password failures can return the same close reason but should
355362
// not erase a valid cached device token.
@@ -429,6 +436,7 @@ export class GatewayClient {
429436
this.closed = true;
430437
this.pendingDeviceTokenRetry = false;
431438
this.deviceTokenRetryBudgetUsed = false;
439+
this.pendingStartupReconnectDelayMs = null;
432440
this.pendingConnectErrorDetailCode = null;
433441
this.clearReconnectTimer();
434442
if (this.tickTimer) {
@@ -576,6 +584,7 @@ export class GatewayClient {
576584
.then((helloOk) => {
577585
this.pendingDeviceTokenRetry = false;
578586
this.deviceTokenRetryBudgetUsed = false;
587+
this.pendingStartupReconnectDelayMs = null;
579588
this.pendingConnectErrorDetailCode = null;
580589
const authInfo = helloOk?.auth;
581590
if (authInfo?.deviceToken && this.opts.deviceIdentity) {
@@ -626,6 +635,13 @@ export class GatewayClient {
626635
this.deviceTokenRetryBudgetUsed = true;
627636
this.backoffMs = Math.min(this.backoffMs, 250);
628637
}
638+
const startupRetryAfterMs = resolveGatewayStartupRetryAfterMs(err);
639+
if (startupRetryAfterMs !== null) {
640+
this.pendingStartupReconnectDelayMs = startupRetryAfterMs;
641+
logDebug(`gateway connect failed: ${String(err)}`);
642+
this.ws?.close(1013, "gateway starting");
643+
return;
644+
}
629645
this.opts.onConnectError?.(err instanceof Error ? err : new Error(String(err)));
630646
const msg = `gateway connect failed: ${String(err)}`;
631647
if (this.opts.mode === GATEWAY_CLIENT_MODES.PROBE || isGatewayClientStoppedError(err)) {
@@ -916,8 +932,12 @@ export class GatewayClient {
916932
this.tickTimer = null;
917933
}
918934
this.clearReconnectTimer();
919-
const delay = this.backoffMs;
920-
this.backoffMs = Math.min(this.backoffMs * 2, 30_000);
935+
const startupDelay = this.pendingStartupReconnectDelayMs;
936+
this.pendingStartupReconnectDelayMs = null;
937+
const delay = startupDelay ?? this.backoffMs;
938+
if (startupDelay === null) {
939+
this.backoffMs = Math.min(this.backoffMs * 2, 30_000);
940+
}
921941
this.reconnectTimer = setTimeout(() => {
922942
this.reconnectTimer = null;
923943
this.start();

src/gateway/probe.test.ts

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { beforeEach, describe, expect, it, vi } from "vitest";
33
const gatewayClientState = vi.hoisted(() => ({
44
options: null as Record<string, unknown> | null,
55
requests: [] as string[],
6-
startMode: "hello" as "hello" | "close" | "connect-error-close",
6+
startMode: "hello" as "hello" | "close" | "connect-error-close" | "startup-retry-then-hello",
77
close: { code: 1008, reason: "pairing required" },
88
helloAuth: {
99
role: "operator",
@@ -76,6 +76,17 @@ class MockGatewayClient {
7676
}
7777
return;
7878
}
79+
if (gatewayClientState.startMode === "startup-retry-then-hello") {
80+
const onHelloOk = this.opts.onHelloOk;
81+
if (typeof onHelloOk === "function") {
82+
await onHelloOk({
83+
type: "hello-ok",
84+
server: gatewayClientState.helloServer,
85+
auth: gatewayClientState.helloAuth,
86+
});
87+
}
88+
return;
89+
}
7990
const onHelloOk = this.opts.onHelloOk;
8091
if (typeof onHelloOk === "function") {
8192
await onHelloOk({
@@ -381,4 +392,21 @@ describe("probeGateway", () => {
381392
close: { code: 1008, reason: "pairing required" },
382393
});
383394
});
395+
396+
it("keeps probing through internally retried startup-unavailable handshakes", async () => {
397+
gatewayClientState.startMode = "startup-retry-then-hello";
398+
399+
const result = await probeGateway({
400+
url: "ws://127.0.0.1:18789",
401+
auth: { token: "secret" },
402+
timeoutMs: 1_000,
403+
includeDetails: false,
404+
});
405+
406+
expect(result).toMatchObject({
407+
ok: true,
408+
error: null,
409+
close: null,
410+
});
411+
});
384412
});
src/gateway/protocol/startup-unavailable.ts

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/** `details.reason` value that identifies a startup-window unavailable error. */
export const GATEWAY_STARTUP_UNAVAILABLE_REASON = "startup-sidecars";
2+
/** Fallback retry delay (ms) used when an error carries no finite numeric `retryAfterMs`. */
export const GATEWAY_STARTUP_RETRY_AFTER_MS = 500;
3+
/** Lower clamp bound (ms) applied to the resolved retry delay. */
export const GATEWAY_STARTUP_RETRY_MIN_MS = 100;
4+
/** Upper clamp bound (ms) applied to the resolved retry delay. */
export const GATEWAY_STARTUP_RETRY_MAX_MS = 2_000;
5+
6+
/** Shape of the `details` payload whose `reason` is the startup-unavailable marker. */
export type GatewayStartupUnavailableDetails = {
7+
reason: typeof GATEWAY_STARTUP_UNAVAILABLE_REASON;
8+
};
9+
10+
/**
 * Builds a fresh details object carrying the startup-unavailable reason.
 *
 * @returns `{ reason: GATEWAY_STARTUP_UNAVAILABLE_REASON }`
 */
export function gatewayStartupUnavailableDetails(): GatewayStartupUnavailableDetails {
11+
return { reason: GATEWAY_STARTUP_UNAVAILABLE_REASON };
12+
}
13+
14+
/**
 * Type guard for the startup-unavailable details payload.
 *
 * @param details - Arbitrary value, typically the `details` field of an error.
 * @returns `true` only when `details` is a non-null object whose `reason`
 *   strictly equals `GATEWAY_STARTUP_UNAVAILABLE_REASON`.
 */
export function isGatewayStartupUnavailableDetails(
15+
details: unknown,
16+
): details is GatewayStartupUnavailableDetails {
17+
return (
18+
typeof details === "object" &&
19+
details !== null &&
20+
(details as { reason?: unknown }).reason === GATEWAY_STARTUP_UNAVAILABLE_REASON
21+
);
22+
}
23+
24+
/**
 * Reports whether `error` is a retryable startup-window `UNAVAILABLE` error.
 *
 * All three conditions must hold: the code is `"UNAVAILABLE"`, `retryable` is
 * exactly `true`, and `details` passes `isGatewayStartupUnavailableDetails`.
 *
 * @param error - Arbitrary thrown or received value.
 * @returns `false` for falsy or non-object inputs, otherwise the result of the checks above.
 */
export function isRetryableGatewayStartupUnavailableError(error: unknown): boolean {
25+
if (!error || typeof error !== "object") {
26+
return false;
27+
}
28+
const shaped = error as {
29+
code?: unknown;
30+
gatewayCode?: unknown;
31+
retryable?: unknown;
32+
details?: unknown;
33+
};
34+
// `gatewayCode` wins when present; `code` is only consulted when it is null/undefined.
const code = shaped.gatewayCode ?? shaped.code;
35+
return (
36+
code === "UNAVAILABLE" &&
37+
shaped.retryable === true &&
38+
isGatewayStartupUnavailableDetails(shaped.details)
39+
);
40+
}
41+
42+
/**
 * Resolves how long to wait before retrying after a startup-unavailable error.
 *
 * @param error - Arbitrary thrown or received value.
 * @returns `null` when `error` is not a retryable startup-unavailable error;
 *   otherwise a whole number of milliseconds within
 *   `[GATEWAY_STARTUP_RETRY_MIN_MS, GATEWAY_STARTUP_RETRY_MAX_MS]`.
 */
export function resolveGatewayStartupRetryAfterMs(error: unknown): number | null {
43+
if (!isRetryableGatewayStartupUnavailableError(error)) {
44+
return null;
45+
}
46+
const retryAfterMs = (error as { retryAfterMs?: unknown }).retryAfterMs;
47+
// Use the error's own hint only when it is a finite number (rejects NaN/Infinity and non-numbers).
const raw =
48+
typeof retryAfterMs === "number" && Number.isFinite(retryAfterMs)
49+
? retryAfterMs
50+
: GATEWAY_STARTUP_RETRY_AFTER_MS;
51+
// Floor to an integer, then clamp into the min/max window.
return Math.min(
52+
Math.max(Math.floor(raw), GATEWAY_STARTUP_RETRY_MIN_MS),
53+
GATEWAY_STARTUP_RETRY_MAX_MS,
54+
);
55+
}

src/gateway/server-ws-runtime.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ export function attachGatewayWsHandlers(params: GatewayWsRuntimeParams) {
3636
rateLimiter: params.rateLimiter,
3737
browserRateLimiter: params.browserRateLimiter,
3838
preauthHandshakeTimeoutMs: params.preauthHandshakeTimeoutMs,
39+
isStartupPending: params.isStartupPending,
3940
gatewayMethods: params.gatewayMethods,
4041
events: params.events,
4142
refreshHealthSnapshot: params.context.refreshHealthSnapshot,

src/gateway/server.impl.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,7 @@ export async function startGatewayServer(
942942
rateLimiter: authRateLimiter,
943943
browserRateLimiter: browserAuthRateLimiter,
944944
preauthHandshakeTimeoutMs,
945+
isStartupPending: () => !startupSidecarsReady,
945946
gatewayMethods: runtimeState.gatewayMethods,
946947
events: GATEWAY_EVENTS,
947948
logGateway: log,

0 commit comments

Comments
 (0)