Skip to content

Commit 1faf660

Browse files
fix(feishu): recover WebSocket after SDK retry exhaustion
1 parent 50ebcf8 commit 1faf660

4 files changed

Lines changed: 96 additions & 5 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ Docs: https://docs.openclaw.ai
7575
- Docker Compose: default missing config and workspace bind mounts to `${HOME:-/tmp}/.openclaw` so manual compose runs do not create invalid empty-source volume specs. (#64485) Thanks @jlapenna.
7676
- Agents/context engines: preserve the child agent's configured `agentDir` when subagent cleanup re-resolves a context engine, so `onSubagentEnded` hooks keep operating on the correct per-agent state. (#67243) Thanks @jarimustonen.
7777
- Channels/WhatsApp: restrict pairing verification replies to real inbound user content, preventing unsolicited prompts from receipts, typing indicators, presence updates, and other non-message Baileys upserts. Fixes #73797. (#73823) Thanks @hclsys.
78-
- Feishu: recreate WebSocket clients with monitor-owned backoff after SDK reconnect exhaustion, preserving heartbeat defaults and shutdown cleanup so persistent connections recover without manual gateway restart. Fixes #52618; duplicate evidence #59753; related #55532, #68766, #72411, and #73739. Thanks @vincentkoc, @schumilin, @alex-xuweilong, @120106835, @sirfengyu, and @tianhaocui.
78+
- Feishu: recreate WebSocket clients with monitor-owned backoff only after SDK reconnect exhaustion, preserving heartbeat defaults and shutdown cleanup without treating recoverable SDK callback errors as terminal, so persistent connections recover without manual gateway restart. Fixes #52618; duplicate evidence #59753; related #55532, #68766, #72411, and #73739. Thanks @vincentkoc, @schumilin, @alex-xuweilong, @120106835, @sirfengyu, and @tianhaocui.
7979

8080
## 2026.4.27
8181

extensions/feishu/src/client.test.ts

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,19 @@ function firstWsClientOptions(): {
123123
agent?: unknown;
124124
wsConfig?: unknown;
125125
onError?: unknown;
126+
onReady?: unknown;
127+
onReconnected?: unknown;
128+
onReconnecting?: unknown;
126129
} {
127130
const options = readCallOptions(wsClientCtorMock, 0);
128-
return { agent: options.agent, wsConfig: options.wsConfig, onError: options.onError };
131+
return {
132+
agent: options.agent,
133+
wsConfig: options.wsConfig,
134+
onError: options.onError,
135+
onReady: options.onReady,
136+
onReconnected: options.onReconnected,
137+
onReconnecting: options.onReconnecting,
138+
};
129139
}
130140

131141
beforeAll(async () => {
@@ -361,11 +371,22 @@ describe("createFeishuWSClient proxy handling", () => {
361371

362372
it("passes lifecycle callbacks while preserving heartbeat wsConfig defaults", async () => {
363373
const onError = vi.fn();
364-
365-
await createFeishuWSClient(baseAccount, { onError });
374+
const onReady = vi.fn();
375+
const onReconnected = vi.fn();
376+
const onReconnecting = vi.fn();
377+
378+
await createFeishuWSClient(baseAccount, {
379+
onError,
380+
onReady,
381+
onReconnected,
382+
onReconnecting,
383+
});
366384

367385
const options = firstWsClientOptions();
368386
expect(options.onError).toBe(onError);
387+
expect(options.onReady).toBe(onReady);
388+
expect(options.onReconnected).toBe(onReconnected);
389+
expect(options.onReconnecting).toBe(onReconnecting);
369390
expect(options.wsConfig).toEqual({
370391
PingInterval: 30,
371392
PingTimeout: 3,

extensions/feishu/src/monitor.cleanup.test.ts

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,55 @@ describe("feishu websocket cleanup", () => {
202202
expect(errorMessage).not.toContain("token_abc");
203203
});
204204

205+
// Verifies that an onError callback carrying a non-terminal message is only logged:
// the existing client stays registered, is not closed, and no replacement client is created
// until the monitor is aborted.
it("keeps the websocket client alive after recoverable sdk callback errors", async () => {
206+
vi.useFakeTimers();
207+
const wsClient = createWsClient();
208+
createFeishuWSClientMock.mockResolvedValueOnce(wsClient);
209+
210+
const abortController = new AbortController();
211+
const runtime = {
212+
log: vi.fn(),
213+
error: vi.fn(),
214+
exit: vi.fn(),
215+
};
216+
const accountId = "recoverable-callback";
217+
218+
const monitorPromise = monitorWebSocket({
219+
account: createAccount(accountId),
220+
accountId,
221+
runtime,
222+
abortSignal: abortController.signal,
223+
eventDispatcher: {} as never,
224+
});
225+
226+
// Wait until the monitor has started the client and registered it under the account id.
await vi.waitFor(() => {
227+
expect(wsClient.start).toHaveBeenCalledTimes(1);
228+
expect(wsClients.get(accountId)).toBe(wsClient);
229+
});
230+
231+
// The second argument of the first createFeishuWSClient call holds the callbacks the monitor passed in.
const callbacks = createFeishuWSClientMock.mock.calls[0]?.[1] as
232+
| { onError?: (err: Error) => void }
233+
| undefined;
234+
// Report an error whose message contains a newline and a bearer token, to exercise log sanitising.
callbacks?.onError?.(new Error("temporary callback failure\nBearer token_abc"));
235+
236+
// Advance fake time by 1_000 ms; the assertions below check that no reconnect happened in that window.
await vi.advanceTimersByTimeAsync(1_000);
237+
238+
expect(createFeishuWSClientMock).toHaveBeenCalledTimes(1);
239+
expect(wsClient.close).not.toHaveBeenCalled();
240+
expect(wsClients.get(accountId)).toBe(wsClient);
241+
// The first runtime.error call must carry the "recoverable" log line with the token redacted
// and without a raw newline.
const errorMessage = String(runtime.error.mock.calls[0]?.[0] ?? "");
242+
expect(errorMessage).toContain("WebSocket SDK reported recoverable error");
243+
expect(errorMessage).toContain("Bearer [redacted]");
244+
expect(errorMessage).not.toContain("\n");
245+
expect(errorMessage).not.toContain("token_abc");
246+
247+
// Aborting ends the monitor; the single client is expected to be closed exactly once.
abortController.abort();
248+
await monitorPromise;
249+
250+
expect(createFeishuWSClientMock).toHaveBeenCalledTimes(1);
251+
expect(wsClient.close).toHaveBeenCalledTimes(1);
252+
});
253+
205254
it("clears identity without recreating a websocket when aborted during reconnect backoff", async () => {
206255
vi.useFakeTimers();
207256
const exhaustedClient = createWsClient();

extensions/feishu/src/monitor.transport.ts

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ export type MonitorTransportParams = {
3333
const FEISHU_WS_RECONNECT_INITIAL_DELAY_MS = 1_000;
3434
const FEISHU_WS_RECONNECT_MAX_DELAY_MS = 30_000;
3535
const FEISHU_WS_LOG_ERROR_MAX_LENGTH = 500;
36+
// Error-message patterns that isFeishuWsTerminalError classifies as terminal:
// - a message beginning with "WebSocket reconnect exhausted after <N> attempt(s)" (regex below);
// - a message beginning with the autoReconnect-disabled text (string below).
// NOTE(review): presumably these mirror the wording emitted by the Lark SDK — confirm against the SDK version in use.
const FEISHU_WS_RECONNECT_EXHAUSTED_RE = /^WebSocket reconnect exhausted after \d+ attempts?/;
37+
const FEISHU_WS_AUTORECONNECT_DISABLED_ERROR =
38+
"WebSocket connect failed and autoReconnect is disabled";
3639

3740
function isFeishuWebhookPayload(value: unknown): value is Record<string, unknown> {
3841
return !!value && typeof value === "object" && !Array.isArray(value);
@@ -120,6 +123,14 @@ function formatFeishuWsErrorForLog(err: unknown): string {
120123
return `${redacted.slice(0, FEISHU_WS_LOG_ERROR_MAX_LENGTH)}...`;
121124
}
122125

126+
/**
 * Reports whether a WebSocket error should be treated as terminal.
 *
 * The check is purely textual: the trimmed `err.message` is terminal when it matches
 * FEISHU_WS_RECONNECT_EXHAUSTED_RE or starts with FEISHU_WS_AUTORECONNECT_DISABLED_ERROR.
 *
 * @param err - Error whose message is inspected.
 * @returns `true` for the two terminal message forms, `false` for every other message.
 */
function isFeishuWsTerminalError(err: Error): boolean {
127+
const message = err.message.trim();
128+
return (
129+
FEISHU_WS_RECONNECT_EXHAUSTED_RE.test(message) ||
130+
message.startsWith(FEISHU_WS_AUTORECONNECT_DISABLED_ERROR)
131+
);
132+
}
133+
123134
function cleanupFeishuWsClient(params: {
124135
accountId: string;
125136
wsClient?: Lark.WSClient;
@@ -199,9 +210,19 @@ export async function monitorWebSocket({
199210
const terminalError = new Promise<Error>((resolve) => {
200211
reportTerminalError = resolve;
201212
});
213+
// Error callback handed to createFeishuWSClient as onError.
// Terminal errors (per isFeishuWsTerminalError) resolve the terminalError promise via
// reportTerminalError; any other error is only logged through error() and nothing else is done here.
const handleWsError = (err: Error) => {
214+
if (isFeishuWsTerminalError(err)) {
215+
reportTerminalError(err);
216+
return;
217+
}
218+
219+
// Non-terminal: log the error text after passing it through formatFeishuWsErrorForLog.
error(
220+
`feishu[${accountId}]: WebSocket SDK reported recoverable error: ${formatFeishuWsErrorForLog(err)}`,
221+
);
222+
};
202223
log(`feishu[${accountId}]: starting WebSocket connection...`);
203224
wsClient = await createFeishuWSClient(account, {
204-
onError: reportTerminalError,
225+
onError: handleWsError,
205226
});
206227
if (abortSignal?.aborted) {
207228
cleanupFeishuWsClient({ accountId, wsClient, error, clearIdentity: true });

0 commit comments

Comments
 (0)