Skip to content

Commit 5a8ccb6

Browse files
authored
fix: recover Slack channel restart after stop timeout (#77686)
* fix: recover Slack channel restart after stop timeout * fix: keep recovery restart cancellable
1 parent 123f7a6 commit 5a8ccb6

5 files changed

Lines changed: 128 additions & 13 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ Docs: https://docs.openclaw.ai
7171
- Webhooks/Gmail/Windows: resolve `gcloud`, `gog`, and `tailscale` PATH/PATHEXT shims before setup and watcher spawns, using the Windows-safe `.cmd` wrapper for long-lived `gog serve` processes. (#74881, fixes #54470) Thanks @Angfr95.
7272
- Video generation: accept provider-specific aspect-ratio and resolution hints at the tool boundary, normalize `720P` to MiniMax's supported `768P`, and stop sending Google `generateAudio` on Gemini video requests so provider fallback can recover from model-specific parameter differences. Thanks @vincentkoc.
7373
- Plugins/install: honor the beta update channel for onboarding and doctor-managed plugin installs by requesting floating npm and ClawHub specs with `@beta` while keeping persistent install records on the catalog default. Thanks @vincentkoc.
74+
- Slack: keep health-monitor recovery stops from poisoning manual-stop state after channel stop timeouts, allowing Socket Mode accounts to reconnect after event-loop stalls instead of staying dead until Gateway restart. Fixes #77651. Thanks @Gusty3055.
7475
- WhatsApp/onboarding: canonicalize setup and pairing allowlist entries to WhatsApp's digit-only phone ids while still accepting E.164, JID, and `whatsapp:` inputs, so personal-phone allowlists match WhatsApp Web sender ids after setup. Thanks @vincentkoc.
7576
- Gateway/startup: load provider plugins that own explicitly configured image, video, or music generation defaults so generation tools become live after gateway restart instead of remaining catalog-only. Fixes #77244. Thanks @buyuangtampan, @Nikoxx99, and @vincentkoc.
7677
- Control UI/chat: suppress `HEARTBEAT_OK` acknowledgement history, streams, deltas, and final events before they enter the transcript view, so repeated heartbeat no-op turns do not stack noisy bubbles. Thanks @BunsDev.

src/gateway/channel-health-monitor.test.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ async function expectRestartedChannel(
131131
accountId = "default",
132132
) {
133133
const monitor = await startAndRunCheck(manager);
134-
expect(manager.stopChannel).toHaveBeenCalledWith(channel, accountId);
134+
expect(manager.stopChannel).toHaveBeenCalledWith(channel, accountId, { manual: false });
135135
expect(manager.startChannel).toHaveBeenCalledWith(channel, accountId);
136136
monitor.stop();
137137
}
@@ -286,9 +286,9 @@ describe("channel-health-monitor", () => {
286286
},
287287
);
288288
const monitor = await startAndRunCheck(manager);
289-
expect(manager.stopChannel).toHaveBeenCalledWith("discord", "default");
289+
expect(manager.stopChannel).toHaveBeenCalledWith("discord", "default", { manual: false });
290290
expect(manager.startChannel).toHaveBeenCalledWith("discord", "default");
291-
expect(manager.stopChannel).not.toHaveBeenCalledWith("discord", "quiet");
291+
expect(manager.stopChannel).not.toHaveBeenCalledWith("discord", "quiet", { manual: false });
292292
expect(manager.startChannel).not.toHaveBeenCalledWith("discord", "quiet");
293293
monitor.stop();
294294
});
@@ -308,7 +308,7 @@ describe("channel-health-monitor", () => {
308308
},
309309
});
310310
const monitor = await startAndRunCheck(manager);
311-
expect(manager.stopChannel).toHaveBeenCalledWith("whatsapp", "default");
311+
expect(manager.stopChannel).toHaveBeenCalledWith("whatsapp", "default", { manual: false });
312312
expect(manager.resetRestartAttempts).toHaveBeenCalledWith("whatsapp", "default");
313313
expect(manager.startChannel).toHaveBeenCalledWith("whatsapp", "default");
314314
monitor.stop();
@@ -613,7 +613,7 @@ describe("channel-health-monitor", () => {
613613
const monitor = await startAndRunCheck(manager, {
614614
staleEventThresholdMs: customThreshold,
615615
});
616-
expect(manager.stopChannel).toHaveBeenCalledWith("slack", "default");
616+
expect(manager.stopChannel).toHaveBeenCalledWith("slack", "default", { manual: false });
617617
expect(manager.startChannel).toHaveBeenCalledWith("slack", "default");
618618
monitor.stop();
619619
});

src/gateway/channel-health-monitor.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,9 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann
163163

164164
try {
165165
if (status.running) {
166-
await channelManager.stopChannel(channelId as ChannelId, accountId);
166+
await channelManager.stopChannel(channelId as ChannelId, accountId, {
167+
manual: false,
168+
});
167169
}
168170
channelManager.resetRestartAttempts(channelId as ChannelId, accountId);
169171
await channelManager.startChannel(channelId as ChannelId, accountId);

src/gateway/server-channels.test.ts

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,85 @@ describe("server-channels auto restart", () => {
329329
expect(account?.lastError).toContain("channel stop timed out");
330330
});
331331

332+
it("does not poison auto-restart state when recovery stop times out", async () => {
333+
const releaseFirstTask = createDeferred();
334+
const startAccount = vi.fn(
335+
async ({ abortSignal }: { abortSignal: AbortSignal }) =>
336+
await new Promise<void>((resolve) => {
337+
abortSignal.addEventListener("abort", () => {}, { once: true });
338+
void releaseFirstTask.promise.then(resolve);
339+
}),
340+
);
341+
installTestRegistry(
342+
createTestPlugin({
343+
startAccount,
344+
}),
345+
);
346+
const manager = createManager();
347+
348+
await manager.startChannels();
349+
const stopTask = manager.stopChannel("discord", DEFAULT_ACCOUNT_ID, { manual: false });
350+
await vi.advanceTimersByTimeAsync(5_000);
351+
await stopTask;
352+
await manager.startChannel("discord", DEFAULT_ACCOUNT_ID);
353+
354+
const snapshot = manager.getRuntimeSnapshot();
355+
const account = snapshot.channelAccounts.discord?.[DEFAULT_ACCOUNT_ID];
356+
expect(startAccount).toHaveBeenCalledTimes(1);
357+
expect(account?.running).toBe(false);
358+
expect(account?.restartPending).toBe(true);
359+
expect(account?.lastError).toContain("channel stop timed out");
360+
expect(manager.isManuallyStopped("discord", DEFAULT_ACCOUNT_ID)).toBe(false);
361+
362+
releaseFirstTask.resolve();
363+
await flushMicrotasks();
364+
await vi.advanceTimersByTimeAsync(10);
365+
await flushMicrotasks();
366+
367+
expect(startAccount).toHaveBeenCalledTimes(2);
368+
});
369+
370+
it("lets manual stops cancel recovery backoff after recovery stop times out", async () => {
371+
const releaseFirstTask = createDeferred();
372+
const startAccount = vi.fn(
373+
async ({ abortSignal }: { abortSignal: AbortSignal }) =>
374+
await new Promise<void>((resolve) => {
375+
abortSignal.addEventListener("abort", () => {}, { once: true });
376+
void releaseFirstTask.promise.then(resolve);
377+
}),
378+
);
379+
installTestRegistry(
380+
createTestPlugin({
381+
startAccount,
382+
}),
383+
);
384+
const manager = createManager();
385+
386+
await manager.startChannels();
387+
const recoveryStopTask = manager.stopChannel("discord", DEFAULT_ACCOUNT_ID, {
388+
manual: false,
389+
});
390+
await vi.advanceTimersByTimeAsync(5_000);
391+
await recoveryStopTask;
392+
393+
releaseFirstTask.resolve();
394+
await waitForMicrotaskCondition(
395+
() => hoisted.sleepWithAbort.mock.calls.length > 0,
396+
"expected recovery restart backoff to be scheduled",
397+
);
398+
expect(hoisted.sleepWithAbort).toHaveBeenCalledWith(10, expect.any(AbortSignal));
399+
400+
await manager.stopChannel("discord", DEFAULT_ACCOUNT_ID);
401+
await vi.advanceTimersByTimeAsync(10);
402+
await flushMicrotasks();
403+
404+
const account = manager.getRuntimeSnapshot().channelAccounts.discord?.[DEFAULT_ACCOUNT_ID];
405+
expect(startAccount).toHaveBeenCalledTimes(1);
406+
expect(account?.running).toBe(false);
407+
expect(account?.restartPending).toBe(false);
408+
expect(manager.isManuallyStopped("discord", DEFAULT_ACCOUNT_ID)).toBe(true);
409+
});
410+
332411
it("marks enabled/configured when account descriptors omit them", () => {
333412
installTestRegistry(
334413
createTestPlugin({

src/gateway/server-channels.ts

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -188,11 +188,15 @@ type StartChannelOptions = {
188188
preserveManualStop?: boolean;
189189
};
190190

191+
type StopChannelOptions = {
192+
manual?: boolean;
193+
};
194+
191195
export type ChannelManager = {
192196
getRuntimeSnapshot: () => ChannelRuntimeSnapshot;
193197
startChannels: () => Promise<void>;
194198
startChannel: (channel: ChannelId, accountId?: string) => Promise<void>;
195-
stopChannel: (channel: ChannelId, accountId?: string) => Promise<void>;
199+
stopChannel: (channel: ChannelId, accountId?: string, opts?: StopChannelOptions) => Promise<void>;
196200
markChannelLoggedOut: (channelId: ChannelId, cleared: boolean, accountId?: string) => void;
197201
isManuallyStopped: (channelId: ChannelId, accountId: string) => boolean;
198202
resetRestartAttempts: (channelId: ChannelId, accountId: string) => void;
@@ -216,6 +220,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
216220
const restartAttempts = new Map<string, number>();
217221
// Tracks accounts that were manually stopped so we don't auto-restart them.
218222
const manuallyStopped = new Set<string>();
223+
const recoveryStopTimedOut = new Set<string>();
219224

220225
const restartKey = (channelId: ChannelId, accountId: string) => `${channelId}:${accountId}`;
221226
const ensureChannelLog = (channelId: ChannelId): SubsystemLogger => {
@@ -568,15 +573,24 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
568573
restartPending: true,
569574
reconnectAttempts: attempt,
570575
});
576+
const recoveryRestartSleepAbort = recoveryStopTimedOut.has(rKey)
577+
? new AbortController()
578+
: undefined;
579+
if (recoveryRestartSleepAbort) {
580+
store.aborts.set(id, recoveryRestartSleepAbort);
581+
}
571582
try {
572-
await sleepWithAbort(delayMs, abort.signal);
583+
const restartSleepAbort = recoveryRestartSleepAbort?.signal ?? abort.signal;
584+
await sleepWithAbort(delayMs, restartSleepAbort);
573585
if (manuallyStopped.has(rKey)) {
586+
recoveryStopTimedOut.delete(rKey);
574587
return;
575588
}
589+
recoveryStopTimedOut.delete(rKey);
576590
if (store.tasks.get(id) === trackedPromise) {
577591
store.tasks.delete(id);
578592
}
579-
if (store.aborts.get(id) === abort) {
593+
if (store.aborts.get(id) === (recoveryRestartSleepAbort ?? abort)) {
580594
store.aborts.delete(id);
581595
}
582596
await startChannelInternal(channelId, id, {
@@ -585,6 +599,13 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
585599
});
586600
} catch {
587601
// abort or startup failure — next crash will retry
602+
} finally {
603+
if (recoveryRestartSleepAbort) {
604+
recoveryStopTimedOut.delete(rKey);
605+
if (store.aborts.get(id) === recoveryRestartSleepAbort) {
606+
store.aborts.delete(id);
607+
}
608+
}
588609
}
589610
})
590611
.finally(() => {
@@ -630,7 +651,12 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
630651
await startChannelInternal(channelId, accountId);
631652
};
632653

633-
const stopChannel = async (channelId: ChannelId, accountId?: string) => {
654+
const stopChannel = async (
655+
channelId: ChannelId,
656+
accountId?: string,
657+
opts: StopChannelOptions = {},
658+
) => {
659+
const manual = opts.manual ?? true;
634660
const plugin = getChannelPlugin(channelId);
635661
const store = getStore(channelId);
636662
// Fast path: nothing running and no explicit plugin shutdown hook to run.
@@ -656,7 +682,10 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
656682
if (!abort && !task && !plugin?.gateway?.stopAccount) {
657683
return;
658684
}
659-
manuallyStopped.add(restartKey(channelId, id));
685+
const rKey = restartKey(channelId, id);
686+
if (manual) {
687+
manuallyStopped.add(rKey);
688+
}
660689
abort?.abort();
661690
const log = ensureChannelLog(channelId);
662691
const runtime = ensureChannelRuntime(channelId);
@@ -683,12 +712,16 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
683712
);
684713
setRuntime(channelId, id, {
685714
accountId: id,
686-
running: true,
687-
restartPending: false,
715+
running: manual,
716+
restartPending: !manual,
688717
lastError: `channel stop timed out after ${CHANNEL_STOP_ABORT_TIMEOUT_MS}ms`,
689718
});
719+
if (!manual) {
720+
recoveryStopTimedOut.add(rKey);
721+
}
690722
return;
691723
}
724+
recoveryStopTimedOut.delete(rKey);
692725
store.aborts.delete(id);
693726
store.tasks.delete(id);
694727
setRuntime(channelId, id, {

0 commit comments

Comments
 (0)