Skip to content

Commit e8f9c3e

Browse files
authored
fix(voice-call): stabilize Twilio STT startup (#75257)
Fix Twilio voice-call startup so accepted media streams register immediately, realtime transcription readiness gates only the initial greeting, and early inbound media is preserved while STT connects. Fixes #75197. Thanks @PfanP and @donkeykong91.
1 parent 4ea0556 commit e8f9c3e

6 files changed

Lines changed: 268 additions & 14 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
1313

1414
### Fixes
1515

16+
- Voice Call/Twilio: register accepted media streams immediately but wait for realtime transcription readiness before speaking the initial greeting, so reconnect grace handling stays live and OpenAI STT startup is no longer starved by TTS. Fixes #75197. (#75257) Thanks @donkeykong91 and @PfanP.
1617
- Agents/pi-embedded-runner: extract the `abortable` provider-call wrapper from `runEmbeddedAttempt` to module scope so its promise handlers no longer close over the run lexical context, releasing transcripts, tool buffers, and subscription callbacks when a provider call hangs past abort. (#74182) Thanks @cjboy007.
1718
- Docker: restore `python3` in the gateway runtime image after the slim-runtime switch. Fixes #75041.
1819
- CLI/Voice Call: scope `voicecall` command activation to the Voice Call plugin so setup and smoke checks no longer broad-load unrelated plugin runtimes or hang after printing JSON. Thanks @vincentkoc.

docs/plugins/voice-call.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,7 @@ Current runtime behavior:
297297
- `streaming.provider` is optional. If unset, Voice Call uses the first registered realtime transcription provider.
298298
- Bundled realtime transcription providers: Deepgram (`deepgram`), ElevenLabs (`elevenlabs`), Mistral (`mistral`), OpenAI (`openai`), and xAI (`xai`), registered by their provider plugins.
299299
- Provider-owned raw config lives under `streaming.providers.<providerId>`.
300+
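- After Voice Call accepts a Twilio stream `start` message, it registers the stream immediately, hands inbound media to the transcription provider session so early audio is preserved while the provider connects, and starts the initial greeting only after realtime transcription is ready. If the provider fails to connect, Voice Call closes the media stream with WebSocket close code `1011` (`STT connection failed`).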
300301
- If `streaming.provider` points at an unregistered provider, or none is registered, Voice Call logs a warning and skips media streaming instead of failing the whole plugin.
301302

302303
### Streaming provider examples

extensions/voice-call/src/media-stream.test.ts

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,20 @@ const flush = async (): Promise<void> => {
3333
await new Promise((resolve) => setTimeout(resolve, 0));
3434
};
3535

36+
const createDeferred = (): {
37+
promise: Promise<void>;
38+
resolve: () => void;
39+
reject: (error: Error) => void;
40+
} => {
41+
let resolve!: () => void;
42+
let reject!: (error: Error) => void;
43+
const promise = new Promise<void>((resolvePromise, rejectPromise) => {
44+
resolve = resolvePromise;
45+
reject = rejectPromise;
46+
});
47+
return { promise, resolve, reject };
48+
};
49+
3650
const waitForAbort = (signal: AbortSignal): Promise<void> =>
3751
new Promise((resolve) => {
3852
if (signal.aborted) {
@@ -502,6 +516,211 @@ describe("MediaStreamHandler security hardening", () => {
502516
}
503517
});
504518

519+
it("keeps accepted streams alive while STT readiness exceeds the pre-start timeout", async () => {
520+
const sttReady = createDeferred();
521+
const sttConnectStarted = createDeferred();
522+
const transcriptionReady = createDeferred();
523+
const events: string[] = [];
524+
525+
const session: RealtimeTranscriptionSession = {
526+
connect: async () => {
527+
events.push("stt-connect-start");
528+
sttConnectStarted.resolve();
529+
await sttReady.promise;
530+
events.push("stt-connect-ready");
531+
},
532+
sendAudio: () => {},
533+
close: () => {},
534+
isConnected: () => false,
535+
};
536+
537+
const handler = new MediaStreamHandler({
538+
transcriptionProvider: {
539+
createSession: () => session,
540+
id: "openai",
541+
label: "OpenAI",
542+
isConfigured: () => true,
543+
},
544+
providerConfig: {},
545+
preStartTimeoutMs: 40,
546+
shouldAcceptStream: () => true,
547+
onConnect: () => {
548+
events.push("onConnect");
549+
},
550+
onTranscriptionReady: () => {
551+
events.push("onTranscriptionReady");
552+
transcriptionReady.resolve();
553+
},
554+
});
555+
const server = await startWsServer(handler);
556+
557+
try {
558+
const ws = await connectWs(server.url);
559+
ws.send(
560+
JSON.stringify({
561+
event: "start",
562+
streamSid: "MZ-slow-stt",
563+
start: { callSid: "CA-slow-stt" },
564+
}),
565+
);
566+
567+
await withTimeout(sttConnectStarted.promise);
568+
await new Promise((resolve) => setTimeout(resolve, 80));
569+
expect(ws.readyState).toBe(WebSocket.OPEN);
570+
expect(events).toEqual(["onConnect", "stt-connect-start"]);
571+
572+
sttReady.resolve();
573+
await withTimeout(transcriptionReady.promise);
574+
expect(events).toEqual([
575+
"onConnect",
576+
"stt-connect-start",
577+
"stt-connect-ready",
578+
"onTranscriptionReady",
579+
]);
580+
581+
ws.close();
582+
await waitForClose(ws);
583+
} finally {
584+
await server.close();
585+
}
586+
});
587+
588+
it("forwards early Twilio media into the STT session before readiness", async () => {
589+
const sttReady = createDeferred();
590+
const sttConnectStarted = createDeferred();
591+
const transcriptionReady = createDeferred();
592+
const receivedAudio: Buffer[] = [];
593+
let onConnectCalls = 0;
594+
let onTranscriptionReadyCalls = 0;
595+
596+
const session: RealtimeTranscriptionSession = {
597+
connect: async () => {
598+
sttConnectStarted.resolve();
599+
await sttReady.promise;
600+
},
601+
sendAudio: (audio) => {
602+
receivedAudio.push(Buffer.from(audio));
603+
},
604+
close: () => {},
605+
isConnected: () => false,
606+
};
607+
608+
const handler = new MediaStreamHandler({
609+
transcriptionProvider: {
610+
createSession: () => session,
611+
id: "openai",
612+
label: "OpenAI",
613+
isConfigured: () => true,
614+
},
615+
providerConfig: {},
616+
shouldAcceptStream: () => true,
617+
onConnect: () => {
618+
onConnectCalls += 1;
619+
},
620+
onTranscriptionReady: () => {
621+
onTranscriptionReadyCalls += 1;
622+
transcriptionReady.resolve();
623+
},
624+
});
625+
const server = await startWsServer(handler);
626+
627+
try {
628+
const ws = await connectWs(server.url);
629+
ws.send(
630+
JSON.stringify({
631+
event: "start",
632+
streamSid: "MZ-early-media",
633+
start: { callSid: "CA-early-media" },
634+
}),
635+
);
636+
637+
await withTimeout(sttConnectStarted.promise);
638+
ws.send(
639+
JSON.stringify({
640+
event: "media",
641+
streamSid: "MZ-early-media",
642+
media: { payload: Buffer.from("early").toString("base64") },
643+
}),
644+
);
645+
await flush();
646+
647+
expect(Buffer.concat(receivedAudio).toString()).toBe("early");
648+
expect(onConnectCalls).toBe(1);
649+
expect(onTranscriptionReadyCalls).toBe(0);
650+
651+
sttReady.resolve();
652+
await withTimeout(transcriptionReady.promise);
653+
expect(onConnectCalls).toBe(1);
654+
expect(onTranscriptionReadyCalls).toBe(1);
655+
656+
ws.close();
657+
await waitForClose(ws);
658+
} finally {
659+
await server.close();
660+
}
661+
});
662+
663+
it("closes the media stream and disconnects once when STT readiness fails", async () => {
664+
const sttConnectStarted = createDeferred();
665+
const onDisconnectReady = createDeferred();
666+
const onConnect = vi.fn();
667+
const onTranscriptionReady = vi.fn();
668+
const onDisconnect = vi.fn(() => {
669+
onDisconnectReady.resolve();
670+
});
671+
672+
const session: RealtimeTranscriptionSession = {
673+
connect: async () => {
674+
sttConnectStarted.resolve();
675+
throw new Error("provider unavailable");
676+
},
677+
sendAudio: () => {},
678+
close: vi.fn(),
679+
isConnected: () => false,
680+
};
681+
682+
const handler = new MediaStreamHandler({
683+
transcriptionProvider: {
684+
createSession: () => session,
685+
id: "openai",
686+
label: "OpenAI",
687+
isConfigured: () => true,
688+
},
689+
providerConfig: {},
690+
shouldAcceptStream: () => true,
691+
onConnect,
692+
onTranscriptionReady,
693+
onDisconnect,
694+
});
695+
const server = await startWsServer(handler);
696+
697+
try {
698+
const ws = await connectWs(server.url);
699+
ws.send(
700+
JSON.stringify({
701+
event: "start",
702+
streamSid: "MZ-stt-fail",
703+
start: { callSid: "CA-stt-fail" },
704+
}),
705+
);
706+
707+
await withTimeout(sttConnectStarted.promise);
708+
const closed = await waitForClose(ws);
709+
await withTimeout(onDisconnectReady.promise);
710+
711+
expect(closed.code).toBe(1011);
712+
expect(closed.reason).toBe("STT connection failed");
713+
expect(onConnect).toHaveBeenCalledTimes(1);
714+
expect(onConnect).toHaveBeenCalledWith("CA-stt-fail", "MZ-stt-fail");
715+
expect(onTranscriptionReady).not.toHaveBeenCalled();
716+
expect(onDisconnect).toHaveBeenCalledTimes(1);
717+
expect(onDisconnect).toHaveBeenCalledWith("CA-stt-fail", "MZ-stt-fail");
718+
expect(session.close).toHaveBeenCalledTimes(1);
719+
} finally {
720+
await server.close();
721+
}
722+
});
723+
505724
it("rejects oversized pre-start frames at the websocket maxPayload guard before validation runs", async () => {
506725
const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
507726
[];

extensions/voice-call/src/media-stream.ts

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ export interface MediaStreamConfig {
4242
onPartialTranscript?: (callId: string, partial: string) => void;
4343
/** Callback when stream connects */
4444
onConnect?: (callId: string, streamSid: string) => void;
45+
/** Callback when realtime transcription is ready for the stream */
46+
onTranscriptionReady?: (callId: string, streamSid: string) => void;
4547
/** Callback when speech starts (barge-in) */
4648
onSpeechStart?: (callId: string) => void;
4749
/** Callback when stream disconnects */
@@ -213,7 +215,7 @@ export class MediaStreamHandler {
213215
break;
214216

215217
case "start":
216-
session = await this.handleStart(ws, message, streamToken);
218+
session = this.handleStart(ws, message, streamToken);
217219
if (session) {
218220
this.clearPendingConnection(ws);
219221
}
@@ -263,11 +265,11 @@ export class MediaStreamHandler {
263265
/**
264266
* Handle stream start event.
265267
*/
266-
private async handleStart(
268+
private handleStart(
267269
ws: WebSocket,
268270
message: TwilioMediaMessage,
269271
streamToken?: string,
270-
): Promise<StreamSession | null> {
272+
): StreamSession | null {
271273
const streamSid = message.streamSid || "";
272274
const callSid = message.start?.callSid || "";
273275

@@ -315,18 +317,42 @@ export class MediaStreamHandler {
315317
};
316318

317319
this.sessions.set(streamSid, session);
318-
319-
// Notify connection BEFORE STT connect so TTS can work even if STT fails
320320
this.config.onConnect?.(callSid, streamSid);
321-
322-
// Connect to transcription service (non-blocking, log errors but don't fail the call)
323-
sttSession.connect().catch((err) => {
324-
console.warn(`[MediaStream] STT connection failed (TTS still works):`, err.message);
325-
});
321+
void this.connectTranscriptionAndNotify(session);
326322

327323
return session;
328324
}
329325

326+
private async connectTranscriptionAndNotify(session: StreamSession): Promise<void> {
327+
try {
328+
await session.sttSession.connect();
329+
} catch (error) {
330+
console.warn(
331+
"[MediaStream] STT connection failed; closing media stream:",
332+
error instanceof Error ? error.message : String(error),
333+
);
334+
if (
335+
this.sessions.get(session.streamSid) === session &&
336+
session.ws.readyState === WebSocket.OPEN
337+
) {
338+
session.ws.close(1011, "STT connection failed");
339+
} else {
340+
session.sttSession.close();
341+
}
342+
return;
343+
}
344+
345+
if (
346+
this.sessions.get(session.streamSid) !== session ||
347+
session.ws.readyState !== WebSocket.OPEN
348+
) {
349+
session.sttSession.close();
350+
return;
351+
}
352+
353+
this.config.onTranscriptionReady?.(session.callId, session.streamSid);
354+
}
355+
330356
/**
331357
* Handle stream stop event.
332358
*/

extensions/voice-call/src/webhook.test.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,7 +1159,7 @@ describe("VoiceCallWebhookServer stream disconnect grace", () => {
11591159
processEvent: vi.fn(),
11601160
} as unknown as CallManager;
11611161

1162-
let currentStreamSid: string | null = "MZ-new";
1162+
let currentStreamSid: string | null = "MZ-old";
11631163
const twilioProvider = createTwilioStreamingProvider({
11641164
registerCallStream: (_callSid: string, streamSid: string) => {
11651165
currentStreamSid = streamSid;
@@ -1195,16 +1195,23 @@ describe("VoiceCallWebhookServer stream disconnect grace", () => {
11951195
config: {
11961196
onDisconnect?: (providerCallId: string, streamSid: string) => void;
11971197
onConnect?: (providerCallId: string, streamSid: string) => void;
1198+
onTranscriptionReady?: (providerCallId: string, streamSid: string) => void;
11981199
};
11991200
};
12001201
if (!mediaHandler) {
12011202
throw new Error("expected webhook server to expose a media stream handler");
12021203
}
12031204

1204-
mediaHandler.config.onConnect?.("CA-stream-1", "MZ-new");
12051205
mediaHandler.config.onDisconnect?.("CA-stream-1", "MZ-old");
1206+
await vi.advanceTimersByTimeAsync(1_000);
1207+
mediaHandler.config.onConnect?.("CA-stream-1", "MZ-new");
12061208
await vi.advanceTimersByTimeAsync(2_100);
12071209
expect(endCall).not.toHaveBeenCalled();
1210+
expect(speakInitialMessage).not.toHaveBeenCalled();
1211+
1212+
mediaHandler.config.onTranscriptionReady?.("CA-stream-1", "MZ-new");
1213+
expect(speakInitialMessage).toHaveBeenCalledTimes(1);
1214+
expect(speakInitialMessage).toHaveBeenCalledWith("CA-stream-1");
12081215

12091216
mediaHandler.config.onDisconnect?.("CA-stream-1", "MZ-new");
12101217
await vi.advanceTimersByTimeAsync(2_100);

extensions/voice-call/src/webhook.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,8 @@ export class VoiceCallWebhookServer {
383383
if (this.provider.name === "twilio") {
384384
(this.provider as TwilioProvider).registerCallStream(callId, streamSid);
385385
}
386-
387-
// Speak initial message immediately (no delay) to avoid stream timeout
386+
},
387+
onTranscriptionReady: (callId) => {
388388
this.manager.speakInitialMessage(callId).catch((err) => {
389389
console.warn(`[voice-call] Failed to speak initial message:`, err);
390390
});

0 commit comments

Comments
 (0)