Skip to content

Commit 05bb2b3

Browse files
shushushv and claude committed
fix(talk): show OpenAI Realtime WebRTC assistant transcripts
Handle the current OpenAI Realtime assistant transcript events on the direct WebRTC transport (response.output_audio_transcript.delta/done, response.output_text.*), keep compatibility with the legacy response.audio_transcript.* events, and deduplicate assistant bubbles in the shared conversation aggregation so that both the webrtc and gateway-relay transports benefit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 66b91d7 commit 05bb2b3

4 files changed

Lines changed: 225 additions & 16 deletions

File tree

ui/src/ui/chat/realtime-talk-conversation.test.ts

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,69 @@ describe("realtime Talk conversation", () => {
7878
]);
7979
});
8080

81+
it("keeps a late final assistant transcript in the original assistant bubble", () => {
82+
let state = createRealtimeTalkConversationState();
83+
84+
for (const [index, text] of ["Hey", " there", "!", " What", "’s", " up", "?"].entries()) {
85+
state = updateRealtimeTalkConversation(state, {
86+
role: "assistant",
87+
text,
88+
final: false,
89+
nowMs: index + 1,
90+
});
91+
}
92+
state = updateRealtimeTalkConversation(state, {
93+
role: "user",
94+
text: "Hello",
95+
final: true,
96+
nowMs: 10,
97+
});
98+
state = updateRealtimeTalkConversation(state, {
99+
role: "assistant",
100+
text: "Hey there! What’s up?",
101+
final: true,
102+
nowMs: 11,
103+
});
104+
105+
expect(state.entries).toMatchObject([
106+
{ role: "assistant", text: "Hey there! What’s up?", isStreaming: false },
107+
{ role: "user", text: "Hello", isStreaming: false },
108+
]);
109+
});
110+
111+
it("starts a new assistant bubble when the final transcript arrives past the rewrite window", () => {
112+
let state = createRealtimeTalkConversationState();
113+
114+
for (const [index, text] of ["Hey", " there", "!", " What", "’s", " up", "?"].entries()) {
115+
state = updateRealtimeTalkConversation(state, {
116+
role: "assistant",
117+
text,
118+
final: false,
119+
nowMs: index + 1,
120+
});
121+
}
122+
state = updateRealtimeTalkConversation(state, {
123+
role: "user",
124+
text: "Hello",
125+
final: true,
126+
nowMs: 10,
127+
});
128+
// Past the rewrite grace window the final is treated as a new turn, even when
129+
// its text matches the earlier streamed bubble.
130+
state = updateRealtimeTalkConversation(state, {
131+
role: "assistant",
132+
text: "Hey there! What’s up?",
133+
final: true,
134+
nowMs: 10 + 2_001,
135+
});
136+
137+
expect(state.entries).toMatchObject([
138+
{ role: "assistant", text: "Hey there! What’s up?", isStreaming: false },
139+
{ role: "user", text: "Hello", isStreaming: false },
140+
{ role: "assistant", text: "Hey there! What’s up?", isStreaming: false },
141+
]);
142+
});
143+
81144
it("creates a new bubble for the next final user turn after assistant output starts", () => {
82145
let state = createRealtimeTalkConversationState();
83146

ui/src/ui/chat/realtime-talk-conversation.ts

Lines changed: 62 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,8 @@ export type RealtimeTalkConversationState = {
1515
userEntryAwaitingFinal: boolean;
1616
userEntryAwaitingFinalStartedAtMs: number | null;
1717
assistantEntryId: string | null;
18+
assistantEntryAwaitingFinalId: string | null;
19+
assistantEntryAwaitingFinalStartedAtMs: number | null;
1820
};
1921

2022
export type RealtimeTalkTranscriptUpdate = {
@@ -26,6 +28,10 @@ export type RealtimeTalkTranscriptUpdate = {
2628

2729
const MAX_CONVERSATION_ENTRIES = 60;
2830
const USER_FINAL_REWRITE_GRACE_MS = 1_500;
31+
// A final assistant transcript can land after a user transcript already closed
32+
// the streamed assistant bubble. Within this window we rewrite that bubble
33+
// instead of inserting a duplicate; past it the final is treated as a new turn.
34+
const ASSISTANT_FINAL_REWRITE_GRACE_MS = 2_000;
2935

3036
export function createRealtimeTalkConversationState(): RealtimeTalkConversationState {
3137
return {
@@ -35,6 +41,8 @@ export function createRealtimeTalkConversationState(): RealtimeTalkConversationS
3541
userEntryAwaitingFinal: false,
3642
userEntryAwaitingFinalStartedAtMs: null,
3743
assistantEntryId: null,
44+
assistantEntryAwaitingFinalId: null,
45+
assistantEntryAwaitingFinalStartedAtMs: null,
3846
};
3947
}
4048

@@ -49,10 +57,13 @@ export function updateRealtimeTalkConversation(
4957
const nowMs = update.nowMs ?? Date.now();
5058
if (update.role === "assistant") {
5159
const preparedState = finishRealtimeConversationEntry(state, "user", nowMs);
60+
const assistantEntryId =
61+
preparedState.assistantEntryId ??
62+
resolveLateFinalAssistantEntryId(preparedState, text, update.final, nowMs);
5263
return upsertRealtimeConversationEntry(
5364
preparedState,
5465
update.role,
55-
preparedState.assistantEntryId,
66+
assistantEntryId,
5667
text,
5768
update.final,
5869
nowMs,
@@ -139,7 +150,12 @@ function rememberRealtimeConversationEntry(
139150
userEntryAwaitingFinalStartedAtMs: null,
140151
};
141152
}
142-
return { ...state, assistantEntryId: isFinal ? null : entryId };
153+
return {
154+
...state,
155+
assistantEntryId: isFinal ? null : entryId,
156+
assistantEntryAwaitingFinalId: null,
157+
assistantEntryAwaitingFinalStartedAtMs: null,
158+
};
143159
}
144160

145161
export function finishRealtimeConversationEntry(
@@ -162,7 +178,50 @@ export function finishRealtimeConversationEntry(
162178
userEntryAwaitingFinalStartedAtMs: nowMs,
163179
};
164180
}
165-
return { ...state, entries, assistantEntryId: null };
181+
return {
182+
...state,
183+
entries,
184+
assistantEntryId: null,
185+
assistantEntryAwaitingFinalId: entryId,
186+
assistantEntryAwaitingFinalStartedAtMs: nowMs,
187+
};
188+
}
189+
190+
// Reattach a late final assistant transcript to the bubble its deltas already
191+
// filled, when a user transcript closed it moments earlier. Outside the grace
192+
// window, or when the text is not the same utterance, return null so the caller
193+
// starts a fresh bubble and never folds the next assistant turn into this one.
194+
function resolveLateFinalAssistantEntryId(
195+
state: RealtimeTalkConversationState,
196+
incoming: string,
197+
isFinal: boolean,
198+
nowMs: number,
199+
): string | null {
200+
if (!isFinal || state.assistantEntryAwaitingFinalId === null) {
201+
return null;
202+
}
203+
const elapsed =
204+
state.assistantEntryAwaitingFinalStartedAtMs === null
205+
? Number.POSITIVE_INFINITY
206+
: nowMs - state.assistantEntryAwaitingFinalStartedAtMs;
207+
if (elapsed > ASSISTANT_FINAL_REWRITE_GRACE_MS) {
208+
return null;
209+
}
210+
const entry = state.entries.find(
211+
(candidate) => candidate.id === state.assistantEntryAwaitingFinalId,
212+
);
213+
if (!entry || entry.role !== "assistant") {
214+
return null;
215+
}
216+
const existing = entry.text;
217+
if (
218+
incoming === existing ||
219+
incoming.startsWith(existing) ||
220+
looksLikeTranscriptReplacement(existing, incoming)
221+
) {
222+
return entry.id;
223+
}
224+
return null;
166225
}
167226

168227
function shouldStartNewRealtimeUserEntry(

ui/src/ui/chat/realtime-talk-webrtc.ts

Lines changed: 28 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ type RealtimeServerEvent = {
1919
name?: string;
2020
delta?: string;
2121
transcript?: string;
22+
text?: string;
2223
arguments?: string;
2324
error?: unknown;
2425
response?: {
@@ -171,20 +172,16 @@ export class WebRtcSdpRealtimeTalkTransport implements RealtimeTalkTransport {
171172
}
172173
}
173174
return;
175+
case "conversation.output_transcript.delta":
176+
case "response.output_text.delta":
177+
case "response.audio_transcript.delta":
178+
case "response.output_audio_transcript.delta":
179+
this.emitAssistantTranscript(event, false);
180+
return;
181+
case "response.output_text.done":
174182
case "response.audio_transcript.done":
175-
if (event.transcript) {
176-
this.ctx.callbacks.onTranscript?.({
177-
role: "assistant",
178-
text: event.transcript,
179-
final: true,
180-
});
181-
this.emitTalkEvent({
182-
type: "output.text.done",
183-
final: true,
184-
itemId: event.item_id,
185-
payload: { text: event.transcript },
186-
});
187-
}
183+
case "response.output_audio_transcript.done":
184+
this.emitAssistantTranscript(event, true);
188185
return;
189186
case "response.function_call_arguments.delta":
190187
this.bufferToolDelta(event);
@@ -239,6 +236,24 @@ export class WebRtcSdpRealtimeTalkTransport implements RealtimeTalkTransport {
239236
return status && status !== "completed" ? `Response ${status}` : undefined;
240237
}
241238

239+
private emitAssistantTranscript(event: RealtimeServerEvent, final: boolean): void {
240+
const text = final ? (event.transcript ?? event.text) : event.delta;
241+
if (!text) {
242+
return;
243+
}
244+
this.ctx.callbacks.onTranscript?.({
245+
role: "assistant",
246+
text,
247+
final,
248+
});
249+
this.emitTalkEvent({
250+
type: final ? "output.text.done" : "output.text.delta",
251+
final,
252+
itemId: event.item_id,
253+
payload: { text },
254+
});
255+
}
256+
242257
private extractErrorDetail(error: unknown): string {
243258
if (!error || typeof error !== "object") {
244259
return "Realtime provider error";

ui/src/ui/realtime-talk-webrtc.test.ts

Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -371,6 +371,78 @@ describe("WebRtcSdpRealtimeTalkTransport", () => {
371371
transport.stop();
372372
});
373373

374+
// Audio output sends the final string in `transcript`; text output sends it in
375+
// `text`. Both must surface the same assistant transcript + talk events.
376+
it.each([
377+
{
378+
label: "audio output",
379+
deltaType: "response.output_audio_transcript.delta",
380+
doneType: "response.output_audio_transcript.done",
381+
doneField: { transcript: "hi there" },
382+
},
383+
{
384+
label: "text output",
385+
deltaType: "response.output_text.delta",
386+
doneType: "response.output_text.done",
387+
doneField: { text: "hi there" },
388+
},
389+
])(
390+
"emits assistant transcripts from OpenAI Realtime $label events",
391+
async ({ deltaType, doneType, doneField }) => {
392+
vi.stubGlobal(
393+
"fetch",
394+
vi.fn(async () => new Response("answer-sdp")) as unknown as typeof fetch,
395+
);
396+
const onTranscript = vi.fn();
397+
const onTalkEvent = vi.fn();
398+
const transport = new WebRtcSdpRealtimeTalkTransport(
399+
{
400+
provider: "openai",
401+
transport: "webrtc",
402+
clientSecret: "client-secret-123",
403+
},
404+
{
405+
client: {} as never,
406+
sessionKey: "main",
407+
callbacks: { onTranscript, onTalkEvent },
408+
},
409+
);
410+
411+
await transport.start();
412+
const peer = FakePeerConnection.instances[0];
413+
peer?.channel.dispatchEvent(
414+
new MessageEvent("message", {
415+
data: JSON.stringify({ type: deltaType, item_id: "response-1", delta: "hi" }),
416+
}),
417+
);
418+
peer?.channel.dispatchEvent(
419+
new MessageEvent("message", {
420+
data: JSON.stringify({ type: doneType, item_id: "response-1", ...doneField }),
421+
}),
422+
);
423+
424+
expect(onTranscript).toHaveBeenCalledWith({
425+
role: "assistant",
426+
text: "hi",
427+
final: false,
428+
});
429+
expect(onTranscript).toHaveBeenCalledWith({
430+
role: "assistant",
431+
text: "hi there",
432+
final: true,
433+
});
434+
expect(onTalkEvent.mock.calls.map(([event]) => event.type)).toEqual([
435+
"output.text.delta",
436+
"output.text.done",
437+
]);
438+
expect(onTalkEvent.mock.calls.map(([event]) => event.payload)).toEqual([
439+
{ text: "hi" },
440+
{ text: "hi there" },
441+
]);
442+
transport.stop();
443+
},
444+
);
445+
374446
it("aborts an in-flight OpenAI tool consult when the transport stops", async () => {
375447
vi.stubGlobal(
376448
"fetch",

0 commit comments

Comments
 (0)