Skip to content

Commit bef0ba8

Browse files
authored
refactor: reuse realtime output activity in google meet (#86665)
1 parent 84929e4 commit bef0ba8

6 files changed

Lines changed: 57 additions & 27 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai
77
### Changes
88

99
- Voice: expose shared realtime turn-context tracking through the realtime voice SDK and reuse it for Discord speaker attribution and wake-name context recovery.
10+
- Voice: reuse shared realtime output activity tracking in Google Meet command and node audio bridges, including recent-output checks for local barge-in detection.
1011
- Voice: expose shared realtime output activity tracking through the realtime voice SDK and reuse it for Discord playback activity and barge-in decisions.
1112
- Voice: expose shared realtime consult question matching, speakable-result extraction, and alias-aware forced-consult coordination through the realtime voice SDK, then reuse it in Gateway Talk, Voice Call, and Discord voice paths.
1213
- Voice: share activation-name matching and consult-transcript screening through the realtime voice SDK so Discord, browser voice, and meeting surfaces can reuse one implementation.
Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
1-
8712c831d993980a72d7a628cd235654a5e7dcf74edda0c9f0c9c9c5ba250afc plugin-sdk-api-baseline.json
2-
f6bf4178e5429f64943f5404f3085c360a7f390c15d15b5b9c23239c7a134ca6 plugin-sdk-api-baseline.jsonl
1+
5ea1c7850cd69d5cfb6817148ffff622a52c6e0a2306e0ae71b6f451ad54ac2c plugin-sdk-api-baseline.json
2+
df2ca60d91db5c5b0225286938a175c3b56feb3190b613a6524864471efa4588 plugin-sdk-api-baseline.jsonl

extensions/google-meet/src/realtime-node.ts

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ import {
99
createRealtimeVoiceAgentTalkbackQueue,
1010
createTalkSessionController,
1111
createRealtimeVoiceBridgeSession,
12+
createRealtimeVoiceOutputActivityTracker,
1213
recordTalkObservabilityEvent,
1314
type RealtimeVoiceAgentTalkbackQueue,
1415
type RealtimeVoiceBridgeSession,
@@ -27,7 +28,7 @@ import {
2728
getGoogleMeetRealtimeTranscriptHealth,
2829
buildGoogleMeetSpeakExactUserMessage,
2930
GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS,
30-
extendGoogleMeetOutputEchoSuppression,
31+
recordGoogleMeetOutputActivity,
3132
getGoogleMeetRealtimeEventHealth,
3233
recordGoogleMeetRealtimeTranscript,
3334
recordGoogleMeetRealtimeEvent,
@@ -97,7 +98,7 @@ export async function startNodeAgentAudioBridge(params: {
9798
let lastInputAt: string | undefined;
9899
let lastOutputAt: string | undefined;
99100
let lastInputBytes = 0;
100-
let lastOutputBytes = 0;
101+
const outputActivity = createRealtimeVoiceOutputActivityTracker();
101102
let suppressedInputBytes = 0;
102103
let lastSuppressedInputAt: string | undefined;
103104
let suppressInputUntil = 0;
@@ -148,7 +149,8 @@ export async function startNodeAgentAudioBridge(params: {
148149
};
149150

150151
const pushOutputAudio = async (audio: Buffer) => {
151-
const suppression = extendGoogleMeetOutputEchoSuppression({
152+
const suppression = recordGoogleMeetOutputActivity({
153+
tracker: outputActivity,
152154
audio,
153155
audioFormat: params.config.chrome.audioFormat,
154156
nowMs: Date.now(),
@@ -158,7 +160,6 @@ export async function startNodeAgentAudioBridge(params: {
158160
suppressInputUntil = suppression.suppressInputUntilMs;
159161
lastOutputPlayableUntilMs = suppression.lastOutputPlayableUntilMs;
160162
lastOutputAt = new Date().toISOString();
161-
lastOutputBytes += audio.byteLength;
162163
await params.runtime.nodes.invoke({
163164
nodeId: params.nodeId,
164165
command: "googlemeet.chrome",
@@ -317,12 +318,12 @@ export async function startNodeAgentAudioBridge(params: {
317318
providerConnected: sttSession?.isConnected() ?? false,
318319
realtimeReady,
319320
audioInputActive: lastInputBytes > 0,
320-
audioOutputActive: lastOutputBytes > 0,
321+
audioOutputActive: outputActivity.isActive(),
321322
lastInputAt,
322323
lastOutputAt,
323324
lastSuppressedInputAt,
324325
lastInputBytes,
325-
lastOutputBytes,
326+
lastOutputBytes: outputActivity.snapshot().sinkAudioBytes,
326327
suppressedInputBytes,
327328
...getGoogleMeetRealtimeTranscriptHealth(transcript),
328329
consecutiveInputErrors,
@@ -351,7 +352,7 @@ export async function startNodeRealtimeAudioBridge(params: {
351352
let lastOutputAt: string | undefined;
352353
let lastClearAt: string | undefined;
353354
let lastInputBytes = 0;
354-
let lastOutputBytes = 0;
355+
const outputActivity = createRealtimeVoiceOutputActivityTracker();
355356
let suppressedInputBytes = 0;
356357
let lastSuppressedInputAt: string | undefined;
357358
let suppressInputUntil = 0;
@@ -505,7 +506,8 @@ export async function startNodeRealtimeAudioBridge(params: {
505506
turnId,
506507
payload: { byteLength: audio.byteLength },
507508
});
508-
const suppression = extendGoogleMeetOutputEchoSuppression({
509+
const suppression = recordGoogleMeetOutputActivity({
510+
tracker: outputActivity,
509511
audio,
510512
audioFormat: params.config.chrome.audioFormat,
511513
nowMs: Date.now(),
@@ -515,7 +517,6 @@ export async function startNodeRealtimeAudioBridge(params: {
515517
suppressInputUntil = suppression.suppressInputUntilMs;
516518
lastOutputPlayableUntilMs = suppression.lastOutputPlayableUntilMs;
517519
lastOutputAt = new Date().toISOString();
518-
lastOutputBytes += audio.byteLength;
519520
void params.runtime.nodes
520521
.invoke({
521522
nodeId: params.nodeId,
@@ -759,13 +760,13 @@ export async function startNodeRealtimeAudioBridge(params: {
759760
providerConnected: bridge?.bridge.isConnected() ?? false,
760761
realtimeReady,
761762
audioInputActive: lastInputBytes > 0,
762-
audioOutputActive: lastOutputBytes > 0,
763+
audioOutputActive: outputActivity.isActive(),
763764
lastInputAt,
764765
lastOutputAt,
765766
lastSuppressedInputAt,
766767
lastClearAt,
767768
lastInputBytes,
768-
lastOutputBytes,
769+
lastOutputBytes: outputActivity.snapshot().sinkAudioBytes,
769770
suppressedInputBytes,
770771
...getGoogleMeetRealtimeTranscriptHealth(transcript),
771772
...getGoogleMeetRealtimeEventHealth(realtimeEvents),

extensions/google-meet/src/realtime.ts

Lines changed: 33 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ import {
1313
import {
1414
createRealtimeVoiceAgentTalkbackQueue,
1515
createRealtimeVoiceBridgeSession,
16+
createRealtimeVoiceOutputActivityTracker,
1617
createTalkSessionController,
1718
convertPcmToMulaw8k,
1819
extendRealtimeVoiceOutputEchoSuppression,
@@ -30,6 +31,7 @@ import {
3031
type RealtimeVoiceAgentTalkbackQueue,
3132
type RealtimeVoiceBridgeEventLogEntry,
3233
type RealtimeVoiceBridgeSession,
34+
type RealtimeVoiceOutputActivityTracker,
3335
type RealtimeVoiceProviderConfig,
3436
type RealtimeVoiceProviderPlugin,
3537
type RealtimeVoiceTranscriptEntry,
@@ -163,6 +165,24 @@ export function extendGoogleMeetOutputEchoSuppression(params: {
163165
});
164166
}
165167

168+
export function recordGoogleMeetOutputActivity(params: {
169+
tracker: RealtimeVoiceOutputActivityTracker;
170+
audio: Buffer;
171+
audioFormat: GoogleMeetConfig["chrome"]["audioFormat"];
172+
nowMs: number;
173+
lastOutputPlayableUntilMs: number;
174+
suppressInputUntilMs: number;
175+
}): { lastOutputPlayableUntilMs: number; suppressInputUntilMs: number; durationMs: number } {
176+
const suppression = extendGoogleMeetOutputEchoSuppression(params);
177+
params.tracker.markPlaybackStarted();
178+
params.tracker.markAudio({
179+
audioMs: suppression.durationMs,
180+
sourceAudioBytes: params.audio.byteLength,
181+
sinkAudioBytes: params.audio.byteLength,
182+
});
183+
return suppression;
184+
}
185+
166186
export function resolveGoogleMeetRealtimeAudioFormat(config: GoogleMeetConfig) {
167187
return config.chrome.audioFormat === "g711-ulaw-8khz"
168188
? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ
@@ -477,7 +497,7 @@ export async function startCommandAgentAudioBridge(params: {
477497
let lastInputAt: string | undefined;
478498
let lastOutputAt: string | undefined;
479499
let lastInputBytes = 0;
480-
let lastOutputBytes = 0;
500+
const outputActivity = createRealtimeVoiceOutputActivityTracker();
481501
let suppressedInputBytes = 0;
482502
let lastSuppressedInputAt: string | undefined;
483503
let suppressInputUntil = 0;
@@ -606,7 +626,8 @@ export async function startCommandAgentAudioBridge(params: {
606626
});
607627

608628
const writeOutputAudio = (audio: Buffer) => {
609-
const suppression = extendGoogleMeetOutputEchoSuppression({
629+
const suppression = recordGoogleMeetOutputActivity({
630+
tracker: outputActivity,
610631
audio,
611632
audioFormat: params.config.chrome.audioFormat,
612633
nowMs: Date.now(),
@@ -616,7 +637,6 @@ export async function startCommandAgentAudioBridge(params: {
616637
suppressInputUntil = suppression.suppressInputUntilMs;
617638
lastOutputPlayableUntilMs = suppression.lastOutputPlayableUntilMs;
618639
lastOutputAt = new Date().toISOString();
619-
lastOutputBytes += audio.byteLength;
620640
emitTalkEvent({
621641
type: "output.audio.delta",
622642
turnId: ensureTalkTurn(),
@@ -787,12 +807,12 @@ export async function startCommandAgentAudioBridge(params: {
787807
providerConnected: sttSession?.isConnected() ?? false,
788808
realtimeReady,
789809
audioInputActive: lastInputBytes > 0,
790-
audioOutputActive: lastOutputBytes > 0,
810+
audioOutputActive: outputActivity.isActive(),
791811
lastInputAt,
792812
lastOutputAt,
793813
lastSuppressedInputAt,
794814
lastInputBytes,
795-
lastOutputBytes,
815+
lastOutputBytes: outputActivity.snapshot().sinkAudioBytes,
796816
suppressedInputBytes,
797817
...getGoogleMeetRealtimeTranscriptHealth(transcript),
798818
recentTalkEvents: summarizeGoogleMeetTalkEvents(recentTalkEvents),
@@ -833,19 +853,19 @@ export async function startCommandRealtimeAudioBridge(params: {
833853
let lastInputAt: string | undefined;
834854
let lastOutputAt: string | undefined;
835855
let lastInputBytes = 0;
836-
let lastOutputBytes = 0;
856+
const outputActivity = createRealtimeVoiceOutputActivityTracker();
837857
let lastClearAt: string | undefined;
838858
let clearCount = 0;
839859
let suppressedInputBytes = 0;
840860
let lastSuppressedInputAt: string | undefined;
841861
let suppressInputUntil = 0;
842-
let lastOutputAtMs = 0;
843862
let lastOutputPlayableUntilMs = 0;
844863
let bargeInInputProcess: BridgeProcess | undefined;
845864
let agentTalkback: RealtimeVoiceAgentTalkbackQueue | undefined;
846865

847866
const suppressInputForOutput = (audio: Buffer) => {
848-
const suppression = extendGoogleMeetOutputEchoSuppression({
867+
const suppression = recordGoogleMeetOutputActivity({
868+
tracker: outputActivity,
849869
audio,
850870
audioFormat: params.config.chrome.audioFormat,
851871
nowMs: Date.now(),
@@ -970,12 +990,13 @@ export async function startCommandRealtimeAudioBridge(params: {
970990
stdio: ["ignore", "pipe", "pipe"],
971991
});
972992
bargeInInputProcess.stdout?.on("data", (chunk) => {
973-
if (stopped || lastOutputAtMs === 0) {
993+
if (stopped || !outputActivity.isInterruptible()) {
974994
return;
975995
}
976996
const now = Date.now();
977997
const playbackActive = now <= Math.max(lastOutputPlayableUntilMs, suppressInputUntil);
978-
if (!playbackActive && now - lastOutputAtMs > 1000) {
998+
const lastOutputAudioAt = outputActivity.snapshot().lastAudioAt;
999+
if (!playbackActive && (lastOutputAudioAt === undefined || now - lastOutputAudioAt > 1_000)) {
9791000
return;
9801001
}
9811002
if (now - lastBargeInAt < params.config.chrome.bargeInCooldownMs) {
@@ -1141,9 +1162,7 @@ export async function startCommandRealtimeAudioBridge(params: {
11411162
turnId,
11421163
payload: { byteLength: audio.byteLength },
11431164
});
1144-
lastOutputAtMs = Date.now();
11451165
lastOutputAt = new Date().toISOString();
1146-
lastOutputBytes += audio.byteLength;
11471166
suppressInputForOutput(audio);
11481167
writeOutputAudio(audio);
11491168
},
@@ -1315,12 +1334,12 @@ export async function startCommandRealtimeAudioBridge(params: {
13151334
providerConnected: bridge?.bridge.isConnected() ?? false,
13161335
realtimeReady,
13171336
audioInputActive: lastInputBytes > 0,
1318-
audioOutputActive: lastOutputBytes > 0,
1337+
audioOutputActive: outputActivity.isActive(),
13191338
lastInputAt,
13201339
lastOutputAt,
13211340
lastSuppressedInputAt,
13221341
lastInputBytes,
1323-
lastOutputBytes,
1342+
lastOutputBytes: outputActivity.snapshot().sinkAudioBytes,
13241343
suppressedInputBytes,
13251344
...getGoogleMeetRealtimeTranscriptHealth(transcript),
13261345
...getGoogleMeetRealtimeEventHealth(realtimeEvents),

src/talk/output-activity-tracker.test.ts

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ describe("realtime voice output activity tracker", () => {
77

88
expect(tracker.isActive(false)).toBe(false);
99
expect(tracker.isInterruptible(false)).toBe(false);
10+
expect(tracker.snapshot().lastAudioAt).toBeUndefined();
1011

1112
tracker.markAudio({ audioMs: 10, sourceAudioBytes: 480, sinkAudioBytes: 1_920 });
1213

@@ -17,6 +18,7 @@ describe("realtime voice output activity tracker", () => {
1718
chunks: 1,
1819
sourceAudioBytes: 480,
1920
sinkAudioBytes: 1_920,
21+
lastAudioAt: expect.any(Number),
2022
});
2123
});
2224

@@ -43,6 +45,7 @@ describe("realtime voice output activity tracker", () => {
4345
playbackStarted: true,
4446
playbackStartedAt: 1_000,
4547
streamEnding: true,
48+
lastAudioAt: 1_000,
4649
});
4750
});
4851

src/talk/output-activity-tracker.ts

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ export type RealtimeVoiceOutputActivitySnapshot = {
1515
sinkAudioBytes: number;
1616
playbackStarted: boolean;
1717
streamEnding: boolean;
18+
lastAudioAt?: number;
1819
playbackStartedAt?: number;
1920
};
2021

@@ -41,6 +42,7 @@ export function createRealtimeVoiceOutputActivityTracker(
4142
let sinkAudioBytes = 0;
4243
let playbackStarted = false;
4344
let streamEnding = false;
45+
let lastAudioAt: number | undefined;
4446
let playbackStartedAt: number | undefined;
4547

4648
const snapshot = (): RealtimeVoiceOutputActivitySnapshot => ({
@@ -50,6 +52,7 @@ export function createRealtimeVoiceOutputActivityTracker(
5052
sinkAudioBytes,
5153
playbackStarted,
5254
streamEnding,
55+
...(lastAudioAt === undefined ? {} : { lastAudioAt }),
5356
...(playbackStartedAt === undefined ? {} : { playbackStartedAt }),
5457
});
5558

@@ -58,6 +61,7 @@ export function createRealtimeVoiceOutputActivityTracker(
5861
streamEnding = false;
5962
playbackStarted = false;
6063
playbackStartedAt = undefined;
64+
lastAudioAt = undefined;
6165
},
6266
markStreamEnding() {
6367
streamEnding = true;
@@ -74,6 +78,7 @@ export function createRealtimeVoiceOutputActivityTracker(
7478
sourceAudioBytes += Math.max(0, delta.sourceAudioBytes ?? 0);
7579
sinkAudioBytes += Math.max(0, delta.sinkAudioBytes ?? 0);
7680
chunks += 1;
81+
lastAudioAt = now();
7782
},
7883
reset() {
7984
audioMs = 0;
@@ -82,6 +87,7 @@ export function createRealtimeVoiceOutputActivityTracker(
8287
sinkAudioBytes = 0;
8388
playbackStarted = false;
8489
streamEnding = false;
90+
lastAudioAt = undefined;
8591
playbackStartedAt = undefined;
8692
},
8793
isActive(sinkActive = false) {

0 commit comments

Comments
 (0)