Skip to content

Commit 309ff6b

Browse files
scoootscooob authored and steipete committed
perf(voice-call): trim realtime audio copies
1 parent 7fc9a82 commit 309ff6b

3 files changed

Lines changed: 94 additions & 20 deletions

File tree

extensions/google/realtime-voice-provider.test.ts

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -411,6 +411,32 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
411411
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
412412
});
413413

414+
it("fuses telephony mu-law conversion into the Gemini 16 kHz PCM input frame", async () => {
  const provider = buildGoogleRealtimeVoiceProvider();
  const bridge = provider.createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });

  await bridge.connect();
  lastConnectParams().callbacks.onopen();
  lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });

  // Two mu-law bytes in: 0xff decodes to 0 and 0x00 decodes to -32124.
  bridge.sendAudio(Buffer.from([0xff, 0x00]));

  expect(session.sendRealtimeInput).toHaveBeenCalledWith({
    audio: {
      data: expect.any(String),
      mimeType: "audio/pcm;rate=16000",
    },
  });
  const sent = Buffer.from(session.sendRealtimeInput.mock.calls[0]?.[0].audio.data, "base64");
  // Four PCM16 samples out: each decoded sample is followed by the midpoint to
  // its successor, and the last sample is repeated because it has no successor.
  expect(Array.from({ length: sent.length / 2 }, (_, i) => sent.readInt16LE(i * 2))).toEqual([
    0, -16062, -32124, -32124,
  ]);
});
439+
414440
it("accepts PCM16 24 kHz audio without the telephony mu-law hop", async () => {
415441
const provider = buildGoogleRealtimeVoiceProvider();
416442
const bridge = provider.createBridge({

extensions/google/realtime-voice-provider.ts

Lines changed: 49 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,11 @@ const MAX_PENDING_AUDIO_CHUNKS = 320;
5050
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 500;
5151
const GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS = 30 * 60 * 1000;
5252
const GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS = 60 * 1000;
53+
// Lookup table with one entry per possible mu-law byte value (0-255), filled
// once at module load with the decoded signed 16-bit sample for that byte so
// callers can index the table instead of decoding each byte again.
const MULAW_LINEAR_SAMPLES = new Int16Array(256);

for (let i = 0; i < MULAW_LINEAR_SAMPLES.length; i += 1) {
  MULAW_LINEAR_SAMPLES[i] = decodeMulawSample(i);
}
5358

5459
type GoogleRealtimeSensitivity = "low" | "high";
5560
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
@@ -330,6 +335,8 @@ function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): Func
330335

331336
function buildGoogleLiveConnectConfig(config: GoogleRealtimeLiveConfig): LiveConnectConfig {
332337
const functionDeclarations = buildFunctionDeclarations(config.tools);
338+
const realtimeInputConfig = buildRealtimeInputConfig(config);
339+
const thinkingConfig = buildThinkingConfig(config);
333340
return {
334341
responseModalities: ["AUDIO" as Modality],
335342
...(typeof config.temperature === "number" && config.temperature > 0
@@ -344,15 +351,13 @@ function buildGoogleLiveConnectConfig(config: GoogleRealtimeLiveConfig): LiveCon
344351
},
345352
systemInstruction: config.instructions,
346353
...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
347-
...(buildRealtimeInputConfig(config)
348-
? { realtimeInputConfig: buildRealtimeInputConfig(config) }
349-
: {}),
354+
...(realtimeInputConfig ? { realtimeInputConfig } : {}),
350355
inputAudioTranscription: {},
351356
outputAudioTranscription: {},
352357
...(typeof config.enableAffectiveDialog === "boolean"
353358
? { enableAffectiveDialog: config.enableAffectiveDialog }
354359
: {}),
355-
...(buildThinkingConfig(config) ? { thinkingConfig: buildThinkingConfig(config) } : {}),
360+
...(thinkingConfig ? { thinkingConfig } : {}),
356361
};
357362
}
358363

@@ -487,12 +492,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
487492
this.audioStreamEnded = false;
488493
}
489494

490-
const pcm = this.toInputPcm(audio);
491-
const pcm16k = resamplePcm(
492-
pcm,
493-
this.audioFormat.sampleRateHz,
494-
GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
495-
);
495+
const pcm16k = this.toGoogleInputPcm16k(audio);
496496
this.session.sendRealtimeInput({
497497
audio: {
498498
data: pcm16k.toString("base64"),
@@ -617,6 +617,21 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
617617
return this.audioFormat.encoding === "pcm16" ? audio : mulawToPcm(audio);
618618
}
619619

620+
/**
 * Produces the PCM buffer that is sent to Google as realtime input audio.
 *
 * When the configured format is `g711_ulaw` at 8 000 Hz and the Google input
 * rate is 16 000 Hz, the audio goes through `convertMulaw8kToPcm16k` in one
 * step. Every other combination is first passed through `toInputPcm` and then
 * through `resamplePcm` from the configured rate to the Google input rate.
 *
 * @param audio - Incoming audio in the bridge's configured format.
 * @returns The converted buffer for the Google input sample rate.
 */
private toGoogleInputPcm16k(audio: Buffer): Buffer {
  if (
    this.audioFormat.encoding === "g711_ulaw" &&
    this.audioFormat.sampleRateHz === 8_000 &&
    GOOGLE_REALTIME_INPUT_SAMPLE_RATE === 16_000
  ) {
    return convertMulaw8kToPcm16k(audio);
  }
  return resamplePcm(
    this.toInputPcm(audio),
    this.audioFormat.sampleRateHz,
    GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
  );
}
634+
620635
private toOutputAudio(pcm: Buffer, sampleRate: number): Buffer {
621636
return this.audioFormat.encoding === "pcm16"
622637
? resamplePcm(pcm, sampleRate, this.audioFormat.sampleRateHz)
@@ -726,6 +741,30 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
726741
}
727742
}
728743

744+
/**
 * Decodes mu-law bytes to little-endian 16-bit PCM and doubles the sample
 * count in a single pass.
 *
 * Each input byte produces two output samples: the decoded sample itself,
 * followed by the rounded midpoint between it and the next decoded sample.
 * The last input byte has no successor in this buffer, so its second output
 * sample repeats the decoded value.
 *
 * @param muLaw - Mu-law encoded audio, one sample per byte.
 * @returns A buffer of `muLaw.length * 4` bytes (empty when the input is empty).
 */
function convertMulaw8kToPcm16k(muLaw: Buffer): Buffer {
  if (muLaw.length === 0) {
    return Buffer.alloc(0);
  }
  // Two 16-bit samples (4 bytes) are written per input byte.
  const pcm = Buffer.alloc(muLaw.length * 4);
  for (let i = 0; i < muLaw.length; i += 1) {
    const current = MULAW_LINEAR_SAMPLES[muLaw[i] ?? 0] ?? 0;
    // Past the end of the buffer, fall back to the current byte so the
    // midpoint below equals the current sample.
    const next = MULAW_LINEAR_SAMPLES[muLaw[i + 1] ?? muLaw[i] ?? 0] ?? current;
    pcm.writeInt16LE(current, i * 4);
    pcm.writeInt16LE(Math.round((current + next) / 2), i * 4 + 2);
  }
  return pcm;
}
757+
758+
/**
 * Decodes a single mu-law byte into a signed linear sample.
 *
 * The byte is bit-inverted and split into a sign bit (0x80), a 3-bit exponent
 * and a 4-bit mantissa. The magnitude is `((mantissa << 3) + 132) << exponent`
 * minus 132, which yields values from 0 to 32124.
 *
 * @param value - Mu-law code; only the low 8 bits are considered.
 * @returns The decoded sample, in the range -32124 to 32124.
 */
function decodeMulawSample(value: number): number {
  const bias = 132;
  const inverted = ~value & 0xff;
  const isNegative = (inverted & 0x80) !== 0;
  const exponent = (inverted >> 4) & 0x07;
  const mantissa = inverted & 0x0f;
  const magnitude = (((mantissa << 3) + bias) << exponent) - bias;
  // Subtract from zero instead of negating so the "negative zero" code (0x7f)
  // decodes to 0 rather than -0, which strict equality checks tell apart.
  return isNegative ? 0 - magnitude : magnitude;
}
767+
729768
async function createGoogleRealtimeBrowserSession(
730769
req: RealtimeVoiceBrowserSessionCreateRequest,
731770
): Promise<RealtimeVoiceBrowserSession> {

extensions/voice-call/src/webhook/realtime-audio-pacer.ts

Lines changed: 19 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,15 @@
1-
import { mulawToPcm } from "openclaw/plugin-sdk/realtime-voice";
2-
31
const TELEPHONY_SAMPLE_RATE = 8_000;
42
const TELEPHONY_CHUNK_BYTES = 160;
53
const TELEPHONY_CHUNK_MS = 20;
64
const DEFAULT_SPEECH_RMS_THRESHOLD = 0.02;
75
const DEFAULT_REQUIRED_LOUD_CHUNKS = 2;
86
const DEFAULT_REQUIRED_QUIET_CHUNKS = 10;
7+
const PCM16_MAX_AMPLITUDE = 32768;
8+
// Lookup table with one entry per possible mu-law byte value (0-255), filled
// once at module load with the decoded signed 16-bit sample for that byte so
// callers can index the table instead of decoding each byte again.
const MULAW_LINEAR_SAMPLES = new Int16Array(256);

for (let i = 0; i < MULAW_LINEAR_SAMPLES.length; i += 1) {
  MULAW_LINEAR_SAMPLES[i] = decodeMulawSample(i);
}
913

1014
type RealtimeTwilioAudioQueueItem =
1115
| {
@@ -125,17 +129,12 @@ export function calculateMulawRms(muLaw: Buffer): number {
125129
if (muLaw.length === 0) {
126130
return 0;
127131
}
128-
const pcm = mulawToPcm(muLaw);
129-
const samples = Math.floor(pcm.length / 2);
130-
if (samples === 0) {
131-
return 0;
132-
}
133132
let sum = 0;
134-
for (let i = 0; i < samples; i += 1) {
135-
const normalized = pcm.readInt16LE(i * 2) / 32768;
133+
for (let i = 0; i < muLaw.length; i += 1) {
134+
const normalized = (MULAW_LINEAR_SAMPLES[muLaw[i] ?? 0] ?? 0) / PCM16_MAX_AMPLITUDE;
136135
sum += normalized * normalized;
137136
}
138-
return Math.sqrt(sum / samples);
137+
return Math.sqrt(sum / muLaw.length);
139138
}
140139

141140
export class RealtimeMulawSpeechStartDetector {
@@ -174,3 +173,13 @@ export class RealtimeMulawSpeechStartDetector {
174173
return false;
175174
}
176175
}
176+
177+
/**
 * Decodes a single mu-law byte into a signed linear sample.
 *
 * The byte is bit-inverted and split into a sign bit (0x80), a 3-bit exponent
 * and a 4-bit mantissa. The magnitude is `((mantissa << 3) + 132) << exponent`
 * minus 132, which yields values from 0 to 32124.
 *
 * @param value - Mu-law code; only the low 8 bits are considered.
 * @returns The decoded sample, in the range -32124 to 32124.
 */
function decodeMulawSample(value: number): number {
  const bias = 132;
  const inverted = ~value & 0xff;
  const isNegative = (inverted & 0x80) !== 0;
  const exponent = (inverted >> 4) & 0x07;
  const mantissa = inverted & 0x0f;
  const magnitude = (((mantissa << 3) + bias) << exponent) - bias;
  // Subtract from zero instead of negating so the "negative zero" code (0x7f)
  // decodes to 0 rather than -0, which strict equality checks tell apart.
  return isNegative ? 0 - magnitude : magnitude;
}

0 commit comments

Comments
 (0)