Skip to content

Commit 8fe4f34

Browse files
authored
fix: accept leading fuzzy Discord voice wake names (#86484)
1 parent 5d01803 commit 8fe4f34

3 files changed

Lines changed: 288 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai
1414

1515
### Fixes
1616

17+
- Discord/OpenAI voice: accept leading fuzzy wake-name transcripts such as "Monty" or "Moti" for a Molty agent while keeping ambient speech gated.
1718
- Discord/OpenAI voice: rotate Realtime sessions at provider max duration without logging the expected session-expiry event as an error.
1819
- Memory/local embeddings: run local GGUF embeddings in an isolated worker sidecar and degrade to configured fallback or keyword search on worker failure so native embedding crashes do not take down the Gateway. (#85348) Thanks @osolmaz.
1920
- Gateway: clear the runtime config snapshot before `SIGUSR1` in-process restarts so config changes survive the next gateway loop. (#86388) Thanks @XuZehan-iCenter.

extensions/discord/src/voice/manager.e2e.test.ts

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2966,6 +2966,133 @@ describe("DiscordVoiceManager", () => {
29662966
expectUserMessageIncludes("openclaw wake answer");
29672967
});
29682968

2969+
it("accepts leading fuzzy wake names before realtime agent-proxy consults", async () => {
2970+
const manager = createManager(
2971+
{
2972+
groupPolicy: "open",
2973+
voice: {
2974+
enabled: true,
2975+
mode: "agent-proxy",
2976+
realtime: { provider: "openai", consultPolicy: "auto", requireWakeName: true },
2977+
},
2978+
},
2979+
undefined,
2980+
{
2981+
agents: {
2982+
list: [{ id: "agent-1", identity: { name: "Molty" } }],
2983+
},
2984+
},
2985+
);
2986+
2987+
await manager.join({ guildId: "g1", channelId: "1001" });
2988+
const entry = getSessionEntry(manager) as {
2989+
realtime?: {
2990+
beginSpeakerTurn: (
2991+
context: { extraSystemPrompt?: string; senderIsOwner: boolean; speakerLabel: string },
2992+
userId: string,
2993+
) => { close: () => void; sendInputAudio: (audio: Buffer) => void };
2994+
};
2995+
};
2996+
const bridgeParams = lastRealtimeBridgeParams() as
2997+
| {
2998+
onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void;
2999+
}
3000+
| undefined;
3001+
3002+
const montyTurn = entry.realtime?.beginSpeakerTurn(
3003+
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
3004+
"u-owner",
3005+
);
3006+
montyTurn?.sendInputAudio(Buffer.alloc(8));
3007+
bridgeParams?.onTranscript?.("user", "Monty, are you with us?", true);
3008+
await new Promise((resolve) => setTimeout(resolve, 260));
3009+
3010+
expect(agentCommandArgsAt(0).message).toContain("are you with us?");
3011+
expect(agentCommandArgsAt(0).message).not.toContain("Monty");
3012+
3013+
const motiTurn = entry.realtime?.beginSpeakerTurn(
3014+
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
3015+
"u-owner",
3016+
);
3017+
motiTurn?.sendInputAudio(Buffer.alloc(8));
3018+
bridgeParams?.onTranscript?.("user", "Moti, what's going on today?", true);
3019+
await new Promise((resolve) => setTimeout(resolve, 260));
3020+
3021+
expect(agentCommandArgsAt(1).message).toContain("what's going on today?");
3022+
expect(agentCommandArgsAt(1).message).not.toContain("Moti");
3023+
3024+
const openClawTurn = entry.realtime?.beginSpeakerTurn(
3025+
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
3026+
"u-owner",
3027+
);
3028+
openClawTurn?.sendInputAudio(Buffer.alloc(8));
3029+
bridgeParams?.onTranscript?.("user", "Open claw can you still hear me?", true);
3030+
await new Promise((resolve) => setTimeout(resolve, 260));
3031+
3032+
expect(agentCommandArgsAt(2).message).toContain("can you still hear me?");
3033+
expect(agentCommandArgsAt(2).message).not.toContain("Open claw");
3034+
});
3035+
3036+
it("rejects non-wake fuzzy leading phrases before realtime agent-proxy consults", async () => {
3037+
const manager = createManager(
3038+
{
3039+
groupPolicy: "open",
3040+
voice: {
3041+
enabled: true,
3042+
mode: "agent-proxy",
3043+
realtime: { provider: "openai", consultPolicy: "auto", requireWakeName: true },
3044+
},
3045+
},
3046+
undefined,
3047+
{
3048+
agents: {
3049+
list: [{ id: "agent-1", identity: { name: "Molty" } }],
3050+
},
3051+
},
3052+
);
3053+
3054+
await manager.join({ guildId: "g1", channelId: "1001" });
3055+
const entry = getSessionEntry(manager) as {
3056+
realtime?: {
3057+
beginSpeakerTurn: (
3058+
context: { extraSystemPrompt?: string; senderIsOwner: boolean; speakerLabel: string },
3059+
userId: string,
3060+
) => { close: () => void; sendInputAudio: (audio: Buffer) => void };
3061+
};
3062+
};
3063+
const bridgeParams = lastRealtimeBridgeParams() as
3064+
| {
3065+
onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void;
3066+
}
3067+
| undefined;
3068+
3069+
const multiTurn = entry.realtime?.beginSpeakerTurn(
3070+
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
3071+
"u-owner",
3072+
);
3073+
multiTurn?.sendInputAudio(Buffer.alloc(8));
3074+
bridgeParams?.onTranscript?.("user", "Multi, step through the maintainer queue.", true);
3075+
await new Promise((resolve) => setTimeout(resolve, 260));
3076+
3077+
const ambientTurn = entry.realtime?.beginSpeakerTurn(
3078+
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
3079+
"u-owner",
3080+
);
3081+
ambientTurn?.sendInputAudio(Buffer.alloc(8));
3082+
bridgeParams?.onTranscript?.("user", "This is a multi-step maintainer problem.", true);
3083+
await new Promise((resolve) => setTimeout(resolve, 260));
3084+
3085+
const openLawTurn = entry.realtime?.beginSpeakerTurn(
3086+
{ extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
3087+
"u-owner",
3088+
);
3089+
openLawTurn?.sendInputAudio(Buffer.alloc(8));
3090+
bridgeParams?.onTranscript?.("user", "Open law is not the wake phrase.", true);
3091+
await new Promise((resolve) => setTimeout(resolve, 260));
3092+
3093+
expect(agentCommandMock).not.toHaveBeenCalled();
3094+
});
3095+
29693096
it("leaves non-OpenAI agent-proxy realtime auto-response enabled when wake names are requested", async () => {
29703097
resolveConfiguredRealtimeVoiceProviderMock.mockReturnValueOnce({
29713098
provider: { id: "google" },

extensions/discord/src/voice/realtime.ts

Lines changed: 160 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ const DISCORD_REALTIME_FORCED_CONSULT_FALLBACK_DELAY_MS = 200;
6464
const DISCORD_REALTIME_DUPLICATE_ERROR_SUPPRESS_MS = 60_000;
6565
const DISCORD_REALTIME_CONTROL_SPEECH_DEDUPE_MS = 5_000;
6666
const DISCORD_REALTIME_OUTPUT_PLAYBACK_WATCHDOG_MARGIN_MS = 1_500;
67+
// Upper bound on how many leading transcript words are combined into wake-name candidates for fuzzy matching.
const DISCORD_REALTIME_WAKE_NAME_FUZZY_PREFIX_WORDS = 3;
6768
const REALTIME_PCM16_BYTES_PER_SAMPLE = 2;
6869
const DISCORD_RAW_PCM_FRAME_BYTES = 3_840;
6970
const DISCORD_REALTIME_OUTPUT_PREROLL_FRAMES = 25;
@@ -353,6 +354,19 @@ function normalizeWakeName(value: string): string | undefined {
353354
return normalized || undefined;
354355
}
355356

357+
/**
 * Normalizes a heard or configured wake name for comparison.
 *
 * The value is decomposed (NFKD) and stripped of combining marks so accented or
 * full-width letters fold to their ASCII base letter instead of being dropped,
 * then lowercased. Every run of characters outside `[a-z0-9]` (whitespace
 * included) collapses to a single space, and the result is trimmed.
 *
 * @param value Raw transcript slice or configured wake name.
 * @returns The normalized, space-separated name, or `undefined` when nothing
 *   alphanumeric remains.
 */
function normalizeWakeNameCandidate(value: string): string | undefined {
  const normalized = value
    // Split letters from their diacritics and map compatibility forms to ASCII.
    .normalize("NFKD")
    // Remove the combining marks produced by the decomposition.
    .replace(/[\u0300-\u036f]+/g, "")
    .toLowerCase()
    // Whitespace is also outside [a-z0-9], so one pass collapses all separators.
    .replace(/[^a-z0-9]+/g, " ")
    .trim();
  return normalized || undefined;
}
365+
366+
/**
 * Produces the separator-free comparison key for a wake name.
 *
 * The value is lowercased first so mixed-case input keeps its letters; every
 * character outside `[a-z0-9]` is then removed.
 *
 * @param value Wake name, normally already normalized.
 * @returns The lowercase alphanumeric characters of `value`, concatenated.
 */
function compactWakeName(value: string): string {
  return value.toLowerCase().replace(/[^a-z0-9]+/g, "");
}
369+
356370
/**
 * Escapes regular-expression metacharacters so `value` can be embedded in a
 * `RegExp` source and matched literally.
 *
 * @param value Text to escape.
 * @returns `value` with each metacharacter prefixed by a backslash.
 */
function escapeRegExp(value: string): string {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  const escaped = value.replace(metaCharacters, "\\$&");
  return escaped;
}
@@ -384,6 +398,136 @@ function stripLeadingWakeName(text: string, wakeName: string): string {
384398
.trim();
385399
}
386400

401+
/** A run of leading transcript words that may be the spoken wake name. */
type LeadingWakeNameCandidate = {
  /** Normalized text of the leading words that form this candidate. */
  heardName: string;
  /** Index in the original transcript just past the candidate's last word. */
  endIndex: number;
  /** Whether punctuation or end of text directly follows the candidate. */
  strongBoundary: boolean;
};

/** How the heard name matched the configured wake name. */
type WakeNameMatchKind = "exact" | "fuzzy";

/** Transcript accepted by the wake-name gate. */
type AllowedWakeNameTranscriptResult = {
  allowed: true;
  text: string;
  wakeName: string;
  heardName: string;
  match: WakeNameMatchKind;
};

/** Transcript rejected by the wake-name gate. */
type RejectedWakeNameTranscriptResult = { allowed: false; text: string };

/** Outcome of running a transcript through the wake-name gate. */
type WakeNameTranscriptResult = AllowedWakeNameTranscriptResult | RejectedWakeNameTranscriptResult;
411+
412+
/**
 * Collects the leading word runs of a transcript that could be a wake name.
 *
 * An optional opener ("hey", "ok", "okay") is skipped first. Candidates are
 * then built cumulatively: the first word, the first two words, and so on, up
 * to `DISCORD_REALTIME_WAKE_NAME_FUZZY_PREFIX_WORDS` words. Words after the
 * first are only joined while separated by whitespace, apostrophes, or hyphens.
 *
 * @param text Raw transcript text.
 * @returns Candidates ordered from shortest to longest; empty when the text has
 *   no leading alphanumeric word.
 */
function leadingWakeNameCandidates(text: string): LeadingWakeNameCandidate[] {
  const openerMatch = /^\s*(?:(?:hey|ok|okay)(?:\s*[-,:;]+\s*|\s+))?/i.exec(text);
  const nameStart = openerMatch ? openerMatch[0].length : 0;

  // Stateful global pattern: scanning resumes right after the opener.
  const wordPattern = /[a-z0-9]+/gi;
  wordPattern.lastIndex = nameStart;

  const found: LeadingWakeNameCandidate[] = [];
  let previousEnd = nameStart;

  while (found.length < DISCORD_REALTIME_WAKE_NAME_FUZZY_PREFIX_WORDS) {
    const word = wordPattern.exec(text);
    if (!word) {
      break;
    }

    // Only the words after the first must be connected by name-like separators.
    if (found.length > 0) {
      const gap = text.slice(previousEnd, word.index);
      if (!/^[\s'-]+$/.test(gap)) {
        break;
      }
    }

    const wordEnd = word.index + word[0].length;
    const heardName = normalizeWakeNameCandidate(text.slice(nameStart, wordEnd));
    if (!heardName) {
      break;
    }

    // Punctuation or end of text right after the name marks a direct address.
    const strongBoundary = /^\s*([,.:;!?-]|$)/.test(text.slice(wordEnd));
    found.push({ heardName, endIndex: wordEnd, strongBoundary });
    previousEnd = wordEnd;
  }

  return found;
}
450+
451+
/**
 * Computes the Levenshtein edit distance between two strings.
 *
 * Strings are compared by Unicode code point, so a character outside the Basic
 * Multilingual Plane counts as one symbol rather than two UTF-16 code units.
 * Two row buffers are reused across iterations instead of allocating a new row
 * per source character.
 *
 * @param left First string.
 * @param right Second string.
 * @returns The minimum number of single-symbol insertions, deletions, and
 *   substitutions needed to turn `left` into `right`.
 */
function levenshteinDistance(left: string, right: string): number {
  if (left === right) {
    return 0;
  }
  const source = Array.from(left);
  const target = Array.from(right);
  if (source.length === 0) {
    return target.length;
  }
  if (target.length === 0) {
    return source.length;
  }

  // previous[j] holds the distance between the processed source prefix and target[0..j).
  let previous = Array.from({ length: target.length + 1 }, (_, index) => index);
  let current = new Array<number>(target.length + 1).fill(0);

  for (let row = 0; row < source.length; row += 1) {
    current[0] = row + 1;
    for (let col = 0; col < target.length; col += 1) {
      const substitutionCost = source[row] === target[col] ? 0 : 1;
      current[col + 1] = Math.min(
        (current[col] ?? 0) + 1,
        (previous[col + 1] ?? 0) + 1,
        (previous[col] ?? 0) + substitutionCost,
      );
    }
    // Swap buffers so the finished row becomes the reference for the next one.
    [previous, current] = [current, previous];
  }

  return previous[target.length] ?? Math.max(source.length, target.length);
}
477+
478+
/**
 * Decides whether a heard candidate is close enough to a configured wake name.
 *
 * A match requires a wake name of at least five compact characters, a strong
 * boundary after the candidate, and an edit distance of at most one, or of
 * exactly two when the heard and canonical names differ in length.
 *
 * @param candidate Leading transcript candidate.
 * @param wakeName Configured wake name.
 * @returns `true` when the candidate should be accepted as the wake name.
 */
function isFuzzyWakeNameMatch(candidate: LeadingWakeNameCandidate, wakeName: string): boolean {
  const canonical = normalizeWakeNameCandidate(wakeName);
  if (!canonical) {
    return false;
  }

  const heard = compactWakeName(candidate.heardName);
  const wake = compactWakeName(canonical);
  // Short wake names are excluded from fuzzy matching.
  const comparable = heard.length > 0 && wake.length >= 5;
  if (!comparable || !candidate.strongBoundary) {
    return false;
  }

  switch (levenshteinDistance(heard, wake)) {
    case 0:
    case 1:
      return true;
    case 2:
      // Two edits are accepted only when the heard name's length differs.
      return heard.length !== wake.length;
    default:
      return false;
  }
}
497+
498+
/**
 * Removes the matched wake-name candidate from the front of a transcript.
 *
 * @param text Raw transcript text.
 * @param candidate Candidate whose `endIndex` marks where the name ends.
 * @returns The text after the name, without the separator punctuation that
 *   directly follows it and without surrounding whitespace.
 */
function stripLeadingWakeNameCandidate(text: string, candidate: LeadingWakeNameCandidate): string {
  const afterName = text.slice(candidate.endIndex);
  const withoutSeparator = afterName.replace(/^\s*(?:[-,:;.!?]+\s*)?/, "");
  return withoutSeparator.trim();
}
504+
505+
/**
 * Tries to match the start of a transcript against the configured wake names.
 *
 * Candidates are tried from shortest to longest, and wake names in the given
 * order. The first candidate that equals a wake name after compaction, or that
 * passes the fuzzy check, wins.
 *
 * @param text Raw transcript text.
 * @param wakeNames Configured wake names.
 * @returns The accepted result with the wake name stripped from `text`, or
 *   `undefined` when no leading candidate matches.
 */
function matchLeadingFuzzyWakeName(
  text: string,
  wakeNames: string[],
): AllowedWakeNameTranscriptResult | undefined {
  for (const candidate of leadingWakeNameCandidates(text)) {
    const heardCompact = compactWakeName(candidate.heardName);

    for (const wakeName of wakeNames) {
      const canonical = normalizeWakeNameCandidate(wakeName);
      if (!canonical) {
        continue;
      }

      const isExact = heardCompact === compactWakeName(canonical);
      if (!isExact && !isFuzzyWakeNameMatch(candidate, wakeName)) {
        continue;
      }

      return {
        allowed: true,
        text: stripLeadingWakeNameCandidate(text, candidate),
        wakeName,
        heardName: candidate.heardName,
        match: isExact ? "exact" : "fuzzy",
      };
    }
  }
  return undefined;
}
530+
387531
function resolveDiscordRealtimeWakeNames(params: {
388532
config: DiscordRealtimeVoiceConfig;
389533
cfg: OpenClawConfig;
@@ -1273,13 +1417,26 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
12731417
this.talkback.enqueue(acceptedText, this.consumePendingSpeakerContext());
12741418
}
12751419

1276-
private resolveWakeNameTranscript(text: string): { allowed: boolean; text: string } {
1420+
private resolveWakeNameTranscript(text: string): WakeNameTranscriptResult {
12771421
if (!this.requireWakeName) {
1278-
return { allowed: true, text };
1422+
return { allowed: true, text, wakeName: "", heardName: "", match: "exact" };
12791423
}
12801424
const wakeName = this.wakeNames.find((name) => includesWakeName(text, name));
12811425
if (wakeName) {
1282-
return { allowed: true, text: stripLeadingWakeName(text, wakeName) };
1426+
return {
1427+
allowed: true,
1428+
text: stripLeadingWakeName(text, wakeName),
1429+
wakeName,
1430+
heardName: wakeName,
1431+
match: "exact",
1432+
};
1433+
}
1434+
const fuzzyWakeName = matchLeadingFuzzyWakeName(text, this.wakeNames);
1435+
if (fuzzyWakeName) {
1436+
logger.info(
1437+
`discord voice: realtime wake-name gate matched canonical=${fuzzyWakeName.wakeName} heard=${fuzzyWakeName.heardName} match=${fuzzyWakeName.match} voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`,
1438+
);
1439+
return fuzzyWakeName;
12831440
}
12841441
return { allowed: false, text };
12851442
}

0 commit comments

Comments
 (0)