Skip to content

Commit 60f9358

Browse files
committed
fix(tts): preserve legacy tool voice hints
1 parent dc7c703 commit 60f9358

5 files changed

Lines changed: 67 additions & 1 deletion

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ Docs: https://docs.openclaw.ai
2525

2626
### Fixes
2727

28+
- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
29+
`MEDIA:` payloads so generated audio is still delivered as a voice note. (#46535)
30+
Thanks @azade-c.
2831
- Telegram/STT: frame inbound voice-note transcripts as machine-generated,
2932
untrusted text in agent context while preserving raw transcript mention
3033
detection. Closes #33360. Thanks @smartchainark.

docs/reference/rich-output-protocol.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
1414
- `[embed ...]` for Control UI rich rendering
1515

1616
These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
17+
Trusted tool-result media is run through the same `MEDIA:` / `[[audio_as_voice]]` parser as outbound replies before delivery, so legacy tool outputs can still mark an audio attachment as a voice note.
1718

1819
When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
1920
turn. If the same media URL is sent in a streamed block and repeated in the final

src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
165165
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
166166
});
167167

168+
it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
169+
const onToolResult = vi.fn();
170+
const ctx = createMockContext({
171+
shouldEmitToolOutput: false,
172+
onToolResult,
173+
builtinToolNames: new Set(["tts"]),
174+
});
175+
176+
await handleToolExecutionEnd(ctx, {
177+
type: "tool_execution_end",
178+
toolName: "tts",
179+
toolCallId: "tc-1",
180+
isError: false,
181+
result: {
182+
content: [
183+
{
184+
type: "text",
185+
text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
186+
},
187+
],
188+
},
189+
});
190+
191+
expect(onToolResult).not.toHaveBeenCalled();
192+
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
193+
expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
194+
});
195+
168196
it("does NOT emit local media for untrusted tools", async () => {
169197
const onToolResult = vi.fn();
170198
const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });

src/agents/pi-embedded-subscribe.tools.media.test.ts

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
5151
});
5252
});
5353

54+
it("extracts audioAsVoice from legacy MEDIA text", () => {
55+
expect(
56+
extractToolResultMediaArtifact({
57+
content: [
58+
{ type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" },
59+
],
60+
}),
61+
).toEqual({
62+
mediaUrls: ["/tmp/reply.opus"],
63+
audioAsVoice: true,
64+
});
65+
});
66+
67+
it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
68+
expect(
69+
extractToolResultMediaArtifact({
70+
content: [
71+
{ type: "text", text: "[[audio_as_voice]]" },
72+
{ type: "text", text: "MEDIA:/tmp/reply.opus" },
73+
],
74+
}),
75+
).toEqual({
76+
mediaUrls: ["/tmp/reply.opus"],
77+
audioAsVoice: true,
78+
});
79+
});
80+
5481
it("extracts structured media trust markers", () => {
5582
expect(
5683
extractToolResultMediaArtifact({

src/agents/pi-embedded-subscribe.tools.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
307307
// parser so directive matching and validation stay in sync with outbound
308308
// reply parsing.
309309
const paths: string[] = [];
310+
let audioAsVoice = false;
310311
let hasImageContent = false;
311312
for (const item of content) {
312313
if (!item || typeof item !== "object") {
@@ -319,14 +320,20 @@ export function extractToolResultMediaArtifact(
319320
}
320321
if (entry.type === "text" && typeof entry.text === "string") {
321322
const parsed = splitMediaFromOutput(entry.text);
323+
if (parsed.audioAsVoice) {
324+
audioAsVoice = true;
325+
}
322326
if (parsed.mediaUrls?.length) {
323327
paths.push(...parsed.mediaUrls);
324328
}
325329
}
326330
}
327331

328332
if (paths.length > 0) {
329-
return { mediaUrls: paths };
333+
return {
334+
mediaUrls: paths,
335+
...(audioAsVoice ? { audioAsVoice: true } : {}),
336+
};
330337
}
331338

332339
// Fall back to legacy details.path when image content exists but no

0 commit comments

Comments
 (0)