fix(tts): preserve legacy tool voice hints

steipete · steipete · commit 60f93583488a · 2026-04-25T17:56:37.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,6 +25,9 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
+  `MEDIA:` payloads so generated audio is still delivered as a voice note. (#46535)
+  Thanks @azade-c.
 - Telegram/STT: frame inbound voice-note transcripts as machine-generated,
   untrusted text in agent context while preserving raw transcript mention
   detection. Closes #33360. Thanks @smartchainark.
diff --git a/docs/reference/rich-output-protocol.md b/docs/reference/rich-output-protocol.md
@@ -14,6 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
 - `[embed ...]` for Control UI rich rendering
 
 These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
+Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note.
 
 When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
 turn. If the same media URL is sent in a streamed block and repeated in the final
diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
     expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
   });
 
+  it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({
+      shouldEmitToolOutput: false,
+      onToolResult,
+      builtinToolNames: new Set(["tts"]),
+    });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "tts",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          {
+            type: "text",
+            text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
+          },
+        ],
+      },
+    });
+
+    expect(onToolResult).not.toHaveBeenCalled();
+    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
+    expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
+  });
+
   it("does NOT emit local media for untrusted tools", async () => {
     const onToolResult = vi.fn();
     const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts
@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
     });
   });
 
+  it("extracts audioAsVoice from legacy MEDIA text", () => {
+    expect(
+      extractToolResultMediaArtifact({
+        content: [
+          { type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" },
+        ],
+      }),
+    ).toEqual({
+      mediaUrls: ["/tmp/reply.opus"],
+      audioAsVoice: true,
+    });
+  });
+
+  it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
+    expect(
+      extractToolResultMediaArtifact({
+        content: [
+          { type: "text", text: "[[audio_as_voice]]" },
+          { type: "text", text: "MEDIA:/tmp/reply.opus" },
+        ],
+      }),
+    ).toEqual({
+      mediaUrls: ["/tmp/reply.opus"],
+      audioAsVoice: true,
+    });
+  });
+
   it("extracts structured media trust markers", () => {
     expect(
       extractToolResultMediaArtifact({
diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts
@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
   // parser so directive matching and validation stay in sync with outbound
   // reply parsing.
   const paths: string[] = [];
+  let audioAsVoice = false;
   let hasImageContent = false;
   for (const item of content) {
     if (!item || typeof item !== "object") {
@@ -319,14 +320,20 @@ export function extractToolResultMediaArtifact(
     }
     if (entry.type === "text" && typeof entry.text === "string") {
       const parsed = splitMediaFromOutput(entry.text);
+      if (parsed.audioAsVoice) {
+        audioAsVoice = true;
+      }
       if (parsed.mediaUrls?.length) {
         paths.push(...parsed.mediaUrls);
       }
     }
   }
 
   if (paths.length > 0) {
-    return { mediaUrls: paths };
+    return {
+      mediaUrls: paths,
+      ...(audioAsVoice ? { audioAsVoice: true } : {}),
+    };
   }
 
   // Fall back to legacy details.path when image content exists but no

Original file line number	Diff line number	Diff line change
`@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(`
`307`	`307`	`// parser so directive matching and validation stay in sync with outbound`
`308`	`308`	`// reply parsing.`
`309`	`309`	`const paths: string[] = [];`
	`310`	`+ let audioAsVoice = false;`
`310`	`311`	`let hasImageContent = false;`
`311`	`312`	`for (const item of content) {`
`312`	`313`	`if (!item \|\| typeof item !== "object") {`
`@@ -319,14 +320,20 @@ export function extractToolResultMediaArtifact(`
`319`	`320`	`}`
`320`	`321`	`if (entry.type === "text" && typeof entry.text === "string") {`
`321`	`322`	`const parsed = splitMediaFromOutput(entry.text);`
	`323`	`+ if (parsed.audioAsVoice) {`
	`324`	`+ audioAsVoice = true;`
	`325`	`+ }`
`322`	`326`	`if (parsed.mediaUrls?.length) {`
`323`	`327`	`paths.push(...parsed.mediaUrls);`
`324`	`328`	`}`
`325`	`329`	`}`
`326`	`330`	`}`
`327`	`331`
`328`	`332`	`if (paths.length > 0) {`
`329`		`- return { mediaUrls: paths };`
	`333`	`+ return {`
	`334`	`+ mediaUrls: paths,`
	`335`	`+ ...(audioAsVoice ? { audioAsVoice: true } : {}),`
	`336`	`+ };`
`330`	`337`	`}`
`331`	`338`
`332`	`339`	`// Fall back to legacy details.path when image content exists but no`