fix(media): gate markdown image extraction by channel (#72718)

Bartok9 · steipete · web-flow · commit f0b327cf686e · 2026-04-27T11:27:35.000+01:00
Closes #72642 Co-authored-by: Peter Steinberger <steipete@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -97,6 +97,7 @@ Docs: https://docs.openclaw.ai
 - Google Meet: route stateful `google_meet` tool actions through the gateway-owned runtime so created or joined realtime sessions remain visible to status, speak, and leave after the agent turn ends. Fixes #72440. (#72441) Thanks @BsnizND.
 - Google Meet/Voice Call: send Gemini Live a non-blocking consult continuation before long OpenClaw agent consults finish, then deliver the final result when idle so calls and meetings do not sit silent during tool-backed answers. (#72189) Thanks @VACInc.
 - Google Meet: preserve Gemini Live function names when replying to realtime tool calls so Google SDK validation accepts the `FunctionResponse` payload. Fixes #72425. (#72426) Thanks @BsnizND.
+- Discord/media: keep incidental Markdown image badges in final replies as text unless a channel opts into Markdown-image media extraction, while preserving Telegram Markdown-image media replies and explicit `MEDIA:` attachments. Fixes #72642. Thanks @solavrc and @Bartok9.
 - Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras.
 - Cron: apply `cron.maxConcurrentRuns` to a dedicated `cron-nested` isolated agent-turn lane as well as cron dispatch, so parallel cron jobs no longer serialize on inner LLM execution while non-cron nested flows keep their existing lane behavior. Fixes #72707. Thanks @kagura-agent.
 - Cron: report isolated runs as successful when verified cron delivery already delivered the reply, while keeping unresolved Message/Canvas tool failures fatal. Fixes #72732 and #50170; follow-up to #54188. Thanks @zNatix, @pixeldyn, and @ChickenEggRoll.
diff --git a/docs/reference/rich-output-protocol.md b/docs/reference/rich-output-protocol.md
@@ -17,6 +17,10 @@ Remote `MEDIA:` attachments must be public `https:` URLs. Plain `http:`,
 loopback, link-local, private, and internal hostnames are ignored as attachment
 directives; server-side media fetchers still enforce their own network guards.
 
+Plain Markdown image syntax stays text by default. Channels that intentionally map
+Markdown image replies to media attachments opt in by setting `extractMarkdownImages: true`
+on their outbound adapter; Telegram does this so `![alt](url)` can still become a media reply.
+
 These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
 Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note.
 
diff --git a/extensions/telegram/src/outbound-adapter.ts b/extensions/telegram/src/outbound-adapter.ts
@@ -121,6 +121,7 @@ export const telegramOutbound: ChannelOutboundAdapter = {
   deliveryMode: "direct",
   chunker: markdownToTelegramHtmlChunks,
   chunkerMode: "markdown",
+  extractMarkdownImages: true,
   textChunkLimit: TELEGRAM_TEXT_CHUNK_LIMIT,
   sanitizeText: ({ text }) => sanitizeForPlainText(text),
   shouldSkipPlainTextSanitization: ({ payload }) => Boolean(payload.channelData),
diff --git a/extensions/telegram/src/outbound-base.ts b/extensions/telegram/src/outbound-base.ts
@@ -4,6 +4,7 @@ export const telegramOutboundBaseAdapter = {
   deliveryMode: "direct" as const,
   chunker: chunkMarkdownText,
   chunkerMode: "markdown" as const,
+  extractMarkdownImages: true,
   textChunkLimit: 4000,
   pollMaxOptions: 10,
 };
diff --git a/src/auto-reply/reply/agent-runner-payloads.test.ts b/src/auto-reply/reply/agent-runner-payloads.test.ts
@@ -350,6 +350,7 @@ describe("buildReplyPayloads media filter integration", () => {
   it("extracts markdown image replies into final payload media urls", async () => {
     const { replyPayloads } = await buildReplyPayloads({
       ...baseParams,
+      extractMarkdownImages: true,
       payloads: [{ text: "Here you go\n\n![chart](https://example.com/chart.png)" }],
     });
 
@@ -364,6 +365,7 @@ describe("buildReplyPayloads media filter integration", () => {
   it("preserves inline caption text when lifting markdown image replies into media", async () => {
     const { replyPayloads } = await buildReplyPayloads({
       ...baseParams,
+      extractMarkdownImages: true,
       payloads: [{ text: 'Look ![chart](https://example.com/chart.png "Quarterly chart") now' }],
     });
 
@@ -379,6 +381,7 @@ describe("buildReplyPayloads media filter integration", () => {
     const text = "Look ![chart](file:///etc/passwd) now";
     const { replyPayloads } = await buildReplyPayloads({
       ...baseParams,
+      extractMarkdownImages: true,
       payloads: [{ text }],
     });
 
diff --git a/src/auto-reply/reply/agent-runner-payloads.ts b/src/auto-reply/reply/agent-runner-payloads.ts
@@ -107,6 +107,7 @@ export async function buildReplyPayloads(params: {
   originatingChannel?: OriginatingChannelType;
   originatingTo?: string;
   accountId?: string;
+  extractMarkdownImages?: boolean;
   normalizeMediaPaths?: (payload: ReplyPayload) => Promise<ReplyPayload>;
 }): Promise<{ replyPayloads: ReplyPayload[]; didLogHeartbeatStrip: boolean }> {
   let didLogHeartbeatStrip = params.didLogHeartbeatStrip;
@@ -148,6 +149,7 @@ export async function buildReplyPayloads(params: {
           currentMessageId: params.currentMessageId,
           silentToken: SILENT_REPLY_TOKEN,
           parseMode: "always",
+          extractMarkdownImages: params.extractMarkdownImages,
         });
         const mediaNormalizedPayload = await normalizeReplyPayloadMedia({
           payload: parsed.payload,
diff --git a/src/auto-reply/reply/reply-delivery.ts b/src/auto-reply/reply/reply-delivery.ts
@@ -17,6 +17,7 @@ export function normalizeReplyPayloadDirectives(params: {
   silentToken?: string;
   trimLeadingWhitespace?: boolean;
   parseMode?: ReplyDirectiveParseMode;
+  extractMarkdownImages?: boolean;
 }): { payload: ReplyPayload; isSilent: boolean } {
   const parseMode = params.parseMode ?? "always";
   const silentToken = params.silentToken ?? SILENT_REPLY_TOKEN;
@@ -27,12 +28,14 @@ export function normalizeReplyPayloadDirectives(params: {
     (parseMode === "auto" &&
       (sourceText.includes("[[") ||
         /media:/i.test(sourceText) ||
+        (params.extractMarkdownImages === true && /!\[[^\]]*]\(/.test(sourceText)) ||
         sourceText.includes(silentToken)));
 
   const parsed = shouldParse
     ? parseReplyDirectives(sourceText, {
         currentMessageId: params.currentMessageId,
         silentToken,
+        extractMarkdownImages: params.extractMarkdownImages,
       })
     : undefined;
 
diff --git a/src/auto-reply/reply/reply-directives.ts b/src/auto-reply/reply/reply-directives.ts
@@ -13,11 +13,19 @@ export type ReplyDirectiveParseResult = {
   isSilent: boolean;
 };
 
+export type ReplyDirectiveParseOptions = {
+  currentMessageId?: string;
+  silentToken?: string;
+  extractMarkdownImages?: boolean; // forwarded as-is to splitMediaFromOutput
+};
+
 export function parseReplyDirectives(
   raw: string,
-  options: { currentMessageId?: string; silentToken?: string } = {},
+  options: ReplyDirectiveParseOptions = {},
 ): ReplyDirectiveParseResult {
-  const split = splitMediaFromOutput(raw);
+  const split = splitMediaFromOutput(raw, {
+    extractMarkdownImages: options.extractMarkdownImages,
+  });
   let text = split.text ?? "";
 
   const replyParsed = parseInlineDirectives(text, {
diff --git a/src/channels/plugins/outbound.types.ts b/src/channels/plugins/outbound.types.ts
@@ -76,6 +76,8 @@ export type ChannelOutboundAdapter = {
   deliveryMode: "direct" | "gateway" | "hybrid";
   chunker?: ((text: string, limit: number, ctx?: ChannelOutboundChunkContext) => string[]) | null;
   chunkerMode?: "text" | "markdown";
+  /** Lift remote Markdown image syntax in text into outbound media attachments. */
+  extractMarkdownImages?: boolean;
   textChunkLimit?: number;
   sanitizeText?: (params: { text: string; payload: ReplyPayload }) => string;
   pollMaxOptions?: number;
diff --git a/src/infra/outbound/deliver.test.ts b/src/infra/outbound/deliver.test.ts
@@ -1235,6 +1235,54 @@ describe("deliverOutboundPayloads", () => {
     );
   });
 
+  it("keeps markdown images as text for channels that do not opt in", async () => {
+    const sendMatrix = vi.fn().mockResolvedValue({ messageId: "m-text", roomId: "!room" });
+    // NOTE(review): assumes the default matrix test adapter does not opt in — confirm.
+    await deliverOutboundPayloads({
+      cfg: matrixChunkConfig,
+      channel: "matrix",
+      to: "!room:example",
+      payloads: [{ text: "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)" }],
+      deps: { matrix: sendMatrix },
+    });
+    // Text must arrive unchanged and the send options must not carry a string mediaUrl.
+    expect(sendMatrix).toHaveBeenCalledWith(
+      "!room:example",
+      "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)",
+      expect.not.objectContaining({ mediaUrl: expect.any(String) }),
+    );
+  });
+
+  it("extracts markdown images for channels that opt in", async () => {
+    const sendMatrix = vi.fn().mockResolvedValue({ messageId: "m-media", roomId: "!room" });
+    setActivePluginRegistry(
+      createTestRegistry([
+        {
+          pluginId: "matrix",
+          source: "test",
+          plugin: createOutboundTestPlugin({
+            id: "matrix",
+            outbound: { ...matrixOutboundForTest, extractMarkdownImages: true },
+          }),
+        },
+      ]),
+    );
+    // The registry set above uses a matrix adapter with extractMarkdownImages switched on.
+    await deliverOutboundPayloads({
+      cfg: matrixChunkConfig,
+      channel: "matrix",
+      to: "!room:example",
+      payloads: [{ text: "Chart ![chart](https://example.com/chart.png) now" }],
+      deps: { matrix: sendMatrix },
+    });
+    // The image markup is removed from the text and its URL is handed over as mediaUrl.
+    expect(sendMatrix).toHaveBeenCalledWith(
+      "!room:example",
+      "Chart now",
+      expect.objectContaining({ mediaUrl: "https://example.com/chart.png" }),
+    );
+  });
+
   it("normalizes payloads and drops empty entries", () => {
     const normalized = normalizeOutboundPayloads([
       { text: "hi" },
diff --git a/src/infra/outbound/deliver.ts b/src/infra/outbound/deliver.ts
@@ -147,6 +147,24 @@ type ChannelHandlerParams = {
 };
 
 // Channel docking: outbound delivery delegates to plugin.outbound adapters.
+async function resolveChannelOutboundDirectiveOptions(params: {
+  cfg: OpenClawConfig;
+  channel: Exclude<OutboundChannel, "none">;
+}): Promise<{ extractMarkdownImages?: boolean }> {
+  // Reads directive-parsing flags off the channel's outbound adapter.
+  const { cfg, channel } = params;
+  let adapter = await loadChannelOutboundAdapter(channel);
+  if (!adapter) {
+    // No adapter returned: bootstrap the channel plugin, then look it up again.
+    const { bootstrapOutboundChannelPlugin } = await loadChannelBootstrapRuntime();
+    bootstrapOutboundChannelPlugin({ channel, cfg });
+    adapter = await loadChannelOutboundAdapter(channel);
+  }
+  // Only a literal `true` opts in; every other value leaves the key undefined.
+  const optedIn = adapter?.extractMarkdownImages === true;
+  return { extractMarkdownImages: optedIn ? true : undefined };
+}
+
 async function createChannelHandler(params: ChannelHandlerParams): Promise<ChannelHandler> {
   let outbound = await loadChannelOutboundAdapter(params.channel);
   if (!outbound) {
@@ -841,11 +859,13 @@ async function deliverOutboundPayloadsCore(
   params: DeliverOutboundPayloadsCoreParams,
 ): Promise<OutboundDeliveryResult[]> {
   const { cfg, channel, to, payloads } = params;
+  const directiveOptions = await resolveChannelOutboundDirectiveOptions({ cfg, channel });
   const outboundPayloadPlan = createOutboundPayloadPlan(payloads, {
     cfg,
     sessionKey: params.session?.policyKey ?? params.session?.key,
     surface: channel,
     conversationType: params.session?.conversationType,
+    extractMarkdownImages: directiveOptions.extractMarkdownImages,
   });
   const accountId = params.accountId;
   const deps = params.deps;
diff --git a/src/infra/outbound/payloads.test.ts b/src/infra/outbound/payloads.test.ts
@@ -642,6 +642,44 @@ describe("OutboundPayloadPlan projections", () => {
     const plan = createOutboundPayloadPlan(matrix);
     expect(projectOutboundPayloadPlanForMirror(plan)).toEqual(resolveMirrorProjection(matrix));
   });
+
+  it("keeps markdown images as text unless extraction is enabled", () => {
+    const input = "Tech: ![Node.js](https://img.shields.io/badge/Node.js-339933)";
+    // No plan context is passed here, so the text is expected to pass through untouched.
+    expect(
+      projectOutboundPayloadPlanForDelivery(createOutboundPayloadPlan([{ text: input }])),
+    ).toEqual([
+      {
+        text: input,
+        mediaUrl: undefined,
+        mediaUrls: undefined,
+        replyToId: undefined,
+        replyToCurrent: undefined,
+        replyToTag: false,
+        audioAsVoice: false,
+      },
+    ]);
+  });
+
+  it("extracts markdown images when the outbound channel opts in", () => {
+    const input = "Chart ![chart](https://example.com/chart.png) now";
+    // With the flag on, the image leaves the text and shows up as mediaUrl/mediaUrls.
+    expect(
+      projectOutboundPayloadPlanForDelivery(
+        createOutboundPayloadPlan([{ text: input }], { extractMarkdownImages: true }),
+      ),
+    ).toEqual([
+      {
+        text: "Chart now",
+        mediaUrl: "https://example.com/chart.png",
+        mediaUrls: ["https://example.com/chart.png"],
+        replyToId: undefined,
+        replyToCurrent: undefined,
+        replyToTag: false,
+        audioAsVoice: false,
+      },
+    ]);
+  });
 });
 
 describe("formatOutboundPayloadLog", () => {
diff --git a/src/infra/outbound/payloads.ts b/src/infra/outbound/payloads.ts
@@ -67,6 +67,7 @@ type OutboundPayloadPlanContext = {
    * (see `pending-spawn-query.ts`).
    */
   hasPendingSpawnedChildren?: boolean;
+  extractMarkdownImages?: boolean;
 };
 
 export type OutboundPayloadMirror = {
@@ -131,11 +132,14 @@ type PreparedOutboundPayloadPlanEntry = {
 
 function createOutboundPayloadPlanEntry(
   payload: ReplyPayload,
+  context: Pick<OutboundPayloadPlanContext, "extractMarkdownImages"> = {},
 ): PreparedOutboundPayloadPlanEntry | null {
   if (shouldSuppressReasoningPayload(payload)) {
     return null;
   }
-  const parsed = parseReplyDirectives(payload.text ?? "");
+  const parsed = parseReplyDirectives(payload.text ?? "", {
+    extractMarkdownImages: context.extractMarkdownImages,
+  });
   const explicitMediaUrls = payload.mediaUrls ?? parsed.mediaUrls;
   const explicitMediaUrl = payload.mediaUrl ?? parsed.mediaUrl;
   const mergedMedia = mergeMediaUrls(
@@ -193,7 +197,9 @@ export function createOutboundPayloadPlan(
     context.hasPendingSpawnedChildren ?? resolvePendingSpawnedChildren(context.sessionKey);
   const prepared: PreparedOutboundPayloadPlanEntry[] = [];
   for (const payload of payloads) {
-    const entry = createOutboundPayloadPlanEntry(payload);
+    const entry = createOutboundPayloadPlanEntry(payload, {
+      extractMarkdownImages: context.extractMarkdownImages,
+    });
     if (!entry) {
       continue;
     }
diff --git a/src/media/parse.test.ts b/src/media/parse.test.ts
diff --git a/src/media/parse.ts b/src/media/parse.ts