Skip to content

Commit 2dfcd77

Browse files
committed
fix(reply): parse markdown image replies as media
1 parent e58d50b commit 2dfcd77

4 files changed

Lines changed: 107 additions & 11 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
2626
- Agents/subagents: emit the subagent registry lazy-runtime stub on the stable dist path that both source and bundled runtime imports resolve, so the follow-up dist fix no longer still fails with `ERR_MODULE_NOT_FOUND` at runtime. (#66420) Thanks @obviyus.
2727
- Media-understanding/proxy env: auto-upgrade provider HTTP helper requests to trusted env-proxy mode only when `HTTP_PROXY`/`HTTPS_PROXY` is active and the target is not bypassed by `NO_PROXY`, so remote media-understanding and transcription requests stop failing local DNS pre-resolution in proxy-only environments without widening SSRF bypasses. (#52162) Thanks @mjamiv and @vincentkoc.
2828
- Telegram/media downloads: let Telegram media fetches trust an operator-configured explicit proxy for target DNS resolution after hostname-policy checks, so proxy-backed installs stop failing `could not download media` on Bot API file downloads after the DNS-pinning regression. (#66245) Thanks @dawei41468 and @vincentkoc.
29+
- Telegram/media replies: parse markdown image syntax into outbound media payloads on the final reply path, so Telegram group chats stop falling back to plain-text image URLs when the model or a tool emits `![...](...)` instead of a `MEDIA:` token. (#66191) Thanks @apezam and @vincentkoc.
2930
- Browser: keep loopback CDP readiness checks reachable under strict SSRF defaults so OpenClaw can reconnect to locally started managed Chrome. (#66354) Thanks @hxy91819.
3031
- Agents/context engine: compact engine-owned sessions from the first tool-loop delta and preserve ingest fallback when `afterTurn` is absent, so long-running tool loops can stay bounded without dropping engine state. (#63555) Thanks @Bikkies.
3132
- Discord/native commands: return the real status card for native `/status` interactions instead of falling through to the synthetic `✅ Done.` ack when the generic dispatcher produces no visible reply. (#54629) Thanks @tkozzer and @vincentkoc.

src/auto-reply/reply/agent-runner-payloads.test.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,20 @@ describe("buildReplyPayloads media filter integration", () => {
220220
expect(replyPayloads).toHaveLength(0);
221221
});
222222

223+
// Regression test: a final reply whose text ends with a markdown image
// (`![alt](url)`) must come back as a single payload that carries the image URL
// in both `mediaUrl` and `mediaUrls`, with only the caption left in `text`.
it("extracts markdown image replies into final payload media urls", async () => {
224+
const { replyPayloads } = await buildReplyPayloads({
225+
...baseParams,
226+
payloads: [{ text: "Here you go\n\n![chart](https://example.com/chart.png)" }],
227+
});
228+
229+
expect(replyPayloads).toHaveLength(1);
230+
expect(replyPayloads[0]).toMatchObject({
231+
text: "Here you go",
232+
mediaUrl: "https://example.com/chart.png",
233+
mediaUrls: ["https://example.com/chart.png"],
234+
});
235+
});
236+
223237
it("deduplicates final payloads against directly sent block keys regardless of replyToId", async () => {
224238
// When block streaming is not active but directlySentBlockKeys has entries
225239
// (e.g. from pre-tool flush), the key should match even if replyToId differs.

src/media/parse.test.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,4 +103,21 @@ describe("splitMediaFromOutput", () => {
103103
{ type: "text", text: "```text\nMEDIA:https://example.com/ignored.png\n```\nAfter" },
104104
]);
105105
});
106+
107+
// A markdown image on its own line is lifted into `mediaUrls`; the caption text
// that precedes it (separated by a blank line) is kept as `text`.
it("extracts markdown image urls while keeping surrounding caption text", () => {
108+
expectParsedMediaOutputCase("Caption\n\n![chart](https://example.com/chart.png)", {
109+
text: "Caption",
110+
mediaUrls: ["https://example.com/chart.png"],
111+
});
112+
});
113+
114+
// Several image lines interleaved with text lines: targets are collected in
// document order (an absolute URL and a relative path are both accepted) and the
// remaining text lines are joined back together without the image lines.
it("extracts multiple markdown image urls in order", () => {
115+
expectParsedMediaOutputCase(
116+
"Before\n![one](https://example.com/one.png)\nMiddle\n![two](./out/two.png)\nAfter",
117+
{
118+
text: "Before\nMiddle\nAfter",
119+
mediaUrls: ["https://example.com/one.png", "./out/two.png"],
120+
},
121+
);
122+
});
106123
});

src/media/parse.ts

Lines changed: 75 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { parseAudioTag } from "./audio-tags.js";
55

66
// Allow optional wrapping backticks and punctuation after the token; capture the core token.
77
export const MEDIA_TOKEN_RE = /\bMEDIA:\s*`?([^\n]+)`?/gi;
8+
// Matches inline markdown images `![alt](target)` and captures `target`: everything inside
// the parentheses up to the first `)` or newline. A markdown title (`![a](url "title")`) is
// therefore part of the capture, and a target that itself contains `)` is cut short there.
const MARKDOWN_IMAGE_RE = /!\[[^\]]*]\(([^)\n]+)\)/g;
89

910
export type ParsedMediaOutputSegment =
1011
| {
@@ -125,6 +126,59 @@ function mayContainFenceMarkers(input: string): boolean {
125126
return input.includes("```") || input.includes("~~~");
126127
}
127128

129+
/**
 * Normalizes the leftover text of a line after tokens have been cut out of it.
 *
 * Runs of two or more spaces/tabs are collapsed into a single space (removing a
 * token from the middle of a line typically leaves such a gap), then leading
 * and trailing whitespace is trimmed. A lone tab or space is left as-is.
 *
 * @param text - Raw text fragment, possibly containing gaps left by removed tokens.
 * @returns The collapsed and trimmed text; may be the empty string.
 */
function cleanLineText(text: string): string {
  return text.replace(/[ \t]{2,}/g, " ").trim();
}
132+
133+
/**
 * Extracts markdown image references (`![alt](target)`) from a single line.
 *
 * Each `MARKDOWN_IMAGE_RE` match has its target passed through `unwrapQuoted`,
 * `cleanCandidate` and `normalizeMediaSource`; if the result passes
 * `isValidMedia` it is appended to `params.media` and emitted as a media
 * segment. Matches that fail validation stay in the text verbatim.
 *
 * @param params.line - The line of output text to scan.
 * @param params.media - Accumulator that receives every accepted target, in
 *   source order. It is mutated in place.
 * @returns
 *   - `foundMedia`: true when at least one image was extracted.
 *   - `lineSegments`: text and media segments interleaved in source order.
 *   - `cleanedLine`: all text of the line that survives extraction (text before,
 *     between and after the extracted images), or `undefined` if nothing is left.
 */
function collectMarkdownImageSegments(params: { line: string; media: string[] }): {
  cleanedLine?: string;
  lineSegments: ParsedMediaOutputSegment[];
  foundMedia: boolean;
} {
  const matches = Array.from(params.line.matchAll(MARKDOWN_IMAGE_RE));
  if (matches.length === 0) {
    return { lineSegments: [], foundMedia: false };
  }

  // Text accumulated since the last extracted image; flushed as one text segment.
  const pieces: string[] = [];
  // Every piece of text that survives extraction. Unlike `pieces`, this is never
  // reset, so text that precedes an image is still part of `cleanedLine`
  // (e.g. "See ![a](u) now" keeps "See now", not just "now").
  const keptPieces: string[] = [];
  const lineSegments: ParsedMediaOutputSegment[] = [];
  let cursor = 0;
  let foundMedia = false;

  for (const match of matches) {
    const start = match.index ?? 0;
    const leading = params.line.slice(cursor, start);
    pieces.push(leading);
    keptPieces.push(leading);

    const target = normalizeMediaSource(cleanCandidate(unwrapQuoted(match[1]) ?? match[1] ?? ""));
    if (isValidMedia(target, { allowSpaces: true, allowBareFilename: true })) {
      const beforeText = cleanLineText(pieces.join(""));
      if (beforeText) {
        lineSegments.push({ type: "text", text: beforeText });
      }
      pieces.length = 0;
      params.media.push(target);
      lineSegments.push({ type: "media", url: target });
      foundMedia = true;
    } else {
      // Not a usable media target: leave the original markdown in the text.
      pieces.push(match[0]);
      keptPieces.push(match[0]);
    }

    cursor = start + match[0].length;
  }

  const trailing = params.line.slice(cursor);
  pieces.push(trailing);
  keptPieces.push(trailing);

  // Whatever follows the last extracted image becomes the final text segment.
  const trailingText = cleanLineText(pieces.join(""));
  if (trailingText) {
    lineSegments.push({ type: "text", text: trailingText });
  }

  const cleanedLine = cleanLineText(keptPieces.join(""));

  return {
    cleanedLine: cleanedLine || undefined,
    lineSegments,
    foundMedia,
  };
}
181+
128182
// Check if a character offset is inside any fenced code block
129183
function isInsideFence(fenceSpans: Array<{ start: number; end: number }>, offset: number): boolean {
130184
return fenceSpans.some((span) => offset >= span.start && offset < span.end);
@@ -144,8 +198,9 @@ export function splitMediaFromOutput(raw: string): {
144198
return { text: "" };
145199
}
146200
const mayContainMediaToken = /media:/i.test(trimmedRaw);
201+
const mayContainMarkdownImage = /!\[[^\]]*]\(/.test(trimmedRaw);
147202
const mayContainAudioTag = trimmedRaw.includes("[[");
148-
if (!mayContainMediaToken && !mayContainAudioTag) {
203+
if (!mayContainMediaToken && !mayContainMarkdownImage && !mayContainAudioTag) {
149204
return { text: trimmedRaw };
150205
}
151206

@@ -185,8 +240,23 @@ export function splitMediaFromOutput(raw: string): {
185240

186241
const trimmedStart = line.trimStart();
187242
if (!trimmedStart.startsWith("MEDIA:")) {
188-
keptLines.push(line);
189-
pushTextSegment(line);
243+
const markdownImageResult = collectMarkdownImageSegments({ line, media });
244+
if (!markdownImageResult.foundMedia) {
245+
keptLines.push(line);
246+
pushTextSegment(line);
247+
} else {
248+
foundMediaToken = true;
249+
if (markdownImageResult.cleanedLine) {
250+
keptLines.push(markdownImageResult.cleanedLine);
251+
}
252+
for (const segment of markdownImageResult.lineSegments) {
253+
if (segment.type === "text") {
254+
pushTextSegment(segment.text);
255+
continue;
256+
}
257+
segments.push(segment);
258+
}
259+
}
190260
lineOffset += line.length + 1; // +1 for newline
191261
continue;
192262
}
@@ -269,10 +339,7 @@ export function splitMediaFromOutput(raw: string): {
269339
}
270340

271341
if (hasValidMedia) {
272-
const beforeText = pieces
273-
.join("")
274-
.replace(/[ \t]{2,}/g, " ")
275-
.trim();
342+
const beforeText = cleanLineText(pieces.join(""));
276343
if (beforeText) {
277344
lineSegments.push({ type: "text", text: beforeText });
278345
}
@@ -297,10 +364,7 @@ export function splitMediaFromOutput(raw: string): {
297364

298365
pieces.push(line.slice(cursor));
299366

300-
const cleanedLine = pieces
301-
.join("")
302-
.replace(/[ \t]{2,}/g, " ")
303-
.trim();
367+
const cleanedLine = cleanLineText(pieces.join(""));
304368

305369
// If the line becomes empty, drop it.
306370
if (cleanedLine) {

0 commit comments

Comments
 (0)