Skip to content

Commit 2c549ae

Browse files
fix(cli): support image describe urls (#82854)
Co-authored-by: Peter Steinberger <steipete@gmail.com>
1 parent ab2943e commit 2c549ae

10 files changed

Lines changed: 172 additions & 29 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ Docs: https://docs.openclaw.ai
77
### Fixes
88

99
- Agents/diagnostics: split slow embedded-run `attempt-dispatch` startup summaries into workspace, prompt, runtime-plan, and final dispatch subspans so traces identify the delayed setup phase. Fixes #82782. (#82783) Thanks @galiniliev.
10+
- CLI/media: accept HTTP(S) URLs in `openclaw infer image describe --file`, fetching remote images through the guarded media path instead of treating URLs as local files. Fixes #82837. (#82854) Thanks @neeravmakwana.
1011
- Agents/subagents: route group/channel subagent completions through message-tool-only handoffs when required and keep active-requester wake failures from dropping completion delivery. Fixes #82803. Thanks @galiniliev, @yozakura-ava, and @moeedahmed.
1112
- Memory-core: scan persisted memory source sessions on startup, comparing on-disk transcripts against the index and marking only missing/newer/resized files dirty for incremental sync. Fixes #82341. (#82341) Thanks @giodl73-repo.
1213
- Telegram: keep the top-level default account in the account list when named accounts or bindings are added alongside top-level credentials, preserving default polling while still letting named-only configs resolve to a single account. Fixes #82794. (#82794) Thanks @giodl73-repo.

src/cli/capability-cli.test.ts

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -478,6 +478,7 @@ describe("capability cli", () => {
478478
};
479479
type ImageDescribeParams = {
480480
filePath?: string;
481+
mediaUrl?: string;
481482
model?: unknown;
482483
prompt?: unknown;
483484
provider?: unknown;
@@ -1147,6 +1148,26 @@ describe("capability cli", () => {
11471148
expect(describeCall?.timeoutMs).toBe(90000);
11481149
});
11491150

1151+
it("keeps image describe URL files as remote media references", async () => {
1152+
await runRegisteredCli({
1153+
register: registerCapabilityCli as (program: Command) => void,
1154+
argv: [
1155+
"capability",
1156+
"image",
1157+
"describe",
1158+
"--file",
1159+
"https://example.com/photo.png",
1160+
"--json",
1161+
],
1162+
});
1163+
1164+
const describeCall = imageDescribeCall();
1165+
expect(describeCall?.filePath).toBe("https://example.com/photo.png");
1166+
expect(describeCall?.mediaUrl).toBe("https://example.com/photo.png");
1167+
const outputs = firstJsonOutput()?.outputs as Array<Record<string, unknown>>;
1168+
expect(outputs[0]?.path).toBe("https://example.com/photo.png");
1169+
});
1170+
11501171
it("uses the explicit media-understanding provider for image describe model overrides", async () => {
11511172
await runRegisteredCli({
11521173
register: registerCapabilityCli as (program: Command) => void,
@@ -1177,6 +1198,29 @@ describe("capability cli", () => {
11771198
expect(firstJsonOutput()?.model).toBe("gpt-4.1-mini");
11781199
});
11791200

1201+
it("keeps explicit-model image describe URL files as remote media references", async () => {
1202+
await runRegisteredCli({
1203+
register: registerCapabilityCli as (program: Command) => void,
1204+
argv: [
1205+
"capability",
1206+
"image",
1207+
"describe",
1208+
"--file",
1209+
"https://example.com/photo.png",
1210+
"--model",
1211+
"ollama/qwen2.5vl:7b",
1212+
"--json",
1213+
],
1214+
});
1215+
1216+
const describeCall = firstImageDescribeWithModelCall();
1217+
expect(describeCall?.filePath).toBe("https://example.com/photo.png");
1218+
expect(describeCall?.mediaUrl).toBe("https://example.com/photo.png");
1219+
expect(mocks.describeImageFile).not.toHaveBeenCalled();
1220+
const outputs = firstJsonOutput()?.outputs as Array<Record<string, unknown>>;
1221+
expect(outputs[0]?.path).toBe("https://example.com/photo.png");
1222+
});
1223+
11801224
it("passes describe-many prompts to each image", async () => {
11811225
await runRegisteredCli({
11821226
register: registerCapabilityCli as (program: Command) => void,

src/cli/capability-cli.ts

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1097,10 +1097,12 @@ async function runImageDescribe(params: {
10971097
const prompt = normalizeOptionalString(params.prompt);
10981098
const outputs = await Promise.all(
10991099
params.files.map(async (filePath) => {
1100-
const resolvedPath = path.resolve(filePath);
1100+
const isRemoteUrl = /^https?:\/\//i.test(filePath.trim());
1101+
const resolvedPath = isRemoteUrl ? filePath.trim() : path.resolve(filePath);
11011102
const result = activeModel
11021103
? await describeImageFileWithModel({
11031104
filePath: resolvedPath,
1105+
...(isRemoteUrl ? { mediaUrl: resolvedPath } : {}),
11041106
cfg,
11051107
agentDir,
11061108
provider: activeModel.provider,
@@ -1110,6 +1112,7 @@ async function runImageDescribe(params: {
11101112
})
11111113
: await describeImageFile({
11121114
filePath: resolvedPath,
1115+
...(isRemoteUrl ? { mediaUrl: resolvedPath } : {}),
11131116
cfg,
11141117
agentDir,
11151118
prompt,

src/media-understanding/attachments.cache.ts

Lines changed: 1 addition & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,6 @@ import { detectMime } from "../media/mime.js";
1515
import { buildRandomTempFilePath } from "../plugin-sdk/temp-path.js";
1616
import { normalizeAttachmentPath } from "./attachments.normalize.js";
1717
import { MediaUnderstandingSkipError } from "./errors.js";
18-
import { fetchWithTimeout } from "./shared.js";
1918
import type { MediaAttachment } from "./types.js";
2019

2120
type MediaBufferResult = {
@@ -67,16 +66,6 @@ export type MediaAttachmentCacheOptions = {
6766
workspaceDir?: string;
6867
};
6968

70-
function resolveRequestUrl(input: RequestInfo | URL): string {
71-
if (typeof input === "string") {
72-
return input;
73-
}
74-
if (input instanceof URL) {
75-
return input.toString();
76-
}
77-
return input.url;
78-
}
79-
8069
export class MediaAttachmentCache {
8170
private readonly entries = new Map<number, AttachmentCacheEntry>();
8271
private readonly attachments: MediaAttachment[];
@@ -171,11 +160,9 @@ export class MediaAttachmentCache {
171160
}
172161

173162
try {
174-
const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
175-
fetchWithTimeout(resolveRequestUrl(input), init ?? {}, params.timeoutMs, globalThis.fetch);
176163
const fetched = await readRemoteMediaBuffer({
177164
url,
178-
fetchImpl,
165+
timeoutMs: params.timeoutMs,
179166
maxBytes: params.maxBytes,
180167
ssrfPolicy: this.ssrfPolicy,
181168
retry: REMOTE_MEDIA_FETCH_RETRY,

src/media-understanding/media-understanding-url-fallback.test.ts

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ vi.mock("../media/fetch.js", async () => {
1717

1818
function requireReadRemoteMediaBufferInput(): {
1919
url?: unknown;
20-
fetchImpl?: unknown;
20+
timeoutMs?: unknown;
2121
maxBytes?: unknown;
2222
ssrfPolicy?: unknown;
2323
retry?: unknown;
@@ -84,15 +84,13 @@ describe("media understanding attachment URL fallback", () => {
8484
expect(path.extname(result.path)).toBe(".jpg");
8585
expect(readRemoteMediaBufferMock).toHaveBeenCalledTimes(1);
8686
const fetchInput = requireReadRemoteMediaBufferInput();
87-
const fetchImpl = fetchInput.fetchImpl;
8887
expect(fetchInput).toStrictEqual({
8988
url: fallbackUrl,
90-
fetchImpl,
89+
timeoutMs: 1000,
9190
maxBytes: 1024,
9291
ssrfPolicy: undefined,
9392
retry: expect.objectContaining({ attempts: 3 }),
9493
});
95-
expect(typeof fetchImpl).toBe("function");
9694
// Clean up the temp file
9795
if (result.cleanup) {
9896
await result.cleanup();
@@ -113,15 +111,13 @@ describe("media understanding attachment URL fallback", () => {
113111
expect(result.buffer.toString()).toBe("fallback-buffer");
114112
expect(readRemoteMediaBufferMock).toHaveBeenCalledTimes(1);
115113
const fetchInput = requireReadRemoteMediaBufferInput();
116-
const fetchImpl = fetchInput.fetchImpl;
117114
expect(fetchInput).toStrictEqual({
118115
url: fallbackUrl,
119-
fetchImpl,
116+
timeoutMs: 1000,
120117
maxBytes: 1024,
121118
ssrfPolicy: undefined,
122119
retry: expect.objectContaining({ attempts: 3 }),
123120
});
124-
expect(typeof fetchImpl).toBe("function");
125121
},
126122
);
127123
});

src/media-understanding/runtime-types.ts

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ import type {
1111
export type RunMediaUnderstandingFileParams = {
1212
capability: "image" | "audio" | "video";
1313
filePath: string;
14+
mediaUrl?: string;
1415
cfg: OpenClawConfig;
1516
agentDir?: string;
1617
workspaceDir?: string;
@@ -30,6 +31,7 @@ export type RunMediaUnderstandingFileResult = {
3031

3132
export type DescribeImageFileParams = {
3233
filePath: string;
34+
mediaUrl?: string;
3335
cfg: OpenClawConfig;
3436
agentDir?: string;
3537
workspaceDir?: string;
@@ -41,6 +43,7 @@ export type DescribeImageFileParams = {
4143

4244
export type DescribeImageFileWithModelParams = {
4345
filePath: string;
46+
mediaUrl?: string;
4447
cfg: OpenClawConfig;
4548
agentDir?: string;
4649
workspaceDir?: string;

src/media-understanding/runtime.test.ts

Lines changed: 67 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,15 @@ import {
1313

1414
const mocks = vi.hoisted(() => {
1515
const cleanup = vi.fn(async () => {});
16+
const getBuffer = vi.fn(async () => ({
17+
buffer: Buffer.from("remote-image"),
18+
fileName: "photo.png",
19+
mime: "image/png",
20+
size: 12,
21+
}));
1622
return {
1723
buildProviderRegistry: vi.fn(() => new Map()),
18-
createMediaAttachmentCache: vi.fn(() => ({ cleanup })),
24+
createMediaAttachmentCache: vi.fn(() => ({ cleanup, getBuffer })),
1925
normalizeMediaAttachments: vi.fn<() => MediaAttachment[]>(() => []),
2026
normalizeMediaProviderId: vi.fn((provider: string) => provider.trim().toLowerCase()),
2127
buildMediaUnderstandingRegistry: vi.fn(() => new Map()),
@@ -24,6 +30,7 @@ const mocks = vi.hoisted(() => {
2430
describeImageWithModel: vi.fn(async () => ({ text: "generic image ok", model: "vision" })),
2531
runCapability: vi.fn(),
2632
cleanup,
33+
getBuffer,
2734
};
2835
});
2936

@@ -71,6 +78,13 @@ describe("media-understanding runtime", () => {
7178
mocks.runCapability.mockReset();
7279
mocks.cleanup.mockReset();
7380
mocks.cleanup.mockResolvedValue(undefined);
81+
mocks.getBuffer.mockReset();
82+
mocks.getBuffer.mockResolvedValue({
83+
buffer: Buffer.from("remote-image"),
84+
fileName: "photo.png",
85+
mime: "image/png",
86+
size: 12,
87+
});
7488
});
7589

7690
it("returns disabled state without loading providers", async () => {
@@ -201,6 +215,36 @@ describe("media-understanding runtime", () => {
201215
});
202216
});
203217

218+
it("passes image file URLs as remote media understanding inputs", async () => {
219+
const output: MediaUnderstandingOutput = {
220+
kind: "image.description",
221+
attachmentIndex: 0,
222+
provider: "vision-plugin",
223+
model: "vision-v1",
224+
text: "image ok",
225+
};
226+
const media = [{ index: 0, url: "https://example.com/photo.png", mime: "image/png" }];
227+
mocks.normalizeMediaAttachments.mockReturnValue(media);
228+
mocks.runCapability.mockResolvedValue({ outputs: [output] });
229+
230+
await describeImageFile({
231+
filePath: "https://example.com/photo.png",
232+
mediaUrl: "https://example.com/photo.png",
233+
mime: "image/png",
234+
cfg: {} as OpenClawConfig,
235+
agentDir: "/tmp/agent",
236+
});
237+
238+
expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
239+
MediaUrl: "https://example.com/photo.png",
240+
MediaType: "image/png",
241+
});
242+
expect(requireRunCapabilityRequest()).toMatchObject({
243+
ctx: { MediaUrl: "https://example.com/photo.png", MediaType: "image/png" },
244+
media,
245+
});
246+
});
247+
204248
it("passes workspaceDir through audio and video file helpers", async () => {
205249
mocks.runCapability.mockResolvedValue({
206250
outputs: [],
@@ -251,7 +295,7 @@ describe("media-understanding runtime", () => {
251295
it("passes per-request image prompts into media understanding config", async () => {
252296
const media = [{ index: 0, path: "/tmp/sample.jpg", mime: "image/jpeg" }];
253297
const providerRegistry = new Map();
254-
const cache = { cleanup: mocks.cleanup };
298+
const cache = { cleanup: mocks.cleanup, getBuffer: mocks.getBuffer };
255299
const output: MediaUnderstandingOutput = {
256300
kind: "image.description",
257301
attachmentIndex: 0,
@@ -347,6 +391,27 @@ describe("media-understanding runtime", () => {
347391
});
348392
});
349393

394+
it("preserves fetched metadata for explicit model URL inputs", async () => {
395+
await describeImageFileWithModel({
396+
filePath: "https://example.com/photo.png",
397+
mediaUrl: "https://example.com/photo.png",
398+
provider: "zai",
399+
model: "glm-4.6v",
400+
prompt: "Describe it",
401+
cfg: {} as OpenClawConfig,
402+
agentDir: "/tmp/agent",
403+
});
404+
405+
expect(mocks.describeImageWithModel).toHaveBeenCalledWith(
406+
expect.objectContaining({
407+
buffer: Buffer.from("remote-image"),
408+
fileName: "photo.png",
409+
mime: "image/png",
410+
}),
411+
);
412+
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
413+
});
414+
350415
it("routes direct image description through a provider-specific image hook", async () => {
351416
const describeImage = vi.fn(async () => ({
352417
text: "image ok",

src/media-understanding/runtime.ts

Lines changed: 27 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import path from "node:path";
22
import { readLocalFileSafely } from "../infra/fs-safe.js";
3+
import { DEFAULT_MAX_BYTES } from "./defaults.constants.js";
34
import { describeImageWithModel } from "./image-runtime.js";
45
import {
56
buildMediaUnderstandingRegistry,
@@ -47,9 +48,9 @@ function resolveDecisionFailureReason(
4748
return normalizeDecisionReason(findDecisionReason(decision, "failed"));
4849
}
4950

50-
function buildFileContext(params: { filePath: string; mime?: string }) {
51+
function buildFileContext(params: { filePath: string; mediaUrl?: string; mime?: string }) {
5152
return {
52-
MediaPath: params.filePath,
53+
...(params.mediaUrl ? { MediaUrl: params.mediaUrl } : { MediaPath: params.filePath }),
5354
MediaType: params.mime,
5455
};
5556
}
@@ -165,12 +166,33 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo
165166
const timeoutMs = params.timeoutMs ?? 30_000;
166167
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
167168
const provider = providerRegistry.get(normalizeMediaProviderId(params.provider));
168-
const buffer = (await readLocalFileSafely({ filePath: params.filePath })).buffer;
169+
let buffer: Buffer;
170+
let fileName = path.basename(params.filePath);
171+
let mime = params.mime;
172+
if (params.mediaUrl) {
173+
const cache = createMediaAttachmentCache(normalizeMediaAttachments(buildFileContext(params)), {
174+
ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy,
175+
});
176+
try {
177+
const media = await cache.getBuffer({
178+
attachmentIndex: 0,
179+
maxBytes: DEFAULT_MAX_BYTES.image,
180+
timeoutMs,
181+
});
182+
buffer = media.buffer;
183+
fileName = media.fileName;
184+
mime = media.mime;
185+
} finally {
186+
await cache.cleanup();
187+
}
188+
} else {
189+
buffer = (await readLocalFileSafely({ filePath: params.filePath })).buffer;
190+
}
169191
const describeImage = provider?.describeImage ?? describeImageWithModel;
170192
return await describeImage({
171193
buffer,
172-
fileName: path.basename(params.filePath),
173-
mime: params.mime,
194+
fileName,
195+
mime,
174196
provider: params.provider,
175197
model: params.model,
176198
prompt: params.prompt,

src/media/fetch.test.ts

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -539,6 +539,24 @@ describe("readRemoteMediaBuffer", () => {
539539
});
540540
});
541541

542+
it("passes request timeout through the guarded fetch path", async () => {
543+
const fetchImpl = vi.fn(async () => new Response("ok", { status: 200 }));
544+
545+
await readRemoteMediaBuffer({
546+
url: "https://example.com/file.bin",
547+
fetchImpl,
548+
lookupFn: makeLookupFn(),
549+
maxBytes: 1024,
550+
timeoutMs: 1234,
551+
});
552+
553+
expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(1);
554+
expect(requireFetchGuardRequest()).toMatchObject({
555+
url: "https://example.com/file.bin",
556+
timeoutMs: 1234,
557+
});
558+
});
559+
542560
it("streams successful responses directly into the media store", async () => {
543561
const fetchImpl = vi.fn(
544562
async () =>

0 commit comments

Comments
 (0)