Skip to content

Commit fa7de46

Browse files
committed
fix(cli): report missing infer media providers
1 parent 7985158 commit fa7de46

6 files changed

Lines changed: 145 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Docs: https://docs.openclaw.ai
2525

2626
### Fixes
2727

28+
- Infer/media: report missing image-understanding and audio-transcription provider configuration for `image describe`, `image describe-many`, and `audio transcribe` instead of blaming the input path when no provider is available. Fixes #73569 and supersedes #73593, #74288, and #74495. Thanks @bittoby, @tmimmanuel, @Linux2010, and @vyctorbrzezowski.
2829
- Active Memory: use the configured recall timeout as the blocking prompt-build hook budget by default and move cold-start setup grace behind explicit `setupGraceTimeoutMs` config, so the plugin no longer silently extends 15000 ms configs to 45000 ms on the main lane. Fixes #75843. Thanks @vishutdhar.
2930
- Plugins/web-provider: reuse the active gateway plugin registry for runtime web provider resolution after deriving the same candidate plugin ids as the loader path, avoiding a redundant `loadOpenClawPlugins` call on every request while preserving origin and scope filters. Fixes #75513. Thanks @jochen.
3031
- Crestodian/CLI: exit non-zero when interactive Crestodian is invoked without a TTY, so scripts and CI no longer treat the setup error as success. Fixes #73646 and supersedes #73928 and #74059. Thanks @bittoby, @luyao618, and @Linux2010.

src/cli/capability-cli.test.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -782,6 +782,51 @@ describe("capability cli", () => {
782782
);
783783
});
784784

785+
// Regression test: `describeImageFile` resolves with no text and a "skipped"
// decision whose only attachment has zero attempts. The `image describe`
// command must then exit with code 1 and log an error that names the missing
// image-understanding provider and the `agents.defaults.imageModel.primary`
// config key.
it("reports missing image understanding configuration for image describe", async () => {
786+
mocks.describeImageFile.mockResolvedValueOnce({
787+
text: undefined,
788+
decision: {
789+
capability: "image",
790+
outcome: "skipped",
791+
attachments: [{ attachmentIndex: 0, attempts: [] }],
792+
},
793+
} as never);
794+
795+
await expect(
796+
runRegisteredCli({
797+
register: registerCapabilityCli as (program: Command) => void,
798+
argv: ["capability", "image", "describe", "--file", "photo.jpg", "--json"],
799+
}),
800+
).rejects.toThrow("exit 1");
801+
expect(mocks.runtime.error).toHaveBeenCalledWith(
802+
expect.stringContaining("No image understanding provider is configured or ready"),
803+
);
804+
expect(mocks.runtime.error).toHaveBeenCalledWith(
805+
expect.stringContaining("agents.defaults.imageModel.primary"),
806+
);
807+
});
808+
809+
// Regression test: same mocked result as the single-file case (no text,
// "skipped" decision, one attachment with zero attempts), but driven through
// `image describe-many` with a single `--file`. The command must exit with
// code 1 and log the missing image-understanding provider error.
it("reports missing image understanding configuration for image describe-many", async () => {
810+
mocks.describeImageFile.mockResolvedValueOnce({
811+
text: undefined,
812+
decision: {
813+
capability: "image",
814+
outcome: "skipped",
815+
attachments: [{ attachmentIndex: 0, attempts: [] }],
816+
},
817+
} as never);
818+
819+
await expect(
820+
runRegisteredCli({
821+
register: registerCapabilityCli as (program: Command) => void,
822+
argv: ["capability", "image", "describe-many", "--file", "photo.jpg", "--json"],
823+
}),
824+
).rejects.toThrow("exit 1");
825+
expect(mocks.runtime.error).toHaveBeenCalledWith(
826+
expect.stringContaining("No image understanding provider is configured or ready"),
827+
);
828+
});
829+
785830
it("rewrites mismatched explicit image output extensions to the detected file type", async () => {
786831
const jpegBase64 =
787832
"/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxAQEBUQEBAVFRUVFRUVFRUVFRUVFRUVFRUXFhUVFRUYHSggGBolHRUVITEhJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGi0fHyUtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLf/AABEIAAEAAQMBIgACEQEDEQH/xAAXAAEBAQEAAAAAAAAAAAAAAAAAAQID/8QAFhEBAQEAAAAAAAAAAAAAAAAAAAER/9oADAMBAAIQAxAAAAH2AP/EABgQAQEAAwAAAAAAAAAAAAAAAAEAEQIS/9oACAEBAAEFAk1o7//EABYRAQEBAAAAAAAAAAAAAAAAAAABEf/aAAgBAwEBPwGn/8QAFhEBAQEAAAAAAAAAAAAAAAAAABEB/9oACAECAQE/AYf/xAAaEAACAgMAAAAAAAAAAAAAAAABEQAhMUFh/9oACAEBAAY/AjK9cY2f/8QAGhABAQACAwAAAAAAAAAAAAAAAAERITFBUf/aAAgBAQABPyGQk7W5jVYkA//Z";
@@ -1278,6 +1323,30 @@ describe("capability cli", () => {
12781323
);
12791324
});
12801325

1326+
// Regression test: `transcribeAudioFile` resolves with no text and a "skipped"
// decision whose only attachment has zero attempts. The `audio transcribe`
// command must then exit with code 1 and log an error that names the missing
// audio-transcription provider and the `tools.media.audio.models` config key.
it("reports missing audio transcription configuration for audio transcribe", async () => {
1327+
mocks.transcribeAudioFile.mockResolvedValueOnce({
1328+
text: undefined,
1329+
decision: {
1330+
capability: "audio",
1331+
outcome: "skipped",
1332+
attachments: [{ attachmentIndex: 0, attempts: [] }],
1333+
},
1334+
} as never);
1335+
1336+
await expect(
1337+
runRegisteredCli({
1338+
register: registerCapabilityCli as (program: Command) => void,
1339+
argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"],
1340+
}),
1341+
).rejects.toThrow("exit 1");
1342+
expect(mocks.runtime.error).toHaveBeenCalledWith(
1343+
expect.stringContaining("No audio transcription provider is configured or ready"),
1344+
);
1345+
expect(mocks.runtime.error).toHaveBeenCalledWith(
1346+
expect.stringContaining("tools.media.audio.models"),
1347+
);
1348+
});
1349+
12811350
it("surfaces the underlying transcription failure for audio transcribe", async () => {
12821351
mocks.transcribeAudioFile.mockRejectedValueOnce(
12831352
new Error("Audio transcription response missing text"),

src/cli/capability-cli.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import type {
3030
ImageGenerationOutputFormat,
3131
} from "../image-generation/types.js";
3232
import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js";
33+
import type { RunMediaUnderstandingFileResult } from "../media-understanding/runtime-types.js";
3334
import {
3435
describeImageFile,
3536
describeImageFileWithModel,
@@ -964,6 +965,11 @@ async function runImageDescribe(params: {
964965
timeoutMs: params.timeoutMs,
965966
});
966967
if (!result.text) {
968+
if (isMissingMediaUnderstandingProvider(result)) {
969+
throw new Error(
970+
"No image understanding provider is configured or ready. Configure tools.media.image.models or agents.defaults.imageModel.primary, or pass --model <provider/model> after configuring that provider's auth/API key.",
971+
);
972+
}
967973
throw new Error(`No description returned for image: ${resolvedPath}`);
968974
}
969975
return {
@@ -986,6 +992,15 @@ async function runImageDescribe(params: {
986992
} satisfies CapabilityEnvelope;
987993
}
988994

995+
/**
 * Reports whether a media-understanding result has the shape this CLI treats
 * as "no provider was configured or ready".
 *
 * The result qualifies only when all of the following hold:
 * - it carries a `decision` whose `outcome` is `"skipped"`;
 * - the decision lists at least one attachment;
 * - every listed attachment has an empty `attempts` array.
 *
 * `runImageDescribe` and `runAudioTranscribe` call this after finding
 * `result.text` empty, so they can throw a provider-configuration error
 * instead of the generic "No description/transcript returned" error.
 *
 * @param result - Result to inspect; `decision` is optional, and a result
 *   without one yields `false`.
 * @returns `true` when the decision was skipped with zero attempts on every
 *   attachment, `false` otherwise.
 */
function isMissingMediaUnderstandingProvider(result: RunMediaUnderstandingFileResult): boolean {
996+
const decision = result.decision;
997+
return (
998+
decision?.outcome === "skipped" &&
999+
decision.attachments.length > 0 && // `every` is vacuously true on an empty array, so require at least one attachment
1000+
decision.attachments.every((attachment) => attachment.attempts.length === 0)
1001+
);
1002+
}
1003+
9891004
async function runAudioTranscribe(params: {
9901005
file: string;
9911006
language?: string;
@@ -1002,6 +1017,11 @@ async function runAudioTranscribe(params: {
10021017
prompt: params.prompt,
10031018
});
10041019
if (!result.text) {
1020+
if (isMissingMediaUnderstandingProvider(result)) {
1021+
throw new Error(
1022+
"No audio transcription provider is configured or ready. Configure tools.media.audio.models, or pass --model <provider/model> after configuring that provider's auth/API key.",
1023+
);
1024+
}
10051025
throw new Error(`No transcript returned for audio: ${path.resolve(params.file)}`);
10061026
}
10071027
return {

src/media-understanding/runtime-types.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import type { OpenClawConfig } from "../config/types.js";
22
import type { ActiveMediaModel } from "./active-model.types.js";
3-
import type { MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.js";
3+
import type {
4+
MediaUnderstandingDecision,
5+
MediaUnderstandingOutput,
6+
MediaUnderstandingProvider,
7+
} from "./types.js";
48

59
export type RunMediaUnderstandingFileParams = {
610
capability: "image" | "audio" | "video";
@@ -18,6 +22,7 @@ export type RunMediaUnderstandingFileResult = {
1822
provider?: string;
1923
model?: string;
2024
output?: MediaUnderstandingOutput;
25+
decision?: MediaUnderstandingDecision;
2126
};
2227

2328
export type DescribeImageFileParams = {
@@ -73,5 +78,7 @@ export type MediaUnderstandingRuntime = {
7378
params: DescribeImageFileWithModelParams,
7479
) => Promise<DescribeImageFileWithModelResult>;
7580
describeVideoFile: (params: DescribeVideoFileParams) => Promise<RunMediaUnderstandingFileResult>;
76-
transcribeAudioFile: (params: TranscribeAudioFileParams) => Promise<{ text: string | undefined }>;
81+
transcribeAudioFile: (
82+
params: TranscribeAudioFileParams,
83+
) => Promise<RunMediaUnderstandingFileResult>;
7784
};

src/media-understanding/runtime.test.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,46 @@ describe("media-understanding runtime", () => {
6363
provider: undefined,
6464
model: undefined,
6565
output: undefined,
66+
decision: { capability: "image", outcome: "disabled", attachments: [] },
6667
});
6768

6869
expect(mocks.buildProviderRegistry).not.toHaveBeenCalled();
6970
expect(mocks.runCapability).not.toHaveBeenCalled();
7071
});
7172

73+
// Verifies that `runMediaUnderstandingFile` passes through the `decision`
// returned by the mocked `runCapability` when it yields no outputs: the
// resolved value has undefined text/provider/model/output plus the same
// "skipped" decision object. Also checks the `cleanup` mock ran exactly once.
it("preserves skipped decisions when no media provider is available", async () => {
74+
const decision = {
75+
capability: "audio" as const,
76+
outcome: "skipped" as const,
77+
attachments: [{ attachmentIndex: 0, attempts: [] }],
78+
};
79+
mocks.normalizeMediaAttachments.mockReturnValue([
80+
{ index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },
81+
]);
82+
mocks.runCapability.mockResolvedValue({
83+
outputs: [],
84+
decision,
85+
});
86+
87+
await expect(
88+
runMediaUnderstandingFile({
89+
capability: "audio",
90+
filePath: "/tmp/sample.ogg",
91+
mime: "audio/ogg",
92+
cfg: {} as OpenClawConfig,
93+
agentDir: "/tmp/agent",
94+
}),
95+
).resolves.toEqual({
96+
text: undefined,
97+
provider: undefined,
98+
model: undefined,
99+
output: undefined,
100+
decision,
101+
});
102+
103+
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
104+
});
105+
72106
it("returns the matching capability output", async () => {
73107
const output: MediaUnderstandingOutput = {
74108
kind: "image.description",

src/media-understanding/runtime.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,10 @@ export async function runMediaUnderstandingFile(
8484
const ctx = buildFileContext(params);
8585
const attachments = normalizeMediaAttachments(ctx);
8686
if (attachments.length === 0) {
87-
return { text: undefined };
87+
return {
88+
text: undefined,
89+
decision: { capability: params.capability, outcome: "no-attachment", attachments: [] },
90+
};
8891
}
8992
const config = cfg.tools?.media?.[params.capability];
9093
if (config?.enabled === false) {
@@ -93,6 +96,7 @@ export async function runMediaUnderstandingFile(
9396
provider: undefined,
9497
model: undefined,
9598
output: undefined,
99+
decision: { capability: params.capability, outcome: "disabled", attachments: [] },
96100
};
97101
}
98102

@@ -124,12 +128,16 @@ export async function runMediaUnderstandingFile(
124128
(entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
125129
);
126130
const text = output?.text?.trim();
127-
return {
131+
const fileResult: RunMediaUnderstandingFileResult = {
128132
text: text || undefined,
129133
provider: output?.provider,
130134
model: output?.model,
131135
output,
132136
};
137+
if (result.decision) {
138+
fileResult.decision = result.decision;
139+
}
140+
return fileResult;
133141
} finally {
134142
await cache.cleanup();
135143
}
@@ -171,7 +179,7 @@ export async function describeVideoFile(
171179

172180
export async function transcribeAudioFile(
173181
params: TranscribeAudioFileParams,
174-
): Promise<{ text: string | undefined }> {
182+
): Promise<RunMediaUnderstandingFileResult> {
175183
const cfg =
176184
params.language || params.prompt
177185
? {
@@ -192,5 +200,5 @@ export async function transcribeAudioFile(
192200
}
193201
: params.cfg;
194202
const result = await runMediaUnderstandingFile({ ...params, cfg, capability: "audio" });
195-
return { text: result.text };
203+
return result;
196204
}

0 commit comments

Comments
 (0)