Skip to content

Commit 118d95a

Browse files
committed
fix(codex): hydrate queued inbound images
1 parent 1e5450f commit 118d95a

5 files changed

Lines changed: 166 additions & 86 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ Docs: https://docs.openclaw.ai
3838

3939
### Fixes
4040

41+
- Codex app-server: hydrate current inbound image attachments before queued runs so Responses-backed agents receive Discord and other channel images as native vision input. Fixes #83466. Thanks @iannwu.
4142
- Codex app-server: preserve network access for sandboxed Codex code-mode turns when the OpenClaw sandbox allows outbound egress. Fixes #83347. Thanks @YusukeIt0.
4243
- QA-Lab: keep the OTLP smoke decoder independent of removed OpenTelemetry generated-root internals.
4344
- Messages: default group/channel visible replies to automatic final delivery again, keeping `message_tool` opt-in for ambient/shared rooms and tool-reliable models.

src/auto-reply/reply/agent-runner-execution.ts

Lines changed: 8 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import crypto from "node:crypto";
2-
import type { ImageContent } from "@earendil-works/pi-ai";
32
import {
43
hasOutboundReplyContent,
54
resolveSendableOutboundReplyParts,
@@ -95,8 +94,8 @@ import {
9594
resolveQueuedReplyRuntimeConfig,
9695
resolveModelFallbackOptions,
9796
} from "./agent-runner-utils.js";
98-
import { resolveAgentTurnAttachments } from "./agent-turn-attachments.js";
9997
import { type BlockReplyPipeline } from "./block-reply-pipeline.js";
98+
import { resolveCurrentTurnImages } from "./current-turn-images.js";
10099
import { resolveOriginMessageProvider } from "./origin-routing.js";
101100
import {
102101
classifyProviderRequestError,
@@ -168,80 +167,6 @@ const FALLBACK_SELECTION_STATE_KEYS = [
168167
"authProfileOverrideCompactionCount",
169168
] as const satisfies ReadonlyArray<keyof FallbackSelectionState>;
170169

171-
function countCurrentPiImageAttachmentCandidates(ctx: TemplateContext): number {
172-
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
173-
const paths =
174-
pathsFromArray && pathsFromArray.length > 0
175-
? pathsFromArray
176-
: normalizeOptionalString(ctx.MediaPath)
177-
? [ctx.MediaPath]
178-
: [];
179-
if (paths.length === 0) {
180-
return 0;
181-
}
182-
const types =
183-
Array.isArray(ctx.MediaTypes) && ctx.MediaTypes.length === paths.length
184-
? ctx.MediaTypes
185-
: undefined;
186-
let count = 0;
187-
for (const [index, pathValue] of paths.entries()) {
188-
const mediaPath = normalizeOptionalString(pathValue);
189-
const mediaType = normalizeOptionalString(types?.[index] ?? ctx.MediaType);
190-
if (mediaPath && mediaType?.startsWith("image/")) {
191-
count++;
192-
}
193-
}
194-
return count;
195-
}
196-
197-
async function resolveNativePiTurnImages(params: {
198-
ctx: TemplateContext;
199-
cfg: OpenClawConfig;
200-
images?: ImageContent[];
201-
imageOrder?: GetReplyOptions["imageOrder"];
202-
}): Promise<{
203-
images?: ImageContent[];
204-
imageOrder?: GetReplyOptions["imageOrder"];
205-
}> {
206-
if (Array.isArray(params.images) && params.images.length > 0) {
207-
return { images: params.images, imageOrder: params.imageOrder };
208-
}
209-
210-
const currentImageCandidateCount = countCurrentPiImageAttachmentCandidates(params.ctx);
211-
if (currentImageCandidateCount === 0) {
212-
return { images: params.images, imageOrder: params.imageOrder };
213-
}
214-
215-
try {
216-
const resolved = await resolveAgentTurnAttachments({
217-
ctx: params.ctx,
218-
cfg: params.cfg,
219-
includeRecentHistoryImages: false,
220-
});
221-
const images = resolved.attachments.map(
222-
(attachment): ImageContent => ({
223-
type: "image",
224-
data: attachment.data,
225-
mimeType: attachment.mediaType,
226-
}),
227-
);
228-
if (images.length < currentImageCandidateCount) {
229-
logVerbose(
230-
`agent-runner: native PI media resolution produced ${images.length}/${currentImageCandidateCount} current image attachment(s); falling back to prompt image refs`,
231-
);
232-
return { images: params.images, imageOrder: params.imageOrder };
233-
}
234-
return images.length > 0
235-
? { images, imageOrder: images.map(() => "inline" as const) }
236-
: { images: params.images, imageOrder: params.imageOrder };
237-
} catch (error) {
238-
logVerbose(
239-
`agent-runner: media attachment image resolution failed, proceeding without native images: ${formatErrorMessage(error)}`,
240-
);
241-
return { images: params.images, imageOrder: params.imageOrder };
242-
}
243-
}
244-
245170
function setFallbackSelectionStateField(
246171
entry: SessionEntry,
247172
key: keyof FallbackSelectionState,
@@ -1252,11 +1177,11 @@ export async function runAgentTurnWithFallback(params: {
12521177
requesterSenderUsername: params.followupRun.run.senderUsername,
12531178
requesterSenderE164: params.followupRun.run.senderE164,
12541179
});
1255-
const nativePiTurnImages = await resolveNativePiTurnImages({
1180+
const currentTurnImages = await resolveCurrentTurnImages({
12561181
ctx: params.sessionCtx,
12571182
cfg: runtimeConfig,
1258-
images: params.opts?.images,
1259-
imageOrder: params.opts?.imageOrder,
1183+
images: params.followupRun.images ?? params.opts?.images,
1184+
imageOrder: params.followupRun.imageOrder ?? params.opts?.imageOrder,
12601185
});
12611186
let didNotifyAgentRunStart = false;
12621187
const notifyAgentRunStart = () => {
@@ -1770,8 +1695,8 @@ export async function runAgentTurnWithFallback(params: {
17701695
bootstrapPromptWarningSignaturesSeen[
17711696
bootstrapPromptWarningSignaturesSeen.length - 1
17721697
],
1773-
images: params.opts?.images,
1774-
imageOrder: params.opts?.imageOrder,
1698+
images: currentTurnImages.images,
1699+
imageOrder: currentTurnImages.imageOrder,
17751700
skillsSnapshot: params.followupRun.run.skillsSnapshot,
17761701
messageChannel: params.followupRun.originatingChannel ?? undefined,
17771702
messageProvider: hookMessageProvider,
@@ -1889,8 +1814,8 @@ export async function runAgentTurnWithFallback(params: {
18891814
forceHeartbeatTool: params.opts?.forceHeartbeatTool,
18901815
bootstrapContextMode: params.opts?.bootstrapContextMode,
18911816
bootstrapContextRunKind: params.opts?.isHeartbeat ? "heartbeat" : "default",
1892-
images: nativePiTurnImages.images,
1893-
imageOrder: nativePiTurnImages.imageOrder,
1817+
images: currentTurnImages.images,
1818+
imageOrder: currentTurnImages.imageOrder,
18941819
abortSignal: params.replyOperation?.abortSignal ?? params.opts?.abortSignal,
18951820
replyOperation: params.replyOperation,
18961821
blockReplyBreak: params.resolvedBlockStreamingBreak,
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import type { ImageContent } from "@earendil-works/pi-ai";
2+
import type { OpenClawConfig } from "../../config/types.openclaw.js";
3+
import { logVerbose } from "../../globals.js";
4+
import { formatErrorMessage } from "../../infra/errors.js";
5+
import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js";
6+
import { normalizeOptionalString } from "../../shared/string-coerce.js";
7+
import type { MsgContext } from "../templating.js";
8+
import { resolveAgentTurnAttachments } from "./agent-turn-attachments.js";
9+
10+
/**
 * Counts how many media entries on the inbound context look like images.
 *
 * Paths come from `ctx.MediaPaths` when it is a non-empty array; otherwise the
 * single `ctx.MediaPath` is used when it normalizes to a truthy string.
 * An entry is counted when its normalized path is truthy and its normalized
 * media type starts with `"image/"`.
 *
 * @param ctx - Inbound message context carrying the media path/type fields.
 * @returns Number of image candidates; `0` when no paths are present.
 */
function countCurrentImageAttachmentCandidates(ctx: MsgContext): number {
11+
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
12+
const paths =
13+
pathsFromArray && pathsFromArray.length > 0
14+
? pathsFromArray
15+
: normalizeOptionalString(ctx.MediaPath)
16+
? [ctx.MediaPath]
17+
: [];
18+
if (paths.length === 0) {
19+
return 0;
20+
}
21+
// Per-path types are only used when MediaTypes lines up one-to-one with the
// paths; on a length mismatch every entry falls back to ctx.MediaType below.
const types =
22+
Array.isArray(ctx.MediaTypes) && ctx.MediaTypes.length === paths.length
23+
? ctx.MediaTypes
24+
: undefined;
25+
let count = 0;
26+
for (const [index, pathValue] of paths.entries()) {
27+
const mediaPath = normalizeOptionalString(pathValue);
28+
// A nullish per-index type also falls back to the single ctx.MediaType.
const mediaType = normalizeOptionalString(types?.[index] ?? ctx.MediaType);
29+
if (mediaPath && mediaType?.startsWith("image/")) {
30+
count++;
31+
}
32+
}
33+
return count;
34+
}
35+
36+
/**
 * Resolves the image inputs for the current turn.
 *
 * Caller-supplied `images` win when non-empty. Otherwise, if the context has
 * image candidates, they are loaded through `resolveAgentTurnAttachments`
 * (with `includeRecentHistoryImages: false`) and returned as `ImageContent`
 * entries with an all-`"inline"` image order.
 *
 * This function does not throw on resolution failure: errors are logged via
 * `logVerbose` and the caller-supplied `images` / `imageOrder` are returned
 * unchanged.
 *
 * @param params.ctx - Inbound message context inspected for media paths/types.
 * @param params.cfg - Config forwarded to `resolveAgentTurnAttachments`.
 * @param params.images - Already-resolved images, if any.
 * @param params.imageOrder - Order entries accompanying `params.images`.
 * @returns The images and image order to use for the turn.
 */
export async function resolveCurrentTurnImages(params: {
37+
ctx: MsgContext;
38+
cfg: OpenClawConfig;
39+
images?: ImageContent[];
40+
imageOrder?: PromptImageOrderEntry[];
41+
}): Promise<{
42+
images?: ImageContent[];
43+
imageOrder?: PromptImageOrderEntry[];
44+
}> {
45+
// Already-provided images are passed through untouched.
if (Array.isArray(params.images) && params.images.length > 0) {
46+
return { images: params.images, imageOrder: params.imageOrder };
47+
}
48+
49+
const currentImageCandidateCount = countCurrentImageAttachmentCandidates(params.ctx);
50+
// Nothing image-like on the context: skip attachment resolution entirely.
if (currentImageCandidateCount === 0) {
51+
return { images: params.images, imageOrder: params.imageOrder };
52+
}
53+
54+
try {
55+
const resolved = await resolveAgentTurnAttachments({
56+
ctx: params.ctx,
57+
cfg: params.cfg,
58+
includeRecentHistoryImages: false,
59+
});
60+
const images = resolved.attachments.map(
61+
(attachment): ImageContent => ({
62+
type: "image",
63+
data: attachment.data,
64+
mimeType: attachment.mediaType,
65+
}),
66+
);
67+
// Partial resolution is treated as a miss: return the caller's values rather
// than a subset of the candidate images.
if (images.length < currentImageCandidateCount) {
68+
logVerbose(
69+
`agent-runner: native PI media resolution produced ${images.length}/${currentImageCandidateCount} current image attachment(s); falling back to prompt image refs`,
70+
);
71+
return { images: params.images, imageOrder: params.imageOrder };
72+
}
73+
// NOTE(review): the candidate count is >= 1 here and images.length >= that
// count, so the else-arm below looks unreachable — confirm before removing.
return images.length > 0
74+
? { images, imageOrder: images.map(() => "inline" as const) }
75+
: { images: params.images, imageOrder: params.imageOrder };
76+
} catch (error) {
77+
logVerbose(
78+
`agent-runner: media attachment image resolution failed, proceeding without native images: ${formatErrorMessage(error)}`,
79+
);
80+
return { images: params.images, imageOrder: params.imageOrder };
81+
}
82+
}

src/auto-reply/reply/get-reply-run.media-only.test.ts

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
import { mkdtemp, rm, writeFile } from "node:fs/promises";
2+
import os from "node:os";
3+
import path from "node:path";
14
import { importFreshModule } from "openclaw/plugin-sdk/test-fixtures";
2-
import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
5+
import { afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
36
import {
47
clearActiveEmbeddedRun,
58
setActiveEmbeddedRun,
@@ -269,6 +272,8 @@ function requireLastRunReplyAgentCall() {
269272
}
270273

271274
describe("runPreparedReply media-only handling", () => {
275+
const cleanupPaths: string[] = [];
276+
272277
beforeAll(async () => {
273278
({ runPreparedReply } = await import("./get-reply-run.js"));
274279
({ runReplyAgent } = await import("./agent-runner.runtime.js"));
@@ -289,6 +294,11 @@ describe("runPreparedReply media-only handling", () => {
289294
replyRunTesting.resetReplyRunRegistry();
290295
});
291296

297+
// Removes every path registered in cleanupPaths during the test.
afterEach(() => {
298+
// splice(0) empties the shared list and hands back the removed entries.
const paths = cleanupPaths.splice(0);
299+
// force: true makes rm ignore paths that no longer exist.
return Promise.all(paths.map((entry) => rm(entry, { recursive: true, force: true })));
300+
});
301+
292302
it("does not load session store runtime on module import", async () => {
293303
await loadFreshGetReplyRunModuleForTest();
294304

@@ -870,6 +880,59 @@ describe("runPreparedReply media-only handling", () => {
870880
expect(call?.followupRun.prompt).toContain("[User sent media without caption]");
871881
});
872882

883+
// Writes a PNG to a temp dir, references it through MediaPaths/MediaTypes
// on both ctx and sessionCtx, and checks that the followupRun handed to
// runReplyAgent carries it as an inline image.
it("hydrates current MediaPaths into queued followup images", async () => {
884+
const tmpDir = await mkdtemp(path.join(os.tmpdir(), "openclaw-followup-image-"));
885+
// Registered so the afterEach hook deletes the directory.
cleanupPaths.push(tmpDir);
886+
const imagePath = path.join(tmpDir, "inbound.png");
887+
await writeFile(
888+
imagePath,
889+
Buffer.from(
890+
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+/p9sAAAAASUVORK5CYII=",
891+
"base64",
892+
),
893+
);
894+
895+
const result = await runPreparedReply(
896+
baseParams({
897+
ctx: {
898+
Body: "describe this",
899+
RawBody: "describe this",
900+
CommandBody: "describe this",
901+
MediaPaths: [imagePath],
902+
MediaTypes: ["image/png"],
903+
MediaWorkspaceDir: tmpDir,
904+
OriginatingChannel: "discord",
905+
OriginatingTo: "C123",
906+
ChatType: "group",
907+
},
908+
sessionCtx: {
909+
Body: "describe this",
910+
BodyStripped: "describe this",
911+
Provider: "discord",
912+
OriginatingChannel: "discord",
913+
OriginatingTo: "C123",
914+
ChatType: "group",
915+
MediaPaths: [imagePath],
916+
MediaTypes: ["image/png"],
917+
MediaWorkspaceDir: tmpDir,
918+
},
919+
}),
920+
);
921+
922+
expect(result).toEqual({ text: "ok" });
923+
expect(vi.mocked(runReplyAgent)).toHaveBeenCalledOnce();
924+
const call = requireRunReplyAgentCall();
925+
expect(call.followupRun.images).toEqual([
926+
{
927+
type: "image",
928+
data: expect.any(String),
929+
mimeType: "image/png",
930+
},
931+
]);
932+
// 92 is the character length of the base64 fixture literal written above.
expect(call.followupRun.images?.[0]?.data).toHaveLength(92);
933+
expect(call.followupRun.imageOrder).toEqual(["inline"]);
934+
});
935+
873936
it("does not send a standalone reset notice for reply-producing /new turns", async () => {
874937
await runPreparedReply(
875938
baseParams({

src/auto-reply/reply/get-reply-run.ts

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ import { SILENT_REPLY_TOKEN } from "../tokens.js";
5454
import type { GetReplyOptions, ReplyPayload } from "../types.js";
5555
import { applySessionHints } from "./body.js";
5656
import type { buildCommandContext } from "./commands.js";
57+
import { resolveCurrentTurnImages } from "./current-turn-images.js";
5758
import type { InlineDirectives } from "./directive-handling.js";
5859
import { isSystemEventProvider } from "./effective-reply-route.js";
5960
import { shouldUseReplyFastTestRuntime } from "./get-reply-fast-path.js";
@@ -1035,6 +1036,14 @@ export async function runPreparedReply(
10351036
ctx,
10361037
sessionKey,
10371038
});
1039+
const currentTurnImages = await traceRunPhase("reply.resolve_current_turn_images", () =>
1040+
resolveCurrentTurnImages({
1041+
ctx,
1042+
cfg,
1043+
images: opts?.images,
1044+
imageOrder: opts?.imageOrder,
1045+
}),
1046+
);
10381047
const followupRun = {
10391048
prompt: queuedBody,
10401049
transcriptPrompt: transcriptCommandBody,
@@ -1046,8 +1055,8 @@ export async function runPreparedReply(
10461055
messageId: sessionCtx.MessageSidFull ?? sessionCtx.MessageSid,
10471056
summaryLine: baseBodyTrimmedRaw,
10481057
enqueuedAt: Date.now(),
1049-
images: opts?.images,
1050-
imageOrder: opts?.imageOrder,
1058+
images: currentTurnImages.images,
1059+
imageOrder: currentTurnImages.imageOrder,
10511060
// Originating channel for reply routing.
10521061
originatingChannel: ctx.OriginatingChannel,
10531062
originatingTo: ctx.OriginatingTo,

0 commit comments

Comments
 (0)