Skip to content

Commit f105e77

Browse files
committed
fix(codex): project raw image generation media
1 parent b3b962a commit f105e77

4 files changed

Lines changed: 307 additions & 5 deletions

File tree

extensions/codex/src/app-server/event-projector.test.ts

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ import fs from "node:fs/promises";
22
import os from "node:os";
33
import path from "node:path";
44
import type { EmbeddedRunAttemptParams } from "openclaw/plugin-sdk/agent-harness";
5-
import { resetAgentEventsForTest } from "openclaw/plugin-sdk/agent-harness-runtime";
5+
import {
6+
embeddedAgentLog,
7+
resetAgentEventsForTest,
8+
} from "openclaw/plugin-sdk/agent-harness-runtime";
69
import { SessionManager } from "openclaw/plugin-sdk/agent-sessions";
710
import {
811
onInternalDiagnosticEvent,
@@ -26,6 +29,8 @@ import { createCodexTestModel } from "./test-support.js";
2629
const THREAD_ID = "thread-1";
2730
const TURN_ID = "turn-1";
2831
const tempDirs = new Set<string>();
32+
const tinyPngBase64 =
33+
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+/p9sAAAAASUVORK5CYII=";
2934

3035
type ProjectorNotification = Parameters<CodexAppServerEventProjector["handleNotification"]>[0];
3136

@@ -102,6 +107,7 @@ afterEach(async () => {
102107
resetGlobalHookRunner();
103108
resetCodexRateLimitCacheForTests();
104109
vi.restoreAllMocks();
110+
vi.unstubAllEnvs();
105111
for (const tempDir of tempDirs) {
106112
await fs.rm(tempDir, { recursive: true, force: true });
107113
}
@@ -516,6 +522,136 @@ describe("CodexAppServerEventProjector", () => {
516522

517523
expect(result.assistantTexts).toStrictEqual([]);
518524
expect(result.toolMediaUrls).toEqual([savedPath]);
525+
expect(result.replayMetadata).toStrictEqual({
526+
hadPotentialSideEffects: true,
527+
replaySafe: false,
528+
});
529+
});
530+
531+
// A raw image_generation_call item carrying base64 bytes must end up as a saved PNG that is
// reported through toolMediaUrls, and the run must be flagged as not replay-safe.
it("saves raw Codex image-generation results as reply media", async () => {
  // Point the state dir at a throwaway directory; afterEach removes everything in tempDirs.
  const mediaStateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-codex-media-state-"));
  tempDirs.add(mediaStateDir);
  vi.stubEnv("OPENCLAW_STATE_DIR", mediaStateDir);
  const projector = await createProjector();

  const rawItem = {
    type: "image_generation_call",
    id: "ig_raw_1",
    status: "generating",
    result: tinyPngBase64,
    revised_prompt: "A tiny blue square",
  };
  await projector.handleNotification(forCurrentTurn("rawResponseItem/completed", { item: rawItem }));

  const result = projector.buildResult(buildEmptyToolTelemetry());
  const mediaUrl = result.toolMediaUrls?.[0];
  const expectedPathSegment = `${path.sep}media${path.sep}tool-image-generation${path.sep}`;
  const expectedBytes = Buffer.from(tinyPngBase64, "base64");

  expect(result.assistantTexts).toStrictEqual([]);
  expect(result.toolMediaUrls).toHaveLength(1);
  expect(mediaUrl).toContain(expectedPathSegment);
  expect(mediaUrl?.endsWith(".png")).toBe(true);
  // The file on disk must hold exactly the decoded payload.
  await expect(fs.readFile(mediaUrl ?? "")).resolves.toEqual(expectedBytes);
  expect(result.replayMetadata).toStrictEqual({
    hadPotentialSideEffects: true,
    replaySafe: false,
  });
});
564+
565+
// With a media cap far below the payload size, no media URL is produced and a warning is
// logged, yet the run is still reported as having potential side effects.
it("keeps raw image-generation results replay-invalid when media save fails", async () => {
  const warn = vi.spyOn(embeddedAgentLog, "warn").mockImplementation(() => undefined);
  const baseParams = await createParams();
  // A cap of roughly one byte guarantees the tiny PNG exceeds the limit.
  const cappedParams = {
    ...baseParams,
    config: { agents: { defaults: { mediaMaxMb: 0.000001 } } },
  } as EmbeddedRunAttemptParams;
  const projector = await createProjector(cappedParams);

  const rawItem = {
    type: "image_generation_call",
    id: "ig_raw_capped",
    status: "completed",
    result: tinyPngBase64,
  };
  await projector.handleNotification(forCurrentTurn("rawResponseItem/completed", { item: rawItem }));

  const result = projector.buildResult(buildEmptyToolTelemetry());

  expect(result.toolMediaUrls).toBeUndefined();
  expect(result.replayMetadata).toStrictEqual({
    hadPotentialSideEffects: true,
    replaySafe: false,
  });
  expect(warn).toHaveBeenCalledWith(
    "codex app-server raw image generation result exceeds media limit",
    expect.objectContaining({ itemId: "ig_raw_capped" }),
  );
});
595+
596+
// When the same item id arrives first as a raw item and then as a typed item with a savedPath,
// only one media URL is reported, and it is not the typed item's savedPath.
it("dedupes raw and typed Codex image-generation media for the same item", async () => {
  const mediaStateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-codex-media-state-"));
  tempDirs.add(mediaStateDir);
  vi.stubEnv("OPENCLAW_STATE_DIR", mediaStateDir);
  const projector = await createProjector();
  const savedPath = "/tmp/codex-home/generated_images/session-1/ig_123.png";

  const rawItem = {
    type: "image_generation_call",
    id: "ig_123",
    status: "generating",
    result: tinyPngBase64,
  };
  const typedItem = {
    type: "imageGeneration",
    id: "ig_123",
    status: "completed",
    revisedPrompt: "A tiny blue square",
    result: tinyPngBase64,
    savedPath,
  };

  // Raw item first, then the typed item delivered with turn completion.
  await projector.handleNotification(forCurrentTurn("rawResponseItem/completed", { item: rawItem }));
  await projector.handleNotification(turnCompleted([typedItem]));

  const result = projector.buildResult(buildEmptyToolTelemetry());

  expect(result.toolMediaUrls).toHaveLength(1);
  expect(result.toolMediaUrls?.[0]).not.toBe(savedPath);
});
631+
632+
// Two raw items with different ids but byte-identical payloads must yield two distinct URLs.
it("preserves distinct raw image-generation items with identical image bytes", async () => {
  const mediaStateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-codex-media-state-"));
  tempDirs.add(mediaStateDir);
  vi.stubEnv("OPENCLAW_STATE_DIR", mediaStateDir);
  const projector = await createProjector();

  const rawItemIds = ["ig_raw_1", "ig_raw_2"];
  // Deliver the notifications one at a time, in order.
  for (const rawItemId of rawItemIds) {
    const rawItem = {
      type: "image_generation_call",
      id: rawItemId,
      status: "generating",
      result: tinyPngBase64,
    };
    await projector.handleNotification(
      forCurrentTurn("rawResponseItem/completed", { item: rawItem }),
    );
  }

  const result = projector.buildResult(buildEmptyToolTelemetry());

  expect(result.toolMediaUrls).toHaveLength(2);
  expect(new Set(result.toolMediaUrls)).toHaveLength(2);
});
520656

521657
it("does not append native Codex image-generation media after explicit media delivery", async () => {

extensions/codex/src/app-server/event-projector.ts

Lines changed: 113 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ import {
2020
type ToolProgressDetailMode,
2121
} from "openclaw/plugin-sdk/agent-harness-runtime";
2222
import { emitTrustedDiagnosticEvent } from "openclaw/plugin-sdk/diagnostic-runtime";
23+
import { generatedImageAssetFromBase64 } from "openclaw/plugin-sdk/image-generation";
2324
import type { AssistantMessage, Usage } from "openclaw/plugin-sdk/llm";
25+
import { saveMediaBuffer } from "openclaw/plugin-sdk/media-store";
2426
import { resolveCodexLocalRuntimeAttribution } from "./local-runtime-attribution.js";
2527
import {
2628
readCodexNotificationThreadId,
@@ -106,6 +108,10 @@ const CODEX_PROMPT_TOTAL_INPUT_KEYS = [
106108

107109
const MAX_TOOL_OUTPUT_DELTA_MESSAGES_PER_ITEM = 20;
108110
const TOOL_TRANSCRIPT_OUTPUT_MAX_CHARS = 12_000;
111+
const GENERATED_IMAGE_MEDIA_SUBDIR = "tool-image-generation";
112+
const BYTES_PER_MB = 1024 * 1024;
113+
// Match OpenClaw's default image media cap for generated image tool outputs.
114+
const DEFAULT_GENERATED_IMAGE_MAX_BYTES = 6 * BYTES_PER_MB;
109115
const TRANSCRIPT_PROGRESS_SUPPRESSED_TOOL_NAMES = new Set([
110116
"message",
111117
"messages",
@@ -168,6 +174,8 @@ export class CodexAppServerEventProjector {
168174
private readonly transcriptToolProgressCallIds = new Set<string>();
169175
private lastNativeToolError: EmbeddedRunAttemptResult["lastToolError"];
170176
private readonly nativeGeneratedMediaUrls = new Set<string>();
177+
private readonly nativeGeneratedMediaItemIds = new Set<string>();
178+
private readonly nativeGeneratedMediaUrlsByItemId = new Map<string, string>();
171179
private readonly diagnosticToolStartedAtByItem = new Map<string, number>();
172180
private readonly afterToolCallObservedItemIds = new Set<string>();
173181
private assistantStarted = false;
@@ -252,7 +260,7 @@ export class CodexAppServerEventProjector {
252260
await this.handleTurnCompleted(params);
253261
break;
254262
case "rawResponseItem/completed":
255-
this.handleRawResponseItemCompleted(params);
263+
await this.handleRawResponseItemCompleted(params);
256264
break;
257265
case "error":
258266
if (readBooleanAlias(params, ["willRetry", "will_retry"]) === true) {
@@ -331,6 +339,7 @@ export class CodexAppServerEventProjector {
331339
const hadPotentialSideEffects =
332340
toolTelemetry.didSendViaMessagingTool ||
333341
(toolTelemetry.successfulCronAdds ?? 0) > 0 ||
342+
this.nativeGeneratedMediaItemIds.size > 0 ||
334343
this.sideEffectingToolItemIds.size > 0 ||
335344
this.sideEffectingDynamicToolCallIds.size > 0;
336345
return {
@@ -812,9 +821,13 @@ export class CodexAppServerEventProjector {
812821
});
813822
}
814823

815-
private handleRawResponseItemCompleted(params: JsonObject): void {
824+
private async handleRawResponseItemCompleted(params: JsonObject): Promise<void> {
816825
const item = isJsonObject(params.item) ? params.item : undefined;
817-
if (!item || readString(item, "role") !== "assistant") {
826+
if (!item) {
827+
return;
828+
}
829+
await this.recordRawGeneratedImageMedia(item);
830+
if (readString(item, "role") !== "assistant") {
818831
return;
819832
}
820833
const text = extractRawAssistantText(item);
@@ -839,10 +852,73 @@ export class CodexAppServerEventProjector {
839852
}
840853
const savedPath = readItemString(item, "savedPath")?.trim();
841854
if (savedPath) {
842-
this.nativeGeneratedMediaUrls.add(savedPath);
855+
this.recordNativeGeneratedMediaUrl({
856+
itemId: item.id,
857+
mediaUrl: savedPath,
858+
});
859+
}
860+
}
861+
862+
/**
 * Saves the base64 payload of a raw `image_generation_call` response item as reply media.
 *
 * Items of any other type, or without a non-empty string `result`, are ignored. An item that
 * carries a result is added to `nativeGeneratedMediaItemIds` before any size check or save, so
 * it stays tracked even when the payload is rejected or the save fails.
 *
 * An item whose media URL is already recorded is not decoded or saved again:
 * `recordNativeGeneratedMediaUrl` keeps only the first URL per item id, so a second save would
 * only leave an unreferenced file behind.
 *
 * Oversized payloads and save failures are logged as warnings instead of being thrown.
 *
 * @param item Raw response item taken from a `rawResponseItem/completed` notification.
 */
private async recordRawGeneratedImageMedia(item: JsonObject): Promise<void> {
  if (readString(item, "type") !== "image_generation_call") {
    return;
  }
  const result = readString(item, "result");
  if (!result) {
    return;
  }
  // Items without an id get a synthetic one derived from how many items were tracked so far.
  const itemId = readString(item, "id") ?? `raw-image-${this.nativeGeneratedMediaItemIds.size}`;
  this.nativeGeneratedMediaItemIds.add(itemId);
  if (this.nativeGeneratedMediaUrlsByItemId.has(itemId)) {
    // Media for this item is already recorded (for example the typed item arrived first, or
    // this notification is a repeat). Saving again would write a file nothing refers to.
    return;
  }
  const maxBytes = resolveGeneratedImageMaxBytes(this.params.config);
  // Reject oversized payloads from the encoded length alone, before decoding anything.
  const estimatedDecodedBytes = estimateBase64DecodedBytes(result);
  if (estimatedDecodedBytes !== undefined && estimatedDecodedBytes > maxBytes) {
    embeddedAgentLog.warn("codex app-server raw image generation result exceeds media limit", {
      itemId,
      estimatedDecodedBytes,
      maxBytes,
    });
    return;
  }
  const asset = generatedImageAssetFromBase64({
    base64: result,
    index: this.nativeGeneratedMediaItemIds.size,
    // Accept both the snake_case and camelCase spellings of the field.
    revisedPrompt: readString(item, "revised_prompt") ?? readString(item, "revisedPrompt"),
    fileNamePrefix: "codex-image-generation",
    sniffMimeType: true,
  });
  if (!asset) {
    return;
  }
  try {
    const saved = await saveMediaBuffer(
      asset.buffer,
      asset.mimeType,
      GENERATED_IMAGE_MEDIA_SUBDIR,
      maxBytes,
      asset.fileName,
    );
    this.recordNativeGeneratedMediaUrl({
      itemId,
      mediaUrl: saved.path,
    });
  } catch (error) {
    // Best effort: a failed save must not abort notification handling.
    embeddedAgentLog.warn("codex app-server raw image generation result save failed", {
      itemId,
      error,
    });
  }
}
845911

912+
/**
 * Registers a generated-media URL for an item, keeping only the first URL seen per item id.
 *
 * The item id is always added to `nativeGeneratedMediaItemIds`. The URL is stored in the
 * per-item map and in `nativeGeneratedMediaUrls` only when the item has no URL yet.
 *
 * @param params.itemId Identifier of the image-generation item.
 * @param params.mediaUrl Media location to associate with the item.
 */
private recordNativeGeneratedMediaUrl(params: { itemId: string; mediaUrl: string }): void {
  const { itemId, mediaUrl } = params;
  this.nativeGeneratedMediaItemIds.add(itemId);
  if (!this.nativeGeneratedMediaUrlsByItemId.has(itemId)) {
    this.nativeGeneratedMediaUrlsByItemId.set(itemId, mediaUrl);
    this.nativeGeneratedMediaUrls.add(mediaUrl);
  }
}
921+
846922
private buildToolMediaUrls(toolTelemetry: CodexAppServerToolTelemetry): string[] | undefined {
847923
const mediaUrls = new Set(
848924
toolTelemetry.toolMediaUrls?.map((url) => url.trim()).filter(Boolean) ?? [],
@@ -1583,6 +1659,39 @@ function readString(record: JsonObject, key: string): string | undefined {
15831659
return typeof value === "string" ? value : undefined;
15841660
}
15851661

1662+
/**
 * Estimates the decoded byte length of a base64 string without decoding it.
 *
 * Whitespace (as defined by `isBase64WhitespaceCode`) is skipped. Up to two trailing `=`
 * characters are treated as padding and subtracted from the estimate.
 *
 * @param base64 Base64 text, possibly containing whitespace.
 * @returns The estimated byte count, or `undefined` when the input has no non-whitespace
 *   characters.
 */
function estimateBase64DecodedBytes(base64: string): number | undefined {
  const padCode = 0x3d; // "="
  let significantChars = 0;
  // Sliding window over the two most recent non-whitespace code units.
  let secondToLast = -1;
  let last = -1;
  let index = 0;
  while (index < base64.length) {
    const unit = base64.charCodeAt(index);
    index += 1;
    if (!isBase64WhitespaceCode(unit)) {
      significantChars += 1;
      secondToLast = last;
      last = unit;
    }
  }
  if (significantChars === 0) {
    return undefined;
  }
  let padding = 0;
  if (last === padCode) {
    padding = secondToLast === padCode ? 2 : 1;
  }
  // Every 4 encoded characters carry 3 bytes; padding marks bytes that are not present.
  const unpaddedBytes = Math.floor((significantChars * 3) / 4);
  return Math.max(0, unpaddedBytes - padding);
}
1682+
1683+
/**
 * Reports whether a UTF-16 code unit is a space, tab, line feed, or carriage return.
 *
 * @param code Code unit, as returned by `String.prototype.charCodeAt`.
 */
function isBase64WhitespaceCode(code: number): boolean {
  switch (code) {
    case 0x09: // tab
    case 0x0a: // line feed
    case 0x0d: // carriage return
    case 0x20: // space
      return true;
    default:
      return false;
  }
}
1686+
1687+
/**
 * Resolves the byte cap applied to generated image media.
 *
 * Reads `agents.defaults.mediaMaxMb` from the config and converts it to whole bytes. Falls back
 * to `DEFAULT_GENERATED_IMAGE_MAX_BYTES` when the value is missing, not a finite number, or not
 * positive.
 *
 * @param config Run-attempt config; may be absent.
 * @returns The cap in bytes.
 */
function resolveGeneratedImageMaxBytes(config: EmbeddedRunAttemptParams["config"]): number {
  const mediaMaxMb = config?.agents?.defaults?.mediaMaxMb;
  if (typeof mediaMaxMb !== "number" || !Number.isFinite(mediaMaxMb) || mediaMaxMb <= 0) {
    return DEFAULT_GENERATED_IMAGE_MAX_BYTES;
  }
  return Math.floor(mediaMaxMb * BYTES_PER_MB);
}
1694+
15861695
function normalizeNonEmptyString(value: unknown): string | undefined {
15871696
if (typeof value !== "string") {
15881697
return undefined;

extensions/codex/src/app-server/run-attempt.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1935,6 +1935,11 @@ export async function runCodexAppServerAttempt(
19351935

19361936
try {
19371937
await completion;
1938+
// Timeout completion can win while a received notification is still being
1939+
// projected, for example while persisting raw image-generation media. Wait
1940+
// for already-queued projection work so the final result includes artifacts
1941+
// from the notification that triggered the idle watchdog.
1942+
await notificationQueue;
19381943
const result = activeProjector.buildResult(toolBridge.telemetry, { yieldDetected });
19391944
const finalAborted =
19401945
result.aborted || (runAbortController.signal.aborted && !clientClosedAbort);

0 commit comments

Comments
 (0)