fix: restore inbound image embedding for CLI-routed BlueBubbles turns (#51373)

tyler6204 · web-flow · commit 00e932a83cdd · 2026-03-26T15:47:44.000+09:00
* fix(cli): hydrate prompt image refs for inbound media * Agents: harden CLI prompt image hydration (#51373) * test: fix CLI prompt image hydration helper mocks
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- BlueBubbles/CLI agents: restore inbound prompt image refs for CLI-routed turns, reapply embedded-runner image size guardrails, and cover both CLI image transport paths with regression tests. (#51373)
 - OpenAI Codex/image tools: register Codex for media understanding and route image prompts through Codex instructions so image analysis no longer fails on missing provider registration or missing `instructions`. (#54829) Thanks @neeravmakwana.
 - Telegram: deliver verbose tool summaries inside forum topic sessions again, so threaded topic chats now match DM verbose behavior. (#43236) Thanks @frankbuild.
 - Agents/sandbox: honor `tools.sandbox.tools.alsoAllow`, let explicit sandbox re-allows remove matching built-in default-deny tools, and keep sandbox explain/error guidance aligned with the effective sandbox tool policy. (#54492) Thanks @ngutman.
diff --git a/src/agents/cli-runner.helpers.test.ts b/src/agents/cli-runner.helpers.test.ts
@@ -0,0 +1,104 @@
+import type { ImageContent } from "@mariozechner/pi-ai";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import { MAX_IMAGE_BYTES } from "../media/constants.js";
+import { loadPromptRefImages } from "./cli-runner/helpers.js";
+import * as promptImageUtils from "./pi-embedded-runner/run/images.js";
+import type { SandboxFsBridge } from "./sandbox/fs-bridge.js";
+import * as toolImages from "./tool-images.js";
+
+describe("loadPromptRefImages", () => {
+  beforeEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it("returns empty results when the prompt has no image refs", async () => {
+    const loadImageFromRefSpy = vi.spyOn(promptImageUtils, "loadImageFromRef");
+    const sanitizeImageBlocksSpy = vi.spyOn(toolImages, "sanitizeImageBlocks");
+
+    await expect(
+      loadPromptRefImages({
+        prompt: "just text",
+        workspaceDir: "/workspace",
+      }),
+    ).resolves.toEqual([]);
+
+    expect(loadImageFromRefSpy).not.toHaveBeenCalled();
+    expect(sanitizeImageBlocksSpy).not.toHaveBeenCalled();
+  });
+
+  it("passes the max-byte guardrail through load and sanitize", async () => {
+    const loadedImage: ImageContent = {
+      type: "image",
+      data: "c29tZS1pbWFnZQ==",
+      mimeType: "image/png",
+    };
+    const sanitizedImage: ImageContent = {
+      type: "image",
+      data: "c2FuaXRpemVkLWltYWdl",
+      mimeType: "image/jpeg",
+    };
+    const sandbox = {
+      root: "/sandbox",
+      bridge: {} as SandboxFsBridge,
+    };
+
+    const loadImageFromRefSpy = vi
+      .spyOn(promptImageUtils, "loadImageFromRef")
+      .mockResolvedValueOnce(loadedImage);
+    const sanitizeImageBlocksSpy = vi
+      .spyOn(toolImages, "sanitizeImageBlocks")
+      .mockResolvedValueOnce({ images: [sanitizedImage], dropped: 0 });
+
+    const result = await loadPromptRefImages({
+      prompt: "Look at /tmp/photo.png",
+      workspaceDir: "/workspace",
+      workspaceOnly: true,
+      sandbox,
+    });
+
+    const [ref, workspaceDir, options] = loadImageFromRefSpy.mock.calls[0] ?? [];
+    expect(ref).toMatchObject({ resolved: "/tmp/photo.png", type: "path" });
+    expect(workspaceDir).toBe("/workspace");
+    expect(options).toEqual({
+      maxBytes: MAX_IMAGE_BYTES,
+      workspaceOnly: true,
+      sandbox,
+    });
+    expect(sanitizeImageBlocksSpy).toHaveBeenCalledWith([loadedImage], "prompt:images", {
+      maxBytes: MAX_IMAGE_BYTES,
+    });
+    expect(result).toEqual([sanitizedImage]);
+  });
+
+  it("dedupes repeated refs and skips failed loads before sanitizing", async () => {
+    const loadedImage: ImageContent = {
+      type: "image",
+      data: "b25lLWltYWdl",
+      mimeType: "image/png",
+    };
+
+    const loadImageFromRefSpy = vi
+      .spyOn(promptImageUtils, "loadImageFromRef")
+      .mockResolvedValueOnce(loadedImage)
+      .mockResolvedValueOnce(null);
+    const sanitizeImageBlocksSpy = vi
+      .spyOn(toolImages, "sanitizeImageBlocks")
+      .mockResolvedValueOnce({ images: [loadedImage], dropped: 0 });
+
+    const result = await loadPromptRefImages({
+      prompt: "Compare /tmp/a.png with /tmp/a.png and /tmp/b.png",
+      workspaceDir: "/workspace",
+    });
+
+    expect(loadImageFromRefSpy).toHaveBeenCalledTimes(2);
+    expect(
+      loadImageFromRefSpy.mock.calls.map(
+        (call) => (call[0] as { resolved?: string } | undefined)?.resolved,
+      ),
+    ).toEqual(["/tmp/a.png", "/tmp/b.png"]);
+    expect(sanitizeImageBlocksSpy).toHaveBeenCalledWith([loadedImage], "prompt:images", {
+      maxBytes: MAX_IMAGE_BYTES,
+    });
+    expect(result).toEqual([loadedImage]);
+  });
+});
diff --git a/src/agents/cli-runner.test.ts b/src/agents/cli-runner.test.ts
@@ -3,13 +3,16 @@ import os from "node:os";
 import path from "node:path";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 import type { OpenClawConfig } from "../config/config.js";
+import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
 import { resolveCliNoOutputTimeoutMs } from "./cli-runner/helpers.js";
 import type { EmbeddedContextFile } from "./pi-embedded-helpers.js";
 import type { WorkspaceBootstrapFile } from "./workspace.js";
 
 const supervisorSpawnMock = vi.fn();
 const enqueueSystemEventMock = vi.fn();
 const requestHeartbeatNowMock = vi.fn();
+const SMALL_PNG_BASE64 =
+  "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
 const hoisted = vi.hoisted(() => {
   type BootstrapContext = {
     bootstrapFiles: WorkspaceBootstrapFile[];
@@ -325,6 +328,135 @@ describe("runCliAgent with process supervisor", () => {
     expect(promptCarrier).toContain("hi");
   });
 
+  it("hydrates prompt media refs into CLI image args", async () => {
+    supervisorSpawnMock.mockResolvedValueOnce(
+      createManagedRun({
+        reason: "exit",
+        exitCode: 0,
+        exitSignal: null,
+        durationMs: 50,
+        stdout: "ok",
+        stderr: "",
+        timedOut: false,
+        noOutputTimedOut: false,
+      }),
+    );
+
+    const tempDir = await fs.mkdtemp(
+      path.join(resolvePreferredOpenClawTmpDir(), "openclaw-cli-prompt-image-"),
+    );
+    const sourceImage = path.join(tempDir, "bb-image.png");
+    await fs.writeFile(sourceImage, Buffer.from(SMALL_PNG_BASE64, "base64"));
+
+    try {
+      await runCliAgent({
+        sessionId: "s1",
+        sessionFile: "/tmp/session.jsonl",
+        workspaceDir: tempDir,
+        prompt: `[media attached: ${sourceImage} (image/png)]\n\n<media:image>`,
+        provider: "codex-cli",
+        model: "gpt-5.2-codex",
+        timeoutMs: 1_000,
+        runId: "run-prompt-image",
+      });
+    } finally {
+      await fs.rm(tempDir, { recursive: true, force: true });
+    }
+
+    const input = supervisorSpawnMock.mock.calls[0]?.[0] as { argv?: string[] };
+    const argv = input.argv ?? [];
+    const imageArgIndex = argv.indexOf("--image");
+    expect(imageArgIndex).toBeGreaterThanOrEqual(0);
+    expect(argv[imageArgIndex + 1]).toContain("openclaw-cli-images-");
+    expect(argv[imageArgIndex + 1]).not.toBe(sourceImage);
+  });
+
+  it("appends hydrated prompt media refs to generic backend prompts", async () => {
+    supervisorSpawnMock.mockResolvedValueOnce(
+      createManagedRun({
+        reason: "exit",
+        exitCode: 0,
+        exitSignal: null,
+        durationMs: 50,
+        stdout: "ok",
+        stderr: "",
+        timedOut: false,
+        noOutputTimedOut: false,
+      }),
+    );
+
+    const tempDir = await fs.mkdtemp(
+      path.join(resolvePreferredOpenClawTmpDir(), "openclaw-cli-prompt-image-generic-"),
+    );
+    const sourceImage = path.join(tempDir, "claude-image.png");
+    await fs.writeFile(sourceImage, Buffer.from(SMALL_PNG_BASE64, "base64"));
+
+    try {
+      await runCliAgent({
+        sessionId: "s1",
+        sessionFile: "/tmp/session.jsonl",
+        workspaceDir: tempDir,
+        prompt: `[media attached: ${sourceImage} (image/png)]\n\n<media:image>`,
+        provider: "claude-cli",
+        model: "claude-opus-4-1",
+        timeoutMs: 1_000,
+        runId: "run-prompt-image-generic",
+      });
+    } finally {
+      await fs.rm(tempDir, { recursive: true, force: true });
+    }
+
+    const input = supervisorSpawnMock.mock.calls[0]?.[0] as { argv?: string[]; input?: string };
+    const argv = input.argv ?? [];
+    expect(argv).not.toContain("--image");
+    const promptCarrier = [input.input ?? "", ...argv].join("\n");
+    const appendedPath = argv.find((value) => value.includes("openclaw-cli-images-"));
+    expect(appendedPath).toBeDefined();
+    expect(appendedPath).not.toBe(sourceImage);
+    expect(promptCarrier).toContain(appendedPath ?? "");
+  });
+
+  it("prefers explicit images over prompt refs", async () => {
+    supervisorSpawnMock.mockResolvedValueOnce(
+      createManagedRun({
+        reason: "exit",
+        exitCode: 0,
+        exitSignal: null,
+        durationMs: 50,
+        stdout: "ok",
+        stderr: "",
+        timedOut: false,
+        noOutputTimedOut: false,
+      }),
+    );
+
+    const tempDir = await fs.mkdtemp(
+      path.join(resolvePreferredOpenClawTmpDir(), "openclaw-cli-explicit-images-"),
+    );
+    const sourceImage = path.join(tempDir, "ignored-prompt-image.png");
+    await fs.writeFile(sourceImage, Buffer.from(SMALL_PNG_BASE64, "base64"));
+
+    try {
+      await runCliAgent({
+        sessionId: "s1",
+        sessionFile: "/tmp/session.jsonl",
+        workspaceDir: tempDir,
+        prompt: `[media attached: ${sourceImage} (image/png)]\n\n<media:image>`,
+        images: [{ type: "image", data: SMALL_PNG_BASE64, mimeType: "image/png" }],
+        provider: "codex-cli",
+        model: "gpt-5.2-codex",
+        timeoutMs: 1_000,
+        runId: "run-explicit-image-precedence",
+      });
+    } finally {
+      await fs.rm(tempDir, { recursive: true, force: true });
+    }
+
+    const input = supervisorSpawnMock.mock.calls[0]?.[0] as { argv?: string[] };
+    const argv = input.argv ?? [];
+    expect(argv.filter((arg) => arg === "--image")).toHaveLength(1);
+  });
+
   it("fails with timeout when no-output watchdog trips", async () => {
     supervisorSpawnMock.mockResolvedValueOnce(
       createManagedRun({
diff --git a/src/agents/cli-runner.ts b/src/agents/cli-runner.ts
@@ -27,6 +27,7 @@ import {
   buildCliArgs,
   buildSystemPrompt,
   enqueueCliRun,
+  loadPromptRefImages,
   normalizeCliModel,
   parseCliJson,
   parseCliJsonl,
@@ -222,8 +223,12 @@ export async function runCliAgent(params: {
     let prompt = prependBootstrapPromptWarning(params.prompt, bootstrapPromptWarning.lines, {
       preserveExactPrompt: heartbeatPrompt,
     });
-    if (params.images && params.images.length > 0) {
-      const imagePayload = await writeCliImages(params.images);
+    const resolvedImages =
+      params.images && params.images.length > 0
+        ? params.images
+        : await loadPromptRefImages({ prompt, workspaceDir });
+    if (resolvedImages.length > 0) {
+      const imagePayload = await writeCliImages(resolvedImages);
       imagePaths = imagePayload.paths;
       cleanupImages = imagePayload.cleanup;
       if (!backend.imageArg) {
diff --git a/src/agents/cli-runner/helpers.ts b/src/agents/cli-runner/helpers.ts
@@ -8,15 +8,19 @@ import { KeyedAsyncQueue } from "openclaw/plugin-sdk/keyed-async-queue";
 import type { ThinkLevel } from "../../auto-reply/thinking.js";
 import type { OpenClawConfig } from "../../config/config.js";
 import type { CliBackendConfig } from "../../config/types.js";
+import { MAX_IMAGE_BYTES } from "../../media/constants.js";
 import { buildTtsSystemPromptHint } from "../../tts/tts.js";
 import { isRecord } from "../../utils.js";
 import { buildModelAliasLines } from "../model-alias-lines.js";
 import { resolveDefaultModelForAgent } from "../model-selection.js";
 import { resolveOwnerDisplaySetting } from "../owner-display.js";
 import type { EmbeddedContextFile } from "../pi-embedded-helpers.js";
+import { detectImageReferences, loadImageFromRef } from "../pi-embedded-runner/run/images.js";
+import type { SandboxFsBridge } from "../sandbox/fs-bridge.js";
 import { detectRuntimeShell } from "../shell-utils.js";
 import { buildSystemPromptParams } from "../system-prompt-params.js";
 import { buildAgentSystemPrompt } from "../system-prompt.js";
+import { sanitizeImageBlocks } from "../tool-images.js";
 export { buildCliSupervisorScopeKey, resolveCliNoOutputTimeoutMs } from "./reliability.js";
 
 const CLI_RUN_QUEUE = new KeyedAsyncQueue();
@@ -324,6 +328,43 @@ export function appendImagePathsToPrompt(prompt: string, paths: string[]): strin
   return `${trimmed}${separator}${paths.join("\n")}`;
 }
 
+export async function loadPromptRefImages(params: {
+  prompt: string;
+  workspaceDir: string;
+  maxBytes?: number;
+  workspaceOnly?: boolean;
+  sandbox?: { root: string; bridge: SandboxFsBridge };
+}): Promise<ImageContent[]> {
+  const refs = detectImageReferences(params.prompt);
+  if (refs.length === 0) {
+    return [];
+  }
+
+  const maxBytes = params.maxBytes ?? MAX_IMAGE_BYTES;
+  const seen = new Set<string>();
+  const images: ImageContent[] = [];
+  for (const ref of refs) {
+    const key = `${ref.type}:${ref.resolved}`;
+    if (seen.has(key)) {
+      continue;
+    }
+    seen.add(key);
+    const image = await loadImageFromRef(ref, params.workspaceDir, {
+      maxBytes,
+      workspaceOnly: params.workspaceOnly,
+      sandbox: params.sandbox,
+    });
+    if (image) {
+      images.push(image);
+    }
+  }
+
+  const { images: sanitizedImages } = await sanitizeImageBlocks(images, "prompt:images", {
+    maxBytes,
+  });
+  return sanitizedImages;
+}
+
 export async function writeCliImages(
   images: ImageContent[],
 ): Promise<{ paths: string[]; cleanup: () => Promise<void> }> {