fix(agents): honor OpenAI-compatible cache retention

steipete · lonexreb · web-flow · commit 3e351b718e28 · 2026-05-27T13:21:23.000+01:00
Carry over #82973 and fix #81281 by preserving explicit cacheRetention for OpenAI-compatible completions providers that opt into prompt-cache-key support. The change keeps explicit cacheRetention suppressed for OpenAI-compatible providers without compat.supportsPromptCacheKey, adds regression coverage for both paths, and updates prompt-caching docs for prompt_cache_key / prompt_cache_retention behavior. Fixes #81281. Supersedes #82973. Co-authored-by: lonexreb <reach2shubhankar@gmail.com>
diff --git a/docs/reference/prompt-caching.md b/docs/reference/prompt-caching.md
@@ -104,8 +104,8 @@ Per-agent heartbeat is supported at `agents.list[].heartbeat`.
 ### OpenAI (direct API)
 
 - Prompt caching is automatic on supported recent models. OpenClaw does not need to inject block-level cache markers.
-- OpenClaw uses `prompt_cache_key` to keep cache routing stable across turns and uses `prompt_cache_retention: "24h"` only when `cacheRetention: "long"` is selected on direct OpenAI hosts.
-- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`; `cacheRetention: "none"` still suppresses it.
+- OpenClaw uses `prompt_cache_key` to keep cache routing stable across turns. Direct OpenAI hosts use `prompt_cache_retention: "24h"` when `cacheRetention: "long"` is selected.
+- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`; with that same opt-in, explicit `cacheRetention: "long"` also forwards `prompt_cache_retention: "24h"`, and `cacheRetention: "none"` suppresses both fields.
 - OpenAI responses expose cached prompt tokens via `usage.prompt_tokens_details.cached_tokens` (or `input_tokens_details.cached_tokens` on Responses API events). OpenClaw maps that to `cacheRead`.
 - OpenAI does not expose a separate cache-write token counter, so `cacheWrite` stays `0` on OpenAI paths even when the provider is warming a cache.
 - OpenAI returns useful tracing and rate-limit headers such as `x-request-id`, `openai-processing-ms`, and `x-ratelimit-*`, but cache-hit accounting should come from the usage payload, not from headers.
diff --git a/src/agents/openai-transport-stream.test.ts b/src/agents/openai-transport-stream.test.ts
@@ -4637,6 +4637,67 @@ describe("openai transport stream", () => {
     expect(notOptedIn.prompt_cache_key).toBeUndefined();
   });
 
+  it("emits prompt_cache_retention=24h for completions when cacheRetention is long", () => {
+    const model = {
+      id: "custom-model",
+      name: "Custom Model",
+      api: "openai-completions",
+      provider: "custom-cpa",
+      baseUrl: "https://proxy.example.com/v1",
+      compat: { supportsPromptCacheKey: true },
+      reasoning: false,
+      input: ["text"],
+      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+      contextWindow: 32768,
+      maxTokens: 8192,
+    } as unknown as Model<"openai-completions">;
+    const context = {
+      systemPrompt: "system",
+      messages: [],
+      tools: [],
+    } as never;
+
+    const longRetention = buildOpenAICompletionsParams(model, context, {
+      sessionId: "session-123",
+      cacheRetention: "long",
+    }) as { prompt_cache_key?: string; prompt_cache_retention?: string };
+
+    expect(longRetention.prompt_cache_key).toBe("session-123");
+    expect(longRetention.prompt_cache_retention).toBe("24h");
+  });
+
+  it("omits prompt_cache_retention for completions when cacheRetention is short or unset", () => {
+    const model = {
+      id: "custom-model",
+      name: "Custom Model",
+      api: "openai-completions",
+      provider: "custom-cpa",
+      baseUrl: "https://proxy.example.com/v1",
+      compat: { supportsPromptCacheKey: true },
+      reasoning: false,
+      input: ["text"],
+      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+      contextWindow: 32768,
+      maxTokens: 8192,
+    } as unknown as Model<"openai-completions">;
+    const context = {
+      systemPrompt: "system",
+      messages: [],
+      tools: [],
+    } as never;
+
+    const shortRetention = buildOpenAICompletionsParams(model, context, {
+      sessionId: "session-123",
+      cacheRetention: "short",
+    });
+    const defaultRetention = buildOpenAICompletionsParams(model, context, {
+      sessionId: "session-123",
+    });
+
+    expect(shortRetention).not.toHaveProperty("prompt_cache_retention");
+    expect(defaultRetention).not.toHaveProperty("prompt_cache_retention");
+  });
+
   it("sorts Chat Completions tools by function name for stable prompt-cache payloads", () => {
     const model = {
       id: "custom-model",
diff --git a/src/agents/openai-transport-stream.ts b/src/agents/openai-transport-stream.ts
@@ -3499,6 +3499,15 @@ export function buildOpenAICompletionsParams(
   }
   if (compat.supportsPromptCacheKey && cacheRetention !== "none" && options?.sessionId) {
     params.prompt_cache_key = options.sessionId;
+    // When the caller explicitly opted into long retention, forward the
+    // canonical prompt_cache_retention value alongside the cache key so
+    // OpenAI-compatible completions backends (oMLX, llama.cpp, official
+    // OpenAI, etc.) can honor the 24h prefix-cache lifetime. Without this,
+    // the key reaches the wire but the retention preference is silently
+    // dropped (issue #81281).
+    if (cacheRetention === "long") {
+      params.prompt_cache_retention = "24h";
+    }
   }
   if (options?.temperature !== undefined) {
     params.temperature = options.temperature;
diff --git a/src/agents/pi-embedded-runner-extraparams.test.ts b/src/agents/pi-embedded-runner-extraparams.test.ts
@@ -2647,6 +2647,55 @@ describe("applyExtraParamsToAgent", () => {
     expect(calls[0]?.cacheRetention).toBe("long");
   });
 
+  it("passes through explicit cacheRetention for prompt-cache-key openai-completions providers", () => {
+    const { calls, agent } = createOptionsCaptureAgent();
+    const cfg = buildModelConfig("omlx-local/local_model", {
+      cacheRetention: "long",
+    });
+
+    applyExtraParamsToAgent(agent, cfg, "omlx-local", "local_model");
+
+    const model = {
+      api: "openai-completions",
+      provider: "omlx-local",
+      id: "local_model",
+      compat: { supportsPromptCacheKey: true },
+    } as unknown as Model<"openai-completions">;
+    const context: Context = { messages: [] };
+
+    void agent.streamFn?.(model, context, {
+      sessionId: "session-81281",
+    });
+
+    expect(calls).toHaveLength(1);
+    expect(calls[0]?.cacheRetention).toBe("long");
+    expect(calls[0]?.sessionId).toBe("session-81281");
+  });
+
+  it("keeps explicit cacheRetention off openai-completions providers without prompt-cache-key support", () => {
+    const { calls, agent } = createOptionsCaptureAgent();
+    const cfg = buildModelConfig("omlx-local/local_model", {
+      cacheRetention: "long",
+    });
+
+    applyExtraParamsToAgent(agent, cfg, "omlx-local", "local_model");
+
+    const model = {
+      api: "openai-completions",
+      provider: "omlx-local",
+      id: "local_model",
+    } as Model<"openai-completions">;
+    const context: Context = { messages: [] };
+
+    void agent.streamFn?.(model, context, {
+      sessionId: "session-81281",
+    });
+
+    expect(calls).toHaveLength(1);
+    expect(calls[0]?.cacheRetention).toBeUndefined();
+    expect(calls[0]?.sessionId).toBe("session-81281");
+  });
+
   it("passes through explicit cacheRetention for custom anthropic-messages providers", () => {
     const { calls, agent } = createOptionsCaptureAgent();
     const cfg = {
diff --git a/src/agents/pi-embedded-runner/extra-params.ts b/src/agents/pi-embedded-runner/extra-params.ts
@@ -494,11 +494,20 @@ function createStreamFnWithExtraParams(
     streamParams.seed = resolvedSeed;
   }
 
+  const readSupportsPromptCacheKey = (m: unknown): boolean => {
+    const compat = (m as { compat?: unknown })?.compat;
+    if (!compat || typeof compat !== "object") {
+      return false;
+    }
+    return (compat as Record<string, unknown>).supportsPromptCacheKey === true;
+  };
+
   const initialCacheRetention = resolveCacheRetention(
     extraParams,
     provider,
     typeof model?.api === "string" ? model.api : undefined,
     typeof model?.id === "string" ? model.id : undefined,
+    readSupportsPromptCacheKey(model),
   );
   if (Object.keys(streamParams).length > 0 || initialCacheRetention) {
     const debugParams = initialCacheRetention
@@ -514,6 +523,7 @@ function createStreamFnWithExtraParams(
       provider,
       typeof callModel.api === "string" ? callModel.api : undefined,
       typeof callModel.id === "string" ? callModel.id : undefined,
+      readSupportsPromptCacheKey(callModel),
     );
     const hasStreamParams = Object.keys(streamParams).length > 0 || cacheRetention;
     if (!hasStreamParams) {
diff --git a/src/agents/pi-embedded-runner/prompt-cache-retention.test.ts b/src/agents/pi-embedded-runner/prompt-cache-retention.test.ts
@@ -30,6 +30,100 @@ describe("prompt cache retention", () => {
     ).toBeUndefined();
   });
 
+  it("passes explicit cacheRetention through for openai-completions providers when supportsPromptCacheKey (issue #81281)", () => {
+    // Regression: openai-completions providers with prefix-caching backends
+    // (oMLX, llama.cpp, etc.) set compat.supportsPromptCacheKey: true and
+    // cacheRetention: "long", but the wrapper silently dropped the user's
+    // explicit cacheRetention because the provider is neither in the
+    // anthropic family nor google-eligible.
+    expect(
+      resolveCacheRetention(
+        { cacheRetention: "long" },
+        "omlx-local",
+        "openai-completions",
+        "local_model",
+        true,
+      ),
+    ).toBe("long");
+    expect(
+      resolveCacheRetention(
+        { cacheRetention: "short" },
+        "omlx-local",
+        "openai-completions",
+        "local_model",
+        true,
+      ),
+    ).toBe("short");
+    expect(
+      resolveCacheRetention(
+        { cacheRetention: "none" },
+        "omlx-local",
+        "openai-completions",
+        "local_model",
+        true,
+      ),
+    ).toBe("none");
+  });
+
+  it("does not honor explicit cacheRetention for openai-completions without supportsPromptCacheKey", () => {
+    // Providers that route via openai-completions but do not advertise prompt
+    // caching (e.g. amazon-bedrock proxying amazon.* nova models) must keep
+    // the explicit cacheRetention from leaking into the outgoing payload.
+    expect(
+      resolveCacheRetention(
+        { cacheRetention: "long" },
+        "amazon-bedrock",
+        "openai-completions",
+        "amazon.nova-micro-v1:0",
+      ),
+    ).toBeUndefined();
+    expect(
+      resolveCacheRetention(
+        { cacheRetention: "long" },
+        "omlx-local",
+        "openai-completions",
+        "local_model",
+        false,
+      ),
+    ).toBeUndefined();
+  });
+
+  it("returns undefined for openai-completions without explicit cacheRetention", () => {
+    // Without an explicit user choice, openai-completions providers fall back
+    // to the transport-level default ("short") rather than receiving a
+    // wrapper-injected value.
+    expect(
+      resolveCacheRetention(undefined, "omlx-local", "openai-completions", "local_model", true),
+    ).toBeUndefined();
+    expect(
+      resolveCacheRetention({}, "omlx-local", "openai-completions", "local_model", true),
+    ).toBeUndefined();
+  });
+
+  it("does not map legacy cacheControlTtl for openai-completions prompt-cache-key providers", () => {
+    // Legacy TTL aliases were Anthropic/Google semantics; OpenAI-compatible
+    // completions providers need an explicit cacheRetention value before the
+    // wrapper forwards retention to the transport.
+    expect(
+      resolveCacheRetention(
+        { cacheControlTtl: "1h" },
+        "omlx-local",
+        "openai-completions",
+        "local_model",
+        true,
+      ),
+    ).toBeUndefined();
+    expect(
+      resolveCacheRetention(
+        { cacheControlTtl: "5m" },
+        "omlx-local",
+        "openai-completions",
+        "local_model",
+        true,
+      ),
+    ).toBeUndefined();
+  });
+
   it("identifies supported direct Google cache families", () => {
     expect(
       isGooglePromptCacheEligible({
diff --git a/src/agents/pi-embedded-runner/prompt-cache-retention.ts b/src/agents/pi-embedded-runner/prompt-cache-retention.ts
@@ -19,6 +19,7 @@ export function resolveCacheRetention(
   provider: string,
   modelApi?: string,
   modelId?: string,
+  supportsPromptCacheKey?: boolean,
 ): CacheRetention | undefined {
   const hasExplicitCacheConfig =
     extraParams?.cacheRetention !== undefined || extraParams?.cacheControlTtl !== undefined;
@@ -29,8 +30,16 @@ export function resolveCacheRetention(
     hasExplicitCacheConfig,
   });
   const googleEligible = isGooglePromptCacheEligible({ modelApi, modelId });
+  // OpenAI-compatible completions backends (oMLX, llama.cpp, etc.) opt into
+  // prompt caching via `compat.supportsPromptCacheKey: true`. They match
+  // neither the anthropic family gate nor the google gate, so before this
+  // flag was consulted their explicit `cacheRetention` was dropped ahead of
+  // the transport layer (issue #81281). Proxies that route non-cacheable
+  // models via the same openai-completions wire (amazon-bedrock + amazon.*
+  // nova models) leave the flag unset, so the existing gate still applies.
+  const cacheKeyEligible = supportsPromptCacheKey === true;
 
-  if (!family && !googleEligible) {
+  if (!family && !googleEligible && !cacheKeyEligible) {
     return undefined;
   }
 
@@ -40,10 +49,10 @@ export function resolveCacheRetention(
   }
 
   const legacy = extraParams?.cacheControlTtl;
-  if (legacy === "5m") {
+  if (legacy === "5m" && (family || googleEligible)) {
     return "short";
   }
-  if (legacy === "1h") {
+  if (legacy === "1h" && (family || googleEligible)) {
     return "long";
   }