fix(models): preserve provider prompt cache boundaries

steipete · web-flow · commit eef24d452fb5 · 2026-06-02T09:19:52.000-04:00
Split Anthropic system prompts at the cache boundary so only stable prefixes get cache_control, strip the internal marker when cache control is disabled, and keep OpenAI-compatible Anthropic cache-control routes from caching dynamic suffixes. Fixes #89386.
diff --git a/src/llm/providers/anthropic.test.ts b/src/llm/providers/anthropic.test.ts
@@ -1,4 +1,5 @@
 import { beforeEach, describe, expect, it, vi } from "vitest";
+import { SYSTEM_PROMPT_CACHE_BOUNDARY } from "../../agents/system-prompt-cache-boundary.js";
 import type { Context, Model } from "../types.js";
 
 const anthropicMockState = vi.hoisted(() => ({
@@ -216,4 +217,66 @@ describe("Anthropic provider", () => {
     expect(result.stopReason).toBe("error");
     expect((capturedPayload as { stop_sequences?: unknown }).stop_sequences).toEqual(["STOP"]);
   });
+
+  it("splits the system prompt cache boundary into cached and uncached Anthropic blocks", async () => {
+    let capturedPayload: unknown;
+    const stream = streamSimpleAnthropic(
+      makeAnthropicModel(),
+      {
+        systemPrompt: `Stable prefix${SYSTEM_PROMPT_CACHE_BOUNDARY}Dynamic suffix`,
+        messages: [{ role: "user", content: "hello", timestamp: 0 }],
+      },
+      {
+        apiKey: "sk-ant-provider",
+        onPayload: (payload) => {
+          capturedPayload = payload;
+          throw new Error("stop before network");
+        },
+      },
+    );
+
+    const result = await stream.result();
+
+    expect(result.stopReason).toBe("error");
+    expect((capturedPayload as { system?: unknown }).system).toEqual([
+      {
+        type: "text",
+        text: "Stable prefix",
+        cache_control: { type: "ephemeral" },
+      },
+      {
+        type: "text",
+        text: "Dynamic suffix",
+      },
+    ]);
+  });
+
+  it("strips the internal cache boundary when Anthropic cache control is disabled", async () => {
+    let capturedPayload: unknown;
+    const stream = streamSimpleAnthropic(
+      makeAnthropicModel(),
+      {
+        systemPrompt: `Stable prefix${SYSTEM_PROMPT_CACHE_BOUNDARY}Dynamic suffix`,
+        messages: [{ role: "user", content: "hello", timestamp: 0 }],
+      },
+      {
+        apiKey: "sk-ant-provider",
+        cacheRetention: "none",
+        onPayload: (payload) => {
+          capturedPayload = payload;
+          throw new Error("stop before network");
+        },
+      },
+    );
+
+    const result = await stream.result();
+
+    expect(result.stopReason).toBe("error");
+    expect((capturedPayload as { system?: unknown }).system).toEqual([
+      {
+        type: "text",
+        text: "Stable prefix\nDynamic suffix",
+      },
+    ]);
+  });
 });
diff --git a/src/llm/providers/anthropic.ts b/src/llm/providers/anthropic.ts
@@ -5,7 +5,12 @@ import type {
   MessageCreateParamsStreaming,
   MessageParam,
   RawMessageStreamEvent,
+  TextBlockParam,
 } from "@anthropic-ai/sdk/resources/messages.js";
+import {
+  splitSystemPromptCacheBoundary,
+  stripSystemPromptCacheBoundary,
+} from "../../agents/system-prompt-cache-boundary.js";
 import { getEnvApiKey } from "../env-api-keys.js";
 import { calculateCost, clampThinkingLevel } from "../model-utils.js";
 import type {
@@ -951,21 +956,10 @@ function buildParams(
       },
     ];
     if (context.systemPrompt) {
-      params.system.push({
-        type: "text",
-        text: sanitizeSurrogates(context.systemPrompt),
-        ...(cacheControl ? { cache_control: cacheControl } : {}),
-      });
+      params.system.push(...buildSystemPromptBlocks(context.systemPrompt, cacheControl));
     }
   } else if (context.systemPrompt) {
-    // Add cache control to system prompt for non-OAuth tokens
-    params.system = [
-      {
-        type: "text",
-        text: sanitizeSurrogates(context.systemPrompt),
-        ...(cacheControl ? { cache_control: cacheControl } : {}),
-      },
-    ];
+    params.system = buildSystemPromptBlocks(context.systemPrompt, cacheControl);
   }
 
   // Temperature is incompatible with extended thinking (adaptive or budget-based).
@@ -1220,6 +1214,41 @@ function convertMessages(
   return params;
 }
 
+/**
+ * Builds the Anthropic `system` text blocks for a system prompt.
+ *
+ * With no `cacheControl`, one plain block is returned after the prompt has passed through
+ * `stripSystemPromptCacheBoundary`. Otherwise `cache_control` goes on the whole prompt when
+ * `splitSystemPromptCacheBoundary` yields nothing, or on the stable prefix only when it does.
+ */
+function buildSystemPromptBlocks(
+  systemPrompt: string,
+  cacheControl: CacheControlEphemeral | undefined,
+): TextBlockParam[] {
+  // Caching is off: emit a single plain block built from the boundary-stripped prompt.
+  if (!cacheControl) {
+    const unmarked = stripSystemPromptCacheBoundary(systemPrompt);
+    return [{ type: "text", text: sanitizeSurrogates(unmarked) }];
+  }
+
+  const boundary = splitSystemPromptCacheBoundary(systemPrompt);
+  if (!boundary) {
+    const whole = sanitizeSurrogates(systemPrompt);
+    return [{ type: "text", text: whole, cache_control: cacheControl }];
+  }
+
+  const { stablePrefix, dynamicSuffix } = boundary;
+  const cachedPart: TextBlockParam[] = stablePrefix
+    ? [{ type: "text", text: sanitizeSurrogates(stablePrefix), cache_control: cacheControl }]
+    : [];
+  const uncachedPart: TextBlockParam[] = dynamicSuffix
+    ? [{ type: "text", text: sanitizeSurrogates(dynamicSuffix) }]
+    : [];
+  const blocks = [...cachedPart, ...uncachedPart];
+  // Neither side of the boundary had content: fall back to a single empty text block.
+  return blocks.length > 0 ? blocks : [{ type: "text", text: "" }];
+}
+
 function shouldUseFineGrainedToolStreamingBeta(
   model: Model<"anthropic-messages">,
   context: Context,
diff --git a/src/llm/providers/openai-completions.test.ts b/src/llm/providers/openai-completions.test.ts
@@ -1,5 +1,6 @@
 import type { ChatCompletionChunk } from "openai/resources/chat/completions.js";
 import { describe, expect, it, vi } from "vitest";
+import { SYSTEM_PROMPT_CACHE_BOUNDARY } from "../../agents/system-prompt-cache-boundary.js";
 import type { Context, Model } from "../types.js";
 
 type DeepPartial<T> = { [P in keyof T]?: DeepPartial<T[P]> };
@@ -224,6 +225,75 @@ describe("OpenAI-compatible completions params", () => {
     expect(capturedCacheKey).toBeUndefined();
     expect(capturedRetention).toBe("24h");
   });
+
+  it("strips the internal cache boundary from OpenAI-compatible system prompts", async () => {
+    let capturedMessages: unknown;
+    const stream = streamOpenAICompletions(
+      createModel(32_000),
+      {
+        systemPrompt: `Stable prefix${SYSTEM_PROMPT_CACHE_BOUNDARY}Dynamic suffix`,
+        messages: [{ role: "user", content: "hi", timestamp: 1 }],
+      },
+      {
+        apiKey: "sk-test",
+        onPayload(payload) {
+          capturedMessages = (payload as { messages?: unknown }).messages;
+          throw new Error("stop before network");
+        },
+      },
+    );
+
+    const result = await stream.result();
+
+    expect(result.stopReason).toBe("error");
+    const messages = capturedMessages as Array<{ role: string; content: unknown }>;
+    expect(messages[0]).toEqual({
+      role: "system",
+      content: "Stable prefix\nDynamic suffix",
+    });
+  });
+
+  it("splits the cache boundary before applying Anthropic cache control for OpenRouter Anthropic models", async () => {
+    let capturedMessages: unknown;
+    const stream = streamOpenAICompletions(
+      {
+        ...createModel(32_000),
+        id: "anthropic/claude-sonnet-4.6",
+        provider: "openrouter",
+        baseUrl: "https://openrouter.ai/api/v1",
+      },
+      {
+        systemPrompt: `Stable prefix${SYSTEM_PROMPT_CACHE_BOUNDARY}Dynamic suffix`,
+        messages: [{ role: "user", content: "hi", timestamp: 1 }],
+      },
+      {
+        apiKey: "sk-test",
+        onPayload(payload) {
+          capturedMessages = (payload as { messages?: unknown }).messages;
+          throw new Error("stop before network");
+        },
+      },
+    );
+
+    const result = await stream.result();
+
+    expect(result.stopReason).toBe("error");
+    const messages = capturedMessages as Array<{ role: string; content: unknown }>;
+    expect(messages[0]).toEqual({
+      role: "system",
+      content: [
+        {
+          type: "text",
+          text: "Stable prefix",
+          cache_control: { type: "ephemeral" },
+        },
+        {
+          type: "text",
+          text: "Dynamic suffix",
+        },
+      ],
+    });
+  });
 });
 
 describe("openai-completions stop-reason tool-call guard", () => {
diff --git a/src/llm/providers/openai-completions.ts b/src/llm/providers/openai-completions.ts
@@ -10,6 +10,10 @@ import type {
   ChatCompletionSystemMessageParam,
   ChatCompletionToolMessageParam,
 } from "openai/resources/chat/completions.js";
+import {
+  splitSystemPromptCacheBoundary,
+  stripSystemPromptCacheBoundary,
+} from "../../agents/system-prompt-cache-boundary.js";
 import { createReasoningTagTextPartitioner } from "../../shared/text/reasoning-tag-text-partitioner.js";
 import { getEnvApiKey } from "../env-api-keys.js";
 import { calculateCost, clampThinkingLevel } from "../model-utils.js";
@@ -584,8 +588,10 @@ function buildParams(
   compat: ResolvedOpenAICompletionsCompat = getCompat(model),
   cacheRetention: CacheRetention = resolveCacheRetention(options?.cacheRetention),
 ) {
-  const messages = convertMessages(model, context, compat);
   const cacheControl = getCompatCacheControl(compat, cacheRetention);
+  const messages = convertMessages(model, context, compat, {
+    preserveSystemPromptCacheBoundary: cacheControl !== undefined,
+  });
 
   type ChatCompletionRequestParams = Omit<
     OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming,
@@ -835,13 +841,7 @@ function addCacheControlToTextContent(
     if (content.length === 0) {
       return false;
     }
-    message.content = [
-      {
-        type: "text",
-        text: content,
-        cache_control: cacheControl,
-      },
-    ] as ChatCompletionTextPartWithCacheControl[];
+    message.content = buildCacheControlledTextParts(content, cacheControl);
     return true;
   }
 
@@ -852,19 +852,43 @@ function addCacheControlToTextContent(
   for (let i = content.length - 1; i >= 0; i--) {
     const part = content[i];
     if (part?.type === "text") {
-      const textPart = part as ChatCompletionTextPartWithCacheControl;
-      textPart.cache_control = cacheControl;
+      const text = (part as ChatCompletionTextPartWithCacheControl).text;
+      content.splice(i, 1, ...buildCacheControlledTextParts(text, cacheControl));
       return true;
     }
   }
 
   return false;
 }
 
+/**
+ * Expands one text string into text parts for an OpenAI-compatible cache-control route: text
+ * without a cache boundary is marked whole, otherwise only the stable prefix part is marked.
+ */
+function buildCacheControlledTextParts(
+  text: string,
+  cacheControl: OpenAICompatCacheControl,
+): ChatCompletionTextPartWithCacheControl[] {
+  const boundary = splitSystemPromptCacheBoundary(text);
+  if (!boundary) {
+    return [{ type: "text", text, cache_control: cacheControl }];
+  }
+  const { stablePrefix, dynamicSuffix } = boundary;
+  const cachedPart: ChatCompletionTextPartWithCacheControl[] = stablePrefix
+    ? [{ type: "text", text: stablePrefix, cache_control: cacheControl }]
+    : [];
+  const plainPart: ChatCompletionTextPartWithCacheControl[] = dynamicSuffix
+    ? [{ type: "text", text: dynamicSuffix }]
+    : [];
+  const parts = [...cachedPart, ...plainPart];
+  return parts.length > 0 ? parts : [{ type: "text", text: "" }];
+}
+
 export function convertMessages(
   model: Model<"openai-completions">,
   context: Context,
   compat: ResolvedOpenAICompletionsCompat,
+  options: { preserveSystemPromptCacheBoundary?: boolean } = {},
 ): ChatCompletionMessageParam[] {
   const params: ChatCompletionMessageParam[] = [];
 
@@ -892,7 +916,13 @@ export function convertMessages(
   if (context.systemPrompt) {
     const useDeveloperRole = model.reasoning && compat.supportsDeveloperRole;
     const role = useDeveloperRole ? "developer" : "system";
-    params.push({ role, content: sanitizeSurrogates(context.systemPrompt) });
+    const systemPrompt = options.preserveSystemPromptCacheBoundary
+      ? context.systemPrompt
+      : stripSystemPromptCacheBoundary(context.systemPrompt);
+    params.push({
+      role,
+      content: sanitizeSurrogates(systemPrompt),
+    });
   }
 
   let lastRole: string | null = null;