
Commit 557c5bf
test(live): soften OpenAI cache telemetry floor
1 parent 1d6de8d

3 files changed: 117 additions & 6 deletions

src/agents/live-cache-regression-baseline.ts (3 additions, 0 deletions)

```diff
@@ -64,18 +64,21 @@ export const LIVE_CACHE_REGRESSION_BASELINE = {
       observedHitRate: 0.891,
       minCacheRead: 4_096,
       minHitRate: 0.85,
+      warnOnly: true,
     },
     stable: {
       observedCacheRead: 4_864,
       observedHitRate: 0.966,
       minCacheRead: 4_608,
       minHitRate: 0.9,
+      warnOnly: true,
     },
     tool: {
       observedCacheRead: 4_608,
       observedHitRate: 0.896,
       minCacheRead: 4_096,
       minHitRate: 0.85,
+      warnOnly: true,
     },
   },
 } as const satisfies Record<string, Record<string, LiveCacheFloor>>;
```

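The `warnOnly: true` flag added to each OpenAI floor is what turns floor misses from blocking failures into advisories. As a hedged sketch (inferred from the baseline shape above and the updated test expectations below, not code from this commit), the runner presumably routes a miss along these lines:

```ts
// Hypothetical sketch, not from this commit: the field names mirror the
// baseline diff above, but routeFloorMiss itself is illustrative only.
type LiveCacheFloor = {
  observedCacheRead: number;
  observedHitRate: number;
  minCacheRead: number;
  minHitRate: number;
  warnOnly?: boolean;
};

function routeFloorMiss(
  floor: LiveCacheFloor,
  message: string,
  out: { regressions: string[]; warnings: string[] },
): void {
  // A warn-only floor downgrades a miss to an advisory warning;
  // a hard floor keeps blocking by recording a regression.
  (floor.warnOnly ? out.warnings : out.regressions).push(message);
}
```

Under this reading, the `openai:stable` miss strings in the test below land in `warnings` while `regressions` stays empty.
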
src/agents/live-cache-regression-runner.test.ts (62 additions, 3 deletions)

```diff
@@ -28,7 +28,7 @@ describe("live cache regression runner", () => {
     ]);
   });
 
-  it("keeps hard cache floors blocking for required OpenAI lanes", () => {
+  it("keeps OpenAI text cache floor misses advisory", () => {
     const regressions: string[] = [];
     const warnings: string[] = [];
 
@@ -47,11 +47,11 @@ describe("live cache regression runner", () => {
       warnings,
     });
 
-    expect(regressions).toEqual([
+    expect(regressions).toEqual([]);
+    expect(warnings).toEqual([
       "openai:stable cacheRead=0 < min=4608",
       "openai:stable hitRate=0.000 < min=0.900",
     ]);
-    expect(warnings).toEqual([]);
   });
 
   it("retries hard cache baseline misses once", () => {
@@ -122,6 +122,65 @@
     ).toBe(false);
   });
 
+  it("keeps OpenAI cache probes above the reasoning output floor", () => {
+    expect(
+      __testing.resolveCacheProbeMaxTokens({
+        maxTokens: 32,
+        providerTag: "openai",
+      }),
+    ).toBe(256);
+    expect(
+      __testing.resolveCacheProbeMaxTokens({
+        maxTokens: 512,
+        providerTag: "openai",
+      }),
+    ).toBe(512);
+    expect(
+      __testing.resolveCacheProbeMaxTokens({
+        maxTokens: 32,
+        providerTag: "anthropic",
+      }),
+    ).toBe(32);
+  });
+
+  it("accepts empty OpenAI cache probe text only when usage is observable", () => {
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "openai",
+        text: "",
+        usage: { input: 5_000 },
+      }),
+    ).toBe(true);
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "openai",
+        text: "",
+        usage: { cacheRead: 4_608 },
+      }),
+    ).toBe(true);
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "openai",
+        text: "wrong",
+        usage: { input: 5_000 },
+      }),
+    ).toBe(false);
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "anthropic",
+        text: "",
+        usage: { input: 5_000 },
+      }),
+    ).toBe(false);
+    expect(
+      __testing.shouldAcceptEmptyOpenAICacheProbe({
+        providerTag: "openai",
+        text: "",
+        usage: {},
+      }),
+    ).toBe(false);
+  });
+
   it("accepts a warmup that already hits the provider cache", () => {
     const findings = __testing.evaluateAgainstBaseline({
       lane: "image",
```

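The partial `usage` objects these tests pass (`{ input: 5_000 }`, `{ cacheRead: 4_608 }`, `{}`) suggest that `CacheUsage` is a record of optional counters. Its real definition sits outside this diff, so the following shape is an assumption:

```ts
// Assumed shape of CacheUsage, inferred from the counters the runner
// reads below (input, cacheRead, cacheWrite); not taken from this diff.
type CacheUsage = {
  input?: number;
  cacheRead?: number;
  cacheWrite?: number;
};
```
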
src/agents/live-cache-regression-runner.ts (52 additions, 3 deletions)

```diff
@@ -22,6 +22,7 @@ const ANTHROPIC_TIMEOUT_MS = 120_000;
 const LIVE_CACHE_LANE_RETRIES = 1;
 const LIVE_CACHE_RESPONSE_RETRIES = 2;
 const OPENAI_CACHE_REASONING = "low" as unknown as never;
+const OPENAI_CACHE_MIN_MAX_TOKENS = 256;
 const OPENAI_PREFIX = buildStableCachePrefix("openai");
 const OPENAI_MCP_PREFIX = buildStableCachePrefix("openai-mcp-style");
 const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
@@ -153,6 +154,32 @@ function shouldRetryCacheProbeText(params: {
   );
 }
 
+function resolveCacheProbeMaxTokens(params: {
+  maxTokens: number | undefined;
+  providerTag: "anthropic" | "openai";
+}): number {
+  const requested = params.maxTokens ?? 64;
+  if (params.providerTag !== "openai") {
+    return requested;
+  }
+  return Math.max(requested, OPENAI_CACHE_MIN_MAX_TOKENS);
+}
+
+function shouldAcceptEmptyOpenAICacheProbe(params: {
+  providerTag: "anthropic" | "openai";
+  text: string;
+  usage: CacheUsage;
+}): boolean {
+  if (params.providerTag !== "openai" || params.text.trim().length > 0) {
+    return false;
+  }
+  return (
+    (params.usage.input ?? 0) > 0 ||
+    (params.usage.cacheRead ?? 0) > 0 ||
+    (params.usage.cacheWrite ?? 0) > 0
+  );
+}
+
 async function runToolOnlyTurn(params: {
   apiKey: string;
   cacheRetention: "none" | "short" | "long";
@@ -242,14 +269,35 @@ async function completeCacheProbe(params: {
       apiKey: params.apiKey,
       cacheRetention: params.cacheRetention,
       sessionId: params.sessionId,
-      maxTokens: params.maxTokens ?? 64,
+      maxTokens: resolveCacheProbeMaxTokens({
+        maxTokens: params.maxTokens,
+        providerTag: params.providerTag,
+      }),
       temperature: 0,
       ...(params.providerTag === "openai" ? { reasoning: OPENAI_CACHE_REASONING } : {}),
     },
     `${params.providerTag} cache lane ${params.suffix}`,
     timeoutMs,
   );
   const text = extractAssistantText(response);
+  const usage = normalizeCacheUsage(response.usage);
+  if (
+    shouldAcceptEmptyOpenAICacheProbe({
+      providerTag: params.providerTag,
+      text,
+      usage,
+    })
+  ) {
+    logLiveCache(
+      `${params.providerTag} cache lane ${params.suffix} accepted empty text with usage ${formatUsage(usage)}`,
+    );
+    return {
+      suffix: params.suffix,
+      text,
+      usage,
+      hitRate: computeCacheHitRate(usage),
+    };
+  }
   if (shouldRetryCacheProbeText({ attempt, suffix: params.suffix, text })) {
     logLiveCache(
       `${params.providerTag} cache lane ${params.suffix} response mismatch; retrying: ${JSON.stringify(text)}`,
@@ -262,7 +310,6 @@ async function completeCacheProbe(params: {
   if (!responseTextLower.includes(markerLower)) {
     throw new CacheProbeTextMismatchError(params.suffix, text);
   }
-  const usage = normalizeCacheUsage(response.usage);
   return {
     suffix: params.suffix,
     text,
@@ -551,6 +598,8 @@ function appendBaselineFindings(target: BaselineFindings, source: BaselineFindin
 export const __testing = {
   assertAgainstBaseline,
   evaluateAgainstBaseline,
+  resolveCacheProbeMaxTokens,
+  shouldAcceptEmptyOpenAICacheProbe,
   shouldRetryCacheProbeText,
   shouldRetryBaselineFindings,
 };
@@ -562,7 +611,7 @@ export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResul
     provider: "openai",
     api: "openai-responses",
     envVar: "OPENCLAW_LIVE_OPENAI_CACHE_MODEL",
-    preferredModelIds: ["gpt-5.2", "gpt-5.4-mini", "gpt-5.4", "gpt-5.5"],
+    preferredModelIds: ["gpt-4.1", "gpt-5.2", "gpt-5.4-mini", "gpt-5.4", "gpt-5.5"],
   });
   const anthropic = await resolveLiveDirectModel({
     provider: "anthropic",
```

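Taken together, the two new guards cover the same failure mode: an OpenAI reasoning model spends output tokens on hidden reasoning before emitting visible text, so a small cap like `maxTokens: 32` can be exhausted with an empty reply even though the request, and its cache telemetry, completed normally. Raising the cap to 256 makes that rarer; when it still happens, the probe is kept as long as usage proves the call ran. A sketch with illustrative numbers (not real API output):

```ts
// Illustrative only: a probe whose visible text came back empty because
// reasoning consumed the output budget. All values here are made up.
const text = "";
const usage = { input: 5_000, cacheRead: 4_608, cacheWrite: 0 };

// Mirrors the shouldAcceptEmptyOpenAICacheProbe check above: any nonzero
// counter proves the request ran and its telemetry is usable.
const observable =
  (usage.input ?? 0) > 0 ||
  (usage.cacheRead ?? 0) > 0 ||
  (usage.cacheWrite ?? 0) > 0;

if (text.trim().length === 0 && observable) {
  // 4_608 of 5_000 input tokens served from cache: the stable prefix was
  // reused, which is all the cache floor comparison needs to see.
}
```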