fix: preserve runtime token budget in deferred context-engine maintenance (#66820)

jalehman · web-flow · commit 75e7fc97f804 · 2026-04-14T15:30:37.000-07:00
* fix(context-engine): pass deferred maintenance token budget

Thread tokenBudget through the after-turn runtime context so background context-engine maintenance reuses the real model context window instead of falling back to 128k. Also pass through a best-effort currentTokenCount from the latest call total and make the runtime context type explicit about both fields.

Regeneration-Prompt: |
  OpenClaw already passed the real context token budget into direct context-engine calls like afterTurn and assemble, but deferred maintain() reused only the runtimeContext object and that object did not carry tokenBudget. Lossless Claw therefore fell back to 128k during background maintenance, which made the budget trigger fire much more aggressively than the live model context warranted. Thread the real contextTokenBudget into buildAfterTurnRuntimeContext so deferred maintenance receives the same budget, and pass a straightforward best-effort currentTokenCount from the latest call total while the relevant data is already in scope. Keep the change additive, update the runtime-context type, and cover the background maintenance/runtime-context behavior with focused tests.

* fix(context-engine): use prompt usage for deferred maintenance
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai
 - Agents/fallback: preserve the original prompt body on model fallback retries with session history so the retrying model keeps the active task instead of only seeing a generic continue message. (#66029) Thanks @WuKongAI-CMU.
 - Reply/secrets: resolve active reply channel/account SecretRefs before reply-run message-action discovery so channel token SecretRefs (for example Discord) do not degrade into discovery-time unresolved-secret failures. (#66796) Thanks @joshavant.
 - Agents/Anthropic: ignore non-positive Anthropic Messages token overrides and fail locally when no positive token budget remains, so invalid `max_tokens` values no longer reach the provider API. (#66664) thanks @jalehman
+- Agents/context engines: thread the runtime context token budget through the after-turn runtime context and report prompt-only token counts, not full request totals, so deferred maintenance uses the real model context window and background compaction bookkeeping matches the active prompt window. (#66820) thanks @jalehman.
 
 ## 2026.4.14
 
diff --git a/src/agents/pi-embedded-runner/context-engine-maintenance.test.ts b/src/agents/pi-embedded-runner/context-engine-maintenance.test.ts
@@ -424,7 +424,11 @@ describe("runContextEngineMaintenance", () => {
           sessionKey,
           sessionFile: "/tmp/session.jsonl",
           reason: "turn",
-          runtimeContext: { workspaceDir: "/tmp/workspace" },
+          runtimeContext: {
+            workspaceDir: "/tmp/workspace",
+            tokenBudget: 2048,
+            currentTokenCount: 1536,
+          },
         });
 
         expect(result).toBeUndefined();
@@ -453,6 +457,8 @@ describe("runContextEngineMaintenance", () => {
           runtimeContext: expect.objectContaining({
             workspaceDir: "/tmp/workspace",
             allowDeferredCompactionExecution: true,
+            tokenBudget: 2048,
+            currentTokenCount: 1536,
           }),
         });
 
diff --git a/src/agents/pi-embedded-runner/run/attempt.prompt-helpers.ts b/src/agents/pi-embedded-runner/run/attempt.prompt-helpers.ts
@@ -226,6 +226,8 @@ export function buildAfterTurnRuntimeContext(params: {
   >;
   workspaceDir: string;
   agentDir: string;
+  tokenBudget?: number;
+  currentTokenCount?: number;
   promptCache?: ContextEnginePromptCacheInfo;
 }): ContextEngineRuntimeContext {
   return {
@@ -252,6 +254,16 @@ export function buildAfterTurnRuntimeContext(params: {
       extraSystemPrompt: params.attempt.extraSystemPrompt,
       ownerNumbers: params.attempt.ownerNumbers,
     }),
+    ...(typeof params.tokenBudget === "number" &&
+      Number.isFinite(params.tokenBudget) &&
+      params.tokenBudget > 0
+      ? { tokenBudget: Math.floor(params.tokenBudget) }
+      : {}),
+    ...(typeof params.currentTokenCount === "number" &&
+      Number.isFinite(params.currentTokenCount) &&
+      params.currentTokenCount > 0
+      ? { currentTokenCount: Math.floor(params.currentTokenCount) }
+      : {}),
     ...(params.promptCache ? { promptCache: params.promptCache } : {}),
   };
 }
diff --git a/src/agents/pi-embedded-runner/run/attempt.spawn-workspace.context-engine.test.ts b/src/agents/pi-embedded-runner/run/attempt.spawn-workspace.context-engine.test.ts
@@ -14,6 +14,8 @@ import {
   runAttemptContextEngineBootstrap,
 } from "./attempt.context-engine-helpers.js";
 import {
+  cleanupTempPaths,
+  createContextEngineAttemptRunner,
   createContextEngineBootstrapAndAssemble,
   expectCalledWithSessionKey,
   getHoisted,
@@ -109,13 +111,15 @@ async function finalizeTurn(
 
 describe("runEmbeddedAttempt context engine sessionKey forwarding", () => {
   const sessionKey = "agent:main:discord:channel:test-ctx-engine";
+  const tempPaths: string[] = [];
   beforeEach(() => {
     resetEmbeddedAttemptHarness();
     clearMemoryPluginState();
     hoisted.runContextEngineMaintenanceMock.mockReset().mockResolvedValue(undefined);
   });
 
   afterEach(async () => {
+    await cleanupTempPaths(tempPaths);
     clearMemoryPluginState();
     vi.restoreAllMocks();
   });
@@ -395,6 +399,59 @@ describe("runEmbeddedAttempt context engine sessionKey forwarding", () => {
     );
   });
 
+  it("derives deferred maintenance currentTokenCount from prompt-only usage", async () => {
+    const afterTurn = vi.fn(
+      async (_params: {
+        runtimeContext?: {
+          currentTokenCount?: number;
+          promptCache?: { lastCallUsage?: { total?: number } };
+        };
+      }) => {},
+    );
+
+    await createContextEngineAttemptRunner({
+      sessionKey,
+      tempPaths,
+      contextEngine: {
+        assemble: async ({ messages }) => ({
+          messages,
+          estimatedTokens: 1,
+        }),
+        afterTurn,
+      },
+      sessionPrompt: async (session) => {
+        session.messages = [
+          ...session.messages,
+          {
+            role: "assistant",
+            content: "done",
+            timestamp: 2,
+            usage: {
+              input: 10,
+              output: 5,
+              cacheRead: 40,
+              cacheWrite: 2,
+              total: 57,
+            },
+          } as unknown as AgentMessage,
+        ];
+      },
+    });
+
+    expect(afterTurn).toHaveBeenCalledWith(
+      expect.objectContaining({
+        runtimeContext: expect.objectContaining({
+          currentTokenCount: 52,
+          promptCache: expect.objectContaining({
+            lastCallUsage: expect.objectContaining({
+              total: 57,
+            }),
+          }),
+        }),
+      }),
+    );
+  });
+
   it("skips maintenance when ingestBatch fails", async () => {
     const { bootstrap, assemble } = createContextEngineBootstrapAndAssemble();
     const ingestBatch = vi.fn(async () => {
diff --git a/src/agents/pi-embedded-runner/run/attempt.test.ts b/src/agents/pi-embedded-runner/run/attempt.test.ts
@@ -2836,6 +2836,8 @@ describe("buildAfterTurnRuntimeContext", () => {
       },
       workspaceDir: "/tmp/workspace",
       agentDir: "/tmp/agent",
+      tokenBudget: 1050000,
+      currentTokenCount: 232393,
     });
 
     expect(legacy).toMatchObject({
@@ -2844,6 +2846,8 @@ describe("buildAfterTurnRuntimeContext", () => {
       model: "gpt-5.4",
       workspaceDir: "/tmp/workspace",
       agentDir: "/tmp/agent",
+      tokenBudget: 1050000,
+      currentTokenCount: 232393,
     });
   });
 
diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts
@@ -116,7 +116,7 @@ import {
   resolveTranscriptPolicy,
   shouldAllowProviderOwnedThinkingReplay,
 } from "../../transcript-policy.js";
-import { normalizeUsage, type NormalizedUsage } from "../../usage.js";
+import { derivePromptTokens, normalizeUsage, type NormalizedUsage } from "../../usage.js";
 import { DEFAULT_BOOTSTRAP_FILENAME } from "../../workspace.js";
 import { isRunnerAbortError } from "../abort.js";
 import { isCacheTtlEligibleProvider, readLastCacheTtlTimestamp } from "../cache-ttl.js";
@@ -887,6 +887,7 @@ export async function runEmbeddedAttempt(
           attempt: params,
           workspaceDir: effectiveWorkspace,
           agentDir,
+          tokenBudget: params.contextTokenBudget,
         }),
         runMaintenance: async (contextParams) =>
           await runContextEngineMaintenance({
@@ -2201,10 +2202,13 @@ export async function runEmbeddedAttempt(
 
         // Let the active context engine run its post-turn lifecycle.
         if (params.contextEngine) {
+          const runtimeCurrentTokenCount = derivePromptTokens(lastCallUsage);
           const afterTurnRuntimeContext = buildAfterTurnRuntimeContext({
             attempt: params,
             workspaceDir: effectiveWorkspace,
             agentDir,
+            tokenBudget: params.contextTokenBudget,
+            currentTokenCount: runtimeCurrentTokenCount,
             promptCache,
           });
           await finalizeAttemptContextEngineTurn({
diff --git a/src/context-engine/types.ts b/src/context-engine/types.ts
@@ -140,6 +140,10 @@ export type ContextEngineRuntimeContext = Record<string, unknown> & {
    * consuming deferred compaction debt.
    */
   allowDeferredCompactionExecution?: boolean;
+  /** Runtime-resolved context window budget for the active model call. */
+  tokenBudget?: number;
+  /** Best-effort current prompt/context token estimate for this turn. */
+  currentTokenCount?: number;
   /** Optional prompt-cache telemetry for cache-aware engines. */
   promptCache?: ContextEnginePromptCacheInfo;
   /**