fix: return real usage for OpenAI-compatible chat completions (#62986) (thanks @Lellansin)

Lellansin · obviyus · web-flow · commit 2ccd1839f212 · 2026-04-10T21:46:24.000+05:30
* Gateway: fix chat completions usage compatibility * Gateway: clarify usage-gated stream wait * Gateway: preserve aggregate usage totals * Agents: clamp usage components before total * fix(gateway): bound usage stream finalization * fix: add OpenAI compat usage changelog (#62986) (thanks @Lellansin) * fix(agents): emit lifecycle terminal events after flush --------- Co-authored-by: Ayaan Zaidi <hi@obviy.us>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -111,6 +111,7 @@ Docs: https://docs.openclaw.ai
 - Daemon/gateway: prevent systemd restart storms on configuration errors by exiting with `EX_CONFIG` and adding generated unit restart-prevention guards. (#63913) Thanks @neo1027144-creator.
 - Agents/exec: prevent gateway crash ("Agent listener invoked outside active run") when a subagent exec tool produces stdout/stderr after the agent run has ended or been aborted. (#62821) Thanks @openperf.
 - Browser/tabs: route `/tabs/action` close/select through the same browser endpoint reachability and policy checks as list/new (including Playwright-backed remote tab operations), reject CDP HTTP redirects on probe requests, and sanitize blocked-endpoint error responses so tab list/focus/close flows fail closed without echoing raw policy details back to callers. (#63332)
+- Gateway/OpenAI compat: return real `usage` for non-stream `/v1/chat/completions` responses, emit the final usage chunk when `stream_options.include_usage=true`, and bound usage-gated stream finalization after lifecycle end. (#62986) Thanks @Lellansin.
 
 ## 2026.4.9
 
diff --git a/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts b/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts
@@ -224,4 +224,63 @@ describe("handleAgentEnd", () => {
     resolveChannelFlush?.();
     await endPromise;
   });
+
+  it("emits lifecycle end after async channel flush completes", async () => {
+    let resolveChannelFlush: (() => void) | undefined;
+    const onAgentEvent = vi.fn();
+    const onBlockReplyFlush = vi.fn(
+      () =>
+        new Promise<void>((resolve) => {
+          resolveChannelFlush = resolve;
+        }),
+    );
+    const ctx = createContext(undefined, { onAgentEvent, onBlockReplyFlush });
+
+    const endPromise = handleAgentEnd(ctx);
+
+    expect(onAgentEvent).not.toHaveBeenCalled();
+
+    resolveChannelFlush?.();
+    await endPromise;
+
+    expect(onAgentEvent).toHaveBeenCalledWith({
+      stream: "lifecycle",
+      data: { phase: "end" },
+    });
+  });
+
+  it("emits lifecycle error after async channel flush completes", async () => {
+    let resolveChannelFlush: (() => void) | undefined;
+    const onAgentEvent = vi.fn();
+    const onBlockReplyFlush = vi.fn(
+      () =>
+        new Promise<void>((resolve) => {
+          resolveChannelFlush = resolve;
+        }),
+    );
+    const ctx = createContext(
+      {
+        role: "assistant",
+        stopReason: "error",
+        errorMessage: "connection refused",
+        content: [{ type: "text", text: "" }],
+      },
+      { onAgentEvent, onBlockReplyFlush },
+    );
+
+    const endPromise = handleAgentEnd(ctx);
+
+    expect(onAgentEvent).not.toHaveBeenCalled();
+
+    resolveChannelFlush?.();
+    await endPromise;
+
+    expect(onAgentEvent).toHaveBeenCalledWith({
+      stream: "lifecycle",
+      data: {
+        phase: "error",
+        error: "LLM request failed: connection refused by the provider endpoint.",
+      },
+    });
+  });
 });
diff --git a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts
@@ -38,6 +38,7 @@ export function handleAgentStart(ctx: EmbeddedPiSubscribeContext) {
 export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
   const lastAssistant = ctx.state.lastAssistant;
   const isError = isAssistantMessage(lastAssistant) && lastAssistant.stopReason === "error";
+  let lifecycleErrorText: string | undefined;
 
   if (isError && lastAssistant) {
     const friendlyError = formatAssistantErrorText(lastAssistant, {
@@ -54,6 +55,7 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
     const observedError = buildApiErrorObservationFields(rawError);
     const safeErrorText =
       buildTextObservationFields(errorText).textPreview ?? "LLM request failed.";
+    lifecycleErrorText = safeErrorText;
     const safeRunId = sanitizeForConsole(ctx.params.runId) ?? "-";
     const safeModel = sanitizeForConsole(lastAssistant.model) ?? "unknown";
     const safeProvider = sanitizeForConsole(lastAssistant.provider) ?? "unknown";
@@ -71,24 +73,30 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
       ...observedError,
       consoleMessage: `embedded run agent end: runId=${safeRunId} isError=true model=${safeModel} provider=${safeProvider} error=${safeErrorText}${rawErrorConsoleSuffix}`,
     });
-    emitAgentEvent({
-      runId: ctx.params.runId,
-      stream: "lifecycle",
-      data: {
-        phase: "error",
-        error: safeErrorText,
-        endedAt: Date.now(),
-      },
-    });
-    void ctx.params.onAgentEvent?.({
-      stream: "lifecycle",
-      data: {
-        phase: "error",
-        error: safeErrorText,
-      },
-    });
   } else {
     ctx.log.debug(`embedded run agent end: runId=${ctx.params.runId} isError=${isError}`);
+  }
+
+  const emitLifecycleTerminal = () => {
+    if (isError) {
+      emitAgentEvent({
+        runId: ctx.params.runId,
+        stream: "lifecycle",
+        data: {
+          phase: "error",
+          error: lifecycleErrorText ?? "LLM request failed.",
+          endedAt: Date.now(),
+        },
+      });
+      void ctx.params.onAgentEvent?.({
+        stream: "lifecycle",
+        data: {
+          phase: "error",
+          error: lifecycleErrorText ?? "LLM request failed.",
+        },
+      });
+      return;
+    }
     emitAgentEvent({
       runId: ctx.params.runId,
       stream: "lifecycle",
@@ -101,7 +109,7 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
       stream: "lifecycle",
       data: { phase: "end" },
     });
-  }
+  };
 
   const finalizeAgentEnd = () => {
     ctx.state.blockState.thinking = false;
@@ -140,11 +148,14 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
   const flushBlockReplyBufferResult = ctx.flushBlockReplyBuffer();
   finalizeAgentEnd();
   if (isPromiseLike<void>(flushBlockReplyBufferResult)) {
-    return flushBlockReplyBufferResult.then(() => flushPendingMediaAndChannel());
+    return flushBlockReplyBufferResult
+      .then(() => flushPendingMediaAndChannel())
+      .then(() => emitLifecycleTerminal());
   }
 
   const flushPendingMediaAndChannelResult = flushPendingMediaAndChannel();
   if (isPromiseLike<void>(flushPendingMediaAndChannelResult)) {
-    return flushPendingMediaAndChannelResult;
+    return flushPendingMediaAndChannelResult.then(() => emitLifecycleTerminal());
   }
+  emitLifecycleTerminal();
 }
diff --git a/src/agents/usage.test.ts b/src/agents/usage.test.ts
@@ -4,6 +4,7 @@ import {
   hasNonzeroUsage,
   derivePromptTokens,
   deriveSessionTotalTokens,
+  toOpenAiChatCompletionsUsage,
 } from "./usage.js";
 
 describe("normalizeUsage", () => {
@@ -146,6 +147,90 @@ describe("normalizeUsage", () => {
   });
 });
 
+describe("toOpenAiChatCompletionsUsage", () => {
+  it("uses max(component sum, aggregate total) when breakdown is partial", () => {
+    const usage = normalizeUsage({ output_tokens: 20, total_tokens: 100 });
+    expect(toOpenAiChatCompletionsUsage(usage)).toEqual({
+      prompt_tokens: 0,
+      completion_tokens: 20,
+      total_tokens: 100,
+    });
+  });
+
+  it("uses component sum when it exceeds aggregate total", () => {
+    expect(
+      toOpenAiChatCompletionsUsage({
+        input: 30,
+        output: 40,
+        total: 50,
+      }),
+    ).toEqual({
+      prompt_tokens: 30,
+      completion_tokens: 40,
+      total_tokens: 70,
+    });
+  });
+
+  it("uses aggregate total when only total is present", () => {
+    const usage = normalizeUsage({ total_tokens: 42 });
+    expect(toOpenAiChatCompletionsUsage(usage)).toEqual({
+      prompt_tokens: 0,
+      completion_tokens: 0,
+      total_tokens: 42,
+    });
+  });
+
+  it("returns zeros for undefined usage", () => {
+    expect(toOpenAiChatCompletionsUsage(undefined)).toEqual({
+      prompt_tokens: 0,
+      completion_tokens: 0,
+      total_tokens: 0,
+    });
+  });
+
+  it("raises total_tokens with aggregate when cache write is excluded from prompt sum", () => {
+    expect(
+      toOpenAiChatCompletionsUsage({
+        input: 10,
+        output: 5,
+        cacheWrite: 100,
+        total: 200,
+      }),
+    ).toEqual({
+      prompt_tokens: 10,
+      completion_tokens: 5,
+      total_tokens: 200,
+    });
+  });
+
+  it("clamps negative completion before deriving total_tokens", () => {
+    expect(
+      toOpenAiChatCompletionsUsage({
+        input: 3,
+        output: -5,
+      }),
+    ).toEqual({
+      prompt_tokens: 3,
+      completion_tokens: 0,
+      total_tokens: 3,
+    });
+  });
+
+  it("preserves aggregate total when components are partially negative", () => {
+    expect(
+      toOpenAiChatCompletionsUsage({
+        input: 3,
+        output: -5,
+        total: 7,
+      }),
+    ).toEqual({
+      prompt_tokens: 3,
+      completion_tokens: 0,
+      total_tokens: 7,
+    });
+  });
+});
+
 describe("hasNonzeroUsage", () => {
   it("returns true when cache read is nonzero", () => {
     const usage = { cacheRead: 100 };
diff --git a/src/agents/usage.ts b/src/agents/usage.ts
@@ -143,6 +143,41 @@ export function normalizeUsage(raw?: UsageLike | null): NormalizedUsage | undefi
   };
 }
 
+/**
+ * Maps normalized usage to OpenAI Chat Completions `usage` fields.
+ *
+ * `prompt_tokens` is input + cacheRead (cache write is excluded to match the
+ * OpenAI-style breakdown used by the compat endpoint).
+ *
+ * `total_tokens` is the greater of the component sum and aggregate `total` when
+ * present, so a partial breakdown cannot discard a valid upstream total.
+ */
+export function toOpenAiChatCompletionsUsage(usage: NormalizedUsage | undefined): {
+  prompt_tokens: number;
+  completion_tokens: number;
+  total_tokens: number;
+} {
+  const input = usage?.input ?? 0;
+  const output = usage?.output ?? 0;
+  const cacheRead = usage?.cacheRead ?? 0;
+  const promptTokens = Math.max(0, input + cacheRead);
+  const completionTokens = Math.max(0, output);
+  const componentTotal = promptTokens + completionTokens;
+  const aggregateRaw = usage?.total;
+  const aggregateTotal =
+    typeof aggregateRaw === "number" && Number.isFinite(aggregateRaw)
+      ? Math.max(0, aggregateRaw)
+      : undefined;
+  const totalTokens =
+    aggregateTotal !== undefined ? Math.max(componentTotal, aggregateTotal) : componentTotal;
+
+  return {
+    prompt_tokens: promptTokens,
+    completion_tokens: completionTokens,
+    total_tokens: totalTokens,
+  };
+}
+
 export function derivePromptTokens(usage?: {
   input?: number;
   cacheRead?: number;
diff --git a/src/gateway/openai-http.test.ts b/src/gateway/openai-http.test.ts
diff --git a/src/gateway/openai-http.ts b/src/gateway/openai-http.ts