fix(diagnostics): fence late tool/model starts on owner-less run teardown

849261680 · 849261680 · commit 83c3212755ca · 2026-06-06T13:47:51.000+08:00
Address review: tool/model start events are async-queued, so a start emitted
before an owner-less reply-run teardown could drain after the eviction and
re-arm an owner-less marker, restoring the blocked_tool_call leak. The
owner-less eviction now records a start-event sequence cutoff for the session
owner refs at the current event sequence (the same mechanism stuck-session
recovery uses via recoveredOwnerStartEventCutoffs), so a late-draining start is
ignored instead of recreating the marker. Adds an async-drain regression test.
diff --git a/src/auto-reply/reply/reply-run-registry.test.ts b/src/auto-reply/reply/reply-run-registry.test.ts
@@ -1,7 +1,11 @@
 // Tests active reply run registry add, lookup, and cleanup behavior.
 import { afterEach, describe, expect, it, vi } from "vitest";
 import type { DiagnosticEventPayload } from "../../infra/diagnostic-events.js";
-import { onInternalDiagnosticEvent } from "../../infra/diagnostic-events.js";
+import {
+  emitInternalDiagnosticEvent,
+  onInternalDiagnosticEvent,
+  waitForDiagnosticEventsDrained,
+} from "../../infra/diagnostic-events.js";
 import {
   getDiagnosticSessionActivitySnapshot,
   markDiagnosticToolStartedForTest,
@@ -137,6 +141,31 @@ describe("reply run registry", () => {
     });
   });
 
+  it("ignores a tool start that drains after an owner-less reply completion", async () => {
+    const sessionKey = "agent:main:slack:channel:chat-2";
+    const sessionId = "session-1";
+    const op = createReplyOperation({ sessionKey, sessionId, resetTriggered: false });
+
+    // Arrange: emit a native tool start while the run is still active. The event
+    // sits in the async diagnostic queue and has not reached activity state yet.
+    emitInternalDiagnosticEvent({
+      type: "tool.execution.started",
+      runId: "run-1",
+      sessionId,
+      sessionKey,
+      toolName: "bash",
+      toolCallId: "t1",
+    });
+
+    // Act: the reply run completes (owner-less teardown) before the start drains.
+    op.complete();
+    expect(getDiagnosticSessionActivitySnapshot({ sessionKey }).activeWorkKind).toBeUndefined();
+
+    // Assert: draining the queued start must NOT re-arm an owner-less marker.
+    await waitForDiagnosticEventsDrained();
+    expect(getDiagnosticSessionActivitySnapshot({ sessionKey }).activeWorkKind).toBeUndefined();
+  });
+
   it("clears queued operations immediately on user abort", () => {
     const operation = createReplyOperation({
       sessionKey: "agent:main:main",
diff --git a/src/logging/diagnostic-run-activity.ts b/src/logging/diagnostic-run-activity.ts
@@ -1,6 +1,7 @@
 // Diagnostic run activity helpers summarize run lifecycle activity for diagnostics.
 import {
   emitInternalDiagnosticEvent,
+  getInternalDiagnosticEventSequence,
   onInternalDiagnosticEvent,
   type DiagnosticEventPayload,
   type DiagnosticSessionActiveWorkKind,
@@ -331,12 +332,58 @@ export function markDiagnosticEmbeddedRunEnded(params: {
   // emitted completion would otherwise survive and re-block later turns on the
   // same sessionKey as blocked_tool_call. A still-active inner run keeps them.
   const clearAllActivity = params.clearRunActivity !== false;
-  if (clearAllActivity || activity.activeEmbeddedRuns.size === 0) {
-    clearActiveRunMarkers(activity, clearAllActivity ? undefined : "orphaned_no_owner");
+  if (clearAllActivity) {
+    clearActiveRunMarkers(activity, undefined);
+  } else if (activity.activeEmbeddedRuns.size === 0) {
+    evictOrphanedActivityMarkers(activity, params);
   }
   touchSessionActivity(activity, "embedded_run:ended");
 }
 
+// Owner-less reply-run teardown: with no embedded owner left, leftover
+// tool/model markers are orphaned and get evicted as "orphaned_no_owner".
+// Start events are async-queued, so one emitted before this teardown can drain
+// after it and re-arm a marker, restoring the blocked_tool_call leak. Fence the
+// owner refs at the current event sequence (mirrors stuck-session recovery)
+// BEFORE clearing: the refs are collected from markers still on `activity`.
+function evictOrphanedActivityMarkers(
+  activity: SessionActivity,
+  params: { sessionId?: string; sessionKey?: string }, // sessionKey is not read here
+): void {
+  rememberRecoveredOwnerStartEventCutoffs(
+    activity,
+    collectOrphanOwnerRefs(activity, params), // session/run ids to fence
+    getInternalDiagnosticEventSequence(), // cutoff: event sequence at teardown time
+  );
+  clearActiveRunMarkers(activity, "orphaned_no_owner");
+}
+
+function collectOrphanOwnerRefs(
+  activity: SessionActivity,
+  params: { sessionId?: string },
+): Set<string> { // trimmed, non-empty, de-duplicated session/run ids
+  const refs = new Set<string>();
+  const add = (ref: string | undefined) => { // ignores undefined and blank refs
+    const trimmed = ref?.trim();
+    if (trimmed) {
+      refs.add(trimmed);
+    }
+  };
+  // The session ids fence a late start for this run even when no marker has
+  // been recorded yet; marker run/session ids cover starts keyed off a run id.
+  add(params.sessionId);
+  add(activity.sessionId);
+  for (const tool of activity.activeTools.values()) {
+    add(tool.runId);
+    add(tool.sessionId);
+  }
+  for (const modelCall of activity.activeModelCalls.values()) {
+    add(modelCall.runId);
+    add(modelCall.sessionId);
+  }
+  return refs;
+}
+
 // Clears all tool/model markers for a session. When an evictReason is given the
 // markers are orphaned (no embedded owner can complete them); emit a structured
 // event so operators can tell recovered stale state from a real active tool.