Skip to content

Commit 83c3212

Browse files
committed
fix(diagnostics): fence late tool/model starts on owner-less run teardown
Address review: tool/model start events are async-queued, so a start emitted before an owner-less reply-run teardown could drain after the eviction and re-arm an owner-less marker, restoring the blocked_tool_call leak. The owner-less eviction now records a start-event sequence cutoff for the session owner refs at the current event sequence (the same mechanism stuck-session recovery uses via recoveredOwnerStartEventCutoffs), so a late-draining start is ignored instead of recreating the marker. Adds an async-drain regression test.
1 parent 3be2275 commit 83c3212

2 files changed

Lines changed: 79 additions & 3 deletions

File tree

src/auto-reply/reply/reply-run-registry.test.ts

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
// Tests active reply run registry add, lookup, and cleanup behavior.
22
import { afterEach, describe, expect, it, vi } from "vitest";
33
import type { DiagnosticEventPayload } from "../../infra/diagnostic-events.js";
4-
import { onInternalDiagnosticEvent } from "../../infra/diagnostic-events.js";
4+
import {
5+
emitInternalDiagnosticEvent,
6+
onInternalDiagnosticEvent,
7+
waitForDiagnosticEventsDrained,
8+
} from "../../infra/diagnostic-events.js";
59
import {
610
getDiagnosticSessionActivitySnapshot,
711
markDiagnosticToolStartedForTest,
@@ -137,6 +141,31 @@ describe("reply run registry", () => {
137141
});
138142
});
139143

144+
it("ignores a tool start that drains after an owner-less reply completion", async () => {
145+
const sessionKey = "agent:main:slack:channel:chat-2";
146+
const sessionId = "session-1";
147+
const op = createReplyOperation({ sessionKey, sessionId, resetTriggered: false });
148+
149+
// A native tool start is emitted while the run is active but stays queued in
150+
// the async diagnostic pipeline (not yet drained into activity state).
151+
emitInternalDiagnosticEvent({
152+
type: "tool.execution.started",
153+
runId: "run-1",
154+
sessionId,
155+
sessionKey,
156+
toolName: "bash",
157+
toolCallId: "t1",
158+
});
159+
160+
// The reply run completes (owner-less teardown) before the start drains.
161+
op.complete();
162+
expect(getDiagnosticSessionActivitySnapshot({ sessionKey }).activeWorkKind).toBeUndefined();
163+
164+
// Draining the queued start must NOT re-arm an owner-less marker.
165+
await waitForDiagnosticEventsDrained();
166+
expect(getDiagnosticSessionActivitySnapshot({ sessionKey }).activeWorkKind).toBeUndefined();
167+
});
168+
140169
it("clears queued operations immediately on user abort", () => {
141170
const operation = createReplyOperation({
142171
sessionKey: "agent:main:main",

src/logging/diagnostic-run-activity.ts

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Diagnostic run activity helpers summarize run lifecycle activity for diagnostics.
22
import {
33
emitInternalDiagnosticEvent,
4+
getInternalDiagnosticEventSequence,
45
onInternalDiagnosticEvent,
56
type DiagnosticEventPayload,
67
type DiagnosticSessionActiveWorkKind,
@@ -331,12 +332,58 @@ export function markDiagnosticEmbeddedRunEnded(params: {
331332
// emitted completion would otherwise survive and re-block later turns on the
332333
// same sessionKey as blocked_tool_call. A still-active inner run keeps them.
333334
const clearAllActivity = params.clearRunActivity !== false;
334-
if (clearAllActivity || activity.activeEmbeddedRuns.size === 0) {
335-
clearActiveRunMarkers(activity, clearAllActivity ? undefined : "orphaned_no_owner");
335+
if (clearAllActivity) {
336+
clearActiveRunMarkers(activity, undefined);
337+
} else if (activity.activeEmbeddedRuns.size === 0) {
338+
evictOrphanedActivityMarkers(activity, params);
336339
}
337340
touchSessionActivity(activity, "embedded_run:ended");
338341
}
339342

343+
// Owner-less reply-run teardown: the embedded owner is gone, so any leftover
344+
// tool/model markers are orphaned and must be evicted. Tool/model start events
345+
// are async-queued, so a start emitted before this teardown can still drain
346+
// after it; without a sequence cutoff that late start would re-arm an owner-less
347+
// marker and restore the blocked_tool_call leak. Fence the session owner refs at
348+
// the current event sequence (mirrors stuck-session recovery) before clearing.
349+
function evictOrphanedActivityMarkers(
350+
activity: SessionActivity,
351+
params: { sessionId?: string; sessionKey?: string },
352+
): void {
353+
rememberRecoveredOwnerStartEventCutoffs(
354+
activity,
355+
collectOrphanOwnerRefs(activity, params),
356+
getInternalDiagnosticEventSequence(),
357+
);
358+
clearActiveRunMarkers(activity, "orphaned_no_owner");
359+
}
360+
361+
function collectOrphanOwnerRefs(
362+
activity: SessionActivity,
363+
params: { sessionId?: string },
364+
): Set<string> {
365+
const refs = new Set<string>();
366+
const add = (ref: string | undefined) => {
367+
const trimmed = ref?.trim();
368+
if (trimmed) {
369+
refs.add(trimmed);
370+
}
371+
};
372+
// The session id covers a late start for this run even if no marker has been
373+
// recorded yet; marker run/session ids cover starts that key off a run id.
374+
add(params.sessionId);
375+
add(activity.sessionId);
376+
for (const tool of activity.activeTools.values()) {
377+
add(tool.runId);
378+
add(tool.sessionId);
379+
}
380+
for (const modelCall of activity.activeModelCalls.values()) {
381+
add(modelCall.runId);
382+
add(modelCall.sessionId);
383+
}
384+
return refs;
385+
}
386+
340387
// Clears all tool/model markers for a session. When an evictReason is given the
341388
// markers are orphaned (no embedded owner can complete them); emit a structured
342389
// event so operators can tell recovered stale state from a real active tool.

0 commit comments

Comments
 (0)