Skip to content

Commit b1e5c9d

Browse files
authored
fix(agents): centralize terminal run outcome precedence (#88136)
* fix(agents): centralize terminal run outcome precedence * docs(agents): explain terminal outcome precedence * docs(agents): note terminal outcome helper * fix(agents): preserve pending hard timeout over late completion * test(agents): align global session scoping expectation * Revert "test(agents): align global session scoping expectation" This reverts commit 9b4a0c3. * test(infra): stabilize CONNECT timeout cap test * fix(agents): prioritize hard timeout terminal evidence * fix(gateway): preserve pending hard timeout snapshots
1 parent ba3eae5 commit b1e5c9d

9 files changed

Lines changed: 814 additions & 53 deletions

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ Skills own workflows; root owns hard policy and routing.
7272
- Plugin SDK exception: shipped external API gets new API first plus named compat/deprecation, small tests/docs if useful, removal plan.
7373
- Migrate internal/bundled callers to modern API in the same change. Do not let internal compat become permanent architecture.
7474
- Channels are implementation under `src/channels/**`; plugin authors get SDK seams. Providers own auth/catalog/runtime hooks; core owns generic loop.
75+
- Agent run terminal state: normalize/merge via `src/agents/agent-run-terminal-outcome.ts`; do not rederive timeout/cancel precedence in projections.
7576
- Hot paths should carry prepared facts forward: provider id, model ref, channel id, target, capability family, attachment class. Do not rediscover with broad plugin/provider/channel/capability loaders.
7677
- Do not fix repeated request-time discovery with scattered caches. Move the canonical fact earlier; reuse prepared runtime objects; delete duplicate lookup branches.
7778
- Gateway/plugin metadata is process-stable: installs, manifests, catalogs, generated paths, bundled metadata. Changes require restart or explicit owner reload/install/doctor flow.
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import { describe, expect, it } from "vitest";
2+
import {
3+
buildAgentRunTerminalOutcome,
4+
isHardAgentRunTimeoutPhase,
5+
mergeAgentRunTerminalOutcome,
6+
} from "./agent-run-terminal-outcome.js";
7+
8+
describe("agent run terminal outcome", () => {
9+
it("treats provider/preflight/post-turn timeout phases as hard run timeouts", () => {
10+
expect(isHardAgentRunTimeoutPhase("preflight")).toBe(true);
11+
expect(isHardAgentRunTimeoutPhase("provider")).toBe(true);
12+
expect(isHardAgentRunTimeoutPhase("post_turn")).toBe(true);
13+
expect(isHardAgentRunTimeoutPhase("queue")).toBe(false);
14+
expect(isHardAgentRunTimeoutPhase("gateway_draining")).toBe(false);
15+
});
16+
17+
it("keeps queue and gateway draining timeouts non-sticky", () => {
18+
expect(
19+
buildAgentRunTerminalOutcome({
20+
status: "timeout",
21+
}).reason,
22+
).toBe("timed_out");
23+
expect(
24+
buildAgentRunTerminalOutcome({
25+
status: "timeout",
26+
timeoutPhase: "queue",
27+
}).reason,
28+
).toBe("timed_out");
29+
expect(
30+
buildAgentRunTerminalOutcome({
31+
status: "timeout",
32+
timeoutPhase: "gateway_draining",
33+
}).reason,
34+
).toBe("timed_out");
35+
});
36+
37+
it("keeps explicit rpc and stop cancellations sticky even with queue attribution", () => {
38+
const rpcCancel = buildAgentRunTerminalOutcome({
39+
status: "timeout",
40+
stopReason: "rpc",
41+
timeoutPhase: "queue",
42+
providerStarted: false,
43+
endedAt: 100,
44+
});
45+
const lateCompletion = buildAgentRunTerminalOutcome({
46+
status: "ok",
47+
endedAt: 200,
48+
});
49+
50+
expect(rpcCancel.reason).toBe("cancelled");
51+
expect(rpcCancel.status).toBe("timeout");
52+
expect(mergeAgentRunTerminalOutcome(rpcCancel, lateCompletion)).toBe(rpcCancel);
53+
expect(
54+
buildAgentRunTerminalOutcome({
55+
status: "timeout",
56+
stopReason: "stop",
57+
timeoutPhase: "gateway_draining",
58+
}).reason,
59+
).toBe("cancelled");
60+
});
61+
62+
it("does not treat successful model stop metadata as cancellation", () => {
63+
expect(
64+
buildAgentRunTerminalOutcome({
65+
status: "ok",
66+
stopReason: "stop",
67+
}),
68+
).toEqual({
69+
reason: "completed",
70+
status: "ok",
71+
stopReason: "stop",
72+
});
73+
});
74+
75+
it("prefers hard timeout evidence over default rpc cancellation metadata", () => {
76+
const timeout = buildAgentRunTerminalOutcome({
77+
status: "timeout",
78+
stopReason: "rpc",
79+
timeoutPhase: "provider",
80+
providerStarted: true,
81+
endedAt: 200,
82+
});
83+
const earlierCompletion = buildAgentRunTerminalOutcome({
84+
status: "ok",
85+
endedAt: 190,
86+
});
87+
88+
expect(timeout.reason).toBe("hard_timeout");
89+
expect(timeout.status).toBe("timeout");
90+
expect(mergeAgentRunTerminalOutcome(timeout, earlierCompletion)).toBe(earlierCompletion);
91+
});
92+
93+
it("keeps a hard timeout over later aborts or failures for the same run", () => {
94+
const timeout = buildAgentRunTerminalOutcome({
95+
status: "timeout",
96+
timeoutPhase: "provider",
97+
endedAt: 200,
98+
});
99+
const lateAbort = buildAgentRunTerminalOutcome({
100+
status: "error",
101+
stopReason: "aborted",
102+
endedAt: 250,
103+
});
104+
const lateFailure = buildAgentRunTerminalOutcome({
105+
status: "error",
106+
error: "late rejection",
107+
endedAt: 260,
108+
});
109+
110+
expect(mergeAgentRunTerminalOutcome(timeout, lateAbort)).toBe(timeout);
111+
expect(mergeAgentRunTerminalOutcome(timeout, lateFailure)).toBe(timeout);
112+
});
113+
114+
it("lets an earlier proven completion correct a provisional timeout", () => {
115+
const timeout = buildAgentRunTerminalOutcome({
116+
status: "timeout",
117+
timeoutPhase: "provider",
118+
endedAt: 200,
119+
});
120+
const earlierCompletion = buildAgentRunTerminalOutcome({
121+
status: "ok",
122+
endedAt: 190,
123+
});
124+
125+
expect(mergeAgentRunTerminalOutcome(timeout, earlierCompletion)).toBe(earlierCompletion);
126+
});
127+
});
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import { formatBlockedLivenessError, isBlockedLivenessState } from "../shared/agent-liveness.js";
2+
import { AGENT_RUN_ABORTED_ERROR, isAbortedAgentStopReason } from "./run-termination.js";
3+
import {
4+
normalizeAgentRunTimeoutPhase,
5+
normalizeProviderStarted,
6+
type AgentRunTimeoutPhase,
7+
} from "./run-timeout-attribution.js";
8+
9+
export type AgentRunWaitStatus = "ok" | "error" | "timeout";
10+
11+
export type AgentRunTerminalReason =
12+
| "completed"
13+
| "hard_timeout"
14+
| "timed_out"
15+
| "cancelled"
16+
| "aborted"
17+
| "blocked"
18+
| "failed";
19+
20+
export type AgentRunTerminalOutcome = {
21+
reason: AgentRunTerminalReason;
22+
status: AgentRunWaitStatus;
23+
error?: string;
24+
stopReason?: string;
25+
livenessState?: string;
26+
timeoutPhase?: AgentRunTimeoutPhase;
27+
providerStarted?: boolean;
28+
startedAt?: number;
29+
endedAt?: number;
30+
};
31+
32+
export type AgentRunTerminalInput = {
33+
status: AgentRunWaitStatus;
34+
error?: unknown;
35+
stopReason?: unknown;
36+
livenessState?: unknown;
37+
timeoutPhase?: unknown;
38+
providerStarted?: unknown;
39+
startedAt?: unknown;
40+
endedAt?: unknown;
41+
};
42+
43+
const HARD_TIMEOUT_PHASES = new Set<AgentRunTimeoutPhase>(["preflight", "provider", "post_turn"]);
44+
45+
function asFiniteTimestamp(value: unknown): number | undefined {
46+
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
47+
}
48+
49+
function asNonEmptyString(value: unknown): string | undefined {
50+
return typeof value === "string" && value.trim() ? value : undefined;
51+
}
52+
53+
export function isHardAgentRunTimeoutPhase(value: unknown): value is AgentRunTimeoutPhase {
54+
const phase = normalizeAgentRunTimeoutPhase(value);
55+
return phase !== undefined && HARD_TIMEOUT_PHASES.has(phase);
56+
}
57+
58+
export function isHardAgentRunTimeoutOutcome(
59+
outcome: AgentRunTerminalOutcome | undefined | null,
60+
): boolean {
61+
return outcome?.reason === "hard_timeout";
62+
}
63+
64+
export function isStickyAgentRunTerminalOutcome(
65+
outcome: AgentRunTerminalOutcome | undefined | null,
66+
): boolean {
67+
return outcome?.reason === "hard_timeout" || outcome?.reason === "cancelled";
68+
}
69+
70+
function isCancellationStopReason(value: string | undefined): boolean {
71+
return value === "rpc" || value === "stop";
72+
}
73+
74+
export function buildAgentRunTerminalOutcome(
75+
input: AgentRunTerminalInput,
76+
): AgentRunTerminalOutcome {
77+
const stopReason = asNonEmptyString(input.stopReason);
78+
const livenessState = asNonEmptyString(input.livenessState);
79+
const timeoutPhase = normalizeAgentRunTimeoutPhase(input.timeoutPhase);
80+
const providerStarted = normalizeProviderStarted(input.providerStarted);
81+
const rawError = asNonEmptyString(input.error);
82+
// Queue and gateway-draining timeouts are wait-layer uncertainty. Only
83+
// provider-started or provider-phase timeouts are sticky child-run facts.
84+
const hardTimeout =
85+
input.status === "timeout" &&
86+
(isHardAgentRunTimeoutPhase(timeoutPhase) || providerStarted === true);
87+
const aborted = isAbortedAgentStopReason(stopReason);
88+
// ACP/model `stop` can be a normal successful finish. Treat rpc/stop as
89+
// cancellation only for non-success terminal payloads from abort paths.
90+
const cancelled = input.status !== "ok" && isCancellationStopReason(stopReason);
91+
const blocked = isBlockedLivenessState(livenessState);
92+
const error = blocked
93+
? formatBlockedLivenessError(rawError)
94+
: aborted && !rawError
95+
? AGENT_RUN_ABORTED_ERROR
96+
: rawError;
97+
const reason: AgentRunTerminalReason = blocked
98+
? "blocked"
99+
: hardTimeout
100+
? "hard_timeout"
101+
: aborted
102+
? "aborted"
103+
: cancelled
104+
? "cancelled"
105+
: input.status === "timeout"
106+
? "timed_out"
107+
: input.status === "error"
108+
? "failed"
109+
: "completed";
110+
return {
111+
reason,
112+
status:
113+
reason === "completed"
114+
? "ok"
115+
: input.status === "timeout" &&
116+
(reason === "hard_timeout" || reason === "timed_out" || reason === "cancelled")
117+
? "timeout"
118+
: "error",
119+
...(error ? { error } : {}),
120+
...(stopReason ? { stopReason } : {}),
121+
...(livenessState ? { livenessState } : {}),
122+
...(timeoutPhase ? { timeoutPhase } : {}),
123+
...(providerStarted !== undefined ? { providerStarted } : {}),
124+
...(asFiniteTimestamp(input.startedAt) !== undefined
125+
? { startedAt: asFiniteTimestamp(input.startedAt) }
126+
: {}),
127+
...(asFiniteTimestamp(input.endedAt) !== undefined
128+
? { endedAt: asFiniteTimestamp(input.endedAt) }
129+
: {}),
130+
};
131+
}
132+
133+
function completedBeforeOrAtTimeout(params: {
134+
completed: AgentRunTerminalOutcome;
135+
timeout: AgentRunTerminalOutcome;
136+
}): boolean {
137+
return (
138+
params.completed.reason === "completed" &&
139+
typeof params.completed.endedAt === "number" &&
140+
typeof params.timeout.endedAt === "number" &&
141+
params.completed.endedAt <= params.timeout.endedAt
142+
);
143+
}
144+
145+
export function mergeAgentRunTerminalOutcome(
146+
current: AgentRunTerminalOutcome | undefined,
147+
incoming: AgentRunTerminalOutcome,
148+
): AgentRunTerminalOutcome {
149+
if (!current) {
150+
return incoming;
151+
}
152+
if (current.reason === "cancelled") {
153+
return current;
154+
}
155+
// A hard timeout owns the run unless later evidence proves completion ended
156+
// before that timeout; late abort/error cleanup must not downgrade it.
157+
if (isHardAgentRunTimeoutOutcome(current)) {
158+
return completedBeforeOrAtTimeout({ completed: incoming, timeout: current })
159+
? incoming
160+
: current;
161+
}
162+
if (incoming.reason === "cancelled") {
163+
return incoming;
164+
}
165+
if (isHardAgentRunTimeoutOutcome(incoming)) {
166+
return completedBeforeOrAtTimeout({ completed: current, timeout: incoming })
167+
? current
168+
: incoming;
169+
}
170+
return incoming;
171+
}

src/agents/run-wait.ts

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { callGateway } from "../gateway/call.js";
22
import { formatErrorMessage } from "../infra/errors.js";
33
import { normalizeBlockedLivenessWaitStatus } from "../shared/agent-liveness.js";
44
import { parseFiniteNumber } from "../shared/number-coercion.js";
5-
import { AGENT_RUN_ABORTED_ERROR, isAbortedAgentStopReason } from "./run-termination.js";
5+
import { buildAgentRunTerminalOutcome } from "./agent-run-terminal-outcome.js";
66
import {
77
normalizeAgentRunTimeoutPhase,
88
normalizeProviderStarted,
@@ -66,13 +66,23 @@ function normalizeAgentWaitResult(
6666
wait?: RawAgentWaitResponse,
6767
): AgentWaitResult {
6868
const stopReason = typeof wait?.stopReason === "string" ? wait.stopReason : undefined;
69-
const abortedStopReason = isAbortedAgentStopReason(stopReason);
70-
const error =
71-
abortedStopReason && typeof wait?.error !== "string" ? AGENT_RUN_ABORTED_ERROR : wait?.error;
69+
const terminalOutcome =
70+
status === "pending"
71+
? undefined
72+
: buildAgentRunTerminalOutcome({
73+
status,
74+
error: wait?.error,
75+
stopReason,
76+
livenessState: wait?.livenessState,
77+
timeoutPhase: wait?.timeoutPhase,
78+
providerStarted: wait?.providerStarted,
79+
startedAt: wait?.startedAt,
80+
endedAt: wait?.endedAt,
81+
});
7282
const normalized = normalizeBlockedLivenessWaitStatus({
73-
status: abortedStopReason ? "error" : status,
83+
status: terminalOutcome?.status ?? status,
7484
livenessState: wait?.livenessState,
75-
error,
85+
error: terminalOutcome?.error,
7686
});
7787
return {
7888
status: normalized.status,

0 commit comments

Comments
 (0)