Skip to content

Commit f63c221

Browse files
committed
fix(agents): finalize killed subagent task rows via maintenance to avoid kill-vs-complete race
Kill path writes setDetachedTaskDeliveryStatusByRunId(not_applicable) immediately; task-row status finalization is deferred to maintenance via a new optional getSubagentRunEndedAt hook on TaskRegistryMaintenanceRuntime. When the runtime is authoritative and endedAt is set, shouldMarkLost returns true before the 5-minute grace window (convergence ≤60 s vs ≤6 min). shouldApplyRunScopedStatusUpdate now allows lost→succeeded so a late lifecycle COMPLETE can override the maintenance-inferred lost outcome. Regression tests cover: stuck-running→lost, fresh-kill timing (before grace expires), live-run retained, non-authoritative skipped, and kill→maintenance-lost→late-COMPLETE→succeeded. Fixes #90444
1 parent 076bf2a commit f63c221

6 files changed

Lines changed: 442 additions & 119 deletions

src/agents/subagent-registry-run-manager.ts

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ import type { OpenClawConfig } from "../config/types.openclaw.js";
88
import { callGateway } from "../gateway/call.js";
99
import { createSubsystemLogger } from "../logging/subsystem.js";
1010
import { getGlobalHookRunner } from "../plugins/hook-runner-global.js";
11-
import { createRunningTaskRun } from "../tasks/detached-task-runtime.js";
11+
import {
12+
createRunningTaskRun,
13+
setDetachedTaskDeliveryStatusByRunId,
14+
} from "../tasks/detached-task-runtime.js";
1215
import { normalizeDeliveryContext } from "../utils/delivery-context.shared.js";
1316
import type { DeliveryContext } from "../utils/delivery-context.types.js";
1417
import { buildAgentRunTerminalOutcomeFromWaitResult } from "./agent-run-terminal-outcome.js";
@@ -788,6 +791,23 @@ export function createSubagentRunManager(params: {
788791
if (updated > 0) {
789792
params.persist();
790793
for (const entry of entriesByChildSessionKey.values()) {
794+
// Mark delivery not_applicable immediately; task-row status is left to
795+
// maintenance to avoid writing "failed" before a concurrent lifecycle
796+
// COMPLETE can write "succeeded" (terminal-to-terminal guard blocks it).
797+
try {
798+
setDetachedTaskDeliveryStatusByRunId({
799+
runId: entry.runId,
800+
runtime: "subagent",
801+
sessionKey: entry.childSessionKey,
802+
deliveryStatus: "not_applicable",
803+
});
804+
} catch (err) {
805+
log.warn("failed to update killed subagent background task delivery state", {
806+
err,
807+
runId: entry.runId,
808+
childSessionKey: entry.childSessionKey,
809+
});
810+
}
791811
const emitEndedHook = () =>
792812
emitSubagentEndedHookOnce({
793813
entry,

src/agents/subagent-registry.test.ts

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ const mocks = vi.hoisted(() => ({
104104
runSubagentEnded: vi.fn(async () => {}),
105105
resolveAgentTimeoutMs: vi.fn(() => 1_000),
106106
scheduleOrphanRecovery: vi.fn(),
107+
failTaskRunByRunId: vi.fn(),
108+
completeTaskRunByRunId: vi.fn(),
109+
setDetachedTaskDeliveryStatusByRunId: vi.fn(),
107110
}));
108111

109112
vi.mock("../gateway/call.js", () => ({
@@ -168,6 +171,13 @@ vi.mock("./subagent-orphan-recovery.js", () => ({
168171
scheduleOrphanRecovery: mocks.scheduleOrphanRecovery,
169172
}));
170173

174+
vi.mock("../tasks/detached-task-runtime.js", () => ({
175+
createRunningTaskRun: vi.fn(),
176+
failTaskRunByRunId: mocks.failTaskRunByRunId,
177+
completeTaskRunByRunId: mocks.completeTaskRunByRunId,
178+
setDetachedTaskDeliveryStatusByRunId: mocks.setDetachedTaskDeliveryStatusByRunId,
179+
}));
180+
171181
describe("subagent registry seam flow", () => {
172182
let mod: typeof import("./subagent-registry.js");
173183

@@ -3177,6 +3187,51 @@ describe("subagent registry seam flow", () => {
31773187
});
31783188
});
31793189

3190+
it("marks delivery not_applicable when a run is killed", () => {
3191+
// Regression: the kill path persisted subagent run terminal state but
3192+
// never finalized the mirrored task row, leaving it stuck in running so
3193+
// cancel and maintenance could not clear it (#90444).
3194+
// The fix: mark delivery not_applicable immediately (killed runs skip the
3195+
// announce flow) and defer task-row status finalization to maintenance,
3196+
// which avoids the kill-vs-complete race where a premature "failed" write
3197+
// blocks a concurrent lifecycle completion from writing "succeeded".
3198+
mod.registerSubagentRun({
3199+
runId: "run-task-kill-finalize",
3200+
childSessionKey: "agent:main:subagent:task-kill",
3201+
requesterSessionKey: "agent:main:main",
3202+
requesterDisplayKey: "main",
3203+
task: "task finalized on kill",
3204+
cleanup: "keep",
3205+
});
3206+
3207+
mocks.failTaskRunByRunId.mockClear();
3208+
mocks.setDetachedTaskDeliveryStatusByRunId.mockClear();
3209+
3210+
const updated = mod.markSubagentRunTerminated({
3211+
runId: "run-task-kill-finalize",
3212+
reason: "killed",
3213+
});
3214+
3215+
expect(updated).toBe(1);
3216+
expect(mocks.failTaskRunByRunId).not.toHaveBeenCalled();
3217+
expect(mocks.setDetachedTaskDeliveryStatusByRunId).toHaveBeenCalledOnce();
3218+
expectRecordFields(
3219+
getMockCallArg(
3220+
mocks.setDetachedTaskDeliveryStatusByRunId,
3221+
0,
3222+
0,
3223+
"setDetachedTaskDeliveryStatusByRunId call",
3224+
),
3225+
{
3226+
runId: "run-task-kill-finalize",
3227+
runtime: "subagent",
3228+
sessionKey: "agent:main:subagent:task-kill",
3229+
deliveryStatus: "not_applicable",
3230+
},
3231+
"setDetachedTaskDeliveryStatusByRunId params",
3232+
);
3233+
});
3234+
31803235
it("announces readable failure when an interrupted run is finalized", async () => {
31813236
mod.addSubagentRunForTests({
31823237
runId: "run-interrupted",

src/tasks/task-registry.maintenance.issue-60299.test.ts

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,15 @@ function createTaskRegistryMaintenanceHarness(params: {
6666
cronStore?: CronStoreFile;
6767
cronRunLogEntries?: Record<string, CronRunLogEntry[]>;
6868
runtimeAuthoritative?: boolean;
69+
terminalSubagentRunEndedAt?: Record<string, number>;
6970
}) {
7071
const sessionStore = params.sessionStore ?? {};
7172
const acpEntry = params.acpEntry;
7273
const activeCronJobIds = new Set(params.activeCronJobIds ?? []);
7374
const activeRunIds = new Set(params.activeRunIds ?? []);
7475
const activeAcpSessionKeys = new Set(params.activeAcpSessionKeys ?? []);
7576
const cronRunLogEntries = params.cronRunLogEntries ?? {};
77+
const terminalSubagentRunEndedAt = params.terminalSubagentRunEndedAt ?? {};
7678
const currentTasks = new Map(params.tasks.map((task) => [task.taskId, { ...task }]));
7779

7880
const runtime: TaskRegistryMaintenanceRuntime = {
@@ -175,6 +177,7 @@ function createTaskRegistryMaintenanceHarness(params: {
175177
resolveCronJobsStorePath: () => "/tmp/openclaw-test-cron/jobs.json",
176178
loadCronJobsStoreSync: () => params.cronStore ?? { version: 1, jobs: [] },
177179
readCronRunLogEntriesSync: ({ jobId }) => (jobId ? (cronRunLogEntries[jobId] ?? []) : []),
180+
getSubagentRunEndedAt: (runId: string) => terminalSubagentRunEndedAt[runId],
178181
};
179182

180183
setTaskRegistryMaintenanceRuntimeForTests(runtime);
@@ -727,3 +730,153 @@ describe("task-registry maintenance issue #60299", () => {
727730
expect(hookNow).toBeGreaterThanOrEqual(beforeMaintenance);
728731
});
729732
});
733+
734+
describe("task-registry maintenance issue #90444", () => {
735+
it("marks a running subagent task lost when its in-memory run is terminal", async () => {
736+
// Regression: the kill path defers task-row finalization to maintenance to
737+
// avoid the kill-vs-complete race. Maintenance must detect terminal
738+
// in-memory subagent runs and clear their stuck running task rows.
739+
const runId = "run-killed-zombie-90444";
740+
const task = makeStaleTask({
741+
runtime: "subagent",
742+
runId,
743+
childSessionKey: "agent:main:subagent:zombie-90444",
744+
});
745+
746+
// Session store still has an entry (kill happened before session cleanup).
747+
const { currentTasks } = createTaskRegistryMaintenanceHarness({
748+
tasks: [task],
749+
sessionStore: {
750+
"agent:main:subagent:zombie-90444": {
751+
sessionId: "sess-zombie-90444",
752+
updatedAt: Date.now(),
753+
},
754+
},
755+
// The in-memory run is terminal (endedAt set).
756+
terminalSubagentRunEndedAt: { [runId]: Date.now() - 5000 },
757+
runtimeAuthoritative: true,
758+
});
759+
760+
expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 1 });
761+
expectTaskStatus(currentTasks, task.taskId, "lost");
762+
});
763+
764+
it("keeps a running subagent task live when its in-memory run has not ended", async () => {
765+
const runId = "run-active-subagent-90444";
766+
const task = makeStaleTask({
767+
runtime: "subagent",
768+
runId,
769+
childSessionKey: "agent:main:subagent:active-90444",
770+
});
771+
772+
const { currentTasks } = createTaskRegistryMaintenanceHarness({
773+
tasks: [task],
774+
sessionStore: {
775+
"agent:main:subagent:active-90444": {
776+
sessionId: "sess-active-90444",
777+
updatedAt: Date.now(),
778+
},
779+
},
780+
// No endedAt in the terminal map → run is still live.
781+
terminalSubagentRunEndedAt: {},
782+
runtimeAuthoritative: true,
783+
});
784+
785+
expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 0 });
786+
expectTaskStatus(currentTasks, task.taskId, "running");
787+
});
788+
789+
it("marks a killed subagent task lost in non-authoritative (CLI maintenance) context", async () => {
790+
const runId = "run-nonauth-zombie-90444";
791+
const task = makeStaleTask({
792+
runtime: "subagent",
793+
runId,
794+
childSessionKey: "agent:main:subagent:nonauth-90444",
795+
});
796+
797+
// CLI maintenance reads endedAt from the SQLite-backed snapshot rather than
798+
// the process-local in-memory map, so it can finalize kills the gateway
799+
// persisted to SQLite even when isRuntimeAuthoritative() is false.
800+
const { currentTasks } = createTaskRegistryMaintenanceHarness({
801+
tasks: [task],
802+
sessionStore: {
803+
"agent:main:subagent:nonauth-90444": {
804+
sessionId: "sess-nonauth-90444",
805+
updatedAt: Date.now(),
806+
},
807+
},
808+
terminalSubagentRunEndedAt: { [runId]: Date.now() - 5000 },
809+
runtimeAuthoritative: false,
810+
});
811+
812+
expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 1 });
813+
expectTaskStatus(currentTasks, task.taskId, "lost");
814+
});
815+
816+
it("marks a freshly killed subagent task lost before the lost-grace window expires", async () => {
817+
// Regression for the timing gap ClawSweeper caught: the terminal-run check
818+
// must fire in shouldMarkLost before hasLostGraceExpired so a task killed
819+
// seconds ago is finalized on the next sweep, not after 5+ minutes.
820+
const now = Date.now();
821+
const runId = "run-fresh-killed-90444";
822+
const task = makeStaleTask({
823+
runtime: "subagent",
824+
runId,
825+
childSessionKey: "agent:main:subagent:fresh-90444",
826+
// Fresh timestamps: task was created and killed 30 s ago, well within
827+
// the 5-minute TASK_RECONCILE_GRACE_MS window.
828+
createdAt: now - 30_000,
829+
startedAt: now - 30_000,
830+
lastEventAt: now - 30_000,
831+
});
832+
833+
const { currentTasks } = createTaskRegistryMaintenanceHarness({
834+
tasks: [task],
835+
sessionStore: {
836+
"agent:main:subagent:fresh-90444": {
837+
sessionId: "sess-fresh-90444",
838+
updatedAt: now,
839+
},
840+
},
841+
terminalSubagentRunEndedAt: { [runId]: now - 5_000 },
842+
runtimeAuthoritative: true,
843+
});
844+
845+
expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 1 });
846+
expectTaskStatus(currentTasks, task.taskId, "lost");
847+
});
848+
849+
it("marks a same-run CLI peer task lost when the parent subagent run is terminal", async () => {
850+
// Regression for #90444: the issue reports both the parent runtime='subagent'
851+
// row and the child runtime='cli' row for the same run staying stuck. The
852+
// terminal-run fast path must cover both runtimes.
853+
const runId = "run-cli-peer-90444";
854+
const subagentTask = makeStaleTask({
855+
runtime: "subagent",
856+
runId,
857+
childSessionKey: "agent:main:subagent:peer-90444",
858+
});
859+
const cliPeerTask = makeStaleTask({
860+
runtime: "cli",
861+
sourceId: runId,
862+
childSessionKey: "agent:main:cli:peer-90444",
863+
});
864+
865+
const { currentTasks } = createTaskRegistryMaintenanceHarness({
866+
tasks: [subagentTask, cliPeerTask],
867+
sessionStore: {
868+
"agent:main:subagent:peer-90444": {
869+
sessionId: "sess-sub-peer-90444",
870+
updatedAt: Date.now(),
871+
},
872+
"agent:main:cli:peer-90444": { sessionId: "sess-cli-peer-90444", updatedAt: Date.now() },
873+
},
874+
terminalSubagentRunEndedAt: { [runId]: Date.now() - 5000 },
875+
runtimeAuthoritative: true,
876+
});
877+
878+
expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 2 });
879+
expectTaskStatus(currentTasks, subagentTask.taskId, "lost");
880+
expectTaskStatus(currentTasks, cliPeerTask.taskId, "lost");
881+
});
882+
});

0 commit comments

Comments
 (0)