openclaw
diff --git a/src/agents/subagent-registry-run-manager.ts b/src/agents/subagent-registry-run-manager.ts
Lines changed: 21 additions & 1 deletion
diff --git a/src/agents/subagent-registry.test.ts b/src/agents/subagent-registry.test.ts
Lines changed: 55 additions & 0 deletions
diff --git a/src/tasks/task-registry.maintenance.issue-60299.test.ts b/src/tasks/task-registry.maintenance.issue-60299.test.ts
Lines changed: 153 additions & 0 deletions
@@ -8,7 +8,10 @@ import type { OpenClawConfig } from "../config/types.openclaw.js";
 import { callGateway } from "../gateway/call.js";
 import { createSubsystemLogger } from "../logging/subsystem.js";
 import { getGlobalHookRunner } from "../plugins/hook-runner-global.js";
-import { createRunningTaskRun } from "../tasks/detached-task-runtime.js";
+import {
+  createRunningTaskRun,
+  setDetachedTaskDeliveryStatusByRunId,
+} from "../tasks/detached-task-runtime.js";
 import { normalizeDeliveryContext } from "../utils/delivery-context.shared.js";
 import type { DeliveryContext } from "../utils/delivery-context.types.js";
 import { buildAgentRunTerminalOutcomeFromWaitResult } from "./agent-run-terminal-outcome.js";
@@ -788,6 +791,23 @@ export function createSubagentRunManager(params: {
     if (updated > 0) {
       params.persist();
       for (const entry of entriesByChildSessionKey.values()) {
+        // Mark delivery not_applicable immediately, but leave task-row status to
+        // maintenance: writing "failed" here could land before a concurrent lifecycle
+        // COMPLETE, whose "succeeded" write the terminal-to-terminal guard would block.
+        try {
+          setDetachedTaskDeliveryStatusByRunId({
+            runId: entry.runId,
+            runtime: "subagent",
+            sessionKey: entry.childSessionKey,
+            deliveryStatus: "not_applicable",
+          });
+        } catch (err) {
+          log.warn("failed to update killed subagent background task delivery state", {
+            err,
+            runId: entry.runId,
+            childSessionKey: entry.childSessionKey,
+          });
+        }
         const emitEndedHook = () =>
           emitSubagentEndedHookOnce({
             entry,
 
@@ -104,6 +104,9 @@ const mocks = vi.hoisted(() => ({
   runSubagentEnded: vi.fn(async () => {}),
   resolveAgentTimeoutMs: vi.fn(() => 1_000),
   scheduleOrphanRecovery: vi.fn(),
+  failTaskRunByRunId: vi.fn(),
+  completeTaskRunByRunId: vi.fn(),
+  setDetachedTaskDeliveryStatusByRunId: vi.fn(),
 }));
 
 vi.mock("../gateway/call.js", () => ({
@@ -168,6 +171,13 @@ vi.mock("./subagent-orphan-recovery.js", () => ({
   scheduleOrphanRecovery: mocks.scheduleOrphanRecovery,
 }));
 
+vi.mock("../tasks/detached-task-runtime.js", () => ({
+  createRunningTaskRun: vi.fn(),
+  failTaskRunByRunId: mocks.failTaskRunByRunId,
+  completeTaskRunByRunId: mocks.completeTaskRunByRunId,
+  setDetachedTaskDeliveryStatusByRunId: mocks.setDetachedTaskDeliveryStatusByRunId,
+}));
+
 describe("subagent registry seam flow", () => {
   let mod: typeof import("./subagent-registry.js");
 
@@ -3177,6 +3187,51 @@ describe("subagent registry seam flow", () => {
     });
   });
 
+  it("marks delivery not_applicable when a run is killed", () => {
+    // Regression: the kill path persisted subagent run terminal state but
+    // never finalized the mirrored task row, leaving it stuck in running so
+    // cancel and maintenance could not clear it (#90444).
+    // The fix: mark delivery not_applicable immediately (killed runs skip the
+    // announce flow) and defer task-row status finalization to maintenance,
+    // which avoids the kill-vs-complete race where a premature "failed" write
+    // blocks a concurrent lifecycle completion from writing "succeeded".
+    mod.registerSubagentRun({
+      runId: "run-task-kill-finalize",
+      childSessionKey: "agent:main:subagent:task-kill",
+      requesterSessionKey: "agent:main:main",
+      requesterDisplayKey: "main",
+      task: "task finalized on kill",
+      cleanup: "keep",
+    });
+
+    mocks.failTaskRunByRunId.mockClear();
+    mocks.setDetachedTaskDeliveryStatusByRunId.mockClear();
+
+    const updated = mod.markSubagentRunTerminated({
+      runId: "run-task-kill-finalize",
+      reason: "killed",
+    });
+
+    expect(updated).toBe(1);
+    expect(mocks.failTaskRunByRunId).not.toHaveBeenCalled();
+    expect(mocks.setDetachedTaskDeliveryStatusByRunId).toHaveBeenCalledOnce();
+    expectRecordFields(
+      getMockCallArg(
+        mocks.setDetachedTaskDeliveryStatusByRunId,
+        0,
+        0,
+        "setDetachedTaskDeliveryStatusByRunId call",
+      ),
+      {
+        runId: "run-task-kill-finalize",
+        runtime: "subagent",
+        sessionKey: "agent:main:subagent:task-kill",
+        deliveryStatus: "not_applicable",
+      },
+      "setDetachedTaskDeliveryStatusByRunId params",
+    );
+  });
+
   it("announces readable failure when an interrupted run is finalized", async () => {
     mod.addSubagentRunForTests({
       runId: "run-interrupted",
 
@@ -66,13 +66,15 @@ function createTaskRegistryMaintenanceHarness(params: {
   cronStore?: CronStoreFile;
   cronRunLogEntries?: Record<string, CronRunLogEntry[]>;
   runtimeAuthoritative?: boolean;
+  terminalSubagentRunEndedAt?: Record<string, number>;
 }) {
   const sessionStore = params.sessionStore ?? {};
   const acpEntry = params.acpEntry;
   const activeCronJobIds = new Set(params.activeCronJobIds ?? []);
   const activeRunIds = new Set(params.activeRunIds ?? []);
   const activeAcpSessionKeys = new Set(params.activeAcpSessionKeys ?? []);
   const cronRunLogEntries = params.cronRunLogEntries ?? {};
+  const terminalSubagentRunEndedAt = params.terminalSubagentRunEndedAt ?? {};
   const currentTasks = new Map(params.tasks.map((task) => [task.taskId, { ...task }]));
 
   const runtime: TaskRegistryMaintenanceRuntime = {
@@ -175,6 +177,7 @@ function createTaskRegistryMaintenanceHarness(params: {
     resolveCronJobsStorePath: () => "/tmp/openclaw-test-cron/jobs.json",
     loadCronJobsStoreSync: () => params.cronStore ?? { version: 1, jobs: [] },
     readCronRunLogEntriesSync: ({ jobId }) => (jobId ? (cronRunLogEntries[jobId] ?? []) : []),
+    getSubagentRunEndedAt: (runId: string) => terminalSubagentRunEndedAt[runId],
   };
 
   setTaskRegistryMaintenanceRuntimeForTests(runtime);
@@ -727,3 +730,153 @@ describe("task-registry maintenance issue #60299", () => {
     expect(hookNow).toBeGreaterThanOrEqual(beforeMaintenance);
   });
 });
+
+describe("task-registry maintenance issue #90444", () => {
+  it("marks a running subagent task lost when its in-memory run is terminal", async () => {
+    // Regression: the kill path defers task-row finalization to maintenance to
+    // avoid the kill-vs-complete race. Maintenance must detect terminal
+    // in-memory subagent runs and clear their stuck running task rows.
+    const runId = "run-killed-zombie-90444";
+    const task = makeStaleTask({
+      runtime: "subagent",
+      runId,
+      childSessionKey: "agent:main:subagent:zombie-90444",
+    });
+
+    // Session store still has an entry (kill happened before session cleanup).
+    const { currentTasks } = createTaskRegistryMaintenanceHarness({
+      tasks: [task],
+      sessionStore: {
+        "agent:main:subagent:zombie-90444": {
+          sessionId: "sess-zombie-90444",
+          updatedAt: Date.now(),
+        },
+      },
+      // The in-memory run is terminal (endedAt set).
+      terminalSubagentRunEndedAt: { [runId]: Date.now() - 5000 },
+      runtimeAuthoritative: true,
+    });
+
+    expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 1 });
+    expectTaskStatus(currentTasks, task.taskId, "lost");
+  });
+
+  it("keeps a running subagent task live when its in-memory run has not ended", async () => {
+    const runId = "run-active-subagent-90444";
+    const task = makeStaleTask({
+      runtime: "subagent",
+      runId,
+      childSessionKey: "agent:main:subagent:active-90444",
+    });
+
+    const { currentTasks } = createTaskRegistryMaintenanceHarness({
+      tasks: [task],
+      sessionStore: {
+        "agent:main:subagent:active-90444": {
+          sessionId: "sess-active-90444",
+          updatedAt: Date.now(),
+        },
+      },
+      // No endedAt in the terminal map → run is still live.
+      terminalSubagentRunEndedAt: {},
+      runtimeAuthoritative: true,
+    });
+
+    expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 0 });
+    expectTaskStatus(currentTasks, task.taskId, "running");
+  });
+
+  it("marks a killed subagent task lost in non-authoritative (CLI maintenance) context", async () => {
+    const runId = "run-nonauth-zombie-90444";
+    const task = makeStaleTask({
+      runtime: "subagent",
+      runId,
+      childSessionKey: "agent:main:subagent:nonauth-90444",
+    });
+
+    // CLI maintenance reads endedAt from the SQLite-backed snapshot rather than
+    // the process-local in-memory map, so it can finalize kills the gateway
+    // persisted to SQLite even when isRuntimeAuthoritative() is false.
+    const { currentTasks } = createTaskRegistryMaintenanceHarness({
+      tasks: [task],
+      sessionStore: {
+        "agent:main:subagent:nonauth-90444": {
+          sessionId: "sess-nonauth-90444",
+          updatedAt: Date.now(),
+        },
+      },
+      terminalSubagentRunEndedAt: { [runId]: Date.now() - 5000 },
+      runtimeAuthoritative: false,
+    });
+
+    expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 1 });
+    expectTaskStatus(currentTasks, task.taskId, "lost");
+  });
+
+  it("marks a freshly killed subagent task lost before the lost-grace window expires", async () => {
+    // Regression for the timing gap ClawSweeper caught: the terminal-run check
+    // must fire in shouldMarkLost before hasLostGraceExpired so a task killed
+    // seconds ago is finalized on the next sweep, not after 5+ minutes.
+    const now = Date.now();
+    const runId = "run-fresh-killed-90444";
+    const task = makeStaleTask({
+      runtime: "subagent",
+      runId,
+      childSessionKey: "agent:main:subagent:fresh-90444",
+      // Fresh timestamps: the task was created 30 s ago and its run ended 5 s ago
+      // (endedAt below), both well within the 5-minute TASK_RECONCILE_GRACE_MS window.
+      createdAt: now - 30_000,
+      startedAt: now - 30_000,
+      lastEventAt: now - 30_000,
+    });
+
+    const { currentTasks } = createTaskRegistryMaintenanceHarness({
+      tasks: [task],
+      sessionStore: {
+        "agent:main:subagent:fresh-90444": {
+          sessionId: "sess-fresh-90444",
+          updatedAt: now,
+        },
+      },
+      terminalSubagentRunEndedAt: { [runId]: now - 5_000 },
+      runtimeAuthoritative: true,
+    });
+
+    expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 1 });
+    expectTaskStatus(currentTasks, task.taskId, "lost");
+  });
+
+  it("marks a same-run CLI peer task lost when the parent subagent run is terminal", async () => {
+    // Regression for #90444: the issue reports both the parent runtime='subagent'
+    // row and the child runtime='cli' row for the same run staying stuck. The
+    // terminal-run fast path must cover both runtimes.
+    const runId = "run-cli-peer-90444";
+    const subagentTask = makeStaleTask({
+      runtime: "subagent",
+      runId,
+      childSessionKey: "agent:main:subagent:peer-90444",
+    });
+    const cliPeerTask = makeStaleTask({
+      runtime: "cli",
+      sourceId: runId,
+      childSessionKey: "agent:main:cli:peer-90444",
+    });
+
+    const { currentTasks } = createTaskRegistryMaintenanceHarness({
+      tasks: [subagentTask, cliPeerTask],
+      sessionStore: {
+        "agent:main:subagent:peer-90444": {
+          sessionId: "sess-sub-peer-90444",
+          updatedAt: Date.now(),
+        },
+        "agent:main:cli:peer-90444": { sessionId: "sess-cli-peer-90444", updatedAt: Date.now() },
+      },
+      terminalSubagentRunEndedAt: { [runId]: Date.now() - 5000 },
+      runtimeAuthoritative: true,
+    });
+
+    expectMaintenanceCounts(await runTaskRegistryMaintenance(), { reconciled: 2 });
+    expectTaskStatus(currentTasks, subagentTask.taskId, "lost");
+    expectTaskStatus(currentTasks, cliPeerTask.taskId, "lost");
+  });
+});