@@ -66,13 +66,15 @@ function createTaskRegistryMaintenanceHarness(params: {
6666 cronStore ?: CronStoreFile ;
6767 cronRunLogEntries ?: Record < string , CronRunLogEntry [ ] > ;
6868 runtimeAuthoritative ?: boolean ;
69+ terminalSubagentRunEndedAt ?: Record < string , number > ;
6970} ) {
7071 const sessionStore = params . sessionStore ?? { } ;
7172 const acpEntry = params . acpEntry ;
7273 const activeCronJobIds = new Set ( params . activeCronJobIds ?? [ ] ) ;
7374 const activeRunIds = new Set ( params . activeRunIds ?? [ ] ) ;
7475 const activeAcpSessionKeys = new Set ( params . activeAcpSessionKeys ?? [ ] ) ;
7576 const cronRunLogEntries = params . cronRunLogEntries ?? { } ;
77+ const terminalSubagentRunEndedAt = params . terminalSubagentRunEndedAt ?? { } ;
7678 const currentTasks = new Map ( params . tasks . map ( ( task ) => [ task . taskId , { ...task } ] ) ) ;
7779
7880 const runtime : TaskRegistryMaintenanceRuntime = {
@@ -175,6 +177,7 @@ function createTaskRegistryMaintenanceHarness(params: {
175177 resolveCronJobsStorePath : ( ) => "/tmp/openclaw-test-cron/jobs.json" ,
176178 loadCronJobsStoreSync : ( ) => params . cronStore ?? { version : 1 , jobs : [ ] } ,
177179 readCronRunLogEntriesSync : ( { jobId } ) => ( jobId ? ( cronRunLogEntries [ jobId ] ?? [ ] ) : [ ] ) ,
180+ getSubagentRunEndedAt : ( runId : string ) => terminalSubagentRunEndedAt [ runId ] ,
178181 } ;
179182
180183 setTaskRegistryMaintenanceRuntimeForTests ( runtime ) ;
@@ -727,3 +730,153 @@ describe("task-registry maintenance issue #60299", () => {
727730 expect ( hookNow ) . toBeGreaterThanOrEqual ( beforeMaintenance ) ;
728731 } ) ;
729732} ) ;
733+
/**
 * Regression coverage for issue #90444: task rows left in "running" after
 * their subagent run has already ended.
 *
 * Each case seeds the harness with stale task rows plus a
 * `terminalSubagentRunEndedAt` map; the harness exposes that map through its
 * `getSubagentRunEndedAt` runtime stub, keyed by run id.
 */
describe("task-registry maintenance issue #90444", () => {
  it("marks a running subagent task lost when its in-memory run is terminal", async () => {
    // The kill path leaves task-row finalization to maintenance so it cannot
    // race with normal completion. Maintenance therefore has to notice that
    // the in-memory subagent run is terminal and clear the stuck running row.
    const zombieRunId = "run-killed-zombie-90444";
    const zombieSessionKey = "agent:main:subagent:zombie-90444";
    const zombieTask = makeStaleTask({
      runtime: "subagent",
      runId: zombieRunId,
      childSessionKey: zombieSessionKey,
    });

    const harness = createTaskRegistryMaintenanceHarness({
      tasks: [zombieTask],
      // The session entry is still present: the kill landed before session cleanup.
      sessionStore: {
        [zombieSessionKey]: {
          sessionId: "sess-zombie-90444",
          updatedAt: Date.now(),
        },
      },
      // An endedAt value marks the in-memory run as terminal.
      terminalSubagentRunEndedAt: { [zombieRunId]: Date.now() - 5000 },
      runtimeAuthoritative: true,
    });

    const counts = await runTaskRegistryMaintenance();
    expectMaintenanceCounts(counts, { reconciled: 1 });
    expectTaskStatus(harness.currentTasks, zombieTask.taskId, "lost");
  });

  it("keeps a running subagent task live when its in-memory run has not ended", async () => {
    const liveRunId = "run-active-subagent-90444";
    const liveSessionKey = "agent:main:subagent:active-90444";
    const liveTask = makeStaleTask({
      runtime: "subagent",
      runId: liveRunId,
      childSessionKey: liveSessionKey,
    });

    const harness = createTaskRegistryMaintenanceHarness({
      tasks: [liveTask],
      sessionStore: {
        [liveSessionKey]: {
          sessionId: "sess-active-90444",
          updatedAt: Date.now(),
        },
      },
      // Empty terminal map: no endedAt is recorded, so the run counts as live.
      terminalSubagentRunEndedAt: {},
      runtimeAuthoritative: true,
    });

    const counts = await runTaskRegistryMaintenance();
    expectMaintenanceCounts(counts, { reconciled: 0 });
    expectTaskStatus(harness.currentTasks, liveTask.taskId, "running");
  });

  it("marks a killed subagent task lost in non-authoritative (CLI maintenance) context", async () => {
    const killedRunId = "run-nonauth-zombie-90444";
    const killedSessionKey = "agent:main:subagent:nonauth-90444";
    const killedTask = makeStaleTask({
      runtime: "subagent",
      runId: killedRunId,
      childSessionKey: killedSessionKey,
    });

    // CLI maintenance takes endedAt from the SQLite-backed snapshot instead of
    // the process-local in-memory map, which lets it finalize kills the
    // gateway persisted to SQLite even while isRuntimeAuthoritative() is false.
    const harness = createTaskRegistryMaintenanceHarness({
      tasks: [killedTask],
      sessionStore: {
        [killedSessionKey]: {
          sessionId: "sess-nonauth-90444",
          updatedAt: Date.now(),
        },
      },
      terminalSubagentRunEndedAt: { [killedRunId]: Date.now() - 5000 },
      runtimeAuthoritative: false,
    });

    const counts = await runTaskRegistryMaintenance();
    expectMaintenanceCounts(counts, { reconciled: 1 });
    expectTaskStatus(harness.currentTasks, killedTask.taskId, "lost");
  });

  it("marks a freshly killed subagent task lost before the lost-grace window expires", async () => {
    // Covers the timing gap ClawSweeper caught: shouldMarkLost has to run the
    // terminal-run check ahead of hasLostGraceExpired, so a task killed only
    // seconds ago is finalized on the very next sweep rather than 5+ minutes
    // later.
    const now = Date.now();
    const freshRunId = "run-fresh-killed-90444";
    const freshSessionKey = "agent:main:subagent:fresh-90444";
    // Created and killed 30 s ago, i.e. well inside the 5-minute
    // TASK_RECONCILE_GRACE_MS window.
    const thirtySecondsAgo = now - 30_000;
    const freshTask = makeStaleTask({
      runtime: "subagent",
      runId: freshRunId,
      childSessionKey: freshSessionKey,
      createdAt: thirtySecondsAgo,
      startedAt: thirtySecondsAgo,
      lastEventAt: thirtySecondsAgo,
    });

    const harness = createTaskRegistryMaintenanceHarness({
      tasks: [freshTask],
      sessionStore: {
        [freshSessionKey]: {
          sessionId: "sess-fresh-90444",
          updatedAt: now,
        },
      },
      terminalSubagentRunEndedAt: { [freshRunId]: now - 5_000 },
      runtimeAuthoritative: true,
    });

    const counts = await runTaskRegistryMaintenance();
    expectMaintenanceCounts(counts, { reconciled: 1 });
    expectTaskStatus(harness.currentTasks, freshTask.taskId, "lost");
  });

  it("marks a same-run CLI peer task lost when the parent subagent run is terminal", async () => {
    // Issue #90444 reports two rows stuck for one run: the parent
    // runtime='subagent' row and the child runtime='cli' row. The terminal-run
    // fast path has to handle both runtimes.
    const sharedRunId = "run-cli-peer-90444";
    const parentSessionKey = "agent:main:subagent:peer-90444";
    const peerSessionKey = "agent:main:cli:peer-90444";
    const parentTask = makeStaleTask({
      runtime: "subagent",
      runId: sharedRunId,
      childSessionKey: parentSessionKey,
    });
    // The CLI peer references the shared run through sourceId, not runId.
    const peerTask = makeStaleTask({
      runtime: "cli",
      sourceId: sharedRunId,
      childSessionKey: peerSessionKey,
    });

    const harness = createTaskRegistryMaintenanceHarness({
      tasks: [parentTask, peerTask],
      sessionStore: {
        [parentSessionKey]: {
          sessionId: "sess-sub-peer-90444",
          updatedAt: Date.now(),
        },
        [peerSessionKey]: {
          sessionId: "sess-cli-peer-90444",
          updatedAt: Date.now(),
        },
      },
      terminalSubagentRunEndedAt: { [sharedRunId]: Date.now() - 5000 },
      runtimeAuthoritative: true,
    });

    const counts = await runTaskRegistryMaintenance();
    expectMaintenanceCounts(counts, { reconciled: 2 });
    expectTaskStatus(harness.currentTasks, parentTask.taskId, "lost");
    expectTaskStatus(harness.currentTasks, peerTask.taskId, "lost");
  });
});
0 commit comments