feat(agent): guard todo completion transitions#2576
Conversation
|
C:/Program Files/Git/e2e diff |
|
🤖 e2e bot failed — see the run log. |
|
C:/Program Files/Git/e2e diff |
🤖 Reasonix e2e — diff test-genResult: ❌ fail · 3 changed source file(s) across 3 package(s)
Packages: ./internal/agent, ./internal/evidence, ./internal/tool/builtin go test output (tail)agent run note: exit status 1 Pass = the agent added ≥1 new test function, the affected packages' tests are green, AND those tests fail when the PR's source is reverted (so they genuinely cover the change).
|
|
C:/Program Files/Git/e2e diff |
🤖 Reasonix e2e — diff test-genResult: ✅ pass · 3 changed source file(s) across 3 package(s)
Packages: ./internal/agent, ./internal/evidence, ./internal/tool/builtin Pass = the agent added ≥1 new test function, the affected packages' tests are green, AND those tests fail when the PR's source is reverted (so they genuinely cover the change).
|
|
C:/Program Files/Git/e2e diff x2 |
🤖 Reasonix e2e — diff test-genResult: ✅ pass · 3 changed source file(s) across 3 package(s)
Packages: ./internal/agent, ./internal/evidence, ./internal/tool/builtin Per-test differential
Generated tests (review the assertions)diff --git a/internal/evidence/evidence_test.go b/internal/evidence/evidence_test.go
index b346b2e2..c876037f 100644
--- a/internal/evidence/evidence_test.go
+++ b/internal/evidence/evidence_test.go
@@ -251,3 +251,117 @@ func TestLedgerNoBaselineDoesNotConstrainCompletedTodos(t *testing.T) {
t.Fatalf("no baseline should not report missing completions, got %+v", missing)
}
}
+
+func TestUnverifiedCompletedTodosNilLedger(t *testing.T) {
+ var ledger *Ledger
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos([]TodoItem{
+ {Content: "Add parser", Status: "completed"},
+ })
+
+ if hasBaseline {
+ t.Fatal("nil ledger should not report a baseline")
+ }
+ if len(missing) != 0 {
+ t.Fatalf("nil ledger should not report missing completions, got %+v", missing)
+ }
+}
+
+func TestUnverifiedCompletedTodosEmptyCurrent(t *testing.T) {
+ ledger := NewLedger()
+ ledger.Record(Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []TodoItem{{Content: "Add parser", Status: "in_progress"}},
+ })
+
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos(nil)
+ if !hasBaseline {
+ t.Fatal("non-empty ledger should report a baseline even with empty current")
+ }
+ if len(missing) != 0 {
+ t.Fatalf("empty current todos should not report missing completions, got %+v", missing)
+ }
+
+ missing, hasBaseline = ledger.UnverifiedCompletedTodos([]TodoItem{})
+ if !hasBaseline {
+ t.Fatal("non-empty ledger should report a baseline even with empty slice")
+ }
+ if len(missing) != 0 {
+ t.Fatalf("empty slice should not report missing completions, got %+v", missing)
+ }
+}
+
+func TestUnverifiedCompletedTodosPartialAuthorization(t *testing.T) {
+ ledger := NewLedger()
+ ledger.Record(Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []TodoItem{
+ {Content: "Add parser", Status: "in_progress"},
+ {Content: "Wire parser", Status: "pending"},
+ },
+ })
+ // Only "Add parser" has a matching complete_step.
+ ledger.Record(Receipt{ToolName: "complete_step", Success: true, Step: "Add parser"})
+
+ current := []TodoItem{
+ {Content: "Add parser", Status: "completed"},
+ {Content: "Wire parser", Status: "completed"},
+ }
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos(current)
+ if !hasBaseline {
+ t.Fatal("expected prior todo_write baseline")
+ }
+ if len(missing) != 1 || missing[0].Content != "Wire parser" {
+ t.Fatalf("missing = %+v, want only Wire parser", missing)
+ }
+}
+
+func TestUnverifiedCompletedTodosAlreadyCompletedByPosition(t *testing.T) {
+ ledger := NewLedger()
+ ledger.Record(Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []TodoItem{
+ {Content: "Add parser", Status: "completed"},
+ {Content: "Wire parser", Status: "in_progress"},
+ },
+ })
+
+ current := []TodoItem{
+ {Content: "Add parser", Status: "completed"},
+ {Content: "Wire parser", Status: "completed"},
+ }
+ // "Add parser" was already completed in baseline (same position, same content).
+ // "Wire parser" newly completed but no complete_step.
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos(current)
+ if !hasBaseline {
+ t.Fatal("expected prior todo_write baseline")
+ }
+ if len(missing) != 1 || missing[0].Content != "Wire parser" {
+ t.Fatalf("missing = %+v, want only Wire parser", missing)
+ }
+}
+
+func TestUnverifiedCompletedTodosAlreadyCompletedByIdentity(t *testing.T) {
+ ledger := NewLedger()
+ ledger.Record(Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []TodoItem{
+ {Content: "Add parser", Status: "completed"},
+ },
+ })
+ // New todo_write moves "Add parser" to position 2, but its content matches.
+ current := []TodoItem{
+ {Content: "Setup project", Status: "in_progress"},
+ {Content: "Add parser", Status: "completed"},
+ }
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos(current)
+ if !hasBaseline {
+ t.Fatal("expected prior todo_write baseline")
+ }
+ if len(missing) != 0 {
+ t.Fatalf("already-completed item should match by identity despite position change, missing = %+v", missing)
+ }
+}
diff --git a/internal/tool/builtin/todo_test.go b/internal/tool/builtin/todo_test.go
index f475c4d0..bba36a40 100644
--- a/internal/tool/builtin/todo_test.go
+++ b/internal/tool/builtin/todo_test.go
@@ -83,3 +83,70 @@ func TestTodoWriteIgnoresFailedCompleteStepReceipt(t *testing.T) {
t.Fatalf("failed complete_step should not authorize new completion, got %v", err)
}
}
+
+func TestTodoWriteAcceptsPreviouslyCompleted(t *testing.T) {
+ ledger := evidence.NewLedger()
+ ledger.Record(evidence.Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []evidence.TodoItem{{Content: "Add parser", Status: "completed"}},
+ })
+ ledger.Record(evidence.Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []evidence.TodoItem{{Content: "Add parser", Status: "completed"}, {Content: "Wire parser", Status: "in_progress"}},
+ })
+ ctx := evidence.WithLedger(context.Background(), ledger)
+ args := json.RawMessage(`{"todos":[{"content":"Add parser","status":"completed"},{"content":"Wire parser","status":"completed"}]}`)
+
+ _, err := (todoWrite{}).Execute(ctx, args)
+ // "Add parser" was already completed before — allowed.
+ // "Wire parser" newly completed without a complete_step — rejected.
+ if err == nil || !strings.Contains(err.Error(), "Wire parser") {
+ t.Fatalf("expected rejection for newly completed 'Wire parser', got %v", err)
+ }
+}
+
+func TestTodoWriteRejectsMultipleNewCompletionsWithoutStep(t *testing.T) {
+ ledger := evidence.NewLedger()
+ ledger.Record(evidence.Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []evidence.TodoItem{
+ {Content: "Add parser", Status: "in_progress"},
+ {Content: "Wire parser", Status: "pending"},
+ },
+ })
+ ctx := evidence.WithLedger(context.Background(), ledger)
+ args := json.RawMessage(`{"todos":[
+ {"content":"Add parser","status":"completed"},
+ {"content":"Wire parser","status":"completed"}
+ ]}`)
+
+ _, err := (todoWrite{}).Execute(ctx, args)
+ if err == nil {
+ t.Fatal("expected rejection when multiple todos newly completed without complete_step")
+ }
+ // Error should mention the count (plural form) or each item.
+ if !strings.Contains(err.Error(), "complete_step") {
+ t.Fatalf("error should mention complete_step, got %v", err)
+ }
+}
+
+func TestTodoWriteEmptyTodosSucceeds(t *testing.T) {
+ ctx := evidence.WithLedger(context.Background(), evidence.NewLedger())
+ args := json.RawMessage(`{"todos":[]}`)
+
+ _, err := (todoWrite{}).Execute(ctx, args)
+ if err != nil {
+ t.Fatalf("empty todos list should be accepted, got %v", err)
+ }
+}
+
+func TestTodoWriteAllowsCompletionWithoutLedger(t *testing.T) {
+ // No ledger in context — preserve loose behavior.
+ args := json.RawMessage(`{"todos":[{"content":"Add parser","status":"completed"}]}`)
+ if _, err := (todoWrite{}).Execute(context.Background(), args); err != nil {
+ t.Fatalf("completion without a ledger in context should be accepted, got %v", err)
+ }
+}
Pass = the agent added ≥1 test, the affected packages are green, AND ≥1 new test fails when the PR's source is reverted. "By assertion" pins are strong (they check changed behavior); "by compile only" pins just need a PR-added symbol — and since Go compiles per package, one compile-coupled test marks every test in its package that way, so read the diff above to judge the rest.
|
|
C:/Program Files/Git/e2e diff |
🤖 Reasonix e2e — diff test-genResult: ✅ pass · 3 changed source file(s) across 3 package(s)
Packages: ./internal/agent, ./internal/evidence, ./internal/tool/builtin Single stochastic run — a green result is one sample, not a guarantee. Comment Per-test differential
Generated tests (review the assertions)diff --git a/internal/agent/guards_test.go b/internal/agent/guards_test.go
index 26e5ce8d..3547261f 100644
--- a/internal/agent/guards_test.go
+++ b/internal/agent/guards_test.go
@@ -198,6 +198,33 @@ func TestPartitionToolCallsTodoWriteSerial(t *testing.T) {
}
}
+func TestPartitionToolCallsTodoWriteAndCompleteStepSerial(t *testing.T) {
+ // Both todo_write and complete_step are special-cased as serial, even when
+ // read-only, so neither joins a parallel run with other read-only tools.
+ reg := tool.NewRegistry()
+ reg.Add(fakeTool{name: "grep", readOnly: true})
+ reg.Add(fakeTool{name: "todo_write", readOnly: true})
+ reg.Add(fakeTool{name: "complete_step", readOnly: true})
+ reg.Add(fakeTool{name: "read_file", readOnly: true})
+
+ calls := []provider.ToolCall{
+ {Name: "grep"},
+ {Name: "todo_write"},
+ {Name: "complete_step"},
+ {Name: "read_file"},
+ }
+ got := partitionToolCalls(reg, calls)
+ want := []toolCallBatch{
+ {start: 0, end: 1, parallel: true},
+ {start: 1, end: 2},
+ {start: 2, end: 3},
+ {start: 3, end: 4, parallel: true},
+ }
+ if !reflect.DeepEqual(got, want) {
+ t.Fatalf("partitionToolCalls = %+v, want %+v", got, want)
+ }
+}
+
// TestExecuteBatchParallelReadOnly checks that three 80ms read-only calls
// complete in well under 3×80ms — the wall-clock proof of true parallelism.
func TestExecuteBatchParallelReadOnly(t *testing.T) {
@@ -310,6 +337,68 @@ func TestExecuteOneFailedReceiptDoesNotVerify(t *testing.T) {
}
}
+func TestExecuteOneCompleteStepRecordsReceiptOnSuccess(t *testing.T) {
+ completeStep, ok := tool.LookupBuiltin("complete_step")
+ if !ok {
+ t.Fatal("complete_step builtin not registered")
+ }
+ todoWrite, ok := tool.LookupBuiltin("todo_write")
+ if !ok {
+ t.Fatal("todo_write builtin not registered")
+ }
+ reg := tool.NewRegistry()
+ reg.Add(completeStep)
+ reg.Add(todoWrite)
+ a := New(nil, reg, NewSession(""), Options{}, event.Discard)
+
+ // First establish a baseline: todo_write with "Add parser" in_progress.
+ a.executeOne(context.Background(), provider.ToolCall{
+ Name: "todo_write",
+ Arguments: `{"todos":[{"content":"Add parser","status":"in_progress"}]}`,
+ })
+
+ // Then a successful complete_step for "Add parser".
+ _ = a.executeOne(context.Background(), provider.ToolCall{
+ Name: "complete_step",
+ Arguments: `{
+ "step":"Add parser",
+ "result":"parser added",
+ "evidence":[{"kind":"manual","summary":"checked"}]
+ }`,
+ })
+
+ // Now a todo_write that tries to mark "Add parser" as completed. Because the
+ // complete_step receipt was recorded on a.evidence, the completion should be
+ // authorized and the call must succeed.
+ out := a.executeOne(context.Background(), provider.ToolCall{
+ Name: "todo_write",
+ Arguments: `{"todos":[{"content":"Add parser","status":"completed"}]}`,
+ })
+ if out.errMsg != "" {
+ t.Fatalf("todo_write rejected after successful complete_step: errMsg=%q, output=%q", out.errMsg, out.output)
+ }
+}
+
+func TestExecuteOneCompleteStepDoesNotRecordReceiptOnError(t *testing.T) {
+ // A complete_step call that returns an error (e.g. missing step field)
+ // must NOT record any receipt — failed complete_step receipts are silently
+ // dropped so they never accidentally authorize a todo completion.
+ reg := tool.NewRegistry()
+ reg.Add(fakeTool{name: "complete_step", readOnly: true, err: errors.New("validation failed")})
+ a := New(nil, reg, NewSession(""), Options{}, event.Discard)
+
+ a.executeOne(context.Background(), provider.ToolCall{
+ Name: "complete_step",
+ Arguments: `{"step":"","result":"","evidence":[]}`,
+ })
+
+ // The ledger should be empty — no receipt recorded for the failed call.
+ _, ok := a.evidence.MatchLatestTodoStep("anything")
+ if ok {
+ t.Fatal("errored complete_step must not record a receipt")
+ }
+}
+
func TestRunResetsEvidenceLedger(t *testing.T) {
reg := tool.NewRegistry()
reg.Add(fakeTool{name: "bash", readOnly: false})
diff --git a/internal/evidence/evidence_test.go b/internal/evidence/evidence_test.go
index b346b2e2..8e2d6359 100644
--- a/internal/evidence/evidence_test.go
+++ b/internal/evidence/evidence_test.go
@@ -238,6 +238,119 @@ func TestLedgerMatchesCompletionByActiveFormAndNumber(t *testing.T) {
}
}
+func TestUnverifiedCompletedTodosPartialAuthorization(t *testing.T) {
+ ledger := NewLedger()
+ ledger.Record(Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []TodoItem{
+ {Content: "Add parser", Status: "in_progress"},
+ {Content: "Write tests", Status: "in_progress"},
+ {Content: "Document", Status: "pending"},
+ },
+ })
+
+ // Only "Add parser" has a matching complete_step receipt.
+ ledger.Record(Receipt{ToolName: "complete_step", Success: true, Step: "Add parser"})
+
+ current := []TodoItem{
+ {Content: "Add parser", Status: "completed"},
+ {Content: "Write tests", Status: "completed"},
+ {Content: "Document", Status: "pending"},
+ }
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos(current)
+ if !hasBaseline {
+ t.Fatal("expected prior todo_write baseline")
+ }
+ if len(missing) != 1 || missing[0].Content != "Write tests" {
+ t.Fatalf("missing = %+v, want only Write tests (Add parser should be authorized)", missing)
+ }
+}
+
+func TestUnverifiedCompletedTodosAlreadyCompletedTransition(t *testing.T) {
+ // An item that was already completed in the baseline and remains completed
+ // in the current list should not be flagged as missing.
+ ledger := NewLedger()
+ ledger.Record(Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []TodoItem{
+ {Content: "Bootstrap", Status: "completed"},
+ {Content: "Add parser", Status: "in_progress"},
+ },
+ })
+
+ current := []TodoItem{
+ {Content: "Bootstrap", Status: "completed"},
+ {Content: "Add parser", Status: "completed"},
+ }
+ ledger.Record(Receipt{ToolName: "complete_step", Success: true, Step: "Add parser"})
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos(current)
+ if !hasBaseline {
+ t.Fatal("expected prior todo_write baseline")
+ }
+ if len(missing) != 0 {
+ t.Fatalf("already-completed baseline item and authorized new completion should yield no missing, got %+v", missing)
+ }
+}
+
+func TestUnverifiedCompletedTodosIdentityByActiveForm(t *testing.T) {
+ // previousTodoCompleted must match by activeForm when position differs.
+ ledger := NewLedger()
+ ledger.Record(Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []TodoItem{
+ {Content: "Add parser", Status: "in_progress", ActiveForm: "Adding parser"},
+ },
+ })
+
+ current := []TodoItem{
+ {Content: "Add parser", Status: "completed", ActiveForm: "Adding parser"},
+ }
+ // complete_step matches by activeForm.
+ ledger.Record(Receipt{ToolName: "complete_step", Success: true, Step: "Adding parser"})
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos(current)
+ if !hasBaseline {
+ t.Fatal("expected prior todo_write baseline")
+ }
+ if len(missing) != 0 {
+ t.Fatalf("matching by activeForm should authorize completion, missing = %+v", missing)
+ }
+}
+
+func TestUnverifiedCompletedTodosMultipleMissing(t *testing.T) {
+ ledger := NewLedger()
+ ledger.Record(Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []TodoItem{
+ {Content: "Task A", Status: "in_progress"},
+ {Content: "Task B", Status: "in_progress"},
+ {Content: "Task C", Status: "in_progress"},
+ },
+ })
+
+ current := []TodoItem{
+ {Content: "Task A", Status: "completed"},
+ {Content: "Task B", Status: "completed"},
+ {Content: "Task C", Status: "completed"},
+ }
+ // No complete_step at all — all three are missing.
+ missing, hasBaseline := ledger.UnverifiedCompletedTodos(current)
+ if !hasBaseline {
+ t.Fatal("expected prior todo_write baseline")
+ }
+ if len(missing) != 3 {
+ t.Fatalf("expected 3 missing completions, got %+v", missing)
+ }
+ for _, m := range missing {
+ if m.Status != "completed" {
+ t.Fatalf("missing item %d has status %q, want completed", m.Index, m.Status)
+ }
+ }
+}
+
func TestLedgerNoBaselineDoesNotConstrainCompletedTodos(t *testing.T) {
ledger := NewLedger()
missing, hasBaseline := ledger.UnverifiedCompletedTodos([]TodoItem{
diff --git a/internal/tool/builtin/todo_test.go b/internal/tool/builtin/todo_test.go
index f475c4d0..b441543f 100644
--- a/internal/tool/builtin/todo_test.go
+++ b/internal/tool/builtin/todo_test.go
@@ -83,3 +83,68 @@ func TestTodoWriteIgnoresFailedCompleteStepReceipt(t *testing.T) {
t.Fatalf("failed complete_step should not authorize new completion, got %v", err)
}
}
+
+func TestTodoWriteRejectsMultipleNewCompletions(t *testing.T) {
+ ledger := evidence.NewLedger()
+ ledger.Record(evidence.Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []evidence.TodoItem{
+ {Content: "Task A", Status: "in_progress"},
+ {Content: "Task B", Status: "in_progress"},
+ },
+ })
+ ctx := evidence.WithLedger(context.Background(), ledger)
+ args := json.RawMessage(`{"todos":[
+ {"content":"Task A","status":"completed"},
+ {"content":"Task B","status":"completed"}
+ ]}`)
+
+ _, err := (todoWrite{}).Execute(ctx, args)
+ if err == nil {
+ t.Fatal("multiple new completions without complete_step should be rejected")
+ }
+ if !strings.Contains(err.Error(), "2 todos") {
+ t.Fatalf("error should mention count for multiple items, got %v", err)
+ }
+}
+
+func TestTodoWriteAcceptsCompleteStepByNumber(t *testing.T) {
+ ledger := evidence.NewLedger()
+ ledger.Record(evidence.Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []evidence.TodoItem{
+ {Content: "Add parser", Status: "in_progress"},
+ {Content: "Write tests", Status: "in_progress"},
+ },
+ })
+ ledger.Record(evidence.Receipt{ToolName: "complete_step", Success: true, Step: "1"})
+ ctx := evidence.WithLedger(context.Background(), ledger)
+ args := json.RawMessage(`{"todos":[
+ {"content":"Add parser","status":"completed"},
+ {"content":"Write tests","status":"in_progress"}
+ ]}`)
+
+ if _, err := (todoWrite{}).Execute(ctx, args); err != nil {
+ t.Fatalf("complete_step by number should authorize completion: %v", err)
+ }
+}
+
+func TestTodoWriteAcceptsCompleteStepByActiveForm(t *testing.T) {
+ ledger := evidence.NewLedger()
+ ledger.Record(evidence.Receipt{
+ ToolName: "todo_write",
+ Success: true,
+ Todos: []evidence.TodoItem{
+ {Content: "Add parser", Status: "in_progress", ActiveForm: "Adding parser"},
+ },
+ })
+ ledger.Record(evidence.Receipt{ToolName: "complete_step", Success: true, Step: "Adding parser"})
+ ctx := evidence.WithLedger(context.Background(), ledger)
+ args := json.RawMessage(`{"todos":[{"content":"Add parser","status":"completed","activeForm":"Adding parser"}]}`)
+
+ if _, err := (todoWrite{}).Execute(ctx, args); err != nil {
+ t.Fatalf("complete_step by activeForm should authorize completion: %v", err)
+ }
+}
Pass = the agent added ≥1 test, the affected packages are green, AND ≥1 new test fails when the PR's source is reverted. "By assertion" pins are strong (they check changed behavior); "by compile only" pins just need a PR-added symbol — and since Go compiles per package, one compile-coupled test marks every test in its package that way. Mutation is the behavioral signal for additive PRs: each changed function's return is replaced with zero values and the new tests are re-run; "caught" means a test asserts that output, "survived" means it doesn't. Read the generated tests above to judge the rest.
|
Related to #2572
Summary
Conflict check
Review evidence
Verification
Notes