Skip to content

Commit 700594d

Browse files
committed
fix(server): cap task_actions / messages fetches at 500 rows
Every SSE reconnect on a task was reading every row in `task_actions` and `messages` for that task, then slicing to 50 in JS. D1 billed for the full scan; with 100 concurrent SSE sessions, this was the dominant component of the ~7M rows read/day we see on prod.

Changes:
- `listMessages` / `getTaskActions`: add a `limit` param, defaulting to the new `MAX_TASK_PARTITION_ROWS` constant (500). Without `since`, they return the latest N rows in ASC order (DESC fetch + reverse). With `since`, they return up to N rows after the cursor, in ASC order.
- `sse.ts`: the initial fetch asks for 50 rows (no cursor) or 500 (with cursor); catch-up polls request 500. When either feed returns exactly the cap, an `event: gap` SSE frame is emitted so the client can fall back to HTTP instead of silently missing older rows.
- `taskRepo.getTask` now delegates to `getTaskActions` instead of duplicating the SQL.
- Known limitation documented for the `created_at` cursor collision.
1 parent a9b79ff commit 700594d

6 files changed

Lines changed: 423 additions & 37 deletions

File tree

apps/web/server/db.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@ export function newLongId(): string {
1414

1515
export type D1 = D1Database;
1616

17+
// Hard ceiling on rows returned from a single task partition (actions or
18+
// messages). Protects D1 read budget against tasks with runaway row counts.
19+
// Any fetch that returns exactly this many rows is at the cap — callers
20+
// must assume older/newer rows beyond this point were silently truncated.
21+
export const MAX_TASK_PARTITION_ROWS = 500;
22+
1723
export function parseJsonFields<T>(row: T, fields: (keyof T)[]): T {
1824
for (const f of fields) {
1925
if (typeof row[f] === "string") row[f] = JSON.parse(row[f] as string);

apps/web/server/messageRepo.ts

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import type { Message, SenderType } from "@agent-kanban/shared";
2-
import { type D1, newLongId } from "./db";
2+
import { type D1, MAX_TASK_PARTITION_ROWS, newLongId } from "./db";
33

44
export async function createMessage(db: D1, taskId: string, senderType: SenderType, senderId: string, content: string): Promise<Message> {
55
const id = newLongId();
@@ -12,10 +12,23 @@ export async function createMessage(db: D1, taskId: string, senderType: SenderTy
1212
return { id, task_id: taskId, sender_type: senderType, sender_id: senderId, content, created_at: now };
1313
}
1414

15-
export async function listMessages(db: D1, taskId: string, since?: string): Promise<Message[]> {
16-
const query = since
17-
? db.prepare("SELECT * FROM messages WHERE task_id = ? AND created_at > ? ORDER BY created_at ASC").bind(taskId, since)
18-
: db.prepare("SELECT * FROM messages WHERE task_id = ? ORDER BY created_at ASC").bind(taskId);
19-
const result = await query.all<Message>();
20-
return result.results;
15+
// When `since` is provided, returns up to `limit` rows after the cursor in
16+
// ASC order (incremental catch-up). Without `since`, returns the most recent
17+
// `limit` rows — fetched DESC then reversed so callers always see ASC order.
18+
// A hard LIMIT protects against task_id partitions with runaway row counts.
19+
//
20+
// KNOWN LIMITATION: `since` uses `created_at > ?`, which skips rows sharing
21+
// the cursor's millisecond. `newLongId()` is random (not monotonic) so the id
22+
// can't serve as a tiebreaker today. Tracked for follow-up — fix requires
23+
// either a monotonic sequence column or cursor-pair semantics.
24+
export async function listMessages(db: D1, taskId: string, since?: string, limit: number = MAX_TASK_PARTITION_ROWS): Promise<Message[]> {
25+
if (since) {
26+
const result = await db
27+
.prepare("SELECT * FROM messages WHERE task_id = ? AND created_at > ? ORDER BY created_at ASC LIMIT ?")
28+
.bind(taskId, since, limit)
29+
.all<Message>();
30+
return result.results;
31+
}
32+
const result = await db.prepare("SELECT * FROM messages WHERE task_id = ? ORDER BY created_at DESC LIMIT ?").bind(taskId, limit).all<Message>();
33+
return result.results.reverse();
2134
}

apps/web/server/sse.ts

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { MAX_TASK_PARTITION_ROWS } from "./db";
12
import { listMessages } from "./messageRepo";
23
import { getTaskActions } from "./taskRepo";
34
import type { Env } from "./types";
@@ -57,16 +58,39 @@ export async function createSSEResponse(env: Env, taskId: string, lastEventId: s
5758
return writer.write(encoder.encode(msg));
5859
};
5960

61+
// Signal the client that their catch-up window hit the hard row cap and
62+
// older rows were silently truncated. Client should drop its cursor and
63+
// reload the task via HTTP. Unknown SSE event types are ignored by older
64+
// clients, so this is backward-compatible.
65+
const writeGap = (reason: string) => {
66+
const msg = `event: gap\ndata: ${JSON.stringify({ reason })}\n\n`;
67+
return writer.write(encoder.encode(msg));
68+
};
69+
6070
const run = async () => {
61-
const [initialNotes, initialMessages] = await Promise.all([getTaskActions(db, taskId, since), listMessages(db, taskId, since)]);
71+
// Without `since`, fetch the 50 most recent — repo layer already returns
72+
// them in ASC order. With `since`, cap catch-up at the partition ceiling
73+
// so reconnects after long offline periods can't detonate D1 reads.
74+
const initialLimit = since ? MAX_TASK_PARTITION_ROWS : 50;
75+
const [initialNotes, initialMessages] = await Promise.all([
76+
getTaskActions(db, taskId, since, initialLimit),
77+
listMessages(db, taskId, since, initialLimit),
78+
]);
79+
80+
// When catching up and either feed returned exactly the cap, older rows
81+
// were truncated. Emit a gap signal before the rows we do have so the
82+
// client can decide to reload via HTTP instead of silently missing data.
83+
if (since && (initialNotes.length === initialLimit || initialMessages.length === initialLimit)) {
84+
await writeGap("initial_truncated");
85+
}
6286

63-
const noteEvents: SSEEvent[] = (since ? initialNotes : initialNotes.slice(-50)).map((l) => ({
87+
const noteEvents: SSEEvent[] = initialNotes.map((l) => ({
6488
id: l.id,
6589
type: "note" as const,
6690
data: JSON.stringify(l),
6791
created_at: l.created_at,
6892
}));
69-
const msgEvents: SSEEvent[] = (since ? initialMessages : initialMessages.slice(-50)).map((m) => ({
93+
const msgEvents: SSEEvent[] = initialMessages.map((m) => ({
7094
id: m.id,
7195
type: "message" as const,
7296
data: JSON.stringify(m),
@@ -85,7 +109,17 @@ export async function createSSEResponse(env: Env, taskId: string, lastEventId: s
85109
while (Date.now() < deadline) {
86110
await new Promise((r) => setTimeout(r, 2000));
87111

88-
const [newNotes, newMessages] = await Promise.all([getTaskActions(db, taskId, lastSeen), listMessages(db, taskId, lastSeen)]);
112+
const [newNotes, newMessages] = await Promise.all([
113+
getTaskActions(db, taskId, lastSeen, MAX_TASK_PARTITION_ROWS),
114+
listMessages(db, taskId, lastSeen, MAX_TASK_PARTITION_ROWS),
115+
]);
116+
117+
// Same ceiling signal during live polling — a 2s window with >500 new
118+
// rows means the client's cursor is behind reality and the tail is at
119+
// risk of silent truncation on the next tick. Tell the client to reload.
120+
if (newNotes.length === MAX_TASK_PARTITION_ROWS || newMessages.length === MAX_TASK_PARTITION_ROWS) {
121+
await writeGap("poll_truncated");
122+
}
89123

90124
const newNoteEvents = newNotes.map((l) => ({
91125
id: l.id,

apps/web/server/taskRepo.ts

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import type { BoardAction, CreateTaskInput, IdentityType, Task, TaskAction, Task
22
import { validateTransition } from "@agent-kanban/shared";
33
import { HTTPException } from "hono/http-exception";
44
import { getDefaultBoard } from "./boardRepo";
5-
import { type D1, newLongId, parseJsonFields } from "./db";
5+
import { type D1, MAX_TASK_PARTITION_ROWS, newLongId, parseJsonFields } from "./db";
66
import { computeBlocked, detectCycle, getDependencies, setDependencies } from "./taskDeps";
77

88
const parseTask = <T extends Task>(row: T) => parseJsonFields(row, ["labels", "input"]);
@@ -236,21 +236,12 @@ export async function getTask(db: D1, taskId: string, ownerId: string): Promise<
236236
if (!task) return null;
237237
parseTask(task);
238238

239-
const [actions, deps, blockedSet] = await Promise.all([
240-
db
241-
.prepare(
242-
"SELECT n.*, ag.name as actor_name, ag.public_key as actor_public_key FROM task_actions n LEFT JOIN agents ag ON n.actor_type LIKE 'agent:%' AND n.actor_id = ag.id WHERE n.task_id = ? ORDER BY n.created_at ASC",
243-
)
244-
.bind(taskId)
245-
.all<TaskAction>(),
246-
getDependencies(db, taskId),
247-
computeBlocked(db, [taskId]),
248-
]);
239+
const [actions, deps, blockedSet] = await Promise.all([getTaskActions(db, taskId), getDependencies(db, taskId), computeBlocked(db, [taskId])]);
249240

250-
const duration = computeDuration(actions.results);
241+
const duration = computeDuration(actions);
251242
task.blocked = blockedSet.has(taskId);
252243

253-
return { ...task, notes: actions.results, duration_minutes: duration, depends_on: deps, subtask_count: task.subtask_count };
244+
return { ...task, notes: actions, duration_minutes: duration, depends_on: deps, subtask_count: task.subtask_count };
254245
}
255246

256247
export async function updateTask(
@@ -551,23 +542,25 @@ export async function addTaskAction(
551542
};
552543
}
553544

554-
export async function getTaskActions(db: D1, taskId: string, since?: string): Promise<TaskAction[]> {
555-
let query =
545+
// When `since` is provided, returns up to `limit` rows after the cursor in
546+
// ASC order (incremental catch-up). Without `since`, returns the most recent
547+
// `limit` rows — fetched DESC then reversed so callers always see ASC order.
548+
// A hard LIMIT protects against tasks with runaway action counts.
549+
//
550+
// KNOWN LIMITATION: `since` uses `n.created_at > ?`, which skips rows sharing
551+
// the cursor's millisecond. `newLongId()` is random (not monotonic) so the id
552+
// can't serve as a tiebreaker today. Tracked for follow-up — fix requires
553+
// either a monotonic sequence column or cursor-pair semantics.
554+
export async function getTaskActions(db: D1, taskId: string, since?: string, limit: number = MAX_TASK_PARTITION_ROWS): Promise<TaskAction[]> {
555+
const base =
556556
"SELECT n.*, ag.name as actor_name, ag.public_key as actor_public_key FROM task_actions n LEFT JOIN agents ag ON n.actor_type LIKE 'agent:%' AND n.actor_id = ag.id WHERE n.task_id = ?";
557-
const binds: unknown[] = [taskId];
558557

559558
if (since) {
560-
query += " AND n.created_at > ?";
561-
binds.push(since);
559+
const result = await db.prepare(`${base} AND n.created_at > ? ORDER BY n.created_at ASC LIMIT ?`).bind(taskId, since, limit).all<TaskAction>();
560+
return result.results;
562561
}
563-
564-
query += " ORDER BY n.created_at ASC";
565-
566-
const result = await db
567-
.prepare(query)
568-
.bind(...binds)
569-
.all<TaskAction>();
570-
return result.results;
562+
const result = await db.prepare(`${base} ORDER BY n.created_at DESC LIMIT ?`).bind(taskId, limit).all<TaskAction>();
563+
return result.results.reverse();
571564
}
572565

573566
export async function getBoardActionsByBoardId(db: D1, boardId: string, since: string): Promise<BoardAction[]> {

0 commit comments

Comments
 (0)