Skip to content

Commit e72d8b2

Browse files
committed
perf(daemon): idle exponential backoff in poll loop
Daemon was polling every 10s forever, burning ~17K "is there work?" API calls per daemon per day even when idle. Five idle daemons alone accounted for the observed 100K Worker requests / day we see on prod, about 80 percent of which returned nothing. Change: track whether each tick did real work - a dispatched task, a reaped session, a resumed rate-limit backoff - and use that to drive an exponential idle backoff capped at 120s. Any real event resets to opts.pollInterval. Implementation in packages/cli/src/daemon/loop.ts: - Renamed backoffMs to errorBackoffMs for clarity. Only used by handleTickError. Unchanged semantics. - New idleBackoffMs starts at opts.pollInterval. resetIdleBackoff() snaps back to base; bumpIdleBackoff() multiplies by 1.5 capped at MAX_IDLE_BACKOFF_MS (120s). - tick() snapshots pool.activeCount before reap/kill/resume phases. After dispatch, if dispatched OR activeCount changed -> reset, else bump. Pool saturation (>= maxConcurrent) leaves backoff untouched so a full daemon doesn't accelerate against a 409. - nextPollDelay() uses idleBackoffMs as baseline, still clamped against the nearest rate-limit resume deadline. - onSlotFreed() and resumeRateLimitedSessions() reset backoff before scheduling, so any real signal snaps the daemon back. Backoff curve: 10 -> 15 -> 22.5 -> 33.75 -> ~51 -> ~76 -> ~114 -> 120 seconds. Reaches the ceiling after ~7 consecutive idle ticks (~5 minutes of cumulative wait, since the delays sum to ~322s). Expected: idle daemon request count drops from ~17K to ~3-5K per day, about 4-6x reduction. At 5 daemons: 100K/day -> 15-25K/day. 16 new tests added in tests/loop-idle-backoff.test.ts covering the backoff state machine, reset triggers, pool saturation path, and nextPollDelay clamping. All 2044 tests pass.
1 parent ab3ea32 commit e72d8b2

2 files changed

Lines changed: 445 additions & 9 deletions

File tree

packages/cli/src/daemon/loop.ts

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ const logger = createLogger("loop");
2929

3030
const RATE_LIMIT_RESUME_PROMPT = "Rate limit window has reset. Continue working on the task where you left off.";
3131

32+
// Idle exponential backoff: each tick that does no work slows the next
33+
// tick down by this multiplier up to MAX_IDLE_BACKOFF_MS. Any real event
34+
// — a dispatched task, a reap, a slot freed, a rate-limit resume —
35+
// snaps the daemon back to the base pollInterval.
36+
const IDLE_BACKOFF_MULTIPLIER = 1.5;
37+
const MAX_IDLE_BACKOFF_MS = 120_000;
38+
3239
export interface LoopOpts {
3340
maxConcurrent: number;
3441
pollInterval: number;
@@ -39,7 +46,8 @@ export interface LoopOpts {
3946
export class DaemonLoop {
4047
private running = false;
4148
private pollTimer: ReturnType<typeof setTimeout> | null = null;
42-
private backoffMs: number;
49+
private errorBackoffMs: number;
50+
private idleBackoffMs: number;
4351
private sessions = getSessionManager();
4452

4553
constructor(
@@ -49,7 +57,8 @@ export class DaemonLoop {
4957
private prMonitor: PrMonitor,
5058
private opts: LoopOpts,
5159
) {
52-
this.backoffMs = opts.pollInterval;
60+
this.errorBackoffMs = opts.pollInterval;
61+
this.idleBackoffMs = opts.pollInterval;
5362
}
5463

5564
start(): void {
@@ -63,6 +72,7 @@ export class DaemonLoop {
6372
}
6473

6574
onSlotFreed(): void {
75+
this.resetIdleBackoff();
6676
this.schedulePoll(this.opts.pollInterval);
6777
}
6878

@@ -80,9 +90,18 @@ export class DaemonLoop {
8090
if (s.resumeAfter && s.resumeAfter > now) continue;
8191
await resumeOneSession(s, RATE_LIMIT_RESUME_PROMPT, this.client, this.pool);
8292
}
93+
this.resetIdleBackoff();
8394
this.schedulePoll(0);
8495
}
8596

97+
private resetIdleBackoff(): void {
98+
this.idleBackoffMs = this.opts.pollInterval;
99+
}
100+
101+
private bumpIdleBackoff(): void {
102+
this.idleBackoffMs = Math.min(this.idleBackoffMs * IDLE_BACKOFF_MULTIPLIER, MAX_IDLE_BACKOFF_MS);
103+
}
104+
86105
/**
87106
* Resume rate_limited sessions whose backoff timer has expired.
88107
* Covers transient crash recovery (not driven by RateLimiter timer).
@@ -97,14 +116,15 @@ export class DaemonLoop {
97116
}
98117
}
99118

100-
/** Pick the shorter of pollInterval and the nearest backoff expiry. */
119+
/** Pick the shorter of idle backoff and the nearest rate-limit resume. */
101120
private nextPollDelay(): number {
102121
const now = Date.now();
103122
let earliest = Infinity;
104123
for (const s of this.sessions.list({ type: "worker", status: "rate_limited" })) {
105124
if (s.resumeAfter && s.resumeAfter > now) earliest = Math.min(earliest, s.resumeAfter - now);
106125
}
107-
return Math.min(this.opts.pollInterval, earliest === Infinity ? this.opts.pollInterval : Math.max(earliest, 1000));
126+
const rateLimitDelay = earliest === Infinity ? Infinity : Math.max(earliest, 1000);
127+
return Math.min(this.idleBackoffMs, rateLimitDelay);
108128
}
109129

110130
private schedulePoll(delayMs: number): void {
@@ -116,6 +136,11 @@ export class DaemonLoop {
116136
private async tick(): Promise<void> {
117137
if (!this.running) return;
118138

139+
// Pool size can shrink during the reap phases if any orphan / cancelled /
140+
// cleanup-pending session reaches a terminal state. Treating that as work
141+
// keeps the daemon responsive right after a cleanup.
142+
const activeBefore = this.pool.activeCount;
143+
119144
await this.killCancelledTasks();
120145
await reapOrphanWorkerSessions(this.sessions, this.pool, this.client);
121146
await reapCleanupPending(this.sessions);
@@ -129,17 +154,28 @@ export class DaemonLoop {
129154

130155
await this.resumeBackoffSessions();
131156

157+
const reapedOrResumed = this.pool.activeCount !== activeBefore;
158+
132159
if (this.pool.activeCount >= this.opts.maxConcurrent) {
160+
// Pool saturated — wait for onSlotFreed. Don't touch idle backoff; this
161+
// is "no capacity," not "nothing to do," so accelerating would just
162+
// burn requests on a server that keeps saying 409.
133163
this.schedulePoll(this.nextPollDelay());
134164
return;
135165
}
136166

137-
await dispatchTasks(this.client, this.pool, this.rateLimiter, this.prMonitor, {
167+
const dispatched = await dispatchTasks(this.client, this.pool, this.rateLimiter, this.prMonitor, {
138168
maxConcurrent: this.opts.maxConcurrent,
139169
pollInterval: this.opts.pollInterval,
140170
});
141171

142-
this.backoffMs = this.opts.pollInterval;
172+
if (dispatched || reapedOrResumed) {
173+
this.resetIdleBackoff();
174+
} else {
175+
this.bumpIdleBackoff();
176+
}
177+
178+
this.errorBackoffMs = this.opts.pollInterval;
143179
this.schedulePoll(this.nextPollDelay());
144180
}
145181

@@ -153,12 +189,12 @@ export class DaemonLoop {
153189
private handleTickError(err: any): void {
154190
if (err instanceof ApiError && err.status === 429) {
155191
logger.warn("Rate limited, backing off");
156-
this.backoffMs = Math.min(Math.max(this.backoffMs * 2, 30000), 60000);
192+
this.errorBackoffMs = Math.min(Math.max(this.errorBackoffMs * 2, 30000), 60000);
157193
} else {
158194
logger.warn(`Poll error: ${err.message}${err.cause ? ` — cause: ${err.cause.message ?? err.cause}` : ""}`);
159-
this.backoffMs = Math.min(this.backoffMs * 2, 60000);
195+
this.errorBackoffMs = Math.min(this.errorBackoffMs * 2, 60000);
160196
}
161-
this.schedulePoll(this.backoffMs);
197+
this.schedulePoll(this.errorBackoffMs);
162198
}
163199
}
164200

0 commit comments

Comments
 (0)