Skip to content

Commit 42e1d48

Browse files
extrasmall0 authored and steipete committed
fix(auth): use shorter backoff for auth_permanent failures
auth_permanent errors (e.g. API_KEY_INVALID) can be caused by transient provider outages rather than genuinely revoked credentials. Previously these used the same 5h-24h billing backoff, which left providers disabled long after the upstream issue resolved. Introduce separate authPermanentBackoffMinutes (default: 10) and authPermanentMaxMinutes (default: 60) config options so auth_permanent failures recover in minutes rather than hours. Fixes #56838
1 parent 022a24e commit 42e1d48

5 files changed

Lines changed: 47 additions & 4 deletions

File tree

src/agents/auth-profiles/usage.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
653653
label: "disabledUntil(auth_permanent)",
654654
reason: "auth_permanent" as const,
655655
buildUsageStats: (now: number): WindowStats => ({
656-
disabledUntil: now + 20 * 60 * 60 * 1000,
656+
disabledUntil: now + 50 * 60 * 1000,
657657
disabledReason: "auth_permanent",
658658
errorCount: 5,
659659
failureCounts: { auth_permanent: 5 },
@@ -724,8 +724,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
724724
lastFailureAt: now - 60_000,
725725
}),
726726
// errorCount resets, auth_permanent count resets to 1 →
727-
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
728-
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
727+
// calculateAuthProfileBillingDisableMsWithConfig(1, 10m, 60m) = 10m
728+
expectedUntil: (now: number) => now + 10 * 60 * 1000,
729729
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
730730
},
731731
];

src/agents/auth-profiles/usage.ts

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,8 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
516516
type ResolvedAuthCooldownConfig = {
517517
billingBackoffMs: number;
518518
billingMaxMs: number;
519+
authPermanentBackoffMs: number;
520+
authPermanentMaxMs: number;
519521
failureWindowMs: number;
520522
};
521523

@@ -556,9 +558,17 @@ function resolveAuthCooldownConfig(params: {
556558
defaults.failureWindowHours,
557559
);
558560

561+
const resolveMinutes = (value: unknown, fallback: number) =>
562+
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
563+
564+
const authPermanentBackoffMinutes = resolveMinutes(cooldowns?.authPermanentBackoffMinutes, 10);
565+
const authPermanentMaxMinutes = resolveMinutes(cooldowns?.authPermanentMaxMinutes, 60);
566+
559567
return {
560568
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
561569
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
570+
authPermanentBackoffMs: authPermanentBackoffMinutes * 60 * 1000,
571+
authPermanentMaxMs: authPermanentMaxMinutes * 60 * 1000,
562572
failureWindowMs: failureWindowHours * 60 * 60 * 1000,
563573
};
564574
}
@@ -662,7 +672,7 @@ function computeNextProfileUsageStats(params: {
662672
lastFailureAt: params.now,
663673
};
664674

665-
if (params.reason === "billing" || params.reason === "auth_permanent") {
675+
if (params.reason === "billing") {
666676
const billingCount = failureCounts[params.reason] ?? 1;
667677
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
668678
errorCount: billingCount,
@@ -677,6 +687,23 @@ function computeNextProfileUsageStats(params: {
677687
recomputedUntil: params.now + backoffMs,
678688
});
679689
updatedStats.disabledReason = params.reason;
690+
} else if (params.reason === "auth_permanent") {
691+
// auth_permanent errors can be caused by transient provider outages (e.g.
692+
// GCP returning API_KEY_INVALID during an incident). Use a much shorter
693+
// backoff than billing so the provider recovers automatically once the
694+
// upstream issue resolves.
695+
const authPermCount = failureCounts[params.reason] ?? 1;
696+
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
697+
errorCount: authPermCount,
698+
baseMs: params.cfgResolved.authPermanentBackoffMs,
699+
maxMs: params.cfgResolved.authPermanentMaxMs,
700+
});
701+
updatedStats.disabledUntil = keepActiveWindowOrRecompute({
702+
existingUntil: params.existing.disabledUntil,
703+
now: params.now,
704+
recomputedUntil: params.now + backoffMs,
705+
});
706+
updatedStats.disabledReason = params.reason;
680707
} else {
681708
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
682709
// Keep active cooldown windows immutable so retries within the window

src/config/schema.help.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -822,6 +822,10 @@ export const FIELD_HELP: Record<string, string> = {
822822
"auth.cooldowns.billingBackoffHoursByProvider":
823823
"Optional per-provider overrides for billing backoff (hours).",
824824
"auth.cooldowns.billingMaxHours": "Cap (hours) for billing backoff (default: 24).",
825+
"auth.cooldowns.authPermanentBackoffMinutes":
826+
"Base backoff (minutes) for auth_permanent failures (default: 10). Shorter than billing because these errors are often caused by transient provider outages.",
827+
"auth.cooldowns.authPermanentMaxMinutes":
828+
"Cap (minutes) for auth_permanent backoff (default: 60).",
825829
"auth.cooldowns.failureWindowHours": "Failure window (hours) for backoff counters (default: 24).",
826830
"auth.cooldowns.overloadedProfileRotations":
827831
"Maximum same-provider auth-profile rotations allowed for overloaded errors before switching to model fallback (default: 1).",

src/config/schema.labels.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,8 @@ export const FIELD_LABELS: Record<string, string> = {
485485
"auth.cooldowns.billingBackoffHours": "Billing Backoff (hours)",
486486
"auth.cooldowns.billingBackoffHoursByProvider": "Billing Backoff Overrides",
487487
"auth.cooldowns.billingMaxHours": "Billing Backoff Cap (hours)",
488+
"auth.cooldowns.authPermanentBackoffMinutes": "Auth-Permanent Backoff (minutes)",
489+
"auth.cooldowns.authPermanentMaxMinutes": "Auth-Permanent Backoff Cap (minutes)",
488490
"auth.cooldowns.failureWindowHours": "Failover Window (hours)",
489491
"auth.cooldowns.overloadedProfileRotations": "Overloaded Profile Rotations",
490492
"auth.cooldowns.overloadedBackoffMs": "Overloaded Backoff (ms)",

src/config/types.auth.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,16 @@ export type AuthConfig = {
2121
billingBackoffHoursByProvider?: Record<string, number>;
2222
/** Billing backoff cap (hours). Default: 24. */
2323
billingMaxHours?: number;
24+
/**
25+
* Base backoff for permanent-auth failures (minutes). These errors (e.g.
26+
* API_KEY_INVALID) can be caused by transient provider outages, so the
27+
* default is much shorter than billing backoff. Default: 10.
28+
*/
29+
authPermanentBackoffMinutes?: number;
30+
/**
31+
* Cap for permanent-auth backoff (minutes). Default: 60.
32+
*/
33+
authPermanentMaxMinutes?: number;
2434
/**
2535
* Failure window for backoff counters (hours). If no failures occur within
2636
* this window, counters reset. Default: 24.

0 commit comments

Comments
 (0)