Skip to content

Commit 68542eb

Browse files
fix: preserve active abort recovery on terminal stalls
1 parent 4624e34 commit 68542eb

3 files changed

Lines changed: 38 additions & 53 deletions

File tree

extensions/telegram/src/polling-session.ts

Lines changed: 14 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -223,12 +223,10 @@ export class TelegramPollingSession {
223223
#spooledUpdateHandlerTimeoutMs: number;
224224
#spooledUpdateHandlerAbortGraceMs: number;
225225
#deliveryDrainInFlight = false;
226-
// L5: callback supplied by the active isolated-ingress cycle to request
227-
// the cycle abort itself when the bot has visibly lost its grammy
228-
// initialized state mid-cycle (e.g., after a network drop during a
229-
// handler retry loop). When set, calling it triggers a clean exit so
230-
// the outer `runUntilAbort` loop creates a fresh bot and re-runs
231-
// `bot.init()`. Cleared at the end of the cycle.
226+
// Callback supplied by the active isolated-ingress cycle to request the cycle
227+
// abort itself when the bot has visibly lost its grammy initialized state
228+
// mid-cycle. When set, calling it triggers a clean exit so the outer
229+
// `runUntilAbort` loop creates a fresh bot and re-runs `bot.init()`.
232230
#requestCycleRestartOnBotReinitNeeded: ((reason: string) => void) | null = null;
233231

234232
constructor(private readonly opts: TelegramPollingSessionOpts) {
@@ -476,14 +474,10 @@ export class TelegramPollingSession {
476474
this.opts.log(
477475
`[telegram][diag] spooled update ${params.update.updateId} failed; keeping for retry: ${errMessage}`,
478476
);
479-
// L5: if the grammy bot has lost its initialized state mid-cycle (typically
480-
// after a network drop during a handler retry loop, where bot.init() never
481-
// gets re-run), every subsequent update handler will fail with the same
482-
// "Bot not initialized" message in a tight retry loop. Detect that case and
483-
// ask the active cycle to abort itself; the outer runUntilAbort loop will
484-
// create a fresh TelegramBot instance and re-run bot.init() against a now-
485-
// stable network. Without this, the spool worker keeps retrying forever
486-
// until something external (gateway restart, abort signal) intervenes.
477+
// If the grammy bot has lost its initialized state mid-cycle, every
478+
// subsequent update handler fails with the same message in a tight retry
479+
// loop. Ask the active cycle to abort itself so the outer runUntilAbort
480+
// loop can create a fresh TelegramBot instance and re-run bot.init().
487481
if (typeof errMessage === "string" && errMessage.includes("Bot not initialized")) {
488482
const requestRestart = this.#requestCycleRestartOnBotReinitNeeded;
489483
if (requestRestart) {
@@ -746,11 +740,10 @@ export class TelegramPollingSession {
746740
void worker.stop();
747741
};
748742
this.opts.abortSignal?.addEventListener("abort", stopOnAbort, { once: true });
749-
// L5: install a one-shot callback that the spool failure path can use to
750-
// ask this cycle to restart when it sees grammy's "Bot not initialized"
751-
// error. Setting restartRequested + stopping the worker tears the cycle
752-
// down via the existing try/finally cleanup below; the outer
753-
// runUntilAbort loop then creates a new bot and re-runs bot.init().
743+
// Install a one-shot callback that the spool failure path can use to ask
744+
// this cycle to restart when it sees grammy's "Bot not initialized" error.
745+
// Setting restartRequested and stopping the worker tears the cycle down via
746+
// the existing try/finally cleanup below.
754747
this.#requestCycleRestartOnBotReinitNeeded = (reason: string) => {
755748
if (restartRequested) {
756749
return;
@@ -857,8 +850,8 @@ export class TelegramPollingSession {
857850
clearInterval(drainTimer);
858851
unsubscribe();
859852
this.opts.abortSignal?.removeEventListener("abort", stopOnAbort);
860-
// L5: clear the restart-request callback so a future cycle (or other
861-
// session) isn't accidentally talking to this cycle's local state.
853+
// Clear the restart-request callback so a future cycle is not
854+
// accidentally talking to this cycle's local state.
862855
this.#requestCycleRestartOnBotReinitNeeded = null;
863856
await worker.stop();
864857
if (!restartRequested) {

src/agents/auth-profiles/store.ts

Lines changed: 13 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -590,16 +590,11 @@ export function loadAuthProfileStoreForSecretsRuntime(agentDir?: string): AuthPr
590590
return loadAuthProfileStoreForRuntime(agentDir, {
591591
readOnly: true,
592592
allowKeychainPrompt: false,
593-
// L4 PATCH (lane-pump branch): include legacy OAuth sidecar material when
594-
// the runtime is resolving secrets for an agent turn. Without this, embedded
595-
// agent runs (Telegram replies, cron invocations) cannot reach the access
596-
// token for openai-codex profiles whose `oauthRef.source` is
593+
// Include legacy OAuth sidecar material when the runtime is resolving
594+
// secrets for an agent turn. Without this, embedded agent runs cannot reach
595+
// the access token for openai-codex profiles whose `oauthRef.source` is
597596
// "openclaw-credentials", and resolveApiKeyForProfile() falls through to
598-
// "No API key found". The OAuth-manager-internal refresh helper added in
599-
// upstream #83312 already sets this to true; this default was inadvertently
600-
// left at `false` after the sidecar runtime removal in #82777, breaking
601-
// the embedded-agent OAuth resolution path while leaving the direct CLI
602-
// inference path unaffected. See UPSTREAM_ISSUE_DRAFT.md in local-patches.
597+
// "No API key found".
603598
resolveLegacyOAuthSidecars: true,
604599
});
605600
}
@@ -614,12 +609,9 @@ export function loadAuthProfileStoreWithoutExternalProfiles(
614609
const options: LoadAuthProfileStoreOptions = {
615610
readOnly: true,
616611
allowKeychainPrompt: loadOptions?.allowKeychainPrompt ?? false,
617-
// L4.1 PATCH: default sidecar resolution to true so that any caller
618-
// not explicitly overriding (model-auth-label, model-provider-auth,
619-
// pi-auth-discovery, list.list-command, etc.) still picks up legacy
620-
// OAuth credential material. Was inadvertently left at `false` in the
621-
// upstream #82777/#83312 refactor and breaks isolated/sub-agent auth
622-
// resolution paths (e.g., cron-nested lanes).
612+
// Default sidecar resolution to true so callers that do not explicitly
613+
// override still pick up legacy OAuth credential material for isolated and
614+
// sub-agent auth resolution paths.
623615
resolveLegacyOAuthSidecars: loadOptions?.resolveLegacyOAuthSidecars ?? true,
624616
};
625617
const store = loadAuthProfileStoreForAgent(agentDir, options);
@@ -657,10 +649,10 @@ export function ensureAuthProfileStoreWithoutExternalProfiles(
657649
agentDir?: string,
658650
options?: { allowKeychainPrompt?: boolean; resolveLegacyOAuthSidecars?: boolean },
659651
): AuthProfileStore {
660-
// L4.1 PATCH: forward `resolveLegacyOAuthSidecars` through this entry
661-
// point so embedded-runner sub-agents (cron-nested, isolated session
662-
// lanes for AgentOS sweeps) can read the legacy sidecar credential
663-
// material. Default true to match `loadAuthProfileStoreWithoutExternalProfiles`.
652+
// Forward `resolveLegacyOAuthSidecars` through this entry point so embedded
653+
// runner sub-agents and isolated session lanes can read legacy sidecar
654+
// credential material. Default true to match
655+
// `loadAuthProfileStoreWithoutExternalProfiles`.
664656
const resolveLegacyOAuthSidecars = options?.resolveLegacyOAuthSidecars ?? true;
665657
const effectiveOptions: LoadAuthProfileStoreOptions = {
666658
...(options ?? {}),
@@ -677,9 +669,8 @@ export function ensureAuthProfileStoreWithoutExternalProfiles(
677669
return store;
678670
}
679671

680-
// L4.1 PATCH: use effectiveOptions (with sidecar resolution) for the main
681-
// fallback load too, otherwise sub-agents that need to merge in the main
682-
// store would still miss the legacy credential material.
672+
// Use the same options for the main fallback load; sub-agents that merge in
673+
// the main store need the same legacy sidecar material.
683674
const mainStore = loadAuthProfileStoreForAgent(undefined, effectiveOptions);
684675
return mergeAuthProfileStores(mainStore, store);
685676
}

src/logging/diagnostic.ts

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -3,14 +3,14 @@ import { resolveEmbeddedSessionLane } from "../agents/pi-embedded-runner/lanes.j
33
import { getRuntimeConfig } from "../config/config.js";
44
import { resolveAllAgentSessionStoreTargetsSync } from "../config/sessions/targets.js";
55
import type { OpenClawConfig } from "../config/types.openclaw.js";
6-
import { resetCommandLane } from "../process/command-queue.js";
76
import {
87
areDiagnosticsEnabledForProcess,
98
emitDiagnosticEvent,
109
isDiagnosticsEnabled,
1110
type DiagnosticPhaseSnapshot,
1211
type DiagnosticLivenessWarningReason,
1312
} from "../infra/diagnostic-events.js";
13+
import { resetCommandLane } from "../process/command-queue.js";
1414
import { emitDiagnosticMemorySample, resetDiagnosticMemoryForTest } from "./diagnostic-memory.js";
1515
import {
1616
getCurrentDiagnosticPhase,
@@ -1134,6 +1134,14 @@ export function startDiagnosticHeartbeat(
11341134
thresholdMs: stuckSessionWarnMs,
11351135
abortThresholdMs: stuckSessionAbortMs,
11361136
});
1137+
const activeAbortRecoveryEligible =
1138+
classification !== undefined &&
1139+
isActiveAbortRecoveryEligible({
1140+
classification,
1141+
activity,
1142+
ageMs: attentionAgeMs,
1143+
stuckSessionAbortMs,
1144+
});
11371145
if (classification?.recoveryEligible) {
11381146
requestStuckSessionRecovery({
11391147
recover: opts?.recoverStuckSession ?? recoverStuckSession,
@@ -1143,19 +1151,12 @@ export function startDiagnosticHeartbeat(
11431151
sessionKey: state.sessionKey,
11441152
ageMs: attentionAgeMs,
11451153
queueDepth: state.queueDepth,
1154+
...(activeAbortRecoveryEligible ? { allowActiveAbort: true } : {}),
11461155
expectedState: state.state,
11471156
stateGeneration: state.generation,
11481157
},
11491158
});
1150-
} else if (
1151-
classification &&
1152-
isActiveAbortRecoveryEligible({
1153-
classification,
1154-
activity,
1155-
ageMs: attentionAgeMs,
1156-
stuckSessionAbortMs,
1157-
})
1158-
) {
1159+
} else if (classification && activeAbortRecoveryEligible) {
11591160
requestStuckSessionRecovery({
11601161
recover: opts?.recoverStuckSession ?? recoverStuckSession,
11611162
classification,

0 commit comments

Comments
 (0)