Conversation
| func (p *DeviceFlowProvider) runAutoRefresh( | ||
| autoCtx context.Context, | ||
| exec executors.ScheduledExecutor, | ||
| refreshBefore time.Duration, | ||
| fallbackInterval time.Duration, | ||
| runImmediately bool, | ||
| ) { | ||
| if autoCtx.Err() != nil { | ||
| return | ||
| } | ||
|
|
||
| delay := time.Duration(0) | ||
| if !runImmediately { | ||
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | ||
| } | ||
|
|
||
| p.autoMu.Lock() | ||
|
|
||
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | ||
| p.autoMu.Unlock() | ||
| return | ||
| } | ||
|
|
||
| prevCancel := p.autoTaskCancel | ||
| p.autoTaskCancel = nil | ||
| p.autoMu.Unlock() | ||
|
|
||
| if prevCancel != nil { | ||
| prevCancel() | ||
| } | ||
|
|
||
| cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) { | ||
| defer func() { | ||
| if r := recover(); r != nil { | ||
| slog.ErrorContext(autoCtx, "auto refresh device flow provider panicked", slog.Any("cause", r)) | ||
| } | ||
| }() | ||
|
|
||
| if autoCtx.Err() != nil { | ||
| for { | ||
| if !sleepForAutoRefresh(autoCtx, delay) { | ||
| return | ||
| } | ||
|
|
||
| // Ensure credentials are fresh | ||
| refreshFailed := false | ||
|
|
||
| if _, err := p.GetToken(autoCtx); err != nil { | ||
| slog.WarnContext(autoCtx, "failed to auto refresh device flow token", slog.Any("error", err)) | ||
|
|
||
| refreshFailed = true | ||
| } | ||
|
|
||
| if autoCtx.Err() != nil { | ||
| refreshFailed, ok := p.runAutoRefreshOnce(autoCtx) | ||
| if !ok { | ||
| return | ||
| } | ||
|
|
||
| if refreshFailed { | ||
| p.scheduleNextAutoRefreshWithDelay(autoCtx, exec, refreshBefore, fallbackInterval, fallbackInterval) | ||
| delay = fallbackInterval | ||
| } else { | ||
| p.scheduleNextAutoRefresh(autoCtx, exec, refreshBefore, fallbackInterval, false) | ||
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | ||
| } | ||
| }, delay) | ||
| if err != nil { | ||
| p.StopAutoRefresh() | ||
| return | ||
| } | ||
|
|
||
| p.autoMu.Lock() | ||
|
|
||
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | ||
| p.autoMu.Unlock() | ||
| cancelFunc() | ||
|
|
||
| return | ||
| } | ||
|
|
||
| p.autoTaskCancel = cancelFunc | ||
| p.autoMu.Unlock() | ||
| } |
There was a problem hiding this comment.
🟡 Missing top-level recover guard in goroutine DeviceFlowProvider.runAutoRefresh
runAutoRefresh is launched as a goroutine at llm/oauth/device_flow_provider.go:438 but does not install a top-level defer recover() guard. The recover guard exists only inside runAutoRefreshOnce (llm/oauth/device_flow_provider.go:480), so any panic in the surrounding loop code (sleepForAutoRefresh, nextAutoRefreshDelay) would crash the goroutine without logging. This violates .agent/rules/go-general.md rule 8: "Any manually started goroutine must install a top-level defer recover() guard that logs the panic before the goroutine exits."
| func (p *DeviceFlowProvider) runAutoRefresh( | |
| autoCtx context.Context, | |
| exec executors.ScheduledExecutor, | |
| refreshBefore time.Duration, | |
| fallbackInterval time.Duration, | |
| runImmediately bool, | |
| ) { | |
| if autoCtx.Err() != nil { | |
| return | |
| } | |
| delay := time.Duration(0) | |
| if !runImmediately { | |
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | |
| } | |
| p.autoMu.Lock() | |
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | |
| p.autoMu.Unlock() | |
| return | |
| } | |
| prevCancel := p.autoTaskCancel | |
| p.autoTaskCancel = nil | |
| p.autoMu.Unlock() | |
| if prevCancel != nil { | |
| prevCancel() | |
| } | |
| cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) { | |
| defer func() { | |
| if r := recover(); r != nil { | |
| slog.ErrorContext(autoCtx, "auto refresh device flow provider panicked", slog.Any("cause", r)) | |
| } | |
| }() | |
| if autoCtx.Err() != nil { | |
| for { | |
| if !sleepForAutoRefresh(autoCtx, delay) { | |
| return | |
| } | |
| // Ensure credentials are fresh | |
| refreshFailed := false | |
| if _, err := p.GetToken(autoCtx); err != nil { | |
| slog.WarnContext(autoCtx, "failed to auto refresh device flow token", slog.Any("error", err)) | |
| refreshFailed = true | |
| } | |
| if autoCtx.Err() != nil { | |
| refreshFailed, ok := p.runAutoRefreshOnce(autoCtx) | |
| if !ok { | |
| return | |
| } | |
| if refreshFailed { | |
| p.scheduleNextAutoRefreshWithDelay(autoCtx, exec, refreshBefore, fallbackInterval, fallbackInterval) | |
| delay = fallbackInterval | |
| } else { | |
| p.scheduleNextAutoRefresh(autoCtx, exec, refreshBefore, fallbackInterval, false) | |
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | |
| } | |
| }, delay) | |
| if err != nil { | |
| p.StopAutoRefresh() | |
| return | |
| } | |
| p.autoMu.Lock() | |
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | |
| p.autoMu.Unlock() | |
| cancelFunc() | |
| return | |
| } | |
| p.autoTaskCancel = cancelFunc | |
| p.autoMu.Unlock() | |
| } | |
| func (p *DeviceFlowProvider) runAutoRefresh( | |
| autoCtx context.Context, | |
| refreshBefore time.Duration, | |
| fallbackInterval time.Duration, | |
| ) { | |
| defer func() { | |
| if r := recover(); r != nil { | |
| slog.ErrorContext(autoCtx, "auto refresh device flow provider panicked", slog.Any("cause", r)) | |
| } | |
| }() | |
| delay := time.Duration(0) | |
| for { | |
| if !sleepForAutoRefresh(autoCtx, delay) { | |
| return | |
| } | |
| refreshFailed, ok := p.runAutoRefreshOnce(autoCtx) | |
| if !ok { | |
| return | |
| } | |
| if refreshFailed { | |
| delay = fallbackInterval | |
| } else { | |
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | |
| } | |
| } | |
| } |
Was this helpful? React with 👍 or 👎 to provide feedback.
| func (p *TokenProvider) runAutoRefresh( | ||
| autoCtx context.Context, | ||
| exec executors.ScheduledExecutor, | ||
| refreshBefore time.Duration, | ||
| fallbackInterval time.Duration, | ||
| runImmediately bool, | ||
| ) { | ||
| if autoCtx.Err() != nil { | ||
| return | ||
| } | ||
|
|
||
| delay := time.Duration(0) | ||
| if !runImmediately { | ||
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | ||
| } | ||
|
|
||
| p.autoMu.Lock() | ||
|
|
||
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | ||
| p.autoMu.Unlock() | ||
| return | ||
| } | ||
|
|
||
| prevCancel := p.autoTaskCancel | ||
| p.autoTaskCancel = nil | ||
| p.autoMu.Unlock() | ||
|
|
||
| if prevCancel != nil { | ||
| prevCancel() | ||
| } | ||
|
|
||
| cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) { | ||
| defer func() { | ||
| if r := recover(); r != nil { | ||
| slog.ErrorContext(autoCtx, "auto refresh token provider panicked", slog.Any("cause", r)) | ||
| } | ||
| }() | ||
|
|
||
| if autoCtx.Err() != nil { | ||
| for { | ||
| if !sleepForAutoRefresh(autoCtx, delay) { | ||
| return | ||
| } | ||
|
|
||
| refreshFailed := false | ||
|
|
||
| if _, err := p.EnsureFresh(autoCtx, refreshBefore); err != nil { | ||
| slog.WarnContext(autoCtx, "failed to auto refresh token", slog.Any("error", err)) | ||
|
|
||
| refreshFailed = true | ||
| } | ||
|
|
||
| if autoCtx.Err() != nil { | ||
| refreshFailed, ok := p.runAutoRefreshOnce(autoCtx, refreshBefore) | ||
| if !ok { | ||
| return | ||
| } | ||
|
|
||
| if refreshFailed { | ||
| // Use fallback interval to avoid tight retry loops on persistent errors | ||
| // (e.g. refresh_token_reused, invalid_grant). | ||
| p.scheduleNextAutoRefreshWithDelay(autoCtx, exec, refreshBefore, fallbackInterval, fallbackInterval) | ||
| delay = fallbackInterval | ||
| } else { | ||
| p.scheduleNextAutoRefresh(autoCtx, exec, refreshBefore, fallbackInterval, false) | ||
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | ||
| } | ||
| }, delay) | ||
| if err != nil { | ||
| p.StopAutoRefresh() | ||
| return | ||
| } | ||
|
|
||
| p.autoMu.Lock() | ||
|
|
||
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | ||
| p.autoMu.Unlock() | ||
| cancelFunc() | ||
|
|
||
| return | ||
| } | ||
|
|
||
| p.autoTaskCancel = cancelFunc | ||
| p.autoMu.Unlock() | ||
| } |
There was a problem hiding this comment.
🟡 Missing top-level recover guard in goroutine TokenProvider.runAutoRefresh
runAutoRefresh is launched as a goroutine at llm/oauth/token_provider.go:306 but does not install a top-level defer recover() guard. The recover guard exists only inside runAutoRefreshOnce (llm/oauth/token_provider.go:347), so any panic in the surrounding loop code (sleepForAutoRefresh, nextAutoRefreshDelay) would crash the goroutine without logging. This violates .agent/rules/go-general.md rule 8: "Any manually started goroutine must install a top-level defer recover() guard that logs the panic before the goroutine exits."
| func (p *TokenProvider) runAutoRefresh( | |
| autoCtx context.Context, | |
| exec executors.ScheduledExecutor, | |
| refreshBefore time.Duration, | |
| fallbackInterval time.Duration, | |
| runImmediately bool, | |
| ) { | |
| if autoCtx.Err() != nil { | |
| return | |
| } | |
| delay := time.Duration(0) | |
| if !runImmediately { | |
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | |
| } | |
| p.autoMu.Lock() | |
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | |
| p.autoMu.Unlock() | |
| return | |
| } | |
| prevCancel := p.autoTaskCancel | |
| p.autoTaskCancel = nil | |
| p.autoMu.Unlock() | |
| if prevCancel != nil { | |
| prevCancel() | |
| } | |
| cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) { | |
| defer func() { | |
| if r := recover(); r != nil { | |
| slog.ErrorContext(autoCtx, "auto refresh token provider panicked", slog.Any("cause", r)) | |
| } | |
| }() | |
| if autoCtx.Err() != nil { | |
| for { | |
| if !sleepForAutoRefresh(autoCtx, delay) { | |
| return | |
| } | |
| refreshFailed := false | |
| if _, err := p.EnsureFresh(autoCtx, refreshBefore); err != nil { | |
| slog.WarnContext(autoCtx, "failed to auto refresh token", slog.Any("error", err)) | |
| refreshFailed = true | |
| } | |
| if autoCtx.Err() != nil { | |
| refreshFailed, ok := p.runAutoRefreshOnce(autoCtx, refreshBefore) | |
| if !ok { | |
| return | |
| } | |
| if refreshFailed { | |
| // Use fallback interval to avoid tight retry loops on persistent errors | |
| // (e.g. refresh_token_reused, invalid_grant). | |
| p.scheduleNextAutoRefreshWithDelay(autoCtx, exec, refreshBefore, fallbackInterval, fallbackInterval) | |
| delay = fallbackInterval | |
| } else { | |
| p.scheduleNextAutoRefresh(autoCtx, exec, refreshBefore, fallbackInterval, false) | |
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | |
| } | |
| }, delay) | |
| if err != nil { | |
| p.StopAutoRefresh() | |
| return | |
| } | |
| p.autoMu.Lock() | |
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | |
| p.autoMu.Unlock() | |
| cancelFunc() | |
| return | |
| } | |
| p.autoTaskCancel = cancelFunc | |
| p.autoMu.Unlock() | |
| } | |
| func (p *TokenProvider) runAutoRefresh( | |
| autoCtx context.Context, | |
| refreshBefore time.Duration, | |
| fallbackInterval time.Duration, | |
| ) { | |
| defer func() { | |
| if r := recover(); r != nil { | |
| slog.ErrorContext(autoCtx, "auto refresh token provider panicked", slog.Any("cause", r)) | |
| } | |
| }() | |
| delay := time.Duration(0) | |
| for { | |
| if !sleepForAutoRefresh(autoCtx, delay) { | |
| return | |
| } | |
| refreshFailed, ok := p.runAutoRefreshOnce(autoCtx, refreshBefore) | |
| if !ok { | |
| return | |
| } | |
| if refreshFailed { | |
| delay = fallbackInterval | |
| } else { | |
| delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval) | |
| } | |
| } | |
| } |
Was this helpful? React with 👍 or 👎 to provide feedback.
Greptile SummaryThis PR fixes a stall in cache refresh caused by
Confidence Score: 5/5Safe to merge. The core fix — moving old-channel cleanup to a background goroutine — is straightforward and well-tested. The executor-to-goroutine refactor in both OAuth providers simplifies the scheduling logic without changing observable behavior. All changed code paths are logically sound. The goroutine cleanup in channel.go correctly copies the slice before launching so it cannot race with the next swap. The new runAutoRefresh loop replicates the old executor's sequencing (one refresh at a time, delay computed from token expiry). The one issue found — panic recovery scoped to the whole cleanup function rather than each channel iteration — only matters if stopTokenProvider or stopChannelOutbounds ever actually panic, which is unlikely in practice. internal/server/biz/channel.go — the per-function panic recovery in cleanupSwappedChannels could leave later channels uncleaned on an unexpected panic. Important Files Changed
Sequence DiagramsequenceDiagram
participant Loader as Channel Loader
participant Swap as onEnabledChannelsSwap
participant NewCh as New Channel
participant Cleanup as cleanupSwappedChannels (goroutine)
participant OldCh as Old Channel
Loader->>Swap: "call with old=[...], new=[...]"
Swap->>Swap: cacheVersion.Add(1)
loop for each new channel
Swap->>NewCh: startTokenProvider()
NewCh-->>Swap: returns (auto-refresh goroutine started)
end
Swap->>Cleanup: go cleanupSwappedChannels(oldChannels copy)
Swap-->>Loader: returns immediately
Note over Cleanup,OldCh: runs asynchronously
loop for each old channel
Cleanup->>OldCh: stopTokenProvider()
OldCh-->>Cleanup: cancel context (auto-refresh goroutine exits)
Cleanup->>OldCh: stopChannelOutbounds()
end
Reviews (2): Last reviewed commit: "fix: cache did not refresh sometime" | Re-trigger Greptile |
| func (p *TokenProvider) runAutoRefreshOnce(autoCtx context.Context, refreshBefore time.Duration) (refreshFailed bool, ok bool) { | ||
| defer func() { | ||
| if r := recover(); r != nil { | ||
| slog.ErrorContext(autoCtx, "auto refresh token provider panicked", slog.Any("cause", r)) | ||
| refreshFailed = false | ||
| ok = false | ||
| } | ||
| }() | ||
|
|
||
| if autoCtx.Err() != nil { | ||
| return | ||
| return false, false | ||
| } | ||
|
|
||
| p.autoMu.Lock() | ||
|
|
||
| if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor { | ||
| p.autoMu.Unlock() | ||
| return | ||
| if _, err := p.EnsureFresh(autoCtx, refreshBefore); err != nil { | ||
| slog.WarnContext(autoCtx, "failed to auto refresh token", slog.Any("error", err)) | ||
| refreshFailed = true | ||
| } | ||
|
|
||
| prevCancel := p.autoTaskCancel | ||
| p.autoTaskCancel = nil | ||
| p.autoMu.Unlock() | ||
|
|
||
| if prevCancel != nil { | ||
| prevCancel() | ||
| if autoCtx.Err() != nil { | ||
| return refreshFailed, false | ||
| } | ||
|
|
||
| cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) { | ||
| if autoCtx.Err() != nil { | ||
| return | ||
| } | ||
| return refreshFailed, true | ||
| } |
There was a problem hiding this comment.
Panic leaves
autoCancel set — auto-refresh silently dead
When runAutoRefreshOnce recovers from a panic it returns ok = false, which causes runAutoRefresh to exit. The goroutine is gone, but p.autoCancel was never cleared. Any subsequent call to StartAutoRefresh hits the if p.autoCancel != nil guard and returns early without launching a new goroutine, so the provider is permanently stuck with no background refresh until an explicit StopAutoRefresh() call clears the field.
A minimal fix is to clear p.autoCancel inside the goroutine when exiting abnormally — e.g., call p.StopAutoRefresh() at the end of runAutoRefresh so the state is always consistent when the loop exits for any reason. The same issue exists in DeviceFlowProvider.runAutoRefreshOnce.
Uh oh!
There was an error while loading. Please reload this page.