Skip to content

fix: cache did not refresh sometime#1783

Merged
looplj merged 1 commit into
unstablefrom
dev-tmp
Jun 4, 2026
Merged

fix: cache did not refresh sometime#1783
looplj merged 1 commit into
unstablefrom
dev-tmp

Conversation

@looplj

@looplj looplj commented Jun 4, 2026

Copy link
Copy Markdown
Owner

@devin-ai-integration devin-ai-integration Bot left a comment

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Devin Review found 2 potential issues.

View 3 additional findings in Devin Review.

Open in Devin Review

Comment on lines +455 to 477
func (p *DeviceFlowProvider) runAutoRefresh(
autoCtx context.Context,
exec executors.ScheduledExecutor,
refreshBefore time.Duration,
fallbackInterval time.Duration,
runImmediately bool,
) {
if autoCtx.Err() != nil {
return
}

delay := time.Duration(0)
if !runImmediately {
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}

p.autoMu.Lock()

if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
return
}

prevCancel := p.autoTaskCancel
p.autoTaskCancel = nil
p.autoMu.Unlock()

if prevCancel != nil {
prevCancel()
}

cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) {
defer func() {
if r := recover(); r != nil {
slog.ErrorContext(autoCtx, "auto refresh device flow provider panicked", slog.Any("cause", r))
}
}()

if autoCtx.Err() != nil {
for {
if !sleepForAutoRefresh(autoCtx, delay) {
return
}

// Ensure credentials are fresh
refreshFailed := false

if _, err := p.GetToken(autoCtx); err != nil {
slog.WarnContext(autoCtx, "failed to auto refresh device flow token", slog.Any("error", err))

refreshFailed = true
}

if autoCtx.Err() != nil {
refreshFailed, ok := p.runAutoRefreshOnce(autoCtx)
if !ok {
return
}

if refreshFailed {
p.scheduleNextAutoRefreshWithDelay(autoCtx, exec, refreshBefore, fallbackInterval, fallbackInterval)
delay = fallbackInterval
} else {
p.scheduleNextAutoRefresh(autoCtx, exec, refreshBefore, fallbackInterval, false)
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}
}, delay)
if err != nil {
p.StopAutoRefresh()
return
}

p.autoMu.Lock()

if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
cancelFunc()

return
}

p.autoTaskCancel = cancelFunc
p.autoMu.Unlock()
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Missing top-level recover guard in goroutine DeviceFlowProvider.runAutoRefresh

runAutoRefresh is launched as a goroutine at llm/oauth/device_flow_provider.go:438 but does not install a top-level defer recover() guard. The recover guard exists only inside runAutoRefreshOnce (llm/oauth/device_flow_provider.go:480), so any panic in the surrounding loop code (sleepForAutoRefresh, nextAutoRefreshDelay) would crash the goroutine without logging. This violates .agent/rules/go-general.md rule 8: "Any manually started goroutine must install a top-level defer recover() guard that logs the panic before the goroutine exits."

Suggested change
func (p *DeviceFlowProvider) runAutoRefresh(
autoCtx context.Context,
exec executors.ScheduledExecutor,
refreshBefore time.Duration,
fallbackInterval time.Duration,
runImmediately bool,
) {
if autoCtx.Err() != nil {
return
}
delay := time.Duration(0)
if !runImmediately {
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}
p.autoMu.Lock()
if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
return
}
prevCancel := p.autoTaskCancel
p.autoTaskCancel = nil
p.autoMu.Unlock()
if prevCancel != nil {
prevCancel()
}
cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) {
defer func() {
if r := recover(); r != nil {
slog.ErrorContext(autoCtx, "auto refresh device flow provider panicked", slog.Any("cause", r))
}
}()
if autoCtx.Err() != nil {
for {
if !sleepForAutoRefresh(autoCtx, delay) {
return
}
// Ensure credentials are fresh
refreshFailed := false
if _, err := p.GetToken(autoCtx); err != nil {
slog.WarnContext(autoCtx, "failed to auto refresh device flow token", slog.Any("error", err))
refreshFailed = true
}
if autoCtx.Err() != nil {
refreshFailed, ok := p.runAutoRefreshOnce(autoCtx)
if !ok {
return
}
if refreshFailed {
p.scheduleNextAutoRefreshWithDelay(autoCtx, exec, refreshBefore, fallbackInterval, fallbackInterval)
delay = fallbackInterval
} else {
p.scheduleNextAutoRefresh(autoCtx, exec, refreshBefore, fallbackInterval, false)
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}
}, delay)
if err != nil {
p.StopAutoRefresh()
return
}
p.autoMu.Lock()
if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
cancelFunc()
return
}
p.autoTaskCancel = cancelFunc
p.autoMu.Unlock()
}
func (p *DeviceFlowProvider) runAutoRefresh(
autoCtx context.Context,
refreshBefore time.Duration,
fallbackInterval time.Duration,
) {
defer func() {
if r := recover(); r != nil {
slog.ErrorContext(autoCtx, "auto refresh device flow provider panicked", slog.Any("cause", r))
}
}()
delay := time.Duration(0)
for {
if !sleepForAutoRefresh(autoCtx, delay) {
return
}
refreshFailed, ok := p.runAutoRefreshOnce(autoCtx)
if !ok {
return
}
if refreshFailed {
delay = fallbackInterval
} else {
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}
}
}
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Comment on lines +322 to 344
func (p *TokenProvider) runAutoRefresh(
autoCtx context.Context,
exec executors.ScheduledExecutor,
refreshBefore time.Duration,
fallbackInterval time.Duration,
runImmediately bool,
) {
if autoCtx.Err() != nil {
return
}

delay := time.Duration(0)
if !runImmediately {
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}

p.autoMu.Lock()

if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
return
}

prevCancel := p.autoTaskCancel
p.autoTaskCancel = nil
p.autoMu.Unlock()

if prevCancel != nil {
prevCancel()
}

cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) {
defer func() {
if r := recover(); r != nil {
slog.ErrorContext(autoCtx, "auto refresh token provider panicked", slog.Any("cause", r))
}
}()

if autoCtx.Err() != nil {
for {
if !sleepForAutoRefresh(autoCtx, delay) {
return
}

refreshFailed := false

if _, err := p.EnsureFresh(autoCtx, refreshBefore); err != nil {
slog.WarnContext(autoCtx, "failed to auto refresh token", slog.Any("error", err))

refreshFailed = true
}

if autoCtx.Err() != nil {
refreshFailed, ok := p.runAutoRefreshOnce(autoCtx, refreshBefore)
if !ok {
return
}

if refreshFailed {
// Use fallback interval to avoid tight retry loops on persistent errors
// (e.g. refresh_token_reused, invalid_grant).
p.scheduleNextAutoRefreshWithDelay(autoCtx, exec, refreshBefore, fallbackInterval, fallbackInterval)
delay = fallbackInterval
} else {
p.scheduleNextAutoRefresh(autoCtx, exec, refreshBefore, fallbackInterval, false)
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}
}, delay)
if err != nil {
p.StopAutoRefresh()
return
}

p.autoMu.Lock()

if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
cancelFunc()

return
}

p.autoTaskCancel = cancelFunc
p.autoMu.Unlock()
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Missing top-level recover guard in goroutine TokenProvider.runAutoRefresh

runAutoRefresh is launched as a goroutine at llm/oauth/token_provider.go:306 but does not install a top-level defer recover() guard. The recover guard exists only inside runAutoRefreshOnce (llm/oauth/token_provider.go:347), so any panic in the surrounding loop code (sleepForAutoRefresh, nextAutoRefreshDelay) would crash the goroutine without logging. This violates .agent/rules/go-general.md rule 8: "Any manually started goroutine must install a top-level defer recover() guard that logs the panic before the goroutine exits."

Suggested change
func (p *TokenProvider) runAutoRefresh(
autoCtx context.Context,
exec executors.ScheduledExecutor,
refreshBefore time.Duration,
fallbackInterval time.Duration,
runImmediately bool,
) {
if autoCtx.Err() != nil {
return
}
delay := time.Duration(0)
if !runImmediately {
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}
p.autoMu.Lock()
if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
return
}
prevCancel := p.autoTaskCancel
p.autoTaskCancel = nil
p.autoMu.Unlock()
if prevCancel != nil {
prevCancel()
}
cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) {
defer func() {
if r := recover(); r != nil {
slog.ErrorContext(autoCtx, "auto refresh token provider panicked", slog.Any("cause", r))
}
}()
if autoCtx.Err() != nil {
for {
if !sleepForAutoRefresh(autoCtx, delay) {
return
}
refreshFailed := false
if _, err := p.EnsureFresh(autoCtx, refreshBefore); err != nil {
slog.WarnContext(autoCtx, "failed to auto refresh token", slog.Any("error", err))
refreshFailed = true
}
if autoCtx.Err() != nil {
refreshFailed, ok := p.runAutoRefreshOnce(autoCtx, refreshBefore)
if !ok {
return
}
if refreshFailed {
// Use fallback interval to avoid tight retry loops on persistent errors
// (e.g. refresh_token_reused, invalid_grant).
p.scheduleNextAutoRefreshWithDelay(autoCtx, exec, refreshBefore, fallbackInterval, fallbackInterval)
delay = fallbackInterval
} else {
p.scheduleNextAutoRefresh(autoCtx, exec, refreshBefore, fallbackInterval, false)
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}
}, delay)
if err != nil {
p.StopAutoRefresh()
return
}
p.autoMu.Lock()
if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
cancelFunc()
return
}
p.autoTaskCancel = cancelFunc
p.autoMu.Unlock()
}
func (p *TokenProvider) runAutoRefresh(
autoCtx context.Context,
refreshBefore time.Duration,
fallbackInterval time.Duration,
) {
defer func() {
if r := recover(); r != nil {
slog.ErrorContext(autoCtx, "auto refresh token provider panicked", slog.Any("cause", r))
}
}()
delay := time.Duration(0)
for {
if !sleepForAutoRefresh(autoCtx, delay) {
return
}
refreshFailed, ok := p.runAutoRefreshOnce(autoCtx, refreshBefore)
if !ok {
return
}
if refreshFailed {
delay = fallbackInterval
} else {
delay = p.nextAutoRefreshDelay(refreshBefore, fallbackInterval)
}
}
}
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

@greptile-apps

greptile-apps Bot commented Jun 4, 2026

Copy link
Copy Markdown
Contributor

Greptile Summary

This PR fixes a stall in cache refresh caused by onEnabledChannelsSwap blocking synchronously on stopping old channels' token providers. It also replaces both providers' ScheduledExecutor-based auto-refresh with a simpler goroutine loop, removing the github.com/zhenzou/executors dependency entirely.

  • channel.go: Old-channel cleanup (stop token providers + outbounds) is moved to a background goroutine so the swap — including the cache-version bump and new-provider start — returns immediately. A defensive panic recovery is added to prevent the goroutine from crashing silently.
  • token_provider.go / device_flow_provider.go: The complex recursive scheduleNextAutoRefresh chain using executors.ScheduledExecutor is replaced by a straightforward runAutoRefresh goroutine loop with sleepForAutoRefresh. StopAutoRefresh now only cancels the context instead of also awaiting executor shutdown.
  • go.mod / go.sum: Removes github.com/zhenzou/executors and eight transitive dependencies.

Confidence Score: 5/5

Safe to merge. The core fix — moving old-channel cleanup to a background goroutine — is straightforward and well-tested. The executor-to-goroutine refactor in both OAuth providers simplifies the scheduling logic without changing observable behavior.

All changed code paths are logically sound. The goroutine cleanup in channel.go correctly copies the slice before launching so it cannot race with the next swap. The new runAutoRefresh loop replicates the old executor's sequencing (one refresh at a time, delay computed from token expiry). The one issue found — panic recovery scoped to the whole cleanup function rather than each channel iteration — only matters if stopTokenProvider or stopChannelOutbounds ever actually panic, which is unlikely in practice.

internal/server/biz/channel.go — the per-function panic recovery in cleanupSwappedChannels could leave later channels uncleaned on an unexpected panic.

Important Files Changed

Filename Overview
internal/server/biz/channel.go Moves old-channel cleanup (stopTokenProvider + stopChannelOutbounds) to a background goroutine so the swap operation returns immediately; adds a panic-recovery wrapper, but that wrapper only catches the first panic in the loop.
internal/server/biz/channel_llm_openai_compatible_test.go Adds TestOnEnabledChannelsSwapDoesNotWaitForOldCleanup, correctly verifying the swap returns quickly while background cleanup completes asynchronously.
llm/oauth/token_provider.go Replaces the executors.ScheduledExecutor-based scheduling with a plain goroutine loop (runAutoRefresh + runAutoRefreshOnce); removes executor fields and the associated Shutdown() call from StopAutoRefresh.
llm/oauth/device_flow_provider.go Same executor-to-goroutine refactor as token_provider.go; removes scheduleNextAutoRefresh/scheduleNextAutoRefreshWithDelay and replaces with runAutoRefresh loop identical in structure to TokenProvider's.
llm/go.mod Removes the github.com/zhenzou/executors direct dependency and several transitive deps that came with it (supercronic, dubbogo, panjf2000/ants, etc.), shrinking the dependency tree significantly.

Sequence Diagram

sequenceDiagram
    participant Loader as Channel Loader
    participant Swap as onEnabledChannelsSwap
    participant NewCh as New Channel
    participant Cleanup as cleanupSwappedChannels (goroutine)
    participant OldCh as Old Channel

    Loader->>Swap: "call with old=[...], new=[...]"
    Swap->>Swap: cacheVersion.Add(1)
    loop for each new channel
        Swap->>NewCh: startTokenProvider()
        NewCh-->>Swap: returns (auto-refresh goroutine started)
    end
    Swap->>Cleanup: go cleanupSwappedChannels(oldChannels copy)
    Swap-->>Loader: returns immediately

    Note over Cleanup,OldCh: runs asynchronously
    loop for each old channel
        Cleanup->>OldCh: stopTokenProvider()
        OldCh-->>Cleanup: cancel context (auto-refresh goroutine exits)
        Cleanup->>OldCh: stopChannelOutbounds()
    end
Loading

Reviews (2): Last reviewed commit: "fix: cache did not refresh sometime" | Re-trigger Greptile

Comment on lines +346 to +369
func (p *TokenProvider) runAutoRefreshOnce(autoCtx context.Context, refreshBefore time.Duration) (refreshFailed bool, ok bool) {
defer func() {
if r := recover(); r != nil {
slog.ErrorContext(autoCtx, "auto refresh token provider panicked", slog.Any("cause", r))
refreshFailed = false
ok = false
}
}()

if autoCtx.Err() != nil {
return
return false, false
}

p.autoMu.Lock()

if p.autoCancel == nil || p.autoExecutor == nil || exec != p.autoExecutor {
p.autoMu.Unlock()
return
if _, err := p.EnsureFresh(autoCtx, refreshBefore); err != nil {
slog.WarnContext(autoCtx, "failed to auto refresh token", slog.Any("error", err))
refreshFailed = true
}

prevCancel := p.autoTaskCancel
p.autoTaskCancel = nil
p.autoMu.Unlock()

if prevCancel != nil {
prevCancel()
if autoCtx.Err() != nil {
return refreshFailed, false
}

cancelFunc, err := exec.ScheduleFunc(func(_ context.Context) {
if autoCtx.Err() != nil {
return
}
return refreshFailed, true
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Panic leaves autoCancel set — auto-refresh silently dead

When runAutoRefreshOnce recovers from a panic it returns ok = false, which causes runAutoRefresh to exit. The goroutine is gone, but p.autoCancel was never cleared. Any subsequent call to StartAutoRefresh hits the if p.autoCancel != nil guard and returns early without launching a new goroutine, so the provider is permanently stuck with no background refresh until an explicit StopAutoRefresh() call clears the field.

A minimal fix is to clear p.autoCancel inside the goroutine when exiting abnormally — e.g., call p.StopAutoRefresh() at the end of runAutoRefresh so the state is always consistent when the loop exits for any reason. The same issue exists in DeviceFlowProvider.runAutoRefreshOnce.

@looplj looplj merged commit ac64075 into unstable Jun 4, 2026
4 checks passed
junjiangao pushed a commit to junjiangao/axonhub that referenced this pull request Jun 5, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant