From 6aec505016de92c4860316edb37b66e14f6ff5d2 Mon Sep 17 00:00:00 2001 From: fofoj <163302188+fofoj@users.noreply.github.com> Date: Thu, 28 May 2026 20:05:38 +0800 Subject: [PATCH 1/2] fix(oauth): don't overwrite credentials JSONB in 401 handler The 401 handler in RateLimitService.HandleUpstreamError set account.Credentials["expires_at"] = time.Now() and then persisted the full credentials map via persistAccountCredentials, which routes through accountRepository.UpdateCredentials -> ent SetCredentials and replaces the entire JSONB column. The account passed to the handler is the request-start snapshot taken by the gateway at SelectAccount time. When another worker has just rotated refresh_token via oauth_refresh_api.RefreshIfNeeded, the snapshot still holds the old refresh_token; writing the full snapshot back rolls refresh_token in the DB back to the stale value. The next refresh cycle then calls the upstream with the stale token, receives invalid_grant, and tryRecoverFromRefreshRace re-reads the DB only to find currentRT == usedRT (because the 401 handler just poisoned the DB), returns false, and the account is incorrectly disabled. Drop the credentials write. InvalidateToken + SetTempUnschedulable is sufficient: the account is held out of scheduling during the cooldown, and after the cooldown the next request goes through token_provider's NeedsRefresh check, which routes through the locked, DB-re-reading RefreshIfNeeded path. The "force background refresh by setting expires_at = now" semantic is intentionally dropped. token_refresh_service will naturally pick the account up when the real expires_at enters the refresh window, and if the real expires_at has already passed by the time the account becomes schedulable again, token_provider's NeedsRefresh returns true and RefreshIfNeeded fires synchronously on the next request. --- backend/internal/service/ratelimit_service.go | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/backend/internal/service/ratelimit_service.go b/backend/internal/service/ratelimit_service.go index d12824ec..ecbd86d1 100644 --- a/backend/internal/service/ratelimit_service.go +++ b/backend/internal/service/ratelimit_service.go @@ -248,17 +248,15 @@ func (s *RateLimitService) HandleUpstreamError(ctx context.Context, account *Acc shouldDisable = true break } - // 2. 设置 expires_at 为当前时间,强制下次请求刷新 token - if account.Credentials == nil { - account.Credentials = make(map[string]any) - } - account.Credentials["expires_at"] = time.Now().Format(time.RFC3339) - if err := persistAccountCredentials(ctx, s.accountRepo, account, account.Credentials); err != nil { - slog.Warn("oauth_401_force_refresh_update_failed", "account_id", account.ID, "error", err) - } else { - slog.Info("oauth_401_force_refresh_set", "account_id", account.ID, "platform", account.Platform) - } - // 3. 临时不可调度,替代 SetError(保持 status=active 让刷新服务能拾取) + // 2. 临时不可调度,替代 SetError(保持 status=active 让刷新服务能拾取) + // 注意:此处不再写回 account.Credentials/expires_at。 + // 原实现使用请求开始时的 account 快照整列覆盖 credentials JSONB(见 + // persistAccountCredentials → accountRepository.UpdateCredentials → SetCredentials), + // 在另一个 worker 刚刷新完 refresh_token 的窄窗口内会把新 refresh_token 回滚为旧值, + // 导致下一周期用旧 refresh_token 调上游拿到 invalid_grant 后, + // tryRecoverFromRefreshRace 重读 DB 发现 currentRT == usedRT 也救不回来,账号被错误 disable。 + // 这里仅依赖 InvalidateToken + SetTempUnschedulable 让账号在冷却期内不被调度, + // 冷却结束后由 token_provider 的 NeedsRefresh / token_refresh_service 走带分布式锁的正路刷新。 msg := "Authentication failed (401): invalid or expired credentials" if upstreamMsg != "" { msg = "OAuth 401: " + upstreamMsg From be3613593b1b984389a8de031eed3e2ae04300c8 Mon Sep 17 00:00:00 2001 From: fofoj <163302188+fofoj@users.noreply.github.com> Date: Thu, 28 May 2026 20:32:16 +0800 Subject: [PATCH 2/2] test(oauth): update OAuth 401 tests to match new no-write behavior Two tests in ratelimit_service_401_test.go were encoding the bug behavior itself: - OAuth401InvalidatorError asserted updateCredentialsCalls == 1 - OAuth401UsesCredentialsUpdater asserted updateCredentialsCalls == 1 and lastCredentials["expires_at"] non-empty Both assertions exercised the exact write-back this PR removes. Update them to reflect the new contract and guard against regression: - OAuth401InvalidatorError: assert updateCredentialsCalls == 0 - OAuth401UsesCredentialsUpdater is renamed to OAuth401DoesNotOverwriteCredentials with reversed assertions, so it now serves as a regression test ensuring the 401 handler never writes credentials back from the request-start snapshot. --- .../service/ratelimit_service_401_test.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/backend/internal/service/ratelimit_service_401_test.go b/backend/internal/service/ratelimit_service_401_test.go index a964775e..873aaf33 100644 --- a/backend/internal/service/ratelimit_service_401_test.go +++ b/backend/internal/service/ratelimit_service_401_test.go @@ -129,7 +129,10 @@ func TestRateLimitService_HandleUpstreamError_OAuth401SetsTempUnschedulable(t *t } // TestRateLimitService_HandleUpstreamError_OAuth401InvalidatorError -// OpenAI OAuth 401 缓存失效出错时仍走 temp_unschedulable +// OpenAI OAuth 401 缓存失效出错时仍走 temp_unschedulable。 +// 注意:401 handler 不再回写 credentials(避免请求开始时的快照整列覆盖 DB +// 把另一个 worker 刚刷新出来的新 refresh_token 回滚为旧值), +// 因此 updateCredentialsCalls 应当为 0。 func TestRateLimitService_HandleUpstreamError_OAuth401InvalidatorError(t *testing.T) { repo := &rateLimitAccountRepoStub{} invalidator := &tokenCacheInvalidatorRecorder{err: errors.New("boom")} @@ -149,7 +152,7 @@ func TestRateLimitService_HandleUpstreamError_OAuth401InvalidatorError(t *testin require.True(t, shouldDisable) require.Equal(t, 0, repo.setErrorCalls) require.Equal(t, 1, repo.tempCalls) - require.Equal(t, 1, repo.updateCredentialsCalls) + require.Equal(t, 0, repo.updateCredentialsCalls) require.Len(t, invalidator.accounts, 1) } @@ -171,7 +174,12 @@ func TestRateLimitService_HandleUpstreamError_NonOAuth401(t *testing.T) { require.Empty(t, invalidator.accounts) } -func TestRateLimitService_HandleUpstreamError_OAuth401UsesCredentialsUpdater(t *testing.T) { +// TestRateLimitService_HandleUpstreamError_OAuth401DoesNotOverwriteCredentials +// 回归测试:确保 401 handler 不再使用请求开始时的 account 快照写回 credentials。 +// 原实现会通过 persistAccountCredentials → UpdateCredentials → SetCredentials +// 整列覆盖 credentials JSONB,在另一个 worker 刚刷新完 refresh_token 的窄窗口内 +// 会把新 refresh_token 回滚为快照中的旧值,导致下一周期拿 invalid_grant 被错误 disable。 +func TestRateLimitService_HandleUpstreamError_OAuth401DoesNotOverwriteCredentials(t *testing.T) { repo := &rateLimitAccountRepoStub{} service := NewRateLimitService(repo, nil, &config.Config{}, nil, nil) account := &Account{ @@ -187,8 +195,9 @@ func TestRateLimitService_HandleUpstreamError_OAuth401UsesCredentialsUpdater(t * shouldDisable := service.HandleUpstreamError(context.Background(), account, 401, http.Header{}, []byte("unauthorized")) require.True(t, shouldDisable) - require.Equal(t, 1, repo.updateCredentialsCalls) - require.NotEmpty(t, repo.lastCredentials["expires_at"]) + require.Equal(t, 0, repo.updateCredentialsCalls, "401 handler must not write credentials back from the request-start snapshot") + require.Equal(t, 1, repo.tempCalls, "401 handler should still set temp-unschedulable cooldown") + require.Nil(t, repo.lastCredentials, "no credentials should have been persisted") } // 缺少 refresh_token 的 OAuth 账号 401 应直接 SetError 永久禁用,