diff --git a/backend/internal/handler/ops_error_logger.go b/backend/internal/handler/ops_error_logger.go index 93554912..398124cc 100644 --- a/backend/internal/handler/ops_error_logger.go +++ b/backend/internal/handler/ops_error_logger.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "log" "runtime" "runtime/debug" @@ -22,10 +23,11 @@ import ( ) const ( - opsModelKey = "ops_model" - opsStreamKey = "ops_stream" - opsRequestBodyKey = "ops_request_body" - opsAccountIDKey = "ops_account_id" + opsModelKey = "ops_model" + opsStreamKey = "ops_stream" + opsRequestBodyKey = "ops_request_body" + opsAccountIDKey = "ops_account_id" + opsRoutingCapacityLimitedKey = "ops_routing_capacity_limited" opsUpstreamModelKey = "ops_upstream_model" opsRequestTypeKey = "ops_request_type" @@ -45,6 +47,8 @@ const ( opsCodeSubscriptionNotFound = "SUBSCRIPTION_NOT_FOUND" opsCodeSubscriptionInvalid = "SUBSCRIPTION_INVALID" opsCodeUserInactive = "USER_INACTIVE" + opsCodeInvalidAPIKey = "INVALID_API_KEY" + opsCodeAPIKeyRequired = "API_KEY_REQUIRED" ) const ( @@ -393,6 +397,42 @@ func setOpsSelectedAccount(c *gin.Context, accountID int64, platform ...string) } } +func markOpsRoutingCapacityLimited(c *gin.Context) { + if c == nil { + return + } + c.Set(opsRoutingCapacityLimitedKey, true) +} + +func markOpsRoutingCapacityLimitedIfNoAvailable(c *gin.Context, err error) { + if !isOpsNoAvailableAccountError(err) { + return + } + markOpsRoutingCapacityLimited(c) +} + +func isOpsRoutingCapacityLimited(c *gin.Context) bool { + if c == nil { + return false + } + v, ok := c.Get(opsRoutingCapacityLimitedKey) + if !ok { + return false + } + marked, _ := v.(bool) + return marked +} + +func isOpsNoAvailableAccountError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, service.ErrNoAvailableAccounts) || errors.Is(err, service.ErrNoAvailableCompactAccounts) { + return true + } + return isOpsNoAvailableAccountMessage(err.Error()) +} + type opsCaptureWriter struct { gin.ResponseWriter limit int @@ -775,11 +815,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc { normalizedType := normalizeOpsErrorType(parsed.ErrorType, parsed.Code) - phase := classifyOpsPhase(normalizedType, parsed.Message, parsed.Code) - isBusinessLimited := classifyOpsIsBusinessLimited(normalizedType, phase, parsed.Code, status, parsed.Message) - - errorOwner := classifyOpsErrorOwner(phase, parsed.Message) - errorSource := classifyOpsErrorSource(phase, parsed.Message) + phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(c, normalizedType, parsed.Message, parsed.Code, status) entry := &service.OpsInsertErrorLogInput{ RequestID: requestID, @@ -1114,6 +1150,9 @@ func classifyOpsPhase(errType, message, code string) string { msg := strings.ToLower(message) // Standardized phases: request|auth|routing|upstream|network|internal // Map billing/concurrency/response => request; scheduling => routing. + if isOpsClientAuthError(code, msg) { + return "auth" + } switch strings.TrimSpace(code) { case opsCodeInsufficientBalance, opsCodeUsageLimitExceeded, opsCodeSubscriptionNotFound, opsCodeSubscriptionInvalid: return "request" @@ -1134,7 +1173,7 @@ func classifyOpsPhase(errType, message, code string) string { case "upstream_error", "overloaded_error": return "upstream" case "api_error": - if strings.Contains(msg, opsErrNoAvailableAccounts) { + if isOpsNoAvailableAccountMessage(msg) { return "routing" } return "internal" @@ -1178,7 +1217,27 @@ func classifyOpsIsRetryable(errType string, statusCode int) bool { } } -func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool { +func classifyOpsErrorLog(c *gin.Context, errType, message, code string, status int) (phase string, isBusinessLimited bool, errorOwner string, errorSource string) { + phase = classifyOpsPhase(errType, message, code) + routingCapacityLimited := isOpsRoutingCapacityLimited(c) + upstreamError := hasOpsUpstreamErrorContext(c) + if upstreamError && !routingCapacityLimited { + phase = "upstream" + } + if routingCapacityLimited { + phase = "routing" + } + localClientAuthError := !upstreamError && phase == "auth" && isOpsClientAuthError(code, strings.ToLower(message)) + isBusinessLimited = routingCapacityLimited || classifyOpsIsBusinessLimited(errType, phase, code, status, message, localClientAuthError) + errorOwner = classifyOpsErrorOwner(phase, message) + errorSource = classifyOpsErrorSource(phase, message) + return phase, isBusinessLimited, errorOwner, errorSource +} + +func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string, localClientAuthError ...bool) bool { + if len(localClientAuthError) > 0 && localClientAuthError[0] { + return true + } switch strings.TrimSpace(code) { case opsCodeInsufficientBalance, opsCodeUsageLimitExceeded, opsCodeSubscriptionNotFound, opsCodeSubscriptionInvalid, opsCodeUserInactive: return true @@ -1195,6 +1254,47 @@ func classifyOpsIsBusinessLimited(errType, phase, code string, status int, messa return false } +func isOpsClientAuthError(code string, msg string) bool { + switch strings.TrimSpace(code) { + case opsCodeInvalidAPIKey, opsCodeAPIKeyRequired: + return true + } + return strings.Contains(msg, "invalid api key") || strings.Contains(msg, "api key is required") +} + +func hasOpsUpstreamErrorContext(c *gin.Context) bool { + if c == nil { + return false + } + if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok { + switch code := v.(type) { + case int: + if code > 0 { + return true + } + case int64: + if code > 0 { + return true + } + } + } + if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok { + if events, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(events) > 0 { + return true + } + } + return false +} + +func isOpsNoAvailableAccountMessage(message string) bool { + msg := strings.ToLower(message) + return strings.Contains(msg, opsErrNoAvailableAccounts) || + strings.Contains(msg, "no available account") || + strings.Contains(msg, "no available gemini accounts") || + strings.Contains(msg, "no available openai accounts") || + strings.Contains(msg, "no available compatible accounts") +} + func classifyOpsErrorOwner(phase string, message string) string { // Standardized owners: client|provider|platform switch phase { diff --git a/backend/internal/handler/ops_error_logger_test.go b/backend/internal/handler/ops_error_logger_test.go index 6ae45110..e1df03cc 100644 --- a/backend/internal/handler/ops_error_logger_test.go +++ b/backend/internal/handler/ops_error_logger_test.go @@ -275,6 +275,187 @@ func TestNormalizeOpsErrorType(t *testing.T) { } } +func TestClassifyOpsNoAvailableAccountsExcludedFromSLA(t *testing.T) { + const message = "No available accounts" + gin.SetMode(gin.TestMode) + rec := httptest.NewRecorder() + c, _ := gin.CreateTestContext(rec) + + markOpsRoutingCapacityLimited(c) + + errType := normalizeOpsErrorType("api_error", "") + phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(c, errType, message, "", http.StatusServiceUnavailable) + + require.Equal(t, "api_error", errType) + require.Equal(t, "routing", phase) + require.True(t, isBusinessLimited) + require.Equal(t, "platform", errorOwner) + require.Equal(t, "gateway", errorSource) +} + +func TestClassifyOpsRoutingCapacityMarkerExcludesMaskedSelectionFailureFromSLA(t *testing.T) { + gin.SetMode(gin.TestMode) + rec := httptest.NewRecorder() + c, _ := gin.CreateTestContext(rec) + + markOpsRoutingCapacityLimited(c) + + phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog( + c, + "api_error", + "Service temporarily unavailable", + "", + http.StatusServiceUnavailable, + ) + + require.Equal(t, "routing", phase) + require.True(t, isBusinessLimited) + require.Equal(t, "platform", errorOwner) + require.Equal(t, "gateway", errorSource) +} + +func TestClassifyOpsAuthClientErrorsExcludedFromSLA(t *testing.T) { + tests := []struct { + name string + errType string + message string + code string + status int + }{ + { + name: "standard invalid API key", + errType: "api_error", + message: "Invalid API key", + code: "INVALID_API_KEY", + status: http.StatusUnauthorized, + }, + { + name: "standard missing API key", + errType: "api_error", + message: "API key is required in Authorization header (Bearer scheme), x-api-key header, or x-goog-api-key header", + code: "API_KEY_REQUIRED", + status: http.StatusUnauthorized, + }, + { + name: "google invalid API key", + errType: "api_error", + message: "Invalid API key", + code: "401", + status: http.StatusUnauthorized, + }, + { + name: "google missing API key", + errType: "api_error", + message: "API key is required", + code: "401", + status: http.StatusUnauthorized, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gin.SetMode(gin.TestMode) + rec := httptest.NewRecorder() + c, _ := gin.CreateTestContext(rec) + + errType := normalizeOpsErrorType(tt.errType, tt.code) + phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(c, errType, tt.message, tt.code, tt.status) + + require.Equal(t, "api_error", errType) + require.Equal(t, "auth", phase) + require.True(t, isBusinessLimited) + require.Equal(t, "client", errorOwner) + require.Equal(t, "client_request", errorSource) + }) + } +} + +func TestClassifyOpsUnsupportedModelExcludedFromSLA(t *testing.T) { + tests := []string{ + "No available accounts: no available accounts supporting model: made-up-model", + "No available accounts: no available OpenAI accounts supporting model: made-up-model", + "No available Gemini accounts: no available Gemini accounts supporting model: made-up-model", + "No available accounts: no available accounts supporting model: made-up-model (channel pricing restriction)", + } + + for _, message := range tests { + t.Run(message, func(t *testing.T) { + gin.SetMode(gin.TestMode) + rec := httptest.NewRecorder() + c, _ := gin.CreateTestContext(rec) + markOpsRoutingCapacityLimited(c) + + errType := normalizeOpsErrorType("api_error", "") + phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(c, errType, message, "", http.StatusServiceUnavailable) + + require.Equal(t, "api_error", errType) + require.Equal(t, "routing", phase) + require.True(t, isBusinessLimited) + require.Equal(t, "platform", errorOwner) + require.Equal(t, "gateway", errorSource) + }) + } +} + +func TestClassifyOpsUnmarkedNoAvailableTextStillCountsForSLA(t *testing.T) { + gin.SetMode(gin.TestMode) + rec := httptest.NewRecorder() + c, _ := gin.CreateTestContext(rec) + + phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog( + c, + "api_error", + "No available accounts", + "", + http.StatusServiceUnavailable, + ) + + require.Equal(t, "routing", phase) + require.False(t, isBusinessLimited) + require.Equal(t, "platform", errorOwner) + require.Equal(t, "gateway", errorSource) +} + +func TestClassifyOpsUpstreamAuthTextStillCountsForSLA(t *testing.T) { + gin.SetMode(gin.TestMode) + rec := httptest.NewRecorder() + c, _ := gin.CreateTestContext(rec) + service.SetOpsUpstreamError(c, http.StatusUnauthorized, "Invalid API key", "") + + phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog( + c, + "api_error", + "Invalid API key", + "401", + http.StatusUnauthorized, + ) + + require.Equal(t, "upstream", phase) + require.False(t, isBusinessLimited) + require.Equal(t, "provider", errorOwner) + require.Equal(t, "upstream_http", errorSource) +} + +func TestClassifyOpsUpstreamNoAvailableTextStillCountsForSLA(t *testing.T) { + gin.SetMode(gin.TestMode) + rec := httptest.NewRecorder() + c, _ := gin.CreateTestContext(rec) + service.SetOpsUpstreamError(c, http.StatusServiceUnavailable, "No available accounts", "") + + phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog( + c, + "api_error", + "No available accounts", + "", + http.StatusServiceUnavailable, + ) + + require.Equal(t, "upstream", phase) + require.False(t, isBusinessLimited) + require.Equal(t, "provider", errorOwner) + require.Equal(t, "upstream_http", errorSource) +} + func TestSetOpsEndpointContext_SetsContextKeys(t *testing.T) { gin.SetMode(gin.TestMode) rec := httptest.NewRecorder()