fix: 调整 Ops 错误分类的 SLA 排除逻辑
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
f5bd25bea0
commit
ae6ee23e2e
@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"log"
|
"log"
|
||||||
"runtime"
|
"runtime"
|
||||||
"runtime/debug"
|
"runtime/debug"
|
||||||
@ -22,10 +23,11 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
opsModelKey = "ops_model"
|
opsModelKey = "ops_model"
|
||||||
opsStreamKey = "ops_stream"
|
opsStreamKey = "ops_stream"
|
||||||
opsRequestBodyKey = "ops_request_body"
|
opsRequestBodyKey = "ops_request_body"
|
||||||
opsAccountIDKey = "ops_account_id"
|
opsAccountIDKey = "ops_account_id"
|
||||||
|
opsRoutingCapacityLimitedKey = "ops_routing_capacity_limited"
|
||||||
|
|
||||||
opsUpstreamModelKey = "ops_upstream_model"
|
opsUpstreamModelKey = "ops_upstream_model"
|
||||||
opsRequestTypeKey = "ops_request_type"
|
opsRequestTypeKey = "ops_request_type"
|
||||||
@ -45,6 +47,8 @@ const (
|
|||||||
opsCodeSubscriptionNotFound = "SUBSCRIPTION_NOT_FOUND"
|
opsCodeSubscriptionNotFound = "SUBSCRIPTION_NOT_FOUND"
|
||||||
opsCodeSubscriptionInvalid = "SUBSCRIPTION_INVALID"
|
opsCodeSubscriptionInvalid = "SUBSCRIPTION_INVALID"
|
||||||
opsCodeUserInactive = "USER_INACTIVE"
|
opsCodeUserInactive = "USER_INACTIVE"
|
||||||
|
opsCodeInvalidAPIKey = "INVALID_API_KEY"
|
||||||
|
opsCodeAPIKeyRequired = "API_KEY_REQUIRED"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -393,6 +397,42 @@ func setOpsSelectedAccount(c *gin.Context, accountID int64, platform ...string)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func markOpsRoutingCapacityLimited(c *gin.Context) {
|
||||||
|
if c == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Set(opsRoutingCapacityLimitedKey, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
func markOpsRoutingCapacityLimitedIfNoAvailable(c *gin.Context, err error) {
|
||||||
|
if !isOpsNoAvailableAccountError(err) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
markOpsRoutingCapacityLimited(c)
|
||||||
|
}
|
||||||
|
|
||||||
|
func isOpsRoutingCapacityLimited(c *gin.Context) bool {
|
||||||
|
if c == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
v, ok := c.Get(opsRoutingCapacityLimitedKey)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
marked, _ := v.(bool)
|
||||||
|
return marked
|
||||||
|
}
|
||||||
|
|
||||||
|
func isOpsNoAvailableAccountError(err error) bool {
|
||||||
|
if err == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if errors.Is(err, service.ErrNoAvailableAccounts) || errors.Is(err, service.ErrNoAvailableCompactAccounts) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return isOpsNoAvailableAccountMessage(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
type opsCaptureWriter struct {
|
type opsCaptureWriter struct {
|
||||||
gin.ResponseWriter
|
gin.ResponseWriter
|
||||||
limit int
|
limit int
|
||||||
@ -775,11 +815,7 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
|
|||||||
|
|
||||||
normalizedType := normalizeOpsErrorType(parsed.ErrorType, parsed.Code)
|
normalizedType := normalizeOpsErrorType(parsed.ErrorType, parsed.Code)
|
||||||
|
|
||||||
phase := classifyOpsPhase(normalizedType, parsed.Message, parsed.Code)
|
phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(c, normalizedType, parsed.Message, parsed.Code, status)
|
||||||
isBusinessLimited := classifyOpsIsBusinessLimited(normalizedType, phase, parsed.Code, status, parsed.Message)
|
|
||||||
|
|
||||||
errorOwner := classifyOpsErrorOwner(phase, parsed.Message)
|
|
||||||
errorSource := classifyOpsErrorSource(phase, parsed.Message)
|
|
||||||
|
|
||||||
entry := &service.OpsInsertErrorLogInput{
|
entry := &service.OpsInsertErrorLogInput{
|
||||||
RequestID: requestID,
|
RequestID: requestID,
|
||||||
@ -1114,6 +1150,9 @@ func classifyOpsPhase(errType, message, code string) string {
|
|||||||
msg := strings.ToLower(message)
|
msg := strings.ToLower(message)
|
||||||
// Standardized phases: request|auth|routing|upstream|network|internal
|
// Standardized phases: request|auth|routing|upstream|network|internal
|
||||||
// Map billing/concurrency/response => request; scheduling => routing.
|
// Map billing/concurrency/response => request; scheduling => routing.
|
||||||
|
if isOpsClientAuthError(code, msg) {
|
||||||
|
return "auth"
|
||||||
|
}
|
||||||
switch strings.TrimSpace(code) {
|
switch strings.TrimSpace(code) {
|
||||||
case opsCodeInsufficientBalance, opsCodeUsageLimitExceeded, opsCodeSubscriptionNotFound, opsCodeSubscriptionInvalid:
|
case opsCodeInsufficientBalance, opsCodeUsageLimitExceeded, opsCodeSubscriptionNotFound, opsCodeSubscriptionInvalid:
|
||||||
return "request"
|
return "request"
|
||||||
@ -1134,7 +1173,7 @@ func classifyOpsPhase(errType, message, code string) string {
|
|||||||
case "upstream_error", "overloaded_error":
|
case "upstream_error", "overloaded_error":
|
||||||
return "upstream"
|
return "upstream"
|
||||||
case "api_error":
|
case "api_error":
|
||||||
if strings.Contains(msg, opsErrNoAvailableAccounts) {
|
if isOpsNoAvailableAccountMessage(msg) {
|
||||||
return "routing"
|
return "routing"
|
||||||
}
|
}
|
||||||
return "internal"
|
return "internal"
|
||||||
@ -1178,7 +1217,27 @@ func classifyOpsIsRetryable(errType string, statusCode int) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool {
|
func classifyOpsErrorLog(c *gin.Context, errType, message, code string, status int) (phase string, isBusinessLimited bool, errorOwner string, errorSource string) {
|
||||||
|
phase = classifyOpsPhase(errType, message, code)
|
||||||
|
routingCapacityLimited := isOpsRoutingCapacityLimited(c)
|
||||||
|
upstreamError := hasOpsUpstreamErrorContext(c)
|
||||||
|
if upstreamError && !routingCapacityLimited {
|
||||||
|
phase = "upstream"
|
||||||
|
}
|
||||||
|
if routingCapacityLimited {
|
||||||
|
phase = "routing"
|
||||||
|
}
|
||||||
|
localClientAuthError := !upstreamError && phase == "auth" && isOpsClientAuthError(code, strings.ToLower(message))
|
||||||
|
isBusinessLimited = routingCapacityLimited || classifyOpsIsBusinessLimited(errType, phase, code, status, message, localClientAuthError)
|
||||||
|
errorOwner = classifyOpsErrorOwner(phase, message)
|
||||||
|
errorSource = classifyOpsErrorSource(phase, message)
|
||||||
|
return phase, isBusinessLimited, errorOwner, errorSource
|
||||||
|
}
|
||||||
|
|
||||||
|
func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string, localClientAuthError ...bool) bool {
|
||||||
|
if len(localClientAuthError) > 0 && localClientAuthError[0] {
|
||||||
|
return true
|
||||||
|
}
|
||||||
switch strings.TrimSpace(code) {
|
switch strings.TrimSpace(code) {
|
||||||
case opsCodeInsufficientBalance, opsCodeUsageLimitExceeded, opsCodeSubscriptionNotFound, opsCodeSubscriptionInvalid, opsCodeUserInactive:
|
case opsCodeInsufficientBalance, opsCodeUsageLimitExceeded, opsCodeSubscriptionNotFound, opsCodeSubscriptionInvalid, opsCodeUserInactive:
|
||||||
return true
|
return true
|
||||||
@ -1195,6 +1254,47 @@ func classifyOpsIsBusinessLimited(errType, phase, code string, status int, messa
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isOpsClientAuthError(code string, msg string) bool {
|
||||||
|
switch strings.TrimSpace(code) {
|
||||||
|
case opsCodeInvalidAPIKey, opsCodeAPIKeyRequired:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return strings.Contains(msg, "invalid api key") || strings.Contains(msg, "api key is required")
|
||||||
|
}
|
||||||
|
|
||||||
|
func hasOpsUpstreamErrorContext(c *gin.Context) bool {
|
||||||
|
if c == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamStatusCodeKey); ok {
|
||||||
|
switch code := v.(type) {
|
||||||
|
case int:
|
||||||
|
if code > 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
case int64:
|
||||||
|
if code > 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v, ok := c.Get(service.OpsUpstreamErrorsKey); ok {
|
||||||
|
if events, ok := v.([]*service.OpsUpstreamErrorEvent); ok && len(events) > 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func isOpsNoAvailableAccountMessage(message string) bool {
|
||||||
|
msg := strings.ToLower(message)
|
||||||
|
return strings.Contains(msg, opsErrNoAvailableAccounts) ||
|
||||||
|
strings.Contains(msg, "no available account") ||
|
||||||
|
strings.Contains(msg, "no available gemini accounts") ||
|
||||||
|
strings.Contains(msg, "no available openai accounts") ||
|
||||||
|
strings.Contains(msg, "no available compatible accounts")
|
||||||
|
}
|
||||||
|
|
||||||
func classifyOpsErrorOwner(phase string, message string) string {
|
func classifyOpsErrorOwner(phase string, message string) string {
|
||||||
// Standardized owners: client|provider|platform
|
// Standardized owners: client|provider|platform
|
||||||
switch phase {
|
switch phase {
|
||||||
|
|||||||
@ -275,6 +275,187 @@ func TestNormalizeOpsErrorType(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestClassifyOpsNoAvailableAccountsExcludedFromSLA(t *testing.T) {
|
||||||
|
const message = "No available accounts"
|
||||||
|
gin.SetMode(gin.TestMode)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
c, _ := gin.CreateTestContext(rec)
|
||||||
|
|
||||||
|
markOpsRoutingCapacityLimited(c)
|
||||||
|
|
||||||
|
errType := normalizeOpsErrorType("api_error", "")
|
||||||
|
phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(c, errType, message, "", http.StatusServiceUnavailable)
|
||||||
|
|
||||||
|
require.Equal(t, "api_error", errType)
|
||||||
|
require.Equal(t, "routing", phase)
|
||||||
|
require.True(t, isBusinessLimited)
|
||||||
|
require.Equal(t, "platform", errorOwner)
|
||||||
|
require.Equal(t, "gateway", errorSource)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyOpsRoutingCapacityMarkerExcludesMaskedSelectionFailureFromSLA(t *testing.T) {
|
||||||
|
gin.SetMode(gin.TestMode)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
c, _ := gin.CreateTestContext(rec)
|
||||||
|
|
||||||
|
markOpsRoutingCapacityLimited(c)
|
||||||
|
|
||||||
|
phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(
|
||||||
|
c,
|
||||||
|
"api_error",
|
||||||
|
"Service temporarily unavailable",
|
||||||
|
"",
|
||||||
|
http.StatusServiceUnavailable,
|
||||||
|
)
|
||||||
|
|
||||||
|
require.Equal(t, "routing", phase)
|
||||||
|
require.True(t, isBusinessLimited)
|
||||||
|
require.Equal(t, "platform", errorOwner)
|
||||||
|
require.Equal(t, "gateway", errorSource)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyOpsAuthClientErrorsExcludedFromSLA(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
errType string
|
||||||
|
message string
|
||||||
|
code string
|
||||||
|
status int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "standard invalid API key",
|
||||||
|
errType: "api_error",
|
||||||
|
message: "Invalid API key",
|
||||||
|
code: "INVALID_API_KEY",
|
||||||
|
status: http.StatusUnauthorized,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "standard missing API key",
|
||||||
|
errType: "api_error",
|
||||||
|
message: "API key is required in Authorization header (Bearer scheme), x-api-key header, or x-goog-api-key header",
|
||||||
|
code: "API_KEY_REQUIRED",
|
||||||
|
status: http.StatusUnauthorized,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "google invalid API key",
|
||||||
|
errType: "api_error",
|
||||||
|
message: "Invalid API key",
|
||||||
|
code: "401",
|
||||||
|
status: http.StatusUnauthorized,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "google missing API key",
|
||||||
|
errType: "api_error",
|
||||||
|
message: "API key is required",
|
||||||
|
code: "401",
|
||||||
|
status: http.StatusUnauthorized,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
gin.SetMode(gin.TestMode)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
c, _ := gin.CreateTestContext(rec)
|
||||||
|
|
||||||
|
errType := normalizeOpsErrorType(tt.errType, tt.code)
|
||||||
|
phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(c, errType, tt.message, tt.code, tt.status)
|
||||||
|
|
||||||
|
require.Equal(t, "api_error", errType)
|
||||||
|
require.Equal(t, "auth", phase)
|
||||||
|
require.True(t, isBusinessLimited)
|
||||||
|
require.Equal(t, "client", errorOwner)
|
||||||
|
require.Equal(t, "client_request", errorSource)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyOpsUnsupportedModelExcludedFromSLA(t *testing.T) {
|
||||||
|
tests := []string{
|
||||||
|
"No available accounts: no available accounts supporting model: made-up-model",
|
||||||
|
"No available accounts: no available OpenAI accounts supporting model: made-up-model",
|
||||||
|
"No available Gemini accounts: no available Gemini accounts supporting model: made-up-model",
|
||||||
|
"No available accounts: no available accounts supporting model: made-up-model (channel pricing restriction)",
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, message := range tests {
|
||||||
|
t.Run(message, func(t *testing.T) {
|
||||||
|
gin.SetMode(gin.TestMode)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
c, _ := gin.CreateTestContext(rec)
|
||||||
|
markOpsRoutingCapacityLimited(c)
|
||||||
|
|
||||||
|
errType := normalizeOpsErrorType("api_error", "")
|
||||||
|
phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(c, errType, message, "", http.StatusServiceUnavailable)
|
||||||
|
|
||||||
|
require.Equal(t, "api_error", errType)
|
||||||
|
require.Equal(t, "routing", phase)
|
||||||
|
require.True(t, isBusinessLimited)
|
||||||
|
require.Equal(t, "platform", errorOwner)
|
||||||
|
require.Equal(t, "gateway", errorSource)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyOpsUnmarkedNoAvailableTextStillCountsForSLA(t *testing.T) {
|
||||||
|
gin.SetMode(gin.TestMode)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
c, _ := gin.CreateTestContext(rec)
|
||||||
|
|
||||||
|
phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(
|
||||||
|
c,
|
||||||
|
"api_error",
|
||||||
|
"No available accounts",
|
||||||
|
"",
|
||||||
|
http.StatusServiceUnavailable,
|
||||||
|
)
|
||||||
|
|
||||||
|
require.Equal(t, "routing", phase)
|
||||||
|
require.False(t, isBusinessLimited)
|
||||||
|
require.Equal(t, "platform", errorOwner)
|
||||||
|
require.Equal(t, "gateway", errorSource)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyOpsUpstreamAuthTextStillCountsForSLA(t *testing.T) {
|
||||||
|
gin.SetMode(gin.TestMode)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
c, _ := gin.CreateTestContext(rec)
|
||||||
|
service.SetOpsUpstreamError(c, http.StatusUnauthorized, "Invalid API key", "")
|
||||||
|
|
||||||
|
phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(
|
||||||
|
c,
|
||||||
|
"api_error",
|
||||||
|
"Invalid API key",
|
||||||
|
"401",
|
||||||
|
http.StatusUnauthorized,
|
||||||
|
)
|
||||||
|
|
||||||
|
require.Equal(t, "upstream", phase)
|
||||||
|
require.False(t, isBusinessLimited)
|
||||||
|
require.Equal(t, "provider", errorOwner)
|
||||||
|
require.Equal(t, "upstream_http", errorSource)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyOpsUpstreamNoAvailableTextStillCountsForSLA(t *testing.T) {
|
||||||
|
gin.SetMode(gin.TestMode)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
c, _ := gin.CreateTestContext(rec)
|
||||||
|
service.SetOpsUpstreamError(c, http.StatusServiceUnavailable, "No available accounts", "")
|
||||||
|
|
||||||
|
phase, isBusinessLimited, errorOwner, errorSource := classifyOpsErrorLog(
|
||||||
|
c,
|
||||||
|
"api_error",
|
||||||
|
"No available accounts",
|
||||||
|
"",
|
||||||
|
http.StatusServiceUnavailable,
|
||||||
|
)
|
||||||
|
|
||||||
|
require.Equal(t, "upstream", phase)
|
||||||
|
require.False(t, isBusinessLimited)
|
||||||
|
require.Equal(t, "provider", errorOwner)
|
||||||
|
require.Equal(t, "upstream_http", errorSource)
|
||||||
|
}
|
||||||
|
|
||||||
func TestSetOpsEndpointContext_SetsContextKeys(t *testing.T) {
|
func TestSetOpsEndpointContext_SetsContextKeys(t *testing.T) {
|
||||||
gin.SetMode(gin.TestMode)
|
gin.SetMode(gin.TestMode)
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user