diff --git a/backend/cmd/server/wire_gen.go b/backend/cmd/server/wire_gen.go index d0dcacd2..ffb53780 100644 --- a/backend/cmd/server/wire_gen.go +++ b/backend/cmd/server/wire_gen.go @@ -237,7 +237,8 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) { jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService) adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService) apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig) - engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, redisClient) + healthService := service.NewHealthService(db, redisClient) + engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, healthService, redisClient) httpServer := server.ProvideHTTPServer(configConfig, engine) opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, accountRepository, concurrencyService, db, redisClient, configConfig) opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig) diff --git a/backend/go.mod b/backend/go.mod index 135cbd3e..509619b1 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -107,6 +107,7 @@ require ( github.com/goccy/go-json v0.10.2 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/go-querystring v1.1.0 // indirect + github.com/google/subcommands v1.2.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.18.1 // indirect @@ -176,6 +177,7 @@ require ( golang.org/x/mod v0.32.0 // indirect golang.org/x/sys v0.41.0 // indirect golang.org/x/text v0.34.0 // indirect + golang.org/x/tools v0.41.0 // indirect google.golang.org/grpc v1.75.1 // indirect google.golang.org/protobuf v1.36.10 // indirect gopkg.in/ini.v1 v1.67.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index f5b7968f..c8102f65 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -180,6 +180,8 @@ github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/subcommands v1.2.0 h1:vWQspBTo2nEqTUFita5/KeEWlUL8kQObDFbub/EN9oE= +github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4= diff --git a/backend/internal/server/http.go b/backend/internal/server/http.go index a8034e98..a9e3524c 100644 --- a/backend/internal/server/http.go +++ b/backend/internal/server/http.go @@ -35,6 +35,7 @@ func ProvideRouter( subscriptionService *service.SubscriptionService, opsService *service.OpsService, settingService *service.SettingService, + healthService *service.HealthService, redisClient *redis.Client, ) *gin.Engine { if cfg.Server.Mode == "release" { @@ -56,7 +57,7 @@ func ProvideRouter( } } - return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient) + return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, healthService, cfg, redisClient) } // ProvideHTTPServer 提供 HTTP 服务器 diff --git a/backend/internal/server/router.go b/backend/internal/server/router.go index 99701531..d532bd7f 100644 --- a/backend/internal/server/router.go +++ b/backend/internal/server/router.go @@ -30,6 +30,7 @@ func SetupRouter( subscriptionService *service.SubscriptionService, opsService *service.OpsService, settingService *service.SettingService, + healthService *service.HealthService, cfg *config.Config, redisClient *redis.Client, ) *gin.Engine { @@ -81,7 +82,7 @@ func SetupRouter( } // 注册路由 - registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient) + registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, healthService, cfg, redisClient) return r } @@ -97,11 +98,12 @@ func registerRoutes( subscriptionService *service.SubscriptionService, opsService *service.OpsService, settingService *service.SettingService, + healthService *service.HealthService, cfg *config.Config, redisClient *redis.Client, ) { // 通用路由(健康检查、状态等) - routes.RegisterCommonRoutes(r) + routes.RegisterCommonRoutes(r, healthService) // API v1 v1 := r.Group("/api/v1") diff --git a/backend/internal/server/routes/common.go b/backend/internal/server/routes/common.go index 4989358d..bd71dc12 100644 --- a/backend/internal/server/routes/common.go +++ b/backend/internal/server/routes/common.go @@ -1,16 +1,45 @@ package routes import ( + "context" "net/http" + "time" + "github.com/Wei-Shaw/sub2api/internal/service" "github.com/gin-gonic/gin" ) -// RegisterCommonRoutes 注册通用路由(健康检查、状态等) -func RegisterCommonRoutes(r *gin.Engine) { - // 健康检查 - r.GET("/health", func(c *gin.Context) { +// readinessHandlerTimeout 限定 readiness 端点对外的最大返回耗时。 +// HealthService 内部对每个组件再有独立超时,所以这里给宽一点即可。 +const readinessHandlerTimeout = 3 * time.Second + +// RegisterCommonRoutes 注册通用路由(健康检查、状态等)。 +// +// 健康端点的语义分层: +// - /healthz : liveness 探针。零依赖、永远 200。容器/进程探活专用。 +// - /ready : readiness 探针。检查 DB+Redis;任一失败返回 503。 +// - /health : 历史端点,等价于 /healthz,保留向后兼容。 +// +// dashboard 用的"业务健康分"由 ops_health_score 单独提供,与本路由无关。 +func RegisterCommonRoutes(r *gin.Engine, healthService *service.HealthService) { + // Liveness:仅证明进程在响应。 + livenessHandler := func(c *gin.Context) { + _ = healthService.Liveness() c.JSON(http.StatusOK, gin.H{"status": "ok"}) + } + r.GET("/healthz", livenessHandler) + r.GET("/health", livenessHandler) // 向后兼容旧的 docker-compose healthcheck + + // Readiness:检查关键依赖。失败时返回 503 但仍带详情,便于排障。 + r.GET("/ready", func(c *gin.Context) { + ctx, cancel := context.WithTimeout(c.Request.Context(), readinessHandlerTimeout) + defer cancel() + report := healthService.Readiness(ctx) + status := http.StatusOK + if !report.OK { + status = http.StatusServiceUnavailable + } + c.JSON(status, report) }) // Claude Code 遥测日志(忽略,直接返回200) diff --git a/backend/internal/server/routes/common_test.go b/backend/internal/server/routes/common_test.go new file mode 100644 index 00000000..51a5e43c --- /dev/null +++ b/backend/internal/server/routes/common_test.go @@ -0,0 +1,49 @@ +package routes + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/Wei-Shaw/sub2api/internal/service" + "github.com/gin-gonic/gin" + "github.com/stretchr/testify/require" +) + +func newTestRouter(t *testing.T, hs *service.HealthService) *gin.Engine { + t.Helper() + gin.SetMode(gin.TestMode) + r := gin.New() + RegisterCommonRoutes(r, hs) + return r +} + +func TestCommonRoutes_LivenessEndpoints(t *testing.T) { + r := newTestRouter(t, service.NewHealthService(nil, nil)) + for _, path := range []string{"/healthz", "/health"} { + req := httptest.NewRequest(http.MethodGet, path, nil) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + require.Equal(t, http.StatusOK, w.Code, "liveness path %s should be 200", path) + } +} + +func TestCommonRoutes_ReadyEndpoint_NoDepsReturnsOK(t *testing.T) { + // 没有 DB/Redis 依赖时 readiness 视为 ok(早期启动场景)。 + r := newTestRouter(t, service.NewHealthService(nil, nil)) + req := httptest.NewRequest(http.MethodGet, "/ready", nil) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + require.Equal(t, http.StatusOK, w.Code) + require.Contains(t, w.Body.String(), "\"ok\":true") +} + +func TestCommonRoutes_SetupStatusUnchanged(t *testing.T) { + // 验证我们没有破坏既有的 /setup/status 行为(前端依赖)。 + r := newTestRouter(t, service.NewHealthService(nil, nil)) + req := httptest.NewRequest(http.MethodGet, "/setup/status", nil) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + require.Equal(t, http.StatusOK, w.Code) + require.Contains(t, w.Body.String(), "needs_setup") +} diff --git a/backend/internal/service/health_service.go b/backend/internal/service/health_service.go new file mode 100644 index 00000000..eb709da2 --- /dev/null +++ b/backend/internal/service/health_service.go @@ -0,0 +1,119 @@ +// Package service - HealthService 提供 liveness 与 readiness 探针。 +// +// 设计动机:原有 /health 端点既被 docker-compose healthcheck 使用,又被 +// dashboard 的 ops_health_score 复用——后者会触发 DB/Redis 等重操作, +// 导致探活流量污染监控指标。本服务把两类语义拆开: +// - Liveness : 仅证明进程存活(无外部依赖检查)。 +// - Readiness : 检查 DB + Redis 连通,作为是否可接收流量的判断。 +// +// dashboard 维度的"业务健康分"仍由 ops_health_score 计算,与本服务无关。 +package service + +import ( + "context" + "database/sql" + "errors" + "time" + + "github.com/redis/go-redis/v9" +) + +// 探针默认超时。Readiness 探针需要快速失败,避免堆积。 +const ( + defaultReadinessTimeout = 2 * time.Second +) + +// ReadinessReport 描述各依赖项的状态,便于上层暴露细节给排障。 +type ReadinessReport struct { + OK bool `json:"ok"` + Details map[string]ComponentStatus `json:"details"` + Elapsed time.Duration `json:"elapsed_ms"` +} + +// ComponentStatus 单个依赖项的状态。Error 字段在 OK=true 时为空。 +type ComponentStatus struct { + OK bool `json:"ok"` + Error string `json:"error,omitempty"` + Elapsed string `json:"elapsed,omitempty"` +} + +// HealthService 提供 liveness/readiness 探针。 +// 字段都允许为 nil:缺失的依赖在 readiness 中自动跳过,便于测试和分阶段启用。 +type HealthService struct { + db *sql.DB + rdb *redis.Client + timeout time.Duration +} + +// NewHealthService 构造函数。timeout<=0 时使用默认值。 +func NewHealthService(db *sql.DB, rdb *redis.Client) *HealthService { + return &HealthService{ + db: db, + rdb: rdb, + timeout: defaultReadinessTimeout, + } +} + +// Liveness 仅返回 nil。任何调用方能拿到这个返回值就说明进程在响应请求。 +// 保持无副作用、零依赖,便于 K8s livenessProbe 高频调用。 +func (s *HealthService) Liveness() error { + return nil +} + +// Readiness 检查所有外部依赖。任一失败则整体 OK=false。 +// 单个依赖的 ctx 超时由 timeout 控制,独立计时不互相阻塞。 +func (s *HealthService) Readiness(ctx context.Context) ReadinessReport { + start := time.Now() + report := ReadinessReport{ + OK: true, + Details: make(map[string]ComponentStatus, 2), + } + + if s.db != nil { + report.Details["database"] = s.checkDB(ctx) + if !report.Details["database"].OK { + report.OK = false + } + } + if s.rdb != nil { + report.Details["redis"] = s.checkRedis(ctx) + if !report.Details["redis"].OK { + report.OK = false + } + } + + report.Elapsed = time.Since(start) + return report +} + +func (s *HealthService) checkDB(parent context.Context) ComponentStatus { + ctx, cancel := context.WithTimeout(parent, s.timeout) + defer cancel() + start := time.Now() + err := s.db.PingContext(ctx) + status := ComponentStatus{Elapsed: time.Since(start).String()} + if err != nil { + status.Error = err.Error() + return status + } + status.OK = true + return status +} + +func (s *HealthService) checkRedis(parent context.Context) ComponentStatus { + ctx, cancel := context.WithTimeout(parent, s.timeout) + defer cancel() + start := time.Now() + pong, err := s.rdb.Ping(ctx).Result() + status := ComponentStatus{Elapsed: time.Since(start).String()} + if err != nil { + status.Error = err.Error() + return status + } + if pong != "PONG" { + status.Error = errors.New("unexpected redis ping response: " + pong).Error() + return status + } + status.OK = true + return status +} diff --git a/backend/internal/service/health_service_test.go b/backend/internal/service/health_service_test.go new file mode 100644 index 00000000..0fa8d931 --- /dev/null +++ b/backend/internal/service/health_service_test.go @@ -0,0 +1,93 @@ +package service + +import ( + "context" + "database/sql" + "errors" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/require" +) + +func TestHealthService_Liveness_AlwaysOK(t *testing.T) { + s := NewHealthService(nil, nil) + require.NoError(t, s.Liveness()) +} + +func TestHealthService_Readiness_AllNilReturnsOK(t *testing.T) { + // 当所有依赖都为 nil 时(早期启动或 unit test),readiness 应直接 OK。 + s := NewHealthService(nil, nil) + report := s.Readiness(context.Background()) + require.True(t, report.OK) + require.Empty(t, report.Details) +} + +func TestHealthService_Readiness_DBPingFails(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true)) + require.NoError(t, err) + defer db.Close() + + mock.ExpectPing().WillReturnError(errors.New("connection refused")) + + s := NewHealthService(db, nil) + report := s.Readiness(context.Background()) + require.False(t, report.OK) + require.Contains(t, report.Details, "database") + require.False(t, report.Details["database"].OK) + require.Contains(t, report.Details["database"].Error, "connection refused") +} + +func TestHealthService_Readiness_DBOK(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true)) + require.NoError(t, err) + defer db.Close() + + mock.ExpectPing() + + s := NewHealthService(db, nil) + report := s.Readiness(context.Background()) + require.True(t, report.OK) + require.True(t, report.Details["database"].OK) +} + +func TestHealthService_Readiness_RedisFails(t *testing.T) { + // 指向一个不可达端口让 redis ping 立刻失败。 + rdb := redis.NewClient(&redis.Options{ + Addr: "127.0.0.1:1", + DialTimeout: 200 * time.Millisecond, + ReadTimeout: 200 * time.Millisecond, + }) + defer rdb.Close() + + s := NewHealthService(nil, rdb) + s.timeout = 500 * time.Millisecond + report := s.Readiness(context.Background()) + require.False(t, report.OK) + require.Contains(t, report.Details, "redis") + require.False(t, report.Details["redis"].OK) +} + +func TestHealthService_Readiness_PerComponentTimeout(t *testing.T) { + // 验证 readiness 在超时时不会无限挂住。 + db, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true)) + require.NoError(t, err) + defer db.Close() + mock.ExpectPing().WillDelayFor(2 * time.Second) + + s := NewHealthService(db, nil) + s.timeout = 100 * time.Millisecond + + start := time.Now() + report := s.Readiness(context.Background()) + elapsed := time.Since(start) + + require.Less(t, elapsed, 1*time.Second, "readiness should respect per-component timeout") + require.False(t, report.OK) + require.NotEmpty(t, report.Details["database"].Error, "timeout should propagate as an error") +} + +// 抑制未使用包警告(database/sql 在签名里使用)。 +var _ = sql.ErrNoRows diff --git a/backend/internal/service/wire.go b/backend/internal/service/wire.go index d79a3531..abf437d5 100644 --- a/backend/internal/service/wire.go +++ b/backend/internal/service/wire.go @@ -451,6 +451,7 @@ var ProviderSet = wire.NewSet( ProvideSettingService, NewDataManagementService, ProvideBackupService, + NewHealthService, ProvideOpsSystemLogSink, NewOpsService, ProvideOpsMetricsCollector, diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index bb213c76..a5415298 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -168,7 +168,7 @@ services: networks: - sub2api-network healthcheck: - test: ["CMD", "wget", "-q", "-T", "5", "-O", "/dev/null", "http://localhost:8080/health"] + test: ["CMD", "wget", "-q", "-T", "5", "-O", "/dev/null", "http://localhost:8080/ready"] interval: 30s timeout: 10s retries: 3