feat(health): split liveness and readiness probes

Add HealthService with Liveness (no-op) and Readiness (DB+Redis ping
with per-component timeout) checks. Expose three endpoints:

- /healthz : new liveness endpoint, zero-dependency, always 200
- /ready   : new readiness endpoint, returns 503 with details on dep
             failure; suitable for K8s readinessProbe and load balancers
- /health  : preserved for backward compatibility, equivalent to
             /healthz

Switch primary docker-compose healthcheck to /ready so the container
is only marked healthy once DB+Redis are reachable. Standalone/dev/
local compose files keep /health to avoid disrupting existing setups.

Tests: unit tests cover liveness, readiness with both deps healthy,
each dep failing independently, and per-component timeout enforcement.
This commit is contained in:
win 2026-04-28 23:39:50 +08:00
parent d6df41feaa
commit 110902ad4b
11 changed files with 308 additions and 9 deletions

View File

@ -237,7 +237,8 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService)
adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService)
apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig)
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, redisClient)
healthService := service.NewHealthService(db, redisClient)
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, healthService, redisClient)
httpServer := server.ProvideHTTPServer(configConfig, engine)
opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, accountRepository, concurrencyService, db, redisClient, configConfig)
opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig)

View File

@ -107,6 +107,7 @@ require (
github.com/goccy/go-json v0.10.2 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/go-querystring v1.1.0 // indirect
github.com/google/subcommands v1.2.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/hashicorp/hcl/v2 v2.18.1 // indirect
@ -176,6 +177,7 @@ require (
golang.org/x/mod v0.32.0 // indirect
golang.org/x/sys v0.41.0 // indirect
golang.org/x/text v0.34.0 // indirect
golang.org/x/tools v0.41.0 // indirect
google.golang.org/grpc v1.75.1 // indirect
google.golang.org/protobuf v1.36.10 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect

View File

@ -180,6 +180,8 @@ github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/subcommands v1.2.0 h1:vWQspBTo2nEqTUFita5/KeEWlUL8kQObDFbub/EN9oE=
github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4=

View File

@ -35,6 +35,7 @@ func ProvideRouter(
subscriptionService *service.SubscriptionService,
opsService *service.OpsService,
settingService *service.SettingService,
healthService *service.HealthService,
redisClient *redis.Client,
) *gin.Engine {
if cfg.Server.Mode == "release" {
@ -56,7 +57,7 @@ func ProvideRouter(
}
}
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient)
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, healthService, cfg, redisClient)
}
// ProvideHTTPServer 提供 HTTP 服务器

View File

@ -30,6 +30,7 @@ func SetupRouter(
subscriptionService *service.SubscriptionService,
opsService *service.OpsService,
settingService *service.SettingService,
healthService *service.HealthService,
cfg *config.Config,
redisClient *redis.Client,
) *gin.Engine {
@ -81,7 +82,7 @@ func SetupRouter(
}
// 注册路由
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient)
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, healthService, cfg, redisClient)
return r
}
@ -97,11 +98,12 @@ func registerRoutes(
subscriptionService *service.SubscriptionService,
opsService *service.OpsService,
settingService *service.SettingService,
healthService *service.HealthService,
cfg *config.Config,
redisClient *redis.Client,
) {
// 通用路由(健康检查、状态等)
routes.RegisterCommonRoutes(r)
routes.RegisterCommonRoutes(r, healthService)
// API v1
v1 := r.Group("/api/v1")

View File

@ -1,16 +1,45 @@
package routes
import (
"context"
"net/http"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// RegisterCommonRoutes 注册通用路由(健康检查、状态等)
func RegisterCommonRoutes(r *gin.Engine) {
// 健康检查
r.GET("/health", func(c *gin.Context) {
// readinessHandlerTimeout caps how long the readiness endpoint may take to
// respond. HealthService applies its own per-component timeout internally,
// so this outer limit only needs to be somewhat looser.
const readinessHandlerTimeout = 3 * time.Second
// RegisterCommonRoutes 注册通用路由(健康检查、状态等)。
//
// 健康端点的语义分层:
// - /healthz : liveness 探针。零依赖、永远 200。容器/进程探活专用。
// - /ready : readiness 探针。检查 DB+Redis任一失败返回 503。
// - /health : 历史端点,等价于 /healthz保留向后兼容。
//
// dashboard 用的"业务健康分"由 ops_health_score 单独提供,与本路由无关。
func RegisterCommonRoutes(r *gin.Engine, healthService *service.HealthService) {
// Liveness仅证明进程在响应。
livenessHandler := func(c *gin.Context) {
_ = healthService.Liveness()
c.JSON(http.StatusOK, gin.H{"status": "ok"})
}
r.GET("/healthz", livenessHandler)
r.GET("/health", livenessHandler) // 向后兼容旧的 docker-compose healthcheck
// Readiness检查关键依赖。失败时返回 503 但仍带详情,便于排障。
r.GET("/ready", func(c *gin.Context) {
ctx, cancel := context.WithTimeout(c.Request.Context(), readinessHandlerTimeout)
defer cancel()
report := healthService.Readiness(ctx)
status := http.StatusOK
if !report.OK {
status = http.StatusServiceUnavailable
}
c.JSON(status, report)
})
// Claude Code 遥测日志忽略直接返回200

View File

@ -0,0 +1,49 @@
package routes
import (
"net/http"
"net/http/httptest"
"testing"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
"github.com/stretchr/testify/require"
)
// newTestRouter returns a bare gin engine in test mode on which only the
// common routes have been registered, backed by the supplied HealthService.
func newTestRouter(t *testing.T, hs *service.HealthService) *gin.Engine {
	t.Helper()

	gin.SetMode(gin.TestMode)
	engine := gin.New()
	RegisterCommonRoutes(engine, hs)
	return engine
}
// TestCommonRoutes_LivenessEndpoints checks that both liveness paths answer
// 200 even when the HealthService has no dependencies configured.
func TestCommonRoutes_LivenessEndpoints(t *testing.T) {
	router := newTestRouter(t, service.NewHealthService(nil, nil))

	livenessPaths := []string{"/healthz", "/health"}
	for i := range livenessPaths {
		path := livenessPaths[i]
		rec := httptest.NewRecorder()
		router.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, path, nil))
		require.Equal(t, http.StatusOK, rec.Code, "liveness path %s should be 200", path)
	}
}
// TestCommonRoutes_ReadyEndpoint_NoDepsReturnsOK checks that /ready reports
// success when neither a database nor a Redis client is configured (for
// example during early start-up).
func TestCommonRoutes_ReadyEndpoint_NoDepsReturnsOK(t *testing.T) {
	router := newTestRouter(t, service.NewHealthService(nil, nil))

	rec := httptest.NewRecorder()
	router.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/ready", nil))

	body := rec.Body.String()
	require.Equal(t, http.StatusOK, rec.Code)
	require.Contains(t, body, "\"ok\":true")
}
// TestCommonRoutes_SetupStatusUnchanged guards the existing /setup/status
// behaviour, which the frontend depends on, against regressions caused by the
// health-route changes.
func TestCommonRoutes_SetupStatusUnchanged(t *testing.T) {
	router := newTestRouter(t, service.NewHealthService(nil, nil))

	rec := httptest.NewRecorder()
	router.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/setup/status", nil))

	body := rec.Body.String()
	require.Equal(t, http.StatusOK, rec.Code)
	require.Contains(t, body, "needs_setup")
}

View File

@ -0,0 +1,119 @@
// Package service - HealthService 提供 liveness 与 readiness 探针。
//
// 设计动机:原有 /health 端点既被 docker-compose healthcheck 使用,又被
// dashboard 的 ops_health_score 复用——后者会触发 DB/Redis 等重操作,
// 导致探活流量污染监控指标。本服务把两类语义拆开:
// - Liveness : 仅证明进程存活(无外部依赖检查)。
// - Readiness : 检查 DB + Redis 连通,作为是否可接收流量的判断。
//
// dashboard 维度的"业务健康分"仍由 ops_health_score 计算,与本服务无关。
package service
import (
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"time"

	"github.com/redis/go-redis/v9"
)
// defaultReadinessTimeout is the default deadline for a readiness probe.
// Readiness probes need to fail fast so that probe requests do not pile up.
const defaultReadinessTimeout = 2 * time.Second
// ReadinessReport describes the state of every checked dependency so that
// callers can expose the details for troubleshooting.
type ReadinessReport struct {
	// OK is true only when every checked dependency is healthy.
	OK bool `json:"ok"`
	// Details maps a component name to its individual status.
	Details map[string]ComponentStatus `json:"details"`
	// Elapsed is the total time spent producing the report. It is serialised
	// by MarshalJSON as whole milliseconds under the "elapsed_ms" key.
	Elapsed time.Duration `json:"elapsed_ms"`
}

// MarshalJSON encodes the report with Elapsed converted to milliseconds.
// Without it, encoding/json would emit the raw time.Duration, which is a
// nanosecond count, under a key that promises milliseconds.
func (r ReadinessReport) MarshalJSON() ([]byte, error) {
	type wireReport struct {
		OK        bool                       `json:"ok"`
		Details   map[string]ComponentStatus `json:"details"`
		ElapsedMS int64                      `json:"elapsed_ms"`
	}
	return json.Marshal(wireReport{
		OK:        r.OK,
		Details:   r.Details,
		ElapsedMS: r.Elapsed.Milliseconds(),
	})
}

// ComponentStatus is the state of a single dependency.
// Error is empty when OK is true.
type ComponentStatus struct {
	// OK reports whether the dependency answered its probe successfully.
	OK bool `json:"ok"`
	// Error carries the failure message; omitted from JSON when empty.
	Error string `json:"error,omitempty"`
	// Elapsed is a human-readable duration string for the probe; omitted
	// from JSON when empty.
	Elapsed string `json:"elapsed,omitempty"`
}
// HealthService provides the liveness and readiness probes.
// Either dependency may be nil; Readiness skips a nil dependency, which is
// convenient for tests and for enabling checks in stages.
type HealthService struct {
	db      *sql.DB       // database handle pinged by the readiness check; may be nil
	rdb     *redis.Client // Redis client pinged by the readiness check; may be nil
	timeout time.Duration // deadline applied to each component's ping
}
// NewHealthService builds a HealthService around the given database handle
// and Redis client. The per-component timeout is always initialised to
// defaultReadinessTimeout; the constructor takes no timeout argument.
func NewHealthService(db *sql.DB, rdb *redis.Client) *HealthService {
	svc := new(HealthService)
	svc.db = db
	svc.rdb = rdb
	svc.timeout = defaultReadinessTimeout
	return svc
}
// Liveness always returns nil. Any caller that receives this result has proof
// that the process is responding to requests.
// It has no side effects and touches no dependencies, which keeps it cheap
// enough for high-frequency polling such as a Kubernetes livenessProbe.
func (s *HealthService) Liveness() error {
	return nil
}
// Readiness probes every configured dependency and reports whether all of
// them are reachable. If any probe fails, the report's OK is false; a nil
// dependency is skipped and does not appear in Details.
//
// The probes run concurrently, each under its own timeout derived from ctx,
// so a slow dependency cannot consume the time budget of another and the
// total latency is bounded by the slowest single probe rather than the sum.
func (s *HealthService) Readiness(ctx context.Context) ReadinessReport {
	start := time.Now()
	report := ReadinessReport{
		OK:      true,
		Details: make(map[string]ComponentStatus, 2),
	}

	type outcome struct {
		name   string
		status ComponentStatus
	}
	// Buffered to the maximum number of probes so every goroutine can hand
	// over its result and exit without waiting for the receiver.
	outcomes := make(chan outcome, 2)
	pending := 0

	if s.db != nil {
		pending++
		go func() {
			outcomes <- outcome{name: "database", status: s.checkDB(ctx)}
		}()
	}
	if s.rdb != nil {
		pending++
		go func() {
			outcomes <- outcome{name: "redis", status: s.checkRedis(ctx)}
		}()
	}

	// Only this goroutine touches the report, so the map needs no locking.
	for ; pending > 0; pending-- {
		res := <-outcomes
		report.Details[res.name] = res.status
		if !res.status.OK {
			report.OK = false
		}
	}

	report.Elapsed = time.Since(start)
	return report
}
// checkDB pings the database under a deadline of s.timeout derived from
// parent and returns the outcome together with how long the ping took.
func (s *HealthService) checkDB(parent context.Context) ComponentStatus {
	ctx, cancel := context.WithTimeout(parent, s.timeout)
	defer cancel()

	began := time.Now()
	pingErr := s.db.PingContext(ctx)
	took := time.Since(began).String()

	if pingErr == nil {
		return ComponentStatus{OK: true, Elapsed: took}
	}
	return ComponentStatus{Error: pingErr.Error(), Elapsed: took}
}
// checkRedis pings Redis under a deadline of s.timeout derived from parent.
// The component is healthy only when the ping succeeds and the reply is
// exactly "PONG"; the returned status also records how long the ping took.
func (s *HealthService) checkRedis(parent context.Context) ComponentStatus {
	ctx, cancel := context.WithTimeout(parent, s.timeout)
	defer cancel()

	began := time.Now()
	reply, pingErr := s.rdb.Ping(ctx).Result()
	result := ComponentStatus{Elapsed: time.Since(began).String()}

	switch {
	case pingErr != nil:
		result.Error = pingErr.Error()
	case reply != "PONG":
		result.Error = errors.New("unexpected redis ping response: " + reply).Error()
	default:
		result.OK = true
	}
	return result
}

View File

@ -0,0 +1,93 @@
package service
import (
"context"
"database/sql"
"errors"
"testing"
"time"
"github.com/DATA-DOG/go-sqlmock"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/require"
)
// TestHealthService_Liveness_AlwaysOK checks that Liveness reports no error
// even when no dependencies are configured.
func TestHealthService_Liveness_AlwaysOK(t *testing.T) {
	err := NewHealthService(nil, nil).Liveness()
	require.NoError(t, err)
}
// TestHealthService_Readiness_AllNilReturnsOK checks that readiness is OK
// with an empty detail map when every dependency is nil (early start-up or
// unit tests).
func TestHealthService_Readiness_AllNilReturnsOK(t *testing.T) {
	svc := NewHealthService(nil, nil)

	got := svc.Readiness(context.Background())

	require.True(t, got.OK)
	require.Empty(t, got.Details)
}
// TestHealthService_Readiness_DBPingFails checks that a failing database
// ping makes the report not OK and surfaces the underlying error text.
func TestHealthService_Readiness_DBPingFails(t *testing.T) {
	conn, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true))
	require.NoError(t, err)
	defer conn.Close()

	pingErr := errors.New("connection refused")
	mock.ExpectPing().WillReturnError(pingErr)

	svc := NewHealthService(conn, nil)
	got := svc.Readiness(context.Background())

	require.False(t, got.OK)
	require.Contains(t, got.Details, "database")
	dbStatus := got.Details["database"]
	require.False(t, dbStatus.OK)
	require.Contains(t, dbStatus.Error, "connection refused")
}
// TestHealthService_Readiness_DBOK checks that a successful database ping
// yields an OK report with an OK database entry.
func TestHealthService_Readiness_DBOK(t *testing.T) {
	conn, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true))
	require.NoError(t, err)
	defer conn.Close()

	mock.ExpectPing()

	svc := NewHealthService(conn, nil)
	got := svc.Readiness(context.Background())

	require.True(t, got.OK)
	dbStatus := got.Details["database"]
	require.True(t, dbStatus.OK)
}
// TestHealthService_Readiness_RedisFails checks that an unreachable Redis
// makes the report not OK and records a failing "redis" entry.
func TestHealthService_Readiness_RedisFails(t *testing.T) {
	// Target a port expected to be unreachable so the Redis ping fails quickly.
	opts := &redis.Options{
		Addr:        "127.0.0.1:1",
		DialTimeout: 200 * time.Millisecond,
		ReadTimeout: 200 * time.Millisecond,
	}
	client := redis.NewClient(opts)
	defer client.Close()

	svc := NewHealthService(nil, client)
	svc.timeout = 500 * time.Millisecond

	got := svc.Readiness(context.Background())

	require.False(t, got.OK)
	require.Contains(t, got.Details, "redis")
	redisStatus := got.Details["redis"]
	require.False(t, redisStatus.OK)
}
// TestHealthService_Readiness_PerComponentTimeout checks that readiness does
// not hang when a dependency stalls: the per-component timeout must cut the
// probe short and the timeout must show up as an error.
func TestHealthService_Readiness_PerComponentTimeout(t *testing.T) {
	conn, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true))
	require.NoError(t, err)
	defer conn.Close()

	mock.ExpectPing().WillDelayFor(2 * time.Second)

	svc := NewHealthService(conn, nil)
	svc.timeout = 100 * time.Millisecond

	began := time.Now()
	got := svc.Readiness(context.Background())
	took := time.Since(began)

	require.Less(t, took, 1*time.Second, "readiness should respect per-component timeout")
	require.False(t, got.OK)
	dbStatus := got.Details["database"]
	require.NotEmpty(t, dbStatus.Error, "timeout should propagate as an error")
}
// Blank reference that keeps the database/sql import in use so the file
// compiles; no other identifier from that package appears in the test code
// visible here.
var _ = sql.ErrNoRows

View File

@ -451,6 +451,7 @@ var ProviderSet = wire.NewSet(
ProvideSettingService,
NewDataManagementService,
ProvideBackupService,
NewHealthService,
ProvideOpsSystemLogSink,
NewOpsService,
ProvideOpsMetricsCollector,

View File

@ -168,7 +168,7 @@ services:
networks:
- sub2api-network
healthcheck:
test: ["CMD", "wget", "-q", "-T", "5", "-O", "/dev/null", "http://localhost:8080/health"]
test: ["CMD", "wget", "-q", "-T", "5", "-O", "/dev/null", "http://localhost:8080/ready"]
interval: 30s
timeout: 10s
retries: 3