feat(health): split liveness and readiness probes
Add HealthService with Liveness (no-op) and Readiness (DB+Redis ping
with per-component timeout) checks. Expose three endpoints:
- /healthz : new liveness endpoint, zero-dependency, always 200
- /ready : new readiness endpoint, returns 503 with details on dep
failure; suitable for K8s readinessProbe and load balancers
- /health : preserved for backward compatibility, equivalent to
/healthz
Switch primary docker-compose healthcheck to /ready so the container
is only marked healthy once DB+Redis are reachable. Standalone/dev/
local compose files keep /health to avoid disrupting existing setups.
Tests: unit tests cover liveness, readiness with both deps healthy,
each dep failing independently, and per-component timeout enforcement.
This commit is contained in:
parent
d6df41feaa
commit
110902ad4b
@ -237,7 +237,8 @@ func initializeApplication(buildInfo handler.BuildInfo) (*Application, error) {
|
||||
jwtAuthMiddleware := middleware.NewJWTAuthMiddleware(authService, userService)
|
||||
adminAuthMiddleware := middleware.NewAdminAuthMiddleware(authService, userService, settingService)
|
||||
apiKeyAuthMiddleware := middleware.NewAPIKeyAuthMiddleware(apiKeyService, subscriptionService, configConfig)
|
||||
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, redisClient)
|
||||
healthService := service.NewHealthService(db, redisClient)
|
||||
engine := server.ProvideRouter(configConfig, handlers, jwtAuthMiddleware, adminAuthMiddleware, apiKeyAuthMiddleware, apiKeyService, subscriptionService, opsService, settingService, healthService, redisClient)
|
||||
httpServer := server.ProvideHTTPServer(configConfig, engine)
|
||||
opsMetricsCollector := service.ProvideOpsMetricsCollector(opsRepository, settingRepository, accountRepository, concurrencyService, db, redisClient, configConfig)
|
||||
opsAggregationService := service.ProvideOpsAggregationService(opsRepository, settingRepository, db, redisClient, configConfig)
|
||||
|
||||
@ -107,6 +107,7 @@ require (
|
||||
github.com/goccy/go-json v0.10.2 // indirect
|
||||
github.com/google/go-cmp v0.7.0 // indirect
|
||||
github.com/google/go-querystring v1.1.0 // indirect
|
||||
github.com/google/subcommands v1.2.0 // indirect
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect
|
||||
github.com/hashicorp/hcl v1.0.0 // indirect
|
||||
github.com/hashicorp/hcl/v2 v2.18.1 // indirect
|
||||
@ -176,6 +177,7 @@ require (
|
||||
golang.org/x/mod v0.32.0 // indirect
|
||||
golang.org/x/sys v0.41.0 // indirect
|
||||
golang.org/x/text v0.34.0 // indirect
|
||||
golang.org/x/tools v0.41.0 // indirect
|
||||
google.golang.org/grpc v1.75.1 // indirect
|
||||
google.golang.org/protobuf v1.36.10 // indirect
|
||||
gopkg.in/ini.v1 v1.67.0 // indirect
|
||||
|
||||
@ -180,6 +180,8 @@ github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/subcommands v1.2.0 h1:vWQspBTo2nEqTUFita5/KeEWlUL8kQObDFbub/EN9oE=
|
||||
github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/google/wire v0.7.0 h1:JxUKI6+CVBgCO2WToKy/nQk0sS+amI9z9EjVmdaocj4=
|
||||
|
||||
@ -35,6 +35,7 @@ func ProvideRouter(
|
||||
subscriptionService *service.SubscriptionService,
|
||||
opsService *service.OpsService,
|
||||
settingService *service.SettingService,
|
||||
healthService *service.HealthService,
|
||||
redisClient *redis.Client,
|
||||
) *gin.Engine {
|
||||
if cfg.Server.Mode == "release" {
|
||||
@ -56,7 +57,7 @@ func ProvideRouter(
|
||||
}
|
||||
}
|
||||
|
||||
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient)
|
||||
return SetupRouter(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, healthService, cfg, redisClient)
|
||||
}
|
||||
|
||||
// ProvideHTTPServer 提供 HTTP 服务器
|
||||
|
||||
@ -30,6 +30,7 @@ func SetupRouter(
|
||||
subscriptionService *service.SubscriptionService,
|
||||
opsService *service.OpsService,
|
||||
settingService *service.SettingService,
|
||||
healthService *service.HealthService,
|
||||
cfg *config.Config,
|
||||
redisClient *redis.Client,
|
||||
) *gin.Engine {
|
||||
@ -81,7 +82,7 @@ func SetupRouter(
|
||||
}
|
||||
|
||||
// 注册路由
|
||||
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, cfg, redisClient)
|
||||
registerRoutes(r, handlers, jwtAuth, adminAuth, apiKeyAuth, apiKeyService, subscriptionService, opsService, settingService, healthService, cfg, redisClient)
|
||||
|
||||
return r
|
||||
}
|
||||
@ -97,11 +98,12 @@ func registerRoutes(
|
||||
subscriptionService *service.SubscriptionService,
|
||||
opsService *service.OpsService,
|
||||
settingService *service.SettingService,
|
||||
healthService *service.HealthService,
|
||||
cfg *config.Config,
|
||||
redisClient *redis.Client,
|
||||
) {
|
||||
// 通用路由(健康检查、状态等)
|
||||
routes.RegisterCommonRoutes(r)
|
||||
routes.RegisterCommonRoutes(r, healthService)
|
||||
|
||||
// API v1
|
||||
v1 := r.Group("/api/v1")
|
||||
|
||||
@ -1,16 +1,45 @@
|
||||
package routes
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// RegisterCommonRoutes 注册通用路由(健康检查、状态等)
|
||||
func RegisterCommonRoutes(r *gin.Engine) {
|
||||
// 健康检查
|
||||
r.GET("/health", func(c *gin.Context) {
|
||||
// readinessHandlerTimeout caps how long the /ready handler waits before
// answering. HealthService applies its own timeout to every component check,
// so this outer bound only needs to be somewhat more generous than that.
const readinessHandlerTimeout time.Duration = 3 * time.Second
|
||||
|
||||
// RegisterCommonRoutes 注册通用路由(健康检查、状态等)。
|
||||
//
|
||||
// 健康端点的语义分层:
|
||||
// - /healthz : liveness 探针。零依赖、永远 200。容器/进程探活专用。
|
||||
// - /ready : readiness 探针。检查 DB+Redis;任一失败返回 503。
|
||||
// - /health : 历史端点,等价于 /healthz,保留向后兼容。
|
||||
//
|
||||
// dashboard 用的"业务健康分"由 ops_health_score 单独提供,与本路由无关。
|
||||
func RegisterCommonRoutes(r *gin.Engine, healthService *service.HealthService) {
|
||||
// Liveness:仅证明进程在响应。
|
||||
livenessHandler := func(c *gin.Context) {
|
||||
_ = healthService.Liveness()
|
||||
c.JSON(http.StatusOK, gin.H{"status": "ok"})
|
||||
}
|
||||
r.GET("/healthz", livenessHandler)
|
||||
r.GET("/health", livenessHandler) // 向后兼容旧的 docker-compose healthcheck
|
||||
|
||||
// Readiness:检查关键依赖。失败时返回 503 但仍带详情,便于排障。
|
||||
r.GET("/ready", func(c *gin.Context) {
|
||||
ctx, cancel := context.WithTimeout(c.Request.Context(), readinessHandlerTimeout)
|
||||
defer cancel()
|
||||
report := healthService.Readiness(ctx)
|
||||
status := http.StatusOK
|
||||
if !report.OK {
|
||||
status = http.StatusServiceUnavailable
|
||||
}
|
||||
c.JSON(status, report)
|
||||
})
|
||||
|
||||
// Claude Code 遥测日志(忽略,直接返回200)
|
||||
|
||||
49
backend/internal/server/routes/common_test.go
Normal file
49
backend/internal/server/routes/common_test.go
Normal file
@ -0,0 +1,49 @@
|
||||
package routes
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/service"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func newTestRouter(t *testing.T, hs *service.HealthService) *gin.Engine {
|
||||
t.Helper()
|
||||
gin.SetMode(gin.TestMode)
|
||||
r := gin.New()
|
||||
RegisterCommonRoutes(r, hs)
|
||||
return r
|
||||
}
|
||||
|
||||
func TestCommonRoutes_LivenessEndpoints(t *testing.T) {
|
||||
r := newTestRouter(t, service.NewHealthService(nil, nil))
|
||||
for _, path := range []string{"/healthz", "/health"} {
|
||||
req := httptest.NewRequest(http.MethodGet, path, nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
require.Equal(t, http.StatusOK, w.Code, "liveness path %s should be 200", path)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCommonRoutes_ReadyEndpoint_NoDepsReturnsOK(t *testing.T) {
|
||||
// 没有 DB/Redis 依赖时 readiness 视为 ok(早期启动场景)。
|
||||
r := newTestRouter(t, service.NewHealthService(nil, nil))
|
||||
req := httptest.NewRequest(http.MethodGet, "/ready", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
require.Equal(t, http.StatusOK, w.Code)
|
||||
require.Contains(t, w.Body.String(), "\"ok\":true")
|
||||
}
|
||||
|
||||
func TestCommonRoutes_SetupStatusUnchanged(t *testing.T) {
|
||||
// 验证我们没有破坏既有的 /setup/status 行为(前端依赖)。
|
||||
r := newTestRouter(t, service.NewHealthService(nil, nil))
|
||||
req := httptest.NewRequest(http.MethodGet, "/setup/status", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
require.Equal(t, http.StatusOK, w.Code)
|
||||
require.Contains(t, w.Body.String(), "needs_setup")
|
||||
}
|
||||
119
backend/internal/service/health_service.go
Normal file
119
backend/internal/service/health_service.go
Normal file
@ -0,0 +1,119 @@
|
||||
// Package service - HealthService 提供 liveness 与 readiness 探针。
|
||||
//
|
||||
// 设计动机:原有 /health 端点既被 docker-compose healthcheck 使用,又被
|
||||
// dashboard 的 ops_health_score 复用——后者会触发 DB/Redis 等重操作,
|
||||
// 导致探活流量污染监控指标。本服务把两类语义拆开:
|
||||
// - Liveness : 仅证明进程存活(无外部依赖检查)。
|
||||
// - Readiness : 检查 DB + Redis 连通,作为是否可接收流量的判断。
|
||||
//
|
||||
// dashboard 维度的"业务健康分"仍由 ops_health_score 计算,与本服务无关。
|
||||
package service
|
||||
|
||||
import (
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"time"

	"github.com/redis/go-redis/v9"
)
|
||||
|
||||
// defaultReadinessTimeout is the per-component timeout a HealthService uses
// unless overridden. Readiness probes should fail fast rather than pile up.
const defaultReadinessTimeout = 2 * time.Second
|
||||
|
||||
// ReadinessReport summarizes one readiness probe run: the overall verdict,
// the per-dependency results keyed by component name, and the total
// wall-clock time spent.
type ReadinessReport struct {
	// OK is true only when every checked component succeeded.
	OK bool `json:"ok"`
	// Details maps a component name (e.g. "database", "redis") to its result.
	Details map[string]ComponentStatus `json:"details"`
	// Elapsed is the total probe duration. MarshalJSON serializes it as whole
	// milliseconds under the "elapsed_ms" key; the tag documents that key.
	Elapsed time.Duration `json:"elapsed_ms"`
}

// MarshalJSON encodes the report with Elapsed expressed in whole milliseconds.
//
// Without this method encoding/json would emit the raw time.Duration value,
// i.e. nanoseconds, under a key named "elapsed_ms", which misleads consumers
// by six orders of magnitude. A value receiver is used so that both
// ReadinessReport and *ReadinessReport are encoded the same way.
func (r ReadinessReport) MarshalJSON() ([]byte, error) {
	// wire mirrors the public JSON shape; it has no methods, so marshaling it
	// cannot recurse back into this function.
	type wire struct {
		OK        bool                       `json:"ok"`
		Details   map[string]ComponentStatus `json:"details"`
		ElapsedMS int64                      `json:"elapsed_ms"`
	}
	return json.Marshal(wire{
		OK:        r.OK,
		Details:   r.Details,
		ElapsedMS: r.Elapsed.Milliseconds(),
	})
}

// ComponentStatus is the result of checking a single dependency. Error is
// left empty when OK is true; Elapsed is a human-readable duration string.
type ComponentStatus struct {
	OK      bool   `json:"ok"`
	Error   string `json:"error,omitempty"`
	Elapsed string `json:"elapsed,omitempty"`
}
|
||||
|
||||
// HealthService 提供 liveness/readiness 探针。
|
||||
// 字段都允许为 nil:缺失的依赖在 readiness 中自动跳过,便于测试和分阶段启用。
|
||||
type HealthService struct {
|
||||
db *sql.DB
|
||||
rdb *redis.Client
|
||||
timeout time.Duration
|
||||
}
|
||||
|
||||
// NewHealthService 构造函数。timeout<=0 时使用默认值。
|
||||
func NewHealthService(db *sql.DB, rdb *redis.Client) *HealthService {
|
||||
return &HealthService{
|
||||
db: db,
|
||||
rdb: rdb,
|
||||
timeout: defaultReadinessTimeout,
|
||||
}
|
||||
}
|
||||
|
||||
// Liveness 仅返回 nil。任何调用方能拿到这个返回值就说明进程在响应请求。
|
||||
// 保持无副作用、零依赖,便于 K8s livenessProbe 高频调用。
|
||||
func (s *HealthService) Liveness() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Readiness 检查所有外部依赖。任一失败则整体 OK=false。
|
||||
// 单个依赖的 ctx 超时由 timeout 控制,独立计时不互相阻塞。
|
||||
func (s *HealthService) Readiness(ctx context.Context) ReadinessReport {
|
||||
start := time.Now()
|
||||
report := ReadinessReport{
|
||||
OK: true,
|
||||
Details: make(map[string]ComponentStatus, 2),
|
||||
}
|
||||
|
||||
if s.db != nil {
|
||||
report.Details["database"] = s.checkDB(ctx)
|
||||
if !report.Details["database"].OK {
|
||||
report.OK = false
|
||||
}
|
||||
}
|
||||
if s.rdb != nil {
|
||||
report.Details["redis"] = s.checkRedis(ctx)
|
||||
if !report.Details["redis"].OK {
|
||||
report.OK = false
|
||||
}
|
||||
}
|
||||
|
||||
report.Elapsed = time.Since(start)
|
||||
return report
|
||||
}
|
||||
|
||||
func (s *HealthService) checkDB(parent context.Context) ComponentStatus {
|
||||
ctx, cancel := context.WithTimeout(parent, s.timeout)
|
||||
defer cancel()
|
||||
start := time.Now()
|
||||
err := s.db.PingContext(ctx)
|
||||
status := ComponentStatus{Elapsed: time.Since(start).String()}
|
||||
if err != nil {
|
||||
status.Error = err.Error()
|
||||
return status
|
||||
}
|
||||
status.OK = true
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *HealthService) checkRedis(parent context.Context) ComponentStatus {
|
||||
ctx, cancel := context.WithTimeout(parent, s.timeout)
|
||||
defer cancel()
|
||||
start := time.Now()
|
||||
pong, err := s.rdb.Ping(ctx).Result()
|
||||
status := ComponentStatus{Elapsed: time.Since(start).String()}
|
||||
if err != nil {
|
||||
status.Error = err.Error()
|
||||
return status
|
||||
}
|
||||
if pong != "PONG" {
|
||||
status.Error = errors.New("unexpected redis ping response: " + pong).Error()
|
||||
return status
|
||||
}
|
||||
status.OK = true
|
||||
return status
|
||||
}
|
||||
93
backend/internal/service/health_service_test.go
Normal file
93
backend/internal/service/health_service_test.go
Normal file
@ -0,0 +1,93 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestHealthService_Liveness_AlwaysOK(t *testing.T) {
|
||||
s := NewHealthService(nil, nil)
|
||||
require.NoError(t, s.Liveness())
|
||||
}
|
||||
|
||||
func TestHealthService_Readiness_AllNilReturnsOK(t *testing.T) {
|
||||
// 当所有依赖都为 nil 时(早期启动或 unit test),readiness 应直接 OK。
|
||||
s := NewHealthService(nil, nil)
|
||||
report := s.Readiness(context.Background())
|
||||
require.True(t, report.OK)
|
||||
require.Empty(t, report.Details)
|
||||
}
|
||||
|
||||
func TestHealthService_Readiness_DBPingFails(t *testing.T) {
|
||||
db, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true))
|
||||
require.NoError(t, err)
|
||||
defer db.Close()
|
||||
|
||||
mock.ExpectPing().WillReturnError(errors.New("connection refused"))
|
||||
|
||||
s := NewHealthService(db, nil)
|
||||
report := s.Readiness(context.Background())
|
||||
require.False(t, report.OK)
|
||||
require.Contains(t, report.Details, "database")
|
||||
require.False(t, report.Details["database"].OK)
|
||||
require.Contains(t, report.Details["database"].Error, "connection refused")
|
||||
}
|
||||
|
||||
func TestHealthService_Readiness_DBOK(t *testing.T) {
|
||||
db, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true))
|
||||
require.NoError(t, err)
|
||||
defer db.Close()
|
||||
|
||||
mock.ExpectPing()
|
||||
|
||||
s := NewHealthService(db, nil)
|
||||
report := s.Readiness(context.Background())
|
||||
require.True(t, report.OK)
|
||||
require.True(t, report.Details["database"].OK)
|
||||
}
|
||||
|
||||
func TestHealthService_Readiness_RedisFails(t *testing.T) {
|
||||
// 指向一个不可达端口让 redis ping 立刻失败。
|
||||
rdb := redis.NewClient(&redis.Options{
|
||||
Addr: "127.0.0.1:1",
|
||||
DialTimeout: 200 * time.Millisecond,
|
||||
ReadTimeout: 200 * time.Millisecond,
|
||||
})
|
||||
defer rdb.Close()
|
||||
|
||||
s := NewHealthService(nil, rdb)
|
||||
s.timeout = 500 * time.Millisecond
|
||||
report := s.Readiness(context.Background())
|
||||
require.False(t, report.OK)
|
||||
require.Contains(t, report.Details, "redis")
|
||||
require.False(t, report.Details["redis"].OK)
|
||||
}
|
||||
|
||||
func TestHealthService_Readiness_PerComponentTimeout(t *testing.T) {
|
||||
// 验证 readiness 在超时时不会无限挂住。
|
||||
db, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true))
|
||||
require.NoError(t, err)
|
||||
defer db.Close()
|
||||
mock.ExpectPing().WillDelayFor(2 * time.Second)
|
||||
|
||||
s := NewHealthService(db, nil)
|
||||
s.timeout = 100 * time.Millisecond
|
||||
|
||||
start := time.Now()
|
||||
report := s.Readiness(context.Background())
|
||||
elapsed := time.Since(start)
|
||||
|
||||
require.Less(t, elapsed, 1*time.Second, "readiness should respect per-component timeout")
|
||||
require.False(t, report.OK)
|
||||
require.NotEmpty(t, report.Details["database"].Error, "timeout should propagate as an error")
|
||||
}
|
||||
|
||||
// Blank reference that keeps the database/sql import in use so the compiler
// does not reject it as unused.
var _ error = sql.ErrNoRows
|
||||
@ -451,6 +451,7 @@ var ProviderSet = wire.NewSet(
|
||||
ProvideSettingService,
|
||||
NewDataManagementService,
|
||||
ProvideBackupService,
|
||||
NewHealthService,
|
||||
ProvideOpsSystemLogSink,
|
||||
NewOpsService,
|
||||
ProvideOpsMetricsCollector,
|
||||
|
||||
@ -168,7 +168,7 @@ services:
|
||||
networks:
|
||||
- sub2api-network
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-q", "-T", "5", "-O", "/dev/null", "http://localhost:8080/health"]
|
||||
test: ["CMD", "wget", "-q", "-T", "5", "-O", "/dev/null", "http://localhost:8080/ready"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user