sub2api/backend/internal/service/health_service.go
win 110902ad4b feat(health): split liveness and readiness probes
Add HealthService with Liveness (no-op) and Readiness (DB+Redis ping
with per-component timeout) checks. Expose three endpoints:

- /healthz : new liveness endpoint, zero-dependency, always 200
- /ready   : new readiness endpoint, returns 503 with details on dep
             failure; suitable for K8s readinessProbe and load balancers
- /health  : preserved for backward compatibility, equivalent to
             /healthz

Switch primary docker-compose healthcheck to /ready so the container
is only marked healthy once DB+Redis are reachable. Standalone/dev/
local compose files keep /health to avoid disrupting existing setups.

Tests: unit tests cover liveness, readiness with both deps healthy,
each dep failing independently, and per-component timeout enforcement.
2026-04-28 23:39:50 +08:00

120 lines
3.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Package service - HealthService provides the liveness and readiness probes.
//
// Design motivation: the original /health endpoint was used by the
// docker-compose healthcheck and was also reused by the dashboard's
// ops_health_score. The latter triggers heavy operations such as DB/Redis
// access, so probe traffic ended up polluting the monitoring metrics. This
// service separates the two meanings:
//   - Liveness : only proves the process is alive (no external dependency checks).
//   - Readiness : checks DB + Redis connectivity to decide whether traffic can be accepted.
//
// The dashboard-level "business health score" is still computed by
// ops_health_score and is unrelated to this service.
package service
import (
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"time"

	"github.com/redis/go-redis/v9"
)
// defaultReadinessTimeout is the default deadline applied to each readiness
// dependency check. Readiness probes should fail fast instead of piling up
// behind a slow dependency.
const defaultReadinessTimeout = 2 * time.Second
// ReadinessReport describes the state of every checked dependency so that
// upper layers can expose the details for troubleshooting.
//
// Elapsed stays a time.Duration for Go callers. When the report is encoded as
// JSON it is written under "elapsed_ms" as a whole number of milliseconds;
// without the custom marshaller below, encoding/json would emit the raw
// nanosecond count under a key that promises milliseconds.
type ReadinessReport struct {
	OK      bool                       `json:"ok"`
	Details map[string]ComponentStatus `json:"details"`
	Elapsed time.Duration              `json:"elapsed_ms"`
}

// readinessReportJSON is the JSON wire form of ReadinessReport, with the
// elapsed time expressed in milliseconds.
type readinessReportJSON struct {
	OK        bool                       `json:"ok"`
	Details   map[string]ComponentStatus `json:"details"`
	ElapsedMS int64                      `json:"elapsed_ms"`
}

// MarshalJSON encodes the report with Elapsed converted to whole milliseconds
// (sub-millisecond remainders are truncated) so the value matches the
// "elapsed_ms" key.
func (r ReadinessReport) MarshalJSON() ([]byte, error) {
	return json.Marshal(readinessReportJSON{
		OK:        r.OK,
		Details:   r.Details,
		ElapsedMS: r.Elapsed.Milliseconds(),
	})
}

// UnmarshalJSON is the inverse of MarshalJSON: it reads "elapsed_ms" as
// milliseconds and stores it in Elapsed as a time.Duration.
func (r *ReadinessReport) UnmarshalJSON(data []byte) error {
	var wire readinessReportJSON
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}
	r.OK = wire.OK
	r.Details = wire.Details
	r.Elapsed = time.Duration(wire.ElapsedMS) * time.Millisecond
	return nil
}

// ComponentStatus is the state of a single dependency. Error is empty when
// OK is true. Elapsed is a human-readable duration string.
type ComponentStatus struct {
	OK      bool   `json:"ok"`
	Error   string `json:"error,omitempty"`
	Elapsed string `json:"elapsed,omitempty"`
}
// HealthService provides the liveness/readiness probes.
// The dependency fields may be nil; a missing dependency is skipped by the
// readiness check, which is convenient for tests and staged rollout.
type HealthService struct {
// db is the SQL handle pinged by the readiness check; nil skips the check.
db *sql.DB
// rdb is the Redis client pinged by the readiness check; nil skips the check.
rdb *redis.Client
// timeout bounds each individual dependency check.
timeout time.Duration
}
// NewHealthService builds a HealthService around the given database handle
// and Redis client. The per-dependency timeout is always set to
// defaultReadinessTimeout; the constructor takes no timeout argument.
func NewHealthService(db *sql.DB, rdb *redis.Client) *HealthService {
	svc := new(HealthService)
	svc.db = db
	svc.rdb = rdb
	svc.timeout = defaultReadinessTimeout
	return svc
}
// Liveness only returns nil. Any caller that receives this return value
// knows the process is responding to requests.
// It is kept side-effect free and dependency free so that a K8s livenessProbe
// can call it at high frequency.
func (s *HealthService) Liveness() error {
return nil
}
// Readiness probes every configured dependency and reports the outcome.
//
// The database is checked first and Redis second, one after the other. A nil
// dependency is skipped and gets no entry in Details. Each check derives its
// own timeout from ctx, so one check's deadline does not shorten the other's.
// OK is false if any executed check fails. Elapsed is the wall-clock time
// spent in this call.
func (s *HealthService) Readiness(ctx context.Context) ReadinessReport {
	began := time.Now()
	details := make(map[string]ComponentStatus, 2)
	healthy := true

	if s.db != nil {
		dbStatus := s.checkDB(ctx)
		details["database"] = dbStatus
		healthy = healthy && dbStatus.OK
	}

	if s.rdb != nil {
		redisStatus := s.checkRedis(ctx)
		details["redis"] = redisStatus
		healthy = healthy && redisStatus.OK
	}

	return ReadinessReport{
		OK:      healthy,
		Details: details,
		Elapsed: time.Since(began),
	}
}
// checkDB pings the database under a timeout derived from parent and returns
// the result. Elapsed records how long the ping took; Error carries the ping
// error text when the ping fails.
func (s *HealthService) checkDB(parent context.Context) ComponentStatus {
	pingCtx, release := context.WithTimeout(parent, s.timeout)
	defer release()

	began := time.Now()
	pingErr := s.db.PingContext(pingCtx)
	took := time.Since(began).String()

	if pingErr == nil {
		return ComponentStatus{OK: true, Elapsed: took}
	}
	return ComponentStatus{Error: pingErr.Error(), Elapsed: took}
}
// checkRedis sends PING to Redis under a timeout derived from parent and
// returns the result. The check passes only when the call succeeds and the
// reply is exactly "PONG"; otherwise Error describes the failure. Elapsed
// records how long the ping took.
func (s *HealthService) checkRedis(parent context.Context) ComponentStatus {
	pingCtx, release := context.WithTimeout(parent, s.timeout)
	defer release()

	began := time.Now()
	reply, pingErr := s.rdb.Ping(pingCtx).Result()
	result := ComponentStatus{Elapsed: time.Since(began).String()}

	switch {
	case pingErr != nil:
		result.Error = pingErr.Error()
	case reply != "PONG":
		unexpected := errors.New("unexpected redis ping response: " + reply)
		result.Error = unexpected.Error()
	default:
		result.OK = true
	}
	return result
}