Add HealthService with Liveness (no-op) and Readiness (DB+Redis ping
with per-component timeout) checks. Expose three endpoints:
- /healthz : new liveness endpoint, zero-dependency, always 200
- /ready : new readiness endpoint, returns 503 with details on dep
failure; suitable for K8s readinessProbe and load balancers
- /health : preserved for backward compatibility, equivalent to
/healthz
Switch primary docker-compose healthcheck to /ready so the container
is only marked healthy once DB+Redis are reachable. Standalone/dev/
local compose files keep /health to avoid disrupting existing setups.
Tests: unit tests cover liveness, readiness with both deps healthy,
each dep failing independently, and per-component timeout enforcement.
120 lines
3.5 KiB
Go
120 lines
3.5 KiB
Go
// Package service - HealthService 提供 liveness 与 readiness 探针。
|
||
//
|
||
// 设计动机:原有 /health 端点既被 docker-compose healthcheck 使用,又被
|
||
// dashboard 的 ops_health_score 复用——后者会触发 DB/Redis 等重操作,
|
||
// 导致探活流量污染监控指标。本服务把两类语义拆开:
|
||
// - Liveness : 仅证明进程存活(无外部依赖检查)。
|
||
// - Readiness : 检查 DB + Redis 连通,作为是否可接收流量的判断。
|
||
//
|
||
// dashboard 维度的"业务健康分"仍由 ops_health_score 计算,与本服务无关。
|
||
package service
|
||
|
||
import (
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"time"

	"github.com/redis/go-redis/v9"
)
|
||
|
||
// defaultReadinessTimeout is the timeout NewHealthService assigns to a
// HealthService. The readiness probe has to fail fast so that probe requests
// do not pile up behind a slow dependency.
const defaultReadinessTimeout = 2 * time.Second
|
||
|
||
// ReadinessReport 描述各依赖项的状态,便于上层暴露细节给排障。
|
||
type ReadinessReport struct {
|
||
OK bool `json:"ok"`
|
||
Details map[string]ComponentStatus `json:"details"`
|
||
Elapsed time.Duration `json:"elapsed_ms"`
|
||
}
|
||
|
||
// ComponentStatus is the probe result for a single dependency.
type ComponentStatus struct {
	// OK reports whether the dependency passed its check.
	OK bool `json:"ok"`

	// Error holds the failure message. It is empty when OK is true and is
	// then omitted from the JSON output.
	Error string `json:"error,omitempty"`

	// Elapsed is how long the check took, as a formatted duration string.
	// It is omitted from the JSON output when empty.
	Elapsed string `json:"elapsed,omitempty"`
}
|
||
|
||
// HealthService 提供 liveness/readiness 探针。
|
||
// 字段都允许为 nil:缺失的依赖在 readiness 中自动跳过,便于测试和分阶段启用。
|
||
type HealthService struct {
|
||
db *sql.DB
|
||
rdb *redis.Client
|
||
timeout time.Duration
|
||
}
|
||
|
||
// NewHealthService 构造函数。timeout<=0 时使用默认值。
|
||
func NewHealthService(db *sql.DB, rdb *redis.Client) *HealthService {
|
||
return &HealthService{
|
||
db: db,
|
||
rdb: rdb,
|
||
timeout: defaultReadinessTimeout,
|
||
}
|
||
}
|
||
|
||
// Liveness 仅返回 nil。任何调用方能拿到这个返回值就说明进程在响应请求。
|
||
// 保持无副作用、零依赖,便于 K8s livenessProbe 高频调用。
|
||
func (s *HealthService) Liveness() error {
|
||
return nil
|
||
}
|
||
|
||
// Readiness 检查所有外部依赖。任一失败则整体 OK=false。
|
||
// 单个依赖的 ctx 超时由 timeout 控制,独立计时不互相阻塞。
|
||
func (s *HealthService) Readiness(ctx context.Context) ReadinessReport {
|
||
start := time.Now()
|
||
report := ReadinessReport{
|
||
OK: true,
|
||
Details: make(map[string]ComponentStatus, 2),
|
||
}
|
||
|
||
if s.db != nil {
|
||
report.Details["database"] = s.checkDB(ctx)
|
||
if !report.Details["database"].OK {
|
||
report.OK = false
|
||
}
|
||
}
|
||
if s.rdb != nil {
|
||
report.Details["redis"] = s.checkRedis(ctx)
|
||
if !report.Details["redis"].OK {
|
||
report.OK = false
|
||
}
|
||
}
|
||
|
||
report.Elapsed = time.Since(start)
|
||
return report
|
||
}
|
||
|
||
func (s *HealthService) checkDB(parent context.Context) ComponentStatus {
|
||
ctx, cancel := context.WithTimeout(parent, s.timeout)
|
||
defer cancel()
|
||
start := time.Now()
|
||
err := s.db.PingContext(ctx)
|
||
status := ComponentStatus{Elapsed: time.Since(start).String()}
|
||
if err != nil {
|
||
status.Error = err.Error()
|
||
return status
|
||
}
|
||
status.OK = true
|
||
return status
|
||
}
|
||
|
||
func (s *HealthService) checkRedis(parent context.Context) ComponentStatus {
|
||
ctx, cancel := context.WithTimeout(parent, s.timeout)
|
||
defer cancel()
|
||
start := time.Now()
|
||
pong, err := s.rdb.Ping(ctx).Result()
|
||
status := ComponentStatus{Elapsed: time.Since(start).String()}
|
||
if err != nil {
|
||
status.Error = err.Error()
|
||
return status
|
||
}
|
||
if pong != "PONG" {
|
||
status.Error = errors.New("unexpected redis ping response: " + pong).Error()
|
||
return status
|
||
}
|
||
status.OK = true
|
||
return status
|
||
}
|