fix(gateway): sanitize stream errors to avoid leaking infrastructure topology
(*net.OpError).Error() concatenates Source/Addr fields, so the previous disconnectMsg surfaced internal source IP/port and upstream server address to clients via SSE error frames and UpstreamFailoverError.ResponseBody (reported by @Wei-Shaw on PR #2066). - Add sanitizeStreamError that maps known errors (io.ErrUnexpectedEOF, context.Canceled, syscall.ECONNRESET/EPIPE/ETIMEDOUT/...) to fixed descriptions and falls back to a generic placeholder, with an explicit *net.OpError branch that drops Source/Addr fields entirely. - Use sanitized message in client-facing disconnectMsg; full ev.err is still preserved in the existing operator log line for diagnosis. - Tests cover net.OpError redaction, the failover ResponseBody path, and every known sanitized error mapping.
This commit is contained in:
parent
4c474616b9
commit
d78478e866
@ -11,6 +11,7 @@ import (
|
||||
"io"
|
||||
"log/slog"
|
||||
mathrand "math/rand"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@ -20,6 +21,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||
@ -6434,6 +6436,49 @@ func (s *GatewayService) shouldFailoverOn400(respBody []byte) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// sanitizeStreamError 返回不含网络地址的客户端可见错误描述。
|
||||
// 默认 (*net.OpError).Error() 会拼接 Source/Addr 字段,泄露内部 IP/端口与上游
|
||||
// 服务器地址(例如 "read tcp 10.0.0.1:54321->52.1.2.3:443: read: connection
|
||||
// reset by peer")。该函数只保留可识别的错误类别,原始 err 仍在调用点写入日志。
|
||||
func sanitizeStreamError(err error) string {
|
||||
if err == nil {
|
||||
return ""
|
||||
}
|
||||
switch {
|
||||
case errors.Is(err, io.ErrUnexpectedEOF):
|
||||
return "unexpected EOF"
|
||||
case errors.Is(err, io.EOF):
|
||||
return "EOF"
|
||||
case errors.Is(err, context.Canceled):
|
||||
return "canceled"
|
||||
case errors.Is(err, context.DeadlineExceeded):
|
||||
return "deadline exceeded"
|
||||
case errors.Is(err, syscall.ECONNRESET):
|
||||
return "connection reset by peer"
|
||||
case errors.Is(err, syscall.ECONNABORTED):
|
||||
return "connection aborted"
|
||||
case errors.Is(err, syscall.ETIMEDOUT):
|
||||
return "connection timed out"
|
||||
case errors.Is(err, syscall.EPIPE):
|
||||
return "broken pipe"
|
||||
case errors.Is(err, syscall.ECONNREFUSED):
|
||||
return "connection refused"
|
||||
}
|
||||
var netErr *net.OpError
|
||||
if errors.As(err, &netErr) {
|
||||
if netErr.Timeout() {
|
||||
if netErr.Op != "" {
|
||||
return netErr.Op + " timeout"
|
||||
}
|
||||
return "i/o timeout"
|
||||
}
|
||||
if netErr.Op != "" {
|
||||
return netErr.Op + " network error"
|
||||
}
|
||||
}
|
||||
return "upstream connection error"
|
||||
}
|
||||
|
||||
// ExtractUpstreamErrorMessage 从上游响应体中提取错误消息
|
||||
// 支持 Claude 风格的错误格式:{"type":"error","error":{"type":"...","message":"..."}}
|
||||
func ExtractUpstreamErrorMessage(body []byte) string {
|
||||
@ -7061,7 +7106,10 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
|
||||
// 上游中途读错误(unexpected EOF / connection reset 等,常见于 HTTP/2 GOAWAY):
|
||||
// 若尚未向客户端写过任何字节,包成 UpstreamFailoverError 让 handler 层走 failover/重试。
|
||||
// 已经开始写流时 SSE 协议无 resume,只能透传错误事件给客户端。
|
||||
disconnectMsg := fmt.Sprintf("upstream stream disconnected: %s", ev.err)
|
||||
// 注意:面向客户端的 disconnectMsg 必须用 sanitizeStreamError 剥离地址,
|
||||
// 默认 *net.OpError 的 Error() 会泄露内部 IP/端口和上游地址。完整 ev.err
|
||||
// 仅在下方 LegacyPrintf 内部日志中保留供运维诊断。
|
||||
disconnectMsg := "upstream stream disconnected: " + sanitizeStreamError(ev.err)
|
||||
if !c.Writer.Written() {
|
||||
logger.LegacyPrintf("service.gateway", "Upstream stream read error before any client output (account=%d), failing over: %v", account.ID, ev.err)
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
|
||||
@ -6,8 +6,10 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -297,3 +299,97 @@ func TestHandleStreamingResponse_StreamReadErrorAfterOutput_PassesThrough(t *tes
|
||||
require.Contains(t, body, `"stream_read_error"`, "error.type 必须为 stream_read_error")
|
||||
require.Contains(t, body, "upstream stream disconnected", "error.message 必须包含具体根因,Claude Code 等客户端才能显示有效错误文案")
|
||||
}
|
||||
|
||||
// 默认 (*net.OpError).Error() 会拼接 Source/Addr 字段,泄露内部 IP/端口与上游
|
||||
// 服务器地址。sanitizeStreamError 必须剥离这些信息,避免基础设施拓扑通过
|
||||
// failover ResponseBody 或 SSE error 帧返回给客户端。
|
||||
func TestSanitizeStreamError_StripsNetworkAddresses(t *testing.T) {
|
||||
src, err := net.ResolveTCPAddr("tcp", "10.0.0.1:54321")
|
||||
require.NoError(t, err)
|
||||
dst, err := net.ResolveTCPAddr("tcp", "52.1.2.3:443")
|
||||
require.NoError(t, err)
|
||||
|
||||
raw := &net.OpError{
|
||||
Op: "read",
|
||||
Net: "tcp",
|
||||
Source: src,
|
||||
Addr: dst,
|
||||
Err: syscall.ECONNRESET,
|
||||
}
|
||||
|
||||
// 前置:原始 Error() 确实包含会泄露的字段(避免测试在 Go 行为变化时静默通过)
|
||||
require.Contains(t, raw.Error(), "10.0.0.1")
|
||||
require.Contains(t, raw.Error(), "52.1.2.3")
|
||||
|
||||
got := sanitizeStreamError(raw)
|
||||
require.NotContains(t, got, "10.0.0.1", "不得泄露内部源 IP")
|
||||
require.NotContains(t, got, "54321", "不得泄露源端口")
|
||||
require.NotContains(t, got, "52.1.2.3", "不得泄露上游目标 IP")
|
||||
require.NotContains(t, got, "443", "不得泄露上游端口")
|
||||
require.Equal(t, "connection reset by peer", got)
|
||||
}
|
||||
|
||||
func TestSanitizeStreamError_KnownErrors(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
err error
|
||||
want string
|
||||
}{
|
||||
{"unexpected EOF", io.ErrUnexpectedEOF, "unexpected EOF"},
|
||||
{"EOF", io.EOF, "EOF"},
|
||||
{"context canceled", context.Canceled, "canceled"},
|
||||
{"deadline exceeded", context.DeadlineExceeded, "deadline exceeded"},
|
||||
{"ECONNRESET 直接", syscall.ECONNRESET, "connection reset by peer"},
|
||||
{"EPIPE", syscall.EPIPE, "broken pipe"},
|
||||
{"ETIMEDOUT", syscall.ETIMEDOUT, "connection timed out"},
|
||||
{"未识别错误兜底", errors.New("weird internal error"), "upstream connection error"},
|
||||
{"nil 返回空串", nil, ""},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
require.Equal(t, tc.want, sanitizeStreamError(tc.err))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// failover ResponseBody 必须用 sanitize 过的消息,避免泄露给客户端 / 写入 ops 日志
|
||||
// 时携带内部地址信息。
|
||||
func TestHandleStreamingResponse_FailoverBodyDoesNotLeakAddresses(t *testing.T) {
|
||||
gin.SetMode(gin.TestMode)
|
||||
svc := newMinimalGatewayService()
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(rec)
|
||||
c.Request = httptest.NewRequest(http.MethodPost, "/v1/messages", nil)
|
||||
|
||||
src, _ := net.ResolveTCPAddr("tcp", "10.0.0.1:54321")
|
||||
dst, _ := net.ResolveTCPAddr("tcp", "52.1.2.3:443")
|
||||
netErr := &net.OpError{
|
||||
Op: "read",
|
||||
Net: "tcp",
|
||||
Source: src,
|
||||
Addr: dst,
|
||||
Err: syscall.ECONNRESET,
|
||||
}
|
||||
|
||||
resp := &http.Response{
|
||||
StatusCode: http.StatusOK,
|
||||
Header: http.Header{"Content-Type": []string{"text/event-stream"}},
|
||||
Body: &streamReadCloser{err: netErr},
|
||||
}
|
||||
|
||||
_, err := svc.handleStreamingResponse(context.Background(), resp, c, &Account{ID: 1}, time.Now(), "model", "model", false)
|
||||
require.Error(t, err)
|
||||
|
||||
var failoverErr *UpstreamFailoverError
|
||||
require.True(t, errors.As(err, &failoverErr))
|
||||
|
||||
body := string(failoverErr.ResponseBody)
|
||||
require.NotContains(t, body, "10.0.0.1", "failover ResponseBody 不得泄露内部源 IP")
|
||||
require.NotContains(t, body, "54321")
|
||||
require.NotContains(t, body, "52.1.2.3", "failover ResponseBody 不得泄露上游 IP")
|
||||
require.NotContains(t, body, "443")
|
||||
// 仍然包含可诊断的根因
|
||||
require.Contains(t, body, "connection reset by peer")
|
||||
require.Contains(t, body, "upstream stream disconnected")
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user