From ed2aac25a6134976027d9b9de1d1665ea828be47 Mon Sep 17 00:00:00 2001 From: Pluviobyte Date: Wed, 27 May 2026 09:59:58 +0000 Subject: [PATCH] fix(billing): apply long-context multiplier to cache_creation price Follow-up to #2816 (already merged): the same long-context pricing omission that affected cache_read also affects all three cache_creation price fields (standard, 5m ephemeral, 1h ephemeral). computeCacheCreationCost reads these prices directly from pricing and never sees the long-context multipliers that computeTokenBreakdown applies to its local copies of inputPrice / outputPrice / cacheReadPrice. For GPT-5.4 / 5.5 above the 272k threshold, this causes the cache_write portion of long sessions to be billed at roughly half what it should be (default multiplier 2.0). Cache writes are conceptually input-side operations and should share the same long-context treatment as input / cache_read. This patch threads an explicit multiplier into computeCacheCreationCost so the function can be unit-tested in isolation and matches the existing pattern used for cache_read. computeTokenBreakdown captures the long-context decision once and passes LongContextInputMultiplier when it applies, 1.0 otherwise. 
Adds three regression tests mirroring the #2816 cache_read tests: - positive: long-context triggered -> cache_creation scaled by 2.0x - negative: below threshold -> cache_creation stays at base price - breakdown: 5m + 1h ephemeral prices both scaled when applicable Refs #2816 Co-authored-by: Cursor --- backend/internal/service/billing_service.go | 18 ++-- .../internal/service/billing_service_test.go | 83 +++++++++++++++++++ 2 files changed, 95 insertions(+), 6 deletions(-) diff --git a/backend/internal/service/billing_service.go b/backend/internal/service/billing_service.go index de68b755..940a827d 100644 --- a/backend/internal/service/billing_service.go +++ b/backend/internal/service/billing_service.go @@ -516,6 +516,7 @@ func (s *BillingService) computeTokenBreakdown( inputPrice := pricing.InputPricePerToken outputPrice := pricing.OutputPricePerToken cacheReadPrice := pricing.CacheReadPricePerToken + cacheCreationMultiplier := 1.0 tierMultiplier := 1.0 if usePriorityServiceTierPricing(serviceTier, pricing) { @@ -538,6 +539,10 @@ func (s *BillingService) computeTokenBreakdown( // 缓存读取本质上是输入侧的复用,应与 input 一同应用长上下文倍率; // 否则 cache hit 越多,少计的费用越多(见 #2293)。 cacheReadPrice *= pricing.LongContextInputMultiplier + // 缓存创建(cache_write)也是输入侧操作,三档价格(标准 / 5m / 1h) + // 都通过 computeCacheCreationCost 直接读取 pricing.*,不会经过这里 + // 的倍率修改,因此显式向下传一个倍率,避免长上下文场景下被漏乘。 + cacheCreationMultiplier = pricing.LongContextInputMultiplier } bd := &CostBreakdown{} @@ -560,7 +565,7 @@ func (s *BillingService) computeTokenBreakdown( } // 缓存创建费用 - bd.CacheCreationCost = s.computeCacheCreationCost(pricing, tokens) + bd.CacheCreationCost = s.computeCacheCreationCost(pricing, tokens, cacheCreationMultiplier) bd.CacheReadCost = float64(tokens.CacheReadTokens) * cacheReadPrice @@ -580,16 +585,17 @@ func (s *BillingService) computeTokenBreakdown( } // computeCacheCreationCost 计算缓存创建费用(支持 5m/1h 分类或标准计费)。 -func (s *BillingService) computeCacheCreationCost(pricing *ModelPricing, tokens UsageTokens) float64 { 
+// multiplier 用于长上下文等场景下的整体价格缩放(普通调用传 1.0 即可)。 +func (s *BillingService) computeCacheCreationCost(pricing *ModelPricing, tokens UsageTokens, multiplier float64) float64 { if pricing.SupportsCacheBreakdown && (pricing.CacheCreation5mPrice > 0 || pricing.CacheCreation1hPrice > 0) { if tokens.CacheCreation5mTokens == 0 && tokens.CacheCreation1hTokens == 0 && tokens.CacheCreationTokens > 0 { // API 未返回 ephemeral 明细,回退到全部按 5m 单价计费 - return float64(tokens.CacheCreationTokens) * pricing.CacheCreation5mPrice + return float64(tokens.CacheCreationTokens) * pricing.CacheCreation5mPrice * multiplier } - return float64(tokens.CacheCreation5mTokens)*pricing.CacheCreation5mPrice + - float64(tokens.CacheCreation1hTokens)*pricing.CacheCreation1hPrice + return float64(tokens.CacheCreation5mTokens)*pricing.CacheCreation5mPrice*multiplier + + float64(tokens.CacheCreation1hTokens)*pricing.CacheCreation1hPrice*multiplier } - return float64(tokens.CacheCreationTokens) * pricing.CacheCreationPricePerToken + return float64(tokens.CacheCreationTokens) * pricing.CacheCreationPricePerToken * multiplier } // calculatePerRequestCost 按次/图片计费 diff --git a/backend/internal/service/billing_service_test.go b/backend/internal/service/billing_service_test.go index 73677526..0ab1f50d 100644 --- a/backend/internal/service/billing_service_test.go +++ b/backend/internal/service/billing_service_test.go @@ -246,6 +246,89 @@ func TestCalculateCost_OpenAIGPT54NoLongContextKeepsCacheReadAtBasePrice(t *test "cache_read_cost should remain at base price when below long-context threshold") } +// 回归测试 #2816 follow-up:长上下文计费触发时,cache_creation_tokens 也应应用 +// LongContextInputMultiplier。computeCacheCreationCost 直接读取 pricing.* 价格, +// 不经过 computeTokenBreakdown 内的 inputPrice / cacheReadPrice 倍率修改,因此 +// 修复前 cache_creation 部分会按基础价计算,少计费用约 50%(默认倍率 2.0)。 +func TestCalculateCost_OpenAIGPT54LongContextAppliesMultiplierToCacheCreation(t *testing.T) { + svc := newTestBillingService() + + // InputTokens + CacheReadTokens = 
1000 + 300000 = 301000 > 272000 阈值 + tokens := UsageTokens{ + InputTokens: 1000, + CacheReadTokens: 300000, + CacheCreationTokens: 10000, + OutputTokens: 1000, + } + + cost, err := svc.CalculateCost("gpt-5.4-2026-03-05", tokens, 1.0) + require.NoError(t, err) + + // gpt-5.4 fallback: CacheCreationPricePerToken = 2.5e-6, LongContextInputMultiplier = 2.0 + expectedCacheCreation := float64(tokens.CacheCreationTokens) * 2.5e-6 * 2.0 + require.InDelta(t, expectedCacheCreation, cost.CacheCreationCost, 1e-10, + "cache_creation_cost should be scaled by LongContextInputMultiplier when long-context pricing applies") +} + +// 阴性测试:未触发长上下文时,cache_creation_price 不应被错误地乘以倍率。 +func TestCalculateCost_OpenAIGPT54NoLongContextKeepsCacheCreationAtBasePrice(t *testing.T) { + svc := newTestBillingService() + + // InputTokens + CacheReadTokens = 1000 + 100000 = 101000 < 272000 阈值,不触发长上下文 + tokens := UsageTokens{ + InputTokens: 1000, + CacheReadTokens: 100000, + CacheCreationTokens: 10000, + OutputTokens: 1000, + } + + cost, err := svc.CalculateCost("gpt-5.4-2026-03-05", tokens, 1.0) + require.NoError(t, err) + + expectedCacheCreation := float64(tokens.CacheCreationTokens) * 2.5e-6 + require.InDelta(t, expectedCacheCreation, cost.CacheCreationCost, 1e-10, + "cache_creation_cost should remain at base price when below long-context threshold") +} + +// 覆盖 5m / 1h ephemeral 分类计费路径:长上下文触发时两档价格都应被倍率缩放。 +// 使用手工构造的 pricing(参考 TestCalculateCost_SupportsCacheBreakdown 的写法) +// 以便同时控制 SupportsCacheBreakdown + 长上下文阈值。 +func TestCalculateCost_LongContextAppliesMultiplierToCacheCreation5mAnd1h(t *testing.T) { + svc := &BillingService{ + cfg: &config.Config{}, + fallbackPrices: map[string]*ModelPricing{ + "claude-sonnet-4": { + InputPricePerToken: 3e-6, + OutputPricePerToken: 15e-6, + CacheReadPricePerToken: 0.3e-6, + SupportsCacheBreakdown: true, + CacheCreation5mPrice: 4e-6, + CacheCreation1hPrice: 5e-6, + LongContextInputThreshold: 272000, + LongContextInputMultiplier: 2.0, + 
LongContextOutputMultiplier: 1.5, + }, + }, + } + + // InputTokens + CacheReadTokens = 1000 + 300000 = 301000 > 272000 阈值 + tokens := UsageTokens{ + InputTokens: 1000, + CacheReadTokens: 300000, + CacheCreation5mTokens: 8000, + CacheCreation1hTokens: 4000, + OutputTokens: 1000, + } + + cost, err := svc.CalculateCost("claude-sonnet-4", tokens, 1.0) + require.NoError(t, err) + + expected5m := float64(tokens.CacheCreation5mTokens) * 4e-6 * 2.0 + expected1h := float64(tokens.CacheCreation1hTokens) * 5e-6 * 2.0 + require.InDelta(t, expected5m+expected1h, cost.CacheCreationCost, 1e-10, + "both 5m and 1h cache_creation prices should be scaled by LongContextInputMultiplier") +} + func TestGetFallbackPricing_FamilyMatching(t *testing.T) { svc := newTestBillingService()