fix(billing): apply long-context multiplier to cache_creation price

Follow-up to #2816 (already merged): the same gap that left cache_read exempt from long-context pricing also affects all three cache_creation price fields (standard, 5m ephemeral, 1h ephemeral).

computeCacheCreationCost reads these prices directly from pricing and never sees the LongContextInputMultiplier that computeTokenBreakdown applies to inputPrice / outputPrice / cacheReadPrice. For GPT-5.4 / 5.5 above the 272k threshold, this causes the cache_write portion of long sessions to be billed at roughly half of what it should be (default multiplier 2.0).

Cache writes are conceptually input-side operations and should share the same long-context treatment as input / cache_read.

This patch threads an explicit multiplier into computeCacheCreationCost so the function can be unit-tested in isolation and matches the existing pattern used for cache_read. computeTokenBreakdown captures the long-context decision once and passes LongContextInputMultiplier when it applies, 1.0 otherwise.

Adds three regression tests mirroring the #2816 cache_read tests:
- positive: long-context triggered -> cache_creation scaled by 2.0x
- negative: below threshold -> cache_creation stays at base price
- breakdown: 5m + 1h ephemeral prices both scaled when applicable

Refs #2816

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-27 09:59:58 +00:00 · 2026-05-27 09:59:58 +00:00 · ed2aac25a6
commit ed2aac25a6
parent b0142146af
2 changed files with 95 additions and 6 deletions
--- a/backend/internal/service/billing_service.go
+++ b/backend/internal/service/billing_service.go
@ -516,6 +516,7 @@ func (s *BillingService) computeTokenBreakdown(
 	inputPrice := pricing.InputPricePerToken
 	outputPrice := pricing.OutputPricePerToken
 	cacheReadPrice := pricing.CacheReadPricePerToken
+	cacheCreationMultiplier := 1.0
 	tierMultiplier := 1.0

 	if usePriorityServiceTierPricing(serviceTier, pricing) {
@ -538,6 +539,10 @@ func (s *BillingService) computeTokenBreakdown(
 		// 缓存读取本质上是输入侧的复用，应与 input 一同应用长上下文倍率；
 		// 否则 cache hit 越多，少计的费用越多（见 #2293）。
 		cacheReadPrice *= pricing.LongContextInputMultiplier
+		// 缓存创建（cache_write）也是输入侧操作，三档价格（标准 / 5m / 1h）
+		// 都通过 computeCacheCreationCost 直接读取 pricing.*，不会经过这里
+		// 的倍率修改，因此显式向下传一个倍率，避免长上下文场景下被漏乘。
+		cacheCreationMultiplier = pricing.LongContextInputMultiplier
 	}

 	bd := &CostBreakdown{}
@ -560,7 +565,7 @@ func (s *BillingService) computeTokenBreakdown(
 	}

 	// 缓存创建费用
-	bd.CacheCreationCost = s.computeCacheCreationCost(pricing, tokens)
+	bd.CacheCreationCost = s.computeCacheCreationCost(pricing, tokens, cacheCreationMultiplier)

 	bd.CacheReadCost = float64(tokens.CacheReadTokens) * cacheReadPrice

@ -580,16 +585,17 @@ func (s *BillingService) computeTokenBreakdown(
 }

 // computeCacheCreationCost 计算缓存创建费用（支持 5m/1h 分类或标准计费）。
-func (s *BillingService) computeCacheCreationCost(pricing *ModelPricing, tokens UsageTokens) float64 {
+// multiplier 用于长上下文等场景下的整体价格缩放（普通调用传 1.0 即可）。
+func (s *BillingService) computeCacheCreationCost(pricing *ModelPricing, tokens UsageTokens, multiplier float64) float64 {
 	if pricing.SupportsCacheBreakdown && (pricing.CacheCreation5mPrice > 0 || pricing.CacheCreation1hPrice > 0) {
 		if tokens.CacheCreation5mTokens == 0 && tokens.CacheCreation1hTokens == 0 && tokens.CacheCreationTokens > 0 {
 			// API 未返回 ephemeral 明细，回退到全部按 5m 单价计费
-			return float64(tokens.CacheCreationTokens) * pricing.CacheCreation5mPrice
+			return float64(tokens.CacheCreationTokens) * pricing.CacheCreation5mPrice * multiplier
 		}
-		return float64(tokens.CacheCreation5mTokens)*pricing.CacheCreation5mPrice +
-			float64(tokens.CacheCreation1hTokens)*pricing.CacheCreation1hPrice
+		return float64(tokens.CacheCreation5mTokens)*pricing.CacheCreation5mPrice*multiplier +
+			float64(tokens.CacheCreation1hTokens)*pricing.CacheCreation1hPrice*multiplier
 	}
-	return float64(tokens.CacheCreationTokens) * pricing.CacheCreationPricePerToken
+	return float64(tokens.CacheCreationTokens) * pricing.CacheCreationPricePerToken * multiplier
 }

 // calculatePerRequestCost 按次/图片计费
--- a/backend/internal/service/billing_service_test.go
+++ b/backend/internal/service/billing_service_test.go
@ -246,6 +246,89 @@ func TestCalculateCost_OpenAIGPT54NoLongContextKeepsCacheReadAtBasePrice(t *test
 		"cache_read_cost should remain at base price when below long-context threshold")
 }

+// 回归测试 #2816 follow-up：长上下文计费触发时，cache_creation_tokens 也应应用
+// LongContextInputMultiplier。computeCacheCreationCost 直接读取 pricing.* 价格，
+// 不经过 computeTokenBreakdown 内的 inputPrice / cacheReadPrice 倍率修改，因此
+// 修复前 cache_creation 部分会按基础价计算，少计费用约 50%（默认倍率 2.0）。
+func TestCalculateCost_OpenAIGPT54LongContextAppliesMultiplierToCacheCreation(t *testing.T) {
+	svc := newTestBillingService()
+
+	// InputTokens + CacheReadTokens = 1000 + 300000 = 301000 > 272000 阈值
+	tokens := UsageTokens{
+		InputTokens:         1000,
+		CacheReadTokens:     300000,
+		CacheCreationTokens: 10000,
+		OutputTokens:        1000,
+	}
+
+	cost, err := svc.CalculateCost("gpt-5.4-2026-03-05", tokens, 1.0)
+	require.NoError(t, err)
+
+	// gpt-5.4 fallback: CacheCreationPricePerToken = 2.5e-6, LongContextInputMultiplier = 2.0
+	expectedCacheCreation := float64(tokens.CacheCreationTokens) * 2.5e-6 * 2.0
+	require.InDelta(t, expectedCacheCreation, cost.CacheCreationCost, 1e-10,
+		"cache_creation_cost should be scaled by LongContextInputMultiplier when long-context pricing applies")
+}
+
+// 阴性测试：未触发长上下文时，cache_creation_price 不应被错误地乘以倍率。
+func TestCalculateCost_OpenAIGPT54NoLongContextKeepsCacheCreationAtBasePrice(t *testing.T) {
+	svc := newTestBillingService()
+
+	// InputTokens + CacheReadTokens = 1000 + 100000 = 101000 < 272000 阈值，不触发长上下文
+	tokens := UsageTokens{
+		InputTokens:         1000,
+		CacheReadTokens:     100000,
+		CacheCreationTokens: 10000,
+		OutputTokens:        1000,
+	}
+
+	cost, err := svc.CalculateCost("gpt-5.4-2026-03-05", tokens, 1.0)
+	require.NoError(t, err)
+
+	expectedCacheCreation := float64(tokens.CacheCreationTokens) * 2.5e-6
+	require.InDelta(t, expectedCacheCreation, cost.CacheCreationCost, 1e-10,
+		"cache_creation_cost should remain at base price when below long-context threshold")
+}
+
+// 覆盖 5m / 1h ephemeral 分类计费路径：长上下文触发时两档价格都应被倍率缩放。
+// 使用手工构造的 pricing（参考 TestCalculateCost_SupportsCacheBreakdown 的写法）
+// 以便同时控制 SupportsCacheBreakdown + 长上下文阈值。
+func TestCalculateCost_LongContextAppliesMultiplierToCacheCreation5mAnd1h(t *testing.T) {
+	svc := &BillingService{
+		cfg: &config.Config{},
+		fallbackPrices: map[string]*ModelPricing{
+			"claude-sonnet-4": {
+				InputPricePerToken:          3e-6,
+				OutputPricePerToken:         15e-6,
+				CacheReadPricePerToken:      0.3e-6,
+				SupportsCacheBreakdown:      true,
+				CacheCreation5mPrice:        4e-6,
+				CacheCreation1hPrice:        5e-6,
+				LongContextInputThreshold:   272000,
+				LongContextInputMultiplier:  2.0,
+				LongContextOutputMultiplier: 1.5,
+			},
+		},
+	}
+
+	// InputTokens + CacheReadTokens = 1000 + 300000 = 301000 > 272000 阈值
+	tokens := UsageTokens{
+		InputTokens:           1000,
+		CacheReadTokens:       300000,
+		CacheCreation5mTokens: 8000,
+		CacheCreation1hTokens: 4000,
+		OutputTokens:          1000,
+	}
+
+	cost, err := svc.CalculateCost("claude-sonnet-4", tokens, 1.0)
+	require.NoError(t, err)
+
+	expected5m := float64(tokens.CacheCreation5mTokens) * 4e-6 * 2.0
+	expected1h := float64(tokens.CacheCreation1hTokens) * 5e-6 * 2.0
+	require.InDelta(t, expected5m+expected1h, cost.CacheCreationCost, 1e-10,
+		"both 5m and 1h cache_creation prices should be scaled by LongContextInputMultiplier")
+}
+
 func TestGetFallbackPricing_FamilyMatching(t *testing.T) {
 	svc := newTestBillingService()