fix(billing): apply long-context multiplier to cache_creation price

Follow-up to #2816 (already merged): the same long-context pricing
exemption that affected cache_read also applies to all three
cache_creation price fields (standard, 5m ephemeral, 1h ephemeral).
computeCacheCreationCost reads these prices directly from pricing and
never sees the LongContextInputMultiplier that computeTokenBreakdown
applies to inputPrice / outputPrice / cacheReadPrice.

For GPT-5.4 / 5.5 above the 272k threshold, this causes the cache_write
portion of long sessions to be billed at roughly half what it should
be (default multiplier 2.0). Cache writes are conceptually input-side
operations and should share the same long-context treatment as input /
cache_read.

This patch threads an explicit multiplier into computeCacheCreationCost
so the function can be unit-tested in isolation and matches the existing
pattern used for cache_read. computeTokenBreakdown makes the long-context
decision once and passes LongContextInputMultiplier when it applies, and
1.0 otherwise.

Adds three regression tests mirroring the #2816 cache_read tests:
- positive: long-context triggered -> cache_creation scaled by 2.0x
- negative: below threshold -> cache_creation stays at base price
- breakdown: 5m + 1h ephemeral prices both scaled when applicable

Refs #2816

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Pluviobyte 2026-05-27 09:59:58 +00:00
parent b0142146af
commit ed2aac25a6
No known key found for this signature in database
2 changed files with 95 additions and 6 deletions

View File

@ -516,6 +516,7 @@ func (s *BillingService) computeTokenBreakdown(
inputPrice := pricing.InputPricePerToken
outputPrice := pricing.OutputPricePerToken
cacheReadPrice := pricing.CacheReadPricePerToken
cacheCreationMultiplier := 1.0
tierMultiplier := 1.0
if usePriorityServiceTierPricing(serviceTier, pricing) {
@ -538,6 +539,10 @@ func (s *BillingService) computeTokenBreakdown(
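// A cache read is essentially input-side reuse, so it takes the long-context multiplier together with input;
// otherwise the more cache hits there are, the more cost goes unbilled (see #2293).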
cacheReadPrice *= pricing.LongContextInputMultiplier
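// Cache creation (cache_write) is also an input-side operation. Its three price tiers (standard / 5m / 1h)
// are read directly from pricing.* by computeCacheCreationCost and never pass through the multiplier
// adjustments made here, so a multiplier is passed down explicitly to keep long-context requests from being under-billed.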
cacheCreationMultiplier = pricing.LongContextInputMultiplier
}
bd := &CostBreakdown{}
@ -560,7 +565,7 @@ func (s *BillingService) computeTokenBreakdown(
}
// 缓存创建费用
bd.CacheCreationCost = s.computeCacheCreationCost(pricing, tokens)
bd.CacheCreationCost = s.computeCacheCreationCost(pricing, tokens, cacheCreationMultiplier)
bd.CacheReadCost = float64(tokens.CacheReadTokens) * cacheReadPrice
@ -580,16 +585,17 @@ func (s *BillingService) computeTokenBreakdown(
}
// computeCacheCreationCost computes the cache-creation cost (supports either the 5m/1h breakdown or standard billing).
func (s *BillingService) computeCacheCreationCost(pricing *ModelPricing, tokens UsageTokens) float64 {
// multiplier scales the overall price in cases such as long context (ordinary calls simply pass 1.0).
func (s *BillingService) computeCacheCreationCost(pricing *ModelPricing, tokens UsageTokens, multiplier float64) float64 {
if pricing.SupportsCacheBreakdown && (pricing.CacheCreation5mPrice > 0 || pricing.CacheCreation1hPrice > 0) {
if tokens.CacheCreation5mTokens == 0 && tokens.CacheCreation1hTokens == 0 && tokens.CacheCreationTokens > 0 {
// The API returned no ephemeral breakdown; fall back to billing everything at the 5m unit price.
return float64(tokens.CacheCreationTokens) * pricing.CacheCreation5mPrice
return float64(tokens.CacheCreationTokens) * pricing.CacheCreation5mPrice * multiplier
}
return float64(tokens.CacheCreation5mTokens)*pricing.CacheCreation5mPrice +
float64(tokens.CacheCreation1hTokens)*pricing.CacheCreation1hPrice
return float64(tokens.CacheCreation5mTokens)*pricing.CacheCreation5mPrice*multiplier +
float64(tokens.CacheCreation1hTokens)*pricing.CacheCreation1hPrice*multiplier
}
return float64(tokens.CacheCreationTokens) * pricing.CacheCreationPricePerToken
return float64(tokens.CacheCreationTokens) * pricing.CacheCreationPricePerToken * multiplier
}
// calculatePerRequestCost 按次/图片计费

View File

@ -246,6 +246,89 @@ func TestCalculateCost_OpenAIGPT54NoLongContextKeepsCacheReadAtBasePrice(t *test
"cache_read_cost should remain at base price when below long-context threshold")
}
// Regression test (follow-up to #2816): once long-context pricing is triggered,
// cache_creation_tokens must also be charged with LongContextInputMultiplier.
// computeCacheCreationCost reads its prices straight from pricing.* and does not
// go through the inputPrice / cacheReadPrice scaling inside computeTokenBreakdown,
// so before the fix the cache_creation portion was billed at the base price,
// roughly 50% too low with the default multiplier of 2.0.
func TestCalculateCost_OpenAIGPT54LongContextAppliesMultiplierToCacheCreation(t *testing.T) {
	// gpt-5.4 fallback: CacheCreationPricePerToken = 2.5e-6, LongContextInputMultiplier = 2.0
	const (
		baseCacheCreationPrice = 2.5e-6
		longContextMultiplier  = 2.0
	)

	billing := newTestBillingService()

	// InputTokens + CacheReadTokens = 1000 + 300000 = 301000, above the 272000 threshold.
	usage := UsageTokens{
		InputTokens:         1000,
		CacheReadTokens:     300000,
		CacheCreationTokens: 10000,
		OutputTokens:        1000,
	}

	breakdown, err := billing.CalculateCost("gpt-5.4-2026-03-05", usage, 1.0)
	require.NoError(t, err)

	want := float64(usage.CacheCreationTokens) * baseCacheCreationPrice * longContextMultiplier
	require.InDelta(t, want, breakdown.CacheCreationCost, 1e-10,
		"cache_creation_cost should be scaled by LongContextInputMultiplier when long-context pricing applies")
}
// Negative test: when long-context pricing is not triggered, the cache_creation
// price must not be mistakenly multiplied.
func TestCalculateCost_OpenAIGPT54NoLongContextKeepsCacheCreationAtBasePrice(t *testing.T) {
	const baseCacheCreationPrice = 2.5e-6

	billing := newTestBillingService()

	// InputTokens + CacheReadTokens = 1000 + 100000 = 101000, below the 272000
	// threshold, so long-context pricing does not apply.
	usage := UsageTokens{
		InputTokens:         1000,
		CacheReadTokens:     100000,
		CacheCreationTokens: 10000,
		OutputTokens:        1000,
	}

	breakdown, err := billing.CalculateCost("gpt-5.4-2026-03-05", usage, 1.0)
	require.NoError(t, err)

	want := float64(usage.CacheCreationTokens) * baseCacheCreationPrice
	require.InDelta(t, want, breakdown.CacheCreationCost, 1e-10,
		"cache_creation_cost should remain at base price when below long-context threshold")
}
// 覆盖 5m / 1h ephemeral 分类计费路径:长上下文触发时两档价格都应被倍率缩放。
// 使用手工构造的 pricing参考 TestCalculateCost_SupportsCacheBreakdown 的写法)
// 以便同时控制 SupportsCacheBreakdown + 长上下文阈值。
func TestCalculateCost_LongContextAppliesMultiplierToCacheCreation5mAnd1h(t *testing.T) {
svc := &BillingService{
cfg: &config.Config{},
fallbackPrices: map[string]*ModelPricing{
"claude-sonnet-4": {
InputPricePerToken: 3e-6,
OutputPricePerToken: 15e-6,
CacheReadPricePerToken: 0.3e-6,
SupportsCacheBreakdown: true,
CacheCreation5mPrice: 4e-6,
CacheCreation1hPrice: 5e-6,
LongContextInputThreshold: 272000,
LongContextInputMultiplier: 2.0,
LongContextOutputMultiplier: 1.5,
},
},
}
// InputTokens + CacheReadTokens = 1000 + 300000 = 301000 > 272000 阈值
tokens := UsageTokens{
InputTokens: 1000,
CacheReadTokens: 300000,
CacheCreation5mTokens: 8000,
CacheCreation1hTokens: 4000,
OutputTokens: 1000,
}
cost, err := svc.CalculateCost("claude-sonnet-4", tokens, 1.0)
require.NoError(t, err)
expected5m := float64(tokens.CacheCreation5mTokens) * 4e-6 * 2.0
expected1h := float64(tokens.CacheCreation1hTokens) * 5e-6 * 2.0
require.InDelta(t, expected5m+expected1h, cost.CacheCreationCost, 1e-10,
"both 5m and 1h cache_creation prices should be scaled by LongContextInputMultiplier")
}
func TestGetFallbackPricing_FamilyMatching(t *testing.T) {
svc := newTestBillingService()