fix(billing): apply long-context multiplier to cache_creation price
Follow-up to #2816 (already merged): the same unintended exemption from long-context pricing that affected cache_read also applies to all three cache_creation price fields (standard, 5m ephemeral, 1h ephemeral). computeCacheCreationCost reads these prices directly from pricing and never sees the LongContextInputMultiplier that computeTokenBreakdown applies to inputPrice / outputPrice / cacheReadPrice. For GPT-5.4 / 5.5 above the 272k threshold, this causes the cache_write portion of long sessions to be billed at roughly half of what it should be (default multiplier 2.0).

Cache writes are conceptually input-side operations and should share the same long-context treatment as input / cache_read. This patch threads an explicit multiplier into computeCacheCreationCost so that the function can be unit-tested in isolation and matches the existing pattern used for cache_read. computeTokenBreakdown makes the long-context decision once and passes LongContextInputMultiplier when it applies, or 1.0 otherwise.

Adds three regression tests mirroring the #2816 cache_read tests:
- positive: long-context triggered -> cache_creation scaled by 2.0x
- negative: below threshold -> cache_creation stays at base price
- breakdown: 5m + 1h ephemeral prices both scaled when applicable

Refs #2816
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
b0142146af
commit
ed2aac25a6
@ -516,6 +516,7 @@ func (s *BillingService) computeTokenBreakdown(
|
||||
inputPrice := pricing.InputPricePerToken
|
||||
outputPrice := pricing.OutputPricePerToken
|
||||
cacheReadPrice := pricing.CacheReadPricePerToken
|
||||
cacheCreationMultiplier := 1.0
|
||||
tierMultiplier := 1.0
|
||||
|
||||
if usePriorityServiceTierPricing(serviceTier, pricing) {
|
||||
@ -538,6 +539,10 @@ func (s *BillingService) computeTokenBreakdown(
|
||||
// 缓存读取本质上是输入侧的复用,应与 input 一同应用长上下文倍率;
|
||||
// 否则 cache hit 越多,少计的费用越多(见 #2293)。
|
||||
cacheReadPrice *= pricing.LongContextInputMultiplier
|
||||
// 缓存创建(cache_write)也是输入侧操作,三档价格(标准 / 5m / 1h)
|
||||
// 都通过 computeCacheCreationCost 直接读取 pricing.*,不会经过这里
|
||||
// 的倍率修改,因此显式向下传一个倍率,避免长上下文场景下被漏乘。
|
||||
cacheCreationMultiplier = pricing.LongContextInputMultiplier
|
||||
}
|
||||
|
||||
bd := &CostBreakdown{}
|
||||
@ -560,7 +565,7 @@ func (s *BillingService) computeTokenBreakdown(
|
||||
}
|
||||
|
||||
// 缓存创建费用
|
||||
bd.CacheCreationCost = s.computeCacheCreationCost(pricing, tokens)
|
||||
bd.CacheCreationCost = s.computeCacheCreationCost(pricing, tokens, cacheCreationMultiplier)
|
||||
|
||||
bd.CacheReadCost = float64(tokens.CacheReadTokens) * cacheReadPrice
|
||||
|
||||
@ -580,16 +585,17 @@ func (s *BillingService) computeTokenBreakdown(
|
||||
}
|
||||
|
||||
// computeCacheCreationCost computes the cache-creation cost (supports the 5m/1h breakdown or standard billing).
|
||||
func (s *BillingService) computeCacheCreationCost(pricing *ModelPricing, tokens UsageTokens) float64 {
|
||||
// multiplier scales the overall price in cases such as long context (ordinary calls simply pass 1.0).
|
||||
func (s *BillingService) computeCacheCreationCost(pricing *ModelPricing, tokens UsageTokens, multiplier float64) float64 {
|
||||
if pricing.SupportsCacheBreakdown && (pricing.CacheCreation5mPrice > 0 || pricing.CacheCreation1hPrice > 0) {
|
||||
if tokens.CacheCreation5mTokens == 0 && tokens.CacheCreation1hTokens == 0 && tokens.CacheCreationTokens > 0 {
|
||||
// The API returned no ephemeral breakdown; fall back to billing everything at the 5m unit price.
|
||||
return float64(tokens.CacheCreationTokens) * pricing.CacheCreation5mPrice
|
||||
return float64(tokens.CacheCreationTokens) * pricing.CacheCreation5mPrice * multiplier
|
||||
}
|
||||
return float64(tokens.CacheCreation5mTokens)*pricing.CacheCreation5mPrice +
|
||||
float64(tokens.CacheCreation1hTokens)*pricing.CacheCreation1hPrice
|
||||
return float64(tokens.CacheCreation5mTokens)*pricing.CacheCreation5mPrice*multiplier +
|
||||
float64(tokens.CacheCreation1hTokens)*pricing.CacheCreation1hPrice*multiplier
|
||||
}
|
||||
return float64(tokens.CacheCreationTokens) * pricing.CacheCreationPricePerToken
|
||||
return float64(tokens.CacheCreationTokens) * pricing.CacheCreationPricePerToken * multiplier
|
||||
}
|
||||
|
||||
// calculatePerRequestCost 按次/图片计费
|
||||
|
||||
@ -246,6 +246,89 @@ func TestCalculateCost_OpenAIGPT54NoLongContextKeepsCacheReadAtBasePrice(t *test
|
||||
"cache_read_cost should remain at base price when below long-context threshold")
|
||||
}
|
||||
|
||||
// Regression test for the #2816 follow-up: when long-context billing is triggered, cache_creation_tokens must also have
|
||||
// LongContextInputMultiplier applied. computeCacheCreationCost reads the pricing.* prices directly,
|
||||
// bypassing the inputPrice / cacheReadPrice multiplier adjustments inside computeTokenBreakdown, so
|
||||
// before the fix the cache_creation portion was billed at base price, under-charging by about 50% (default multiplier 2.0).
|
||||
func TestCalculateCost_OpenAIGPT54LongContextAppliesMultiplierToCacheCreation(t *testing.T) {
|
||||
svc := newTestBillingService()
|
||||
|
||||
// InputTokens + CacheReadTokens = 1000 + 300000 = 301000 > the 272000 threshold
|
||||
tokens := UsageTokens{
|
||||
InputTokens: 1000,
|
||||
CacheReadTokens: 300000,
|
||||
CacheCreationTokens: 10000,
|
||||
OutputTokens: 1000,
|
||||
}
|
||||
|
||||
cost, err := svc.CalculateCost("gpt-5.4-2026-03-05", tokens, 1.0)
|
||||
require.NoError(t, err)
|
||||
|
||||
// gpt-5.4 fallback: CacheCreationPricePerToken = 2.5e-6, LongContextInputMultiplier = 2.0
|
||||
expectedCacheCreation := float64(tokens.CacheCreationTokens) * 2.5e-6 * 2.0
|
||||
require.InDelta(t, expectedCacheCreation, cost.CacheCreationCost, 1e-10,
|
||||
"cache_creation_cost should be scaled by LongContextInputMultiplier when long-context pricing applies")
|
||||
}
|
||||
|
||||
// Negative test: when long context is not triggered, cache_creation_price must not be wrongly multiplied by the multiplier.
|
||||
func TestCalculateCost_OpenAIGPT54NoLongContextKeepsCacheCreationAtBasePrice(t *testing.T) {
|
||||
svc := newTestBillingService()
|
||||
|
||||
// InputTokens + CacheReadTokens = 1000 + 100000 = 101000 < the 272000 threshold, so long context is not triggered
|
||||
tokens := UsageTokens{
|
||||
InputTokens: 1000,
|
||||
CacheReadTokens: 100000,
|
||||
CacheCreationTokens: 10000,
|
||||
OutputTokens: 1000,
|
||||
}
|
||||
|
||||
cost, err := svc.CalculateCost("gpt-5.4-2026-03-05", tokens, 1.0)
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedCacheCreation := float64(tokens.CacheCreationTokens) * 2.5e-6
|
||||
require.InDelta(t, expectedCacheCreation, cost.CacheCreationCost, 1e-10,
|
||||
"cache_creation_cost should remain at base price when below long-context threshold")
|
||||
}
|
||||
|
||||
// Covers the 5m / 1h ephemeral breakdown billing path: when long context is triggered, both price tiers must be scaled by the multiplier.
|
||||
// Uses a hand-built pricing (following the approach of TestCalculateCost_SupportsCacheBreakdown)
|
||||
// so that SupportsCacheBreakdown and the long-context threshold can be controlled together.
|
||||
func TestCalculateCost_LongContextAppliesMultiplierToCacheCreation5mAnd1h(t *testing.T) {
|
||||
svc := &BillingService{
|
||||
cfg: &config.Config{},
|
||||
fallbackPrices: map[string]*ModelPricing{
|
||||
"claude-sonnet-4": {
|
||||
InputPricePerToken: 3e-6,
|
||||
OutputPricePerToken: 15e-6,
|
||||
CacheReadPricePerToken: 0.3e-6,
|
||||
SupportsCacheBreakdown: true,
|
||||
CacheCreation5mPrice: 4e-6,
|
||||
CacheCreation1hPrice: 5e-6,
|
||||
LongContextInputThreshold: 272000,
|
||||
LongContextInputMultiplier: 2.0,
|
||||
LongContextOutputMultiplier: 1.5,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// InputTokens + CacheReadTokens = 1000 + 300000 = 301000 > the 272000 threshold
|
||||
tokens := UsageTokens{
|
||||
InputTokens: 1000,
|
||||
CacheReadTokens: 300000,
|
||||
CacheCreation5mTokens: 8000,
|
||||
CacheCreation1hTokens: 4000,
|
||||
OutputTokens: 1000,
|
||||
}
|
||||
|
||||
cost, err := svc.CalculateCost("claude-sonnet-4", tokens, 1.0)
|
||||
require.NoError(t, err)
|
||||
|
||||
expected5m := float64(tokens.CacheCreation5mTokens) * 4e-6 * 2.0
|
||||
expected1h := float64(tokens.CacheCreation1hTokens) * 5e-6 * 2.0
|
||||
require.InDelta(t, expected5m+expected1h, cost.CacheCreationCost, 1e-10,
|
||||
"both 5m and 1h cache_creation prices should be scaled by LongContextInputMultiplier")
|
||||
}
|
||||
|
||||
func TestGetFallbackPricing_FamilyMatching(t *testing.T) {
|
||||
svc := newTestBillingService()
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user