diff --git a/backend/internal/pkg/apicompat/anthropic_responses_test.go b/backend/internal/pkg/apicompat/anthropic_responses_test.go
index bb566081..8997835c 100644
--- a/backend/internal/pkg/apicompat/anthropic_responses_test.go
+++ b/backend/internal/pkg/apicompat/anthropic_responses_test.go
@@ -1597,3 +1597,143 @@ func TestAnthropicToResponses_TemperatureStrippedForAllGpt5Variants(t *testing.T
 		})
 	}
 }
+
+// ---------------------------------------------------------------------------
+// AnthropicToResponsesResponse: Anthropic input_tokens excludes cached tokens
+// while OpenAI Responses input_tokens is the total including cached tokens.
+// ---------------------------------------------------------------------------
+
+func TestAnthropicToResponsesResponse_CacheTokensUseOpenAIInputSemantics(t *testing.T) {
+	resp := &AnthropicResponse{
+		ID:    "msg_cache",
+		Model: "claude-sonnet-4-5-20250929",
+		Content: []AnthropicContentBlock{
+			{Type: "text", Text: "ok"},
+		},
+		StopReason: "end_turn",
+		Usage: AnthropicUsage{
+			InputTokens:              3318,
+			OutputTokens:             123,
+			CacheReadInputTokens:     50688,
+			CacheCreationInputTokens: 200,
+		},
+	}
+
+	out := AnthropicToResponsesResponse(resp)
+	require.NotNil(t, out.Usage)
+	// 3318 (uncached) + 50688 (read) + 200 (creation) = 54206
+	assert.Equal(t, 54206, out.Usage.InputTokens)
+	assert.Equal(t, 123, out.Usage.OutputTokens)
+	assert.Equal(t, 54329, out.Usage.TotalTokens)
+	require.NotNil(t, out.Usage.InputTokensDetails)
+	assert.Equal(t, 50688, out.Usage.InputTokensDetails.CachedTokens)
+}
+
+func TestAnthropicToResponsesResponse_NoCacheTokens(t *testing.T) {
+	resp := &AnthropicResponse{
+		ID:    "msg_nocache",
+		Model: "claude-sonnet-4-5-20250929",
+		Content: []AnthropicContentBlock{
+			{Type: "text", Text: "ok"},
+		},
+		StopReason: "end_turn",
+		Usage: AnthropicUsage{
+			InputTokens:  100,
+			OutputTokens: 50,
+		},
+	}
+
+	out := AnthropicToResponsesResponse(resp)
+	require.NotNil(t, out.Usage)
+	assert.Equal(t, 100, out.Usage.InputTokens)
+	assert.Equal(t, 50, out.Usage.OutputTokens)
+	assert.Equal(t, 150, out.Usage.TotalTokens)
+	assert.Nil(t, out.Usage.InputTokensDetails)
+}
+
+func TestAnthropicEventToResponses_CacheTokensRoundTripFromMessageStart(t *testing.T) {
+	state := NewAnthropicEventToResponsesState()
+
+	// message_start carries cache fields on the initial Usage object.
+	AnthropicEventToResponsesEvents(&AnthropicStreamEvent{
+		Type: "message_start",
+		Message: &AnthropicResponse{
+			ID:    "msg_stream_cache",
+			Model: "claude-sonnet-4-5-20250929",
+			Usage: AnthropicUsage{
+				InputTokens:              12,
+				CacheReadInputTokens:     9,
+				CacheCreationInputTokens: 3,
+			},
+		},
+	}, state)
+
+	AnthropicEventToResponsesEvents(&AnthropicStreamEvent{
+		Type: "message_delta",
+		Usage: &AnthropicUsage{
+			OutputTokens: 7,
+		},
+	}, state)
+
+	events := AnthropicEventToResponsesEvents(&AnthropicStreamEvent{Type: "message_stop"}, state)
+
+	// The terminal response.completed event must include OpenAI-semantic usage.
+	var completed *ResponsesStreamEvent
+	for i := range events {
+		if events[i].Type == "response.completed" {
+			completed = &events[i]
+		}
+	}
+	require.NotNil(t, completed, "response.completed event must be emitted")
+	require.NotNil(t, completed.Response)
+	require.NotNil(t, completed.Response.Usage)
+	// 12 (uncached) + 9 (read) + 3 (creation) = 24
+	assert.Equal(t, 24, completed.Response.Usage.InputTokens)
+	assert.Equal(t, 7, completed.Response.Usage.OutputTokens)
+	assert.Equal(t, 31, completed.Response.Usage.TotalTokens)
+	require.NotNil(t, completed.Response.Usage.InputTokensDetails)
+	assert.Equal(t, 9, completed.Response.Usage.InputTokensDetails.CachedTokens)
+}
+
+func TestAnthropicEventToResponses_CacheTokensFromMessageDelta(t *testing.T) {
+	state := NewAnthropicEventToResponsesState()
+
+	AnthropicEventToResponsesEvents(&AnthropicStreamEvent{
+		Type: "message_start",
+		Message: &AnthropicResponse{
+			ID:    "msg_delta_cache",
+			Model: "claude-sonnet-4-5-20250929",
+			Usage: AnthropicUsage{InputTokens: 20},
+		},
+	}, state)
+
+	// Some upstreams only emit cache fields on the final message_delta.
+	AnthropicEventToResponsesEvents(&AnthropicStreamEvent{
+		Type: "message_delta",
+		Usage: &AnthropicUsage{
+			OutputTokens:             8,
+			CacheReadInputTokens:     11,
+			CacheCreationInputTokens: 4,
+		},
+	}, state)
+
+	events := AnthropicEventToResponsesEvents(&AnthropicStreamEvent{Type: "message_stop"}, state)
+
+	var completed *ResponsesStreamEvent
+	for i := range events {
+		if events[i].Type == "response.completed" {
+			completed = &events[i]
+		}
+	}
+	require.NotNil(t, completed, "response.completed event must be emitted")
+	// Check the Response pointer before dereferencing it, so a missing payload
+	// is reported as a test failure rather than a nil-pointer panic.
+	require.NotNil(t, completed.Response)
+	require.NotNil(t, completed.Response.Usage)
+	// 20 (uncached) + 11 (read) + 4 (creation) = 35
+	assert.Equal(t, 35, completed.Response.Usage.InputTokens)
+	assert.Equal(t, 8, completed.Response.Usage.OutputTokens)
+	assert.Equal(t, 43, completed.Response.Usage.TotalTokens)
+	require.NotNil(t, completed.Response.Usage.InputTokensDetails)
+	assert.Equal(t, 11, completed.Response.Usage.InputTokensDetails.CachedTokens)
+}
diff --git a/backend/internal/pkg/apicompat/anthropic_to_responses_response.go b/backend/internal/pkg/apicompat/anthropic_to_responses_response.go
index 9290e399..de8ab78d 100644
--- a/backend/internal/pkg/apicompat/anthropic_to_responses_response.go
+++ b/backend/internal/pkg/apicompat/anthropic_to_responses_response.go
@@ -95,10 +95,16 @@ func AnthropicToResponsesResponse(resp *AnthropicResponse) *ResponsesResponse {
 	}
 
 	// Usage
+	// Anthropic's input_tokens excludes cache_read/cache_creation, while OpenAI
+	// Responses' input_tokens is the total including cached tokens. Add them back
+	// when converting so downstream consumers see OpenAI semantics.
+	totalInputTokens := resp.Usage.InputTokens +
+		resp.Usage.CacheReadInputTokens +
+		resp.Usage.CacheCreationInputTokens
 	out.Usage = &ResponsesUsage{
-		InputTokens:  resp.Usage.InputTokens,
+		InputTokens:  totalInputTokens,
 		OutputTokens: resp.Usage.OutputTokens,
-		TotalTokens:  resp.Usage.InputTokens + resp.Usage.OutputTokens,
+		TotalTokens:  totalInputTokens + resp.Usage.OutputTokens,
 	}
 	if resp.Usage.CacheReadInputTokens > 0 {
 		out.Usage.InputTokensDetails = &ResponsesInputTokensDetails{
@@ -150,10 +156,13 @@ type AnthropicEventToResponsesState struct {
 	CurrentCallID string
 	CurrentName   string
 
-	// Usage from message_delta
-	InputTokens          int
-	OutputTokens         int
-	CacheReadInputTokens int
+	// Usage from message_start / message_delta. InputTokens follows Anthropic
+	// semantics (excludes cached tokens); the cache read/creation counts are
+	// added back when emitting the OpenAI Responses usage.
+	InputTokens              int
+	OutputTokens             int
+	CacheReadInputTokens     int
+	CacheCreationInputTokens int
 }
 
 // NewAnthropicEventToResponsesState returns an initialised stream state.
@@ -225,6 +234,12 @@ func anthToResHandleMessageStart(evt *AnthropicStreamEvent, state *AnthropicEven
 		if evt.Message.Usage.InputTokens > 0 {
 			state.InputTokens = evt.Message.Usage.InputTokens
 		}
+		if evt.Message.Usage.CacheReadInputTokens > 0 {
+			state.CacheReadInputTokens = evt.Message.Usage.CacheReadInputTokens
+		}
+		if evt.Message.Usage.CacheCreationInputTokens > 0 {
+			state.CacheCreationInputTokens = evt.Message.Usage.CacheCreationInputTokens
+		}
 	}
 
 	if state.CreatedSent {
@@ -392,9 +407,15 @@ func anthToResHandleMessageDelta(evt *AnthropicStreamEvent, state *AnthropicEven
 	// Update usage
 	if evt.Usage != nil {
 		state.OutputTokens = evt.Usage.OutputTokens
+		if evt.Usage.InputTokens > 0 {
+			state.InputTokens = evt.Usage.InputTokens
+		}
 		if evt.Usage.CacheReadInputTokens > 0 {
 			state.CacheReadInputTokens = evt.Usage.CacheReadInputTokens
 		}
+		if evt.Usage.CacheCreationInputTokens > 0 {
+			state.CacheCreationInputTokens = evt.Usage.CacheCreationInputTokens
+		}
 	}
 
 	return nil
@@ -472,10 +493,13 @@ func makeResponsesCompletedEvent(
 	seq := state.SequenceNumber
 	state.SequenceNumber++
 
+	// Anthropic's input_tokens excludes cache_read/cache_creation; add them
+	// back to match OpenAI Responses semantics where input_tokens is the total.
+	totalInputTokens := state.InputTokens + state.CacheReadInputTokens + state.CacheCreationInputTokens
 	usage := &ResponsesUsage{
-		InputTokens:  state.InputTokens,
+		InputTokens:  totalInputTokens,
 		OutputTokens: state.OutputTokens,
-		TotalTokens:  state.InputTokens + state.OutputTokens,
+		TotalTokens:  totalInputTokens + state.OutputTokens,
 	}
 	if state.CacheReadInputTokens > 0 {
 		usage.InputTokensDetails = &ResponsesInputTokensDetails{