fix: pin fixed anthropic/fantasy forks for streaming token accounting (#24077)

2026-06-02 20:48:20 +00:00 · 2026-04-08 19:07:39 +02:00
parent 543c448b72
commit 590235138f
9 changed files with 237 additions and 23 deletions
@@ -86,6 +86,54 @@ func TestRun_ActiveToolsPrepareBehavior(t *testing.T) {
 	require.True(t, hasAnthropicEphemeralCacheControl(capturedCall.Prompt[4]))
 }

+func TestProcessStepStream_AnthropicUsageMatchesFinalDelta(t *testing.T) {
+	t.Parallel()
+
+	model := &loopTestModel{
+		provider: fantasyanthropic.Name,
+		streamFn: func(_ context.Context, _ fantasy.Call) (fantasy.StreamResponse, error) {
+			return streamFromParts([]fantasy.StreamPart{
+				{Type: fantasy.StreamPartTypeTextStart, ID: "text-1"},
+				{Type: fantasy.StreamPartTypeTextDelta, ID: "text-1", Delta: "cached response"},
+				{Type: fantasy.StreamPartTypeTextEnd, ID: "text-1"},
+				{
+					Type: fantasy.StreamPartTypeFinish,
+					Usage: fantasy.Usage{
+						InputTokens:         200,
+						OutputTokens:        75,
+						TotalTokens:         275,
+						CacheCreationTokens: 30,
+						CacheReadTokens:     150,
+						ReasoningTokens:     0,
+					},
+					FinishReason: fantasy.FinishReasonStop,
+				},
+			}), nil
+		},
+	}
+
+	var persistedStep PersistedStep
+
+	err := Run(context.Background(), RunOptions{
+		Model: model,
+		Messages: []fantasy.Message{
+			textMessage(fantasy.MessageRoleUser, "hello"),
+		},
+		MaxSteps:             1,
+		ContextLimitFallback: 4096,
+		PersistStep: func(_ context.Context, step PersistedStep) error {
+			persistedStep = step
+			return nil
+		},
+	})
+	require.NoError(t, err)
+	require.Equal(t, int64(200), persistedStep.Usage.InputTokens)
+	require.Equal(t, int64(75), persistedStep.Usage.OutputTokens)
+	require.Equal(t, int64(275), persistedStep.Usage.TotalTokens)
+	require.Equal(t, int64(30), persistedStep.Usage.CacheCreationTokens)
+	require.Equal(t, int64(150), persistedStep.Usage.CacheReadTokens)
+}
+
 func TestRun_OnRetryEnrichesProvider(t *testing.T) {
 	t.Parallel()

@@ -53,8 +53,10 @@ type AnthropicMessage struct {

 // AnthropicUsage represents usage information in an Anthropic response.
 type AnthropicUsage struct {
-	InputTokens  int `json:"input_tokens"`
-	OutputTokens int `json:"output_tokens"`
+	InputTokens              int `json:"input_tokens"`
+	OutputTokens             int `json:"output_tokens"`
+	CacheCreationInputTokens int `json:"cache_creation_input_tokens,omitempty"`
+	CacheReadInputTokens     int `json:"cache_read_input_tokens,omitempty"`
 }

 // AnthropicChunk represents a streaming chunk from Anthropic.
@@ -67,14 +69,16 @@ type AnthropicChunk struct {
 	StopReason   string                `json:"stop_reason,omitempty"`
 	StopSequence *string               `json:"stop_sequence,omitempty"`
 	Usage        AnthropicUsage        `json:"usage,omitempty"`
+	UsageMap     map[string]int        `json:"-"`
 }

 // AnthropicChunkMessage represents message metadata in a chunk.
 type AnthropicChunkMessage struct {
-	ID    string `json:"id"`
-	Type  string `json:"type"`
-	Role  string `json:"role"`
-	Model string `json:"model"`
+	ID    string         `json:"id"`
+	Type  string         `json:"type"`
+	Role  string         `json:"role"`
+	Model string         `json:"model"`
+	Usage map[string]int `json:"usage,omitempty"`
 }

 // AnthropicContentBlock represents a content block in a chunk.
@@ -206,7 +210,11 @@ func (s *anthropicServer) writeStreamingResponse(w http.ResponseWriter, chunks <
 				"stop_reason":   chunk.StopReason,
 				"stop_sequence": chunk.StopSequence,
 			}
-			chunkData["usage"] = chunk.Usage
+			if chunk.UsageMap != nil {
+				chunkData["usage"] = chunk.UsageMap
+			} else {
+				chunkData["usage"] = chunk.Usage
+			}
 		case "message_stop":
 			// No additional fields
 		}
@@ -342,6 +350,80 @@ func AnthropicTextChunks(deltas ...string) []AnthropicChunk {
 	return chunks
 }

+// AnthropicTextChunksWithCacheUsage creates a streaming response with text
+// deltas and explicit cache token usage. The message_start event carries
+// the initial input and cache token counts, and the final message_delta
+// carries the output token count.
+func AnthropicTextChunksWithCacheUsage(usage AnthropicUsage, deltas ...string) []AnthropicChunk {
+	if len(deltas) == 0 {
+		return nil
+	}
+
+	messageID := fmt.Sprintf("msg-%s", uuid.New().String()[:8])
+	model := "claude-3-opus-20240229"
+
+	messageUsage := map[string]int{
+		"input_tokens": usage.InputTokens,
+	}
+	if usage.CacheCreationInputTokens != 0 {
+		messageUsage["cache_creation_input_tokens"] = usage.CacheCreationInputTokens
+	}
+	if usage.CacheReadInputTokens != 0 {
+		messageUsage["cache_read_input_tokens"] = usage.CacheReadInputTokens
+	}
+
+	chunks := []AnthropicChunk{
+		{
+			Type: "message_start",
+			Message: AnthropicChunkMessage{
+				ID:    messageID,
+				Type:  "message",
+				Role:  "assistant",
+				Model: model,
+				Usage: messageUsage,
+			},
+		},
+		{
+			Type:  "content_block_start",
+			Index: 0,
+			ContentBlock: AnthropicContentBlock{
+				Type: "text",
+				Text: "",
+			},
+		},
+	}
+
+	for _, delta := range deltas {
+		chunks = append(chunks, AnthropicChunk{
+			Type:  "content_block_delta",
+			Index: 0,
+			Delta: AnthropicDeltaBlock{
+				Type: "text_delta",
+				Text: delta,
+			},
+		})
+	}
+
+	chunks = append(chunks,
+		AnthropicChunk{
+			Type:  "content_block_stop",
+			Index: 0,
+		},
+		AnthropicChunk{
+			Type:       "message_delta",
+			StopReason: "end_turn",
+			UsageMap: map[string]int{
+				"output_tokens": usage.OutputTokens,
+			},
+		},
+		AnthropicChunk{
+			Type: "message_stop",
+		},
+	)
+
+	return chunks
+}
+
 // AnthropicToolCallChunks creates a complete streaming response for a tool call.
 // Input JSON can be split across multiple deltas, matching Anthropic's
 // input_json_delta streaming behavior.
@@ -63,6 +63,59 @@ func TestAnthropic_Streaming(t *testing.T) {
 	require.Equal(t, len(expectedDeltas), deltaIndex, "Expected %d deltas, got %d. Total parts received: %d", len(expectedDeltas), deltaIndex, len(allParts))
 }

+func TestAnthropic_StreamingUsageIncludesCacheTokens(t *testing.T) {
+	t.Parallel()
+
+	serverURL := chattest.NewAnthropic(t, func(req *chattest.AnthropicRequest) chattest.AnthropicResponse {
+		return chattest.AnthropicStreamingResponse(
+			chattest.AnthropicTextChunksWithCacheUsage(chattest.AnthropicUsage{
+				InputTokens:              200,
+				OutputTokens:             75,
+				CacheCreationInputTokens: 30,
+				CacheReadInputTokens:     150,
+			}, "cached", " response")...,
+		)
+	})
+
+	client, err := fantasyanthropic.New(
+		fantasyanthropic.WithAPIKey("test-key"),
+		fantasyanthropic.WithBaseURL(serverURL),
+	)
+	require.NoError(t, err)
+
+	model, err := client.LanguageModel(context.Background(), "claude-3-opus-20240229")
+	require.NoError(t, err)
+
+	stream, err := model.Stream(context.Background(), fantasy.Call{
+		Prompt: []fantasy.Message{
+			{
+				Role:    fantasy.MessageRoleUser,
+				Content: []fantasy.MessagePart{fantasy.TextPart{Text: "hello"}},
+			},
+		},
+	})
+	require.NoError(t, err)
+
+	var (
+		finishPart fantasy.StreamPart
+		found      bool
+	)
+	for part := range stream {
+		if part.Type != fantasy.StreamPartTypeFinish {
+			continue
+		}
+		finishPart = part
+		found = true
+	}
+
+	require.True(t, found)
+	require.Equal(t, int64(200), finishPart.Usage.InputTokens)
+	require.Equal(t, int64(75), finishPart.Usage.OutputTokens)
+	require.Equal(t, int64(275), finishPart.Usage.TotalTokens)
+	require.Equal(t, int64(30), finishPart.Usage.CacheCreationTokens)
+	require.Equal(t, int64(150), finishPart.Usage.CacheReadTokens)
+}
+
 func TestAnthropic_ToolCalls(t *testing.T) {
 	t.Parallel()

@@ -180,7 +180,7 @@ func generateTitle(
 	model fantasy.LanguageModel,
 	input string,
 ) (string, error) {
-	title, _, err := generateStructuredTitle(ctx, model, titleGenerationPrompt, input)
+	title, err := generateStructuredTitle(ctx, model, titleGenerationPrompt, input)
 	if err != nil {
 		return "", err
 	}
@@ -192,6 +192,24 @@ func generateStructuredTitle(
 	model fantasy.LanguageModel,
 	systemPrompt string,
 	userInput string,
+) (string, error) {
+	title, _, err := generateStructuredTitleWithUsage(
+		ctx,
+		model,
+		systemPrompt,
+		userInput,
+	)
+	if err != nil {
+		return "", err
+	}
+	return title, nil
+}
+
+func generateStructuredTitleWithUsage(
+	ctx context.Context,
+	model fantasy.LanguageModel,
+	systemPrompt string,
+	userInput string,
 ) (string, fantasy.Usage, error) {
 	userInput = strings.TrimSpace(userInput)
 	if userInput == "" {
@@ -226,8 +244,6 @@ func generateStructuredTitle(
 		return genErr
 	}, nil)
 	if err != nil {
-		// Extract usage from the error when available so that
-		// failed attempts are still accounted for in usage tracking.
 		var usage fantasy.Usage
 		var noObjErr *fantasy.NoObjectGeneratedError
 		if errors.As(err, &noObjErr) {
@@ -529,7 +545,7 @@ func generateManualTitle(
 		userInput = strings.TrimSpace(firstUserText)
 	}

-	title, usage, err := generateStructuredTitle(
+	title, usage, err := generateStructuredTitleWithUsage(
 		titleCtx,
 		fallbackModel,
 		systemPrompt,
@@ -579,7 +595,7 @@ func generatePushSummary(
 	candidates = append(candidates, fallbackModel)

 	for _, model := range candidates {
-		summary, _, err := generateShortText(summaryCtx, model, pushSummaryPrompt, input)
+		summary, err := generateShortText(summaryCtx, model, pushSummaryPrompt, input)
 		if err != nil {
 			logger.Debug(ctx, "push summary model candidate failed",
 				slog.Error(err),
@@ -601,7 +617,7 @@ func generateShortText(
 	model fantasy.LanguageModel,
 	systemPrompt string,
 	userInput string,
-) (string, fantasy.Usage, error) {
+) (string, error) {
 	prompt := []fantasy.Message{
 		{
 			Role: fantasy.MessageRoleSystem,
@@ -629,7 +645,7 @@ func generateShortText(
 		return genErr
 	}, nil)
 	if err != nil {
-		return "", fantasy.Usage{}, xerrors.Errorf("generate short text: %w", err)
+		return "", xerrors.Errorf("generate short text: %w", err)
 	}

 	responseParts := make([]codersdk.ChatMessagePart, 0, len(response.Content))
@@ -639,5 +655,5 @@ func generateShortText(
 		}
 	}
 	text := normalizeShortTextOutput(contentBlocksToText(responseParts))
-	return text, response.Usage, nil
+	return text, nil
 }
@@ -515,12 +515,9 @@ func Test_generateShortText_NormalizesQuotedOutput(t *testing.T) {
 		},
 	}

-	text, usage, err := generateShortText(context.Background(), model, "system", "user")
+	text, err := generateShortText(context.Background(), model, "system", "user")
 	require.NoError(t, err)
 	require.Equal(t, "Quoted summary", text)
-	require.Equal(t, int64(3), usage.InputTokens)
-	require.Equal(t, int64(2), usage.OutputTokens)
-	require.Equal(t, int64(5), usage.TotalTokens)
 }

 type stubModel struct {
@@ -83,3 +83,8 @@ Select a user to see:
  bar shows current spend relative to the limit.
 - **Per-model breakdown** — table of costs and token usage by model.
 - **Per-chat breakdown** — table of costs and token usage by chat session.
+
+> [!NOTE]
+> Automatic title generation uses lightweight models, such as Claude Haiku or GPT-4o
+> Mini. Its token usage is not counted towards usage limits or shown in usage
+> summaries.
@@ -82,7 +82,12 @@ replace github.com/spf13/afero => github.com/aslilac/afero v0.0.0-20250403163713
 // 3) ibetitsmike/fantasy#4 — skip ephemeral replay items when store=false
 replace charm.land/fantasy => github.com/coder/fantasy v0.0.0-20260325145725-112927d9b6d8

-replace github.com/charmbracelet/anthropic-sdk-go => github.com/kylecarbs/anthropic-sdk-go v0.0.0-20260223140439-63879b0b8dab
+// Uses the coder/anthropic-sdk-go fork (fantasy branch), which adds:
+// 1) All changes from the kylecarbs/anthropic-sdk-go fork (fantasy branch merge).
+// 2) Explicit usage-field merging in Accumulate using JSON presence checks
+//    (preserves input and cache tokens from message_start when message_delta
+//    omits them).
+replace github.com/charmbracelet/anthropic-sdk-go => github.com/coder/anthropic-sdk-go v0.0.0-20260408163834-8345653c189a

 require (
 	cdr.dev/slog/v3 v3.0.0
@@ -316,6 +316,8 @@ github.com/coder/aibridge v1.1.1-0.20260331154949-a011104f377d h1:yoDGndlvKP6fiK
 github.com/coder/aibridge v1.1.1-0.20260331154949-a011104f377d/go.mod h1:u6WvGLMQQbk3ByeOw+LBdVgDNc/v/ujAtUc6MfvzQb4=
 github.com/coder/aisdk-go v0.0.9 h1:Vzo/k2qwVGLTR10ESDeP2Ecek1SdPfZlEjtTfMveiVo=
 github.com/coder/aisdk-go v0.0.9/go.mod h1:KF6/Vkono0FJJOtWtveh5j7yfNrSctVTpwgweYWSp5M=
+github.com/coder/anthropic-sdk-go v0.0.0-20260408163834-8345653c189a h1:0wjvSIzTI5BkhcrV1oKED3S8MEpPUVJOat19D7ityrw=
+github.com/coder/anthropic-sdk-go v0.0.0-20260408163834-8345653c189a/go.mod h1:hqlYqR7uPKOKfnNeicUbZp0Ps0GeYFlKYtwh5HGDCx8=
 github.com/coder/boundary v0.8.4-0.20260304164748-566aeea939ab h1:HrlxyTmMQpOHfSKzRU1vf5TxrmV6vL5OiWq+Dvn5qh0=
 github.com/coder/boundary v0.8.4-0.20260304164748-566aeea939ab/go.mod h1:BhJhyKW/+zZQzaGZ3vn27if2k0Vx5xLXzq7ZCQx5gPk=
 github.com/coder/bubbletea v1.2.2-0.20241212190825-007a1cdb2c41 h1:SBN/DA63+ZHwuWwPHPYoCZ/KLAjHv5g4h2MS4f2/MTI=
@@ -811,8 +813,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/kylecarbs/anthropic-sdk-go v0.0.0-20260223140439-63879b0b8dab h1:5UMYqr13zFQKfq8YscVuFwE7cCQpLieaPJDtLUPe11E=
-github.com/kylecarbs/anthropic-sdk-go v0.0.0-20260223140439-63879b0b8dab/go.mod h1:hqlYqR7uPKOKfnNeicUbZp0Ps0GeYFlKYtwh5HGDCx8=
 github.com/kylecarbs/chroma/v2 v2.0.0-20240401211003-9e036e0631f3 h1:Z9/bo5PSeMutpdiKYNt/TTSfGM1Ll0naj3QzYX9VxTc=
 github.com/kylecarbs/chroma/v2 v2.0.0-20240401211003-9e036e0631f3/go.mod h1:BUGjjsD+ndS6eX37YgTchSEG+Jg9Jv1GiZs9sqPqztk=
 github.com/kylecarbs/openai-go/v3 v3.0.0-20260319113850-9477dcaedcae h1:xlFZNX4nnxpj9Cf6mTwD3pirXGNtBJ/6COsf9iZmsL0=
@@ -1,5 +1,5 @@
 import dayjs from "dayjs";
-import { TriangleAlertIcon } from "lucide-react";
+import { InfoIcon, TriangleAlertIcon } from "lucide-react";
 import type { FC } from "react";
 import { getErrorMessage } from "#/api/errors";
 import type * as TypesGen from "#/api/typesGenerated";
@@ -235,6 +235,14 @@ export const ChatCostSummaryView: FC<ChatCostSummaryViewProps> = ({
 				</div>
 			)}

+			<div className="flex items-start gap-3 p-4 text-sm text-content-secondary">
+				<InfoIcon className="h-5 w-5 shrink-0" />
+				<span>
+					Automatic title generation uses lightweight models and is not counted
+					towards usage limits.
+				</span>
+			</div>
+
 			{summary.by_model.length === 0 && summary.by_chat.length === 0 ? (
 				<p className="py-12 text-center text-content-secondary">
 					{emptyMessage}