feat: add Prometheus metrics for chatd subsystem (#24371)

Adds 7 Prometheus metrics to the chatd subsystem and introduces typed `ActivityBumpReason` for deadline bump attribution. | Metric | Type | Labels | |--------|------|--------| | `coderd_chatd_chats` | Gauge | `state` (streaming, waiting) | | `coderd_chatd_message_count` | Histogram | `provider` | | `coderd_chatd_prompt_size_bytes` | Histogram | `provider` | | `coderd_chatd_tool_result_size_bytes` | Histogram | `provider`, `tool_name` | | `coderd_chatd_ttft_seconds` | Histogram | `provider` | | `coderd_chatd_compaction_total` | Counter | `provider`, `result` | | `coderd_chatd_steps_total` | Counter | `provider` | > 🤖
2026-06-02 20:48:20 +00:00 · 2026-04-15 19:53:10 +01:00
parent 2b68a1f4bd
commit d7439a9de0
13 changed files with 644 additions and 13 deletions
@@ -216,7 +216,7 @@ func (a *AppsAPI) UpdateAppStatus(ctx context.Context, req *agentproto.UpdateApp
 		// We pass time.Time{} for nextAutostart since we don't have access to
 		// TemplateScheduleStore here. The activity bump logic handles this by
 		// defaulting to the template's activity_bump duration (typically 1 hour).
-		workspacestats.ActivityBumpWorkspace(ctx, a.Log, a.Database, ws.ID, time.Time{})
+		workspacestats.ActivityBumpWorkspace(ctx, a.Log, a.Database, ws.ID, time.Time{}, workspacestats.ActivityBumpReasonAppActivity)
 	}
 	// just return a blank response because it doesn't contain any settable fields at present.
 	return new(agentproto.UpdateAppStatusResponse), nil
@@ -792,6 +792,7 @@ func New(options *Options) *API {
 			Pubsub:                         options.Pubsub,
 			WebpushDispatcher:              options.WebPushDispatcher,
 			UsageTracker:                   options.WorkspaceUsageTracker,
 			PrometheusRegistry:             options.PrometheusRegistry,
 		})
 		gitSyncLogger := options.Logger.Named("gitsync")
 		refresher := gitsync.NewRefresher(
@@ -11,6 +11,21 @@ import (
 	"github.com/coder/coder/v2/coderd/database"
 )
 // ActivityBumpReason represents the reason for an activity bump.
 type ActivityBumpReason string
 const (
 	// ActivityBumpReasonWorkspaceStats indicates the bump was triggered
 	// by SSH or terminal activity reported via workspace stats.
 	ActivityBumpReasonWorkspaceStats ActivityBumpReason = "workspace_stats"
 	// ActivityBumpReasonChatHeartbeat indicates the bump was triggered
 	// by an AI chat heartbeat.
 	ActivityBumpReasonChatHeartbeat ActivityBumpReason = "chat_heartbeat"
 	// ActivityBumpReasonAppActivity indicates the bump was triggered
 	// by app or port-forward activity.
 	ActivityBumpReasonAppActivity ActivityBumpReason = "app_activity"
 )
 // ActivityBumpWorkspace automatically bumps the workspace's auto-off timer
 // if it is set to expire soon. The deadline will be bumped by 1 hour*.
 // If the bump crosses over an autostart time, the workspace will be
@@ -36,7 +51,7 @@ import (
 // A way to avoid this is to configure the max deadline to something that will not
 // span more than 1 day. This will force the workspace to restart and reset the deadline
 // each morning when it autostarts.
-func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Store, workspaceID uuid.UUID, nextAutostart time.Time) {
+func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Store, workspaceID uuid.UUID, nextAutostart time.Time, reason ActivityBumpReason) {
 	// We set a short timeout so if the app is under load, these
 	// low priority operations fail first.
 	ctx, cancel := context.WithTimeout(ctx, time.Second*15)
@@ -50,6 +65,7 @@ func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Sto
 			// Bump will fail if the context is canceled, but this is ok.
 			log.Error(ctx, "activity bump failed", slog.Error(err),
 				slog.F("workspace_id", workspaceID),
 				slog.F("reason", reason),
 			)
 		}
 		return
@@ -57,5 +73,6 @@ func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Sto
 	log.Debug(ctx, "bumped deadline from activity",
 		slog.F("workspace_id", workspaceID),
 		slog.F("reason", reason),
 	)
 }
@@ -268,13 +268,14 @@ func Test_ActivityBumpWorkspace(t *testing.T) {
 				// Bump duration is measured from the time of the bump, so we measure from here.
 				start := dbtime.Now()
-				workspacestats.ActivityBumpWorkspace(ctx, log, db, bld.WorkspaceID, nextAutostart(start))
+				workspacestats.ActivityBumpWorkspace(ctx, log, db, bld.WorkspaceID, nextAutostart(start), workspacestats.ActivityBumpReasonWorkspaceStats)
 				end := dbtime.Now()
 				// Validate our state after bump
 				updatedBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, bld.WorkspaceID)
 				require.NoError(t, err, "unexpected error getting latest workspace build")
 				require.Equal(t, bld.MaxDeadline.UTC(), updatedBuild.MaxDeadline.UTC(), "max_deadline should not have changed")
 				if tt.expectedBump == 0 {
 					assert.Equal(t, bld.UpdatedAt.UTC(), updatedBuild.UpdatedAt.UTC(), "should not have bumped updated_at")
 					assert.Equal(t, bld.Deadline.UTC(), updatedBuild.Deadline.UTC(), "should not have bumped deadline")
@@ -194,7 +194,7 @@ func (r *Reporter) ReportAgentStats(ctx context.Context, now time.Time, workspac
 		}
 		// bump workspace activity
-		ActivityBumpWorkspace(ctx, r.opts.Logger.Named("activity_bump"), r.opts.Database, workspace.ID, nextAutostart)
+		ActivityBumpWorkspace(ctx, r.opts.Logger.Named("activity_bump"), r.opts.Database, workspace.ID, nextAutostart, ActivityBumpReasonWorkspaceStats)
 	}
 	// bump workspace last_used_at
@@ -18,6 +18,7 @@ import (
 	"charm.land/fantasy"
 	"charm.land/fantasy/providers/anthropic"
 	"github.com/google/uuid"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/shopspring/decimal"
 	"github.com/sqlc-dev/pqtype"
 	"golang.org/x/sync/errgroup"
@@ -145,6 +146,7 @@ type Server struct {
 	usageTracker *workspacestats.UsageTracker
 	clock        quartz.Clock
 	metrics      *chatloop.Metrics
 	recordingSem chan struct{}
 	// Configuration
@@ -2700,6 +2702,7 @@ type Config struct {
 	WebpushDispatcher              webpush.Dispatcher
 	UsageTracker                   *workspacestats.UsageTracker
 	Clock                          quartz.Clock
 	PrometheusRegistry             prometheus.Registerer
 }
 // New creates a new chat processor. The processor polls for pending
@@ -2768,6 +2771,11 @@ func New(cfg Config) *Server {
 		wakeCh:                         make(chan struct{}, 1),
 		heartbeatRegistry:              make(map[uuid.UUID]*heartbeatEntry),
 	}
 	if cfg.PrometheusRegistry != nil {
 		p.metrics = chatloop.NewMetrics(cfg.PrometheusRegistry)
 	} else {
 		p.metrics = chatloop.NopMetrics()
 	}
 	//nolint:gocritic // The chat processor uses a scoped chatd context.
 	ctx = dbauthz.AsChatd(ctx)
@@ -4027,7 +4035,7 @@ func (p *Server) trackWorkspaceUsage(
 		// approx. 333 CTE queries/second. A cheap fix for this could
 		// be to heartbeat every Nth query. Leaving as potential future
 		// low-hanging fruit if needed.
-		workspacestats.ActivityBumpWorkspace(ctx, logger.Named("activity_bump"), p.db, wsID.UUID, time.Time{})
+		workspacestats.ActivityBumpWorkspace(ctx, logger.Named("activity_bump"), p.db, wsID.UUID, time.Time{}, workspacestats.ActivityBumpReasonChatHeartbeat)
 	}
 	return wsID
 }
@@ -4036,6 +4044,9 @@ func (p *Server) processChat(ctx context.Context, chat database.Chat) {
 	logger := p.logger.With(slog.F("chat_id", chat.ID))
 	logger.Info(ctx, "processing chat request")
 	p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Inc()
 	defer p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Dec()
 	chatCtx, cancel := context.WithCancelCause(ctx)
 	defer cancel(nil)
@@ -4246,6 +4257,12 @@ func (p *Server) processChat(ctx context.Context, chat database.Chat) {
 		}
 	}()
 	p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Dec()
 	p.metrics.Chats.WithLabelValues(chatloop.StateStreaming).Inc()
 	defer func() {
 		p.metrics.Chats.WithLabelValues(chatloop.StateStreaming).Dec()
 		p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Inc()
 	}()
 	runResult, err := p.runChat(chatCtx, chat, generatedTitle, logger)
 	if err != nil {
 		if errors.Is(err, chatloop.ErrInterrupted) || errors.Is(context.Cause(chatCtx), chatloop.ErrInterrupted) {
@@ -5165,12 +5182,18 @@ func (p *Server) runChat(
 		)
 	}
 	// Record builtin tool names before appending MCP tools
 	// so the metrics layer can bound label cardinality.
 	builtinToolNames := make(map[string]bool, len(tools))
 	for _, t := range tools {
 		builtinToolNames[t.Info().Name] = true
 	}
 	// Append tools from external MCP servers. These appear
 	// after the built-in tools so the LLM sees them as
 	// additional capabilities.
 	tools = append(tools, mcpTools...)
 	tools = append(tools, workspaceMCPTools...)
 	// Append dynamic tools declared by the client at chat
 	// creation time. These appear in the LLM's tool list but
 	// are never executed by the chatloop — the client handles
@@ -5251,6 +5274,8 @@ func (p *Server) runChat(
 		Model:    model,
 		Messages: prompt,
 		Tools:    tools, MaxSteps: maxChatSteps,
 		Metrics:          p.metrics,
 		BuiltinToolNames: builtinToolNames,
 		ModelConfig:     callConfig,
 		ProviderOptions: providerOptions,
@@ -2610,6 +2610,7 @@ func TestProcessChat_IgnoresStaleControlNotification(t *testing.T) {
 		chatHeartbeatInterval: time.Minute,
 		configCache:           newChatConfigCache(ctx, db, clock),
 		heartbeatRegistry:     make(map[uuid.UUID]*heartbeatEntry),
 		metrics:               chatloop.NopMetrics(),
 	}
 	// Publish a stale "pending" notification on the control channel
@@ -150,6 +150,15 @@ type RunOptions struct {
 	OnRetry chatretry.OnRetryFn
 	OnInterruptedPersistError func(error)
 	// Metrics records Prometheus metrics for the chatd subsystem.
 	// When nil, no metrics are recorded.
 	Metrics *Metrics
 	// BuiltinToolNames lists tool names that are built into chatd.
 	// Tool results from tools not in this set are recorded under
 	// the "mcp" label to bound cardinality.
 	BuiltinToolNames map[string]bool
 }
 // ProviderTool pairs a provider-native tool definition with an
@@ -297,6 +306,9 @@ func Run(ctx context.Context, opts RunOptions) error {
 	if opts.Clock == nil {
 		opts.Clock = quartz.NewReal()
 	}
 	if opts.Metrics == nil {
 		opts.Metrics = NopMetrics()
 	}
 	publishMessagePart := func(role codersdk.ChatMessageRole, part codersdk.ChatMessagePart) {
 		if opts.PublishMessagePart == nil {
@@ -343,6 +355,8 @@ func Run(ctx context.Context, opts RunOptions) error {
 		for step := 0; totalSteps < opts.MaxSteps; step++ {
 			totalSteps++
 			provider := opts.Model.Provider()
 			opts.Metrics.StepsTotal.WithLabelValues(provider).Inc()
 			stepStart := time.Now()
 			// Copy messages so that provider-specific caching
 			// mutations don't leak back to the caller's slice.
@@ -354,7 +368,8 @@ func Run(ctx context.Context, opts RunOptions) error {
 			if applyAnthropicCaching {
 				addAnthropicPromptCaching(prepared)
 			}
-
+			opts.Metrics.MessageCount.WithLabelValues(provider).Observe(float64(len(prepared)))
 			opts.Metrics.PromptSizeBytes.WithLabelValues(provider).Observe(float64(EstimatePromptSize(prepared)))
 			call := fantasy.Call{
 				Prompt:           prepared,
 				Tools:            tools,
@@ -371,12 +386,13 @@ func Run(ctx context.Context, opts RunOptions) error {
 			err := chatretry.Retry(ctx, func(retryCtx context.Context) error {
 				attempt, streamErr := guardedStream(
 					retryCtx,
-					opts.Model.Provider(),
+					provider,
 					opts.Clock,
 					opts.StartupTimeout,
 					func(attemptCtx context.Context) (fantasy.StreamResponse, error) {
 						return opts.Model.Stream(attemptCtx, call)
 					},
 					opts.Metrics,
 				)
 				if streamErr != nil {
 					return streamErr
@@ -444,7 +460,7 @@ func Run(ctx context.Context, opts RunOptions) error {
 				}
 				// Execute only built-in tools.
-				toolResults = executeTools(ctx, opts.Tools, opts.ProviderTools, builtinCalls, func(tr fantasy.ToolResultContent, completedAt time.Time) {
+				toolResults = executeTools(ctx, opts.Tools, opts.ProviderTools, builtinCalls, opts.Metrics, provider, opts.BuiltinToolNames, func(tr fantasy.ToolResultContent, completedAt time.Time) {
 					recordToolResultTimestamp(&result, tr.ToolCallID, completedAt)
 					ssePart := chatprompt.PartFromContent(tr)
 					ssePart.CreatedAt = &completedAt
@@ -582,9 +598,11 @@ func Run(ctx context.Context, opts RunOptions) error {
 					result.providerMetadata,
 					messages,
 				)
 				opts.Metrics.RecordCompaction(opts.Model.Provider(), did, compactErr)
 				if compactErr != nil && opts.Compaction.OnError != nil {
 					opts.Compaction.OnError(compactErr)
 				}
 				if did {
 					alreadyCompacted = true
 					compactedOnFinalStep = true
@@ -622,6 +640,7 @@ func Run(ctx context.Context, opts RunOptions) error {
 				lastProviderMetadata,
 				messages,
 			)
 			opts.Metrics.RecordCompaction(opts.Model.Provider(), did, err)
 			if err != nil {
 				if opts.Compaction.OnError != nil {
 					opts.Compaction.OnError(err)
@@ -720,6 +739,7 @@ func guardedStream(
 	clock quartz.Clock,
 	timeout time.Duration,
 	openStream func(context.Context) (fantasy.StreamResponse, error),
 	metrics *Metrics,
 ) (guardedAttempt, error) {
 	attemptCtx, cancelAttempt := context.WithCancelCause(parent)
 	guard := newStartupGuard(clock, timeout, cancelAttempt)
@@ -731,6 +751,7 @@ func guardedStream(
 		})
 	}
 	streamStart := clock.Now()
 	stream, err := openStream(attemptCtx)
 	if err != nil {
 		err = classifyStartupTimeout(attemptCtx, provider, err)
@@ -738,11 +759,17 @@ func guardedStream(
 		return guardedAttempt{}, err
 	}
 	recordTTFT := sync.OnceFunc(func() {
 		metrics.TTFTSeconds.WithLabelValues(provider).Observe(
 			clock.Since(streamStart).Seconds(),
 		)
 	})
 	return guardedAttempt{
 		ctx: attemptCtx,
 		stream: fantasy.StreamResponse(func(yield func(fantasy.StreamPart) bool) {
 			for part := range stream {
 				guard.Disarm()
 				recordTTFT()
 				if !yield(part) {
 					return
 				}
@@ -985,6 +1012,9 @@ func executeTools(
 	allTools []fantasy.AgentTool,
 	providerTools []ProviderTool,
 	toolCalls []fantasy.ToolCallContent,
 	metrics *Metrics,
 	provider string,
 	builtinToolNames map[string]bool,
 	onResult func(fantasy.ToolResultContent, time.Time),
 ) []fantasy.ToolResultContent {
 	if len(toolCalls) == 0 {
@@ -1039,7 +1069,7 @@ func executeTools(
 				// accurate individual completion times.
 				completedAt[i] = dbtime.Now()
 			}()
-			results[i] = executeSingleTool(ctx, toolMap, tc)
+			results[i] = executeSingleTool(ctx, toolMap, tc, metrics, provider, builtinToolNames)
 		}()
 	}
 	wg.Wait()
@@ -1060,12 +1090,24 @@ func executeSingleTool(
 	ctx context.Context,
 	toolMap map[string]fantasy.AgentTool,
 	tc fantasy.ToolCallContent,
 	metrics *Metrics,
 	provider string,
 	builtinToolNames map[string]bool,
 ) fantasy.ToolResultContent {
 	result := fantasy.ToolResultContent{
 		ToolCallID:       tc.ToolCallID,
 		ToolName:         tc.ToolName,
 		ProviderExecuted: false,
 	}
 	defer func() {
 		toolLabel := tc.ToolName
 		if !builtinToolNames[tc.ToolName] {
 			toolLabel = "mcp"
 		}
 		metrics.ToolResultSizeBytes.WithLabelValues(provider, toolLabel).Observe(
 			float64(ToolResultSize(result)),
 		)
 	}()
 	tool, exists := toolMap[tc.ToolName]
 	if !exists {
@@ -1256,7 +1298,7 @@ func tryCompactOnExit(
 	if err != nil {
 		return
 	}
-	_, compactErr := tryCompact(
+	did, compactErr := tryCompact(
 		ctx,
 		opts.Model,
 		opts.Compaction,
@@ -1265,6 +1307,7 @@ func tryCompactOnExit(
 		metadata,
 		reloaded,
 	)
 	opts.Metrics.RecordCompaction(opts.Model.Provider(), did, compactErr)
 	if compactErr != nil && opts.Compaction.OnError != nil {
 		opts.Compaction.OnError(compactErr)
 	}
@@ -0,0 +1,173 @@
 package chatloop
 import (
 	"context"
 	"errors"
 	"charm.land/fantasy"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 )
 const (
 	metricsNamespace = "coderd"
 	metricsSubsystem = "chatd"
 	// Label values for Chats.
 	StateStreaming = "streaming"
 	StateWaiting   = "waiting"
 	// Label values for CompactionTotal.
 	CompactionResultSuccess = "success"
 	CompactionResultError   = "error"
 	CompactionResultTimeout = "timeout"
 )
 // Metrics holds Prometheus metrics for the chatd subsystem.
 type Metrics struct {
 	Chats               *prometheus.GaugeVec
 	MessageCount        *prometheus.HistogramVec
 	PromptSizeBytes     *prometheus.HistogramVec
 	ToolResultSizeBytes *prometheus.HistogramVec
 	TTFTSeconds         *prometheus.HistogramVec
 	CompactionTotal     *prometheus.CounterVec
 	StepsTotal          *prometheus.CounterVec
 }
 // NewMetrics creates a new Metrics instance registered with the
 // given registerer.
 func NewMetrics(reg prometheus.Registerer) *Metrics {
 	factory := promauto.With(reg)
 	return &Metrics{
 		Chats: factory.NewGaugeVec(prometheus.GaugeOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsSubsystem,
 			Name:      "chats",
 			Help:      "Number of chats being processed, by state.",
 		}, []string{"state"}),
 		MessageCount: factory.NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsSubsystem,
 			Name:      "message_count",
 			Help:      "Number of messages in the prompt per LLM request.",
 			Buckets:   prometheus.ExponentialBuckets(1, 2, 8), // 1, 2, 4, 8, 16, 32, 64, 128
 		}, []string{"provider"}),
 		PromptSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsSubsystem,
 			Name:      "prompt_size_bytes",
 			Help:      "Estimated byte size of the prompt per LLM request.",
 			Buckets:   prometheus.ExponentialBuckets(1024, 4, 10), // 1KB .. 256MB
 		}, []string{"provider"}),
 		ToolResultSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsSubsystem,
 			Name:      "tool_result_size_bytes",
 			Help:      "Size in bytes of each tool execution result.",
 			Buckets:   prometheus.ExponentialBuckets(64, 4, 9), // 64B .. 4MB
 		}, []string{"provider", "tool_name"}),
 		TTFTSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsSubsystem,
 			Name:      "ttft_seconds",
 			Help:      "Time-to-first-token: wall time from LLM request to first streamed chunk.",
 			Buckets:   []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
 		}, []string{"provider"}),
 		CompactionTotal: factory.NewCounterVec(prometheus.CounterOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsSubsystem,
 			Name:      "compaction_total",
 			Help:      "Total compaction outcomes (only recorded when compaction was triggered or failed).",
 		}, []string{"provider", "result"}),
 		StepsTotal: factory.NewCounterVec(prometheus.CounterOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsSubsystem,
 			Name:      "steps_total",
 			Help:      "Total agentic loop steps across all chats.",
 		}, []string{"provider"}),
 	}
 }
 // NopMetrics returns a Metrics instance that discards all data.
 // Useful for tests and when metrics collection is not desired.
 func NopMetrics() *Metrics {
 	return NewMetrics(prometheus.NewRegistry())
 }
 // RecordCompaction classifies and records a compaction attempt.
 // It is a no-op when m is nil.
 func (m *Metrics) RecordCompaction(provider string, compacted bool, err error) {
 	if m == nil {
 		return
 	}
 	switch {
 	case err != nil && errors.Is(err, context.DeadlineExceeded):
 		m.CompactionTotal.WithLabelValues(provider, CompactionResultTimeout).Inc()
 	case err != nil && errors.Is(err, context.Canceled):
 		// User interruption, not a compaction failure.
 		return
 	case err != nil:
 		m.CompactionTotal.WithLabelValues(provider, CompactionResultError).Inc()
 	case compacted:
 		m.CompactionTotal.WithLabelValues(provider, CompactionResultSuccess).Inc()
 		// !compacted && err == nil means threshold not reached -- not
 		// recorded.
 	}
 }
 // EstimatePromptSize returns a cheap byte-size estimate of a
 // fantasy prompt by summing the text content lengths of all
 // message parts. This avoids JSON marshaling overhead.
 func EstimatePromptSize(messages []fantasy.Message) int {
 	var size int
 	for _, msg := range messages {
 		for _, part := range msg.Content {
 			size += ContentPartSize(part)
 		}
 	}
 	return size
 }
 // ContentPartSize returns the byte length of a MessagePart's
 // primary text or data field.
 func ContentPartSize(part fantasy.MessagePart) int {
 	switch p := part.(type) {
 	case fantasy.TextPart:
 		return len(p.Text)
 	case fantasy.ReasoningPart:
 		return len(p.Text)
 	case fantasy.FilePart:
 		return len(p.Data)
 	case fantasy.ToolCallPart:
 		return len(p.Input)
 	case fantasy.ToolResultPart:
 		return toolResultOutputSize(p.Output)
 	default:
 		return 0
 	}
 }
 // ToolResultSize returns the byte length of a
 // ToolResultContent's primary text or data field.
 func ToolResultSize(r fantasy.ToolResultContent) int {
 	return toolResultOutputSize(r.Result)
 }
 func toolResultOutputSize(output fantasy.ToolResultOutputContent) int {
 	if output == nil {
 		return 0
 	}
 	switch v := output.(type) {
 	case fantasy.ToolResultOutputContentText:
 		return len(v.Text)
 	case fantasy.ToolResultOutputContentError:
 		if v.Error != nil {
 			return len(v.Error.Error())
 		}
 		return 0
 	case fantasy.ToolResultOutputContentMedia:
 		return len(v.Data)
 	default:
 		return 0
 	}
 }
@@ -0,0 +1,342 @@
 package chatloop_test
 import (
 	"context"
 	"testing"
 	"charm.land/fantasy"
 	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"github.com/coder/coder/v2/coderd/x/chatd/chatloop"
 	"github.com/coder/coder/v2/coderd/x/chatd/chattest"
 )
 func TestNewMetrics_RegistersAllMetrics(t *testing.T) {
 	t.Parallel()
 	reg := prometheus.NewRegistry()
 	m := chatloop.NewMetrics(reg)
 	// Initialize vector metrics so they appear in Gather output.
 	m.Chats.WithLabelValues(chatloop.StateStreaming)
 	m.CompactionTotal.WithLabelValues("anthropic", chatloop.CompactionResultSuccess)
 	m.ToolResultSizeBytes.WithLabelValues("anthropic", "test")
 	m.MessageCount.WithLabelValues("anthropic")
 	m.PromptSizeBytes.WithLabelValues("anthropic")
 	m.TTFTSeconds.WithLabelValues("anthropic")
 	m.StepsTotal.WithLabelValues("anthropic")
 	families, err := reg.Gather()
 	require.NoError(t, err)
 	expected := map[string]dto.MetricType{
 		"coderd_chatd_chats":                  dto.MetricType_GAUGE,
 		"coderd_chatd_message_count":          dto.MetricType_HISTOGRAM,
 		"coderd_chatd_prompt_size_bytes":      dto.MetricType_HISTOGRAM,
 		"coderd_chatd_tool_result_size_bytes": dto.MetricType_HISTOGRAM,
 		"coderd_chatd_ttft_seconds":           dto.MetricType_HISTOGRAM,
 		"coderd_chatd_compaction_total":       dto.MetricType_COUNTER,
 		"coderd_chatd_steps_total":            dto.MetricType_COUNTER,
 	}
 	found := make(map[string]dto.MetricType)
 	for _, f := range families {
 		found[f.GetName()] = f.GetType()
 	}
 	for name, expectedType := range expected {
 		actualType, ok := found[name]
 		assert.True(t, ok, "metric %q not registered", name)
 		if ok {
 			assert.Equal(t, expectedType, actualType, "metric %q has wrong type", name)
 		}
 	}
 }
 func TestNopMetrics_DoesNotPanic(t *testing.T) {
 	t.Parallel()
 	m := chatloop.NopMetrics()
 	// Exercise every metric to confirm no nil-pointer panics.
 	m.Chats.WithLabelValues("streaming").Inc()
 	m.Chats.WithLabelValues("streaming").Dec()
 	m.Chats.WithLabelValues("waiting").Inc()
 	m.Chats.WithLabelValues("waiting").Dec()
 	m.MessageCount.WithLabelValues("anthropic").Observe(10)
 	m.PromptSizeBytes.WithLabelValues("openai").Observe(4096)
 	m.ToolResultSizeBytes.WithLabelValues("anthropic", "execute").Observe(512)
 	m.TTFTSeconds.WithLabelValues("anthropic").Observe(0.5)
 	m.CompactionTotal.WithLabelValues("anthropic", "success").Inc()
 	m.CompactionTotal.WithLabelValues("openai", "error").Inc()
 	m.CompactionTotal.WithLabelValues("google", "timeout").Inc()
 	m.StepsTotal.WithLabelValues("anthropic").Inc()
 }
 func TestEstimatePromptSize(t *testing.T) {
 	t.Parallel()
 	messages := []fantasy.Message{
 		{
 			Role: fantasy.MessageRoleSystem,
 			Content: []fantasy.MessagePart{
 				fantasy.TextPart{Text: "You are a helpful assistant."},
 			},
 		},
 		{
 			Role: fantasy.MessageRoleUser,
 			Content: []fantasy.MessagePart{
 				fantasy.TextPart{Text: "Hello world"},
 				fantasy.ReasoningPart{Text: "thinking..."},
 				fantasy.FilePart{Data: []byte("filedata")},
 			},
 		},
 		{
 			Role: fantasy.MessageRoleAssistant,
 			Content: []fantasy.MessagePart{
 				fantasy.TextPart{Text: "Hi there!"},
 				fantasy.ToolCallPart{Input: `{"file":"main.go"}`},
 			},
 		},
 		{
 			Role: fantasy.MessageRoleTool,
 			Content: []fantasy.MessagePart{
 				fantasy.ToolResultPart{
 					Output: fantasy.ToolResultOutputContentText{Text: "result"},
 				},
 			},
 		},
 	}
 	size := chatloop.EstimatePromptSize(messages)
 	// "You are a helpful assistant." (28) + "Hello world" (11) +
 	// "thinking..." (11) + "filedata" (8) +
 	// "Hi there!" (9) + `{"file":"main.go"}` (18) +
 	// "result" (6) = 91
 	assert.Equal(t, 91, size)
 }
 func TestToolResultSize(t *testing.T) {
 	t.Parallel()
 	tests := []struct {
 		name     string
 		result   fantasy.ToolResultContent
 		expected int
 	}{
 		{
 			name: "text",
 			result: fantasy.ToolResultContent{
 				Result: fantasy.ToolResultOutputContentText{Text: "hello"},
 			},
 			expected: 5,
 		},
 		{
 			name: "error",
 			result: fantasy.ToolResultContent{
 				Result: fantasy.ToolResultOutputContentError{
 					Error: assert.AnError,
 				},
 			},
 			expected: len(assert.AnError.Error()),
 		},
 		{
 			name: "media",
 			result: fantasy.ToolResultContent{
 				Result: fantasy.ToolResultOutputContentMedia{Data: "base64data"},
 			},
 			expected: 10,
 		},
 		{
 			name:     "nil_result",
 			result:   fantasy.ToolResultContent{},
 			expected: 0,
 		},
 		{
 			name: "error_nil_error",
 			result: fantasy.ToolResultContent{
 				Result: fantasy.ToolResultOutputContentError{Error: nil},
 			},
 			expected: 0,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			t.Parallel()
 			assert.Equal(t, tt.expected, chatloop.ToolResultSize(tt.result))
 		})
 	}
 }
 func TestRecordCompaction(t *testing.T) {
 	t.Parallel()
 	t.Run("nil metrics does not panic", func(t *testing.T) {
 		t.Parallel()
 		var m *chatloop.Metrics
 		m.RecordCompaction("anthropic", true, nil)
 	})
 	tests := []struct {
 		name      string
 		compacted bool
 		err       error
 		wantLabel string
 		wantCount int
 	}{
 		{
 			name:      "success",
 			compacted: true,
 			err:       nil,
 			wantLabel: chatloop.CompactionResultSuccess,
 			wantCount: 1,
 		},
 		{
 			name:      "error",
 			compacted: false,
 			err:       assert.AnError,
 			wantLabel: chatloop.CompactionResultError,
 			wantCount: 1,
 		},
 		{
 			name:      "timeout",
 			compacted: false,
 			err:       context.DeadlineExceeded,
 			wantLabel: chatloop.CompactionResultTimeout,
 			wantCount: 1,
 		},
 		{
 			name:      "threshold_not_reached",
 			compacted: false,
 			err:       nil,
 			wantLabel: "",
 			wantCount: 0,
 		},
 		{
 			name:      "canceled",
 			compacted: false,
 			err:       context.Canceled,
 			wantLabel: "",
 			wantCount: 0,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			t.Parallel()
 			reg := prometheus.NewRegistry()
 			m := chatloop.NewMetrics(reg)
 			m.RecordCompaction("test", tt.compacted, tt.err)
 			families, err := reg.Gather()
 			require.NoError(t, err)
 			if tt.wantCount == 0 {
 				for _, f := range families {
 					assert.NotEqual(t, "coderd_chatd_compaction_total", f.GetName(),
 						"compaction_total should not be recorded")
 				}
 				return
 			}
 			var found bool
 			for _, f := range families {
 				if f.GetName() != "coderd_chatd_compaction_total" {
 					continue
 				}
 				found = true
 				require.Len(t, f.GetMetric(), 1)
 				metric := f.GetMetric()[0]
 				assert.Equal(t, float64(tt.wantCount), metric.GetCounter().GetValue())
 				// Check label.
 				for _, lp := range metric.GetLabel() {
 					if lp.GetName() == "result" {
 						assert.Equal(t, tt.wantLabel, lp.GetValue())
 					}
 				}
 			}
 			assert.True(t, found, "compaction_total metric not found")
 		})
 	}
 }
 func TestRun_RecordsMetrics(t *testing.T) {
 	t.Parallel()
 	reg := prometheus.NewRegistry()
 	metrics := chatloop.NewMetrics(reg)
 	model := &chattest.FakeModel{
 		ProviderName: "test-provider",
 		StreamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
 			return func(yield func(fantasy.StreamPart) bool) {
 				parts := []fantasy.StreamPart{
 					{Type: fantasy.StreamPartTypeTextStart, ID: "t1"},
 					{Type: fantasy.StreamPartTypeTextDelta, ID: "t1", Delta: "hello"},
 					{Type: fantasy.StreamPartTypeTextEnd, ID: "t1"},
 					{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonStop},
 				}
 				for _, p := range parts {
 					if !yield(p) {
 						return
 					}
 				}
 			}, nil
 		},
 	}
 	err := chatloop.Run(context.Background(), chatloop.RunOptions{
 		Model: model,
 		Messages: []fantasy.Message{
 			{
 				Role: fantasy.MessageRoleUser,
 				Content: []fantasy.MessagePart{
 					fantasy.TextPart{Text: "hello"},
 				},
 			},
 		},
 		MaxSteps: 1,
 		PersistStep: func(_ context.Context, _ chatloop.PersistedStep) error {
 			return nil
 		},
 		Metrics: metrics,
 	})
 	require.NoError(t, err)
 	families, err := reg.Gather()
 	require.NoError(t, err)
 	found := make(map[string]bool)
 	for _, f := range families {
 		found[f.GetName()] = true
 		switch f.GetName() {
 		case "coderd_chatd_steps_total":
 			require.Len(t, f.GetMetric(), 1)
 			assert.Equal(t, float64(1), f.GetMetric()[0].GetCounter().GetValue(),
 				"steps_total should be 1 after one step")
 		case "coderd_chatd_message_count":
 			require.Len(t, f.GetMetric(), 1)
 			assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
 				"message_count should have 1 observation")
 		case "coderd_chatd_prompt_size_bytes":
 			require.Len(t, f.GetMetric(), 1)
 			assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
 				"prompt_size_bytes should have 1 observation")
 		case "coderd_chatd_ttft_seconds":
 			require.Len(t, f.GetMetric(), 1)
 			assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
 				"ttft_seconds should have 1 observation")
 		}
 	}
 	assert.True(t, found["coderd_chatd_steps_total"], "steps_total not recorded")
 	assert.True(t, found["coderd_chatd_message_count"], "message_count not recorded")
 	assert.True(t, found["coderd_chatd_prompt_size_bytes"], "prompt_size_bytes not recorded")
 	assert.True(t, found["coderd_chatd_ttft_seconds"], "ttft_seconds not recorded")
 }
@@ -198,6 +198,13 @@ deployment. They will always be available from the agent.
 | `coderd_authz_authorize_duration_seconds`                               | histogram | Duration of the 'Authorize' call in seconds. Only counts calls that succeed.                                                                                                                                                                                                                                                                                                                                                                                                                               | `allowed`                                                                                             |
 | `coderd_authz_prepare_authorize_duration_seconds`                       | histogram | Duration of the 'PrepareAuthorize' call in seconds.                                                                                                                                                                                                                                                                                                                                                                                                                                                        |                                                                                                       |
 | `coderd_build_info`                                                     | gauge     | Describes the current build/version of the Coder server. Value is always 1.                                                                                                                                                                                                                                                                                                                                                                                                                                | `revision` `version`                                                                                  |
 | `coderd_chatd_chats`                                                    | gauge     | Number of chats being processed, by state.                                                                                                                                                                                                                                                                                                                                                                                                                                                                 | `state`                                                                                               |
 | `coderd_chatd_compaction_total`                                         | counter   | Total compaction outcomes (only recorded when compaction was triggered or failed).                                                                                                                                                                                                                                                                                                                                                                                                                         | `provider` `result`                                                                                   |
 | `coderd_chatd_message_count`                                            | histogram | Number of messages in the prompt per LLM request.                                                                                                                                                                                                                                                                                                                                                                                                                                                          | `provider`                                                                                            |
 | `coderd_chatd_prompt_size_bytes`                                        | histogram | Estimated byte size of the prompt per LLM request.                                                                                                                                                                                                                                                                                                                                                                                                                                                         | `provider`                                                                                            |
 | `coderd_chatd_steps_total`                                              | counter   | Total agentic loop steps across all chats.                                                                                                                                                                                                                                                                                                                                                                                                                                                                 | `provider`                                                                                            |
 | `coderd_chatd_tool_result_size_bytes`                                   | histogram | Size in bytes of each tool execution result.                                                                                                                                                                                                                                                                                                                                                                                                                                                               | `provider` `tool_name`                                                                                |
 | `coderd_chatd_ttft_seconds`                                             | histogram | Time-to-first-token: wall time from LLM request to first streamed chunk.                                                                                                                                                                                                                                                                                                                                                                                                                                   | `provider`                                                                                            |
 | `coderd_db_query_counts_total`                                          | counter   | Total number of queries labelled by HTTP route, method, and query name.                                                                                                                                                                                                                                                                                                                                                                                                                                    | `method` `query` `route`                                                                              |
 | `coderd_db_query_latencies_seconds`                                     | histogram | Latency distribution of queries in seconds.                                                                                                                                                                                                                                                                                                                                                                                                                                                                | `query`                                                                                               |
 | `coderd_db_tx_duration_seconds`                                         | histogram | Duration of transactions in seconds.                                                                                                                                                                                                                                                                                                                                                                                                                                                                       | `success` `tx_id`                                                                                     |
@@ -2908,7 +2908,7 @@ func TestPrebuildActivityBump(t *testing.T) {
 	require.Zero(t, prebuild.LatestBuild.MaxDeadline)
 	// When: activity bump is applied to an unclaimed prebuild
-	workspacestats.ActivityBumpWorkspace(ctx, log, db, prebuild.ID, clock.Now().Add(10*time.Hour))
+	workspacestats.ActivityBumpWorkspace(ctx, log, db, prebuild.ID, clock.Now().Add(10*time.Hour), workspacestats.ActivityBumpReasonWorkspaceStats)
 	// Then: prebuild Deadline/MaxDeadline remain unchanged
 	prebuild = coderdtest.MustWorkspace(t, client, wb.Workspace.ID)
@@ -2941,7 +2941,7 @@ func TestPrebuildActivityBump(t *testing.T) {
 	workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID)
 	// When: activity bump is applied to a claimed prebuild
-	workspacestats.ActivityBumpWorkspace(ctx, log, db, workspace.ID, clock.Now().Add(10*time.Hour))
+	workspacestats.ActivityBumpWorkspace(ctx, log, db, workspace.ID, clock.Now().Add(10*time.Hour), workspacestats.ActivityBumpReasonWorkspaceStats)
 	// Then: Deadline is extended by the activity bump, MaxDeadline remains unset
 	workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID)
@@ -226,6 +226,27 @@ coderd_authz_prepare_authorize_duration_seconds 0
 # HELP coderd_build_info Describes the current build/version of the Coder server. Value is always 1.
 # TYPE coderd_build_info gauge
 coderd_build_info{version="",revision=""} 0
 # HELP coderd_chatd_chats Number of chats being processed, by state.
 # TYPE coderd_chatd_chats gauge
 coderd_chatd_chats{state=""} 0
 # HELP coderd_chatd_compaction_total Total compaction outcomes (only recorded when compaction was triggered or failed).
 # TYPE coderd_chatd_compaction_total counter
 coderd_chatd_compaction_total{provider="",result=""} 0
 # HELP coderd_chatd_message_count Number of messages in the prompt per LLM request.
 # TYPE coderd_chatd_message_count histogram
 coderd_chatd_message_count{provider=""} 0
 # HELP coderd_chatd_prompt_size_bytes Estimated byte size of the prompt per LLM request.
 # TYPE coderd_chatd_prompt_size_bytes histogram
 coderd_chatd_prompt_size_bytes{provider=""} 0
 # HELP coderd_chatd_steps_total Total agentic loop steps across all chats.
 # TYPE coderd_chatd_steps_total counter
 coderd_chatd_steps_total{provider=""} 0
 # HELP coderd_chatd_tool_result_size_bytes Size in bytes of each tool execution result.
 # TYPE coderd_chatd_tool_result_size_bytes histogram
 coderd_chatd_tool_result_size_bytes{provider="",tool_name=""} 0
 # HELP coderd_chatd_ttft_seconds Time-to-first-token: wall time from LLM request to first streamed chunk.
 # TYPE coderd_chatd_ttft_seconds histogram
 coderd_chatd_ttft_seconds{provider=""} 0
 # HELP coderd_db_query_counts_total Total number of queries labelled by HTTP route, method, and query name.
 # TYPE coderd_db_query_counts_total counter
 coderd_db_query_counts_total{route="",method="",query=""} 0