feat: add Prometheus metrics for chatd subsystem (#24371)

Adds 7 Prometheus metrics to the chatd subsystem and introduces typed
`ActivityBumpReason` for deadline bump attribution.

| Metric | Type | Labels |
|--------|------|--------|
| `coderd_chatd_chats` | Gauge | `state` (streaming, waiting) |
| `coderd_chatd_message_count` | Histogram | `provider` |
| `coderd_chatd_prompt_size_bytes` | Histogram | `provider` |
| `coderd_chatd_tool_result_size_bytes` | Histogram | `provider`,
`tool_name` |
| `coderd_chatd_ttft_seconds` | Histogram | `provider` |
| `coderd_chatd_compaction_total` | Counter | `provider`, `result` |
| `coderd_chatd_steps_total` | Counter | `provider` |

> 🤖
This commit is contained in:
Cian Johnston
2026-04-15 19:53:10 +01:00
committed by GitHub
parent 2b68a1f4bd
commit d7439a9de0
13 changed files with 644 additions and 13 deletions
+1 -1
View File
@@ -216,7 +216,7 @@ func (a *AppsAPI) UpdateAppStatus(ctx context.Context, req *agentproto.UpdateApp
// We pass time.Time{} for nextAutostart since we don't have access to // We pass time.Time{} for nextAutostart since we don't have access to
// TemplateScheduleStore here. The activity bump logic handles this by // TemplateScheduleStore here. The activity bump logic handles this by
// defaulting to the template's activity_bump duration (typically 1 hour). // defaulting to the template's activity_bump duration (typically 1 hour).
workspacestats.ActivityBumpWorkspace(ctx, a.Log, a.Database, ws.ID, time.Time{}) workspacestats.ActivityBumpWorkspace(ctx, a.Log, a.Database, ws.ID, time.Time{}, workspacestats.ActivityBumpReasonAppActivity)
} }
// just return a blank response because it doesn't contain any settable fields at present. // just return a blank response because it doesn't contain any settable fields at present.
return new(agentproto.UpdateAppStatusResponse), nil return new(agentproto.UpdateAppStatusResponse), nil
+1
View File
@@ -792,6 +792,7 @@ func New(options *Options) *API {
Pubsub: options.Pubsub, Pubsub: options.Pubsub,
WebpushDispatcher: options.WebPushDispatcher, WebpushDispatcher: options.WebPushDispatcher,
UsageTracker: options.WorkspaceUsageTracker, UsageTracker: options.WorkspaceUsageTracker,
PrometheusRegistry: options.PrometheusRegistry,
}) })
gitSyncLogger := options.Logger.Named("gitsync") gitSyncLogger := options.Logger.Named("gitsync")
refresher := gitsync.NewRefresher( refresher := gitsync.NewRefresher(
+18 -1
View File
@@ -11,6 +11,21 @@ import (
"github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database"
) )
// ActivityBumpReason represents the reason for an activity bump.
type ActivityBumpReason string
const (
// ActivityBumpReasonWorkspaceStats indicates the bump was triggered
// by SSH or terminal activity reported via workspace stats.
ActivityBumpReasonWorkspaceStats ActivityBumpReason = "workspace_stats"
// ActivityBumpReasonChatHeartbeat indicates the bump was triggered
// by an AI chat heartbeat.
ActivityBumpReasonChatHeartbeat ActivityBumpReason = "chat_heartbeat"
// ActivityBumpReasonAppActivity indicates the bump was triggered
// by app or port-forward activity.
ActivityBumpReasonAppActivity ActivityBumpReason = "app_activity"
)
// ActivityBumpWorkspace automatically bumps the workspace's auto-off timer // ActivityBumpWorkspace automatically bumps the workspace's auto-off timer
// if it is set to expire soon. The deadline will be bumped by 1 hour*. // if it is set to expire soon. The deadline will be bumped by 1 hour*.
// If the bump crosses over an autostart time, the workspace will be // If the bump crosses over an autostart time, the workspace will be
@@ -36,7 +51,7 @@ import (
// A way to avoid this is to configure the max deadline to something that will not // A way to avoid this is to configure the max deadline to something that will not
// span more than 1 day. This will force the workspace to restart and reset the deadline // span more than 1 day. This will force the workspace to restart and reset the deadline
// each morning when it autostarts. // each morning when it autostarts.
func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Store, workspaceID uuid.UUID, nextAutostart time.Time) { func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Store, workspaceID uuid.UUID, nextAutostart time.Time, reason ActivityBumpReason) {
// We set a short timeout so if the app is under load, these // We set a short timeout so if the app is under load, these
// low priority operations fail first. // low priority operations fail first.
ctx, cancel := context.WithTimeout(ctx, time.Second*15) ctx, cancel := context.WithTimeout(ctx, time.Second*15)
@@ -50,6 +65,7 @@ func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Sto
// Bump will fail if the context is canceled, but this is ok. // Bump will fail if the context is canceled, but this is ok.
log.Error(ctx, "activity bump failed", slog.Error(err), log.Error(ctx, "activity bump failed", slog.Error(err),
slog.F("workspace_id", workspaceID), slog.F("workspace_id", workspaceID),
slog.F("reason", reason),
) )
} }
return return
@@ -57,5 +73,6 @@ func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Sto
log.Debug(ctx, "bumped deadline from activity", log.Debug(ctx, "bumped deadline from activity",
slog.F("workspace_id", workspaceID), slog.F("workspace_id", workspaceID),
slog.F("reason", reason),
) )
} }
+2 -1
View File
@@ -268,13 +268,14 @@ func Test_ActivityBumpWorkspace(t *testing.T) {
// Bump duration is measured from the time of the bump, so we measure from here. // Bump duration is measured from the time of the bump, so we measure from here.
start := dbtime.Now() start := dbtime.Now()
workspacestats.ActivityBumpWorkspace(ctx, log, db, bld.WorkspaceID, nextAutostart(start)) workspacestats.ActivityBumpWorkspace(ctx, log, db, bld.WorkspaceID, nextAutostart(start), workspacestats.ActivityBumpReasonWorkspaceStats)
end := dbtime.Now() end := dbtime.Now()
// Validate our state after bump // Validate our state after bump
updatedBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, bld.WorkspaceID) updatedBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, bld.WorkspaceID)
require.NoError(t, err, "unexpected error getting latest workspace build") require.NoError(t, err, "unexpected error getting latest workspace build")
require.Equal(t, bld.MaxDeadline.UTC(), updatedBuild.MaxDeadline.UTC(), "max_deadline should not have changed") require.Equal(t, bld.MaxDeadline.UTC(), updatedBuild.MaxDeadline.UTC(), "max_deadline should not have changed")
if tt.expectedBump == 0 { if tt.expectedBump == 0 {
assert.Equal(t, bld.UpdatedAt.UTC(), updatedBuild.UpdatedAt.UTC(), "should not have bumped updated_at") assert.Equal(t, bld.UpdatedAt.UTC(), updatedBuild.UpdatedAt.UTC(), "should not have bumped updated_at")
assert.Equal(t, bld.Deadline.UTC(), updatedBuild.Deadline.UTC(), "should not have bumped deadline") assert.Equal(t, bld.Deadline.UTC(), updatedBuild.Deadline.UTC(), "should not have bumped deadline")
+1 -1
View File
@@ -194,7 +194,7 @@ func (r *Reporter) ReportAgentStats(ctx context.Context, now time.Time, workspac
} }
// bump workspace activity // bump workspace activity
ActivityBumpWorkspace(ctx, r.opts.Logger.Named("activity_bump"), r.opts.Database, workspace.ID, nextAutostart) ActivityBumpWorkspace(ctx, r.opts.Logger.Named("activity_bump"), r.opts.Database, workspace.ID, nextAutostart, ActivityBumpReasonWorkspaceStats)
} }
// bump workspace last_used_at // bump workspace last_used_at
+27 -2
View File
@@ -18,6 +18,7 @@ import (
"charm.land/fantasy" "charm.land/fantasy"
"charm.land/fantasy/providers/anthropic" "charm.land/fantasy/providers/anthropic"
"github.com/google/uuid" "github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/shopspring/decimal" "github.com/shopspring/decimal"
"github.com/sqlc-dev/pqtype" "github.com/sqlc-dev/pqtype"
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
@@ -145,6 +146,7 @@ type Server struct {
usageTracker *workspacestats.UsageTracker usageTracker *workspacestats.UsageTracker
clock quartz.Clock clock quartz.Clock
metrics *chatloop.Metrics
recordingSem chan struct{} recordingSem chan struct{}
// Configuration // Configuration
@@ -2700,6 +2702,7 @@ type Config struct {
WebpushDispatcher webpush.Dispatcher WebpushDispatcher webpush.Dispatcher
UsageTracker *workspacestats.UsageTracker UsageTracker *workspacestats.UsageTracker
Clock quartz.Clock Clock quartz.Clock
PrometheusRegistry prometheus.Registerer
} }
// New creates a new chat processor. The processor polls for pending // New creates a new chat processor. The processor polls for pending
@@ -2768,6 +2771,11 @@ func New(cfg Config) *Server {
wakeCh: make(chan struct{}, 1), wakeCh: make(chan struct{}, 1),
heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry), heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry),
} }
if cfg.PrometheusRegistry != nil {
p.metrics = chatloop.NewMetrics(cfg.PrometheusRegistry)
} else {
p.metrics = chatloop.NopMetrics()
}
//nolint:gocritic // The chat processor uses a scoped chatd context. //nolint:gocritic // The chat processor uses a scoped chatd context.
ctx = dbauthz.AsChatd(ctx) ctx = dbauthz.AsChatd(ctx)
@@ -4027,7 +4035,7 @@ func (p *Server) trackWorkspaceUsage(
// approx. 333 CTE queries/second. A cheap fix for this could // approx. 333 CTE queries/second. A cheap fix for this could
// be to heartbeat every Nth query. Leaving as potential future // be to heartbeat every Nth query. Leaving as potential future
// low-hanging fruit if needed. // low-hanging fruit if needed.
workspacestats.ActivityBumpWorkspace(ctx, logger.Named("activity_bump"), p.db, wsID.UUID, time.Time{}) workspacestats.ActivityBumpWorkspace(ctx, logger.Named("activity_bump"), p.db, wsID.UUID, time.Time{}, workspacestats.ActivityBumpReasonChatHeartbeat)
} }
return wsID return wsID
} }
@@ -4036,6 +4044,9 @@ func (p *Server) processChat(ctx context.Context, chat database.Chat) {
logger := p.logger.With(slog.F("chat_id", chat.ID)) logger := p.logger.With(slog.F("chat_id", chat.ID))
logger.Info(ctx, "processing chat request") logger.Info(ctx, "processing chat request")
p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Inc()
defer p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Dec()
chatCtx, cancel := context.WithCancelCause(ctx) chatCtx, cancel := context.WithCancelCause(ctx)
defer cancel(nil) defer cancel(nil)
@@ -4246,6 +4257,12 @@ func (p *Server) processChat(ctx context.Context, chat database.Chat) {
} }
}() }()
p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Dec()
p.metrics.Chats.WithLabelValues(chatloop.StateStreaming).Inc()
defer func() {
p.metrics.Chats.WithLabelValues(chatloop.StateStreaming).Dec()
p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Inc()
}()
runResult, err := p.runChat(chatCtx, chat, generatedTitle, logger) runResult, err := p.runChat(chatCtx, chat, generatedTitle, logger)
if err != nil { if err != nil {
if errors.Is(err, chatloop.ErrInterrupted) || errors.Is(context.Cause(chatCtx), chatloop.ErrInterrupted) { if errors.Is(err, chatloop.ErrInterrupted) || errors.Is(context.Cause(chatCtx), chatloop.ErrInterrupted) {
@@ -5165,12 +5182,18 @@ func (p *Server) runChat(
) )
} }
// Record builtin tool names before appending MCP tools
// so the metrics layer can bound label cardinality.
builtinToolNames := make(map[string]bool, len(tools))
for _, t := range tools {
builtinToolNames[t.Info().Name] = true
}
// Append tools from external MCP servers. These appear // Append tools from external MCP servers. These appear
// after the built-in tools so the LLM sees them as // after the built-in tools so the LLM sees them as
// additional capabilities. // additional capabilities.
tools = append(tools, mcpTools...) tools = append(tools, mcpTools...)
tools = append(tools, workspaceMCPTools...) tools = append(tools, workspaceMCPTools...)
// Append dynamic tools declared by the client at chat // Append dynamic tools declared by the client at chat
// creation time. These appear in the LLM's tool list but // creation time. These appear in the LLM's tool list but
// are never executed by the chatloop — the client handles // are never executed by the chatloop — the client handles
@@ -5251,6 +5274,8 @@ func (p *Server) runChat(
Model: model, Model: model,
Messages: prompt, Messages: prompt,
Tools: tools, MaxSteps: maxChatSteps, Tools: tools, MaxSteps: maxChatSteps,
Metrics: p.metrics,
BuiltinToolNames: builtinToolNames,
ModelConfig: callConfig, ModelConfig: callConfig,
ProviderOptions: providerOptions, ProviderOptions: providerOptions,
+1
View File
@@ -2610,6 +2610,7 @@ func TestProcessChat_IgnoresStaleControlNotification(t *testing.T) {
chatHeartbeatInterval: time.Minute, chatHeartbeatInterval: time.Minute,
configCache: newChatConfigCache(ctx, db, clock), configCache: newChatConfigCache(ctx, db, clock),
heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry), heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry),
metrics: chatloop.NopMetrics(),
} }
// Publish a stale "pending" notification on the control channel // Publish a stale "pending" notification on the control channel
+48 -5
View File
@@ -150,6 +150,15 @@ type RunOptions struct {
OnRetry chatretry.OnRetryFn OnRetry chatretry.OnRetryFn
OnInterruptedPersistError func(error) OnInterruptedPersistError func(error)
// Metrics records Prometheus metrics for the chatd subsystem.
// When nil, no metrics are recorded.
Metrics *Metrics
// BuiltinToolNames lists tool names that are built into chatd.
// Tool results from tools not in this set are recorded under
// the "mcp" label to bound cardinality.
BuiltinToolNames map[string]bool
} }
// ProviderTool pairs a provider-native tool definition with an // ProviderTool pairs a provider-native tool definition with an
@@ -297,6 +306,9 @@ func Run(ctx context.Context, opts RunOptions) error {
if opts.Clock == nil { if opts.Clock == nil {
opts.Clock = quartz.NewReal() opts.Clock = quartz.NewReal()
} }
if opts.Metrics == nil {
opts.Metrics = NopMetrics()
}
publishMessagePart := func(role codersdk.ChatMessageRole, part codersdk.ChatMessagePart) { publishMessagePart := func(role codersdk.ChatMessageRole, part codersdk.ChatMessagePart) {
if opts.PublishMessagePart == nil { if opts.PublishMessagePart == nil {
@@ -343,6 +355,8 @@ func Run(ctx context.Context, opts RunOptions) error {
for step := 0; totalSteps < opts.MaxSteps; step++ { for step := 0; totalSteps < opts.MaxSteps; step++ {
totalSteps++ totalSteps++
provider := opts.Model.Provider()
opts.Metrics.StepsTotal.WithLabelValues(provider).Inc()
stepStart := time.Now() stepStart := time.Now()
// Copy messages so that provider-specific caching // Copy messages so that provider-specific caching
// mutations don't leak back to the caller's slice. // mutations don't leak back to the caller's slice.
@@ -354,7 +368,8 @@ func Run(ctx context.Context, opts RunOptions) error {
if applyAnthropicCaching { if applyAnthropicCaching {
addAnthropicPromptCaching(prepared) addAnthropicPromptCaching(prepared)
} }
opts.Metrics.MessageCount.WithLabelValues(provider).Observe(float64(len(prepared)))
opts.Metrics.PromptSizeBytes.WithLabelValues(provider).Observe(float64(EstimatePromptSize(prepared)))
call := fantasy.Call{ call := fantasy.Call{
Prompt: prepared, Prompt: prepared,
Tools: tools, Tools: tools,
@@ -371,12 +386,13 @@ func Run(ctx context.Context, opts RunOptions) error {
err := chatretry.Retry(ctx, func(retryCtx context.Context) error { err := chatretry.Retry(ctx, func(retryCtx context.Context) error {
attempt, streamErr := guardedStream( attempt, streamErr := guardedStream(
retryCtx, retryCtx,
opts.Model.Provider(), provider,
opts.Clock, opts.Clock,
opts.StartupTimeout, opts.StartupTimeout,
func(attemptCtx context.Context) (fantasy.StreamResponse, error) { func(attemptCtx context.Context) (fantasy.StreamResponse, error) {
return opts.Model.Stream(attemptCtx, call) return opts.Model.Stream(attemptCtx, call)
}, },
opts.Metrics,
) )
if streamErr != nil { if streamErr != nil {
return streamErr return streamErr
@@ -444,7 +460,7 @@ func Run(ctx context.Context, opts RunOptions) error {
} }
// Execute only built-in tools. // Execute only built-in tools.
toolResults = executeTools(ctx, opts.Tools, opts.ProviderTools, builtinCalls, func(tr fantasy.ToolResultContent, completedAt time.Time) { toolResults = executeTools(ctx, opts.Tools, opts.ProviderTools, builtinCalls, opts.Metrics, provider, opts.BuiltinToolNames, func(tr fantasy.ToolResultContent, completedAt time.Time) {
recordToolResultTimestamp(&result, tr.ToolCallID, completedAt) recordToolResultTimestamp(&result, tr.ToolCallID, completedAt)
ssePart := chatprompt.PartFromContent(tr) ssePart := chatprompt.PartFromContent(tr)
ssePart.CreatedAt = &completedAt ssePart.CreatedAt = &completedAt
@@ -582,9 +598,11 @@ func Run(ctx context.Context, opts RunOptions) error {
result.providerMetadata, result.providerMetadata,
messages, messages,
) )
opts.Metrics.RecordCompaction(opts.Model.Provider(), did, compactErr)
if compactErr != nil && opts.Compaction.OnError != nil { if compactErr != nil && opts.Compaction.OnError != nil {
opts.Compaction.OnError(compactErr) opts.Compaction.OnError(compactErr)
} }
if did { if did {
alreadyCompacted = true alreadyCompacted = true
compactedOnFinalStep = true compactedOnFinalStep = true
@@ -622,6 +640,7 @@ func Run(ctx context.Context, opts RunOptions) error {
lastProviderMetadata, lastProviderMetadata,
messages, messages,
) )
opts.Metrics.RecordCompaction(opts.Model.Provider(), did, err)
if err != nil { if err != nil {
if opts.Compaction.OnError != nil { if opts.Compaction.OnError != nil {
opts.Compaction.OnError(err) opts.Compaction.OnError(err)
@@ -720,6 +739,7 @@ func guardedStream(
clock quartz.Clock, clock quartz.Clock,
timeout time.Duration, timeout time.Duration,
openStream func(context.Context) (fantasy.StreamResponse, error), openStream func(context.Context) (fantasy.StreamResponse, error),
metrics *Metrics,
) (guardedAttempt, error) { ) (guardedAttempt, error) {
attemptCtx, cancelAttempt := context.WithCancelCause(parent) attemptCtx, cancelAttempt := context.WithCancelCause(parent)
guard := newStartupGuard(clock, timeout, cancelAttempt) guard := newStartupGuard(clock, timeout, cancelAttempt)
@@ -731,6 +751,7 @@ func guardedStream(
}) })
} }
streamStart := clock.Now()
stream, err := openStream(attemptCtx) stream, err := openStream(attemptCtx)
if err != nil { if err != nil {
err = classifyStartupTimeout(attemptCtx, provider, err) err = classifyStartupTimeout(attemptCtx, provider, err)
@@ -738,11 +759,17 @@ func guardedStream(
return guardedAttempt{}, err return guardedAttempt{}, err
} }
recordTTFT := sync.OnceFunc(func() {
metrics.TTFTSeconds.WithLabelValues(provider).Observe(
clock.Since(streamStart).Seconds(),
)
})
return guardedAttempt{ return guardedAttempt{
ctx: attemptCtx, ctx: attemptCtx,
stream: fantasy.StreamResponse(func(yield func(fantasy.StreamPart) bool) { stream: fantasy.StreamResponse(func(yield func(fantasy.StreamPart) bool) {
for part := range stream { for part := range stream {
guard.Disarm() guard.Disarm()
recordTTFT()
if !yield(part) { if !yield(part) {
return return
} }
@@ -985,6 +1012,9 @@ func executeTools(
allTools []fantasy.AgentTool, allTools []fantasy.AgentTool,
providerTools []ProviderTool, providerTools []ProviderTool,
toolCalls []fantasy.ToolCallContent, toolCalls []fantasy.ToolCallContent,
metrics *Metrics,
provider string,
builtinToolNames map[string]bool,
onResult func(fantasy.ToolResultContent, time.Time), onResult func(fantasy.ToolResultContent, time.Time),
) []fantasy.ToolResultContent { ) []fantasy.ToolResultContent {
if len(toolCalls) == 0 { if len(toolCalls) == 0 {
@@ -1039,7 +1069,7 @@ func executeTools(
// accurate individual completion times. // accurate individual completion times.
completedAt[i] = dbtime.Now() completedAt[i] = dbtime.Now()
}() }()
results[i] = executeSingleTool(ctx, toolMap, tc) results[i] = executeSingleTool(ctx, toolMap, tc, metrics, provider, builtinToolNames)
}() }()
} }
wg.Wait() wg.Wait()
@@ -1060,12 +1090,24 @@ func executeSingleTool(
ctx context.Context, ctx context.Context,
toolMap map[string]fantasy.AgentTool, toolMap map[string]fantasy.AgentTool,
tc fantasy.ToolCallContent, tc fantasy.ToolCallContent,
metrics *Metrics,
provider string,
builtinToolNames map[string]bool,
) fantasy.ToolResultContent { ) fantasy.ToolResultContent {
result := fantasy.ToolResultContent{ result := fantasy.ToolResultContent{
ToolCallID: tc.ToolCallID, ToolCallID: tc.ToolCallID,
ToolName: tc.ToolName, ToolName: tc.ToolName,
ProviderExecuted: false, ProviderExecuted: false,
} }
defer func() {
toolLabel := tc.ToolName
if !builtinToolNames[tc.ToolName] {
toolLabel = "mcp"
}
metrics.ToolResultSizeBytes.WithLabelValues(provider, toolLabel).Observe(
float64(ToolResultSize(result)),
)
}()
tool, exists := toolMap[tc.ToolName] tool, exists := toolMap[tc.ToolName]
if !exists { if !exists {
@@ -1256,7 +1298,7 @@ func tryCompactOnExit(
if err != nil { if err != nil {
return return
} }
_, compactErr := tryCompact( did, compactErr := tryCompact(
ctx, ctx,
opts.Model, opts.Model,
opts.Compaction, opts.Compaction,
@@ -1265,6 +1307,7 @@ func tryCompactOnExit(
metadata, metadata,
reloaded, reloaded,
) )
opts.Metrics.RecordCompaction(opts.Model.Provider(), did, compactErr)
if compactErr != nil && opts.Compaction.OnError != nil { if compactErr != nil && opts.Compaction.OnError != nil {
opts.Compaction.OnError(compactErr) opts.Compaction.OnError(compactErr)
} }
+173
View File
@@ -0,0 +1,173 @@
package chatloop
import (
"context"
"errors"
"charm.land/fantasy"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
const (
metricsNamespace = "coderd"
metricsSubsystem = "chatd"
// Label values for Chats.
StateStreaming = "streaming"
StateWaiting = "waiting"
// Label values for CompactionTotal.
CompactionResultSuccess = "success"
CompactionResultError = "error"
CompactionResultTimeout = "timeout"
)
// Metrics holds Prometheus metrics for the chatd subsystem.
type Metrics struct {
Chats *prometheus.GaugeVec
MessageCount *prometheus.HistogramVec
PromptSizeBytes *prometheus.HistogramVec
ToolResultSizeBytes *prometheus.HistogramVec
TTFTSeconds *prometheus.HistogramVec
CompactionTotal *prometheus.CounterVec
StepsTotal *prometheus.CounterVec
}
// NewMetrics creates a new Metrics instance registered with the
// given registerer.
func NewMetrics(reg prometheus.Registerer) *Metrics {
factory := promauto.With(reg)
return &Metrics{
Chats: factory.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "chats",
Help: "Number of chats being processed, by state.",
}, []string{"state"}),
MessageCount: factory.NewHistogramVec(prometheus.HistogramOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "message_count",
Help: "Number of messages in the prompt per LLM request.",
Buckets: prometheus.ExponentialBuckets(1, 2, 8), // 1, 2, 4, 8, 16, 32, 64, 128
}, []string{"provider"}),
PromptSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "prompt_size_bytes",
Help: "Estimated byte size of the prompt per LLM request.",
Buckets: prometheus.ExponentialBuckets(1024, 4, 10), // 1KB .. 256MB
}, []string{"provider"}),
ToolResultSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "tool_result_size_bytes",
Help: "Size in bytes of each tool execution result.",
Buckets: prometheus.ExponentialBuckets(64, 4, 9), // 64B .. 4MB
}, []string{"provider", "tool_name"}),
TTFTSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "ttft_seconds",
Help: "Time-to-first-token: wall time from LLM request to first streamed chunk.",
Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
}, []string{"provider"}),
CompactionTotal: factory.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "compaction_total",
Help: "Total compaction outcomes (only recorded when compaction was triggered or failed).",
}, []string{"provider", "result"}),
StepsTotal: factory.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "steps_total",
Help: "Total agentic loop steps across all chats.",
}, []string{"provider"}),
}
}
// NopMetrics returns a Metrics instance that discards all data.
// Useful for tests and when metrics collection is not desired.
func NopMetrics() *Metrics {
return NewMetrics(prometheus.NewRegistry())
}
// RecordCompaction classifies and records a compaction attempt.
// It is a no-op when m is nil.
func (m *Metrics) RecordCompaction(provider string, compacted bool, err error) {
if m == nil {
return
}
switch {
case err != nil && errors.Is(err, context.DeadlineExceeded):
m.CompactionTotal.WithLabelValues(provider, CompactionResultTimeout).Inc()
case err != nil && errors.Is(err, context.Canceled):
// User interruption, not a compaction failure.
return
case err != nil:
m.CompactionTotal.WithLabelValues(provider, CompactionResultError).Inc()
case compacted:
m.CompactionTotal.WithLabelValues(provider, CompactionResultSuccess).Inc()
// !compacted && err == nil means threshold not reached -- not
// recorded.
}
}
// EstimatePromptSize returns a cheap byte-size estimate of a
// fantasy prompt by summing the text content lengths of all
// message parts. This avoids JSON marshaling overhead.
func EstimatePromptSize(messages []fantasy.Message) int {
var size int
for _, msg := range messages {
for _, part := range msg.Content {
size += ContentPartSize(part)
}
}
return size
}
// ContentPartSize returns the byte length of a MessagePart's
// primary text or data field.
func ContentPartSize(part fantasy.MessagePart) int {
switch p := part.(type) {
case fantasy.TextPart:
return len(p.Text)
case fantasy.ReasoningPart:
return len(p.Text)
case fantasy.FilePart:
return len(p.Data)
case fantasy.ToolCallPart:
return len(p.Input)
case fantasy.ToolResultPart:
return toolResultOutputSize(p.Output)
default:
return 0
}
}
// ToolResultSize returns the byte length of a
// ToolResultContent's primary text or data field.
func ToolResultSize(r fantasy.ToolResultContent) int {
return toolResultOutputSize(r.Result)
}
func toolResultOutputSize(output fantasy.ToolResultOutputContent) int {
if output == nil {
return 0
}
switch v := output.(type) {
case fantasy.ToolResultOutputContentText:
return len(v.Text)
case fantasy.ToolResultOutputContentError:
if v.Error != nil {
return len(v.Error.Error())
}
return 0
case fantasy.ToolResultOutputContentMedia:
return len(v.Data)
default:
return 0
}
}
+342
View File
@@ -0,0 +1,342 @@
package chatloop_test
import (
"context"
"testing"
"charm.land/fantasy"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/coder/coder/v2/coderd/x/chatd/chatloop"
"github.com/coder/coder/v2/coderd/x/chatd/chattest"
)
func TestNewMetrics_RegistersAllMetrics(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := chatloop.NewMetrics(reg)
// Initialize vector metrics so they appear in Gather output.
m.Chats.WithLabelValues(chatloop.StateStreaming)
m.CompactionTotal.WithLabelValues("anthropic", chatloop.CompactionResultSuccess)
m.ToolResultSizeBytes.WithLabelValues("anthropic", "test")
m.MessageCount.WithLabelValues("anthropic")
m.PromptSizeBytes.WithLabelValues("anthropic")
m.TTFTSeconds.WithLabelValues("anthropic")
m.StepsTotal.WithLabelValues("anthropic")
families, err := reg.Gather()
require.NoError(t, err)
expected := map[string]dto.MetricType{
"coderd_chatd_chats": dto.MetricType_GAUGE,
"coderd_chatd_message_count": dto.MetricType_HISTOGRAM,
"coderd_chatd_prompt_size_bytes": dto.MetricType_HISTOGRAM,
"coderd_chatd_tool_result_size_bytes": dto.MetricType_HISTOGRAM,
"coderd_chatd_ttft_seconds": dto.MetricType_HISTOGRAM,
"coderd_chatd_compaction_total": dto.MetricType_COUNTER,
"coderd_chatd_steps_total": dto.MetricType_COUNTER,
}
found := make(map[string]dto.MetricType)
for _, f := range families {
found[f.GetName()] = f.GetType()
}
for name, expectedType := range expected {
actualType, ok := found[name]
assert.True(t, ok, "metric %q not registered", name)
if ok {
assert.Equal(t, expectedType, actualType, "metric %q has wrong type", name)
}
}
}
func TestNopMetrics_DoesNotPanic(t *testing.T) {
t.Parallel()
m := chatloop.NopMetrics()
// Exercise every metric to confirm no nil-pointer panics.
m.Chats.WithLabelValues("streaming").Inc()
m.Chats.WithLabelValues("streaming").Dec()
m.Chats.WithLabelValues("waiting").Inc()
m.Chats.WithLabelValues("waiting").Dec()
m.MessageCount.WithLabelValues("anthropic").Observe(10)
m.PromptSizeBytes.WithLabelValues("openai").Observe(4096)
m.ToolResultSizeBytes.WithLabelValues("anthropic", "execute").Observe(512)
m.TTFTSeconds.WithLabelValues("anthropic").Observe(0.5)
m.CompactionTotal.WithLabelValues("anthropic", "success").Inc()
m.CompactionTotal.WithLabelValues("openai", "error").Inc()
m.CompactionTotal.WithLabelValues("google", "timeout").Inc()
m.StepsTotal.WithLabelValues("anthropic").Inc()
}
func TestEstimatePromptSize(t *testing.T) {
t.Parallel()
messages := []fantasy.Message{
{
Role: fantasy.MessageRoleSystem,
Content: []fantasy.MessagePart{
fantasy.TextPart{Text: "You are a helpful assistant."},
},
},
{
Role: fantasy.MessageRoleUser,
Content: []fantasy.MessagePart{
fantasy.TextPart{Text: "Hello world"},
fantasy.ReasoningPart{Text: "thinking..."},
fantasy.FilePart{Data: []byte("filedata")},
},
},
{
Role: fantasy.MessageRoleAssistant,
Content: []fantasy.MessagePart{
fantasy.TextPart{Text: "Hi there!"},
fantasy.ToolCallPart{Input: `{"file":"main.go"}`},
},
},
{
Role: fantasy.MessageRoleTool,
Content: []fantasy.MessagePart{
fantasy.ToolResultPart{
Output: fantasy.ToolResultOutputContentText{Text: "result"},
},
},
},
}
size := chatloop.EstimatePromptSize(messages)
// "You are a helpful assistant." (28) + "Hello world" (11) +
// "thinking..." (11) + "filedata" (8) +
// "Hi there!" (9) + `{"file":"main.go"}` (18) +
// "result" (6) = 91
assert.Equal(t, 91, size)
}
func TestToolResultSize(t *testing.T) {
t.Parallel()
tests := []struct {
name string
result fantasy.ToolResultContent
expected int
}{
{
name: "text",
result: fantasy.ToolResultContent{
Result: fantasy.ToolResultOutputContentText{Text: "hello"},
},
expected: 5,
},
{
name: "error",
result: fantasy.ToolResultContent{
Result: fantasy.ToolResultOutputContentError{
Error: assert.AnError,
},
},
expected: len(assert.AnError.Error()),
},
{
name: "media",
result: fantasy.ToolResultContent{
Result: fantasy.ToolResultOutputContentMedia{Data: "base64data"},
},
expected: 10,
},
{
name: "nil_result",
result: fantasy.ToolResultContent{},
expected: 0,
},
{
name: "error_nil_error",
result: fantasy.ToolResultContent{
Result: fantasy.ToolResultOutputContentError{Error: nil},
},
expected: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
assert.Equal(t, tt.expected, chatloop.ToolResultSize(tt.result))
})
}
}
func TestRecordCompaction(t *testing.T) {
t.Parallel()
t.Run("nil metrics does not panic", func(t *testing.T) {
t.Parallel()
var m *chatloop.Metrics
m.RecordCompaction("anthropic", true, nil)
})
tests := []struct {
name string
compacted bool
err error
wantLabel string
wantCount int
}{
{
name: "success",
compacted: true,
err: nil,
wantLabel: chatloop.CompactionResultSuccess,
wantCount: 1,
},
{
name: "error",
compacted: false,
err: assert.AnError,
wantLabel: chatloop.CompactionResultError,
wantCount: 1,
},
{
name: "timeout",
compacted: false,
err: context.DeadlineExceeded,
wantLabel: chatloop.CompactionResultTimeout,
wantCount: 1,
},
{
name: "threshold_not_reached",
compacted: false,
err: nil,
wantLabel: "",
wantCount: 0,
},
{
name: "canceled",
compacted: false,
err: context.Canceled,
wantLabel: "",
wantCount: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := chatloop.NewMetrics(reg)
m.RecordCompaction("test", tt.compacted, tt.err)
families, err := reg.Gather()
require.NoError(t, err)
if tt.wantCount == 0 {
for _, f := range families {
assert.NotEqual(t, "coderd_chatd_compaction_total", f.GetName(),
"compaction_total should not be recorded")
}
return
}
var found bool
for _, f := range families {
if f.GetName() != "coderd_chatd_compaction_total" {
continue
}
found = true
require.Len(t, f.GetMetric(), 1)
metric := f.GetMetric()[0]
assert.Equal(t, float64(tt.wantCount), metric.GetCounter().GetValue())
// Check label.
for _, lp := range metric.GetLabel() {
if lp.GetName() == "result" {
assert.Equal(t, tt.wantLabel, lp.GetValue())
}
}
}
assert.True(t, found, "compaction_total metric not found")
})
}
}
func TestRun_RecordsMetrics(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
metrics := chatloop.NewMetrics(reg)
model := &chattest.FakeModel{
ProviderName: "test-provider",
StreamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
return func(yield func(fantasy.StreamPart) bool) {
parts := []fantasy.StreamPart{
{Type: fantasy.StreamPartTypeTextStart, ID: "t1"},
{Type: fantasy.StreamPartTypeTextDelta, ID: "t1", Delta: "hello"},
{Type: fantasy.StreamPartTypeTextEnd, ID: "t1"},
{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonStop},
}
for _, p := range parts {
if !yield(p) {
return
}
}
}, nil
},
}
err := chatloop.Run(context.Background(), chatloop.RunOptions{
Model: model,
Messages: []fantasy.Message{
{
Role: fantasy.MessageRoleUser,
Content: []fantasy.MessagePart{
fantasy.TextPart{Text: "hello"},
},
},
},
MaxSteps: 1,
PersistStep: func(_ context.Context, _ chatloop.PersistedStep) error {
return nil
},
Metrics: metrics,
})
require.NoError(t, err)
families, err := reg.Gather()
require.NoError(t, err)
found := make(map[string]bool)
for _, f := range families {
found[f.GetName()] = true
switch f.GetName() {
case "coderd_chatd_steps_total":
require.Len(t, f.GetMetric(), 1)
assert.Equal(t, float64(1), f.GetMetric()[0].GetCounter().GetValue(),
"steps_total should be 1 after one step")
case "coderd_chatd_message_count":
require.Len(t, f.GetMetric(), 1)
assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
"message_count should have 1 observation")
case "coderd_chatd_prompt_size_bytes":
require.Len(t, f.GetMetric(), 1)
assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
"prompt_size_bytes should have 1 observation")
case "coderd_chatd_ttft_seconds":
require.Len(t, f.GetMetric(), 1)
assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
"ttft_seconds should have 1 observation")
}
}
assert.True(t, found["coderd_chatd_steps_total"], "steps_total not recorded")
assert.True(t, found["coderd_chatd_message_count"], "message_count not recorded")
assert.True(t, found["coderd_chatd_prompt_size_bytes"], "prompt_size_bytes not recorded")
assert.True(t, found["coderd_chatd_ttft_seconds"], "ttft_seconds not recorded")
}
+7
View File
@@ -198,6 +198,13 @@ deployment. They will always be available from the agent.
| `coderd_authz_authorize_duration_seconds` | histogram | Duration of the 'Authorize' call in seconds. Only counts calls that succeed. | `allowed` | | `coderd_authz_authorize_duration_seconds` | histogram | Duration of the 'Authorize' call in seconds. Only counts calls that succeed. | `allowed` |
| `coderd_authz_prepare_authorize_duration_seconds` | histogram | Duration of the 'PrepareAuthorize' call in seconds. | | | `coderd_authz_prepare_authorize_duration_seconds` | histogram | Duration of the 'PrepareAuthorize' call in seconds. | |
| `coderd_build_info` | gauge | Describes the current build/version of the Coder server. Value is always 1. | `revision` `version` | | `coderd_build_info` | gauge | Describes the current build/version of the Coder server. Value is always 1. | `revision` `version` |
| `coderd_chatd_chats` | gauge | Number of chats being processed, by state. | `state` |
| `coderd_chatd_compaction_total` | counter | Total compaction outcomes (only recorded when compaction was triggered or failed). | `provider` `result` |
| `coderd_chatd_message_count` | histogram | Number of messages in the prompt per LLM request. | `provider` |
| `coderd_chatd_prompt_size_bytes` | histogram | Estimated byte size of the prompt per LLM request. | `provider` |
| `coderd_chatd_steps_total` | counter | Total agentic loop steps across all chats. | `provider` |
| `coderd_chatd_tool_result_size_bytes` | histogram | Size in bytes of each tool execution result. | `provider` `tool_name` |
| `coderd_chatd_ttft_seconds` | histogram | Time-to-first-token: wall time from LLM request to first streamed chunk. | `provider` |
| `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` | | `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` |
| `coderd_db_query_latencies_seconds` | histogram | Latency distribution of queries in seconds. | `query` | | `coderd_db_query_latencies_seconds` | histogram | Latency distribution of queries in seconds. | `query` |
| `coderd_db_tx_duration_seconds` | histogram | Duration of transactions in seconds. | `success` `tx_id` | | `coderd_db_tx_duration_seconds` | histogram | Duration of transactions in seconds. | `success` `tx_id` |
+2 -2
View File
@@ -2908,7 +2908,7 @@ func TestPrebuildActivityBump(t *testing.T) {
require.Zero(t, prebuild.LatestBuild.MaxDeadline) require.Zero(t, prebuild.LatestBuild.MaxDeadline)
// When: activity bump is applied to an unclaimed prebuild // When: activity bump is applied to an unclaimed prebuild
workspacestats.ActivityBumpWorkspace(ctx, log, db, prebuild.ID, clock.Now().Add(10*time.Hour)) workspacestats.ActivityBumpWorkspace(ctx, log, db, prebuild.ID, clock.Now().Add(10*time.Hour), workspacestats.ActivityBumpReasonWorkspaceStats)
// Then: prebuild Deadline/MaxDeadline remain unchanged // Then: prebuild Deadline/MaxDeadline remain unchanged
prebuild = coderdtest.MustWorkspace(t, client, wb.Workspace.ID) prebuild = coderdtest.MustWorkspace(t, client, wb.Workspace.ID)
@@ -2941,7 +2941,7 @@ func TestPrebuildActivityBump(t *testing.T) {
workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID) workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID)
// When: activity bump is applied to a claimed prebuild // When: activity bump is applied to a claimed prebuild
workspacestats.ActivityBumpWorkspace(ctx, log, db, workspace.ID, clock.Now().Add(10*time.Hour)) workspacestats.ActivityBumpWorkspace(ctx, log, db, workspace.ID, clock.Now().Add(10*time.Hour), workspacestats.ActivityBumpReasonWorkspaceStats)
// Then: Deadline is extended by the activity bump, MaxDeadline remains unset // Then: Deadline is extended by the activity bump, MaxDeadline remains unset
workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID) workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID)
+21
View File
@@ -226,6 +226,27 @@ coderd_authz_prepare_authorize_duration_seconds 0
# HELP coderd_build_info Describes the current build/version of the Coder server. Value is always 1. # HELP coderd_build_info Describes the current build/version of the Coder server. Value is always 1.
# TYPE coderd_build_info gauge # TYPE coderd_build_info gauge
coderd_build_info{version="",revision=""} 0 coderd_build_info{version="",revision=""} 0
# HELP coderd_chatd_chats Number of chats being processed, by state.
# TYPE coderd_chatd_chats gauge
coderd_chatd_chats{state=""} 0
# HELP coderd_chatd_compaction_total Total compaction outcomes (only recorded when compaction was triggered or failed).
# TYPE coderd_chatd_compaction_total counter
coderd_chatd_compaction_total{provider="",result=""} 0
# HELP coderd_chatd_message_count Number of messages in the prompt per LLM request.
# TYPE coderd_chatd_message_count histogram
coderd_chatd_message_count{provider=""} 0
# HELP coderd_chatd_prompt_size_bytes Estimated byte size of the prompt per LLM request.
# TYPE coderd_chatd_prompt_size_bytes histogram
coderd_chatd_prompt_size_bytes{provider=""} 0
# HELP coderd_chatd_steps_total Total agentic loop steps across all chats.
# TYPE coderd_chatd_steps_total counter
coderd_chatd_steps_total{provider=""} 0
# HELP coderd_chatd_tool_result_size_bytes Size in bytes of each tool execution result.
# TYPE coderd_chatd_tool_result_size_bytes histogram
coderd_chatd_tool_result_size_bytes{provider="",tool_name=""} 0
# HELP coderd_chatd_ttft_seconds Time-to-first-token: wall time from LLM request to first streamed chunk.
# TYPE coderd_chatd_ttft_seconds histogram
coderd_chatd_ttft_seconds{provider=""} 0
# HELP coderd_db_query_counts_total Total number of queries labelled by HTTP route, method, and query name. # HELP coderd_db_query_counts_total Total number of queries labelled by HTTP route, method, and query name.
# TYPE coderd_db_query_counts_total counter # TYPE coderd_db_query_counts_total counter
coderd_db_query_counts_total{route="",method="",query=""} 0 coderd_db_query_counts_total{route="",method="",query=""} 0