diff --git a/coderd/agentapi/apps.go b/coderd/agentapi/apps.go index 00dd1821f7..759fb26e5c 100644 --- a/coderd/agentapi/apps.go +++ b/coderd/agentapi/apps.go @@ -216,7 +216,7 @@ func (a *AppsAPI) UpdateAppStatus(ctx context.Context, req *agentproto.UpdateApp // We pass time.Time{} for nextAutostart since we don't have access to // TemplateScheduleStore here. The activity bump logic handles this by // defaulting to the template's activity_bump duration (typically 1 hour). - workspacestats.ActivityBumpWorkspace(ctx, a.Log, a.Database, ws.ID, time.Time{}) + workspacestats.ActivityBumpWorkspace(ctx, a.Log, a.Database, ws.ID, time.Time{}, workspacestats.ActivityBumpReasonAppActivity) } // just return a blank response because it doesn't contain any settable fields at present. return new(agentproto.UpdateAppStatusResponse), nil diff --git a/coderd/coderd.go b/coderd/coderd.go index 834eaf6fb5..076825dd2f 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -792,6 +792,7 @@ func New(options *Options) *API { Pubsub: options.Pubsub, WebpushDispatcher: options.WebPushDispatcher, UsageTracker: options.WorkspaceUsageTracker, + PrometheusRegistry: options.PrometheusRegistry, }) gitSyncLogger := options.Logger.Named("gitsync") refresher := gitsync.NewRefresher( diff --git a/coderd/workspacestats/activitybump.go b/coderd/workspacestats/activitybump.go index 0cf4767ad9..0f6014805a 100644 --- a/coderd/workspacestats/activitybump.go +++ b/coderd/workspacestats/activitybump.go @@ -11,6 +11,21 @@ import ( "github.com/coder/coder/v2/coderd/database" ) +// ActivityBumpReason represents the reason for an activity bump. +type ActivityBumpReason string + +const ( + // ActivityBumpReasonWorkspaceStats indicates the bump was triggered + // by SSH or terminal activity reported via workspace stats. + ActivityBumpReasonWorkspaceStats ActivityBumpReason = "workspace_stats" + // ActivityBumpReasonChatHeartbeat indicates the bump was triggered + // by an AI chat heartbeat. + ActivityBumpReasonChatHeartbeat ActivityBumpReason = "chat_heartbeat" + // ActivityBumpReasonAppActivity indicates the bump was triggered + // by app or port-forward activity. + ActivityBumpReasonAppActivity ActivityBumpReason = "app_activity" +) + // ActivityBumpWorkspace automatically bumps the workspace's auto-off timer // if it is set to expire soon. The deadline will be bumped by 1 hour*. // If the bump crosses over an autostart time, the workspace will be @@ -36,7 +51,7 @@ import ( // A way to avoid this is to configure the max deadline to something that will not // span more than 1 day. This will force the workspace to restart and reset the deadline // each morning when it autostarts. -func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Store, workspaceID uuid.UUID, nextAutostart time.Time) { +func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Store, workspaceID uuid.UUID, nextAutostart time.Time, reason ActivityBumpReason) { // We set a short timeout so if the app is under load, these // low priority operations fail first. ctx, cancel := context.WithTimeout(ctx, time.Second*15) @@ -50,6 +65,7 @@ func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Sto // Bump will fail if the context is canceled, but this is ok. log.Error(ctx, "activity bump failed", slog.Error(err), slog.F("workspace_id", workspaceID), + slog.F("reason", reason), ) } return @@ -57,5 +73,6 @@ func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Sto log.Debug(ctx, "bumped deadline from activity", slog.F("workspace_id", workspaceID), + slog.F("reason", reason), ) } diff --git a/coderd/workspacestats/activitybump_test.go b/coderd/workspacestats/activitybump_test.go index dec683ca55..8838ed6583 100644 --- a/coderd/workspacestats/activitybump_test.go +++ b/coderd/workspacestats/activitybump_test.go @@ -268,13 +268,14 @@ func Test_ActivityBumpWorkspace(t *testing.T) { // Bump duration is measured from the time of the bump, so we measure from here. start := dbtime.Now() - workspacestats.ActivityBumpWorkspace(ctx, log, db, bld.WorkspaceID, nextAutostart(start)) + workspacestats.ActivityBumpWorkspace(ctx, log, db, bld.WorkspaceID, nextAutostart(start), workspacestats.ActivityBumpReasonWorkspaceStats) end := dbtime.Now() // Validate our state after bump updatedBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, bld.WorkspaceID) require.NoError(t, err, "unexpected error getting latest workspace build") require.Equal(t, bld.MaxDeadline.UTC(), updatedBuild.MaxDeadline.UTC(), "max_deadline should not have changed") + if tt.expectedBump == 0 { assert.Equal(t, bld.UpdatedAt.UTC(), updatedBuild.UpdatedAt.UTC(), "should not have bumped updated_at") assert.Equal(t, bld.Deadline.UTC(), updatedBuild.Deadline.UTC(), "should not have bumped deadline") diff --git a/coderd/workspacestats/reporter.go b/coderd/workspacestats/reporter.go index e2a7beb1cf..c5b8f9f70a 100644 --- a/coderd/workspacestats/reporter.go +++ b/coderd/workspacestats/reporter.go @@ -194,7 +194,7 @@ func (r *Reporter) ReportAgentStats(ctx context.Context, now time.Time, workspac } // bump workspace activity - ActivityBumpWorkspace(ctx, r.opts.Logger.Named("activity_bump"), r.opts.Database, workspace.ID, nextAutostart) + ActivityBumpWorkspace(ctx, r.opts.Logger.Named("activity_bump"), r.opts.Database, workspace.ID, nextAutostart, ActivityBumpReasonWorkspaceStats) } // bump workspace last_used_at diff --git a/coderd/x/chatd/chatd.go b/coderd/x/chatd/chatd.go index 474675476d..90043de2fe 100644 --- a/coderd/x/chatd/chatd.go +++ b/coderd/x/chatd/chatd.go @@ -18,6 +18,7 @@ import ( "charm.land/fantasy" "charm.land/fantasy/providers/anthropic" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/shopspring/decimal" "github.com/sqlc-dev/pqtype" "golang.org/x/sync/errgroup" @@ -145,6 +146,7 @@ type Server struct { usageTracker *workspacestats.UsageTracker clock quartz.Clock + metrics *chatloop.Metrics recordingSem chan struct{} // Configuration @@ -2700,6 +2702,7 @@ type Config struct { WebpushDispatcher webpush.Dispatcher UsageTracker *workspacestats.UsageTracker Clock quartz.Clock + PrometheusRegistry prometheus.Registerer } // New creates a new chat processor. The processor polls for pending @@ -2768,6 +2771,11 @@ func New(cfg Config) *Server { wakeCh: make(chan struct{}, 1), heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry), } + if cfg.PrometheusRegistry != nil { + p.metrics = chatloop.NewMetrics(cfg.PrometheusRegistry) + } else { + p.metrics = chatloop.NopMetrics() + } //nolint:gocritic // The chat processor uses a scoped chatd context. ctx = dbauthz.AsChatd(ctx) @@ -4027,7 +4035,7 @@ func (p *Server) trackWorkspaceUsage( // approx. 333 CTE queries/second. A cheap fix for this could // be to heartbeat every Nth query. Leaving as potential future // low-hanging fruit if needed. - workspacestats.ActivityBumpWorkspace(ctx, logger.Named("activity_bump"), p.db, wsID.UUID, time.Time{}) + workspacestats.ActivityBumpWorkspace(ctx, logger.Named("activity_bump"), p.db, wsID.UUID, time.Time{}, workspacestats.ActivityBumpReasonChatHeartbeat) } return wsID } @@ -4036,6 +4044,9 @@ func (p *Server) processChat(ctx context.Context, chat database.Chat) { logger := p.logger.With(slog.F("chat_id", chat.ID)) logger.Info(ctx, "processing chat request") + p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Inc() + defer p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Dec() + chatCtx, cancel := context.WithCancelCause(ctx) defer cancel(nil) @@ -4246,6 +4257,12 @@ func (p *Server) processChat(ctx context.Context, chat database.Chat) { } }() + p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Dec() + p.metrics.Chats.WithLabelValues(chatloop.StateStreaming).Inc() + defer func() { + p.metrics.Chats.WithLabelValues(chatloop.StateStreaming).Dec() + p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Inc() + }() runResult, err := p.runChat(chatCtx, chat, generatedTitle, logger) if err != nil { if errors.Is(err, chatloop.ErrInterrupted) || errors.Is(context.Cause(chatCtx), chatloop.ErrInterrupted) { @@ -5165,12 +5182,18 @@ func (p *Server) runChat( ) } + // Record builtin tool names before appending MCP tools + // so the metrics layer can bound label cardinality. + builtinToolNames := make(map[string]bool, len(tools)) + for _, t := range tools { + builtinToolNames[t.Info().Name] = true + } + // Append tools from external MCP servers. These appear // after the built-in tools so the LLM sees them as // additional capabilities. tools = append(tools, mcpTools...) tools = append(tools, workspaceMCPTools...) - // Append dynamic tools declared by the client at chat // creation time. These appear in the LLM's tool list but // are never executed by the chatloop — the client handles @@ -5251,6 +5274,8 @@ func (p *Server) runChat( Model: model, Messages: prompt, Tools: tools, MaxSteps: maxChatSteps, + Metrics: p.metrics, + BuiltinToolNames: builtinToolNames, ModelConfig: callConfig, ProviderOptions: providerOptions, diff --git a/coderd/x/chatd/chatd_internal_test.go b/coderd/x/chatd/chatd_internal_test.go index 71de8a2a93..0a675311d3 100644 --- a/coderd/x/chatd/chatd_internal_test.go +++ b/coderd/x/chatd/chatd_internal_test.go @@ -2610,6 +2610,7 @@ func TestProcessChat_IgnoresStaleControlNotification(t *testing.T) { chatHeartbeatInterval: time.Minute, configCache: newChatConfigCache(ctx, db, clock), heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry), + metrics: chatloop.NopMetrics(), } // Publish a stale "pending" notification on the control channel diff --git a/coderd/x/chatd/chatloop/chatloop.go b/coderd/x/chatd/chatloop/chatloop.go index 7211ea29c6..1bd45d5b1f 100644 --- a/coderd/x/chatd/chatloop/chatloop.go +++ b/coderd/x/chatd/chatloop/chatloop.go @@ -150,6 +150,15 @@ type RunOptions struct { OnRetry chatretry.OnRetryFn OnInterruptedPersistError func(error) + + // Metrics records Prometheus metrics for the chatd subsystem. + // When nil, no metrics are recorded. + Metrics *Metrics + + // BuiltinToolNames lists tool names that are built into chatd. + // Tool results from tools not in this set are recorded under + // the "mcp" label to bound cardinality. + BuiltinToolNames map[string]bool } // ProviderTool pairs a provider-native tool definition with an @@ -297,6 +306,9 @@ func Run(ctx context.Context, opts RunOptions) error { if opts.Clock == nil { opts.Clock = quartz.NewReal() } + if opts.Metrics == nil { + opts.Metrics = NopMetrics() + } publishMessagePart := func(role codersdk.ChatMessageRole, part codersdk.ChatMessagePart) { if opts.PublishMessagePart == nil { @@ -343,6 +355,8 @@ func Run(ctx context.Context, opts RunOptions) error { for step := 0; totalSteps < opts.MaxSteps; step++ { totalSteps++ + provider := opts.Model.Provider() + opts.Metrics.StepsTotal.WithLabelValues(provider).Inc() stepStart := time.Now() // Copy messages so that provider-specific caching // mutations don't leak back to the caller's slice. @@ -354,7 +368,8 @@ func Run(ctx context.Context, opts RunOptions) error { if applyAnthropicCaching { addAnthropicPromptCaching(prepared) } - + opts.Metrics.MessageCount.WithLabelValues(provider).Observe(float64(len(prepared))) + opts.Metrics.PromptSizeBytes.WithLabelValues(provider).Observe(float64(EstimatePromptSize(prepared))) call := fantasy.Call{ Prompt: prepared, Tools: tools, @@ -371,12 +386,13 @@ func Run(ctx context.Context, opts RunOptions) error { err := chatretry.Retry(ctx, func(retryCtx context.Context) error { attempt, streamErr := guardedStream( retryCtx, - opts.Model.Provider(), + provider, opts.Clock, opts.StartupTimeout, func(attemptCtx context.Context) (fantasy.StreamResponse, error) { return opts.Model.Stream(attemptCtx, call) }, + opts.Metrics, ) if streamErr != nil { return streamErr @@ -444,7 +460,7 @@ func Run(ctx context.Context, opts RunOptions) error { } // Execute only built-in tools. - toolResults = executeTools(ctx, opts.Tools, opts.ProviderTools, builtinCalls, func(tr fantasy.ToolResultContent, completedAt time.Time) { + toolResults = executeTools(ctx, opts.Tools, opts.ProviderTools, builtinCalls, opts.Metrics, provider, opts.BuiltinToolNames, func(tr fantasy.ToolResultContent, completedAt time.Time) { recordToolResultTimestamp(&result, tr.ToolCallID, completedAt) ssePart := chatprompt.PartFromContent(tr) ssePart.CreatedAt = &completedAt @@ -582,9 +598,11 @@ func Run(ctx context.Context, opts RunOptions) error { result.providerMetadata, messages, ) + opts.Metrics.RecordCompaction(opts.Model.Provider(), did, compactErr) if compactErr != nil && opts.Compaction.OnError != nil { opts.Compaction.OnError(compactErr) } + if did { alreadyCompacted = true compactedOnFinalStep = true @@ -622,6 +640,7 @@ func Run(ctx context.Context, opts RunOptions) error { lastProviderMetadata, messages, ) + opts.Metrics.RecordCompaction(opts.Model.Provider(), did, err) if err != nil { if opts.Compaction.OnError != nil { opts.Compaction.OnError(err) @@ -720,6 +739,7 @@ func guardedStream( clock quartz.Clock, timeout time.Duration, openStream func(context.Context) (fantasy.StreamResponse, error), + metrics *Metrics, ) (guardedAttempt, error) { attemptCtx, cancelAttempt := context.WithCancelCause(parent) guard := newStartupGuard(clock, timeout, cancelAttempt) @@ -731,6 +751,7 @@ func guardedStream( }) } + streamStart := clock.Now() stream, err := openStream(attemptCtx) if err != nil { err = classifyStartupTimeout(attemptCtx, provider, err) @@ -738,11 +759,17 @@ func guardedStream( return guardedAttempt{}, err } + recordTTFT := sync.OnceFunc(func() { + metrics.TTFTSeconds.WithLabelValues(provider).Observe( + clock.Since(streamStart).Seconds(), + ) + }) return guardedAttempt{ ctx: attemptCtx, stream: fantasy.StreamResponse(func(yield func(fantasy.StreamPart) bool) { for part := range stream { guard.Disarm() + recordTTFT() if !yield(part) { return } @@ -985,6 +1012,9 @@ func executeTools( allTools []fantasy.AgentTool, providerTools []ProviderTool, toolCalls []fantasy.ToolCallContent, + metrics *Metrics, + provider string, + builtinToolNames map[string]bool, onResult func(fantasy.ToolResultContent, time.Time), ) []fantasy.ToolResultContent { if len(toolCalls) == 0 { @@ -1039,7 +1069,7 @@ func executeTools( // accurate individual completion times. completedAt[i] = dbtime.Now() }() - results[i] = executeSingleTool(ctx, toolMap, tc) + results[i] = executeSingleTool(ctx, toolMap, tc, metrics, provider, builtinToolNames) }() } wg.Wait() @@ -1060,12 +1090,24 @@ func executeSingleTool( ctx context.Context, toolMap map[string]fantasy.AgentTool, tc fantasy.ToolCallContent, + metrics *Metrics, + provider string, + builtinToolNames map[string]bool, ) fantasy.ToolResultContent { result := fantasy.ToolResultContent{ ToolCallID: tc.ToolCallID, ToolName: tc.ToolName, ProviderExecuted: false, } + defer func() { + toolLabel := tc.ToolName + if !builtinToolNames[tc.ToolName] { + toolLabel = "mcp" + } + metrics.ToolResultSizeBytes.WithLabelValues(provider, toolLabel).Observe( + float64(ToolResultSize(result)), + ) + }() tool, exists := toolMap[tc.ToolName] if !exists { @@ -1256,7 +1298,7 @@ func tryCompactOnExit( if err != nil { return } - _, compactErr := tryCompact( + did, compactErr := tryCompact( ctx, opts.Model, opts.Compaction, @@ -1265,6 +1307,7 @@ func tryCompactOnExit( metadata, reloaded, ) + opts.Metrics.RecordCompaction(opts.Model.Provider(), did, compactErr) if compactErr != nil && opts.Compaction.OnError != nil { opts.Compaction.OnError(compactErr) } diff --git a/coderd/x/chatd/chatloop/metrics.go b/coderd/x/chatd/chatloop/metrics.go new file mode 100644 index 0000000000..c815c53544 --- /dev/null +++ b/coderd/x/chatd/chatloop/metrics.go @@ -0,0 +1,173 @@ +package chatloop + +import ( + "context" + "errors" + + "charm.land/fantasy" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +const ( + metricsNamespace = "coderd" + metricsSubsystem = "chatd" + + // Label values for Chats. + StateStreaming = "streaming" + StateWaiting = "waiting" + + // Label values for CompactionTotal. + CompactionResultSuccess = "success" + CompactionResultError = "error" + CompactionResultTimeout = "timeout" +) + +// Metrics holds Prometheus metrics for the chatd subsystem. +type Metrics struct { + Chats *prometheus.GaugeVec + MessageCount *prometheus.HistogramVec + PromptSizeBytes *prometheus.HistogramVec + ToolResultSizeBytes *prometheus.HistogramVec + TTFTSeconds *prometheus.HistogramVec + CompactionTotal *prometheus.CounterVec + StepsTotal *prometheus.CounterVec +} + +// NewMetrics creates a new Metrics instance registered with the +// given registerer. +func NewMetrics(reg prometheus.Registerer) *Metrics { + factory := promauto.With(reg) + return &Metrics{ + Chats: factory.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "chats", + Help: "Number of chats being processed, by state.", + }, []string{"state"}), + MessageCount: factory.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "message_count", + Help: "Number of messages in the prompt per LLM request.", + Buckets: prometheus.ExponentialBuckets(1, 2, 8), // 1, 2, 4, 8, 16, 32, 64, 128 + }, []string{"provider"}), + PromptSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "prompt_size_bytes", + Help: "Estimated byte size of the prompt per LLM request.", + Buckets: prometheus.ExponentialBuckets(1024, 4, 10), // 1KB .. 256MB + }, []string{"provider"}), + ToolResultSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "tool_result_size_bytes", + Help: "Size in bytes of each tool execution result.", + Buckets: prometheus.ExponentialBuckets(64, 4, 9), // 64B .. 4MB + }, []string{"provider", "tool_name"}), + TTFTSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "ttft_seconds", + Help: "Time-to-first-token: wall time from LLM request to first streamed chunk.", + Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60}, + }, []string{"provider"}), + CompactionTotal: factory.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "compaction_total", + Help: "Total compaction outcomes (only recorded when compaction was triggered or failed).", + }, []string{"provider", "result"}), + StepsTotal: factory.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "steps_total", + Help: "Total agentic loop steps across all chats.", + }, []string{"provider"}), + } +} + +// NopMetrics returns a Metrics instance that discards all data. +// Useful for tests and when metrics collection is not desired. +func NopMetrics() *Metrics { + return NewMetrics(prometheus.NewRegistry()) +} + +// RecordCompaction classifies and records a compaction attempt. +// It is a no-op when m is nil. +func (m *Metrics) RecordCompaction(provider string, compacted bool, err error) { + if m == nil { + return + } + switch { + case err != nil && errors.Is(err, context.DeadlineExceeded): + m.CompactionTotal.WithLabelValues(provider, CompactionResultTimeout).Inc() + case err != nil && errors.Is(err, context.Canceled): + // User interruption, not a compaction failure. + return + case err != nil: + m.CompactionTotal.WithLabelValues(provider, CompactionResultError).Inc() + case compacted: + m.CompactionTotal.WithLabelValues(provider, CompactionResultSuccess).Inc() + // !compacted && err == nil means threshold not reached -- not + // recorded. + } +} + +// EstimatePromptSize returns a cheap byte-size estimate of a +// fantasy prompt by summing the text content lengths of all +// message parts. This avoids JSON marshaling overhead. +func EstimatePromptSize(messages []fantasy.Message) int { + var size int + for _, msg := range messages { + for _, part := range msg.Content { + size += ContentPartSize(part) + } + } + return size +} + +// ContentPartSize returns the byte length of a MessagePart's +// primary text or data field. +func ContentPartSize(part fantasy.MessagePart) int { + switch p := part.(type) { + case fantasy.TextPart: + return len(p.Text) + case fantasy.ReasoningPart: + return len(p.Text) + case fantasy.FilePart: + return len(p.Data) + case fantasy.ToolCallPart: + return len(p.Input) + case fantasy.ToolResultPart: + return toolResultOutputSize(p.Output) + default: + return 0 + } +} + +// ToolResultSize returns the byte length of a +// ToolResultContent's primary text or data field. +func ToolResultSize(r fantasy.ToolResultContent) int { + return toolResultOutputSize(r.Result) +} + +func toolResultOutputSize(output fantasy.ToolResultOutputContent) int { + if output == nil { + return 0 + } + switch v := output.(type) { + case fantasy.ToolResultOutputContentText: + return len(v.Text) + case fantasy.ToolResultOutputContentError: + if v.Error != nil { + return len(v.Error.Error()) + } + return 0 + case fantasy.ToolResultOutputContentMedia: + return len(v.Data) + default: + return 0 + } +} diff --git a/coderd/x/chatd/chatloop/metrics_test.go b/coderd/x/chatd/chatloop/metrics_test.go new file mode 100644 index 0000000000..c7f3719e63 --- /dev/null +++ b/coderd/x/chatd/chatloop/metrics_test.go @@ -0,0 +1,342 @@ +package chatloop_test + +import ( + "context" + "testing" + + "charm.land/fantasy" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/coderd/x/chatd/chatloop" + "github.com/coder/coder/v2/coderd/x/chatd/chattest" +) + +func TestNewMetrics_RegistersAllMetrics(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + m := chatloop.NewMetrics(reg) + + // Initialize vector metrics so they appear in Gather output. + m.Chats.WithLabelValues(chatloop.StateStreaming) + m.CompactionTotal.WithLabelValues("anthropic", chatloop.CompactionResultSuccess) + m.ToolResultSizeBytes.WithLabelValues("anthropic", "test") + m.MessageCount.WithLabelValues("anthropic") + m.PromptSizeBytes.WithLabelValues("anthropic") + m.TTFTSeconds.WithLabelValues("anthropic") + m.StepsTotal.WithLabelValues("anthropic") + + families, err := reg.Gather() + require.NoError(t, err) + + expected := map[string]dto.MetricType{ + "coderd_chatd_chats": dto.MetricType_GAUGE, + "coderd_chatd_message_count": dto.MetricType_HISTOGRAM, + "coderd_chatd_prompt_size_bytes": dto.MetricType_HISTOGRAM, + "coderd_chatd_tool_result_size_bytes": dto.MetricType_HISTOGRAM, + "coderd_chatd_ttft_seconds": dto.MetricType_HISTOGRAM, + "coderd_chatd_compaction_total": dto.MetricType_COUNTER, + "coderd_chatd_steps_total": dto.MetricType_COUNTER, + } + + found := make(map[string]dto.MetricType) + for _, f := range families { + found[f.GetName()] = f.GetType() + } + + for name, expectedType := range expected { + actualType, ok := found[name] + assert.True(t, ok, "metric %q not registered", name) + if ok { + assert.Equal(t, expectedType, actualType, "metric %q has wrong type", name) + } + } +} + +func TestNopMetrics_DoesNotPanic(t *testing.T) { + t.Parallel() + + m := chatloop.NopMetrics() + + // Exercise every metric to confirm no nil-pointer panics. + m.Chats.WithLabelValues("streaming").Inc() + m.Chats.WithLabelValues("streaming").Dec() + m.Chats.WithLabelValues("waiting").Inc() + m.Chats.WithLabelValues("waiting").Dec() + m.MessageCount.WithLabelValues("anthropic").Observe(10) + m.PromptSizeBytes.WithLabelValues("openai").Observe(4096) + m.ToolResultSizeBytes.WithLabelValues("anthropic", "execute").Observe(512) + m.TTFTSeconds.WithLabelValues("anthropic").Observe(0.5) + m.CompactionTotal.WithLabelValues("anthropic", "success").Inc() + m.CompactionTotal.WithLabelValues("openai", "error").Inc() + m.CompactionTotal.WithLabelValues("google", "timeout").Inc() + m.StepsTotal.WithLabelValues("anthropic").Inc() +} + +func TestEstimatePromptSize(t *testing.T) { + t.Parallel() + + messages := []fantasy.Message{ + { + Role: fantasy.MessageRoleSystem, + Content: []fantasy.MessagePart{ + fantasy.TextPart{Text: "You are a helpful assistant."}, + }, + }, + { + Role: fantasy.MessageRoleUser, + Content: []fantasy.MessagePart{ + fantasy.TextPart{Text: "Hello world"}, + fantasy.ReasoningPart{Text: "thinking..."}, + fantasy.FilePart{Data: []byte("filedata")}, + }, + }, + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.TextPart{Text: "Hi there!"}, + fantasy.ToolCallPart{Input: `{"file":"main.go"}`}, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + Output: fantasy.ToolResultOutputContentText{Text: "result"}, + }, + }, + }, + } + + size := chatloop.EstimatePromptSize(messages) + // "You are a helpful assistant." (28) + "Hello world" (11) + + // "thinking..." (11) + "filedata" (8) + + // "Hi there!" (9) + `{"file":"main.go"}` (18) + + // "result" (6) = 91 + assert.Equal(t, 91, size) +} + +func TestToolResultSize(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + result fantasy.ToolResultContent + expected int + }{ + { + name: "text", + result: fantasy.ToolResultContent{ + Result: fantasy.ToolResultOutputContentText{Text: "hello"}, + }, + expected: 5, + }, + { + name: "error", + result: fantasy.ToolResultContent{ + Result: fantasy.ToolResultOutputContentError{ + Error: assert.AnError, + }, + }, + expected: len(assert.AnError.Error()), + }, + { + name: "media", + result: fantasy.ToolResultContent{ + Result: fantasy.ToolResultOutputContentMedia{Data: "base64data"}, + }, + expected: 10, + }, + { + name: "nil_result", + result: fantasy.ToolResultContent{}, + expected: 0, + }, + { + name: "error_nil_error", + result: fantasy.ToolResultContent{ + Result: fantasy.ToolResultOutputContentError{Error: nil}, + }, + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.expected, chatloop.ToolResultSize(tt.result)) + }) + } +} + +func TestRecordCompaction(t *testing.T) { + t.Parallel() + + t.Run("nil metrics does not panic", func(t *testing.T) { + t.Parallel() + var m *chatloop.Metrics + m.RecordCompaction("anthropic", true, nil) + }) + + tests := []struct { + name string + compacted bool + err error + wantLabel string + wantCount int + }{ + { + name: "success", + compacted: true, + err: nil, + wantLabel: chatloop.CompactionResultSuccess, + wantCount: 1, + }, + { + name: "error", + compacted: false, + err: assert.AnError, + wantLabel: chatloop.CompactionResultError, + wantCount: 1, + }, + { + name: "timeout", + compacted: false, + err: context.DeadlineExceeded, + wantLabel: chatloop.CompactionResultTimeout, + wantCount: 1, + }, + { + name: "threshold_not_reached", + compacted: false, + err: nil, + wantLabel: "", + wantCount: 0, + }, + { + name: "canceled", + compacted: false, + err: context.Canceled, + wantLabel: "", + wantCount: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + m := chatloop.NewMetrics(reg) + m.RecordCompaction("test", tt.compacted, tt.err) + + families, err := reg.Gather() + require.NoError(t, err) + + if tt.wantCount == 0 { + for _, f := range families { + assert.NotEqual(t, "coderd_chatd_compaction_total", f.GetName(), + "compaction_total should not be recorded") + } + return + } + + var found bool + for _, f := range families { + if f.GetName() != "coderd_chatd_compaction_total" { + continue + } + found = true + require.Len(t, f.GetMetric(), 1) + metric := f.GetMetric()[0] + assert.Equal(t, float64(tt.wantCount), metric.GetCounter().GetValue()) + // Check label. + for _, lp := range metric.GetLabel() { + if lp.GetName() == "result" { + assert.Equal(t, tt.wantLabel, lp.GetValue()) + } + } + } + assert.True(t, found, "compaction_total metric not found") + }) + } +} + +func TestRun_RecordsMetrics(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + metrics := chatloop.NewMetrics(reg) + + model := &chattest.FakeModel{ + ProviderName: "test-provider", + StreamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) { + return func(yield func(fantasy.StreamPart) bool) { + parts := []fantasy.StreamPart{ + {Type: fantasy.StreamPartTypeTextStart, ID: "t1"}, + {Type: fantasy.StreamPartTypeTextDelta, ID: "t1", Delta: "hello"}, + {Type: fantasy.StreamPartTypeTextEnd, ID: "t1"}, + {Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonStop}, + } + for _, p := range parts { + if !yield(p) { + return + } + } + }, nil + }, + } + + err := chatloop.Run(context.Background(), chatloop.RunOptions{ + Model: model, + Messages: []fantasy.Message{ + { + Role: fantasy.MessageRoleUser, + Content: []fantasy.MessagePart{ + fantasy.TextPart{Text: "hello"}, + }, + }, + }, + MaxSteps: 1, + PersistStep: func(_ context.Context, _ chatloop.PersistedStep) error { + return nil + }, + Metrics: metrics, + }) + require.NoError(t, err) + + families, err := reg.Gather() + require.NoError(t, err) + + found := make(map[string]bool) + for _, f := range families { + found[f.GetName()] = true + + switch f.GetName() { + case "coderd_chatd_steps_total": + require.Len(t, f.GetMetric(), 1) + assert.Equal(t, float64(1), f.GetMetric()[0].GetCounter().GetValue(), + "steps_total should be 1 after one step") + case "coderd_chatd_message_count": + require.Len(t, f.GetMetric(), 1) + assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(), + "message_count should have 1 observation") + case "coderd_chatd_prompt_size_bytes": + require.Len(t, f.GetMetric(), 1) + assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(), + "prompt_size_bytes should have 1 observation") + case "coderd_chatd_ttft_seconds": + require.Len(t, f.GetMetric(), 1) + assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(), + "ttft_seconds should have 1 observation") + } + } + + assert.True(t, found["coderd_chatd_steps_total"], "steps_total not recorded") + assert.True(t, found["coderd_chatd_message_count"], "message_count not recorded") + assert.True(t, found["coderd_chatd_prompt_size_bytes"], "prompt_size_bytes not recorded") + assert.True(t, found["coderd_chatd_ttft_seconds"], "ttft_seconds not recorded") +} diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index 97a9588278..b2638b8b5c 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -198,6 +198,13 @@ deployment. They will always be available from the agent. | `coderd_authz_authorize_duration_seconds` | histogram | Duration of the 'Authorize' call in seconds. Only counts calls that succeed. | `allowed` | | `coderd_authz_prepare_authorize_duration_seconds` | histogram | Duration of the 'PrepareAuthorize' call in seconds. | | | `coderd_build_info` | gauge | Describes the current build/version of the Coder server. Value is always 1. | `revision` `version` | +| `coderd_chatd_chats` | gauge | Number of chats being processed, by state. | `state` | +| `coderd_chatd_compaction_total` | counter | Total compaction outcomes (only recorded when compaction was triggered or failed). | `provider` `result` | +| `coderd_chatd_message_count` | histogram | Number of messages in the prompt per LLM request. | `provider` | +| `coderd_chatd_prompt_size_bytes` | histogram | Estimated byte size of the prompt per LLM request. | `provider` | +| `coderd_chatd_steps_total` | counter | Total agentic loop steps across all chats. | `provider` | +| `coderd_chatd_tool_result_size_bytes` | histogram | Size in bytes of each tool execution result. | `provider` `tool_name` | +| `coderd_chatd_ttft_seconds` | histogram | Time-to-first-token: wall time from LLM request to first streamed chunk. | `provider` | | `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` | | `coderd_db_query_latencies_seconds` | histogram | Latency distribution of queries in seconds. | `query` | | `coderd_db_tx_duration_seconds` | histogram | Duration of transactions in seconds. | `success` `tx_id` | diff --git a/enterprise/coderd/workspaces_test.go b/enterprise/coderd/workspaces_test.go index 129ac0df32..d565939919 100644 --- a/enterprise/coderd/workspaces_test.go +++ b/enterprise/coderd/workspaces_test.go @@ -2908,7 +2908,7 @@ func TestPrebuildActivityBump(t *testing.T) { require.Zero(t, prebuild.LatestBuild.MaxDeadline) // When: activity bump is applied to an unclaimed prebuild - workspacestats.ActivityBumpWorkspace(ctx, log, db, prebuild.ID, clock.Now().Add(10*time.Hour)) + workspacestats.ActivityBumpWorkspace(ctx, log, db, prebuild.ID, clock.Now().Add(10*time.Hour), workspacestats.ActivityBumpReasonWorkspaceStats) // Then: prebuild Deadline/MaxDeadline remain unchanged prebuild = coderdtest.MustWorkspace(t, client, wb.Workspace.ID) @@ -2941,7 +2941,7 @@ func TestPrebuildActivityBump(t *testing.T) { workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID) // When: activity bump is applied to a claimed prebuild - workspacestats.ActivityBumpWorkspace(ctx, log, db, workspace.ID, clock.Now().Add(10*time.Hour)) + workspacestats.ActivityBumpWorkspace(ctx, log, db, workspace.ID, clock.Now().Add(10*time.Hour), workspacestats.ActivityBumpReasonWorkspaceStats) // Then: Deadline is extended by the activity bump, MaxDeadline remains unset workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID) diff --git a/scripts/metricsdocgen/generated_metrics b/scripts/metricsdocgen/generated_metrics index b7ada39d28..6ceed627c6 100644 --- a/scripts/metricsdocgen/generated_metrics +++ b/scripts/metricsdocgen/generated_metrics @@ -226,6 +226,27 @@ coderd_authz_prepare_authorize_duration_seconds 0 # HELP coderd_build_info Describes the current build/version of the Coder server. Value is always 1. # TYPE coderd_build_info gauge coderd_build_info{version="",revision=""} 0 +# HELP coderd_chatd_chats Number of chats being processed, by state. +# TYPE coderd_chatd_chats gauge +coderd_chatd_chats{state=""} 0 +# HELP coderd_chatd_compaction_total Total compaction outcomes (only recorded when compaction was triggered or failed). +# TYPE coderd_chatd_compaction_total counter +coderd_chatd_compaction_total{provider="",result=""} 0 +# HELP coderd_chatd_message_count Number of messages in the prompt per LLM request. +# TYPE coderd_chatd_message_count histogram +coderd_chatd_message_count{provider=""} 0 +# HELP coderd_chatd_prompt_size_bytes Estimated byte size of the prompt per LLM request. +# TYPE coderd_chatd_prompt_size_bytes histogram +coderd_chatd_prompt_size_bytes{provider=""} 0 +# HELP coderd_chatd_steps_total Total agentic loop steps across all chats. +# TYPE coderd_chatd_steps_total counter +coderd_chatd_steps_total{provider=""} 0 +# HELP coderd_chatd_tool_result_size_bytes Size in bytes of each tool execution result. +# TYPE coderd_chatd_tool_result_size_bytes histogram +coderd_chatd_tool_result_size_bytes{provider="",tool_name=""} 0 +# HELP coderd_chatd_ttft_seconds Time-to-first-token: wall time from LLM request to first streamed chunk. +# TYPE coderd_chatd_ttft_seconds histogram +coderd_chatd_ttft_seconds{provider=""} 0 # HELP coderd_db_query_counts_total Total number of queries labelled by HTTP route, method, and query name. # TYPE coderd_db_query_counts_total counter coderd_db_query_counts_total{route="",method="",query=""} 0