diff --git a/coderd/x/chatd/chatd.go b/coderd/x/chatd/chatd.go index 6677aa52c7..8bd7f6635d 100644 --- a/coderd/x/chatd/chatd.go +++ b/coderd/x/chatd/chatd.go @@ -6410,7 +6410,7 @@ func (p *Server) runChat( } // Record builtin tool names before appending MCP tools - // so the metrics layer can bound label cardinality. + // so the metrics layer can differentiate between built-in and MCP tools. builtinToolNames := make(map[string]bool, len(tools)) for _, t := range tools { builtinToolNames[t.Info().Name] = true @@ -6507,6 +6507,23 @@ func (p *Server) runChat( var loopErr error triggerMessageID, historyTipMessageID, triggerLabel := deriveChatDebugSeed(messages) + + // Enrich the logger with correlation fields useful for + // diagnosing tool-call errors inside the chatloop. + loopLogger := logger.With( + slog.F("owner_id", chat.OwnerID), + slog.F("organization_id", chat.OrganizationID), + slog.F("trigger_message_id", triggerMessageID), + ) + if chat.WorkspaceID.Valid { + loopLogger = loopLogger.With(slog.F("workspace_id", chat.WorkspaceID.UUID)) + } + if chat.AgentID.Valid { + loopLogger = loopLogger.With(slog.F("agent_id", chat.AgentID.UUID)) + } + if chat.ParentChatID.Valid { + loopLogger = loopLogger.With(slog.F("parent_chat_id", chat.ParentChatID.UUID)) + } result.TriggerMessageID = triggerMessageID result.HistoryTipMessageID = historyTipMessageID finishDebugRun := func(error, any) {} @@ -6540,6 +6557,7 @@ func (p *Server) runChat( StopAfterTools: stopAfterBehaviorTools(currentPlanMode, chat.Mode, chat.ParentChatID), MaxSteps: maxChatSteps, Metrics: p.metrics, + Logger: loopLogger, BuiltinToolNames: builtinToolNames, ModelConfig: callConfig, @@ -6563,7 +6581,6 @@ func (p *Server) runChat( } p.publishMessagePart(chat.ID, role, part) }, - Logger: logger, Compaction: compactionOptions, ReloadMessages: func(reloadCtx context.Context) ([]fantasy.Message, error) { reloadedMsgs, err := p.db.GetChatMessagesForPromptByChatID(reloadCtx, chat.ID) diff --git a/coderd/x/chatd/chatloop/chatloop.go b/coderd/x/chatd/chatloop/chatloop.go index 43ed4bbc0c..ad647de15d 100644 --- a/coderd/x/chatd/chatloop/chatloop.go +++ b/coderd/x/chatd/chatloop/chatloop.go @@ -148,6 +148,8 @@ type RunOptions struct { role codersdk.ChatMessageRole, part codersdk.ChatMessagePart, ) + // Callers should attach correlation fields (chat_id, owner_id, etc.) + // using Logger.With before passing the logger in. Logger slog.Logger Compaction *CompactionOptions ReloadMessages func(context.Context) ([]fantasy.Message, error) @@ -175,8 +177,6 @@ type RunOptions struct { Metrics *Metrics // BuiltinToolNames lists tool names that are built into chatd. - // Tool results from tools not in this set are recorded under - // the "mcp" label to bound cardinality. BuiltinToolNames map[string]bool } @@ -493,7 +493,7 @@ func Run(ctx context.Context, opts RunOptions) error { } // Execute only built-in tools. - toolResults = executeTools(ctx, opts.Tools, opts.ActiveTools, opts.ProviderTools, builtinCalls, opts.Metrics, provider, modelName, opts.BuiltinToolNames, func(tr fantasy.ToolResultContent, completedAt time.Time) { + toolResults = executeTools(ctx, opts.Tools, opts.ActiveTools, opts.ProviderTools, builtinCalls, opts.Metrics, opts.Logger, provider, modelName, opts.BuiltinToolNames, func(tr fantasy.ToolResultContent, completedAt time.Time) { recordToolResultTimestamp(&result, tr.ToolCallID, completedAt) publishToolAttachments(ctx, opts.Logger, tr, completedAt, publishMessagePart) ssePart := chatprompt.PartFromContentWithLogger(ctx, opts.Logger, tr) @@ -1054,6 +1054,7 @@ func executeTools( providerTools []ProviderTool, toolCalls []fantasy.ToolCallContent, metrics *Metrics, + logger slog.Logger, provider, model string, builtinToolNames map[string]bool, onResult func(fantasy.ToolResultContent, time.Time), @@ -1113,7 +1114,7 @@ func executeTools( // accurate individual completion times. completedAt[i] = dbtime.Now() }() - results[i] = executeSingleTool(ctx, toolMap, tc, metrics, provider, model, builtinToolNames, activeTools, providerRunnerNames) + results[i] = executeSingleTool(ctx, toolMap, tc, metrics, logger, provider, model, builtinToolNames, activeTools, providerRunnerNames) }() } wg.Wait() @@ -1135,6 +1136,7 @@ func executeSingleTool( toolMap map[string]fantasy.AgentTool, tc fantasy.ToolCallContent, metrics *Metrics, + logger slog.Logger, provider, model string, builtinToolNames map[string]bool, activeTools []string, @@ -1146,16 +1148,20 @@ func executeSingleTool( ProviderExecuted: false, } defer func() { - toolLabel := tc.ToolName - if !builtinToolNames[tc.ToolName] { - toolLabel = "mcp" + metricLabel := tc.ToolName + if metricLabel == "" { + metricLabel = "unknown" } - metrics.ToolResultSizeBytes.WithLabelValues(provider, model, toolLabel).Observe( + metrics.ToolResultSizeBytes.WithLabelValues(provider, model, metricLabel).Observe( float64(ToolResultSize(result)), ) + if _, ok := result.Result.(fantasy.ToolResultOutputContentError); ok { + metrics.RecordToolError(provider, model, metricLabel) + } }() - if _, isProviderRunner := providerRunnerNames[tc.ToolName]; !isProviderRunner && !isToolActive(tc.ToolName, activeTools) { + _, isProviderRunner := providerRunnerNames[tc.ToolName] + if !isProviderRunner && !isToolActive(tc.ToolName, activeTools) { result.Result = fantasy.ToolResultOutputContentError{ Error: xerrors.New("Tool not active in this turn: " + tc.ToolName), } @@ -1170,6 +1176,12 @@ func executeSingleTool( return result } + logger.Debug(ctx, "tool execution", + slog.F("tool_name", tc.ToolName), + slog.F("tool_call_id", tc.ToolCallID), + slog.F("builtin", builtinToolNames[tc.ToolName]), + slog.F("is_provider_runner", isProviderRunner), + ) resp, err := tool.Run(ctx, fantasy.ToolCall{ ID: tc.ToolCallID, Name: tc.ToolName, @@ -1180,6 +1192,11 @@ func executeSingleTool( Error: err, } result.ClientMetadata = resp.Metadata + logger.Error(ctx, "tool execution failed", + slog.F("tool_name", tc.ToolName), + slog.F("tool_call_id", tc.ToolCallID), + slog.Error(err), + ) return result } @@ -1189,6 +1206,11 @@ func executeSingleTool( result.Result = fantasy.ToolResultOutputContentError{ Error: xerrors.New(resp.Content), } + logger.Info(ctx, "tool returned error result", + slog.F("tool_name", tc.ToolName), + slog.F("tool_call_id", tc.ToolCallID), + slog.F("tool_error", resp.Content), + ) case resp.Type == "image" || resp.Type == "media": result.Result = fantasy.ToolResultOutputContentMedia{ Data: string(resp.Data), diff --git a/coderd/x/chatd/chatloop/metrics.go b/coderd/x/chatd/chatloop/metrics.go index cbe0638ab2..cc96eb379b 100644 --- a/coderd/x/chatd/chatloop/metrics.go +++ b/coderd/x/chatd/chatloop/metrics.go @@ -31,6 +31,7 @@ type Metrics struct { MessageCount *prometheus.HistogramVec PromptSizeBytes *prometheus.HistogramVec ToolResultSizeBytes *prometheus.HistogramVec + ToolErrorsTotal *prometheus.CounterVec TTFTSeconds *prometheus.HistogramVec CompactionTotal *prometheus.CounterVec StepsTotal *prometheus.CounterVec @@ -70,6 +71,12 @@ func NewMetrics(reg prometheus.Registerer) *Metrics { Help: "Size in bytes of each tool execution result.", Buckets: prometheus.ExponentialBuckets(64, 4, 9), // 64B .. 4MB }, []string{"provider", "model", "tool_name"}), + ToolErrorsTotal: factory.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "tool_errors_total", + Help: "Total tool calls that returned an error result.", + }, []string{"provider", "model", "tool_name"}), TTFTSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{ Namespace: metricsNamespace, Subsystem: metricsSubsystem, @@ -141,6 +148,15 @@ func (m *Metrics) RecordStreamRetry(provider, model string, classified chaterror m.StreamRetriesTotal.WithLabelValues(provider, model, classified.Kind).Inc() } +// RecordToolError increments tool_errors_total for the given +// tool. No-op when m is nil. +func (m *Metrics) RecordToolError(provider, model, toolLabel string) { + if m == nil { + return + } + m.ToolErrorsTotal.WithLabelValues(provider, model, toolLabel).Inc() +} + // RecordStreamBufferDropped increments stream_buffer_dropped_total // once per dropped event. No-op when m is nil. func (m *Metrics) RecordStreamBufferDropped() { diff --git a/coderd/x/chatd/chatloop/metrics_test.go b/coderd/x/chatd/chatloop/metrics_test.go index 3a7288a44a..5c4c296d43 100644 --- a/coderd/x/chatd/chatloop/metrics_test.go +++ b/coderd/x/chatd/chatloop/metrics_test.go @@ -28,6 +28,7 @@ func TestNewMetrics_RegistersAllMetrics(t *testing.T) { m.Chats.WithLabelValues(chatloop.StateStreaming) m.CompactionTotal.WithLabelValues("anthropic", "claude-sonnet-4-5", chatloop.CompactionResultSuccess) m.ToolResultSizeBytes.WithLabelValues("anthropic", "claude-sonnet-4-5", "test") + m.ToolErrorsTotal.WithLabelValues("anthropic", "claude-sonnet-4-5", "test") m.MessageCount.WithLabelValues("anthropic", "claude-sonnet-4-5") m.PromptSizeBytes.WithLabelValues("anthropic", "claude-sonnet-4-5") m.TTFTSeconds.WithLabelValues("anthropic", "claude-sonnet-4-5") @@ -50,6 +51,7 @@ func TestNewMetrics_RegistersAllMetrics(t *testing.T) { "coderd_chatd_steps_total": dto.MetricType_COUNTER, "coderd_chatd_stream_retries_total": dto.MetricType_COUNTER, "coderd_chatd_stream_buffer_dropped_total": dto.MetricType_COUNTER, + "coderd_chatd_tool_errors_total": dto.MetricType_COUNTER, } found := make(map[string]dto.MetricType) @@ -79,6 +81,7 @@ func TestNopMetrics_DoesNotPanic(t *testing.T) { m.MessageCount.WithLabelValues("anthropic", "claude-sonnet-4-5").Observe(10) m.PromptSizeBytes.WithLabelValues("openai", "gpt-5").Observe(4096) m.ToolResultSizeBytes.WithLabelValues("anthropic", "claude-sonnet-4-5", "execute").Observe(512) + m.ToolErrorsTotal.WithLabelValues("anthropic", "claude-sonnet-4-5", "execute").Inc() m.TTFTSeconds.WithLabelValues("anthropic", "claude-sonnet-4-5").Observe(0.5) m.CompactionTotal.WithLabelValues("anthropic", "claude-sonnet-4-5", "success").Inc() m.CompactionTotal.WithLabelValues("openai", "gpt-5", "error").Inc() @@ -93,6 +96,7 @@ func TestNopMetrics_DoesNotPanic(t *testing.T) { var nilMetrics *chatloop.Metrics nilMetrics.RecordStreamRetry("anthropic", "claude-sonnet-4-5", chaterror.ClassifiedError{Kind: chaterror.KindTimeout}) nilMetrics.RecordStreamBufferDropped() + nilMetrics.RecordToolError("anthropic", "claude-sonnet-4-5", "test") } func TestEstimatePromptSize(t *testing.T) { @@ -263,25 +267,11 @@ func TestRecordCompaction(t *testing.T) { return } - var found bool - for _, f := range families { - if f.GetName() != "coderd_chatd_compaction_total" { - continue - } - found = true - require.Len(t, f.GetMetric(), 1) - metric := f.GetMetric()[0] - assert.Equal(t, float64(tt.wantCount), metric.GetCounter().GetValue()) - // Check labels: provider, model, result. - labels := map[string]string{} - for _, lp := range metric.GetLabel() { - labels[lp.GetName()] = lp.GetValue() - } - assert.Equal(t, "test-provider", labels["provider"]) - assert.Equal(t, "test-model", labels["model"]) - assert.Equal(t, tt.wantLabel, labels["result"]) - } - assert.True(t, found, "compaction_total metric not found") + requireCounter(t, reg, "coderd_chatd_compaction_total", float64(tt.wantCount), map[string]string{ + "provider": "test-provider", + "model": "test-model", + "result": tt.wantLabel, + }) }) } } @@ -316,27 +306,11 @@ func TestRecordStreamRetry(t *testing.T) { Kind: tt.kind, }) - families, err := reg.Gather() - require.NoError(t, err) - - var found bool - for _, f := range families { - if f.GetName() != "coderd_chatd_stream_retries_total" { - continue - } - found = true - require.Len(t, f.GetMetric(), 1) - metric := f.GetMetric()[0] - assert.Equal(t, float64(1), metric.GetCounter().GetValue()) - labels := map[string]string{} - for _, lp := range metric.GetLabel() { - labels[lp.GetName()] = lp.GetValue() - } - assert.Equal(t, "test-provider", labels["provider"]) - assert.Equal(t, "test-model", labels["model"]) - assert.Equal(t, tt.kind, labels["kind"]) - } - assert.True(t, found, "stream_retries_total metric not found") + requireCounter(t, reg, "coderd_chatd_stream_retries_total", 1, map[string]string{ + "provider": "test-provider", + "model": "test-model", + "kind": tt.kind, + }) }) } } @@ -378,6 +352,58 @@ func TestRecordStreamBufferDropped(t *testing.T) { }) } +// requireCounter gathers metrics from reg, finds the named counter +// family, and asserts it has exactly one series with the given value +// and labels. +func requireCounter(t *testing.T, reg *prometheus.Registry, name string, wantValue float64, wantLabels map[string]string) { + t.Helper() + + families, err := reg.Gather() + require.NoError(t, err) + + for _, f := range families { + if f.GetName() != name { + continue + } + require.Len(t, f.GetMetric(), 1, "expected exactly one series for %s", name) + metric := f.GetMetric()[0] + assert.Equal(t, wantValue, metric.GetCounter().GetValue(), "counter value for %s", name) + labels := map[string]string{} + for _, lp := range metric.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + for k, v := range wantLabels { + assert.Equal(t, v, labels[k], "label %s for %s", k, name) + } + return + } + t.Fatalf("metric %s not found in gathered families", name) +} + +func TestRecordToolError(t *testing.T) { + t.Parallel() + + t.Run("nil metrics does not panic", func(t *testing.T) { + t.Parallel() + var m *chatloop.Metrics + m.RecordToolError("anthropic", "claude-sonnet-4-5", "test") + }) + + t.Run("increments with correct labels", func(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + m := chatloop.NewMetrics(reg) + m.RecordToolError("test-provider", "test-model", "read_file") + + requireCounter(t, reg, "coderd_chatd_tool_errors_total", 1, map[string]string{ + "provider": "test-provider", + "model": "test-model", + "tool_name": "read_file", + }) + }) +} + func TestRun_RecordsMetrics(t *testing.T) { t.Parallel() @@ -536,27 +562,11 @@ func TestRun_StreamRetry_RecordsMetric(t *testing.T) { assert.Equal(t, "test-provider", retries[0].classified.Provider) // Metric assertion. - families, err := reg.Gather() - require.NoError(t, err) - - var found bool - for _, f := range families { - if f.GetName() != "coderd_chatd_stream_retries_total" { - continue - } - found = true - require.Len(t, f.GetMetric(), 1) - metric := f.GetMetric()[0] - assert.Equal(t, float64(1), metric.GetCounter().GetValue()) - labels := map[string]string{} - for _, lp := range metric.GetLabel() { - labels[lp.GetName()] = lp.GetValue() - } - assert.Equal(t, "test-provider", labels["provider"]) - assert.Equal(t, "test-model", labels["model"]) - assert.Equal(t, chaterror.KindRateLimit, labels["kind"]) - } - assert.True(t, found, "stream_retries_total metric not found") + requireCounter(t, reg, "coderd_chatd_stream_retries_total", 1, map[string]string{ + "provider": "test-provider", + "model": "test-model", + "kind": chaterror.KindRateLimit, + }) } // TestRun_StreamRetry_CanceledDoesNotIncrement pins the invariant @@ -603,3 +613,105 @@ func TestRun_StreamRetry_CanceledDoesNotIncrement(t *testing.T) { } } } + +func TestRun_ToolError_RecordsMetric(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + toolFn func(context.Context, struct{}, fantasy.ToolCall) (fantasy.ToolResponse, error) + builtinToolNames map[string]bool + wantLabel string + }{ + { + name: "builtin_tool_IsError", + toolFn: func(_ context.Context, _ struct{}, _ fantasy.ToolCall) (fantasy.ToolResponse, error) { + return fantasy.ToolResponse{ + Content: "something went wrong", + IsError: true, + }, nil + }, + builtinToolNames: map[string]bool{"failing_tool": true}, + wantLabel: "failing_tool", + }, + { + name: "mcp_tool_IsError", + toolFn: func(_ context.Context, _ struct{}, _ fantasy.ToolCall) (fantasy.ToolResponse, error) { + return fantasy.ToolResponse{ + Content: "something went wrong", + IsError: true, + }, nil + }, + builtinToolNames: map[string]bool{}, + wantLabel: "failing_tool", + }, + { + name: "tool_Run_returns_error", + toolFn: func(_ context.Context, _ struct{}, _ fantasy.ToolCall) (fantasy.ToolResponse, error) { + return fantasy.ToolResponse{}, xerrors.New("connection refused") + }, + builtinToolNames: map[string]bool{"failing_tool": true}, + wantLabel: "failing_tool", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + metrics := chatloop.NewMetrics(reg) + + failingTool := fantasy.NewAgentTool( + "failing_tool", + "a tool that always fails", + tt.toolFn, + ) + + model := &chattest.FakeModel{ + ProviderName: "test-provider", + ModelName: "test-model", + StreamFn: func(_ context.Context, _ fantasy.Call) (fantasy.StreamResponse, error) { + return func(yield func(fantasy.StreamPart) bool) { + parts := []fantasy.StreamPart{ + {Type: fantasy.StreamPartTypeToolInputStart, ID: "tc1", ToolCallName: "failing_tool"}, + {Type: fantasy.StreamPartTypeToolInputDelta, ID: "tc1", Delta: `{}`}, + {Type: fantasy.StreamPartTypeToolInputEnd, ID: "tc1"}, + { + Type: fantasy.StreamPartTypeToolCall, + ID: "tc1", + ToolCallName: "failing_tool", + ToolCallInput: `{}`, + }, + {Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonToolCalls}, + } + for _, p := range parts { + if !yield(p) { + return + } + } + }, nil + }, + } + + err := chatloop.Run(context.Background(), chatloop.RunOptions{ + Model: model, + MaxSteps: 1, + Tools: []fantasy.AgentTool{failingTool}, + ActiveTools: []string{"failing_tool"}, + BuiltinToolNames: tt.builtinToolNames, + PersistStep: func(_ context.Context, _ chatloop.PersistedStep) error { + return nil + }, + Metrics: metrics, + }) + require.NoError(t, err) + + requireCounter(t, reg, "coderd_chatd_tool_errors_total", 1, map[string]string{ + "provider": "test-provider", + "model": "test-model", + "tool_name": tt.wantLabel, + }) + }) + } +} diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index bd0d609fa4..ec33e3a0c3 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -209,6 +209,7 @@ deployment. They will always be available from the agent. | `coderd_chatd_stream_retries_total` | counter | Total LLM stream retries. | `kind` `model` `provider` | | `coderd_chatd_stream_subscribers` | gauge | Current number of chat stream subscribers across all chat streams. | | | `coderd_chatd_streams_active` | gauge | Current number of chat stream state entries (in-flight plus retained). | | +| `coderd_chatd_tool_errors_total` | counter | Total tool calls that returned an error result. | `model` `provider` `tool_name` | | `coderd_chatd_tool_result_size_bytes` | histogram | Size in bytes of each tool execution result. | `model` `provider` `tool_name` | | `coderd_chatd_ttft_seconds` | histogram | Time-to-first-token: wall time from LLM request to first streamed chunk. | `model` `provider` | | `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` | diff --git a/scripts/metricsdocgen/generated_metrics b/scripts/metricsdocgen/generated_metrics index 7ea6561a25..ce5e9f289d 100644 --- a/scripts/metricsdocgen/generated_metrics +++ b/scripts/metricsdocgen/generated_metrics @@ -259,6 +259,9 @@ coderd_chatd_stream_subscribers 0 # HELP coderd_chatd_streams_active Current number of chat stream state entries (in-flight plus retained). # TYPE coderd_chatd_streams_active gauge coderd_chatd_streams_active 0 +# HELP coderd_chatd_tool_errors_total Total tool calls that returned an error result. +# TYPE coderd_chatd_tool_errors_total counter +coderd_chatd_tool_errors_total{provider="",model="",tool_name=""} 0 # HELP coderd_chatd_tool_result_size_bytes Size in bytes of each tool execution result. # TYPE coderd_chatd_tool_result_size_bytes histogram coderd_chatd_tool_result_size_bytes{provider="",model="",tool_name=""} 0