feat: add chatd tool call error metrics and logging (#24559)

- Add `coderd_chatd_tool_errors_total` prometheus counter (labels:
provider, model, tool_name)
- Log tool call errors at warn level with correlation fields: chat_id,
owner_id, organization_id, workspace_id, agent_id, parent_chat_id,
trigger_message_id, tool_name, tool_call_id, provider, model
- Thread enriched logger from chatd.go into chatloop via
`RunOptions.Logger`
- Remove squashing of all MCP tool calls to the `mcp` bucket

> 🤖
This commit is contained in:
Cian Johnston
2026-04-22 17:19:56 +01:00
committed by GitHub
parent 7904bed947
commit 72e3ae9c5f
6 changed files with 243 additions and 72 deletions
+19 -2
View File
@@ -6410,7 +6410,7 @@ func (p *Server) runChat(
}
// Record builtin tool names before appending MCP tools
// so the metrics layer can bound label cardinality.
// so the metrics layer can differentiate between built-in and MCP tools.
builtinToolNames := make(map[string]bool, len(tools))
for _, t := range tools {
builtinToolNames[t.Info().Name] = true
@@ -6507,6 +6507,23 @@ func (p *Server) runChat(
var loopErr error
triggerMessageID, historyTipMessageID, triggerLabel := deriveChatDebugSeed(messages)
// Enrich the logger with correlation fields useful for
// diagnosing tool-call errors inside the chatloop.
loopLogger := logger.With(
slog.F("owner_id", chat.OwnerID),
slog.F("organization_id", chat.OrganizationID),
slog.F("trigger_message_id", triggerMessageID),
)
if chat.WorkspaceID.Valid {
loopLogger = loopLogger.With(slog.F("workspace_id", chat.WorkspaceID.UUID))
}
if chat.AgentID.Valid {
loopLogger = loopLogger.With(slog.F("agent_id", chat.AgentID.UUID))
}
if chat.ParentChatID.Valid {
loopLogger = loopLogger.With(slog.F("parent_chat_id", chat.ParentChatID.UUID))
}
result.TriggerMessageID = triggerMessageID
result.HistoryTipMessageID = historyTipMessageID
finishDebugRun := func(error, any) {}
@@ -6540,6 +6557,7 @@ func (p *Server) runChat(
StopAfterTools: stopAfterBehaviorTools(currentPlanMode, chat.Mode, chat.ParentChatID),
MaxSteps: maxChatSteps,
Metrics: p.metrics,
Logger: loopLogger,
BuiltinToolNames: builtinToolNames,
ModelConfig: callConfig,
@@ -6563,7 +6581,6 @@ func (p *Server) runChat(
}
p.publishMessagePart(chat.ID, role, part)
},
Logger: logger,
Compaction: compactionOptions,
ReloadMessages: func(reloadCtx context.Context) ([]fantasy.Message, error) {
reloadedMsgs, err := p.db.GetChatMessagesForPromptByChatID(reloadCtx, chat.ID)
+31 -9
View File
@@ -148,6 +148,8 @@ type RunOptions struct {
role codersdk.ChatMessageRole,
part codersdk.ChatMessagePart,
)
// Callers should attach correlation fields (chat_id, owner_id, etc.)
// using Logger.With before passing the logger in.
Logger slog.Logger
Compaction *CompactionOptions
ReloadMessages func(context.Context) ([]fantasy.Message, error)
@@ -175,8 +177,6 @@ type RunOptions struct {
Metrics *Metrics
// BuiltinToolNames lists tool names that are built into chatd.
// Tool results from tools not in this set are recorded under
// the "mcp" label to bound cardinality.
BuiltinToolNames map[string]bool
}
@@ -493,7 +493,7 @@ func Run(ctx context.Context, opts RunOptions) error {
}
// Execute only built-in tools.
toolResults = executeTools(ctx, opts.Tools, opts.ActiveTools, opts.ProviderTools, builtinCalls, opts.Metrics, provider, modelName, opts.BuiltinToolNames, func(tr fantasy.ToolResultContent, completedAt time.Time) {
toolResults = executeTools(ctx, opts.Tools, opts.ActiveTools, opts.ProviderTools, builtinCalls, opts.Metrics, opts.Logger, provider, modelName, opts.BuiltinToolNames, func(tr fantasy.ToolResultContent, completedAt time.Time) {
recordToolResultTimestamp(&result, tr.ToolCallID, completedAt)
publishToolAttachments(ctx, opts.Logger, tr, completedAt, publishMessagePart)
ssePart := chatprompt.PartFromContentWithLogger(ctx, opts.Logger, tr)
@@ -1054,6 +1054,7 @@ func executeTools(
providerTools []ProviderTool,
toolCalls []fantasy.ToolCallContent,
metrics *Metrics,
logger slog.Logger,
provider, model string,
builtinToolNames map[string]bool,
onResult func(fantasy.ToolResultContent, time.Time),
@@ -1113,7 +1114,7 @@ func executeTools(
// accurate individual completion times.
completedAt[i] = dbtime.Now()
}()
results[i] = executeSingleTool(ctx, toolMap, tc, metrics, provider, model, builtinToolNames, activeTools, providerRunnerNames)
results[i] = executeSingleTool(ctx, toolMap, tc, metrics, logger, provider, model, builtinToolNames, activeTools, providerRunnerNames)
}()
}
wg.Wait()
@@ -1135,6 +1136,7 @@ func executeSingleTool(
toolMap map[string]fantasy.AgentTool,
tc fantasy.ToolCallContent,
metrics *Metrics,
logger slog.Logger,
provider, model string,
builtinToolNames map[string]bool,
activeTools []string,
@@ -1146,16 +1148,20 @@ func executeSingleTool(
ProviderExecuted: false,
}
defer func() {
toolLabel := tc.ToolName
if !builtinToolNames[tc.ToolName] {
toolLabel = "mcp"
metricLabel := tc.ToolName
if metricLabel == "" {
metricLabel = "unknown"
}
metrics.ToolResultSizeBytes.WithLabelValues(provider, model, toolLabel).Observe(
metrics.ToolResultSizeBytes.WithLabelValues(provider, model, metricLabel).Observe(
float64(ToolResultSize(result)),
)
if _, ok := result.Result.(fantasy.ToolResultOutputContentError); ok {
metrics.RecordToolError(provider, model, metricLabel)
}
}()
if _, isProviderRunner := providerRunnerNames[tc.ToolName]; !isProviderRunner && !isToolActive(tc.ToolName, activeTools) {
_, isProviderRunner := providerRunnerNames[tc.ToolName]
if !isProviderRunner && !isToolActive(tc.ToolName, activeTools) {
result.Result = fantasy.ToolResultOutputContentError{
Error: xerrors.New("Tool not active in this turn: " + tc.ToolName),
}
@@ -1170,6 +1176,12 @@ func executeSingleTool(
return result
}
logger.Debug(ctx, "tool execution",
slog.F("tool_name", tc.ToolName),
slog.F("tool_call_id", tc.ToolCallID),
slog.F("builtin", builtinToolNames[tc.ToolName]),
slog.F("is_provider_runner", isProviderRunner),
)
resp, err := tool.Run(ctx, fantasy.ToolCall{
ID: tc.ToolCallID,
Name: tc.ToolName,
@@ -1180,6 +1192,11 @@ func executeSingleTool(
Error: err,
}
result.ClientMetadata = resp.Metadata
logger.Error(ctx, "tool execution failed",
slog.F("tool_name", tc.ToolName),
slog.F("tool_call_id", tc.ToolCallID),
slog.Error(err),
)
return result
}
@@ -1189,6 +1206,11 @@ func executeSingleTool(
result.Result = fantasy.ToolResultOutputContentError{
Error: xerrors.New(resp.Content),
}
logger.Info(ctx, "tool returned error result",
slog.F("tool_name", tc.ToolName),
slog.F("tool_call_id", tc.ToolCallID),
slog.F("tool_error", resp.Content),
)
case resp.Type == "image" || resp.Type == "media":
result.Result = fantasy.ToolResultOutputContentMedia{
Data: string(resp.Data),
+16
View File
@@ -31,6 +31,7 @@ type Metrics struct {
MessageCount *prometheus.HistogramVec
PromptSizeBytes *prometheus.HistogramVec
ToolResultSizeBytes *prometheus.HistogramVec
ToolErrorsTotal *prometheus.CounterVec
TTFTSeconds *prometheus.HistogramVec
CompactionTotal *prometheus.CounterVec
StepsTotal *prometheus.CounterVec
@@ -70,6 +71,12 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
Help: "Size in bytes of each tool execution result.",
Buckets: prometheus.ExponentialBuckets(64, 4, 9), // 64B .. 4MB
}, []string{"provider", "model", "tool_name"}),
ToolErrorsTotal: factory.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "tool_errors_total",
Help: "Total tool calls that returned an error result.",
}, []string{"provider", "model", "tool_name"}),
TTFTSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
@@ -141,6 +148,15 @@ func (m *Metrics) RecordStreamRetry(provider, model string, classified chaterror
m.StreamRetriesTotal.WithLabelValues(provider, model, classified.Kind).Inc()
}
// RecordToolError increments tool_errors_total for the given
// tool. No-op when m is nil.
func (m *Metrics) RecordToolError(provider, model, toolLabel string) {
if m == nil {
return
}
m.ToolErrorsTotal.WithLabelValues(provider, model, toolLabel).Inc()
}
// RecordStreamBufferDropped increments stream_buffer_dropped_total
// once per dropped event. No-op when m is nil.
func (m *Metrics) RecordStreamBufferDropped() {
+173 -61
View File
@@ -28,6 +28,7 @@ func TestNewMetrics_RegistersAllMetrics(t *testing.T) {
m.Chats.WithLabelValues(chatloop.StateStreaming)
m.CompactionTotal.WithLabelValues("anthropic", "claude-sonnet-4-5", chatloop.CompactionResultSuccess)
m.ToolResultSizeBytes.WithLabelValues("anthropic", "claude-sonnet-4-5", "test")
m.ToolErrorsTotal.WithLabelValues("anthropic", "claude-sonnet-4-5", "test")
m.MessageCount.WithLabelValues("anthropic", "claude-sonnet-4-5")
m.PromptSizeBytes.WithLabelValues("anthropic", "claude-sonnet-4-5")
m.TTFTSeconds.WithLabelValues("anthropic", "claude-sonnet-4-5")
@@ -50,6 +51,7 @@ func TestNewMetrics_RegistersAllMetrics(t *testing.T) {
"coderd_chatd_steps_total": dto.MetricType_COUNTER,
"coderd_chatd_stream_retries_total": dto.MetricType_COUNTER,
"coderd_chatd_stream_buffer_dropped_total": dto.MetricType_COUNTER,
"coderd_chatd_tool_errors_total": dto.MetricType_COUNTER,
}
found := make(map[string]dto.MetricType)
@@ -79,6 +81,7 @@ func TestNopMetrics_DoesNotPanic(t *testing.T) {
m.MessageCount.WithLabelValues("anthropic", "claude-sonnet-4-5").Observe(10)
m.PromptSizeBytes.WithLabelValues("openai", "gpt-5").Observe(4096)
m.ToolResultSizeBytes.WithLabelValues("anthropic", "claude-sonnet-4-5", "execute").Observe(512)
m.ToolErrorsTotal.WithLabelValues("anthropic", "claude-sonnet-4-5", "execute").Inc()
m.TTFTSeconds.WithLabelValues("anthropic", "claude-sonnet-4-5").Observe(0.5)
m.CompactionTotal.WithLabelValues("anthropic", "claude-sonnet-4-5", "success").Inc()
m.CompactionTotal.WithLabelValues("openai", "gpt-5", "error").Inc()
@@ -93,6 +96,7 @@ func TestNopMetrics_DoesNotPanic(t *testing.T) {
var nilMetrics *chatloop.Metrics
nilMetrics.RecordStreamRetry("anthropic", "claude-sonnet-4-5", chaterror.ClassifiedError{Kind: chaterror.KindTimeout})
nilMetrics.RecordStreamBufferDropped()
nilMetrics.RecordToolError("anthropic", "claude-sonnet-4-5", "test")
}
func TestEstimatePromptSize(t *testing.T) {
@@ -263,25 +267,11 @@ func TestRecordCompaction(t *testing.T) {
return
}
var found bool
for _, f := range families {
if f.GetName() != "coderd_chatd_compaction_total" {
continue
}
found = true
require.Len(t, f.GetMetric(), 1)
metric := f.GetMetric()[0]
assert.Equal(t, float64(tt.wantCount), metric.GetCounter().GetValue())
// Check labels: provider, model, result.
labels := map[string]string{}
for _, lp := range metric.GetLabel() {
labels[lp.GetName()] = lp.GetValue()
}
assert.Equal(t, "test-provider", labels["provider"])
assert.Equal(t, "test-model", labels["model"])
assert.Equal(t, tt.wantLabel, labels["result"])
}
assert.True(t, found, "compaction_total metric not found")
requireCounter(t, reg, "coderd_chatd_compaction_total", float64(tt.wantCount), map[string]string{
"provider": "test-provider",
"model": "test-model",
"result": tt.wantLabel,
})
})
}
}
@@ -316,27 +306,11 @@ func TestRecordStreamRetry(t *testing.T) {
Kind: tt.kind,
})
families, err := reg.Gather()
require.NoError(t, err)
var found bool
for _, f := range families {
if f.GetName() != "coderd_chatd_stream_retries_total" {
continue
}
found = true
require.Len(t, f.GetMetric(), 1)
metric := f.GetMetric()[0]
assert.Equal(t, float64(1), metric.GetCounter().GetValue())
labels := map[string]string{}
for _, lp := range metric.GetLabel() {
labels[lp.GetName()] = lp.GetValue()
}
assert.Equal(t, "test-provider", labels["provider"])
assert.Equal(t, "test-model", labels["model"])
assert.Equal(t, tt.kind, labels["kind"])
}
assert.True(t, found, "stream_retries_total metric not found")
requireCounter(t, reg, "coderd_chatd_stream_retries_total", 1, map[string]string{
"provider": "test-provider",
"model": "test-model",
"kind": tt.kind,
})
})
}
}
@@ -378,6 +352,58 @@ func TestRecordStreamBufferDropped(t *testing.T) {
})
}
// requireCounter gathers metrics from reg, finds the named counter
// family, and asserts it has exactly one series with the given value
// and labels.
func requireCounter(t *testing.T, reg *prometheus.Registry, name string, wantValue float64, wantLabels map[string]string) {
t.Helper()
families, err := reg.Gather()
require.NoError(t, err)
for _, f := range families {
if f.GetName() != name {
continue
}
require.Len(t, f.GetMetric(), 1, "expected exactly one series for %s", name)
metric := f.GetMetric()[0]
assert.Equal(t, wantValue, metric.GetCounter().GetValue(), "counter value for %s", name)
labels := map[string]string{}
for _, lp := range metric.GetLabel() {
labels[lp.GetName()] = lp.GetValue()
}
for k, v := range wantLabels {
assert.Equal(t, v, labels[k], "label %s for %s", k, name)
}
return
}
t.Fatalf("metric %s not found in gathered families", name)
}
func TestRecordToolError(t *testing.T) {
t.Parallel()
t.Run("nil metrics does not panic", func(t *testing.T) {
t.Parallel()
var m *chatloop.Metrics
m.RecordToolError("anthropic", "claude-sonnet-4-5", "test")
})
t.Run("increments with correct labels", func(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := chatloop.NewMetrics(reg)
m.RecordToolError("test-provider", "test-model", "read_file")
requireCounter(t, reg, "coderd_chatd_tool_errors_total", 1, map[string]string{
"provider": "test-provider",
"model": "test-model",
"tool_name": "read_file",
})
})
}
func TestRun_RecordsMetrics(t *testing.T) {
t.Parallel()
@@ -536,27 +562,11 @@ func TestRun_StreamRetry_RecordsMetric(t *testing.T) {
assert.Equal(t, "test-provider", retries[0].classified.Provider)
// Metric assertion.
families, err := reg.Gather()
require.NoError(t, err)
var found bool
for _, f := range families {
if f.GetName() != "coderd_chatd_stream_retries_total" {
continue
}
found = true
require.Len(t, f.GetMetric(), 1)
metric := f.GetMetric()[0]
assert.Equal(t, float64(1), metric.GetCounter().GetValue())
labels := map[string]string{}
for _, lp := range metric.GetLabel() {
labels[lp.GetName()] = lp.GetValue()
}
assert.Equal(t, "test-provider", labels["provider"])
assert.Equal(t, "test-model", labels["model"])
assert.Equal(t, chaterror.KindRateLimit, labels["kind"])
}
assert.True(t, found, "stream_retries_total metric not found")
requireCounter(t, reg, "coderd_chatd_stream_retries_total", 1, map[string]string{
"provider": "test-provider",
"model": "test-model",
"kind": chaterror.KindRateLimit,
})
}
// TestRun_StreamRetry_CanceledDoesNotIncrement pins the invariant
@@ -603,3 +613,105 @@ func TestRun_StreamRetry_CanceledDoesNotIncrement(t *testing.T) {
}
}
}
func TestRun_ToolError_RecordsMetric(t *testing.T) {
t.Parallel()
tests := []struct {
name string
toolFn func(context.Context, struct{}, fantasy.ToolCall) (fantasy.ToolResponse, error)
builtinToolNames map[string]bool
wantLabel string
}{
{
name: "builtin_tool_IsError",
toolFn: func(_ context.Context, _ struct{}, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
return fantasy.ToolResponse{
Content: "something went wrong",
IsError: true,
}, nil
},
builtinToolNames: map[string]bool{"failing_tool": true},
wantLabel: "failing_tool",
},
{
name: "mcp_tool_IsError",
toolFn: func(_ context.Context, _ struct{}, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
return fantasy.ToolResponse{
Content: "something went wrong",
IsError: true,
}, nil
},
builtinToolNames: map[string]bool{},
wantLabel: "failing_tool",
},
{
name: "tool_Run_returns_error",
toolFn: func(_ context.Context, _ struct{}, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
return fantasy.ToolResponse{}, xerrors.New("connection refused")
},
builtinToolNames: map[string]bool{"failing_tool": true},
wantLabel: "failing_tool",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
metrics := chatloop.NewMetrics(reg)
failingTool := fantasy.NewAgentTool(
"failing_tool",
"a tool that always fails",
tt.toolFn,
)
model := &chattest.FakeModel{
ProviderName: "test-provider",
ModelName: "test-model",
StreamFn: func(_ context.Context, _ fantasy.Call) (fantasy.StreamResponse, error) {
return func(yield func(fantasy.StreamPart) bool) {
parts := []fantasy.StreamPart{
{Type: fantasy.StreamPartTypeToolInputStart, ID: "tc1", ToolCallName: "failing_tool"},
{Type: fantasy.StreamPartTypeToolInputDelta, ID: "tc1", Delta: `{}`},
{Type: fantasy.StreamPartTypeToolInputEnd, ID: "tc1"},
{
Type: fantasy.StreamPartTypeToolCall,
ID: "tc1",
ToolCallName: "failing_tool",
ToolCallInput: `{}`,
},
{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonToolCalls},
}
for _, p := range parts {
if !yield(p) {
return
}
}
}, nil
},
}
err := chatloop.Run(context.Background(), chatloop.RunOptions{
Model: model,
MaxSteps: 1,
Tools: []fantasy.AgentTool{failingTool},
ActiveTools: []string{"failing_tool"},
BuiltinToolNames: tt.builtinToolNames,
PersistStep: func(_ context.Context, _ chatloop.PersistedStep) error {
return nil
},
Metrics: metrics,
})
require.NoError(t, err)
requireCounter(t, reg, "coderd_chatd_tool_errors_total", 1, map[string]string{
"provider": "test-provider",
"model": "test-model",
"tool_name": tt.wantLabel,
})
})
}
}
+1
View File
@@ -209,6 +209,7 @@ deployment. They will always be available from the agent.
| `coderd_chatd_stream_retries_total` | counter | Total LLM stream retries. | `kind` `model` `provider` |
| `coderd_chatd_stream_subscribers` | gauge | Current number of chat stream subscribers across all chat streams. | |
| `coderd_chatd_streams_active` | gauge | Current number of chat stream state entries (in-flight plus retained). | |
| `coderd_chatd_tool_errors_total` | counter | Total tool calls that returned an error result. | `model` `provider` `tool_name` |
| `coderd_chatd_tool_result_size_bytes` | histogram | Size in bytes of each tool execution result. | `model` `provider` `tool_name` |
| `coderd_chatd_ttft_seconds` | histogram | Time-to-first-token: wall time from LLM request to first streamed chunk. | `model` `provider` |
| `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` |
+3
View File
@@ -259,6 +259,9 @@ coderd_chatd_stream_subscribers 0
# HELP coderd_chatd_streams_active Current number of chat stream state entries (in-flight plus retained).
# TYPE coderd_chatd_streams_active gauge
coderd_chatd_streams_active 0
# HELP coderd_chatd_tool_errors_total Total tool calls that returned an error result.
# TYPE coderd_chatd_tool_errors_total counter
coderd_chatd_tool_errors_total{provider="",model="",tool_name=""} 0
# HELP coderd_chatd_tool_result_size_bytes Size in bytes of each tool execution result.
# TYPE coderd_chatd_tool_result_size_bytes histogram
coderd_chatd_tool_result_size_bytes{provider="",model="",tool_name=""} 0