mirror of
https://github.com/coder/coder.git
synced 2026-06-03 21:18:24 +00:00
e8508b2d90
When OpenAI's Responses API returns `Previous response with id ... not found` for a chained turn, classify it as a `ChainBroken` retry, clear `previous_response_id`, exit chain mode, reload full history, and let `chatretry` retry. Self-heals chats whose anchor was poisoned before #25074 stopped truncated streams from being persisted as a successful turn with a stored response id. The new state is exposed via the existing `coderd_chatd_stream_retries_total` counter as a `chain_broken="true"|"false"` label. Aggregating queries (`sum`, `rate` over `provider`/`model`/`kind`) keep working without changes; raw-series matchers without aggregation will now see two series per `(provider, model, kind)` where they previously saw one. The metric is internal-only so the blast radius should be small, but if you have dashboards that index by exact label matchers without aggregation they will need an extra `sum` or an explicit `chain_broken` selector. > 🤖 This PR was created with the help of Coder Agents, and was reviewed by a human 🧑💻
233 lines
7.6 KiB
Go
233 lines
7.6 KiB
Go
package chatloop
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"strconv"
|
|
|
|
"charm.land/fantasy"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
|
|
"github.com/coder/coder/v2/coderd/x/chatd/chaterror"
|
|
)
|
|
|
|
// Namespace and subsystem applied to every metric created by
// NewMetrics.
const (
	metricsNamespace = "coderd"
	metricsSubsystem = "chatd"
)

// Label values for Chats.
const (
	StateStreaming = "streaming"
	StateWaiting   = "waiting"
)

// Label values for CompactionTotal.
const (
	CompactionResultSuccess = "success"
	CompactionResultError   = "error"
	CompactionResultTimeout = "timeout"
)
|
|
|
|
// Metrics holds Prometheus metrics for the chatd subsystem.
|
|
type Metrics struct {
|
|
Chats *prometheus.GaugeVec
|
|
MessageCount *prometheus.HistogramVec
|
|
PromptSizeBytes *prometheus.HistogramVec
|
|
ToolResultSizeBytes *prometheus.HistogramVec
|
|
ToolErrorsTotal *prometheus.CounterVec
|
|
TTFTSeconds *prometheus.HistogramVec
|
|
CompactionTotal *prometheus.CounterVec
|
|
StepsTotal *prometheus.CounterVec
|
|
StreamRetriesTotal *prometheus.CounterVec
|
|
StreamBufferDroppedTotal prometheus.Counter
|
|
}
|
|
|
|
// NewMetrics creates a new Metrics instance registered with the
|
|
// given registerer.
|
|
func NewMetrics(reg prometheus.Registerer) *Metrics {
|
|
factory := promauto.With(reg)
|
|
return &Metrics{
|
|
Chats: factory.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "chats",
|
|
Help: "Number of chats being processed, by state.",
|
|
}, []string{"state"}),
|
|
MessageCount: factory.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "message_count",
|
|
Help: "Number of messages in the prompt per LLM request.",
|
|
Buckets: prometheus.ExponentialBuckets(1, 2, 11), // 1, 2, 4, ..., 1024
|
|
}, []string{"provider", "model"}),
|
|
PromptSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "prompt_size_bytes",
|
|
Help: "Estimated byte size of the prompt per LLM request.",
|
|
Buckets: prometheus.ExponentialBuckets(1024, 4, 10), // 1KB .. 256MB
|
|
}, []string{"provider", "model"}),
|
|
ToolResultSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "tool_result_size_bytes",
|
|
Help: "Size in bytes of each tool execution result.",
|
|
Buckets: prometheus.ExponentialBuckets(64, 4, 9), // 64B .. 4MB
|
|
}, []string{"provider", "model", "tool_name"}),
|
|
ToolErrorsTotal: factory.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "tool_errors_total",
|
|
Help: "Total tool calls that returned an error result.",
|
|
}, []string{"provider", "model", "tool_name"}),
|
|
TTFTSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "ttft_seconds",
|
|
Help: "Time-to-first-token: wall time from LLM request to first streamed chunk.",
|
|
Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
|
|
}, []string{"provider", "model"}),
|
|
CompactionTotal: factory.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "compaction_total",
|
|
Help: "Total compaction outcomes (only recorded when compaction was triggered or failed).",
|
|
}, []string{"provider", "model", "result"}),
|
|
StepsTotal: factory.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "steps_total",
|
|
Help: "Total agentic loop steps across all chats.",
|
|
}, []string{"provider", "model"}),
|
|
StreamRetriesTotal: factory.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "stream_retries_total",
|
|
Help: "Total LLM stream retries.",
|
|
}, []string{"provider", "model", "kind", "chain_broken"}),
|
|
StreamBufferDroppedTotal: factory.NewCounter(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsSubsystem,
|
|
Name: "stream_buffer_dropped_total",
|
|
Help: "Number of chat stream buffer events dropped due to the per-chat buffer cap.",
|
|
}),
|
|
}
|
|
}
|
|
|
|
// NopMetrics returns a Metrics instance that discards all data.
|
|
// Useful for tests and when metrics collection is not desired.
|
|
func NopMetrics() *Metrics {
|
|
return NewMetrics(prometheus.NewRegistry())
|
|
}
|
|
|
|
// RecordCompaction classifies and records a compaction attempt.
|
|
// It is a no-op when m is nil.
|
|
func (m *Metrics) RecordCompaction(provider, model string, compacted bool, err error) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
switch {
|
|
case err != nil && errors.Is(err, context.DeadlineExceeded):
|
|
m.CompactionTotal.WithLabelValues(provider, model, CompactionResultTimeout).Inc()
|
|
case err != nil && errors.Is(err, context.Canceled):
|
|
// User interruption, not a compaction failure.
|
|
return
|
|
case err != nil:
|
|
m.CompactionTotal.WithLabelValues(provider, model, CompactionResultError).Inc()
|
|
case compacted:
|
|
m.CompactionTotal.WithLabelValues(provider, model, CompactionResultSuccess).Inc()
|
|
// !compacted && err == nil means threshold not reached -- not
|
|
// recorded.
|
|
}
|
|
}
|
|
|
|
// RecordStreamRetry increments stream_retries_total. The caller
|
|
// must obtain classified via chaterror.Classify (non-empty Kind).
|
|
// No-op when m is nil. The chain_broken label is "true" for chain
|
|
// anchor failures (e.g. OpenAI previous_response_id 404) recovered
|
|
// by the chatloop, and "false" otherwise.
|
|
func (m *Metrics) RecordStreamRetry(provider, model string, classified chaterror.ClassifiedError) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
m.StreamRetriesTotal.WithLabelValues(
|
|
provider,
|
|
model,
|
|
string(classified.Kind),
|
|
strconv.FormatBool(classified.ChainBroken),
|
|
).Inc()
|
|
}
|
|
|
|
// RecordToolError increments tool_errors_total for the given
|
|
// tool. No-op when m is nil.
|
|
func (m *Metrics) RecordToolError(provider, model, toolLabel string) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
m.ToolErrorsTotal.WithLabelValues(provider, model, toolLabel).Inc()
|
|
}
|
|
|
|
// RecordStreamBufferDropped increments stream_buffer_dropped_total
|
|
// once per dropped event. No-op when m is nil.
|
|
func (m *Metrics) RecordStreamBufferDropped() {
|
|
if m == nil {
|
|
return
|
|
}
|
|
m.StreamBufferDroppedTotal.Inc()
|
|
}
|
|
|
|
// EstimatePromptSize returns a cheap byte-size estimate of a
|
|
// fantasy prompt by summing the text content lengths of all
|
|
// message parts. This avoids JSON marshaling overhead.
|
|
func EstimatePromptSize(messages []fantasy.Message) int {
|
|
var size int
|
|
for _, msg := range messages {
|
|
for _, part := range msg.Content {
|
|
size += ContentPartSize(part)
|
|
}
|
|
}
|
|
return size
|
|
}
|
|
|
|
// ContentPartSize returns the byte length of a MessagePart's
|
|
// primary text or data field.
|
|
func ContentPartSize(part fantasy.MessagePart) int {
|
|
switch p := part.(type) {
|
|
case fantasy.TextPart:
|
|
return len(p.Text)
|
|
case fantasy.ReasoningPart:
|
|
return len(p.Text)
|
|
case fantasy.FilePart:
|
|
return len(p.Data)
|
|
case fantasy.ToolCallPart:
|
|
return len(p.Input)
|
|
case fantasy.ToolResultPart:
|
|
return toolResultOutputSize(p.Output)
|
|
default:
|
|
return 0
|
|
}
|
|
}
|
|
|
|
// ToolResultSize returns the byte length of a
|
|
// ToolResultContent's primary text or data field.
|
|
func ToolResultSize(r fantasy.ToolResultContent) int {
|
|
return toolResultOutputSize(r.Result)
|
|
}
|
|
|
|
func toolResultOutputSize(output fantasy.ToolResultOutputContent) int {
|
|
if output == nil {
|
|
return 0
|
|
}
|
|
switch v := output.(type) {
|
|
case fantasy.ToolResultOutputContentText:
|
|
return len(v.Text)
|
|
case fantasy.ToolResultOutputContentError:
|
|
if v.Error != nil {
|
|
return len(v.Error.Error())
|
|
}
|
|
return 0
|
|
case fantasy.ToolResultOutputContentMedia:
|
|
return len(v.Data)
|
|
default:
|
|
return 0
|
|
}
|
|
}
|