mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: add Prometheus metrics for chatd subsystem (#24371)
Adds 7 Prometheus metrics to the chatd subsystem and introduces typed
`ActivityBumpReason` for deadline bump attribution.
| Metric | Type | Labels |
|--------|------|--------|
| `coderd_chatd_chats` | Gauge | `state` (streaming, waiting) |
| `coderd_chatd_message_count` | Histogram | `provider` |
| `coderd_chatd_prompt_size_bytes` | Histogram | `provider` |
| `coderd_chatd_tool_result_size_bytes` | Histogram | `provider`,
`tool_name` |
| `coderd_chatd_ttft_seconds` | Histogram | `provider` |
| `coderd_chatd_compaction_total` | Counter | `provider`, `result` |
| `coderd_chatd_steps_total` | Counter | `provider` |
> 🤖
This commit is contained in:
@@ -216,7 +216,7 @@ func (a *AppsAPI) UpdateAppStatus(ctx context.Context, req *agentproto.UpdateApp
|
|||||||
// We pass time.Time{} for nextAutostart since we don't have access to
|
// We pass time.Time{} for nextAutostart since we don't have access to
|
||||||
// TemplateScheduleStore here. The activity bump logic handles this by
|
// TemplateScheduleStore here. The activity bump logic handles this by
|
||||||
// defaulting to the template's activity_bump duration (typically 1 hour).
|
// defaulting to the template's activity_bump duration (typically 1 hour).
|
||||||
workspacestats.ActivityBumpWorkspace(ctx, a.Log, a.Database, ws.ID, time.Time{})
|
workspacestats.ActivityBumpWorkspace(ctx, a.Log, a.Database, ws.ID, time.Time{}, workspacestats.ActivityBumpReasonAppActivity)
|
||||||
}
|
}
|
||||||
// just return a blank response because it doesn't contain any settable fields at present.
|
// just return a blank response because it doesn't contain any settable fields at present.
|
||||||
return new(agentproto.UpdateAppStatusResponse), nil
|
return new(agentproto.UpdateAppStatusResponse), nil
|
||||||
|
|||||||
@@ -792,6 +792,7 @@ func New(options *Options) *API {
|
|||||||
Pubsub: options.Pubsub,
|
Pubsub: options.Pubsub,
|
||||||
WebpushDispatcher: options.WebPushDispatcher,
|
WebpushDispatcher: options.WebPushDispatcher,
|
||||||
UsageTracker: options.WorkspaceUsageTracker,
|
UsageTracker: options.WorkspaceUsageTracker,
|
||||||
|
PrometheusRegistry: options.PrometheusRegistry,
|
||||||
})
|
})
|
||||||
gitSyncLogger := options.Logger.Named("gitsync")
|
gitSyncLogger := options.Logger.Named("gitsync")
|
||||||
refresher := gitsync.NewRefresher(
|
refresher := gitsync.NewRefresher(
|
||||||
|
|||||||
@@ -11,6 +11,21 @@ import (
|
|||||||
"github.com/coder/coder/v2/coderd/database"
|
"github.com/coder/coder/v2/coderd/database"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// ActivityBumpReason represents the reason for an activity bump.
|
||||||
|
type ActivityBumpReason string
|
||||||
|
|
||||||
|
const (
|
||||||
|
// ActivityBumpReasonWorkspaceStats indicates the bump was triggered
|
||||||
|
// by SSH or terminal activity reported via workspace stats.
|
||||||
|
ActivityBumpReasonWorkspaceStats ActivityBumpReason = "workspace_stats"
|
||||||
|
// ActivityBumpReasonChatHeartbeat indicates the bump was triggered
|
||||||
|
// by an AI chat heartbeat.
|
||||||
|
ActivityBumpReasonChatHeartbeat ActivityBumpReason = "chat_heartbeat"
|
||||||
|
// ActivityBumpReasonAppActivity indicates the bump was triggered
|
||||||
|
// by app or port-forward activity.
|
||||||
|
ActivityBumpReasonAppActivity ActivityBumpReason = "app_activity"
|
||||||
|
)
|
||||||
|
|
||||||
// ActivityBumpWorkspace automatically bumps the workspace's auto-off timer
|
// ActivityBumpWorkspace automatically bumps the workspace's auto-off timer
|
||||||
// if it is set to expire soon. The deadline will be bumped by 1 hour*.
|
// if it is set to expire soon. The deadline will be bumped by 1 hour*.
|
||||||
// If the bump crosses over an autostart time, the workspace will be
|
// If the bump crosses over an autostart time, the workspace will be
|
||||||
@@ -36,7 +51,7 @@ import (
|
|||||||
// A way to avoid this is to configure the max deadline to something that will not
|
// A way to avoid this is to configure the max deadline to something that will not
|
||||||
// span more than 1 day. This will force the workspace to restart and reset the deadline
|
// span more than 1 day. This will force the workspace to restart and reset the deadline
|
||||||
// each morning when it autostarts.
|
// each morning when it autostarts.
|
||||||
func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Store, workspaceID uuid.UUID, nextAutostart time.Time) {
|
func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Store, workspaceID uuid.UUID, nextAutostart time.Time, reason ActivityBumpReason) {
|
||||||
// We set a short timeout so if the app is under load, these
|
// We set a short timeout so if the app is under load, these
|
||||||
// low priority operations fail first.
|
// low priority operations fail first.
|
||||||
ctx, cancel := context.WithTimeout(ctx, time.Second*15)
|
ctx, cancel := context.WithTimeout(ctx, time.Second*15)
|
||||||
@@ -50,6 +65,7 @@ func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Sto
|
|||||||
// Bump will fail if the context is canceled, but this is ok.
|
// Bump will fail if the context is canceled, but this is ok.
|
||||||
log.Error(ctx, "activity bump failed", slog.Error(err),
|
log.Error(ctx, "activity bump failed", slog.Error(err),
|
||||||
slog.F("workspace_id", workspaceID),
|
slog.F("workspace_id", workspaceID),
|
||||||
|
slog.F("reason", reason),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
@@ -57,5 +73,6 @@ func ActivityBumpWorkspace(ctx context.Context, log slog.Logger, db database.Sto
|
|||||||
|
|
||||||
log.Debug(ctx, "bumped deadline from activity",
|
log.Debug(ctx, "bumped deadline from activity",
|
||||||
slog.F("workspace_id", workspaceID),
|
slog.F("workspace_id", workspaceID),
|
||||||
|
slog.F("reason", reason),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -268,13 +268,14 @@ func Test_ActivityBumpWorkspace(t *testing.T) {
|
|||||||
|
|
||||||
// Bump duration is measured from the time of the bump, so we measure from here.
|
// Bump duration is measured from the time of the bump, so we measure from here.
|
||||||
start := dbtime.Now()
|
start := dbtime.Now()
|
||||||
workspacestats.ActivityBumpWorkspace(ctx, log, db, bld.WorkspaceID, nextAutostart(start))
|
workspacestats.ActivityBumpWorkspace(ctx, log, db, bld.WorkspaceID, nextAutostart(start), workspacestats.ActivityBumpReasonWorkspaceStats)
|
||||||
end := dbtime.Now()
|
end := dbtime.Now()
|
||||||
|
|
||||||
// Validate our state after bump
|
// Validate our state after bump
|
||||||
updatedBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, bld.WorkspaceID)
|
updatedBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, bld.WorkspaceID)
|
||||||
require.NoError(t, err, "unexpected error getting latest workspace build")
|
require.NoError(t, err, "unexpected error getting latest workspace build")
|
||||||
require.Equal(t, bld.MaxDeadline.UTC(), updatedBuild.MaxDeadline.UTC(), "max_deadline should not have changed")
|
require.Equal(t, bld.MaxDeadline.UTC(), updatedBuild.MaxDeadline.UTC(), "max_deadline should not have changed")
|
||||||
|
|
||||||
if tt.expectedBump == 0 {
|
if tt.expectedBump == 0 {
|
||||||
assert.Equal(t, bld.UpdatedAt.UTC(), updatedBuild.UpdatedAt.UTC(), "should not have bumped updated_at")
|
assert.Equal(t, bld.UpdatedAt.UTC(), updatedBuild.UpdatedAt.UTC(), "should not have bumped updated_at")
|
||||||
assert.Equal(t, bld.Deadline.UTC(), updatedBuild.Deadline.UTC(), "should not have bumped deadline")
|
assert.Equal(t, bld.Deadline.UTC(), updatedBuild.Deadline.UTC(), "should not have bumped deadline")
|
||||||
|
|||||||
@@ -194,7 +194,7 @@ func (r *Reporter) ReportAgentStats(ctx context.Context, now time.Time, workspac
|
|||||||
}
|
}
|
||||||
|
|
||||||
// bump workspace activity
|
// bump workspace activity
|
||||||
ActivityBumpWorkspace(ctx, r.opts.Logger.Named("activity_bump"), r.opts.Database, workspace.ID, nextAutostart)
|
ActivityBumpWorkspace(ctx, r.opts.Logger.Named("activity_bump"), r.opts.Database, workspace.ID, nextAutostart, ActivityBumpReasonWorkspaceStats)
|
||||||
}
|
}
|
||||||
|
|
||||||
// bump workspace last_used_at
|
// bump workspace last_used_at
|
||||||
|
|||||||
+27
-2
@@ -18,6 +18,7 @@ import (
|
|||||||
"charm.land/fantasy"
|
"charm.land/fantasy"
|
||||||
"charm.land/fantasy/providers/anthropic"
|
"charm.land/fantasy/providers/anthropic"
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/shopspring/decimal"
|
"github.com/shopspring/decimal"
|
||||||
"github.com/sqlc-dev/pqtype"
|
"github.com/sqlc-dev/pqtype"
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
@@ -145,6 +146,7 @@ type Server struct {
|
|||||||
|
|
||||||
usageTracker *workspacestats.UsageTracker
|
usageTracker *workspacestats.UsageTracker
|
||||||
clock quartz.Clock
|
clock quartz.Clock
|
||||||
|
metrics *chatloop.Metrics
|
||||||
recordingSem chan struct{}
|
recordingSem chan struct{}
|
||||||
|
|
||||||
// Configuration
|
// Configuration
|
||||||
@@ -2700,6 +2702,7 @@ type Config struct {
|
|||||||
WebpushDispatcher webpush.Dispatcher
|
WebpushDispatcher webpush.Dispatcher
|
||||||
UsageTracker *workspacestats.UsageTracker
|
UsageTracker *workspacestats.UsageTracker
|
||||||
Clock quartz.Clock
|
Clock quartz.Clock
|
||||||
|
PrometheusRegistry prometheus.Registerer
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a new chat processor. The processor polls for pending
|
// New creates a new chat processor. The processor polls for pending
|
||||||
@@ -2768,6 +2771,11 @@ func New(cfg Config) *Server {
|
|||||||
wakeCh: make(chan struct{}, 1),
|
wakeCh: make(chan struct{}, 1),
|
||||||
heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry),
|
heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry),
|
||||||
}
|
}
|
||||||
|
if cfg.PrometheusRegistry != nil {
|
||||||
|
p.metrics = chatloop.NewMetrics(cfg.PrometheusRegistry)
|
||||||
|
} else {
|
||||||
|
p.metrics = chatloop.NopMetrics()
|
||||||
|
}
|
||||||
//nolint:gocritic // The chat processor uses a scoped chatd context.
|
//nolint:gocritic // The chat processor uses a scoped chatd context.
|
||||||
ctx = dbauthz.AsChatd(ctx)
|
ctx = dbauthz.AsChatd(ctx)
|
||||||
|
|
||||||
@@ -4027,7 +4035,7 @@ func (p *Server) trackWorkspaceUsage(
|
|||||||
// approx. 333 CTE queries/second. A cheap fix for this could
|
// approx. 333 CTE queries/second. A cheap fix for this could
|
||||||
// be to heartbeat every Nth query. Leaving as potential future
|
// be to heartbeat every Nth query. Leaving as potential future
|
||||||
// low-hanging fruit if needed.
|
// low-hanging fruit if needed.
|
||||||
workspacestats.ActivityBumpWorkspace(ctx, logger.Named("activity_bump"), p.db, wsID.UUID, time.Time{})
|
workspacestats.ActivityBumpWorkspace(ctx, logger.Named("activity_bump"), p.db, wsID.UUID, time.Time{}, workspacestats.ActivityBumpReasonChatHeartbeat)
|
||||||
}
|
}
|
||||||
return wsID
|
return wsID
|
||||||
}
|
}
|
||||||
@@ -4036,6 +4044,9 @@ func (p *Server) processChat(ctx context.Context, chat database.Chat) {
|
|||||||
logger := p.logger.With(slog.F("chat_id", chat.ID))
|
logger := p.logger.With(slog.F("chat_id", chat.ID))
|
||||||
logger.Info(ctx, "processing chat request")
|
logger.Info(ctx, "processing chat request")
|
||||||
|
|
||||||
|
p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Inc()
|
||||||
|
defer p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Dec()
|
||||||
|
|
||||||
chatCtx, cancel := context.WithCancelCause(ctx)
|
chatCtx, cancel := context.WithCancelCause(ctx)
|
||||||
defer cancel(nil)
|
defer cancel(nil)
|
||||||
|
|
||||||
@@ -4246,6 +4257,12 @@ func (p *Server) processChat(ctx context.Context, chat database.Chat) {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Dec()
|
||||||
|
p.metrics.Chats.WithLabelValues(chatloop.StateStreaming).Inc()
|
||||||
|
defer func() {
|
||||||
|
p.metrics.Chats.WithLabelValues(chatloop.StateStreaming).Dec()
|
||||||
|
p.metrics.Chats.WithLabelValues(chatloop.StateWaiting).Inc()
|
||||||
|
}()
|
||||||
runResult, err := p.runChat(chatCtx, chat, generatedTitle, logger)
|
runResult, err := p.runChat(chatCtx, chat, generatedTitle, logger)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, chatloop.ErrInterrupted) || errors.Is(context.Cause(chatCtx), chatloop.ErrInterrupted) {
|
if errors.Is(err, chatloop.ErrInterrupted) || errors.Is(context.Cause(chatCtx), chatloop.ErrInterrupted) {
|
||||||
@@ -5165,12 +5182,18 @@ func (p *Server) runChat(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Record builtin tool names before appending MCP tools
|
||||||
|
// so the metrics layer can bound label cardinality.
|
||||||
|
builtinToolNames := make(map[string]bool, len(tools))
|
||||||
|
for _, t := range tools {
|
||||||
|
builtinToolNames[t.Info().Name] = true
|
||||||
|
}
|
||||||
|
|
||||||
// Append tools from external MCP servers. These appear
|
// Append tools from external MCP servers. These appear
|
||||||
// after the built-in tools so the LLM sees them as
|
// after the built-in tools so the LLM sees them as
|
||||||
// additional capabilities.
|
// additional capabilities.
|
||||||
tools = append(tools, mcpTools...)
|
tools = append(tools, mcpTools...)
|
||||||
tools = append(tools, workspaceMCPTools...)
|
tools = append(tools, workspaceMCPTools...)
|
||||||
|
|
||||||
// Append dynamic tools declared by the client at chat
|
// Append dynamic tools declared by the client at chat
|
||||||
// creation time. These appear in the LLM's tool list but
|
// creation time. These appear in the LLM's tool list but
|
||||||
// are never executed by the chatloop — the client handles
|
// are never executed by the chatloop — the client handles
|
||||||
@@ -5251,6 +5274,8 @@ func (p *Server) runChat(
|
|||||||
Model: model,
|
Model: model,
|
||||||
Messages: prompt,
|
Messages: prompt,
|
||||||
Tools: tools, MaxSteps: maxChatSteps,
|
Tools: tools, MaxSteps: maxChatSteps,
|
||||||
|
Metrics: p.metrics,
|
||||||
|
BuiltinToolNames: builtinToolNames,
|
||||||
|
|
||||||
ModelConfig: callConfig,
|
ModelConfig: callConfig,
|
||||||
ProviderOptions: providerOptions,
|
ProviderOptions: providerOptions,
|
||||||
|
|||||||
@@ -2610,6 +2610,7 @@ func TestProcessChat_IgnoresStaleControlNotification(t *testing.T) {
|
|||||||
chatHeartbeatInterval: time.Minute,
|
chatHeartbeatInterval: time.Minute,
|
||||||
configCache: newChatConfigCache(ctx, db, clock),
|
configCache: newChatConfigCache(ctx, db, clock),
|
||||||
heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry),
|
heartbeatRegistry: make(map[uuid.UUID]*heartbeatEntry),
|
||||||
|
metrics: chatloop.NopMetrics(),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Publish a stale "pending" notification on the control channel
|
// Publish a stale "pending" notification on the control channel
|
||||||
|
|||||||
@@ -150,6 +150,15 @@ type RunOptions struct {
|
|||||||
OnRetry chatretry.OnRetryFn
|
OnRetry chatretry.OnRetryFn
|
||||||
|
|
||||||
OnInterruptedPersistError func(error)
|
OnInterruptedPersistError func(error)
|
||||||
|
|
||||||
|
// Metrics records Prometheus metrics for the chatd subsystem.
|
||||||
|
// When nil, no metrics are recorded.
|
||||||
|
Metrics *Metrics
|
||||||
|
|
||||||
|
// BuiltinToolNames lists tool names that are built into chatd.
|
||||||
|
// Tool results from tools not in this set are recorded under
|
||||||
|
// the "mcp" label to bound cardinality.
|
||||||
|
BuiltinToolNames map[string]bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// ProviderTool pairs a provider-native tool definition with an
|
// ProviderTool pairs a provider-native tool definition with an
|
||||||
@@ -297,6 +306,9 @@ func Run(ctx context.Context, opts RunOptions) error {
|
|||||||
if opts.Clock == nil {
|
if opts.Clock == nil {
|
||||||
opts.Clock = quartz.NewReal()
|
opts.Clock = quartz.NewReal()
|
||||||
}
|
}
|
||||||
|
if opts.Metrics == nil {
|
||||||
|
opts.Metrics = NopMetrics()
|
||||||
|
}
|
||||||
|
|
||||||
publishMessagePart := func(role codersdk.ChatMessageRole, part codersdk.ChatMessagePart) {
|
publishMessagePart := func(role codersdk.ChatMessageRole, part codersdk.ChatMessagePart) {
|
||||||
if opts.PublishMessagePart == nil {
|
if opts.PublishMessagePart == nil {
|
||||||
@@ -343,6 +355,8 @@ func Run(ctx context.Context, opts RunOptions) error {
|
|||||||
|
|
||||||
for step := 0; totalSteps < opts.MaxSteps; step++ {
|
for step := 0; totalSteps < opts.MaxSteps; step++ {
|
||||||
totalSteps++
|
totalSteps++
|
||||||
|
provider := opts.Model.Provider()
|
||||||
|
opts.Metrics.StepsTotal.WithLabelValues(provider).Inc()
|
||||||
stepStart := time.Now()
|
stepStart := time.Now()
|
||||||
// Copy messages so that provider-specific caching
|
// Copy messages so that provider-specific caching
|
||||||
// mutations don't leak back to the caller's slice.
|
// mutations don't leak back to the caller's slice.
|
||||||
@@ -354,7 +368,8 @@ func Run(ctx context.Context, opts RunOptions) error {
|
|||||||
if applyAnthropicCaching {
|
if applyAnthropicCaching {
|
||||||
addAnthropicPromptCaching(prepared)
|
addAnthropicPromptCaching(prepared)
|
||||||
}
|
}
|
||||||
|
opts.Metrics.MessageCount.WithLabelValues(provider).Observe(float64(len(prepared)))
|
||||||
|
opts.Metrics.PromptSizeBytes.WithLabelValues(provider).Observe(float64(EstimatePromptSize(prepared)))
|
||||||
call := fantasy.Call{
|
call := fantasy.Call{
|
||||||
Prompt: prepared,
|
Prompt: prepared,
|
||||||
Tools: tools,
|
Tools: tools,
|
||||||
@@ -371,12 +386,13 @@ func Run(ctx context.Context, opts RunOptions) error {
|
|||||||
err := chatretry.Retry(ctx, func(retryCtx context.Context) error {
|
err := chatretry.Retry(ctx, func(retryCtx context.Context) error {
|
||||||
attempt, streamErr := guardedStream(
|
attempt, streamErr := guardedStream(
|
||||||
retryCtx,
|
retryCtx,
|
||||||
opts.Model.Provider(),
|
provider,
|
||||||
opts.Clock,
|
opts.Clock,
|
||||||
opts.StartupTimeout,
|
opts.StartupTimeout,
|
||||||
func(attemptCtx context.Context) (fantasy.StreamResponse, error) {
|
func(attemptCtx context.Context) (fantasy.StreamResponse, error) {
|
||||||
return opts.Model.Stream(attemptCtx, call)
|
return opts.Model.Stream(attemptCtx, call)
|
||||||
},
|
},
|
||||||
|
opts.Metrics,
|
||||||
)
|
)
|
||||||
if streamErr != nil {
|
if streamErr != nil {
|
||||||
return streamErr
|
return streamErr
|
||||||
@@ -444,7 +460,7 @@ func Run(ctx context.Context, opts RunOptions) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Execute only built-in tools.
|
// Execute only built-in tools.
|
||||||
toolResults = executeTools(ctx, opts.Tools, opts.ProviderTools, builtinCalls, func(tr fantasy.ToolResultContent, completedAt time.Time) {
|
toolResults = executeTools(ctx, opts.Tools, opts.ProviderTools, builtinCalls, opts.Metrics, provider, opts.BuiltinToolNames, func(tr fantasy.ToolResultContent, completedAt time.Time) {
|
||||||
recordToolResultTimestamp(&result, tr.ToolCallID, completedAt)
|
recordToolResultTimestamp(&result, tr.ToolCallID, completedAt)
|
||||||
ssePart := chatprompt.PartFromContent(tr)
|
ssePart := chatprompt.PartFromContent(tr)
|
||||||
ssePart.CreatedAt = &completedAt
|
ssePart.CreatedAt = &completedAt
|
||||||
@@ -582,9 +598,11 @@ func Run(ctx context.Context, opts RunOptions) error {
|
|||||||
result.providerMetadata,
|
result.providerMetadata,
|
||||||
messages,
|
messages,
|
||||||
)
|
)
|
||||||
|
opts.Metrics.RecordCompaction(opts.Model.Provider(), did, compactErr)
|
||||||
if compactErr != nil && opts.Compaction.OnError != nil {
|
if compactErr != nil && opts.Compaction.OnError != nil {
|
||||||
opts.Compaction.OnError(compactErr)
|
opts.Compaction.OnError(compactErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
if did {
|
if did {
|
||||||
alreadyCompacted = true
|
alreadyCompacted = true
|
||||||
compactedOnFinalStep = true
|
compactedOnFinalStep = true
|
||||||
@@ -622,6 +640,7 @@ func Run(ctx context.Context, opts RunOptions) error {
|
|||||||
lastProviderMetadata,
|
lastProviderMetadata,
|
||||||
messages,
|
messages,
|
||||||
)
|
)
|
||||||
|
opts.Metrics.RecordCompaction(opts.Model.Provider(), did, err)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if opts.Compaction.OnError != nil {
|
if opts.Compaction.OnError != nil {
|
||||||
opts.Compaction.OnError(err)
|
opts.Compaction.OnError(err)
|
||||||
@@ -720,6 +739,7 @@ func guardedStream(
|
|||||||
clock quartz.Clock,
|
clock quartz.Clock,
|
||||||
timeout time.Duration,
|
timeout time.Duration,
|
||||||
openStream func(context.Context) (fantasy.StreamResponse, error),
|
openStream func(context.Context) (fantasy.StreamResponse, error),
|
||||||
|
metrics *Metrics,
|
||||||
) (guardedAttempt, error) {
|
) (guardedAttempt, error) {
|
||||||
attemptCtx, cancelAttempt := context.WithCancelCause(parent)
|
attemptCtx, cancelAttempt := context.WithCancelCause(parent)
|
||||||
guard := newStartupGuard(clock, timeout, cancelAttempt)
|
guard := newStartupGuard(clock, timeout, cancelAttempt)
|
||||||
@@ -731,6 +751,7 @@ func guardedStream(
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
streamStart := clock.Now()
|
||||||
stream, err := openStream(attemptCtx)
|
stream, err := openStream(attemptCtx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err = classifyStartupTimeout(attemptCtx, provider, err)
|
err = classifyStartupTimeout(attemptCtx, provider, err)
|
||||||
@@ -738,11 +759,17 @@ func guardedStream(
|
|||||||
return guardedAttempt{}, err
|
return guardedAttempt{}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
recordTTFT := sync.OnceFunc(func() {
|
||||||
|
metrics.TTFTSeconds.WithLabelValues(provider).Observe(
|
||||||
|
clock.Since(streamStart).Seconds(),
|
||||||
|
)
|
||||||
|
})
|
||||||
return guardedAttempt{
|
return guardedAttempt{
|
||||||
ctx: attemptCtx,
|
ctx: attemptCtx,
|
||||||
stream: fantasy.StreamResponse(func(yield func(fantasy.StreamPart) bool) {
|
stream: fantasy.StreamResponse(func(yield func(fantasy.StreamPart) bool) {
|
||||||
for part := range stream {
|
for part := range stream {
|
||||||
guard.Disarm()
|
guard.Disarm()
|
||||||
|
recordTTFT()
|
||||||
if !yield(part) {
|
if !yield(part) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -985,6 +1012,9 @@ func executeTools(
|
|||||||
allTools []fantasy.AgentTool,
|
allTools []fantasy.AgentTool,
|
||||||
providerTools []ProviderTool,
|
providerTools []ProviderTool,
|
||||||
toolCalls []fantasy.ToolCallContent,
|
toolCalls []fantasy.ToolCallContent,
|
||||||
|
metrics *Metrics,
|
||||||
|
provider string,
|
||||||
|
builtinToolNames map[string]bool,
|
||||||
onResult func(fantasy.ToolResultContent, time.Time),
|
onResult func(fantasy.ToolResultContent, time.Time),
|
||||||
) []fantasy.ToolResultContent {
|
) []fantasy.ToolResultContent {
|
||||||
if len(toolCalls) == 0 {
|
if len(toolCalls) == 0 {
|
||||||
@@ -1039,7 +1069,7 @@ func executeTools(
|
|||||||
// accurate individual completion times.
|
// accurate individual completion times.
|
||||||
completedAt[i] = dbtime.Now()
|
completedAt[i] = dbtime.Now()
|
||||||
}()
|
}()
|
||||||
results[i] = executeSingleTool(ctx, toolMap, tc)
|
results[i] = executeSingleTool(ctx, toolMap, tc, metrics, provider, builtinToolNames)
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
@@ -1060,12 +1090,24 @@ func executeSingleTool(
|
|||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
toolMap map[string]fantasy.AgentTool,
|
toolMap map[string]fantasy.AgentTool,
|
||||||
tc fantasy.ToolCallContent,
|
tc fantasy.ToolCallContent,
|
||||||
|
metrics *Metrics,
|
||||||
|
provider string,
|
||||||
|
builtinToolNames map[string]bool,
|
||||||
) fantasy.ToolResultContent {
|
) fantasy.ToolResultContent {
|
||||||
result := fantasy.ToolResultContent{
|
result := fantasy.ToolResultContent{
|
||||||
ToolCallID: tc.ToolCallID,
|
ToolCallID: tc.ToolCallID,
|
||||||
ToolName: tc.ToolName,
|
ToolName: tc.ToolName,
|
||||||
ProviderExecuted: false,
|
ProviderExecuted: false,
|
||||||
}
|
}
|
||||||
|
defer func() {
|
||||||
|
toolLabel := tc.ToolName
|
||||||
|
if !builtinToolNames[tc.ToolName] {
|
||||||
|
toolLabel = "mcp"
|
||||||
|
}
|
||||||
|
metrics.ToolResultSizeBytes.WithLabelValues(provider, toolLabel).Observe(
|
||||||
|
float64(ToolResultSize(result)),
|
||||||
|
)
|
||||||
|
}()
|
||||||
|
|
||||||
tool, exists := toolMap[tc.ToolName]
|
tool, exists := toolMap[tc.ToolName]
|
||||||
if !exists {
|
if !exists {
|
||||||
@@ -1256,7 +1298,7 @@ func tryCompactOnExit(
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
_, compactErr := tryCompact(
|
did, compactErr := tryCompact(
|
||||||
ctx,
|
ctx,
|
||||||
opts.Model,
|
opts.Model,
|
||||||
opts.Compaction,
|
opts.Compaction,
|
||||||
@@ -1265,6 +1307,7 @@ func tryCompactOnExit(
|
|||||||
metadata,
|
metadata,
|
||||||
reloaded,
|
reloaded,
|
||||||
)
|
)
|
||||||
|
opts.Metrics.RecordCompaction(opts.Model.Provider(), did, compactErr)
|
||||||
if compactErr != nil && opts.Compaction.OnError != nil {
|
if compactErr != nil && opts.Compaction.OnError != nil {
|
||||||
opts.Compaction.OnError(compactErr)
|
opts.Compaction.OnError(compactErr)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,173 @@
|
|||||||
|
package chatloop
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
|
||||||
|
"charm.land/fantasy"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
metricsNamespace = "coderd"
|
||||||
|
metricsSubsystem = "chatd"
|
||||||
|
|
||||||
|
// Label values for Chats.
|
||||||
|
StateStreaming = "streaming"
|
||||||
|
StateWaiting = "waiting"
|
||||||
|
|
||||||
|
// Label values for CompactionTotal.
|
||||||
|
CompactionResultSuccess = "success"
|
||||||
|
CompactionResultError = "error"
|
||||||
|
CompactionResultTimeout = "timeout"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Metrics holds Prometheus metrics for the chatd subsystem.
|
||||||
|
type Metrics struct {
|
||||||
|
Chats *prometheus.GaugeVec
|
||||||
|
MessageCount *prometheus.HistogramVec
|
||||||
|
PromptSizeBytes *prometheus.HistogramVec
|
||||||
|
ToolResultSizeBytes *prometheus.HistogramVec
|
||||||
|
TTFTSeconds *prometheus.HistogramVec
|
||||||
|
CompactionTotal *prometheus.CounterVec
|
||||||
|
StepsTotal *prometheus.CounterVec
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewMetrics creates a new Metrics instance registered with the
|
||||||
|
// given registerer.
|
||||||
|
func NewMetrics(reg prometheus.Registerer) *Metrics {
|
||||||
|
factory := promauto.With(reg)
|
||||||
|
return &Metrics{
|
||||||
|
Chats: factory.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsSubsystem,
|
||||||
|
Name: "chats",
|
||||||
|
Help: "Number of chats being processed, by state.",
|
||||||
|
}, []string{"state"}),
|
||||||
|
MessageCount: factory.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsSubsystem,
|
||||||
|
Name: "message_count",
|
||||||
|
Help: "Number of messages in the prompt per LLM request.",
|
||||||
|
Buckets: prometheus.ExponentialBuckets(1, 2, 8), // 1, 2, 4, 8, 16, 32, 64, 128
|
||||||
|
}, []string{"provider"}),
|
||||||
|
PromptSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsSubsystem,
|
||||||
|
Name: "prompt_size_bytes",
|
||||||
|
Help: "Estimated byte size of the prompt per LLM request.",
|
||||||
|
Buckets: prometheus.ExponentialBuckets(1024, 4, 10), // 1KB .. 256MB
|
||||||
|
}, []string{"provider"}),
|
||||||
|
ToolResultSizeBytes: factory.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsSubsystem,
|
||||||
|
Name: "tool_result_size_bytes",
|
||||||
|
Help: "Size in bytes of each tool execution result.",
|
||||||
|
Buckets: prometheus.ExponentialBuckets(64, 4, 9), // 64B .. 4MB
|
||||||
|
}, []string{"provider", "tool_name"}),
|
||||||
|
TTFTSeconds: factory.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsSubsystem,
|
||||||
|
Name: "ttft_seconds",
|
||||||
|
Help: "Time-to-first-token: wall time from LLM request to first streamed chunk.",
|
||||||
|
Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
|
||||||
|
}, []string{"provider"}),
|
||||||
|
CompactionTotal: factory.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsSubsystem,
|
||||||
|
Name: "compaction_total",
|
||||||
|
Help: "Total compaction outcomes (only recorded when compaction was triggered or failed).",
|
||||||
|
}, []string{"provider", "result"}),
|
||||||
|
StepsTotal: factory.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsSubsystem,
|
||||||
|
Name: "steps_total",
|
||||||
|
Help: "Total agentic loop steps across all chats.",
|
||||||
|
}, []string{"provider"}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NopMetrics returns a Metrics instance that discards all data.
|
||||||
|
// Useful for tests and when metrics collection is not desired.
|
||||||
|
func NopMetrics() *Metrics {
|
||||||
|
return NewMetrics(prometheus.NewRegistry())
|
||||||
|
}
|
||||||
|
|
||||||
|
// RecordCompaction classifies and records a compaction attempt.
|
||||||
|
// It is a no-op when m is nil.
|
||||||
|
func (m *Metrics) RecordCompaction(provider string, compacted bool, err error) {
|
||||||
|
if m == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case err != nil && errors.Is(err, context.DeadlineExceeded):
|
||||||
|
m.CompactionTotal.WithLabelValues(provider, CompactionResultTimeout).Inc()
|
||||||
|
case err != nil && errors.Is(err, context.Canceled):
|
||||||
|
// User interruption, not a compaction failure.
|
||||||
|
return
|
||||||
|
case err != nil:
|
||||||
|
m.CompactionTotal.WithLabelValues(provider, CompactionResultError).Inc()
|
||||||
|
case compacted:
|
||||||
|
m.CompactionTotal.WithLabelValues(provider, CompactionResultSuccess).Inc()
|
||||||
|
// !compacted && err == nil means threshold not reached -- not
|
||||||
|
// recorded.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// EstimatePromptSize returns a cheap byte-size estimate of a
|
||||||
|
// fantasy prompt by summing the text content lengths of all
|
||||||
|
// message parts. This avoids JSON marshaling overhead.
|
||||||
|
func EstimatePromptSize(messages []fantasy.Message) int {
|
||||||
|
var size int
|
||||||
|
for _, msg := range messages {
|
||||||
|
for _, part := range msg.Content {
|
||||||
|
size += ContentPartSize(part)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return size
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContentPartSize returns the byte length of a MessagePart's
|
||||||
|
// primary text or data field.
|
||||||
|
func ContentPartSize(part fantasy.MessagePart) int {
|
||||||
|
switch p := part.(type) {
|
||||||
|
case fantasy.TextPart:
|
||||||
|
return len(p.Text)
|
||||||
|
case fantasy.ReasoningPart:
|
||||||
|
return len(p.Text)
|
||||||
|
case fantasy.FilePart:
|
||||||
|
return len(p.Data)
|
||||||
|
case fantasy.ToolCallPart:
|
||||||
|
return len(p.Input)
|
||||||
|
case fantasy.ToolResultPart:
|
||||||
|
return toolResultOutputSize(p.Output)
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToolResultSize returns the byte length of a
|
||||||
|
// ToolResultContent's primary text or data field.
|
||||||
|
func ToolResultSize(r fantasy.ToolResultContent) int {
|
||||||
|
return toolResultOutputSize(r.Result)
|
||||||
|
}
|
||||||
|
|
||||||
|
func toolResultOutputSize(output fantasy.ToolResultOutputContent) int {
|
||||||
|
if output == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
switch v := output.(type) {
|
||||||
|
case fantasy.ToolResultOutputContentText:
|
||||||
|
return len(v.Text)
|
||||||
|
case fantasy.ToolResultOutputContentError:
|
||||||
|
if v.Error != nil {
|
||||||
|
return len(v.Error.Error())
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
case fantasy.ToolResultOutputContentMedia:
|
||||||
|
return len(v.Data)
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,342 @@
|
|||||||
|
package chatloop_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"charm.land/fantasy"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
dto "github.com/prometheus/client_model/go"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"github.com/coder/coder/v2/coderd/x/chatd/chatloop"
|
||||||
|
"github.com/coder/coder/v2/coderd/x/chatd/chattest"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestNewMetrics_RegistersAllMetrics(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
m := chatloop.NewMetrics(reg)
|
||||||
|
|
||||||
|
// Initialize vector metrics so they appear in Gather output.
|
||||||
|
m.Chats.WithLabelValues(chatloop.StateStreaming)
|
||||||
|
m.CompactionTotal.WithLabelValues("anthropic", chatloop.CompactionResultSuccess)
|
||||||
|
m.ToolResultSizeBytes.WithLabelValues("anthropic", "test")
|
||||||
|
m.MessageCount.WithLabelValues("anthropic")
|
||||||
|
m.PromptSizeBytes.WithLabelValues("anthropic")
|
||||||
|
m.TTFTSeconds.WithLabelValues("anthropic")
|
||||||
|
m.StepsTotal.WithLabelValues("anthropic")
|
||||||
|
|
||||||
|
families, err := reg.Gather()
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
expected := map[string]dto.MetricType{
|
||||||
|
"coderd_chatd_chats": dto.MetricType_GAUGE,
|
||||||
|
"coderd_chatd_message_count": dto.MetricType_HISTOGRAM,
|
||||||
|
"coderd_chatd_prompt_size_bytes": dto.MetricType_HISTOGRAM,
|
||||||
|
"coderd_chatd_tool_result_size_bytes": dto.MetricType_HISTOGRAM,
|
||||||
|
"coderd_chatd_ttft_seconds": dto.MetricType_HISTOGRAM,
|
||||||
|
"coderd_chatd_compaction_total": dto.MetricType_COUNTER,
|
||||||
|
"coderd_chatd_steps_total": dto.MetricType_COUNTER,
|
||||||
|
}
|
||||||
|
|
||||||
|
found := make(map[string]dto.MetricType)
|
||||||
|
for _, f := range families {
|
||||||
|
found[f.GetName()] = f.GetType()
|
||||||
|
}
|
||||||
|
|
||||||
|
for name, expectedType := range expected {
|
||||||
|
actualType, ok := found[name]
|
||||||
|
assert.True(t, ok, "metric %q not registered", name)
|
||||||
|
if ok {
|
||||||
|
assert.Equal(t, expectedType, actualType, "metric %q has wrong type", name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNopMetrics_DoesNotPanic(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
m := chatloop.NopMetrics()
|
||||||
|
|
||||||
|
// Exercise every metric to confirm no nil-pointer panics.
|
||||||
|
m.Chats.WithLabelValues("streaming").Inc()
|
||||||
|
m.Chats.WithLabelValues("streaming").Dec()
|
||||||
|
m.Chats.WithLabelValues("waiting").Inc()
|
||||||
|
m.Chats.WithLabelValues("waiting").Dec()
|
||||||
|
m.MessageCount.WithLabelValues("anthropic").Observe(10)
|
||||||
|
m.PromptSizeBytes.WithLabelValues("openai").Observe(4096)
|
||||||
|
m.ToolResultSizeBytes.WithLabelValues("anthropic", "execute").Observe(512)
|
||||||
|
m.TTFTSeconds.WithLabelValues("anthropic").Observe(0.5)
|
||||||
|
m.CompactionTotal.WithLabelValues("anthropic", "success").Inc()
|
||||||
|
m.CompactionTotal.WithLabelValues("openai", "error").Inc()
|
||||||
|
m.CompactionTotal.WithLabelValues("google", "timeout").Inc()
|
||||||
|
m.StepsTotal.WithLabelValues("anthropic").Inc()
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEstimatePromptSize(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
messages := []fantasy.Message{
|
||||||
|
{
|
||||||
|
Role: fantasy.MessageRoleSystem,
|
||||||
|
Content: []fantasy.MessagePart{
|
||||||
|
fantasy.TextPart{Text: "You are a helpful assistant."},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Role: fantasy.MessageRoleUser,
|
||||||
|
Content: []fantasy.MessagePart{
|
||||||
|
fantasy.TextPart{Text: "Hello world"},
|
||||||
|
fantasy.ReasoningPart{Text: "thinking..."},
|
||||||
|
fantasy.FilePart{Data: []byte("filedata")},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Role: fantasy.MessageRoleAssistant,
|
||||||
|
Content: []fantasy.MessagePart{
|
||||||
|
fantasy.TextPart{Text: "Hi there!"},
|
||||||
|
fantasy.ToolCallPart{Input: `{"file":"main.go"}`},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Role: fantasy.MessageRoleTool,
|
||||||
|
Content: []fantasy.MessagePart{
|
||||||
|
fantasy.ToolResultPart{
|
||||||
|
Output: fantasy.ToolResultOutputContentText{Text: "result"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
size := chatloop.EstimatePromptSize(messages)
|
||||||
|
// "You are a helpful assistant." (28) + "Hello world" (11) +
|
||||||
|
// "thinking..." (11) + "filedata" (8) +
|
||||||
|
// "Hi there!" (9) + `{"file":"main.go"}` (18) +
|
||||||
|
// "result" (6) = 91
|
||||||
|
assert.Equal(t, 91, size)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestToolResultSize(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
result fantasy.ToolResultContent
|
||||||
|
expected int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "text",
|
||||||
|
result: fantasy.ToolResultContent{
|
||||||
|
Result: fantasy.ToolResultOutputContentText{Text: "hello"},
|
||||||
|
},
|
||||||
|
expected: 5,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "error",
|
||||||
|
result: fantasy.ToolResultContent{
|
||||||
|
Result: fantasy.ToolResultOutputContentError{
|
||||||
|
Error: assert.AnError,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expected: len(assert.AnError.Error()),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "media",
|
||||||
|
result: fantasy.ToolResultContent{
|
||||||
|
Result: fantasy.ToolResultOutputContentMedia{Data: "base64data"},
|
||||||
|
},
|
||||||
|
expected: 10,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "nil_result",
|
||||||
|
result: fantasy.ToolResultContent{},
|
||||||
|
expected: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "error_nil_error",
|
||||||
|
result: fantasy.ToolResultContent{
|
||||||
|
Result: fantasy.ToolResultOutputContentError{Error: nil},
|
||||||
|
},
|
||||||
|
expected: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
assert.Equal(t, tt.expected, chatloop.ToolResultSize(tt.result))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRecordCompaction(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
t.Run("nil metrics does not panic", func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
var m *chatloop.Metrics
|
||||||
|
m.RecordCompaction("anthropic", true, nil)
|
||||||
|
})
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
compacted bool
|
||||||
|
err error
|
||||||
|
wantLabel string
|
||||||
|
wantCount int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "success",
|
||||||
|
compacted: true,
|
||||||
|
err: nil,
|
||||||
|
wantLabel: chatloop.CompactionResultSuccess,
|
||||||
|
wantCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "error",
|
||||||
|
compacted: false,
|
||||||
|
err: assert.AnError,
|
||||||
|
wantLabel: chatloop.CompactionResultError,
|
||||||
|
wantCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "timeout",
|
||||||
|
compacted: false,
|
||||||
|
err: context.DeadlineExceeded,
|
||||||
|
wantLabel: chatloop.CompactionResultTimeout,
|
||||||
|
wantCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "threshold_not_reached",
|
||||||
|
compacted: false,
|
||||||
|
err: nil,
|
||||||
|
wantLabel: "",
|
||||||
|
wantCount: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "canceled",
|
||||||
|
compacted: false,
|
||||||
|
err: context.Canceled,
|
||||||
|
wantLabel: "",
|
||||||
|
wantCount: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
m := chatloop.NewMetrics(reg)
|
||||||
|
m.RecordCompaction("test", tt.compacted, tt.err)
|
||||||
|
|
||||||
|
families, err := reg.Gather()
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
if tt.wantCount == 0 {
|
||||||
|
for _, f := range families {
|
||||||
|
assert.NotEqual(t, "coderd_chatd_compaction_total", f.GetName(),
|
||||||
|
"compaction_total should not be recorded")
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var found bool
|
||||||
|
for _, f := range families {
|
||||||
|
if f.GetName() != "coderd_chatd_compaction_total" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
found = true
|
||||||
|
require.Len(t, f.GetMetric(), 1)
|
||||||
|
metric := f.GetMetric()[0]
|
||||||
|
assert.Equal(t, float64(tt.wantCount), metric.GetCounter().GetValue())
|
||||||
|
// Check label.
|
||||||
|
for _, lp := range metric.GetLabel() {
|
||||||
|
if lp.GetName() == "result" {
|
||||||
|
assert.Equal(t, tt.wantLabel, lp.GetValue())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert.True(t, found, "compaction_total metric not found")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRun_RecordsMetrics(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
metrics := chatloop.NewMetrics(reg)
|
||||||
|
|
||||||
|
model := &chattest.FakeModel{
|
||||||
|
ProviderName: "test-provider",
|
||||||
|
StreamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
|
||||||
|
return func(yield func(fantasy.StreamPart) bool) {
|
||||||
|
parts := []fantasy.StreamPart{
|
||||||
|
{Type: fantasy.StreamPartTypeTextStart, ID: "t1"},
|
||||||
|
{Type: fantasy.StreamPartTypeTextDelta, ID: "t1", Delta: "hello"},
|
||||||
|
{Type: fantasy.StreamPartTypeTextEnd, ID: "t1"},
|
||||||
|
{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonStop},
|
||||||
|
}
|
||||||
|
for _, p := range parts {
|
||||||
|
if !yield(p) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
err := chatloop.Run(context.Background(), chatloop.RunOptions{
|
||||||
|
Model: model,
|
||||||
|
Messages: []fantasy.Message{
|
||||||
|
{
|
||||||
|
Role: fantasy.MessageRoleUser,
|
||||||
|
Content: []fantasy.MessagePart{
|
||||||
|
fantasy.TextPart{Text: "hello"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
MaxSteps: 1,
|
||||||
|
PersistStep: func(_ context.Context, _ chatloop.PersistedStep) error {
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
Metrics: metrics,
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
families, err := reg.Gather()
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
found := make(map[string]bool)
|
||||||
|
for _, f := range families {
|
||||||
|
found[f.GetName()] = true
|
||||||
|
|
||||||
|
switch f.GetName() {
|
||||||
|
case "coderd_chatd_steps_total":
|
||||||
|
require.Len(t, f.GetMetric(), 1)
|
||||||
|
assert.Equal(t, float64(1), f.GetMetric()[0].GetCounter().GetValue(),
|
||||||
|
"steps_total should be 1 after one step")
|
||||||
|
case "coderd_chatd_message_count":
|
||||||
|
require.Len(t, f.GetMetric(), 1)
|
||||||
|
assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
|
||||||
|
"message_count should have 1 observation")
|
||||||
|
case "coderd_chatd_prompt_size_bytes":
|
||||||
|
require.Len(t, f.GetMetric(), 1)
|
||||||
|
assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
|
||||||
|
"prompt_size_bytes should have 1 observation")
|
||||||
|
case "coderd_chatd_ttft_seconds":
|
||||||
|
require.Len(t, f.GetMetric(), 1)
|
||||||
|
assert.Equal(t, uint64(1), f.GetMetric()[0].GetHistogram().GetSampleCount(),
|
||||||
|
"ttft_seconds should have 1 observation")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert.True(t, found["coderd_chatd_steps_total"], "steps_total not recorded")
|
||||||
|
assert.True(t, found["coderd_chatd_message_count"], "message_count not recorded")
|
||||||
|
assert.True(t, found["coderd_chatd_prompt_size_bytes"], "prompt_size_bytes not recorded")
|
||||||
|
assert.True(t, found["coderd_chatd_ttft_seconds"], "ttft_seconds not recorded")
|
||||||
|
}
|
||||||
@@ -198,6 +198,13 @@ deployment. They will always be available from the agent.
|
|||||||
| `coderd_authz_authorize_duration_seconds` | histogram | Duration of the 'Authorize' call in seconds. Only counts calls that succeed. | `allowed` |
|
| `coderd_authz_authorize_duration_seconds` | histogram | Duration of the 'Authorize' call in seconds. Only counts calls that succeed. | `allowed` |
|
||||||
| `coderd_authz_prepare_authorize_duration_seconds` | histogram | Duration of the 'PrepareAuthorize' call in seconds. | |
|
| `coderd_authz_prepare_authorize_duration_seconds` | histogram | Duration of the 'PrepareAuthorize' call in seconds. | |
|
||||||
| `coderd_build_info` | gauge | Describes the current build/version of the Coder server. Value is always 1. | `revision` `version` |
|
| `coderd_build_info` | gauge | Describes the current build/version of the Coder server. Value is always 1. | `revision` `version` |
|
||||||
|
| `coderd_chatd_chats` | gauge | Number of chats being processed, by state. | `state` |
|
||||||
|
| `coderd_chatd_compaction_total` | counter | Total compaction outcomes (only recorded when compaction was triggered or failed). | `provider` `result` |
|
||||||
|
| `coderd_chatd_message_count` | histogram | Number of messages in the prompt per LLM request. | `provider` |
|
||||||
|
| `coderd_chatd_prompt_size_bytes` | histogram | Estimated byte size of the prompt per LLM request. | `provider` |
|
||||||
|
| `coderd_chatd_steps_total` | counter | Total agentic loop steps across all chats. | `provider` |
|
||||||
|
| `coderd_chatd_tool_result_size_bytes` | histogram | Size in bytes of each tool execution result. | `provider` `tool_name` |
|
||||||
|
| `coderd_chatd_ttft_seconds` | histogram | Time-to-first-token: wall time from LLM request to first streamed chunk. | `provider` |
|
||||||
| `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` |
|
| `coderd_db_query_counts_total` | counter | Total number of queries labelled by HTTP route, method, and query name. | `method` `query` `route` |
|
||||||
| `coderd_db_query_latencies_seconds` | histogram | Latency distribution of queries in seconds. | `query` |
|
| `coderd_db_query_latencies_seconds` | histogram | Latency distribution of queries in seconds. | `query` |
|
||||||
| `coderd_db_tx_duration_seconds` | histogram | Duration of transactions in seconds. | `success` `tx_id` |
|
| `coderd_db_tx_duration_seconds` | histogram | Duration of transactions in seconds. | `success` `tx_id` |
|
||||||
|
|||||||
@@ -2908,7 +2908,7 @@ func TestPrebuildActivityBump(t *testing.T) {
|
|||||||
require.Zero(t, prebuild.LatestBuild.MaxDeadline)
|
require.Zero(t, prebuild.LatestBuild.MaxDeadline)
|
||||||
|
|
||||||
// When: activity bump is applied to an unclaimed prebuild
|
// When: activity bump is applied to an unclaimed prebuild
|
||||||
workspacestats.ActivityBumpWorkspace(ctx, log, db, prebuild.ID, clock.Now().Add(10*time.Hour))
|
workspacestats.ActivityBumpWorkspace(ctx, log, db, prebuild.ID, clock.Now().Add(10*time.Hour), workspacestats.ActivityBumpReasonWorkspaceStats)
|
||||||
|
|
||||||
// Then: prebuild Deadline/MaxDeadline remain unchanged
|
// Then: prebuild Deadline/MaxDeadline remain unchanged
|
||||||
prebuild = coderdtest.MustWorkspace(t, client, wb.Workspace.ID)
|
prebuild = coderdtest.MustWorkspace(t, client, wb.Workspace.ID)
|
||||||
@@ -2941,7 +2941,7 @@ func TestPrebuildActivityBump(t *testing.T) {
|
|||||||
workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID)
|
workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID)
|
||||||
|
|
||||||
// When: activity bump is applied to a claimed prebuild
|
// When: activity bump is applied to a claimed prebuild
|
||||||
workspacestats.ActivityBumpWorkspace(ctx, log, db, workspace.ID, clock.Now().Add(10*time.Hour))
|
workspacestats.ActivityBumpWorkspace(ctx, log, db, workspace.ID, clock.Now().Add(10*time.Hour), workspacestats.ActivityBumpReasonWorkspaceStats)
|
||||||
|
|
||||||
// Then: Deadline is extended by the activity bump, MaxDeadline remains unset
|
// Then: Deadline is extended by the activity bump, MaxDeadline remains unset
|
||||||
workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID)
|
workspace = coderdtest.MustWorkspace(t, client, claimedWorkspace.ID)
|
||||||
|
|||||||
@@ -226,6 +226,27 @@ coderd_authz_prepare_authorize_duration_seconds 0
|
|||||||
# HELP coderd_build_info Describes the current build/version of the Coder server. Value is always 1.
|
# HELP coderd_build_info Describes the current build/version of the Coder server. Value is always 1.
|
||||||
# TYPE coderd_build_info gauge
|
# TYPE coderd_build_info gauge
|
||||||
coderd_build_info{version="",revision=""} 0
|
coderd_build_info{version="",revision=""} 0
|
||||||
|
# HELP coderd_chatd_chats Number of chats being processed, by state.
|
||||||
|
# TYPE coderd_chatd_chats gauge
|
||||||
|
coderd_chatd_chats{state=""} 0
|
||||||
|
# HELP coderd_chatd_compaction_total Total compaction outcomes (only recorded when compaction was triggered or failed).
|
||||||
|
# TYPE coderd_chatd_compaction_total counter
|
||||||
|
coderd_chatd_compaction_total{provider="",result=""} 0
|
||||||
|
# HELP coderd_chatd_message_count Number of messages in the prompt per LLM request.
|
||||||
|
# TYPE coderd_chatd_message_count histogram
|
||||||
|
coderd_chatd_message_count{provider=""} 0
|
||||||
|
# HELP coderd_chatd_prompt_size_bytes Estimated byte size of the prompt per LLM request.
|
||||||
|
# TYPE coderd_chatd_prompt_size_bytes histogram
|
||||||
|
coderd_chatd_prompt_size_bytes{provider=""} 0
|
||||||
|
# HELP coderd_chatd_steps_total Total agentic loop steps across all chats.
|
||||||
|
# TYPE coderd_chatd_steps_total counter
|
||||||
|
coderd_chatd_steps_total{provider=""} 0
|
||||||
|
# HELP coderd_chatd_tool_result_size_bytes Size in bytes of each tool execution result.
|
||||||
|
# TYPE coderd_chatd_tool_result_size_bytes histogram
|
||||||
|
coderd_chatd_tool_result_size_bytes{provider="",tool_name=""} 0
|
||||||
|
# HELP coderd_chatd_ttft_seconds Time-to-first-token: wall time from LLM request to first streamed chunk.
|
||||||
|
# TYPE coderd_chatd_ttft_seconds histogram
|
||||||
|
coderd_chatd_ttft_seconds{provider=""} 0
|
||||||
# HELP coderd_db_query_counts_total Total number of queries labelled by HTTP route, method, and query name.
|
# HELP coderd_db_query_counts_total Total number of queries labelled by HTTP route, method, and query name.
|
||||||
# TYPE coderd_db_query_counts_total counter
|
# TYPE coderd_db_query_counts_total counter
|
||||||
coderd_db_query_counts_total{route="",method="",query=""} 0
|
coderd_db_query_counts_total{route="",method="",query=""} 0
|
||||||
|
|||||||
Reference in New Issue
Block a user