Files
coder/coderd/prometheusmetrics/prometheusmetrics.go
T
Callum Styan 730edba87a fix: fix false positive disconnected agent metric reporting (#24225)
We noticed during higher active workspace counts that the agent
connection metric, generated via a query to the database, would report a
relatively high amount of agents as disconnected. Somewhere between 5
and 20%. However, other metrics such as # of websocket connections would
suggest that all agent connections are healthy.

Looking at the `Agents` function in prometheus metrics, plus the query
execution time (not accounting for actual database RT time) revealed
that this reporting of agents as disconnected was almost certainly false
positives due to clock drift in the way we're generating the metric
values. At 10k metrics, with a p50 of 2ms and p99 of 5ms, the entire
`agents` function could take upwards of 50s to execute. Because we were
doing a query/database RT to query the apps for each agent individually,
and grabbing a `time.Now` value on each iteration of that loop, it's
likely the portion of agents that were reported as disconnected were
those that had last heartbeat the furthest in the past.

The fix here is to set a consistent `now` before fetching agent data to
avoid clock drift inflating the inactive timeout comparison, and replace
the per-agent app query N+1 with a single batched lookup to prevent loop
execution time from pushing agents over the disconnected threshold.

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:23:06 -07:00

726 lines
24 KiB
Go

package prometheusmetrics
import (
"context"
"database/sql"
"errors"
"fmt"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"
"tailscale.com/tailcfg"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/agentmetrics"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/tailnet"
"github.com/coder/quartz"
)
// defaultRefreshRate is the base collection interval the collectors in this
// file fall back to when the caller passes a zero duration (Users uses five
// times this value).
const defaultRefreshRate = time.Minute
// ActiveUsers starts a background collector for the
// coderd_api_active_users_duration_hour gauge: the number of distinct users
// owning an API key that was last used within the past hour.
//
// The gauge is recomputed every duration (defaultRefreshRate when duration is
// zero); the first computation happens one full interval after the call.
// Query failures are logged and leave the previous value untouched.
//
// The returned function cancels the collector and blocks until its goroutine
// has exited. A non-nil error means the gauge could not be registered with
// registerer.
func ActiveUsers(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
	if duration == 0 {
		duration = defaultRefreshRate
	}

	activeUsers := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "api",
		Name:      "active_users_duration_hour",
		Help:      "The number of users that have been active within the last hour.",
	})
	if err := registerer.Register(activeUsers); err != nil {
		return nil, err
	}

	ctx, cancel := context.WithCancel(ctx)
	stopped := make(chan struct{})
	ticker := time.NewTicker(duration)

	// refresh recomputes the gauge from the API keys used in the last hour.
	refresh := func() {
		since := dbtime.Now().Add(-1 * time.Hour)
		keys, err := db.GetAPIKeysLastUsedAfter(ctx, since)
		if err != nil {
			logger.Error(ctx, "get api keys for active users prometheus metric", slog.Error(err))
			return
		}

		// A user may own several keys; count each user only once.
		seen := map[uuid.UUID]struct{}{}
		for _, key := range keys {
			seen[key.UserID] = struct{}{}
		}
		activeUsers.Set(float64(len(seen)))
	}

	go func() {
		defer close(stopped)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				refresh()
			case <-ctx.Done():
				return
			}
		}
	}()

	return func() {
		cancel()
		<-stopped
	}, nil
}
// Users starts a background collector for the coderd_api_total_user_count
// gauge vector, which reports the number of registered users partitioned by
// the "status" label.
//
// The vector is rebuilt every duration (five times defaultRefreshRate when
// duration is zero) using a ticker obtained from clk. The returned function
// cancels the collector and blocks until its goroutine has exited. A non-nil
// error means the gauge could not be registered with registerer.
func Users(ctx context.Context, logger slog.Logger, clk quartz.Clock, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
	if duration == 0 {
		// It's not super important this tracks real-time.
		duration = defaultRefreshRate * 5
	}

	gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "api",
		Name:      "total_user_count",
		Help:      "The total number of registered users, partitioned by status.",
	}, []string{"status"})
	err := registerer.Register(gauge)
	if err != nil {
		return nil, xerrors.Errorf("register total_user_count gauge: %w", err)
	}

	ctx, cancelFunc := context.WithCancel(ctx)
	done := make(chan struct{})
	ticker := clk.NewTicker(duration)
	go func() {
		defer close(done)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}

			//nolint:gocritic // This is a system service that needs full access
			//to the users table.
			users, err := db.GetUsers(dbauthz.AsSystemRestricted(ctx), database.GetUsersParams{})
			if err != nil {
				// Keep the last successfully collected values rather than
				// leaving the metric empty until the next good tick.
				logger.Error(ctx, "get all users for prometheus metrics", slog.Error(err))
				continue
			}

			// Reset only once the fresh data is in hand, so a scrape that
			// lands during the database round trip still sees the previous
			// counts instead of an empty vector.
			gauge.Reset()
			for _, user := range users {
				gauge.WithLabelValues(string(user.Status)).Inc()
			}
		}
	}()
	return func() {
		cancelFunc()
		<-done
	}, nil
}
// Workspaces starts a background collector for three workspace metrics:
//
//   - coderd_api_workspace_latest_build: workspaces per latest build status.
//   - coderd_workspace_latest_build_status: workspaces per latest build
//     status, template, template version, owner and transition.
//   - coderd_workspace_creation_total: regular (non-prebuilt) workspace
//     creations per organization, template and preset.
//
// The first collection runs almost immediately after the call and then
// repeats every duration (defaultRefreshRate when duration is zero). If
// loading the workspaces fails, the creation counter is not refreshed in that
// run either.
//
// The returned function cancels the collector and blocks until its goroutine
// has exited. A non-nil error means one of the metrics could not be
// registered with registerer.
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
	if duration == 0 {
		duration = defaultRefreshRate
	}

	buildTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "api",
		Name:      "workspace_latest_build",
		Help:      "The current number of workspace builds by status for all non-deleted workspaces.",
	}, []string{"status"})
	if err := registerer.Register(buildTotals); err != nil {
		return nil, err
	}

	buildStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Name:      "workspace_latest_build_status",
		Help:      "The current workspace statuses by template, transition, and owner for all non-deleted workspaces.",
	}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"})
	if err := registerer.Register(buildStatuses); err != nil {
		return nil, err
	}

	creations := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "coderd",
			Name:      "workspace_creation_total",
			Help:      "Total regular (non-prebuilt) workspace creations by organization, template, and preset.",
		},
		[]string{"organization_name", "template_name", "preset_name"},
	)
	if err := registerer.Register(creations); err != nil {
		return nil, err
	}

	ctx, cancel := context.WithCancel(ctx)
	stopped := make(chan struct{})

	// refreshBuildGauges rebuilds both latest-build gauges. It reports
	// whether the workspaces were loaded successfully; on false the rest of
	// the collection run is skipped.
	refreshBuildGauges := func() bool {
		// Don't count deleted workspaces as part of these metrics.
		rows, err := db.GetWorkspacesForWorkspaceMetrics(ctx)
		if err != nil && !errors.Is(err, sql.ErrNoRows) {
			logger.Warn(ctx, "failed to load active workspaces for metrics", slog.Error(err))
			return false
		}

		// Either we have fresh rows or there are none at all; in both cases
		// the old series must go.
		buildTotals.Reset()
		buildStatuses.Reset()
		if err != nil {
			return false
		}

		for _, row := range rows {
			buildStatus := string(row.LatestBuildStatus)
			buildTotals.WithLabelValues(buildStatus).Add(1)
			buildStatuses.WithLabelValues(
				buildStatus,
				row.TemplateName,
				row.TemplateVersionName.String,
				row.OwnerUsername,
				string(row.LatestBuildTransition),
			).Add(1)
		}
		return true
	}

	// refreshCreationCounter rebuilds the creation counter for regular
	// workspaces (those without a prebuild transition).
	refreshCreationCounter := func() {
		rows, err := db.GetRegularWorkspaceCreateMetrics(ctx)
		if err != nil && !errors.Is(err, sql.ErrNoRows) {
			logger.Warn(ctx, "failed to load regular workspaces for metrics", slog.Error(err))
			return
		}

		creations.Reset()
		if err != nil {
			return
		}

		for _, row := range rows {
			creations.WithLabelValues(
				row.OrganizationName,
				row.TemplateName,
				row.PresetName,
			).Add(float64(row.CreatedCount))
		}
	}

	// Use time.Nanosecond to force an initial tick. It will be reset to the
	// correct duration after executing once.
	ticker := time.NewTicker(time.Nanosecond)

	go func() {
		defer close(stopped)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}

			if refreshBuildGauges() {
				refreshCreationCounter()
			}
			ticker.Reset(duration)
		}
	}()

	return func() {
		cancel()
		<-stopped
	}, nil
}
// Agents starts a background collector for the workspace agent metrics:
//
//   - coderd_agents_up: agents per owner, workspace, template and template
//     version.
//   - coderd_agents_connections: one series per agent carrying its connection
//     status, lifecycle state and tailnet node.
//   - coderd_agents_connection_latencies_seconds: DERP latency per agent and
//     region.
//   - coderd_agents_apps: agent applications and their health.
//   - coderd_agents_first_connection_seconds: time from agent creation to its
//     first connection, observed once per agent.
//   - coderd_prometheusmetrics_agents_execution_seconds: duration of each
//     collection run.
//
// The first collection runs almost immediately after the call and then
// repeats every duration (defaultRefreshRate when duration is zero), measured
// from the end of the previous run. agentInactiveDisconnectTimeout is passed
// to WorkspaceAgent.Status to derive each agent's connection status.
// derpMapFn is called once per run and coordinator is loaded once per agent;
// both results are dereferenced without a nil check, so they must be non-nil
// by the time the collector runs.
//
// The returned function cancels the collector and blocks until its goroutine
// has exited. A non-nil error means one of the metrics could not be
// registered with registerer.
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) {
	if duration == 0 {
		duration = defaultRefreshRate
	}

	agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelTemplateName, "template_version"}))
	err := registerer.Register(agentsGauge)
	if err != nil {
		return nil, err
	}

	agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "connections",
		Help:      "Agent connections with statuses.",
	}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "status", "lifecycle_state", "tailnet_node"}))
	err = registerer.Register(agentsConnectionsGauge)
	if err != nil {
		return nil, err
	}

	agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "connection_latencies_seconds",
		Help:      "Agent connection latencies in seconds.",
	}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "derp_region", "preferred"}))
	err = registerer.Register(agentsConnectionLatenciesGauge)
	if err != nil {
		return nil, err
	}

	agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "apps",
		Help:      "Agent applications with statuses.",
	}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "app_name", "health"}))
	err = registerer.Register(agentsAppsGauge)
	if err != nil {
		return nil, err
	}

	agentsFirstConnectionHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "first_connection_seconds",
		Help:      "Duration from agent creation to first connection to the control plane in seconds.",
		Buckets:   []float64{1, 10, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{agentmetrics.LabelTemplateName, agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName})
	err = registerer.Register(agentsFirstConnectionHistogram)
	if err != nil {
		return nil, err
	}

	metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "coderd",
		Subsystem: "prometheusmetrics",
		Name:      "agents_execution_seconds",
		Help:      "Histogram for duration of agents metrics collection in seconds.",
		Buckets:   []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
	})
	err = registerer.Register(metricsCollectorAgents)
	if err != nil {
		return nil, err
	}

	// observedFirstConnection tracks which agents have already had
	// their first-connection duration recorded in the histogram.
	// Each agent is observed exactly once; the map is pruned every
	// tick to remove agents that no longer appear in the query.
	// It is only touched from the collector goroutine below.
	observedFirstConnection := make(map[uuid.UUID]struct{})

	ctx, cancelFunc := context.WithCancel(ctx)
	// nolint:gocritic // Prometheus must collect metrics for all Coder users.
	ctx = dbauthz.AsSystemRestricted(ctx)
	done := make(chan struct{})

	// Use time.Nanosecond to force an initial tick. It will be reset to the
	// correct duration after executing once.
	ticker := time.NewTicker(time.Nanosecond)
	go func() {
		defer close(done)
		defer ticker.Stop()

		// collect performs one full collection run. The cached gauges are
		// only committed at the very end, so a run that bails out early on
		// a database error does not commit any of its partial values.
		collect := func() {
			logger.Debug(ctx, "agent metrics collection is starting")
			timer := prometheus.NewTimer(metricsCollectorAgents)
			defer func() {
				logger.Debug(ctx, "agent metrics collection is done")
				timer.ObserveDuration()
				// Schedule the next run relative to the end of this one.
				ticker.Reset(duration)
			}()

			derpMap := derpMapFn()

			// Use a consistent value for now for the duration of this collection
			// to avoid drift during the loop over workspaceAgents, which can cause
			// incorrect reporting of agent connection status.
			now := dbtime.Now()

			workspaceAgents, err := db.GetWorkspaceAgentsForMetrics(ctx)
			if err != nil {
				logger.Error(ctx, "can't get workspace agents", slog.Error(err))
				return
			}

			// Prepopulate our known agents and apps before processing, this saves us from having to make a database
			// roundtrip for every iteration of the loop to get the list of apps for the current agent.
			agentIDs := make([]uuid.UUID, 0, len(workspaceAgents))
			for _, agent := range workspaceAgents {
				agentIDs = append(agentIDs, agent.WorkspaceAgent.ID)
			}
			allApps, err := db.GetWorkspaceAppsByAgentIDs(ctx, agentIDs)
			if err != nil {
				logger.Error(ctx, "can't get workspace apps", slog.Error(err))
				return
			}
			appsByAgentID := make(map[uuid.UUID][]database.WorkspaceApp, len(workspaceAgents))
			for _, app := range allApps {
				appsByAgentID[app.AgentID] = append(appsByAgentID[app.AgentID], app)
			}

			for _, agent := range workspaceAgents {
				// Collect information about agents
				templateVersionName := agent.TemplateVersionName.String
				if !agent.TemplateVersionName.Valid {
					templateVersionName = "unknown"
				}
				agentsGauge.WithLabelValues(VectorOperationAdd, 1, agent.OwnerUsername, agent.WorkspaceName, agent.TemplateName, templateVersionName)

				// Record first connection duration exactly once per agent.
				if agent.WorkspaceAgent.FirstConnectedAt.Valid {
					if _, alreadyObserved := observedFirstConnection[agent.WorkspaceAgent.ID]; !alreadyObserved {
						// Named distinctly from the refresh interval
						// `duration`, which the deferred ticker.Reset above
						// still needs.
						firstConnectSeconds := agent.WorkspaceAgent.FirstConnectedAt.Time.Sub(agent.WorkspaceAgent.CreatedAt).Seconds()
						if firstConnectSeconds < 0 {
							logger.Warn(ctx, "negative agent first connection duration (possible clock skew); dropping sample",
								slog.F("agent_id", agent.WorkspaceAgent.ID),
								slog.F("created_at", agent.WorkspaceAgent.CreatedAt),
								slog.F("first_connected_at", agent.WorkspaceAgent.FirstConnectedAt.Time),
								slog.F("duration_s", firstConnectSeconds),
							)
						} else {
							agentsFirstConnectionHistogram.WithLabelValues(
								agent.TemplateName,
								agent.WorkspaceAgent.Name,
								agent.OwnerUsername,
								agent.WorkspaceName,
							).Observe(firstConnectSeconds)
						}
						// Mark dropped samples as observed too, so the
						// warning is not repeated on every tick.
						observedFirstConnection[agent.WorkspaceAgent.ID] = struct{}{}
					}
				}

				connectionStatus := agent.WorkspaceAgent.Status(now, agentInactiveDisconnectTimeout)
				node := (*coordinator.Load()).Node(agent.WorkspaceAgent.ID)

				tailnetNode := "unknown"
				if node != nil {
					tailnetNode = node.ID.String()
				}

				agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.WorkspaceAgent.Name, agent.OwnerUsername, agent.WorkspaceName, string(connectionStatus.Status), string(agent.WorkspaceAgent.LifecycleState), tailnetNode)

				if node == nil {
					logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.WorkspaceAgent.ID))
				} else {
					// Collect information about connection latencies
					for rawRegion, latency := range node.DERPLatency {
						// The numeric region ID is whatever precedes the
						// first "-" in the key (the whole key if there is
						// no "-").
						rawRegionID, _, _ := strings.Cut(rawRegion, "-")
						regionID, err := strconv.Atoi(rawRegionID)
						if err != nil {
							logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.WorkspaceAgent.ID), slog.F("raw_region", rawRegion), slog.Error(err))
							continue
						}

						region, found := derpMap.Regions[regionID]
						if !found {
							// It's possible that a workspace agent is using an old DERPMap
							// and reports regions that do not exist. If that's the case,
							// report the region as unknown!
							region = &tailcfg.DERPRegion{
								RegionID:   regionID,
								RegionName: fmt.Sprintf("Unnamed %d", regionID),
							}
						}

						agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.WorkspaceAgent.Name, agent.OwnerUsername, agent.WorkspaceName, region.RegionName, strconv.FormatBool(node.PreferredDERP == regionID))
					}
				}

				// Collect information about registered applications
				for _, app := range appsByAgentID[agent.WorkspaceAgent.ID] {
					agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.WorkspaceAgent.Name, agent.OwnerUsername, agent.WorkspaceName, app.DisplayName, string(app.Health))
				}
			}

			// Prune observed agents that are no longer in the
			// current fetch to prevent unbounded memory growth.
			{
				currentAgentIDs := make(map[uuid.UUID]struct{}, len(workspaceAgents))
				for _, agent := range workspaceAgents {
					currentAgentIDs[agent.WorkspaceAgent.ID] = struct{}{}
				}
				for id := range observedFirstConnection {
					if _, exists := currentAgentIDs[id]; !exists {
						delete(observedFirstConnection, id)
					}
				}
			}

			agentsGauge.Commit()
			agentsConnectionsGauge.Commit()
			agentsConnectionLatenciesGauge.Commit()
			agentsAppsGauge.Commit()
		}

		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}
			collect()
		}
	}()
	return func() {
		cancelFunc()
		<-done
	}, nil
}
// AgentStats starts a background collector that publishes per-agent
// statistics as the coderd_agentstats_* gauges (tx/rx bytes, connection
// count, median connection latency and per-client session counts), plus the
// coderd_prometheusmetrics_agentstats_execution_seconds histogram timing each
// run.
//
// The gauges are labeled by aggregateByLabels (agentmetrics.LabelAgentStats
// when empty) after filterAcceptableAgentLabels has been applied. When usage
// is true the rows come from GetWorkspaceAgentUsageStatsAndLabels, otherwise
// from GetWorkspaceAgentStatsAndLabels. Each run passes the start time of the
// previous run (initialCreateAfter on the first run) to the query; that
// checkpoint advances even when the query fails. The gauges are committed
// only when the query returned at least one row.
//
// The first run happens almost immediately after the call and then repeats
// every duration (defaultRefreshRate when duration is zero), measured from
// the end of the previous run. The returned function cancels the collector
// and blocks until its goroutine has exited. A non-nil error means one of the
// metrics could not be registered with registerer.
//
// nolint:revive // This will be removed alongside the workspaceusage experiment
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string, usage bool) (func(), error) {
	if duration == 0 {
		duration = defaultRefreshRate
	}
	if len(aggregateByLabels) == 0 {
		aggregateByLabels = agentmetrics.LabelAgentStats
	}
	aggregateByLabels = filterAcceptableAgentLabels(aggregateByLabels)

	executionSeconds := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "coderd",
		Subsystem: "prometheusmetrics",
		Name:      "agentstats_execution_seconds",
		Help:      "Histogram for duration of agent stats metrics collection in seconds.",
		Buckets:   []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
	})
	if err := registerer.Register(executionSeconds); err != nil {
		return nil, err
	}

	// statsGaugeVec builds an (unregistered) coderd_agentstats_<name> gauge
	// vector labeled by aggregateByLabels.
	statsGaugeVec := func(name, help string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Namespace: "coderd",
			Subsystem: "agentstats",
			Name:      name,
			Help:      help,
		}, aggregateByLabels)
	}

	txBytes := NewCachedGaugeVec(statsGaugeVec("tx_bytes", "Agent Tx bytes"))
	if err := registerer.Register(txBytes); err != nil {
		return nil, err
	}

	rxBytes := NewCachedGaugeVec(statsGaugeVec("rx_bytes", "Agent Rx bytes"))
	if err := registerer.Register(rxBytes); err != nil {
		return nil, err
	}

	connectionCount := NewCachedGaugeVec(statsGaugeVec("connection_count", "The number of established connections by agent"))
	if err := registerer.Register(connectionCount); err != nil {
		return nil, err
	}

	medianLatency := NewCachedGaugeVec(statsGaugeVec("connection_median_latency_seconds", "The median agent connection latency in seconds"))
	if err := registerer.Register(medianLatency); err != nil {
		return nil, err
	}

	sessionsJetBrains := NewCachedGaugeVec(statsGaugeVec("session_count_jetbrains", "The number of session established by JetBrains"))
	if err := registerer.Register(sessionsJetBrains); err != nil {
		return nil, err
	}

	sessionsReconnectingPTY := NewCachedGaugeVec(statsGaugeVec("session_count_reconnecting_pty", "The number of session established by reconnecting PTY"))
	if err := registerer.Register(sessionsReconnectingPTY); err != nil {
		return nil, err
	}

	sessionsSSH := NewCachedGaugeVec(statsGaugeVec("session_count_ssh", "The number of session established by SSH"))
	if err := registerer.Register(sessionsSSH); err != nil {
		return nil, err
	}

	sessionsVSCode := NewCachedGaugeVec(statsGaugeVec("session_count_vscode", "The number of session established by VSCode"))
	if err := registerer.Register(sessionsVSCode); err != nil {
		return nil, err
	}

	ctx, cancel := context.WithCancel(ctx)
	stopped := make(chan struct{})
	since := initialCreateAfter

	// fetch loads the rows for the current window from whichever query the
	// usage flag selects, converting usage rows to the common row type.
	fetch := func() ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) {
		if !usage {
			return db.GetWorkspaceAgentStatsAndLabels(ctx, since)
		}
		usageRows, err := db.GetWorkspaceAgentUsageStatsAndLabels(ctx, since)
		rows := make([]database.GetWorkspaceAgentStatsAndLabelsRow, 0, len(usageRows))
		for _, usageRow := range usageRows {
			rows = append(rows, database.GetWorkspaceAgentStatsAndLabelsRow(usageRow))
		}
		return rows, err
	}

	// publish stages every row into the cached gauges and commits them,
	// unless there were no rows at all.
	publish := func(rows []database.GetWorkspaceAgentStatsAndLabelsRow) {
		for _, row := range rows {
			// Label values must follow the order of aggregateByLabels.
			var values []string
			for _, label := range aggregateByLabels {
				switch label {
				case agentmetrics.LabelUsername:
					values = append(values, row.Username)
				case agentmetrics.LabelWorkspaceName:
					values = append(values, row.WorkspaceName)
				case agentmetrics.LabelAgentName:
					values = append(values, row.AgentName)
				}
			}

			rxBytes.WithLabelValues(VectorOperationAdd, float64(row.RxBytes), values...)
			txBytes.WithLabelValues(VectorOperationAdd, float64(row.TxBytes), values...)

			connectionCount.WithLabelValues(VectorOperationSet, float64(row.ConnectionCount), values...)
			medianLatency.WithLabelValues(VectorOperationSet, row.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, values...)

			sessionsJetBrains.WithLabelValues(VectorOperationSet, float64(row.SessionCountJetBrains), values...)
			sessionsReconnectingPTY.WithLabelValues(VectorOperationSet, float64(row.SessionCountReconnectingPTY), values...)
			sessionsSSH.WithLabelValues(VectorOperationSet, float64(row.SessionCountSSH), values...)
			sessionsVSCode.WithLabelValues(VectorOperationSet, float64(row.SessionCountVSCode), values...)
		}

		if len(rows) == 0 {
			return
		}

		rxBytes.Commit()
		txBytes.Commit()
		connectionCount.Commit()
		medianLatency.Commit()
		sessionsJetBrains.Commit()
		sessionsReconnectingPTY.Commit()
		sessionsSSH.Commit()
		sessionsVSCode.Commit()
	}

	// Use time.Nanosecond to force an initial tick. It will be reset to the
	// correct duration after executing once.
	ticker := time.NewTicker(time.Nanosecond)
	go func() {
		defer close(stopped)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}

			logger.Debug(ctx, "agent metrics collection is starting")
			timer := prometheus.NewTimer(executionSeconds)

			// Taken before the query so the next window starts where this
			// one began.
			checkpoint := time.Now()

			rows, err := fetch()
			if err == nil {
				publish(rows)
			} else {
				logger.Error(ctx, "can't get agent stats", slog.Error(err))
			}

			logger.Debug(ctx, "agent metrics collection is done", slog.F("len", len(rows)))
			timer.ObserveDuration()

			since = checkpoint
			ticker.Reset(duration)
		}
	}()

	return func() {
		cancel()
		<-stopped
	}, nil
}
// Experiments registers the coderd_experiments gauge vector with registerer
// and sets one series per entry of codersdk.ExperimentsSafe: 1 when that
// experiment also appears in active, 0 otherwise. It returns the registration
// error, if any.
func Experiments(registerer prometheus.Registerer, active codersdk.Experiments) error {
	gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Name:      "experiments",
		Help:      "Indicates whether each experiment is enabled (1) or not (0)",
	}, []string{"experiment"})
	err := registerer.Register(gauge)
	if err != nil {
		return err
	}

safeExperiments:
	for _, candidate := range codersdk.ExperimentsSafe {
		series := gauge.WithLabelValues(string(candidate))
		for i := range active {
			if active[i] == candidate {
				series.Set(1)
				continue safeExperiments
			}
		}
		// Not found among the active experiments.
		series.Set(0)
	}

	return nil
}
// filterAcceptableAgentLabels returns a new slice holding labels, in order,
// with every occurrence of agentmetrics.LabelTemplateName removed.
//
// `prometheus-aggregate-agent-stats-by` controls which labels agent stats are
// aggregated on, but the metrics in this file have no `template` label value
// to report, so that label has to be excluded from the acceptable set.
func filterAcceptableAgentLabels(labels []string) []string {
	accepted := make([]string, 0, len(labels))
	for i := range labels {
		if labels[i] == agentmetrics.LabelTemplateName {
			continue
		}
		accepted = append(accepted, labels[i])
	}
	return accepted
}