mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
3e46c7986f
Moves the `coderd_agents_first_connection_seconds` histogram from the polling-based `prometheusmetrics.Agents()` loop to the event-driven `agentConnectionMonitor.init()` path. The metric is now recorded exactly once when an agent first connects over the RPC websocket, instead of being retroactively computed each polling tick. The `username` and `workspace_name` labels are removed to reduce cardinality; only `template_name` and `agent_name` are retained. Adds unit tests covering both the happy path (first connection recorded) and the negative-duration guard (clock skew logs a warning, no sample emitted).
690 lines
22 KiB
Go
690 lines
22 KiB
Go
package prometheusmetrics
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"errors"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"golang.org/x/xerrors"
|
|
"tailscale.com/tailcfg"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/coderd/agentmetrics"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
"github.com/coder/coder/v2/coderd/database/dbtime"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
"github.com/coder/coder/v2/tailnet"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
// defaultRefreshRate is the base collection interval that the collectors in
// this file fall back to when they are given a zero duration. Users scales it
// by five because its metric does not need to track real time closely.
const defaultRefreshRate = time.Minute
|
|
|
|
// ActiveUsers tracks the number of users that have authenticated within the past hour.
|
|
func ActiveUsers(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
|
|
if duration == 0 {
|
|
duration = defaultRefreshRate
|
|
}
|
|
|
|
gauge := prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "api",
|
|
Name: "active_users_duration_hour",
|
|
Help: "The number of users that have been active within the last hour.",
|
|
})
|
|
err := registerer.Register(gauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ctx, cancelFunc := context.WithCancel(ctx)
|
|
done := make(chan struct{})
|
|
ticker := time.NewTicker(duration)
|
|
go func() {
|
|
defer close(done)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
}
|
|
|
|
apiKeys, err := db.GetAPIKeysLastUsedAfter(ctx, dbtime.Now().Add(-1*time.Hour))
|
|
if err != nil {
|
|
logger.Error(ctx, "get api keys for active users prometheus metric", slog.Error(err))
|
|
continue
|
|
}
|
|
distinctUsers := map[uuid.UUID]struct{}{}
|
|
for _, apiKey := range apiKeys {
|
|
distinctUsers[apiKey.UserID] = struct{}{}
|
|
}
|
|
gauge.Set(float64(len(distinctUsers)))
|
|
}
|
|
}()
|
|
return func() {
|
|
cancelFunc()
|
|
<-done
|
|
}, nil
|
|
}
|
|
|
|
// Users tracks the total number of registered users, partitioned by status.
|
|
func Users(ctx context.Context, logger slog.Logger, clk quartz.Clock, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
|
|
if duration == 0 {
|
|
// It's not super important this tracks real-time.
|
|
duration = defaultRefreshRate * 5
|
|
}
|
|
|
|
gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "api",
|
|
Name: "total_user_count",
|
|
Help: "The total number of registered users, partitioned by status.",
|
|
}, []string{"status"})
|
|
err := registerer.Register(gauge)
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("register total_user_count gauge: %w", err)
|
|
}
|
|
|
|
ctx, cancelFunc := context.WithCancel(ctx)
|
|
done := make(chan struct{})
|
|
ticker := clk.NewTicker(duration)
|
|
go func() {
|
|
defer close(done)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
}
|
|
|
|
gauge.Reset()
|
|
//nolint:gocritic // This is a system service that needs full access
|
|
//to the users table.
|
|
users, err := db.GetUsers(dbauthz.AsSystemRestricted(ctx), database.GetUsersParams{})
|
|
if err != nil {
|
|
logger.Error(ctx, "get all users for prometheus metrics", slog.Error(err))
|
|
continue
|
|
}
|
|
|
|
for _, user := range users {
|
|
gauge.WithLabelValues(string(user.Status)).Inc()
|
|
}
|
|
}
|
|
}()
|
|
return func() {
|
|
cancelFunc()
|
|
<-done
|
|
}, nil
|
|
}
|
|
|
|
// Workspaces tracks the total number of workspaces with labels on status.
|
|
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
|
|
if duration == 0 {
|
|
duration = defaultRefreshRate
|
|
}
|
|
|
|
workspaceLatestBuildTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "api",
|
|
Name: "workspace_latest_build",
|
|
Help: "The current number of workspace builds by status for all non-deleted workspaces.",
|
|
}, []string{"status"})
|
|
if err := registerer.Register(workspaceLatestBuildTotals); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
workspaceLatestBuildStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Name: "workspace_latest_build_status",
|
|
Help: "The current workspace statuses by template, transition, and owner for all non-deleted workspaces.",
|
|
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"})
|
|
if err := registerer.Register(workspaceLatestBuildStatuses); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
workspaceCreationTotal := prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "coderd",
|
|
Name: "workspace_creation_total",
|
|
Help: "Total regular (non-prebuilt) workspace creations by organization, template, and preset.",
|
|
},
|
|
[]string{"organization_name", "template_name", "preset_name"},
|
|
)
|
|
if err := registerer.Register(workspaceCreationTotal); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ctx, cancelFunc := context.WithCancel(ctx)
|
|
done := make(chan struct{})
|
|
|
|
updateWorkspaceMetrics := func() {
|
|
// Don't count deleted workspaces as part of these metrics.
|
|
ws, err := db.GetWorkspacesForWorkspaceMetrics(ctx)
|
|
if err != nil {
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
workspaceLatestBuildTotals.Reset()
|
|
workspaceLatestBuildStatuses.Reset()
|
|
} else {
|
|
logger.Warn(ctx, "failed to load active workspaces for metrics", slog.Error(err))
|
|
}
|
|
return
|
|
}
|
|
|
|
workspaceLatestBuildTotals.Reset()
|
|
workspaceLatestBuildStatuses.Reset()
|
|
|
|
for _, w := range ws {
|
|
status := string(w.LatestBuildStatus)
|
|
workspaceLatestBuildTotals.WithLabelValues(status).Add(1)
|
|
|
|
workspaceLatestBuildStatuses.WithLabelValues(
|
|
status,
|
|
w.TemplateName,
|
|
w.TemplateVersionName.String,
|
|
w.OwnerUsername,
|
|
string(w.LatestBuildTransition),
|
|
).Add(1)
|
|
}
|
|
|
|
// Update regular workspaces (without a prebuild transition) creation counter
|
|
regularWorkspaces, err := db.GetRegularWorkspaceCreateMetrics(ctx)
|
|
if err != nil {
|
|
if errors.Is(err, sql.ErrNoRows) {
|
|
workspaceCreationTotal.Reset()
|
|
} else {
|
|
logger.Warn(ctx, "failed to load regular workspaces for metrics", slog.Error(err))
|
|
}
|
|
return
|
|
}
|
|
|
|
workspaceCreationTotal.Reset()
|
|
|
|
for _, regularWorkspace := range regularWorkspaces {
|
|
workspaceCreationTotal.WithLabelValues(
|
|
regularWorkspace.OrganizationName,
|
|
regularWorkspace.TemplateName,
|
|
regularWorkspace.PresetName,
|
|
).Add(float64(regularWorkspace.CreatedCount))
|
|
}
|
|
}
|
|
|
|
// Use time.Nanosecond to force an initial tick. It will be reset to the
|
|
// correct duration after executing once.
|
|
ticker := time.NewTicker(time.Nanosecond)
|
|
doTick := func() {
|
|
defer ticker.Reset(duration)
|
|
|
|
updateWorkspaceMetrics()
|
|
}
|
|
|
|
go func() {
|
|
defer close(done)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
doTick()
|
|
}
|
|
}
|
|
}()
|
|
return func() {
|
|
cancelFunc()
|
|
<-done
|
|
}, nil
|
|
}
|
|
|
|
// Agents tracks the total number of workspaces with labels on status.
|
|
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) {
|
|
if duration == 0 {
|
|
duration = defaultRefreshRate
|
|
}
|
|
|
|
agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agents",
|
|
Name: "up",
|
|
Help: "The number of active agents per workspace.",
|
|
}, []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelTemplateName, "template_version"}))
|
|
err := registerer.Register(agentsGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agents",
|
|
Name: "connections",
|
|
Help: "Agent connections with statuses.",
|
|
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "status", "lifecycle_state", "tailnet_node"}))
|
|
err = registerer.Register(agentsConnectionsGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agents",
|
|
Name: "connection_latencies_seconds",
|
|
Help: "Agent connection latencies in seconds.",
|
|
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "derp_region", "preferred"}))
|
|
err = registerer.Register(agentsConnectionLatenciesGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agents",
|
|
Name: "apps",
|
|
Help: "Agent applications with statuses.",
|
|
}, []string{agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, "app_name", "health"}))
|
|
err = registerer.Register(agentsAppsGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "prometheusmetrics",
|
|
Name: "agents_execution_seconds",
|
|
Help: "Histogram for duration of agents metrics collection in seconds.",
|
|
Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
|
|
})
|
|
err = registerer.Register(metricsCollectorAgents)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ctx, cancelFunc := context.WithCancel(ctx)
|
|
// nolint:gocritic // Prometheus must collect metrics for all Coder users.
|
|
ctx = dbauthz.AsSystemRestricted(ctx)
|
|
done := make(chan struct{})
|
|
|
|
// Use time.Nanosecond to force an initial tick. It will be reset to the
|
|
// correct duration after executing once.
|
|
ticker := time.NewTicker(time.Nanosecond)
|
|
go func() {
|
|
defer close(done)
|
|
defer ticker.Stop()
|
|
|
|
collect := func() {
|
|
logger.Debug(ctx, "agent metrics collection is starting")
|
|
timer := prometheus.NewTimer(metricsCollectorAgents)
|
|
defer func() {
|
|
logger.Debug(ctx, "agent metrics collection is done")
|
|
timer.ObserveDuration()
|
|
ticker.Reset(duration)
|
|
}()
|
|
|
|
derpMap := derpMapFn()
|
|
|
|
// Use a consistent value for now for the duration of this collection
|
|
// to avoid drift during the loop over workspaceAgents, which can cause
|
|
// incorrect reporting of agent connection status.
|
|
now := dbtime.Now()
|
|
|
|
workspaceAgents, err := db.GetWorkspaceAgentsForMetrics(ctx)
|
|
if err != nil {
|
|
logger.Error(ctx, "can't get workspace agents", slog.Error(err))
|
|
return
|
|
}
|
|
|
|
// Prepopulate our known agents and apps before processing, this saves us from having to make a database
|
|
// roundtrip for every iteration of the loop to get the list of apps for the current agent.
|
|
agentIDs := make([]uuid.UUID, 0, len(workspaceAgents))
|
|
for _, agent := range workspaceAgents {
|
|
agentIDs = append(agentIDs, agent.WorkspaceAgent.ID)
|
|
}
|
|
allApps, err := db.GetWorkspaceAppsByAgentIDs(ctx, agentIDs)
|
|
if err != nil {
|
|
logger.Error(ctx, "can't get workspace apps", slog.Error(err))
|
|
return
|
|
}
|
|
appsByAgentID := make(map[uuid.UUID][]database.WorkspaceApp, len(workspaceAgents))
|
|
for _, app := range allApps {
|
|
appsByAgentID[app.AgentID] = append(appsByAgentID[app.AgentID], app)
|
|
}
|
|
|
|
for _, agent := range workspaceAgents {
|
|
// Collect information about agents
|
|
templateVersionName := agent.TemplateVersionName.String
|
|
if !agent.TemplateVersionName.Valid {
|
|
templateVersionName = "unknown"
|
|
}
|
|
agentsGauge.WithLabelValues(VectorOperationAdd, 1, agent.OwnerUsername, agent.WorkspaceName, agent.TemplateName, templateVersionName)
|
|
|
|
connectionStatus := agent.WorkspaceAgent.Status(now, agentInactiveDisconnectTimeout)
|
|
node := (*coordinator.Load()).Node(agent.WorkspaceAgent.ID)
|
|
|
|
tailnetNode := "unknown"
|
|
if node != nil {
|
|
tailnetNode = node.ID.String()
|
|
}
|
|
|
|
agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.WorkspaceAgent.Name, agent.OwnerUsername, agent.WorkspaceName, string(connectionStatus.Status), string(agent.WorkspaceAgent.LifecycleState), tailnetNode)
|
|
|
|
if node == nil {
|
|
logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.WorkspaceAgent.ID))
|
|
} else {
|
|
// Collect information about connection latencies
|
|
for rawRegion, latency := range node.DERPLatency {
|
|
regionParts := strings.SplitN(rawRegion, "-", 2)
|
|
regionID, err := strconv.Atoi(regionParts[0])
|
|
if err != nil {
|
|
logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.WorkspaceAgent.ID), slog.F("raw_region", rawRegion), slog.Error(err))
|
|
continue
|
|
}
|
|
|
|
region, found := derpMap.Regions[regionID]
|
|
if !found {
|
|
// It's possible that a workspace agent is using an old DERPMap
|
|
// and reports regions that do not exist. If that's the case,
|
|
// report the region as unknown!
|
|
region = &tailcfg.DERPRegion{
|
|
RegionID: regionID,
|
|
RegionName: fmt.Sprintf("Unnamed %d", regionID),
|
|
}
|
|
}
|
|
|
|
agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.WorkspaceAgent.Name, agent.OwnerUsername, agent.WorkspaceName, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID))
|
|
}
|
|
}
|
|
|
|
// Collect information about registered applications
|
|
for _, app := range appsByAgentID[agent.WorkspaceAgent.ID] {
|
|
agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.WorkspaceAgent.Name, agent.OwnerUsername, agent.WorkspaceName, app.DisplayName, string(app.Health))
|
|
}
|
|
}
|
|
|
|
agentsGauge.Commit()
|
|
agentsConnectionsGauge.Commit()
|
|
agentsConnectionLatenciesGauge.Commit()
|
|
agentsAppsGauge.Commit()
|
|
}
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
}
|
|
collect()
|
|
}
|
|
}()
|
|
return func() {
|
|
cancelFunc()
|
|
<-done
|
|
}, nil
|
|
}
|
|
|
|
// nolint:revive // This will be removed alongside the workspaceusage experiment
|
|
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string, usage bool) (func(), error) {
|
|
if duration == 0 {
|
|
duration = defaultRefreshRate
|
|
}
|
|
|
|
if len(aggregateByLabels) == 0 {
|
|
aggregateByLabels = agentmetrics.LabelAgentStats
|
|
}
|
|
|
|
aggregateByLabels = filterAcceptableAgentLabels(aggregateByLabels)
|
|
|
|
metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "prometheusmetrics",
|
|
Name: "agentstats_execution_seconds",
|
|
Help: "Histogram for duration of agent stats metrics collection in seconds.",
|
|
Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
|
|
})
|
|
err := registerer.Register(metricsCollectorAgentStats)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentStatsTxBytesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agentstats",
|
|
Name: "tx_bytes",
|
|
Help: "Agent Tx bytes",
|
|
}, aggregateByLabels))
|
|
err = registerer.Register(agentStatsTxBytesGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentStatsRxBytesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agentstats",
|
|
Name: "rx_bytes",
|
|
Help: "Agent Rx bytes",
|
|
}, aggregateByLabels))
|
|
err = registerer.Register(agentStatsRxBytesGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentStatsConnectionCountGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agentstats",
|
|
Name: "connection_count",
|
|
Help: "The number of established connections by agent",
|
|
}, aggregateByLabels))
|
|
err = registerer.Register(agentStatsConnectionCountGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentStatsConnectionMedianLatencyGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agentstats",
|
|
Name: "connection_median_latency_seconds",
|
|
Help: "The median agent connection latency in seconds",
|
|
}, aggregateByLabels))
|
|
err = registerer.Register(agentStatsConnectionMedianLatencyGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentStatsSessionCountJetBrainsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agentstats",
|
|
Name: "session_count_jetbrains",
|
|
Help: "The number of session established by JetBrains",
|
|
}, aggregateByLabels))
|
|
err = registerer.Register(agentStatsSessionCountJetBrainsGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentStatsSessionCountReconnectingPTYGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agentstats",
|
|
Name: "session_count_reconnecting_pty",
|
|
Help: "The number of session established by reconnecting PTY",
|
|
}, aggregateByLabels))
|
|
err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentStatsSessionCountSSHGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agentstats",
|
|
Name: "session_count_ssh",
|
|
Help: "The number of session established by SSH",
|
|
}, aggregateByLabels))
|
|
err = registerer.Register(agentStatsSessionCountSSHGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
agentStatsSessionCountVSCodeGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "agentstats",
|
|
Name: "session_count_vscode",
|
|
Help: "The number of session established by VSCode",
|
|
}, aggregateByLabels))
|
|
err = registerer.Register(agentStatsSessionCountVSCodeGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ctx, cancelFunc := context.WithCancel(ctx)
|
|
done := make(chan struct{})
|
|
|
|
createdAfter := initialCreateAfter
|
|
// Use time.Nanosecond to force an initial tick. It will be reset to the
|
|
// correct duration after executing once.
|
|
ticker := time.NewTicker(time.Nanosecond)
|
|
go func() {
|
|
defer close(done)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
}
|
|
|
|
logger.Debug(ctx, "agent metrics collection is starting")
|
|
timer := prometheus.NewTimer(metricsCollectorAgentStats)
|
|
|
|
checkpoint := time.Now()
|
|
var (
|
|
stats []database.GetWorkspaceAgentStatsAndLabelsRow
|
|
err error
|
|
)
|
|
if usage {
|
|
var agentUsageStats []database.GetWorkspaceAgentUsageStatsAndLabelsRow
|
|
agentUsageStats, err = db.GetWorkspaceAgentUsageStatsAndLabels(ctx, createdAfter)
|
|
stats = make([]database.GetWorkspaceAgentStatsAndLabelsRow, 0, len(agentUsageStats))
|
|
for _, agentUsageStat := range agentUsageStats {
|
|
stats = append(stats, database.GetWorkspaceAgentStatsAndLabelsRow(agentUsageStat))
|
|
}
|
|
} else {
|
|
stats, err = db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter)
|
|
}
|
|
if err != nil {
|
|
logger.Error(ctx, "can't get agent stats", slog.Error(err))
|
|
} else {
|
|
for _, agentStat := range stats {
|
|
var labelValues []string
|
|
for _, label := range aggregateByLabels {
|
|
switch label {
|
|
case agentmetrics.LabelUsername:
|
|
labelValues = append(labelValues, agentStat.Username)
|
|
case agentmetrics.LabelWorkspaceName:
|
|
labelValues = append(labelValues, agentStat.WorkspaceName)
|
|
case agentmetrics.LabelAgentName:
|
|
labelValues = append(labelValues, agentStat.AgentName)
|
|
}
|
|
}
|
|
|
|
agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), labelValues...)
|
|
agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), labelValues...)
|
|
|
|
agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), labelValues...)
|
|
agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, labelValues...)
|
|
|
|
agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), labelValues...)
|
|
agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), labelValues...)
|
|
agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), labelValues...)
|
|
agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), labelValues...)
|
|
}
|
|
|
|
if len(stats) > 0 {
|
|
agentStatsRxBytesGauge.Commit()
|
|
agentStatsTxBytesGauge.Commit()
|
|
|
|
agentStatsConnectionCountGauge.Commit()
|
|
agentStatsConnectionMedianLatencyGauge.Commit()
|
|
|
|
agentStatsSessionCountJetBrainsGauge.Commit()
|
|
agentStatsSessionCountReconnectingPTYGauge.Commit()
|
|
agentStatsSessionCountSSHGauge.Commit()
|
|
agentStatsSessionCountVSCodeGauge.Commit()
|
|
}
|
|
}
|
|
|
|
logger.Debug(ctx, "agent metrics collection is done", slog.F("len", len(stats)))
|
|
timer.ObserveDuration()
|
|
|
|
createdAfter = checkpoint
|
|
ticker.Reset(duration)
|
|
}
|
|
}()
|
|
return func() {
|
|
cancelFunc()
|
|
<-done
|
|
}, nil
|
|
}
|
|
|
|
// Experiments registers a metric which indicates whether each experiment is enabled or not.
|
|
func Experiments(registerer prometheus.Registerer, active codersdk.Experiments) error {
|
|
experimentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Name: "experiments",
|
|
Help: "Indicates whether each experiment is enabled (1) or not (0)",
|
|
}, []string{"experiment"})
|
|
if err := registerer.Register(experimentsGauge); err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, exp := range codersdk.ExperimentsSafe {
|
|
var val float64
|
|
for _, enabled := range active {
|
|
if exp == enabled {
|
|
val = 1
|
|
break
|
|
}
|
|
}
|
|
|
|
experimentsGauge.WithLabelValues(string(exp)).Set(val)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// BuildInfo registers a gauge which is always set to 1, with labels
|
|
// describing the running server version. This follows the common
|
|
// pattern used by Prometheus itself and many Go services.
|
|
func BuildInfo(registerer prometheus.Registerer, version, revision string) error {
|
|
gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Name: "build_info",
|
|
Help: "Describes the current build/version of the Coder server. Value is always 1.",
|
|
}, []string{"version", "revision"})
|
|
if err := registerer.Register(gauge); err != nil {
|
|
return err
|
|
}
|
|
|
|
gauge.WithLabelValues(version, revision).Set(1)
|
|
|
|
return nil
|
|
}
|
|
|
|
// filterAcceptableAgentLabels handles a slightly messy situation whereby `prometheus-aggregate-agent-stats-by` can control on
|
|
// which labels agent stats are aggregated, but for these specific metrics in this file there is no `template` label value,
|
|
// and therefore we have to exclude it from the list of acceptable labels.
|
|
func filterAcceptableAgentLabels(labels []string) []string {
|
|
out := make([]string, 0, len(labels))
|
|
for _, label := range labels {
|
|
if label != agentmetrics.LabelTemplateName {
|
|
out = append(out, label)
|
|
}
|
|
}
|
|
|
|
return out
|
|
}
|