diff --git a/coderd/coderd.go b/coderd/coderd.go index 6af641bfcf..9285d3033b 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -831,6 +831,7 @@ func New(options *Options) *API { if options.DeploymentValues.Prometheus.Enable { options.PrometheusRegistry.MustRegister(stn) api.lifecycleMetrics = agentapi.NewLifecycleMetrics(options.PrometheusRegistry) + api.workspaceAgentRPCMetrics = NewWorkspaceAgentRPCMetrics(options.PrometheusRegistry, options.Logger) } api.NetworkTelemetryBatcher = tailnet.NewNetworkTelemetryBatcher( quartz.NewReal(), @@ -2181,9 +2182,10 @@ type API struct { healthCheckCache atomic.Pointer[healthsdk.HealthcheckReport] healthCheckProgress healthcheck.Progress - statsReporter *workspacestats.Reporter - metadataBatcher *metadatabatcher.Batcher - lifecycleMetrics *agentapi.LifecycleMetrics + statsReporter *workspacestats.Reporter + metadataBatcher *metadatabatcher.Batcher + lifecycleMetrics *agentapi.LifecycleMetrics + workspaceAgentRPCMetrics *WorkspaceAgentRPCMetrics Acquirer *provisionerdserver.Acquirer // dbRolluper rolls up template usage stats from raw agent and app diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index ea3801230e..4e752753cd 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -294,18 +294,6 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } - agentsFirstConnectionHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "coderd", - Subsystem: "agents", - Name: "first_connection_seconds", - Help: "Duration from agent creation to first connection to the control plane in seconds.", - Buckets: []float64{1, 10, 30, 60, 120, 300, 600, 1800, 3600}, - }, []string{agentmetrics.LabelTemplateName, agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName}) - err = registerer.Register(agentsFirstConnectionHistogram) - if err != nil { - return nil, err - } - metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "prometheusmetrics", @@ -318,12 +306,6 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } - // observedFirstConnection tracks which agents have already had - // their first-connection duration recorded in the histogram. - // Each agent is observed exactly once; the map is pruned every - // tick to remove agents that no longer appear in the query. - observedFirstConnection := make(map[uuid.UUID]struct{}) - ctx, cancelFunc := context.WithCancel(ctx) // nolint:gocritic // Prometheus must collect metrics for all Coder users. ctx = dbauthz.AsSystemRestricted(ctx) @@ -382,28 +364,6 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } agentsGauge.WithLabelValues(VectorOperationAdd, 1, agent.OwnerUsername, agent.WorkspaceName, agent.TemplateName, templateVersionName) - // Record first connection duration exactly once per agent. - if agent.WorkspaceAgent.FirstConnectedAt.Valid { - if _, alreadyObserved := observedFirstConnection[agent.WorkspaceAgent.ID]; !alreadyObserved { - duration := agent.WorkspaceAgent.FirstConnectedAt.Time.Sub(agent.WorkspaceAgent.CreatedAt).Seconds() - if duration < 0 { - logger.Warn(ctx, "negative agent first connection duration (possible clock skew); dropping sample", - slog.F("agent_id", agent.WorkspaceAgent.ID), - slog.F("created_at", agent.WorkspaceAgent.CreatedAt), - slog.F("first_connected_at", agent.WorkspaceAgent.FirstConnectedAt.Time), - slog.F("duration_s", duration), - ) - } else { - agentsFirstConnectionHistogram.WithLabelValues( - agent.TemplateName, - agent.WorkspaceAgent.Name, - agent.OwnerUsername, - agent.WorkspaceName, - ).Observe(duration) - } - observedFirstConnection[agent.WorkspaceAgent.ID] = struct{}{} - } - } connectionStatus := agent.WorkspaceAgent.Status(now, agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.WorkspaceAgent.ID) @@ -447,20 +407,6 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } } - // Prune observed agents that are no longer in the - // current fetch to prevent unbounded memory growth. - { - currentAgentIDs := make(map[uuid.UUID]struct{}, len(workspaceAgents)) - for _, agent := range workspaceAgents { - currentAgentIDs[agent.WorkspaceAgent.ID] = struct{}{} - } - for id := range observedFirstConnection { - if _, exists := currentAgentIDs[id]; !exists { - delete(observedFirstConnection, id) - } - } - } - agentsGauge.Commit() agentsConnectionsGauge.Commit() agentsConnectionLatenciesGauge.Commit() diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index e6a55a8a1b..03bd12f4ee 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -14,7 +14,6 @@ import ( "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "tailscale.com/tailcfg" @@ -25,7 +24,6 @@ import ( "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/database/dbtime" @@ -570,38 +568,6 @@ func TestAgents(t *testing.T) { workspace := coderdtest.CreateWorkspace(t, client, template.ID) coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) - // Set first_connected_at on the agent so the first connection - // duration metric can be observed. - workspace = coderdtest.MustWorkspace(t, client, workspace.ID) - require.NotEmpty(t, workspace.LatestBuild.Resources) - var testAgentID uuid.UUID - var testAgentCreatedAt time.Time - for _, res := range workspace.LatestBuild.Resources { - for _, a := range res.Agents { - if a.Name == "testagent" { - testAgentID = a.ID - testAgentCreatedAt = a.CreatedAt - break - } - } - } - require.NotEqual(t, uuid.Nil, testAgentID, "testagent not found") - err := db.UpdateWorkspaceAgentConnectionByID(dbauthz.AsSystemRestricted(context.Background()), database.UpdateWorkspaceAgentConnectionByIDParams{ - ID: testAgentID, - FirstConnectedAt: sql.NullTime{ - Time: testAgentCreatedAt.Add(45 * time.Second), - Valid: true, - }, - LastConnectedAt: sql.NullTime{ - Time: testAgentCreatedAt.Add(45 * time.Second), - Valid: true, - }, - DisconnectedAt: sql.NullTime{}, - UpdatedAt: dbtime.Now(), - LastConnectedReplicaID: uuid.NullUUID{}, - }) - require.NoError(t, err) - // given derpMap, _ := tailnettest.RunDERPAndSTUN(t) derpMapFn := func() *tailcfg.DERPMap { @@ -628,7 +594,6 @@ func TestAgents(t *testing.T) { var agentsConnections bool var agentsApps bool var agentsExecutionInSeconds bool - var agentsFirstConnection bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -649,7 +614,7 @@ func TestAgents(t *testing.T) { case "coderd_agents_connections": assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state - assert.Equal(t, "connected", metric.Metric[0].Label[2].GetValue()) // Status + assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name @@ -665,23 +630,11 @@ func TestAgents(t *testing.T) { agentsApps = true case "coderd_prometheusmetrics_agents_execution_seconds": agentsExecutionInSeconds = true - case "coderd_agents_first_connection_seconds": - for _, m := range metric.Metric { - if m.Histogram != nil && m.Histogram.GetSampleCount() > 0 { - assert.Equal(t, "testagent", getLabelValue(m, "agent_name")) - assert.Equal(t, template.Name, getLabelValue(m, "template_name")) - assert.Equal(t, "testuser", getLabelValue(m, "username")) - assert.Equal(t, workspace.Name, getLabelValue(m, "workspace_name")) - assert.Equal(t, uint64(1), m.Histogram.GetSampleCount()) - assert.InDelta(t, 45.0, m.Histogram.GetSampleSum(), 1.0) - agentsFirstConnection = true - } - } default: require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } - return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds && agentsFirstConnection + return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds }, testutil.WaitShort, testutil.IntervalFast) } @@ -1155,12 +1108,3 @@ func insertDeleted(t *testing.T, db database.Store, u database.User, org databas }) require.NoError(t, err) } - -func getLabelValue(m *dto.Metric, name string) string { - for _, l := range m.Label { - if l.GetName() == name { - return l.GetValue() - } - } - return "" -} diff --git a/coderd/workspaceagentsrpc.go b/coderd/workspaceagentsrpc.go index 842a512f44..433f1f572b 100644 --- a/coderd/workspaceagentsrpc.go +++ b/coderd/workspaceagentsrpc.go @@ -12,6 +12,7 @@ import ( "github.com/google/uuid" "github.com/hashicorp/yamux" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/xerrors" "cdr.dev/slog/v3" @@ -258,6 +259,7 @@ func (api *API) startAgentYamuxMonitor(ctx context.Context, replicaID: api.ID, updater: api, disconnectTimeout: api.AgentInactiveDisconnectTimeout, + metrics: api.workspaceAgentRPCMetrics, logger: api.Logger.With( slog.F("workspace_id", workspaceBuild.WorkspaceID), slog.F("agent_id", workspaceAgent.ID), @@ -291,6 +293,7 @@ type agentConnectionMonitor struct { updater workspaceUpdater logger slog.Logger pingPeriod time.Duration + metrics *WorkspaceAgentRPCMetrics // state manipulated by both sendPings() and monitor() goroutines: needs to be threadsafe lastPing atomic.Pointer[time.Time] @@ -356,6 +359,14 @@ func (m *agentConnectionMonitor) init() { Time: now, Valid: true, } + if m.metrics != nil { + duration := now.Sub(m.workspaceAgent.CreatedAt) + m.metrics.ObserveAgentFirstConnection( + duration, + m.workspace.TemplateName, + m.workspaceAgent.Name, + ) + } } m.lastConnectedAt = sql.NullTime{ Time: now, @@ -496,3 +507,50 @@ func checkBuildIsLatest(ctx context.Context, db database.Store, build database.W } return nil } + +// WorkspaceAgentRPCMetrics holds Prometheus metrics for the agent +// connection monitor. It is nil when Prometheus is not enabled. +type WorkspaceAgentRPCMetrics struct { + logger slog.Logger + FirstConnectionDuration *prometheus.HistogramVec +} + +// NewWorkspaceAgentRPCMetrics creates and registers agent connection +// metrics. +func NewWorkspaceAgentRPCMetrics(reg prometheus.Registerer, logger slog.Logger) *WorkspaceAgentRPCMetrics { + m := &WorkspaceAgentRPCMetrics{ + logger: logger, + FirstConnectionDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "first_connection_seconds", + Help: "Duration from agent creation to first connection in seconds.", + Buckets: []float64{1, 10, 30, 60, 120, 300, 600, 1800, 3600}, + }, []string{"template_name", "agent_name"}), + } + reg.MustRegister(m.FirstConnectionDuration) + return m +} + +// ObserveAgentFirstConnection records the duration from agent creation +// to first connection. Negative durations are logged as warnings and +// not recorded, since they indicate clock skew. +func (m *WorkspaceAgentRPCMetrics) ObserveAgentFirstConnection( + duration time.Duration, + templateName string, + agentName string, +) { + if duration < 0 { + m.logger.Warn(context.Background(), + "negative agent first connection duration, possible clock skew", + slog.F("template_name", templateName), + slog.F("agent_name", agentName), + slog.F("duration", duration), + ) + return + } + m.FirstConnectionDuration.WithLabelValues( + templateName, + agentName, + ).Observe(duration.Seconds()) +} diff --git a/coderd/workspaceagentsrpc_internal_test.go b/coderd/workspaceagentsrpc_internal_test.go index 1cbc66e49c..fffe13a902 100644 --- a/coderd/workspaceagentsrpc_internal_test.go +++ b/coderd/workspaceagentsrpc_internal_test.go @@ -9,9 +9,13 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" + promtest "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" "go.uber.org/mock/gomock" + "cdr.dev/slog/v3" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbmock" "github.com/coder/coder/v2/coderd/database/dbtime" @@ -338,6 +342,79 @@ func TestAgentConnectionMonitor_StartClose(t *testing.T) { _ = testutil.TryReceive(ctx, t, closed) } +func TestAgentConnectionMonitor_FirstConnectionMetric(t *testing.T) { + t.Parallel() + const metricName = "coderd_agents_first_connection_seconds" + + t.Run("records metric on first connection", func(t *testing.T) { + t.Parallel() + reg := prometheus.NewRegistry() + logger := testutil.Logger(t) + metrics := NewWorkspaceAgentRPCMetrics(reg, logger) + + createdAt := dbtime.Now().Add(-30 * time.Second) + uut := &agentConnectionMonitor{ + workspace: database.Workspace{ + TemplateName: "my-template", + }, + workspaceAgent: database.WorkspaceAgent{ + Name: "main", + CreatedAt: createdAt, + // FirstConnectedAt is zero-value (not valid), + // so init() treats this as the first connection. + }, + metrics: metrics, + } + uut.init() + + require.Equal(t, 1, + promtest.CollectAndCount(metrics.FirstConnectionDuration, metricName)) + + // Verify the observed sum reflects the duration since CreatedAt. + // testutil has no helper for reading histogram sums, so extract + // the sample via dto.Metric directly. + var observed dto.Metric + require.NoError(t, metrics.FirstConnectionDuration. + WithLabelValues("my-template", "main").(prometheus.Histogram). + Write(&observed)) + require.EqualValues(t, 1, observed.GetHistogram().GetSampleCount()) + require.GreaterOrEqual(t, observed.GetHistogram().GetSampleSum(), float64(30)) + }) + + t.Run("skips metric and logs warning if duration is negative", func(t *testing.T) { + t.Parallel() + reg := prometheus.NewRegistry() + sink := testutil.NewFakeSink(t) + metrics := NewWorkspaceAgentRPCMetrics(reg, sink.Logger()) + + // Set CreatedAt in the future so the duration is negative, + // simulating clock skew. + uut := &agentConnectionMonitor{ + workspace: database.Workspace{ + TemplateName: "my-template", + }, + workspaceAgent: database.WorkspaceAgent{ + Name: "main", + CreatedAt: dbtime.Now().Add(time.Minute), + }, + metrics: metrics, + } + uut.init() + + // The negative-duration path skips the observation, so the + // histogram should have no recorded label combinations. + require.Equal(t, 0, + promtest.CollectAndCount(metrics.FirstConnectionDuration, metricName)) + + // Verify that a warning was logged. + warnings := sink.Entries(func(e slog.SinkEntry) bool { + return e.Level == slog.LevelWarn + }) + require.Len(t, warnings, 1) + require.Contains(t, warnings[0].Message, "negative agent first connection duration") + }) +} + type fakePingerCloser struct { sync.Mutex pings []time.Time diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index 2cf8cd8014..210f22d040 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -175,7 +175,7 @@ deployment. They will always be available from the agent. | `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | | `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | | `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | -| `coderd_agents_first_connection_seconds` | histogram | Duration from agent creation to first connection to the control plane in seconds. | `agent_name` `template_name` `username` `workspace_name` | +| `coderd_agents_first_connection_seconds` | histogram | Duration from agent creation to first connection in seconds. | `agent_name` `template_name` | | `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `template_version` `username` `workspace_name` | | `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | | `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | diff --git a/scripts/metricsdocgen/generated_metrics b/scripts/metricsdocgen/generated_metrics index d7aa8fd182..c62709dd76 100644 --- a/scripts/metricsdocgen/generated_metrics +++ b/scripts/metricsdocgen/generated_metrics @@ -157,9 +157,9 @@ coderd_agents_connection_latencies_seconds{agent_name="",username="",workspace_n # HELP coderd_agents_connections Agent connections with statuses. # TYPE coderd_agents_connections gauge coderd_agents_connections{agent_name="",username="",workspace_name="",status="",lifecycle_state="",tailnet_node=""} 0 -# HELP coderd_agents_first_connection_seconds Duration from agent creation to first connection to the control plane in seconds. +# HELP coderd_agents_first_connection_seconds Duration from agent creation to first connection in seconds. # TYPE coderd_agents_first_connection_seconds histogram -coderd_agents_first_connection_seconds{template_name="",agent_name="",username="",workspace_name=""} 0 +coderd_agents_first_connection_seconds{template_name="",agent_name=""} 0 # HELP coderd_agents_up The number of active agents per workspace. # TYPE coderd_agents_up gauge coderd_agents_up{username="",workspace_name="",template_name="",template_version=""} 0