diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index fe40cb522c..76d7c5faaa 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -294,6 +294,18 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + agentsFirstConnectionHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "first_connection_seconds", + Help: "Duration from agent creation to first connection to the control plane in seconds.", + Buckets: []float64{1, 10, 30, 60, 120, 300, 600, 1800, 3600}, + }, []string{agentmetrics.LabelTemplateName, agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName}) + err = registerer.Register(agentsFirstConnectionHistogram) + if err != nil { + return nil, err + } + metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "prometheusmetrics", @@ -306,6 +318,12 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + // observedFirstConnection tracks which agents have already had + // their first-connection duration recorded in the histogram. + // Each agent is observed exactly once; the map is pruned every + // tick to remove agents that no longer appear in the query. + observedFirstConnection := make(map[uuid.UUID]struct{}) + ctx, cancelFunc := context.WithCancel(ctx) // nolint:gocritic // Prometheus must collect metrics for all Coder users. ctx = dbauthz.AsSystemRestricted(ctx) @@ -342,6 +360,28 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } agentsGauge.WithLabelValues(VectorOperationAdd, 1, agent.OwnerUsername, agent.WorkspaceName, agent.TemplateName, templateVersionName) + // Record first connection duration exactly once per agent. 
+ if agent.WorkspaceAgent.FirstConnectedAt.Valid { + if _, alreadyObserved := observedFirstConnection[agent.WorkspaceAgent.ID]; !alreadyObserved { + duration := agent.WorkspaceAgent.FirstConnectedAt.Time.Sub(agent.WorkspaceAgent.CreatedAt).Seconds() + if duration < 0 { + logger.Warn(ctx, "negative agent first connection duration (possible clock skew); dropping sample", + slog.F("agent_id", agent.WorkspaceAgent.ID), + slog.F("created_at", agent.WorkspaceAgent.CreatedAt), + slog.F("first_connected_at", agent.WorkspaceAgent.FirstConnectedAt.Time), + slog.F("duration_s", duration), + ) + } else { + agentsFirstConnectionHistogram.WithLabelValues( + agent.TemplateName, + agent.WorkspaceAgent.Name, + agent.OwnerUsername, + agent.WorkspaceName, + ).Observe(duration) + } + observedFirstConnection[agent.WorkspaceAgent.ID] = struct{}{} + } + } connectionStatus := agent.WorkspaceAgent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.WorkspaceAgent.ID) @@ -391,6 +431,20 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } } + // Prune observed agents that are no longer in the + // current fetch to prevent unbounded memory growth. 
+ { + currentAgentIDs := make(map[uuid.UUID]struct{}, len(workspaceAgents)) + for _, agent := range workspaceAgents { + currentAgentIDs[agent.WorkspaceAgent.ID] = struct{}{} + } + for id := range observedFirstConnection { + if _, exists := currentAgentIDs[id]; !exists { + delete(observedFirstConnection, id) + } + } + } + agentsGauge.Commit() agentsConnectionsGauge.Commit() agentsConnectionLatenciesGauge.Commit() diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index d762dd76f1..54903f2769 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -14,6 +14,7 @@ import ( "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "tailscale.com/tailcfg" @@ -24,6 +25,7 @@ import ( "github.com/coder/coder/v2/coderd/agentmetrics" "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/database/dbtime" @@ -568,6 +570,38 @@ func TestAgents(t *testing.T) { workspace := coderdtest.CreateWorkspace(t, client, template.ID) coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + // Set first_connected_at on the agent so the first connection + // duration metric can be observed. 
+ workspace = coderdtest.MustWorkspace(t, client, workspace.ID) + require.NotEmpty(t, workspace.LatestBuild.Resources) + var testAgentID uuid.UUID + var testAgentCreatedAt time.Time + for _, res := range workspace.LatestBuild.Resources { + for _, a := range res.Agents { + if a.Name == "testagent" { + testAgentID = a.ID + testAgentCreatedAt = a.CreatedAt + break + } + } + } + require.NotEqual(t, uuid.Nil, testAgentID, "testagent not found") + err := db.UpdateWorkspaceAgentConnectionByID(dbauthz.AsSystemRestricted(context.Background()), database.UpdateWorkspaceAgentConnectionByIDParams{ + ID: testAgentID, + FirstConnectedAt: sql.NullTime{ + Time: testAgentCreatedAt.Add(45 * time.Second), + Valid: true, + }, + LastConnectedAt: sql.NullTime{ + Time: testAgentCreatedAt.Add(45 * time.Second), + Valid: true, + }, + DisconnectedAt: sql.NullTime{}, + UpdatedAt: dbtime.Now(), + LastConnectedReplicaID: uuid.NullUUID{}, + }) + require.NoError(t, err) + // given derpMap, _ := tailnettest.RunDERPAndSTUN(t) derpMapFn := func() *tailcfg.DERPMap { @@ -594,6 +628,7 @@ func TestAgents(t *testing.T) { var agentsConnections bool var agentsApps bool var agentsExecutionInSeconds bool + var agentsFirstConnection bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -614,7 +649,7 @@ func TestAgents(t *testing.T) { case "coderd_agents_connections": assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state - assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status + assert.Equal(t, "connected", metric.Metric[0].Label[2].GetValue()) // Status assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name @@ -630,11 +665,23 @@ func 
TestAgents(t *testing.T) {
 				agentsApps = true
 			case "coderd_prometheusmetrics_agents_execution_seconds":
 				agentsExecutionInSeconds = true
+			case "coderd_agents_first_connection_seconds":
+				for _, m := range metric.Metric {
+					if m.Histogram != nil && m.Histogram.GetSampleCount() > 0 {
+						assert.Equal(t, "testagent", getLabelValue(m, "agent_name"))
+						assert.Equal(t, template.Name, getLabelValue(m, "template_name"))
+						assert.Equal(t, "testuser", getLabelValue(m, "username"))
+						assert.Equal(t, workspace.Name, getLabelValue(m, "workspace_name"))
+						assert.Equal(t, uint64(1), m.Histogram.GetSampleCount())
+						assert.InDelta(t, 45.0, m.Histogram.GetSampleSum(), 1.0)
+						agentsFirstConnection = true
+					}
+				}
 			default:
 				require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
 			}
 		}
-		return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds
+		return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds && agentsFirstConnection
 	}, testutil.WaitShort, testutil.IntervalFast)
 }
 
@@ -1081,3 +1128,14 @@ func insertDeleted(t *testing.T, db database.Store, u database.User, org databas
 	})
 	require.NoError(t, err)
 }
+
+// getLabelValue returns the value of the label called name on m, or the
+// empty string when m carries no such label.
+func getLabelValue(m *dto.Metric, name string) string {
+	for _, pair := range m.GetLabel() {
+		if pair.GetName() == name {
+			return pair.GetValue()
+		}
+	}
+	return ""
+}
diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md
index c9ab350b65..0c3154301e 100644
--- a/docs/admin/integrations/prometheus.md
+++ b/docs/admin/integrations/prometheus.md
@@ -175,6 +175,7 @@ deployment. They will always be available from the agent.
 | `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` |
 | `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` |
 | `coderd_agents_connections` | gauge | Agent connections with statuses. 
| `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_first_connection_seconds` | histogram | Duration from agent creation to first connection to the control plane in seconds. | `agent_name` `template_name` `username` `workspace_name` | | `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `template_version` `username` `workspace_name` | | `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | | `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | diff --git a/scripts/metricsdocgen/generated_metrics b/scripts/metricsdocgen/generated_metrics index fae3de129a..99a9638532 100644 --- a/scripts/metricsdocgen/generated_metrics +++ b/scripts/metricsdocgen/generated_metrics @@ -157,6 +157,9 @@ coderd_agents_connection_latencies_seconds{agent_name="",username="",workspace_n # HELP coderd_agents_connections Agent connections with statuses. # TYPE coderd_agents_connections gauge coderd_agents_connections{agent_name="",username="",workspace_name="",status="",lifecycle_state="",tailnet_node=""} 0 +# HELP coderd_agents_first_connection_seconds Duration from agent creation to first connection to the control plane in seconds. +# TYPE coderd_agents_first_connection_seconds histogram +coderd_agents_first_connection_seconds{template_name="",agent_name="",username="",workspace_name=""} 0 # HELP coderd_agents_up The number of active agents per workspace. # TYPE coderd_agents_up gauge coderd_agents_up{username="",workspace_name="",template_name="",template_version=""} 0