feat: add Prometheus metric for agent first connection duration (#24179)

## Summary

Add a `coderd_agents_first_connection_seconds` histogram metric that records
the duration from workspace agent creation to first connection. This fills an
observability gap: provisioner job timings and startup script metrics already
exist, but the agent connection phase (which can take several minutes) was not
exposed to Prometheus.

Closes https://github.com/coder/coder/issues/21282

## Changes

- **`coderd/prometheusmetrics/prometheusmetrics.go`** — Define and register a
  `HistogramVec` once during `Agents()` setup, then observe
  `first_connected_at - created_at` inside the existing polling loop exactly
  once per agent via a deduplication map, pruned each tick to prevent
  unbounded memory growth.
- **`coderd/prometheusmetrics/prometheusmetrics_test.go`** — Update
`TestAgents`
to set `first_connected_at` on the test agent and assert the histogram
is
  collected with correct labels, sample count, and sample sum.
- **`docs/admin/integrations/prometheus.md`**,
**`scripts/metricsdocgen/generated_metrics`** —
  Auto-generated documentation updates from `make gen`.

## Metric details

| Property | Value |
|---|---|
| Name | `coderd_agents_first_connection_seconds` |
| Type | histogram |
| Labels | `template_name`, `agent_name`, `username`, `workspace_name` |
| Buckets | 1s, 10s, 30s, 1m, 2m, 5m, 10m, 30m, 1h |

## Example PromQL

```promql
# P95 agent connection time by template
histogram_quantile(0.95,
  sum(rate(coderd_agents_first_connection_seconds_bucket[1h])) by (le, template_name)
)
```

<details>
<summary>Implementation notes</summary>

### Design decisions

- **Histogram over gauge**: Enables `histogram_quantile()` for
percentile queries.
- **Observe in `Agents()` polling loop**: All required data is already
fetched by
  `GetWorkspaceAgentsForMetrics()` — no new DB queries.
- **Dedup via `map[uuid.UUID]struct{}`**: Prevents re-observing the same
agent
  across polling ticks. Pruned each cycle to bound memory.
- **Buckets**: Aligned with
`coderd_provisionerd_workspace_build_timings_seconds`
  range (1s–1h).

### Overhead at scale (100k active workspaces)

The deduplication map (`observedFirstConnection`) and the per-tick pruning map
(`currentAgentIDs`) are both `map[uuid.UUID]struct{}`, i.e. keyed by a 16-byte
array. At 100k agents:

- **Memory**: ~2.25 MB persistent + ~2.25 MB transient per tick = **~4.5
MB peak**.
- **CPU**: ~25 ms of map operations per tick (one tick per minute) =
**<0.05% of one core**.

Both are negligible relative to the existing cost of the `Agents()` loop
(the DB
query, per-agent `GetWorkspaceAppsByAgentID` calls, and coordinator node
lookups
dominate).

</details>

> 🤖 Generated by Coder Agents
This commit is contained in:
J. Scott Miller
2026-04-14 12:00:46 -05:00
committed by GitHub
parent 6fb27c980d
commit 20b953a99d
4 changed files with 116 additions and 2 deletions
@@ -294,6 +294,18 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
return nil, err
}
agentsFirstConnectionHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "coderd",
Subsystem: "agents",
Name: "first_connection_seconds",
Help: "Duration from agent creation to first connection to the control plane in seconds.",
Buckets: []float64{1, 10, 30, 60, 120, 300, 600, 1800, 3600},
}, []string{agentmetrics.LabelTemplateName, agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName})
err = registerer.Register(agentsFirstConnectionHistogram)
if err != nil {
return nil, err
}
metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "coderd",
Subsystem: "prometheusmetrics",
@@ -306,6 +318,12 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
return nil, err
}
// observedFirstConnection tracks which agents have already had
// their first-connection duration recorded in the histogram.
// Each agent is observed exactly once; the map is pruned every
// tick to remove agents that no longer appear in the query.
observedFirstConnection := make(map[uuid.UUID]struct{})
ctx, cancelFunc := context.WithCancel(ctx)
// nolint:gocritic // Prometheus must collect metrics for all Coder users.
ctx = dbauthz.AsSystemRestricted(ctx)
@@ -342,6 +360,28 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
}
agentsGauge.WithLabelValues(VectorOperationAdd, 1, agent.OwnerUsername, agent.WorkspaceName, agent.TemplateName, templateVersionName)
// Record first connection duration exactly once per agent.
if agent.WorkspaceAgent.FirstConnectedAt.Valid {
if _, alreadyObserved := observedFirstConnection[agent.WorkspaceAgent.ID]; !alreadyObserved {
duration := agent.WorkspaceAgent.FirstConnectedAt.Time.Sub(agent.WorkspaceAgent.CreatedAt).Seconds()
if duration < 0 {
logger.Warn(ctx, "negative agent first connection duration (possible clock skew); dropping sample",
slog.F("agent_id", agent.WorkspaceAgent.ID),
slog.F("created_at", agent.WorkspaceAgent.CreatedAt),
slog.F("first_connected_at", agent.WorkspaceAgent.FirstConnectedAt.Time),
slog.F("duration_s", duration),
)
} else {
agentsFirstConnectionHistogram.WithLabelValues(
agent.TemplateName,
agent.WorkspaceAgent.Name,
agent.OwnerUsername,
agent.WorkspaceName,
).Observe(duration)
}
observedFirstConnection[agent.WorkspaceAgent.ID] = struct{}{}
}
}
connectionStatus := agent.WorkspaceAgent.Status(agentInactiveDisconnectTimeout)
node := (*coordinator.Load()).Node(agent.WorkspaceAgent.ID)
@@ -391,6 +431,20 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
}
}
// Prune observed agents that are no longer in the
// current fetch to prevent unbounded memory growth.
{
currentAgentIDs := make(map[uuid.UUID]struct{}, len(workspaceAgents))
for _, agent := range workspaceAgents {
currentAgentIDs[agent.WorkspaceAgent.ID] = struct{}{}
}
for id := range observedFirstConnection {
if _, exists := currentAgentIDs[id]; !exists {
delete(observedFirstConnection, id)
}
}
}
agentsGauge.Commit()
agentsConnectionsGauge.Commit()
agentsConnectionLatenciesGauge.Commit()
@@ -14,6 +14,7 @@ import (
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/tailcfg"
@@ -24,6 +25,7 @@ import (
"github.com/coder/coder/v2/coderd/agentmetrics"
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/database/dbtime"
@@ -568,6 +570,38 @@ func TestAgents(t *testing.T) {
workspace := coderdtest.CreateWorkspace(t, client, template.ID)
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
// Set first_connected_at on the agent so the first connection
// duration metric can be observed.
workspace = coderdtest.MustWorkspace(t, client, workspace.ID)
require.NotEmpty(t, workspace.LatestBuild.Resources)
var testAgentID uuid.UUID
var testAgentCreatedAt time.Time
for _, res := range workspace.LatestBuild.Resources {
for _, a := range res.Agents {
if a.Name == "testagent" {
testAgentID = a.ID
testAgentCreatedAt = a.CreatedAt
break
}
}
}
require.NotEqual(t, uuid.Nil, testAgentID, "testagent not found")
err := db.UpdateWorkspaceAgentConnectionByID(dbauthz.AsSystemRestricted(context.Background()), database.UpdateWorkspaceAgentConnectionByIDParams{
ID: testAgentID,
FirstConnectedAt: sql.NullTime{
Time: testAgentCreatedAt.Add(45 * time.Second),
Valid: true,
},
LastConnectedAt: sql.NullTime{
Time: testAgentCreatedAt.Add(45 * time.Second),
Valid: true,
},
DisconnectedAt: sql.NullTime{},
UpdatedAt: dbtime.Now(),
LastConnectedReplicaID: uuid.NullUUID{},
})
require.NoError(t, err)
// given
derpMap, _ := tailnettest.RunDERPAndSTUN(t)
derpMapFn := func() *tailcfg.DERPMap {
@@ -594,6 +628,7 @@ func TestAgents(t *testing.T) {
var agentsConnections bool
var agentsApps bool
var agentsExecutionInSeconds bool
var agentsFirstConnection bool
require.Eventually(t, func() bool {
metrics, err := registry.Gather()
assert.NoError(t, err)
@@ -614,7 +649,7 @@ func TestAgents(t *testing.T) {
case "coderd_agents_connections":
assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name
assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state
assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status
assert.Equal(t, "connected", metric.Metric[0].Label[2].GetValue()) // Status
assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node
assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username
assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name
@@ -630,11 +665,23 @@ func TestAgents(t *testing.T) {
agentsApps = true
case "coderd_prometheusmetrics_agents_execution_seconds":
agentsExecutionInSeconds = true
case "coderd_agents_first_connection_seconds":
for _, m := range metric.Metric {
if m.Histogram != nil && m.Histogram.GetSampleCount() > 0 {
assert.Equal(t, "testagent", getLabelValue(m, "agent_name"))
assert.Equal(t, template.Name, getLabelValue(m, "template_name"))
assert.Equal(t, "testuser", getLabelValue(m, "username"))
assert.Equal(t, workspace.Name, getLabelValue(m, "workspace_name"))
assert.Equal(t, uint64(1), m.Histogram.GetSampleCount())
assert.InDelta(t, 45.0, m.Histogram.GetSampleSum(), 1.0)
agentsFirstConnection = true
}
}
default:
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
}
}
return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds
return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds && agentsFirstConnection
}, testutil.WaitShort, testutil.IntervalFast)
}
@@ -1081,3 +1128,12 @@ func insertDeleted(t *testing.T, db database.Store, u database.User, org databas
})
require.NoError(t, err)
}
// getLabelValue looks up the label called name on metric m and returns its
// value. It returns the empty string when m carries no label with that name.
func getLabelValue(m *dto.Metric, name string) string {
	for i := range m.Label {
		if pair := m.Label[i]; pair.GetName() == name {
			return pair.GetValue()
		}
	}
	return ""
}
+1
View File
@@ -175,6 +175,7 @@ deployment. They will always be available from the agent.
| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` |
| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` |
| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` |
| `coderd_agents_first_connection_seconds` | histogram | Duration from agent creation to first connection to the control plane in seconds. | `agent_name` `template_name` `username` `workspace_name` |
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `template_version` `username` `workspace_name` |
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` |
+3
View File
@@ -157,6 +157,9 @@ coderd_agents_connection_latencies_seconds{agent_name="",username="",workspace_n
# HELP coderd_agents_connections Agent connections with statuses.
# TYPE coderd_agents_connections gauge
coderd_agents_connections{agent_name="",username="",workspace_name="",status="",lifecycle_state="",tailnet_node=""} 0
# HELP coderd_agents_first_connection_seconds Duration from agent creation to first connection to the control plane in seconds.
# TYPE coderd_agents_first_connection_seconds histogram
coderd_agents_first_connection_seconds{template_name="",agent_name="",username="",workspace_name=""} 0
# HELP coderd_agents_up The number of active agents per workspace.
# TYPE coderd_agents_up gauge
coderd_agents_up{username="",workspace_name="",template_name="",template_version=""} 0