mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: add Prometheus metric for agent first connection duration (#24179)
## Summary

Add `coderd_agents_first_connection_seconds` histogram metric that records the duration from workspace agent creation to first connection. This fills an observability gap — provisioner job timings and startup script metrics exist, but the agent connection phase (which can take several minutes) was not exposed to Prometheus.

Closes https://github.com/coder/coder/issues/21282

## Changes

- **`coderd/prometheusmetrics/prometheusmetrics.go`** — Define and register a `HistogramVec` in the existing `Agents()` polling loop. Observe `first_connected_at - created_at` exactly once per agent via a deduplication map, pruned each tick to prevent unbounded memory growth.
- **`coderd/prometheusmetrics/prometheusmetrics_test.go`** — Update `TestAgents` to set `first_connected_at` on the test agent and assert the histogram is collected with correct labels, sample count, and sample sum.
- **`docs/admin/integrations/prometheus.md`**, **`scripts/metricsdocgen/generated_metrics`** — Auto-generated documentation updates from `make gen`.

## Metric details

| Property | Value |
|---|---|
| Name | `coderd_agents_first_connection_seconds` |
| Type | histogram |
| Labels | `template_name`, `agent_name`, `username`, `workspace_name` |
| Buckets | 1s, 10s, 30s, 1m, 2m, 5m, 10m, 30m, 1h |

## Example PromQL

```promql
# P95 agent connection time by template
histogram_quantile(0.95,
  sum(rate(coderd_agents_first_connection_seconds_bucket[1h])) by (le, template_name)
)
```

<details>
<summary>Implementation notes</summary>

### Design decisions

- **Histogram over gauge**: Enables `histogram_quantile()` for percentile queries.
- **Observe in `Agents()` polling loop**: All required data is already fetched by `GetWorkspaceAgentsForMetrics()` — no new DB queries.
- **Dedup via `map[uuid.UUID]struct{}`**: Prevents re-observing the same agent across polling ticks. Pruned each cycle to bound memory.
- **Buckets**: Aligned with `coderd_provisionerd_workspace_build_timings_seconds` range (1s–1h).
### Overhead at scale (100k active workspaces)

The deduplication map (`observedFirstConnection`) and per-tick pruning map (`currentAgentIDs`) are both `map[[16]byte]struct{}`. At 100k agents:

- **Memory**: ~2.25 MB persistent + ~2.25 MB transient per tick = **~4.5 MB peak**.
- **CPU**: ~25 ms of map operations per tick (one tick per minute) = **<0.05% of one core**.

Both are negligible relative to the existing cost of the `Agents()` loop (the DB query, per-agent `GetWorkspaceAppsByAgentID` calls, and coordinator node lookups dominate).

</details>

> 🤖 Generated by Coder Agents
This commit is contained in:
@@ -294,6 +294,18 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
||||
return nil, err
|
||||
}
|
||||
|
||||
agentsFirstConnectionHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: "coderd",
|
||||
Subsystem: "agents",
|
||||
Name: "first_connection_seconds",
|
||||
Help: "Duration from agent creation to first connection to the control plane in seconds.",
|
||||
Buckets: []float64{1, 10, 30, 60, 120, 300, 600, 1800, 3600},
|
||||
}, []string{agentmetrics.LabelTemplateName, agentmetrics.LabelAgentName, agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName})
|
||||
err = registerer.Register(agentsFirstConnectionHistogram)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{
|
||||
Namespace: "coderd",
|
||||
Subsystem: "prometheusmetrics",
|
||||
@@ -306,6 +318,12 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// observedFirstConnection tracks which agents have already had
|
||||
// their first-connection duration recorded in the histogram.
|
||||
// Each agent is observed exactly once; the map is pruned every
|
||||
// tick to remove agents that no longer appear in the query.
|
||||
observedFirstConnection := make(map[uuid.UUID]struct{})
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(ctx)
|
||||
// nolint:gocritic // Prometheus must collect metrics for all Coder users.
|
||||
ctx = dbauthz.AsSystemRestricted(ctx)
|
||||
@@ -342,6 +360,28 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
||||
}
|
||||
agentsGauge.WithLabelValues(VectorOperationAdd, 1, agent.OwnerUsername, agent.WorkspaceName, agent.TemplateName, templateVersionName)
|
||||
|
||||
// Record first connection duration exactly once per agent.
|
||||
if agent.WorkspaceAgent.FirstConnectedAt.Valid {
|
||||
if _, alreadyObserved := observedFirstConnection[agent.WorkspaceAgent.ID]; !alreadyObserved {
|
||||
duration := agent.WorkspaceAgent.FirstConnectedAt.Time.Sub(agent.WorkspaceAgent.CreatedAt).Seconds()
|
||||
if duration < 0 {
|
||||
logger.Warn(ctx, "negative agent first connection duration (possible clock skew); dropping sample",
|
||||
slog.F("agent_id", agent.WorkspaceAgent.ID),
|
||||
slog.F("created_at", agent.WorkspaceAgent.CreatedAt),
|
||||
slog.F("first_connected_at", agent.WorkspaceAgent.FirstConnectedAt.Time),
|
||||
slog.F("duration_s", duration),
|
||||
)
|
||||
} else {
|
||||
agentsFirstConnectionHistogram.WithLabelValues(
|
||||
agent.TemplateName,
|
||||
agent.WorkspaceAgent.Name,
|
||||
agent.OwnerUsername,
|
||||
agent.WorkspaceName,
|
||||
).Observe(duration)
|
||||
}
|
||||
observedFirstConnection[agent.WorkspaceAgent.ID] = struct{}{}
|
||||
}
|
||||
}
|
||||
connectionStatus := agent.WorkspaceAgent.Status(agentInactiveDisconnectTimeout)
|
||||
node := (*coordinator.Load()).Node(agent.WorkspaceAgent.ID)
|
||||
|
||||
@@ -391,6 +431,20 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
||||
}
|
||||
}
|
||||
|
||||
// Prune observed agents that are no longer in the
|
||||
// current fetch to prevent unbounded memory growth.
|
||||
{
|
||||
currentAgentIDs := make(map[uuid.UUID]struct{}, len(workspaceAgents))
|
||||
for _, agent := range workspaceAgents {
|
||||
currentAgentIDs[agent.WorkspaceAgent.ID] = struct{}{}
|
||||
}
|
||||
for id := range observedFirstConnection {
|
||||
if _, exists := currentAgentIDs[id]; !exists {
|
||||
delete(observedFirstConnection, id)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
agentsGauge.Commit()
|
||||
agentsConnectionsGauge.Commit()
|
||||
agentsConnectionLatenciesGauge.Commit()
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"tailscale.com/tailcfg"
|
||||
@@ -24,6 +25,7 @@ import (
|
||||
"github.com/coder/coder/v2/coderd/agentmetrics"
|
||||
"github.com/coder/coder/v2/coderd/coderdtest"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
||||
"github.com/coder/coder/v2/coderd/database/dbgen"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||
@@ -568,6 +570,38 @@ func TestAgents(t *testing.T) {
|
||||
workspace := coderdtest.CreateWorkspace(t, client, template.ID)
|
||||
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
|
||||
|
||||
// Set first_connected_at on the agent so the first connection
|
||||
// duration metric can be observed.
|
||||
workspace = coderdtest.MustWorkspace(t, client, workspace.ID)
|
||||
require.NotEmpty(t, workspace.LatestBuild.Resources)
|
||||
var testAgentID uuid.UUID
|
||||
var testAgentCreatedAt time.Time
|
||||
for _, res := range workspace.LatestBuild.Resources {
|
||||
for _, a := range res.Agents {
|
||||
if a.Name == "testagent" {
|
||||
testAgentID = a.ID
|
||||
testAgentCreatedAt = a.CreatedAt
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
require.NotEqual(t, uuid.Nil, testAgentID, "testagent not found")
|
||||
err := db.UpdateWorkspaceAgentConnectionByID(dbauthz.AsSystemRestricted(context.Background()), database.UpdateWorkspaceAgentConnectionByIDParams{
|
||||
ID: testAgentID,
|
||||
FirstConnectedAt: sql.NullTime{
|
||||
Time: testAgentCreatedAt.Add(45 * time.Second),
|
||||
Valid: true,
|
||||
},
|
||||
LastConnectedAt: sql.NullTime{
|
||||
Time: testAgentCreatedAt.Add(45 * time.Second),
|
||||
Valid: true,
|
||||
},
|
||||
DisconnectedAt: sql.NullTime{},
|
||||
UpdatedAt: dbtime.Now(),
|
||||
LastConnectedReplicaID: uuid.NullUUID{},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// given
|
||||
derpMap, _ := tailnettest.RunDERPAndSTUN(t)
|
||||
derpMapFn := func() *tailcfg.DERPMap {
|
||||
@@ -594,6 +628,7 @@ func TestAgents(t *testing.T) {
|
||||
var agentsConnections bool
|
||||
var agentsApps bool
|
||||
var agentsExecutionInSeconds bool
|
||||
var agentsFirstConnection bool
|
||||
require.Eventually(t, func() bool {
|
||||
metrics, err := registry.Gather()
|
||||
assert.NoError(t, err)
|
||||
@@ -614,7 +649,7 @@ func TestAgents(t *testing.T) {
|
||||
case "coderd_agents_connections":
|
||||
assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name
|
||||
assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state
|
||||
assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status
|
||||
assert.Equal(t, "connected", metric.Metric[0].Label[2].GetValue()) // Status
|
||||
assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node
|
||||
assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username
|
||||
assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name
|
||||
@@ -630,11 +665,23 @@ func TestAgents(t *testing.T) {
|
||||
agentsApps = true
|
||||
case "coderd_prometheusmetrics_agents_execution_seconds":
|
||||
agentsExecutionInSeconds = true
|
||||
case "coderd_agents_first_connection_seconds":
|
||||
for _, m := range metric.Metric {
|
||||
if m.Histogram != nil && m.Histogram.GetSampleCount() > 0 {
|
||||
assert.Equal(t, "testagent", getLabelValue(m, "agent_name"))
|
||||
assert.Equal(t, template.Name, getLabelValue(m, "template_name"))
|
||||
assert.Equal(t, "testuser", getLabelValue(m, "username"))
|
||||
assert.Equal(t, workspace.Name, getLabelValue(m, "workspace_name"))
|
||||
assert.Equal(t, uint64(1), m.Histogram.GetSampleCount())
|
||||
assert.InDelta(t, 45.0, m.Histogram.GetSampleSum(), 1.0)
|
||||
agentsFirstConnection = true
|
||||
}
|
||||
}
|
||||
default:
|
||||
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
|
||||
}
|
||||
}
|
||||
return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds
|
||||
return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds && agentsFirstConnection
|
||||
}, testutil.WaitShort, testutil.IntervalFast)
|
||||
}
|
||||
|
||||
@@ -1081,3 +1128,12 @@ func insertDeleted(t *testing.T, db database.Store, u database.User, org databas
|
||||
})
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func getLabelValue(m *dto.Metric, name string) string {
|
||||
for _, l := range m.Label {
|
||||
if l.GetName() == name {
|
||||
return l.GetValue()
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
@@ -175,6 +175,7 @@ deployment. They will always be available from the agent.
|
||||
| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` |
|
||||
| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` |
|
||||
| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` |
|
||||
| `coderd_agents_first_connection_seconds` | histogram | Duration from agent creation to first connection to the control plane in seconds. | `agent_name` `template_name` `username` `workspace_name` |
|
||||
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `template_version` `username` `workspace_name` |
|
||||
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
|
||||
| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` |
|
||||
|
||||
@@ -157,6 +157,9 @@ coderd_agents_connection_latencies_seconds{agent_name="",username="",workspace_n
|
||||
# HELP coderd_agents_connections Agent connections with statuses.
|
||||
# TYPE coderd_agents_connections gauge
|
||||
coderd_agents_connections{agent_name="",username="",workspace_name="",status="",lifecycle_state="",tailnet_node=""} 0
|
||||
# HELP coderd_agents_first_connection_seconds Duration from agent creation to first connection to the control plane in seconds.
|
||||
# TYPE coderd_agents_first_connection_seconds histogram
|
||||
coderd_agents_first_connection_seconds{template_name="",agent_name="",username="",workspace_name=""} 0
|
||||
# HELP coderd_agents_up The number of active agents per workspace.
|
||||
# TYPE coderd_agents_up gauge
|
||||
coderd_agents_up{username="",workspace_name="",template_name="",template_version=""} 0
|
||||
|
||||
Reference in New Issue
Block a user