fix: fix false positive disconnected agent metric reporting (#24225)

We noticed during higher active workspace counts that the agent
connection metric, generated via a query to the database, would report a
relatively high amount of agents as disconnected. Somewhere between 5
and 20%. However, other metrics such as # of websocket connections would
suggest that all agent connections are healthy.

Looking at the `Agents` function in prometheus metrics, plus the query
execution time (not accounting for actual database RT time) revealed
that this reporting of agents as disconnected was almost certainly false
positives due to clock drift in the way we're generating the metric
values. At 10k metrics, with a p50 of 2ms and p99 of 5ms, the entire
`agents` function could take upwards of 50s to execute. Because we were
doing a query/database RT to query th apps for each agent individually,
and grabbing a `time.Now` value on each iteration of that loop, it's
likely the portion of agents that were reported as disconnected were
those that had last heartbeat the furthest in the past.

The fix here is to set a consistent `now` before fetching agent data to
avoid clock drift inflating the inactive timeout comparison, and replace
the per-agent app query N+1 with a single batched lookup to prevent loop
execution time from pushing agents over the disconnected threshold.

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Callum Styan
2026-04-14 22:23:06 -07:00
committed by GitHub
parent 0080bcbf33
commit 730edba87a
6 changed files with 59 additions and 30 deletions
+2 -1
View File
@@ -20,6 +20,7 @@ import (
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/externalauth/gitprovider"
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/coderd/rbac/policy"
@@ -522,7 +523,7 @@ func WorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordinator,
}
}
status := dbAgent.Status(agentInactiveDisconnectTimeout)
status := dbAgent.Status(dbtime.Now(), agentInactiveDisconnectTimeout)
workspaceAgent.Status = codersdk.WorkspaceAgentStatus(status.Status)
workspaceAgent.FirstConnectedAt = status.FirstConnectedAt
workspaceAgent.LastConnectedAt = status.LastConnectedAt
+3 -4
View File
@@ -15,7 +15,6 @@ import (
"golang.org/x/oauth2"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/coderd/rbac/policy"
)
@@ -624,7 +623,7 @@ type WorkspaceAgentConnectionStatus struct {
DisconnectedAt *time.Time `json:"disconnected_at"`
}
func (a WorkspaceAgent) Status(inactiveTimeout time.Duration) WorkspaceAgentConnectionStatus {
func (a WorkspaceAgent) Status(now time.Time, inactiveTimeout time.Duration) WorkspaceAgentConnectionStatus {
connectionTimeout := time.Duration(a.ConnectionTimeoutSeconds) * time.Second
status := WorkspaceAgentConnectionStatus{
@@ -643,7 +642,7 @@ func (a WorkspaceAgent) Status(inactiveTimeout time.Duration) WorkspaceAgentConn
switch {
case !a.FirstConnectedAt.Valid:
switch {
case connectionTimeout > 0 && dbtime.Now().Sub(a.CreatedAt) > connectionTimeout:
case connectionTimeout > 0 && now.Sub(a.CreatedAt) > connectionTimeout:
// If the agent took too long to connect the first time,
// mark it as timed out.
status.Status = WorkspaceAgentStatusTimeout
@@ -658,7 +657,7 @@ func (a WorkspaceAgent) Status(inactiveTimeout time.Duration) WorkspaceAgentConn
// If we've disconnected after our last connection, we know the
// agent is no longer connected.
status.Status = WorkspaceAgentStatusDisconnected
case dbtime.Now().Sub(a.LastConnectedAt.Time) > inactiveTimeout:
case now.Sub(a.LastConnectedAt.Time) > inactiveTimeout:
// The connection died without updating the last connected.
status.Status = WorkspaceAgentStatusDisconnected
// Client code needs an accurate disconnected at if the agent has been inactive.
+39 -19
View File
@@ -335,21 +335,43 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
go func() {
defer close(done)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
}
collect := func() {
logger.Debug(ctx, "agent metrics collection is starting")
timer := prometheus.NewTimer(metricsCollectorAgents)
defer func() {
logger.Debug(ctx, "agent metrics collection is done")
timer.ObserveDuration()
ticker.Reset(duration)
}()
derpMap := derpMapFn()
// Use a consistent value for now for the duration of this collection
// to avoid drift during the loop over workspaceAgents, which can cause
// incorrect reporting of agent connection status.
now := dbtime.Now()
workspaceAgents, err := db.GetWorkspaceAgentsForMetrics(ctx)
if err != nil {
logger.Error(ctx, "can't get workspace agents", slog.Error(err))
goto done
return
}
// Prepopulate our known agents and apps before processing, this saves us from having to make a database
// roundtrip for every iteration of the loop to get the list of apps for the current agent.
agentIDs := make([]uuid.UUID, 0, len(workspaceAgents))
for _, agent := range workspaceAgents {
agentIDs = append(agentIDs, agent.WorkspaceAgent.ID)
}
allApps, err := db.GetWorkspaceAppsByAgentIDs(ctx, agentIDs)
if err != nil {
logger.Error(ctx, "can't get workspace apps", slog.Error(err))
return
}
appsByAgentID := make(map[uuid.UUID][]database.WorkspaceApp, len(workspaceAgents))
for _, app := range allApps {
appsByAgentID[app.AgentID] = append(appsByAgentID[app.AgentID], app)
}
for _, agent := range workspaceAgents {
@@ -382,7 +404,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
observedFirstConnection[agent.WorkspaceAgent.ID] = struct{}{}
}
}
connectionStatus := agent.WorkspaceAgent.Status(agentInactiveDisconnectTimeout)
connectionStatus := agent.WorkspaceAgent.Status(now, agentInactiveDisconnectTimeout)
node := (*coordinator.Load()).Node(agent.WorkspaceAgent.ID)
tailnetNode := "unknown"
@@ -420,13 +442,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
}
// Collect information about registered applications
apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.WorkspaceAgent.ID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.WorkspaceAgent.ID), slog.Error(err))
continue
}
for _, app := range apps {
for _, app := range appsByAgentID[agent.WorkspaceAgent.ID] {
agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.WorkspaceAgent.Name, agent.OwnerUsername, agent.WorkspaceName, app.DisplayName, string(app.Health))
}
}
@@ -449,11 +465,15 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
agentsConnectionsGauge.Commit()
agentsConnectionLatenciesGauge.Commit()
agentsAppsGauge.Commit()
}
done:
logger.Debug(ctx, "agent metrics collection is done")
timer.ObserveDuration()
ticker.Reset(duration)
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
}
collect()
}
}()
return func() {
+1 -1
View File
@@ -231,7 +231,7 @@ func (p *DBTokenProvider) Issue(ctx context.Context, rw http.ResponseWriter, r *
}
// Check that the agent is online.
agentStatus := dbReq.Agent.Status(p.WorkspaceAgentInactiveTimeout)
agentStatus := dbReq.Agent.Status(dbtime.Now(), p.WorkspaceAgentInactiveTimeout)
if agentStatus.Status != database.WorkspaceAgentStatusConnected {
WriteWorkspaceAppOffline(p.Logger, p.DashboardURL, rw, r, &appReq, fmt.Sprintf("Agent state is %q, not %q", agentStatus.Status, database.WorkspaceAgentStatusConnected))
return nil, "", false
+2 -1
View File
@@ -14,6 +14,7 @@ import (
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/util/namesgenerator"
"github.com/coder/coder/v2/coderd/x/chatd/internal/agentselect"
"github.com/coder/coder/v2/codersdk"
@@ -442,7 +443,7 @@ func (o CreateWorkspaceOptions) checkExistingWorkspace(
)
selected = agents[0]
}
status := selected.Status(agentInactiveDisconnectTimeout)
status := selected.Status(dbtime.Now(), agentInactiveDisconnectTimeout)
result := map[string]any{
"created": false,
"workspace_name": ws.Name,
+12 -4
View File
@@ -75,10 +75,18 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error {
return xerrors.Errorf("create user: %w", err)
}
newUser := newUserAndToken.User
newUserClient := codersdk.New(r.client.URL,
codersdk.WithSessionToken(newUserAndToken.SessionToken),
codersdk.WithLogger(logger),
codersdk.WithLogBodies())
// Create a user client with an independent HTTP transport cloned from the
// runner's client. Using codersdk.New directly would inherit
// http.DefaultTransport, which is shared across all runners. That causes
// all user WebSocket connections to reuse the same TCP connection pool and
// land on the same coderd replica, concentrating load.
newUserClient, err := loadtestutil.DupClientCopyingHeaders(r.client, nil)
if err != nil {
return xerrors.Errorf("create user client: %w", err)
}
newUserClient.SetSessionToken(newUserAndToken.SessionToken)
newUserClient.SetLogger(logger)
newUserClient.SetLogBodies(true)
logger.Info(ctx, fmt.Sprintf("user %q created", newUser.Username), slog.F("id", newUser.ID.String()))