From 584c61acb5cee91c5a262a69ec0dc87e68639319 Mon Sep 17 00:00:00 2001 From: Atif Ali Date: Thu, 9 Apr 2026 08:21:28 +0000 Subject: [PATCH] fix: mark connecting agents as unhealthy instead of healthy (#24044) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Workspaces showed as "Healthy" immediately after creation while the agent was still downloading, starting, or connecting. If the agent never connected, the workspace stayed "Healthy" for the entire connection timeout (~120s), then abruptly flipped to "Unhealthy". ## Root cause In `db2sdk.WorkspaceAgent`, the health switch had no case for `WorkspaceAgentConnecting`. Agents in `connecting` status with a non-`off` lifecycle (e.g. `created` after a fresh build) fell through to the `default` case and were marked `Healthy = true`. ## Fix Add an explicit case for `WorkspaceAgentConnecting` that sets `Healthy = false` with reason `"agent has not yet connected"`. The case is placed after the existing `!connected + off` case (which correctly catches stopped agents as "not running") and before the `timeout`/`disconnected` cases. ``` Status + Lifecycle → Health reason ────────────────────────────────────────────────────── any !connected + off → "agent is not running" connecting + created/starting → "agent has not yet connected" ← NEW timeout + any → "agent is taking too long to connect" disconnected + any → "agent has lost connection" connected + start_error → "agent startup script exited with an error" connected + shutting_down → "agent is shutting down" connected + ready/starting → healthy ``` The frontend already handles this case — `getAgentHealthIssue()` returns "Workspace agent is still connecting" with `severity: "info"` for unhealthy workspaces with connecting agents. ## Test changes - **Healthy test**: now actually connects the agent via `agenttest.New` before asserting health (previously passed due to the bug). - **New Connecting test**: verifies a never-connected agent is correctly marked unhealthy. - **Mixed health test**: connects a1 and waits for the mixed state (`a1.Healthy && !workspace.Healthy`) to avoid a race where both agents are initially connecting. - **Sub-agent excluded test**: connects the parent agent and waits for it to be healthy before creating the sub-agent. - **TestWorkspaceAgent/Connect**: flipped assertion to `Health.Healthy == false` for a `dbfake` agent that never connects.
Review notes ### Known follow-up The `healthy:false` workspace search filter maps to `[disconnected, timeout]` and does not include `connecting`. This is a pre-existing gap that is now more consequential — a workspace unhealthy solely due to a connecting agent won't appear in `healthy:false` results. Worth a follow-up issue. ### Deep review findings addressed | Finding | Severity | Status | |---------|----------|--------| | Mixed health test race (all 3 reviewers) | P2 | Fixed — tightened `Eventually` condition | | `TestWorkspaceAgent/Connect` assertion break | P1 | Fixed — flipped assertion | | CLI renders red for connecting agents | Obs | Acknowledged — design trade-off, accurate but visually strong for transient state | | Switch case ordering overlap | Obs | Documented with inline comment |
> 🤖 This PR was created with the help of Coder Agents, and needs a human review. 🧑💻 --- coderd/database/db2sdk/db2sdk.go | 6 ++ coderd/workspaceagents_test.go | 2 +- coderd/workspaces_test.go | 81 ++++++++++++++++++---- site/src/modules/workspaces/health.test.ts | 11 +-- site/src/modules/workspaces/health.ts | 16 +++++ 5 files changed, 98 insertions(+), 18 deletions(-) diff --git a/coderd/database/db2sdk/db2sdk.go b/coderd/database/db2sdk/db2sdk.go index ee595f26cd..2d6906d2f4 100644 --- a/coderd/database/db2sdk/db2sdk.go +++ b/coderd/database/db2sdk/db2sdk.go @@ -538,6 +538,12 @@ func WorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordinator, switch { case workspaceAgent.Status != codersdk.WorkspaceAgentConnected && workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleOff: workspaceAgent.Health.Reason = "agent is not running" + case workspaceAgent.Status == codersdk.WorkspaceAgentConnecting: + // Note: the case above catches connecting+off as "not running". + // This case handles connecting agents with a non-off lifecycle + // (e.g. "created" or "starting"), where the agent binary has + // not yet established a connection to coderd. + workspaceAgent.Health.Reason = "agent has not yet connected" case workspaceAgent.Status == codersdk.WorkspaceAgentTimeout: workspaceAgent.Health.Reason = "agent is taking too long to connect" case workspaceAgent.Status == codersdk.WorkspaceAgentDisconnected: diff --git a/coderd/workspaceagents_test.go b/coderd/workspaceagents_test.go index cb2f9167f4..8fffb1f5b1 100644 --- a/coderd/workspaceagents_test.go +++ b/coderd/workspaceagents_test.go @@ -91,7 +91,7 @@ func TestWorkspaceAgent(t *testing.T) { require.Equal(t, tmpDir, workspace.LatestBuild.Resources[0].Agents[0].Directory) _, err = anotherClient.WorkspaceAgent(ctx, workspace.LatestBuild.Resources[0].Agents[0].ID) require.NoError(t, err) - require.True(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy) + require.False(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy) }) t.Run("HasFallbackTroubleshootingURL", func(t *testing.T) { t.Parallel() diff --git a/coderd/workspaces_test.go b/coderd/workspaces_test.go index 04c41f255f..7f6b2559d2 100644 --- a/coderd/workspaces_test.go +++ b/coderd/workspaces_test.go @@ -213,6 +213,39 @@ func TestWorkspace(t *testing.T) { t.Parallel() t.Run("Healthy", func(t *testing.T) { + t.Parallel() + client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + user := coderdtest.CreateFirstUser(t, client) + authToken := uuid.NewString() + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionGraph: echo.ProvisionGraphWithAgent(authToken), + }) + coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, template.ID) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + + _ = agenttest.New(t, client.URL, authToken) + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + var err error + testutil.Eventually(ctx, t, func(ctx context.Context) bool { + workspace, err = client.Workspace(ctx, workspace.ID) + return assert.NoError(t, err) && workspace.Health.Healthy + }, testutil.IntervalMedium) + + agent := workspace.LatestBuild.Resources[0].Agents[0] + + assert.True(t, workspace.Health.Healthy) + assert.Equal(t, []uuid.UUID{}, workspace.Health.FailingAgents) + assert.True(t, agent.Health.Healthy) + assert.Empty(t, agent.Health.Reason) + }) + + t.Run("Connecting", func(t *testing.T) { t.Parallel() client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) user := coderdtest.CreateFirstUser(t, client) @@ -247,10 +280,10 @@ func TestWorkspace(t *testing.T) { agent := workspace.LatestBuild.Resources[0].Agents[0] - assert.True(t, workspace.Health.Healthy) - assert.Equal(t, []uuid.UUID{}, workspace.Health.FailingAgents) - assert.True(t, agent.Health.Healthy) - assert.Empty(t, agent.Health.Reason) + assert.False(t, workspace.Health.Healthy) + assert.Equal(t, []uuid.UUID{agent.ID}, workspace.Health.FailingAgents) + assert.False(t, agent.Health.Healthy) + assert.Equal(t, "agent has not yet connected", agent.Health.Reason) }) t.Run("Unhealthy", func(t *testing.T) { @@ -302,6 +335,7 @@ func TestWorkspace(t *testing.T) { t.Parallel() client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) user := coderdtest.CreateFirstUser(t, client) + a1AuthToken := uuid.NewString() version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ Parse: echo.ParseComplete, ProvisionGraph: []*proto.Response{{ @@ -313,7 +347,9 @@ func TestWorkspace(t *testing.T) { Agents: []*proto.Agent{{ Id: uuid.NewString(), Name: "a1", - Auth: &proto.Agent_Token{}, + Auth: &proto.Agent_Token{ + Token: a1AuthToken, + }, }, { Id: uuid.NewString(), Name: "a2", @@ -330,13 +366,21 @@ func TestWorkspace(t *testing.T) { workspace := coderdtest.CreateWorkspace(t, client, template.ID) coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + _ = agenttest.New(t, client.URL, a1AuthToken) + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) defer cancel() var err error testutil.Eventually(ctx, t, func(ctx context.Context) bool { workspace, err = client.Workspace(ctx, workspace.ID) - return assert.NoError(t, err) && !workspace.Health.Healthy + if err != nil { + return false + } + // Wait for the mixed state: a1 connected (healthy) + // and workspace unhealthy (because a2 timed out). + agent1 := workspace.LatestBuild.Resources[0].Agents[0] + return agent1.Health.Healthy && !workspace.Health.Healthy }, testutil.IntervalMedium) assert.False(t, workspace.Health.Healthy) @@ -360,6 +404,7 @@ func TestWorkspace(t *testing.T) { // disconnected, but this should not make the workspace unhealthy. client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) user := coderdtest.CreateFirstUser(t, client) + authToken := uuid.NewString() version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ Parse: echo.ParseComplete, ProvisionGraph: []*proto.Response{{ @@ -371,7 +416,9 @@ func TestWorkspace(t *testing.T) { Agents: []*proto.Agent{{ Id: uuid.NewString(), Name: "parent", - Auth: &proto.Agent_Token{}, + Auth: &proto.Agent_Token{ + Token: authToken, + }, }}, }}, }, @@ -383,14 +430,23 @@ func TestWorkspace(t *testing.T) { workspace := coderdtest.CreateWorkspace(t, client, template.ID) coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + _ = agenttest.New(t, client.URL, authToken) + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) defer cancel() - // Get the workspace and parent agent. - workspace, err := client.Workspace(ctx, workspace.ID) - require.NoError(t, err) - parentAgent := workspace.LatestBuild.Resources[0].Agents[0] - require.True(t, parentAgent.Health.Healthy, "parent agent should be healthy initially") + // Wait for the parent agent to connect and be healthy. + var parentAgent codersdk.WorkspaceAgent + testutil.Eventually(ctx, t, func(ctx context.Context) bool { + var err error + workspace, err = client.Workspace(ctx, workspace.ID) + if err != nil { + return false + } + parentAgent = workspace.LatestBuild.Resources[0].Agents[0] + return parentAgent.Health.Healthy + }, testutil.IntervalMedium) + require.True(t, parentAgent.Health.Healthy, "parent agent should be healthy") // Create a sub-agent with a short connection timeout so it becomes // unhealthy quickly (simulating a devcontainer rebuild scenario). @@ -404,6 +460,7 @@ func TestWorkspace(t *testing.T) { // Wait for the sub-agent to become unhealthy due to timeout. var subAgentUnhealthy bool require.Eventually(t, func() bool { + var err error workspace, err = client.Workspace(ctx, workspace.ID) if err != nil { return false diff --git a/site/src/modules/workspaces/health.test.ts b/site/src/modules/workspaces/health.test.ts index ed8e2d61a2..357a6ccadb 100644 --- a/site/src/modules/workspaces/health.test.ts +++ b/site/src/modules/workspaces/health.test.ts @@ -152,14 +152,15 @@ describe("getAgentHealthIssue", () => { }); }); - it("returns connecting issue as default fallback", () => { + it("returns connecting issue for a connecting agent", () => { const ws = buildWorkspace( [{ status: "connecting", lifecycle_state: "starting" }], 1, ); expect(getAgentHealthIssue(ws)).toEqual({ - title: "Workspace agent is still connecting", - detail: "Check the log output if the connection does not complete.", + title: "Workspace agent is connecting", + detail: + "The workspace agent has not connected yet. Wait for it to connect or check the logs if it does not.", severity: "info", prominent: false, }); @@ -227,7 +228,7 @@ describe("getAgentHealthIssue", () => { 2, ); const result = getAgentHealthIssue(ws); - expect(result.title).toBe("2 workspace agents are still connecting"); + expect(result.title).toBe("2 workspace agents are connecting"); }); it("uses singular title when only one agent is failing", () => { @@ -329,7 +330,7 @@ describe("getAgentHealthIssue", () => { 1, ); const result = getAgentHealthIssue(ws); - expect(result.title).toBe("Workspace agent is still connecting"); + expect(result.title).toBe("Workspace agent is connecting"); expect(result.severity).toBe("info"); }); diff --git a/site/src/modules/workspaces/health.ts b/site/src/modules/workspaces/health.ts index cebe53a9b8..88d8ff43b7 100644 --- a/site/src/modules/workspaces/health.ts +++ b/site/src/modules/workspaces/health.ts @@ -34,6 +34,11 @@ export const agentScriptMessages = { * process connecting to the Coder control plane). */ export const agentConnectionMessages = { + connecting: { + title: "Workspace agent is connecting", + detail: + "The workspace agent has not connected yet. Wait for it to connect or check the logs if it does not.", + }, timeout: { title: "Agent is taking longer than expected to connect", detail: @@ -155,6 +160,17 @@ export function getAgentHealthIssue(workspace: Workspace): AgentHealthIssue { }; } + if (statusSet.has("connecting")) { + return { + title: plural + ? `${failingAgentCount} workspace agents are connecting` + : agentConnectionMessages.connecting.title, + detail: agentConnectionMessages.connecting.detail, + severity: "info", + prominent: false, + }; + } + return { title: plural ? `${failingAgentCount} workspace agents are still connecting`