fix: mark connecting agents as unhealthy instead of healthy (#24044)

## Problem

Workspaces showed as "Healthy" immediately after creation while the
agent was still downloading, starting, or connecting. If the agent never
connected, the workspace stayed "Healthy" for the entire connection
timeout (~120s), then abruptly flipped to "Unhealthy".

## Root cause

In `db2sdk.WorkspaceAgent`, the health switch had no case for
`WorkspaceAgentConnecting`. Agents in `connecting` status with a
non-`off` lifecycle (e.g. `created` after a fresh build) fell through to
the `default` case and were marked `Healthy = true`.

## Fix

Add an explicit case for `WorkspaceAgentConnecting` that sets `Healthy =
false` with reason `"agent has not yet connected"`. The case is placed
after the existing `!connected + off` case (which correctly catches
stopped agents as "not running") and before the `timeout`/`disconnected`
cases.

```
Status        + Lifecycle       → Health reason
──────────────────────────────────────────────────────
any !connected + off           → "agent is not running"
connecting    + created/starting → "agent has not yet connected"  ← NEW
timeout       + any            → "agent is taking too long to connect"
disconnected  + any            → "agent has lost connection"
connected     + start_error    → "agent startup script exited with an error"
connected     + shutting_down  → "agent is shutting down"
connected     + ready/starting → healthy
```

The frontend already handles this case — `getAgentHealthIssue()` returns
"Workspace agent is still connecting" with `severity: "info"` for
unhealthy workspaces with connecting agents.

## Test changes

- **Healthy test**: now actually connects the agent via `agenttest.New`
before asserting health (previously passed due to the bug).
- **New Connecting test**: verifies a never-connected agent is correctly
marked unhealthy.
- **Mixed health test**: connects a1 and waits for the mixed state
(`a1.Healthy && !workspace.Healthy`) to avoid a race where both agents
are initially connecting.
- **Sub-agent excluded test**: connects the parent agent and waits for
it to be healthy before creating the sub-agent.
- **TestWorkspaceAgent/Connect**: flipped assertion to `Health.Healthy
== false` for a `dbfake` agent that never connects.

<details>
<summary>Review notes</summary>

### Known follow-up

The `healthy:false` workspace search filter maps to `[disconnected,
timeout]` and does not include `connecting`. This is a pre-existing gap
that is now more consequential — a workspace unhealthy solely due to a
connecting agent won't appear in `healthy:false` results. Worth a
follow-up issue.

### Deep review findings addressed

| Finding | Severity | Status |
|---------|----------|--------|
| Mixed health test race (all 3 reviewers) | P2 | Fixed — tightened
`Eventually` condition |
| `TestWorkspaceAgent/Connect` assertion break | P1 | Fixed — flipped
assertion |
| CLI renders red for connecting agents | Obs | Acknowledged — design
trade-off, accurate but visually strong for transient state |
| Switch case ordering overlap | Obs | Documented with inline comment |

</details>

> 🤖 This PR was created with the help of Coder Agents, and needs a human
review. 🧑💻
This commit is contained in:
Atif Ali
2026-04-09 08:21:28 +00:00
committed by GitHub
parent f95a5202bf
commit 584c61acb5
5 changed files with 98 additions and 18 deletions
+69 -12
View File
@@ -213,6 +213,39 @@ func TestWorkspace(t *testing.T) {
t.Parallel()
t.Run("Healthy", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
authToken := uuid.NewString()
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionGraph: echo.ProvisionGraphWithAgent(authToken),
})
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, template.ID)
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
_ = agenttest.New(t, client.URL, authToken)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
var err error
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
workspace, err = client.Workspace(ctx, workspace.ID)
return assert.NoError(t, err) && workspace.Health.Healthy
}, testutil.IntervalMedium)
agent := workspace.LatestBuild.Resources[0].Agents[0]
assert.True(t, workspace.Health.Healthy)
assert.Equal(t, []uuid.UUID{}, workspace.Health.FailingAgents)
assert.True(t, agent.Health.Healthy)
assert.Empty(t, agent.Health.Reason)
})
t.Run("Connecting", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
@@ -247,10 +280,10 @@ func TestWorkspace(t *testing.T) {
agent := workspace.LatestBuild.Resources[0].Agents[0]
assert.True(t, workspace.Health.Healthy)
assert.Equal(t, []uuid.UUID{}, workspace.Health.FailingAgents)
assert.True(t, agent.Health.Healthy)
assert.Empty(t, agent.Health.Reason)
assert.False(t, workspace.Health.Healthy)
assert.Equal(t, []uuid.UUID{agent.ID}, workspace.Health.FailingAgents)
assert.False(t, agent.Health.Healthy)
assert.Equal(t, "agent has not yet connected", agent.Health.Reason)
})
t.Run("Unhealthy", func(t *testing.T) {
@@ -302,6 +335,7 @@ func TestWorkspace(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
a1AuthToken := uuid.NewString()
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionGraph: []*proto.Response{{
@@ -313,7 +347,9 @@ func TestWorkspace(t *testing.T) {
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Name: "a1",
Auth: &proto.Agent_Token{},
Auth: &proto.Agent_Token{
Token: a1AuthToken,
},
}, {
Id: uuid.NewString(),
Name: "a2",
@@ -330,13 +366,21 @@ func TestWorkspace(t *testing.T) {
workspace := coderdtest.CreateWorkspace(t, client, template.ID)
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
_ = agenttest.New(t, client.URL, a1AuthToken)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
var err error
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
workspace, err = client.Workspace(ctx, workspace.ID)
return assert.NoError(t, err) && !workspace.Health.Healthy
if err != nil {
return false
}
// Wait for the mixed state: a1 connected (healthy)
// and workspace unhealthy (because a2 timed out).
agent1 := workspace.LatestBuild.Resources[0].Agents[0]
return agent1.Health.Healthy && !workspace.Health.Healthy
}, testutil.IntervalMedium)
assert.False(t, workspace.Health.Healthy)
@@ -360,6 +404,7 @@ func TestWorkspace(t *testing.T) {
// disconnected, but this should not make the workspace unhealthy.
client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
authToken := uuid.NewString()
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionGraph: []*proto.Response{{
@@ -371,7 +416,9 @@ func TestWorkspace(t *testing.T) {
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Name: "parent",
Auth: &proto.Agent_Token{},
Auth: &proto.Agent_Token{
Token: authToken,
},
}},
}},
},
@@ -383,14 +430,23 @@ func TestWorkspace(t *testing.T) {
workspace := coderdtest.CreateWorkspace(t, client, template.ID)
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
_ = agenttest.New(t, client.URL, authToken)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
// Get the workspace and parent agent.
workspace, err := client.Workspace(ctx, workspace.ID)
require.NoError(t, err)
parentAgent := workspace.LatestBuild.Resources[0].Agents[0]
require.True(t, parentAgent.Health.Healthy, "parent agent should be healthy initially")
// Wait for the parent agent to connect and be healthy.
var parentAgent codersdk.WorkspaceAgent
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
var err error
workspace, err = client.Workspace(ctx, workspace.ID)
if err != nil {
return false
}
parentAgent = workspace.LatestBuild.Resources[0].Agents[0]
return parentAgent.Health.Healthy
}, testutil.IntervalMedium)
require.True(t, parentAgent.Health.Healthy, "parent agent should be healthy")
// Create a sub-agent with a short connection timeout so it becomes
// unhealthy quickly (simulating a devcontainer rebuild scenario).
@@ -404,6 +460,7 @@ func TestWorkspace(t *testing.T) {
// Wait for the sub-agent to become unhealthy due to timeout.
var subAgentUnhealthy bool
require.Eventually(t, func() bool {
var err error
workspace, err = client.Workspace(ctx, workspace.ID)
if err != nil {
return false