diff --git a/coderd/workspaces.go b/coderd/workspaces.go index a02c16ec89..a82b22c4ba 100644 --- a/coderd/workspaces.go +++ b/coderd/workspaces.go @@ -2598,6 +2598,13 @@ func convertWorkspace( failingAgents := []uuid.UUID{} for _, resource := range workspaceBuild.Resources { for _, agent := range resource.Agents { + // Sub-agents (e.g., devcontainer agents) are excluded from the + // workspace health calculation. Their health is managed by + // their parent agent, and temporary disconnections during + // devcontainer rebuilds should not affect workspace health. + if agent.ParentID.Valid { + continue + } if !agent.Health.Healthy { failingAgents = append(failingAgents, agent.ID) } diff --git a/coderd/workspaces_test.go b/coderd/workspaces_test.go index 4ab334222a..7d0a19ea64 100644 --- a/coderd/workspaces_test.go +++ b/coderd/workspaces_test.go @@ -346,6 +346,81 @@ func TestWorkspace(t *testing.T) { assert.False(t, agent2.Health.Healthy) assert.NotEmpty(t, agent2.Health.Reason) }) + + t.Run("Sub-agent excluded", func(t *testing.T) { + t.Parallel() + // This test verifies that sub-agents (e.g., devcontainer agents) + // are excluded from the workspace health calculation. When a + // devcontainer is rebuilding, the sub-agent may be temporarily + // disconnected, but this should not make the workspace unhealthy. + client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + user := coderdtest.CreateFirstUser(t, client) + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionApply: []*proto.Response{{ + Type: &proto.Response_Apply{ + Apply: &proto.ApplyComplete{ + Resources: []*proto.Resource{{ + Name: "some", + Type: "example", + Agents: []*proto.Agent{{ + Id: uuid.NewString(), + Name: "parent", + Auth: &proto.Agent_Token{}, + }}, + }}, + }, + }, + }}, + }) + coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, template.ID) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + // Get the workspace and parent agent. + workspace, err := client.Workspace(ctx, workspace.ID) + require.NoError(t, err) + parentAgent := workspace.LatestBuild.Resources[0].Agents[0] + require.True(t, parentAgent.Health.Healthy, "parent agent should be healthy initially") + + // Create a sub-agent with a short connection timeout so it becomes + // unhealthy quickly (simulating a devcontainer rebuild scenario). + subAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{ + ParentID: uuid.NullUUID{Valid: true, UUID: parentAgent.ID}, + ResourceID: parentAgent.ResourceID, + Name: "subagent", + ConnectionTimeoutSeconds: 1, + }) + + // Wait for the sub-agent to become unhealthy due to timeout. + var subAgentUnhealthy bool + require.Eventually(t, func() bool { + workspace, err = client.Workspace(ctx, workspace.ID) + if err != nil { + return false + } + for _, res := range workspace.LatestBuild.Resources { + for _, agent := range res.Agents { + if agent.ID == subAgent.ID && !agent.Health.Healthy { + subAgentUnhealthy = true + return true + } + } + } + return false + }, testutil.WaitShort, testutil.IntervalFast, "sub-agent should become unhealthy") + + require.True(t, subAgentUnhealthy, "sub-agent should be unhealthy") + + // Verify that the workspace is still healthy because sub-agents + // are excluded from the health calculation. + assert.True(t, workspace.Health.Healthy, "workspace should be healthy despite unhealthy sub-agent") + assert.Empty(t, workspace.Health.FailingAgents, "failing agents should not include sub-agent") + }) }) t.Run("Archived", func(t *testing.T) {