From 78d4cf9e47b6cc03bbf3c9ec5554ff9e37282071 Mon Sep 17 00:00:00 2001 From: Garrett Delfosse Date: Mon, 18 May 2026 08:33:29 -0400 Subject: [PATCH] fix: soft-delete stale workspace agents on new build (#25207) --- coderd/database/dbauthz/dbauthz.go | 21 ++ coderd/database/dbauthz/dbauthz_test.go | 13 + coderd/database/dbmetrics/querymetrics.go | 16 ++ coderd/database/dbmock/dbmock.go | 28 +++ ...oft_delete_stale_workspace_agents.down.sql | 3 + ..._soft_delete_stale_workspace_agents.up.sql | 62 +++++ coderd/database/migrations/migrate_test.go | 211 ++++++++++++++++ coderd/database/querier.go | 12 + coderd/database/querier_test.go | 227 ++++++++++++++++++ coderd/database/queries.sql.go | 52 ++++ coderd/database/queries/workspaceagents.sql | 35 +++ .../provisionerdserver/provisionerdserver.go | 22 ++ coderd/workspaceresourceauth_test.go | 6 +- coderd/wsbuilder/wsbuilder.go | 9 + coderd/wsbuilder/wsbuilder_test.go | 7 +- codersdk/toolsdk/toolsdk_test.go | 2 + 16 files changed, 723 insertions(+), 3 deletions(-) create mode 100644 coderd/database/migrations/000498_soft_delete_stale_workspace_agents.down.sql create mode 100644 coderd/database/migrations/000498_soft_delete_stale_workspace_agents.up.sql diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 599eb87460..fd1d6c5ff4 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -6272,6 +6272,27 @@ func (q *querier) SoftDeleteContextFileMessages(ctx context.Context, chatID uuid return q.db.SoftDeleteContextFileMessages(ctx, chatID) } +func (q *querier) SoftDeletePriorWorkspaceAgents(ctx context.Context, arg database.SoftDeletePriorWorkspaceAgentsParams) error { + // Internal bookkeeping called from wsbuilder.Builder.Build inside the + // same transaction as an already-authorized InsertWorkspaceBuild. + // Callers pass a system-restricted context. + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { + return err + } + return q.db.SoftDeletePriorWorkspaceAgents(ctx, arg) +} + +func (q *querier) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error { + // Internal bookkeeping called from wsbuilder (orphan-delete) and + // provisionerdserver.CompleteJob (normal delete) inside the same + // transaction as an already-authorized workspace deletion. + // Callers pass a system-restricted context. + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { + return err + } + return q.db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, workspaceID) +} + func (q *querier) TouchChatDebugRunUpdatedAt(ctx context.Context, arg database.TouchChatDebugRunUpdatedAtParams) error { chat, err := q.db.GetChatByID(ctx, arg.ChatID) if err != nil { diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index ef65d0f9b6..baf4bfe18a 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -4596,6 +4596,19 @@ func (s *MethodTestSuite) TestSystemFunctions() { dbm.EXPECT().UpdateWorkspaceAgentConnectionByID(gomock.Any(), arg).Return(nil).AnyTimes() check.Args(arg).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns() })) + s.Run("SoftDeletePriorWorkspaceAgents", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) { + arg := database.SoftDeletePriorWorkspaceAgentsParams{ + WorkspaceID: uuid.New(), + CurrentBuildID: uuid.New(), + } + dbm.EXPECT().SoftDeletePriorWorkspaceAgents(gomock.Any(), arg).Return(nil).AnyTimes() + check.Args(arg).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns() + })) + s.Run("SoftDeleteWorkspaceAgentsByWorkspaceID", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) { + wsID := uuid.New() + dbm.EXPECT().SoftDeleteWorkspaceAgentsByWorkspaceID(gomock.Any(), wsID).Return(nil).AnyTimes() + check.Args(wsID).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns() + })) s.Run("AcquireProvisionerJob", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) { arg := database.AcquireProvisionerJobParams{StartedAt: sql.NullTime{Valid: true, Time: dbtime.Now()}, OrganizationID: uuid.New(), Types: []database.ProvisionerType{database.ProvisionerTypeEcho}, ProvisionerTags: json.RawMessage("{}")} dbm.EXPECT().AcquireProvisionerJob(gomock.Any(), arg).Return(testutil.Fake(s.T(), faker, database.ProvisionerJob{}), nil).AnyTimes() diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index eb5c0950bf..aeae4294c9 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -4537,6 +4537,22 @@ func (m queryMetricsStore) SoftDeleteContextFileMessages(ctx context.Context, ch return r0 } +func (m queryMetricsStore) SoftDeletePriorWorkspaceAgents(ctx context.Context, arg database.SoftDeletePriorWorkspaceAgentsParams) error { + start := time.Now() + r0 := m.s.SoftDeletePriorWorkspaceAgents(ctx, arg) + m.queryLatencies.WithLabelValues("SoftDeletePriorWorkspaceAgents").Observe(time.Since(start).Seconds()) + m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "SoftDeletePriorWorkspaceAgents").Inc() + return r0 +} + +func (m queryMetricsStore) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error { + start := time.Now() + r0 := m.s.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, workspaceID) + m.queryLatencies.WithLabelValues("SoftDeleteWorkspaceAgentsByWorkspaceID").Observe(time.Since(start).Seconds()) + m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "SoftDeleteWorkspaceAgentsByWorkspaceID").Inc() + return r0 +} + func (m queryMetricsStore) TouchChatDebugRunUpdatedAt(ctx context.Context, arg database.TouchChatDebugRunUpdatedAtParams) error { start := time.Now() r0 := m.s.TouchChatDebugRunUpdatedAt(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 298ef831a0..09d751c88d 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -8588,6 +8588,34 @@ func (mr *MockStoreMockRecorder) SoftDeleteContextFileMessages(ctx, chatID any) return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SoftDeleteContextFileMessages", reflect.TypeOf((*MockStore)(nil).SoftDeleteContextFileMessages), ctx, chatID) } +// SoftDeletePriorWorkspaceAgents mocks base method. +func (m *MockStore) SoftDeletePriorWorkspaceAgents(ctx context.Context, arg database.SoftDeletePriorWorkspaceAgentsParams) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SoftDeletePriorWorkspaceAgents", ctx, arg) + ret0, _ := ret[0].(error) + return ret0 +} + +// SoftDeletePriorWorkspaceAgents indicates an expected call of SoftDeletePriorWorkspaceAgents. +func (mr *MockStoreMockRecorder) SoftDeletePriorWorkspaceAgents(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SoftDeletePriorWorkspaceAgents", reflect.TypeOf((*MockStore)(nil).SoftDeletePriorWorkspaceAgents), ctx, arg) +} + +// SoftDeleteWorkspaceAgentsByWorkspaceID mocks base method. +func (m *MockStore) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SoftDeleteWorkspaceAgentsByWorkspaceID", ctx, workspaceID) + ret0, _ := ret[0].(error) + return ret0 +} + +// SoftDeleteWorkspaceAgentsByWorkspaceID indicates an expected call of SoftDeleteWorkspaceAgentsByWorkspaceID. +func (mr *MockStoreMockRecorder) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, workspaceID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SoftDeleteWorkspaceAgentsByWorkspaceID", reflect.TypeOf((*MockStore)(nil).SoftDeleteWorkspaceAgentsByWorkspaceID), ctx, workspaceID) +} + // TouchChatDebugRunUpdatedAt mocks base method. func (m *MockStore) TouchChatDebugRunUpdatedAt(ctx context.Context, arg database.TouchChatDebugRunUpdatedAtParams) error { m.ctrl.T.Helper() diff --git a/coderd/database/migrations/000498_soft_delete_stale_workspace_agents.down.sql b/coderd/database/migrations/000498_soft_delete_stale_workspace_agents.down.sql new file mode 100644 index 0000000000..6385451925 --- /dev/null +++ b/coderd/database/migrations/000498_soft_delete_stale_workspace_agents.down.sql @@ -0,0 +1,3 @@ +-- The backfill is not reversed: soft-deleted agents tied to stopped/deleted +-- builds are no longer referenced anywhere. Restoring deleted=FALSE would +-- only re-create the ambiguity the forward migration fixed. diff --git a/coderd/database/migrations/000498_soft_delete_stale_workspace_agents.up.sql b/coderd/database/migrations/000498_soft_delete_stale_workspace_agents.up.sql new file mode 100644 index 0000000000..98125dcfb3 --- /dev/null +++ b/coderd/database/migrations/000498_soft_delete_stale_workspace_agents.up.sql @@ -0,0 +1,62 @@ +-- Soft-delete stale `workspace_agents` rows. +-- +-- Before v2.33.0, the auth path `GetWorkspaceAgentByInstanceID :one` silently +-- picked the newest matching row, so stale rows from earlier builds were +-- harmless. After #24325 replaced that with a `:many` lookup that rejects +-- ambiguity with HTTP 409, the accumulation becomes a hard failure: any +-- workspace whose EC2 instance hosted more than one build can no longer +-- re-authenticate its agent. +-- +-- This migration backfills the "at most one non-deleted agent per workspace +-- that is itself not deleted" invariant over existing data. Going forward: +-- - `wsbuilder.Builder.Build` maintains it per-build via +-- `SoftDeletePriorWorkspaceAgents`. +-- - `provisionerdserver.CompleteJob` and `wsbuilder` also call +-- `SoftDeleteWorkspaceAgentsByWorkspaceID` when a workspace itself is +-- soft-deleted, so the table doesn't retain orphaned-but-non-deleted +-- agents referencing a deleted workspace. +-- +-- Backfill scope: +-- 1. Every agent belonging to a soft-deleted workspace -> deleted = TRUE. +-- 2. For each still-live workspace, keep only agents belonging to the +-- current (highest build_number) build; soft-delete earlier builds' +-- agents. +-- +-- Related: +-- #24325 (feature that regressed the behavior) +-- #24973 (partial fix, pool starvation) +-- #25031 (partial fix, handler cleanup + deleted-workspace filter) +-- #25155 (bug report) + +-- 1. Soft-delete all agents on workspaces that are themselves deleted. +UPDATE workspace_agents +SET deleted = TRUE +WHERE id IN ( + SELECT wa.id + FROM workspace_agents wa + JOIN workspace_resources wr ON wr.id = wa.resource_id + JOIN workspace_builds wb ON wb.job_id = wr.job_id + JOIN workspaces w ON w.id = wb.workspace_id + WHERE wa.deleted = FALSE + AND w.deleted = TRUE +); + +-- 2. For every live workspace, soft-delete agents not tied to the latest build. +WITH latest_builds AS ( + SELECT DISTINCT ON (workspace_id) id, workspace_id + FROM workspace_builds + ORDER BY workspace_id, build_number DESC +) +UPDATE workspace_agents +SET deleted = TRUE +WHERE id IN ( + SELECT wa.id + FROM workspace_agents wa + JOIN workspace_resources wr ON wr.id = wa.resource_id + JOIN workspace_builds wb ON wb.job_id = wr.job_id + JOIN workspaces w ON w.id = wb.workspace_id + LEFT JOIN latest_builds lb ON lb.workspace_id = wb.workspace_id + WHERE wa.deleted = FALSE + AND w.deleted = FALSE + AND (lb.id IS NULL OR wb.id <> lb.id) +); diff --git a/coderd/database/migrations/migrate_test.go b/coderd/database/migrations/migrate_test.go index ae16886661..8496a338eb 100644 --- a/coderd/database/migrations/migrate_test.go +++ b/coderd/database/migrations/migrate_test.go @@ -1185,3 +1185,214 @@ func TestMigration000475AgentsAccessOrgRole(t *testing.T) { "trigger should only create org-member and org-service-account system roles", ) } + +func TestMigration000498SoftDeleteStaleWorkspaceAgents(t *testing.T) { + t.Parallel() + + const migrationVersion = 498 + + sqlDB := testSQLDB(t) + + // Step up to migrationVersion - 1. + next, err := migrations.Stepper(sqlDB) + require.NoError(t, err) + for { + version, more, err := next() + require.NoError(t, err) + if !more { + t.Fatalf("migration %d not found", migrationVersion) + } + if version == migrationVersion-1 { + break + } + } + + ctx := testutil.Context(t, testutil.WaitSuperLong) + now := time.Now().UTC().Truncate(time.Microsecond) + + // Seed the prerequisite tables. Two workspaces share the same EC2-style + // instance id across several builds; a third workspace has a single + // build on a different instance (baseline, must not be affected). + userID := uuid.New() + orgID := uuid.New() + templateID := uuid.New() + templateVersionID := uuid.New() + fileID := uuid.New() + + wsA := uuid.New() + wsB := uuid.New() + wsSingle := uuid.New() + wsDeleted := uuid.New() + + instanceAB := "i-shared-ab" + instanceSingle := "i-solo" + instanceDeleted := "i-deleted" + + // For workspace A: 3 builds on the same instance. + // For workspace B: 2 builds on the same instance (different workspace, + // same instance id, exercises the cross-workspace scoping case). + // For wsSingle: 1 build, should stay non-deleted after the backfill. + // For wsDeleted: 1 build on a soft-deleted workspace. Agent should be + // marked deleted even though it's on the latest build. + type build struct { + id uuid.UUID + jobID uuid.UUID + resourceID uuid.UUID + agentID uuid.UUID + buildNum int32 + wsID uuid.UUID + instanceID string + } + + mkBuild := func(ws uuid.UUID, buildNum int32, instance string) build { + return build{ + id: uuid.New(), + jobID: uuid.New(), + resourceID: uuid.New(), + agentID: uuid.New(), + buildNum: buildNum, + wsID: ws, + instanceID: instance, + } + } + + aBuilds := []build{ + mkBuild(wsA, 1, instanceAB), + mkBuild(wsA, 2, instanceAB), + mkBuild(wsA, 3, instanceAB), + } + bBuilds := []build{ + mkBuild(wsB, 1, instanceAB), + mkBuild(wsB, 2, instanceAB), + } + singleBuilds := []build{ + mkBuild(wsSingle, 1, instanceSingle), + } + deletedBuilds := []build{ + mkBuild(wsDeleted, 1, instanceDeleted), + } + allBuilds := append(append(append(append([]build{}, aBuilds...), bBuilds...), singleBuilds...), deletedBuilds...) + + tx, err := sqlDB.BeginTx(ctx, nil) + require.NoError(t, err) + defer tx.Rollback() + + // Minimal user / org / template / template_version / file. + _, err = tx.ExecContext(ctx, + `INSERT INTO users (id, username, email, hashed_password, created_at, updated_at, status, rbac_roles, login_type) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, + userID, "seed", "seed@test.com", []byte{}, now, now, "active", pq.StringArray{}, "password", + ) + require.NoError(t, err) + _, err = tx.ExecContext(ctx, + `INSERT INTO organizations (id, name, display_name, description, icon, created_at, updated_at, is_default) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, + orgID, "seed-org", "Seed Org", "", "", now, now, false, + ) + require.NoError(t, err) + _, err = tx.ExecContext(ctx, + `INSERT INTO files (id, hash, created_at, created_by, mimetype, data) VALUES ($1, $2, $3, $4, $5, $6)`, + fileID, "hash", now, userID, "application/octet-stream", []byte{}, + ) + require.NoError(t, err) + _, err = tx.ExecContext(ctx, + `INSERT INTO templates (id, created_at, updated_at, organization_id, name, provisioner, active_version_id, description, created_by, group_acl, user_acl, display_name) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)`, + templateID, now, now, orgID, "tpl", "echo", templateVersionID, "", userID, "{}", "{}", "", + ) + require.NoError(t, err) + _, err = tx.ExecContext(ctx, + `INSERT INTO template_versions (id, template_id, organization_id, created_at, updated_at, name, readme, job_id, created_by, message) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`, + templateVersionID, templateID, orgID, now, now, "v", "", uuid.New(), userID, "", + ) + require.NoError(t, err) + + for _, ws := range []uuid.UUID{wsA, wsB, wsSingle} { + _, err = tx.ExecContext(ctx, + `INSERT INTO workspaces (id, created_at, updated_at, owner_id, organization_id, template_id, name, deleted, automatic_updates) + VALUES ($1, $2, $3, $4, $5, $6, $7, false, 'never')`, + ws, now, now, userID, orgID, templateID, "ws-"+ws.String()[:8], + ) + require.NoError(t, err) + } + // wsDeleted is a soft-deleted workspace. Its agent is on the latest + // build but must still be soft-deleted by the migration. + _, err = tx.ExecContext(ctx, + `INSERT INTO workspaces (id, created_at, updated_at, owner_id, organization_id, template_id, name, deleted, automatic_updates) + VALUES ($1, $2, $3, $4, $5, $6, $7, true, 'never')`, + wsDeleted, now, now, userID, orgID, templateID, "ws-"+wsDeleted.String()[:8], + ) + require.NoError(t, err) + + // For every build: provisioner_job -> workspace_build -> workspace_resource -> workspace_agent. + for _, b := range allBuilds { + _, err = tx.ExecContext(ctx, + `INSERT INTO provisioner_jobs (id, created_at, updated_at, organization_id, initiator_id, provisioner, storage_method, type, input, file_id) + VALUES ($1, $2, $3, $4, $5, 'echo', 'file', 'workspace_build', '{}', $6)`, + b.jobID, now, now, orgID, userID, fileID, + ) + require.NoError(t, err) + _, err = tx.ExecContext(ctx, + `INSERT INTO workspace_builds (id, created_at, updated_at, workspace_id, template_version_id, build_number, transition, initiator_id, job_id, reason) + VALUES ($1, $2, $3, $4, $5, $6, 'start', $7, $8, 'initiator')`, + b.id, now, now, b.wsID, templateVersionID, b.buildNum, userID, b.jobID, + ) + require.NoError(t, err) + _, err = tx.ExecContext(ctx, + `INSERT INTO workspace_resources (id, created_at, job_id, transition, type, name) + VALUES ($1, $2, $3, 'start', 'aws_instance', 'dev')`, + b.resourceID, now, b.jobID, + ) + require.NoError(t, err) + _, err = tx.ExecContext(ctx, + `INSERT INTO workspace_agents (id, created_at, updated_at, name, resource_id, auth_token, auth_instance_id, architecture, operating_system, deleted) + VALUES ($1, $2, $3, 'main', $4, $5, $6, 'amd64', 'linux', false)`, + b.agentID, now, now, b.resourceID, uuid.New(), b.instanceID, + ) + require.NoError(t, err) + } + + require.NoError(t, tx.Commit()) + + // Sanity check pre-migration: all agents should be deleted=false. + var preDeletedCount int + err = sqlDB.QueryRowContext(ctx, + `SELECT COUNT(*) FROM workspace_agents WHERE deleted = true`).Scan(&preDeletedCount) + require.NoError(t, err) + require.Equal(t, 0, preDeletedCount, "no agents should be deleted pre-migration") + + // Run migration 491. + version, more, err := next() + require.NoError(t, err) + require.True(t, more) + require.EqualValues(t, migrationVersion, version) + + // Backfill assertions: + // wsA: builds 1,2,3 → keep agent for build 3, delete for 1 and 2. + // wsB: builds 1,2 → keep agent for build 2, delete for 1. + // wsSingle: 1 build → keep. + // Per workspace, exactly one agent remains deleted=false. + check := func(label string, expectDeleted bool, agent uuid.UUID) { + var deleted bool + err := sqlDB.QueryRowContext(ctx, + `SELECT deleted FROM workspace_agents WHERE id = $1`, agent).Scan(&deleted) + require.NoError(t, err, label) + require.Equal(t, expectDeleted, deleted, label) + } + check("wsA build 1 (old) should be deleted", true, aBuilds[0].agentID) + check("wsA build 2 (old) should be deleted", true, aBuilds[1].agentID) + check("wsA build 3 (latest) should be kept", false, aBuilds[2].agentID) + check("wsB build 1 (old) should be deleted", true, bBuilds[0].agentID) + check("wsB build 2 (latest) should be kept", false, bBuilds[1].agentID) + check("wsSingle build 1 (solo latest) should be kept", false, singleBuilds[0].agentID) + check("wsDeleted: agent on deleted workspace should be soft-deleted even though it's the latest build", + true, deletedBuilds[0].agentID) + + // The ongoing invariants are enforced by wsbuilder.Builder.Build and + // provisionerdserver.CompleteJob via SoftDeletePriorWorkspaceAgents and + // SoftDeleteWorkspaceAgentsByWorkspaceID. Those paths are covered by + // the querier tests TestSoftDeletePriorWorkspaceAgents and + // TestSoftDeleteWorkspaceAgentsByWorkspaceID, plus integration tests + // under coderd/coderd_test.go; not retested here. +} diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 728463c8d5..9366d2316e 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -1077,6 +1077,18 @@ type sqlcQuerier interface { SoftDeleteChatMessageByID(ctx context.Context, id int64) error SoftDeleteChatMessagesAfterID(ctx context.Context, arg SoftDeleteChatMessagesAfterIDParams) error SoftDeleteContextFileMessages(ctx context.Context, chatID uuid.UUID) error + // Marks agents from all prior builds of this workspace as deleted, + // preserving only agents belonging to @current_build_id. Called from + // provisionerdserver when a workspace build completes, after the new + // build's agents have been inserted, so running agents are not + // deleted while a build is still queued or provisioning. + SoftDeletePriorWorkspaceAgents(ctx context.Context, arg SoftDeletePriorWorkspaceAgentsParams) error + // Marks every non-deleted agent belonging to the given workspace as + // deleted. Called alongside UpdateWorkspaceDeletedByID when a workspace + // itself is soft-deleted, so the agent instance-identity auth path + // (which filters on workspace_agents.deleted) doesn't keep seeing + // orphaned rows. + SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error // Overrides updated_at on the parent run without touching any // other column. Used by tests that need to stamp a run with a // specific timestamp after the InsertChatDebugStep CTE has diff --git a/coderd/database/querier_test.go b/coderd/database/querier_test.go index 305ec47940..1d0954f1e7 100644 --- a/coderd/database/querier_test.go +++ b/coderd/database/querier_test.go @@ -13765,3 +13765,230 @@ func TestChatHasUnread(t *testing.T) { insertMsg(database.ChatMessageRoleUser, "user msg") require.False(t, getHasUnread(), "user messages should not trigger unread") } + +// TestSoftDeletePriorWorkspaceAgents verifies the invariant maintained by +// wsbuilder.Builder.Build: when a new build of a workspace is created, all +// agents belonging to prior builds of that same workspace are soft-deleted, +// and agents belonging to *other* workspaces are untouched. +func TestSoftDeletePriorWorkspaceAgents(t *testing.T) { + t.Parallel() + + db, _, sqlDB := dbtestutil.NewDBWithSQLDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + + // Helper: create a workspace + one build + its agent. Returns the IDs we + // need to assert on. The agent uses the shared EC2-style auth_instance_id + // so we can prove per-workspace scoping. + type buildBundle struct { + workspaceID uuid.UUID + buildID uuid.UUID + agentID uuid.UUID + } + + user := dbgen.User(t, db, database.User{}) + org := dbgen.Organization(t, db, database.Organization{}) + tpl := dbgen.Template(t, db, database.Template{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + tplVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true}, + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + + newBuild := func(t *testing.T, wsID uuid.UUID, buildNumber int32, instanceID string) buildBundle { + t.Helper() + job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{ + OrganizationID: org.ID, + Type: database.ProvisionerJobTypeWorkspaceBuild, + }) + build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{ + WorkspaceID: wsID, + JobID: job.ID, + TemplateVersionID: tplVersion.ID, + BuildNumber: buildNumber, + Transition: database.WorkspaceTransitionStart, + }) + resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: job.ID}) + agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{ + ResourceID: resource.ID, + AuthInstanceID: sql.NullString{String: instanceID, Valid: true}, + }) + return buildBundle{workspaceID: wsID, buildID: build.ID, agentID: agent.ID} + } + + // Read `deleted` via raw SQL. GetWorkspaceAgentByID filters deleted rows + // out, which is exactly what we want to observe here. + agentDeleted := func(id uuid.UUID) bool { + t.Helper() + var deleted bool + err := sqlDB.QueryRowContext(ctx, + `SELECT deleted FROM workspace_agents WHERE id = $1`, id).Scan(&deleted) + require.NoError(t, err) + return deleted + } + + // Two workspaces share a single EC2 instance ID across their lifetimes. + wsA := dbgen.Workspace(t, db, database.WorkspaceTable{ + OrganizationID: org.ID, + TemplateID: tpl.ID, + OwnerID: user.ID, + }).ID + wsB := dbgen.Workspace(t, db, database.WorkspaceTable{ + OrganizationID: org.ID, + TemplateID: tpl.ID, + OwnerID: user.ID, + }).ID + instance := "i-shared" + + a1 := newBuild(t, wsA, 1, instance) + a2 := newBuild(t, wsA, 2, instance) + a3 := newBuild(t, wsA, 3, instance) + b1 := newBuild(t, wsB, 1, instance) + b2 := newBuild(t, wsB, 2, instance) + + // Sanity check: all agents start non-deleted. + require.False(t, agentDeleted(a1.agentID)) + require.False(t, agentDeleted(a2.agentID)) + require.False(t, agentDeleted(a3.agentID)) + require.False(t, agentDeleted(b1.agentID)) + require.False(t, agentDeleted(b2.agentID)) + + // Run: "wsA's current build is a3; soft-delete all other wsA agents." + err := db.SoftDeletePriorWorkspaceAgents(ctx, database.SoftDeletePriorWorkspaceAgentsParams{ + WorkspaceID: wsA, + CurrentBuildID: a3.buildID, + }) + require.NoError(t, err) + + assert.True(t, agentDeleted(a1.agentID), "wsA build 1 agent should be soft-deleted") + assert.True(t, agentDeleted(a2.agentID), "wsA build 2 agent should be soft-deleted") + assert.False(t, agentDeleted(a3.agentID), "wsA current build's agent must stay") + assert.False(t, agentDeleted(b1.agentID), "wsB build 1 agent must not be touched") + assert.False(t, agentDeleted(b2.agentID), "wsB build 2 agent must not be touched") + + // Idempotency: re-running with the same params is a no-op. + err = db.SoftDeletePriorWorkspaceAgents(ctx, database.SoftDeletePriorWorkspaceAgentsParams{ + WorkspaceID: wsA, + CurrentBuildID: a3.buildID, + }) + require.NoError(t, err) + assert.False(t, agentDeleted(a3.agentID)) + + // Now age wsB: new current build is b2; b1's agent should flip. + err = db.SoftDeletePriorWorkspaceAgents(ctx, database.SoftDeletePriorWorkspaceAgentsParams{ + WorkspaceID: wsB, + CurrentBuildID: b2.buildID, + }) + require.NoError(t, err) + assert.True(t, agentDeleted(b1.agentID)) + assert.False(t, agentDeleted(b2.agentID)) +} + +// TestSoftDeleteWorkspaceAgentsByWorkspaceID verifies the delete-path +// invariant: when a workspace is soft-deleted, every one of its agents +// (across all builds) gets soft-deleted in the same transaction. Agents on +// *other* workspaces, even ones sharing an auth_instance_id, must be +// untouched. +func TestSoftDeleteWorkspaceAgentsByWorkspaceID(t *testing.T) { + t.Parallel() + + db, _, sqlDB := dbtestutil.NewDBWithSQLDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + + type buildBundle struct { + workspaceID uuid.UUID + buildID uuid.UUID + agentID uuid.UUID + } + + user := dbgen.User(t, db, database.User{}) + org := dbgen.Organization(t, db, database.Organization{}) + tpl := dbgen.Template(t, db, database.Template{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + tplVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true}, + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + + newBuild := func(t *testing.T, wsID uuid.UUID, buildNumber int32, instanceID string) buildBundle { + t.Helper() + job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{ + OrganizationID: org.ID, + Type: database.ProvisionerJobTypeWorkspaceBuild, + }) + build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{ + WorkspaceID: wsID, + JobID: job.ID, + TemplateVersionID: tplVersion.ID, + BuildNumber: buildNumber, + Transition: database.WorkspaceTransitionStart, + }) + resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: job.ID}) + agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{ + ResourceID: resource.ID, + AuthInstanceID: sql.NullString{String: instanceID, Valid: true}, + }) + return buildBundle{workspaceID: wsID, buildID: build.ID, agentID: agent.ID} + } + + agentDeleted := func(id uuid.UUID) bool { + t.Helper() + var deleted bool + err := sqlDB.QueryRowContext(ctx, + `SELECT deleted FROM workspace_agents WHERE id = $1`, id).Scan(&deleted) + require.NoError(t, err) + return deleted + } + + // wsA: 3 builds (so multiple agents to sweep on delete). + // wsB: 1 build, same auth_instance_id as wsA (proves scoping). + wsA := dbgen.Workspace(t, db, database.WorkspaceTable{ + OrganizationID: org.ID, + TemplateID: tpl.ID, + OwnerID: user.ID, + }).ID + wsB := dbgen.Workspace(t, db, database.WorkspaceTable{ + OrganizationID: org.ID, + TemplateID: tpl.ID, + OwnerID: user.ID, + }).ID + instance := "i-shared" + + a1 := newBuild(t, wsA, 1, instance) + a2 := newBuild(t, wsA, 2, instance) + a3 := newBuild(t, wsA, 3, instance) + b1 := newBuild(t, wsB, 1, instance) + + // Sanity: all 4 agents start non-deleted. + for _, id := range []uuid.UUID{a1.agentID, a2.agentID, a3.agentID, b1.agentID} { + require.False(t, agentDeleted(id)) + } + + err := db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, wsA) + require.NoError(t, err) + + // All wsA agents flipped; wsB's agent untouched. + assert.True(t, agentDeleted(a1.agentID), "wsA build 1 agent") + assert.True(t, agentDeleted(a2.agentID), "wsA build 2 agent") + assert.True(t, agentDeleted(a3.agentID), "wsA build 3 agent") + assert.False(t, agentDeleted(b1.agentID), "wsB agent must not be affected") + + // Idempotency: re-running is a no-op. + err = db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, wsA) + require.NoError(t, err) + assert.False(t, agentDeleted(b1.agentID)) + + // Calling on an empty workspace (no agents) is a no-op and does not error. + wsEmpty := dbgen.Workspace(t, db, database.WorkspaceTable{ + OrganizationID: org.ID, + TemplateID: tpl.ID, + OwnerID: user.ID, + }).ID + err = db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, wsEmpty) + require.NoError(t, err) +} diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 92394600a2..5de7ed842d 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -30641,6 +30641,58 @@ func (q *sqlQuerier) InsertWorkspaceAgentScriptTimings(ctx context.Context, arg return i, err } +const softDeletePriorWorkspaceAgents = `-- name: SoftDeletePriorWorkspaceAgents :exec +UPDATE workspace_agents +SET deleted = TRUE +WHERE id IN ( + SELECT wa.id + FROM workspace_agents wa + JOIN workspace_resources wr ON wr.id = wa.resource_id + JOIN workspace_builds wb ON wb.job_id = wr.job_id + WHERE wb.workspace_id = $1 + AND wb.id <> $2 + AND wa.deleted = FALSE +) +` + +type SoftDeletePriorWorkspaceAgentsParams struct { + WorkspaceID uuid.UUID `db:"workspace_id" json:"workspace_id"` + CurrentBuildID uuid.UUID `db:"current_build_id" json:"current_build_id"` +} + +// Marks agents from all prior builds of this workspace as deleted, +// preserving only agents belonging to @current_build_id. Called from +// provisionerdserver when a workspace build completes, after the new +// build's agents have been inserted, so running agents are not +// deleted while a build is still queued or provisioning. +func (q *sqlQuerier) SoftDeletePriorWorkspaceAgents(ctx context.Context, arg SoftDeletePriorWorkspaceAgentsParams) error { + _, err := q.db.ExecContext(ctx, softDeletePriorWorkspaceAgents, arg.WorkspaceID, arg.CurrentBuildID) + return err +} + +const softDeleteWorkspaceAgentsByWorkspaceID = `-- name: SoftDeleteWorkspaceAgentsByWorkspaceID :exec +UPDATE workspace_agents +SET deleted = TRUE +WHERE id IN ( + SELECT wa.id + FROM workspace_agents wa + JOIN workspace_resources wr ON wr.id = wa.resource_id + JOIN workspace_builds wb ON wb.job_id = wr.job_id + WHERE wb.workspace_id = $1 + AND wa.deleted = FALSE +) +` + +// Marks every non-deleted agent belonging to the given workspace as +// deleted. Called alongside UpdateWorkspaceDeletedByID when a workspace +// itself is soft-deleted, so the agent instance-identity auth path +// (which filters on workspace_agents.deleted) doesn't keep seeing +// orphaned rows. +func (q *sqlQuerier) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error { + _, err := q.db.ExecContext(ctx, softDeleteWorkspaceAgentsByWorkspaceID, workspaceID) + return err +} + const updateWorkspaceAgentConnectionByID = `-- name: UpdateWorkspaceAgentConnectionByID :exec UPDATE workspace_agents diff --git a/coderd/database/queries/workspaceagents.sql b/coderd/database/queries/workspaceagents.sql index 4ca48d0dbe..00889ccef4 100644 --- a/coderd/database/queries/workspaceagents.sql +++ b/coderd/database/queries/workspaceagents.sql @@ -517,3 +517,38 @@ WHERE AND workspaces.deleted = FALSE AND users.deleted = FALSE LIMIT 1; + +-- name: SoftDeletePriorWorkspaceAgents :exec +-- Marks agents from all prior builds of this workspace as deleted, +-- preserving only agents belonging to @current_build_id. Called from +-- provisionerdserver when a workspace build completes, after the new +-- build's agents have been inserted, so running agents are not +-- deleted while a build is still queued or provisioning. +UPDATE workspace_agents +SET deleted = TRUE +WHERE id IN ( + SELECT wa.id + FROM workspace_agents wa + JOIN workspace_resources wr ON wr.id = wa.resource_id + JOIN workspace_builds wb ON wb.job_id = wr.job_id + WHERE wb.workspace_id = @workspace_id + AND wb.id <> @current_build_id + AND wa.deleted = FALSE +); + +-- name: SoftDeleteWorkspaceAgentsByWorkspaceID :exec +-- Marks every non-deleted agent belonging to the given workspace as +-- deleted. Called alongside UpdateWorkspaceDeletedByID when a workspace +-- itself is soft-deleted, so the agent instance-identity auth path +-- (which filters on workspace_agents.deleted) doesn't keep seeing +-- orphaned rows. +UPDATE workspace_agents +SET deleted = TRUE +WHERE id IN ( + SELECT wa.id + FROM workspace_agents wa + JOIN workspace_resources wr ON wr.id = wa.resource_id + JOIN workspace_builds wb ON wb.job_id = wr.job_id + WHERE wb.workspace_id = @workspace_id + AND wa.deleted = FALSE +); diff --git a/coderd/provisionerdserver/provisionerdserver.go b/coderd/provisionerdserver/provisionerdserver.go index a7d9e1387f..2fd3c50671 100644 --- a/coderd/provisionerdserver/provisionerdserver.go +++ b/coderd/provisionerdserver/provisionerdserver.go @@ -2143,6 +2143,20 @@ func (s *server) completeWorkspaceBuildJob(ctx context.Context, job database.Pro return xerrors.Errorf("insert provisioner job: %w", err) } } + + // Soft-delete agents from prior builds now that this build's + // agents have been inserted. Waiting until completion (rather + // than build creation) avoids bricking running workspaces + // whose agents would otherwise be deleted while the new build + // is still queued or provisioning. See #25155. + err = db.SoftDeletePriorWorkspaceAgents(ctx, database.SoftDeletePriorWorkspaceAgentsParams{ + WorkspaceID: workspaceBuild.WorkspaceID, + CurrentBuildID: workspaceBuild.ID, + }) + if err != nil { + return xerrors.Errorf("soft delete prior workspace agents: %w", err) + } + for _, module := range jobType.WorkspaceBuild.Modules { if err := InsertWorkspaceModule(ctx, db, job.ID, workspaceBuild.Transition, module, telemetrySnapshot); err != nil { return xerrors.Errorf("insert provisioner job module: %w", err) @@ -2391,6 +2405,14 @@ func (s *server) completeWorkspaceBuildJob(ctx context.Context, job database.Pro return xerrors.Errorf("update workspace deleted: %w", err) } + // Soft-delete any agents tied to this workspace so the + // aws-instance-identity handler (which filters on + // workspace_agents.deleted) doesn't keep seeing orphaned rows + // after the workspace itself is deleted. See #25155. + if err := db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, workspaceBuild.WorkspaceID); err != nil { + return xerrors.Errorf("soft delete workspace agents: %w", err) + } + // A user might delete their task workspace directly, instead of // deleting the task. To avoid leaving the Task in a scenario where // it has no workspace, we also attempt to delete the task. diff --git a/coderd/workspaceresourceauth_test.go b/coderd/workspaceresourceauth_test.go index 1f8004d1b3..b327c120bc 100644 --- a/coderd/workspaceresourceauth_test.go +++ b/coderd/workspaceresourceauth_test.go @@ -131,8 +131,10 @@ func TestPostWorkspaceAuthAWSInstanceIdentity(t *testing.T) { err := agentClient.RefreshToken(ctx) var apiErr *codersdk.Error require.ErrorAs(t, err, &apiErr) - require.Equal(t, http.StatusBadRequest, apiErr.StatusCode()) - require.Contains(t, apiErr.Message, "isn't registered on the latest history") + // The prior build's agent is soft-deleted when the successor + // build completes (SoftDeletePriorWorkspaceAgents), so the + // auth query finds no candidates at all and returns 404. + require.Equal(t, http.StatusNotFound, apiErr.StatusCode()) }) t.Run("Ambiguous/MultipleAgentsNoSelector", func(t *testing.T) { diff --git a/coderd/wsbuilder/wsbuilder.go b/coderd/wsbuilder/wsbuilder.go index ff8d6d623f..e27a2f9868 100644 --- a/coderd/wsbuilder/wsbuilder.go +++ b/coderd/wsbuilder/wsbuilder.go @@ -615,6 +615,15 @@ func (b *Builder) buildTx(authFunc func(action policy.Action, object rbac.Object }); err != nil { return BuildError{http.StatusInternalServerError, "mark workspace as deleted", err} } + + // Soft-delete any agents tied to this workspace so the + // aws-instance-identity handler doesn't keep seeing + // orphaned rows. Mirrors the path in + // provisionerdserver.CompleteJob. See #25155. + //nolint:gocritic // System-restricted: bookkeeping inside an already-authorized delete transaction. + if err := store.SoftDeleteWorkspaceAgentsByWorkspaceID(dbauthz.AsSystemRestricted(b.ctx), b.workspace.ID); err != nil { + return BuildError{http.StatusInternalServerError, "soft-delete workspace agents on orphan delete", err} + } } return nil diff --git a/coderd/wsbuilder/wsbuilder_test.go b/coderd/wsbuilder/wsbuilder_test.go index 3698c881f1..4e96c06090 100644 --- a/coderd/wsbuilder/wsbuilder_test.go +++ b/coderd/wsbuilder/wsbuilder_test.go @@ -1513,7 +1513,9 @@ func expectUpdateProvisionerJobWithCompleteWithStartedAtByID(assertions func(par } // expectUpdateWorkspaceDeletedByID asserts a call to UpdateWorkspaceDeletedByID -// and runs the provided assertions against it. +// and runs the provided assertions against it. It also expects the follow-up +// SoftDeleteWorkspaceAgentsByWorkspaceID call that wsbuilder.Builder.Build now +// issues inside the same orphan-delete transaction. func expectUpdateWorkspaceDeletedByID(assertions func(params database.UpdateWorkspaceDeletedByIDParams)) func(mTx *dbmock.MockStore) { return func(mTx *dbmock.MockStore) { mTx.EXPECT().UpdateWorkspaceDeletedByID(gomock.Any(), gomock.Any()). @@ -1524,6 +1526,9 @@ func expectUpdateWorkspaceDeletedByID(assertions func(params database.UpdateWork return nil }, ) + mTx.EXPECT().SoftDeleteWorkspaceAgentsByWorkspaceID(gomock.Any(), gomock.Any()). + Times(1). + Return(nil) } } diff --git a/codersdk/toolsdk/toolsdk_test.go b/codersdk/toolsdk/toolsdk_test.go index 62403a6667..4e35d8d6d7 100644 --- a/codersdk/toolsdk/toolsdk_test.go +++ b/codersdk/toolsdk/toolsdk_test.go @@ -741,8 +741,10 @@ func TestTools(t *testing.T) { }) t.Run("GetWorkspaceAgentLogs", func(t *testing.T) { + _ = testutil.Context(t, testutil.WaitShort) tb, err := toolsdk.NewDeps(memberClient) require.NoError(t, err) + logs, err := testTool(t, toolsdk.GetWorkspaceAgentLogs, tb, toolsdk.GetWorkspaceAgentLogsArgs{ WorkspaceAgentID: agentID.String(), })