fix: soft-delete stale workspace agents on new build (#25207)

This commit is contained in:
Garrett Delfosse
2026-05-18 08:33:29 -04:00
committed by GitHub
parent 159089686a
commit 78d4cf9e47
16 changed files with 723 additions and 3 deletions
+21
View File
@@ -6272,6 +6272,27 @@ func (q *querier) SoftDeleteContextFileMessages(ctx context.Context, chatID uuid
return q.db.SoftDeleteContextFileMessages(ctx, chatID)
}
func (q *querier) SoftDeletePriorWorkspaceAgents(ctx context.Context, arg database.SoftDeletePriorWorkspaceAgentsParams) error {
// Internal bookkeeping called from wsbuilder.Builder.Build inside the
// same transaction as an already-authorized InsertWorkspaceBuild.
// Callers pass a system-restricted context.
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil {
return err
}
return q.db.SoftDeletePriorWorkspaceAgents(ctx, arg)
}
func (q *querier) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error {
// Internal bookkeeping called from wsbuilder (orphan-delete) and
// provisionerdserver.CompleteJob (normal delete) inside the same
// transaction as an already-authorized workspace deletion.
// Callers pass a system-restricted context.
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil {
return err
}
return q.db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, workspaceID)
}
func (q *querier) TouchChatDebugRunUpdatedAt(ctx context.Context, arg database.TouchChatDebugRunUpdatedAtParams) error {
chat, err := q.db.GetChatByID(ctx, arg.ChatID)
if err != nil {
+13
View File
@@ -4596,6 +4596,19 @@ func (s *MethodTestSuite) TestSystemFunctions() {
dbm.EXPECT().UpdateWorkspaceAgentConnectionByID(gomock.Any(), arg).Return(nil).AnyTimes()
check.Args(arg).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns()
}))
s.Run("SoftDeletePriorWorkspaceAgents", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
arg := database.SoftDeletePriorWorkspaceAgentsParams{
WorkspaceID: uuid.New(),
CurrentBuildID: uuid.New(),
}
dbm.EXPECT().SoftDeletePriorWorkspaceAgents(gomock.Any(), arg).Return(nil).AnyTimes()
check.Args(arg).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns()
}))
s.Run("SoftDeleteWorkspaceAgentsByWorkspaceID", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
wsID := uuid.New()
dbm.EXPECT().SoftDeleteWorkspaceAgentsByWorkspaceID(gomock.Any(), wsID).Return(nil).AnyTimes()
check.Args(wsID).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns()
}))
s.Run("AcquireProvisionerJob", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
arg := database.AcquireProvisionerJobParams{StartedAt: sql.NullTime{Valid: true, Time: dbtime.Now()}, OrganizationID: uuid.New(), Types: []database.ProvisionerType{database.ProvisionerTypeEcho}, ProvisionerTags: json.RawMessage("{}")}
dbm.EXPECT().AcquireProvisionerJob(gomock.Any(), arg).Return(testutil.Fake(s.T(), faker, database.ProvisionerJob{}), nil).AnyTimes()
+16
View File
@@ -4537,6 +4537,22 @@ func (m queryMetricsStore) SoftDeleteContextFileMessages(ctx context.Context, ch
return r0
}
func (m queryMetricsStore) SoftDeletePriorWorkspaceAgents(ctx context.Context, arg database.SoftDeletePriorWorkspaceAgentsParams) error {
start := time.Now()
r0 := m.s.SoftDeletePriorWorkspaceAgents(ctx, arg)
m.queryLatencies.WithLabelValues("SoftDeletePriorWorkspaceAgents").Observe(time.Since(start).Seconds())
m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "SoftDeletePriorWorkspaceAgents").Inc()
return r0
}
func (m queryMetricsStore) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error {
start := time.Now()
r0 := m.s.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, workspaceID)
m.queryLatencies.WithLabelValues("SoftDeleteWorkspaceAgentsByWorkspaceID").Observe(time.Since(start).Seconds())
m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "SoftDeleteWorkspaceAgentsByWorkspaceID").Inc()
return r0
}
func (m queryMetricsStore) TouchChatDebugRunUpdatedAt(ctx context.Context, arg database.TouchChatDebugRunUpdatedAtParams) error {
start := time.Now()
r0 := m.s.TouchChatDebugRunUpdatedAt(ctx, arg)
+28
View File
@@ -8588,6 +8588,34 @@ func (mr *MockStoreMockRecorder) SoftDeleteContextFileMessages(ctx, chatID any)
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SoftDeleteContextFileMessages", reflect.TypeOf((*MockStore)(nil).SoftDeleteContextFileMessages), ctx, chatID)
}
// SoftDeletePriorWorkspaceAgents mocks base method.
func (m *MockStore) SoftDeletePriorWorkspaceAgents(ctx context.Context, arg database.SoftDeletePriorWorkspaceAgentsParams) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "SoftDeletePriorWorkspaceAgents", ctx, arg)
ret0, _ := ret[0].(error)
return ret0
}
// SoftDeletePriorWorkspaceAgents indicates an expected call of SoftDeletePriorWorkspaceAgents.
func (mr *MockStoreMockRecorder) SoftDeletePriorWorkspaceAgents(ctx, arg any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SoftDeletePriorWorkspaceAgents", reflect.TypeOf((*MockStore)(nil).SoftDeletePriorWorkspaceAgents), ctx, arg)
}
// SoftDeleteWorkspaceAgentsByWorkspaceID mocks base method.
func (m *MockStore) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "SoftDeleteWorkspaceAgentsByWorkspaceID", ctx, workspaceID)
ret0, _ := ret[0].(error)
return ret0
}
// SoftDeleteWorkspaceAgentsByWorkspaceID indicates an expected call of SoftDeleteWorkspaceAgentsByWorkspaceID.
func (mr *MockStoreMockRecorder) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, workspaceID any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SoftDeleteWorkspaceAgentsByWorkspaceID", reflect.TypeOf((*MockStore)(nil).SoftDeleteWorkspaceAgentsByWorkspaceID), ctx, workspaceID)
}
// TouchChatDebugRunUpdatedAt mocks base method.
func (m *MockStore) TouchChatDebugRunUpdatedAt(ctx context.Context, arg database.TouchChatDebugRunUpdatedAtParams) error {
m.ctrl.T.Helper()
@@ -0,0 +1,3 @@
-- The backfill is not reversed: soft-deleted agents tied to stopped/deleted
-- builds are no longer referenced anywhere. Restoring deleted=FALSE would
-- only re-create the ambiguity the forward migration fixed.
@@ -0,0 +1,62 @@
-- Soft-delete stale `workspace_agents` rows.
--
-- Before v2.33.0, the auth path `GetWorkspaceAgentByInstanceID :one` silently
-- picked the newest matching row, so stale rows from earlier builds were
-- harmless. After #24325 replaced that with a `:many` lookup that rejects
-- ambiguity with HTTP 409, the accumulation becomes a hard failure: any
-- workspace whose EC2 instance hosted more than one build can no longer
-- re-authenticate its agent.
--
-- This migration backfills the "at most one non-deleted agent per workspace
-- that is itself not deleted" invariant over existing data. Going forward:
-- - `wsbuilder.Builder.Build` maintains it per-build via
-- `SoftDeletePriorWorkspaceAgents`.
-- - `provisionerdserver.CompleteJob` and `wsbuilder` also call
-- `SoftDeleteWorkspaceAgentsByWorkspaceID` when a workspace itself is
-- soft-deleted, so the table doesn't retain orphaned-but-non-deleted
-- agents referencing a deleted workspace.
--
-- Backfill scope:
-- 1. Every agent belonging to a soft-deleted workspace -> deleted = TRUE.
-- 2. For each still-live workspace, keep only agents belonging to the
-- current (highest build_number) build; soft-delete earlier builds'
-- agents.
--
-- Related:
-- #24325 (feature that regressed the behavior)
-- #24973 (partial fix, pool starvation)
-- #25031 (partial fix, handler cleanup + deleted-workspace filter)
-- #25155 (bug report)
-- 1. Soft-delete all agents on workspaces that are themselves deleted.
UPDATE workspace_agents
SET deleted = TRUE
WHERE id IN (
SELECT wa.id
FROM workspace_agents wa
JOIN workspace_resources wr ON wr.id = wa.resource_id
JOIN workspace_builds wb ON wb.job_id = wr.job_id
JOIN workspaces w ON w.id = wb.workspace_id
WHERE wa.deleted = FALSE
AND w.deleted = TRUE
);
-- 2. For every live workspace, soft-delete agents not tied to the latest build.
WITH latest_builds AS (
SELECT DISTINCT ON (workspace_id) id, workspace_id
FROM workspace_builds
ORDER BY workspace_id, build_number DESC
)
UPDATE workspace_agents
SET deleted = TRUE
WHERE id IN (
SELECT wa.id
FROM workspace_agents wa
JOIN workspace_resources wr ON wr.id = wa.resource_id
JOIN workspace_builds wb ON wb.job_id = wr.job_id
JOIN workspaces w ON w.id = wb.workspace_id
LEFT JOIN latest_builds lb ON lb.workspace_id = wb.workspace_id
WHERE wa.deleted = FALSE
AND w.deleted = FALSE
AND (lb.id IS NULL OR wb.id <> lb.id)
);
+211
View File
@@ -1185,3 +1185,214 @@ func TestMigration000475AgentsAccessOrgRole(t *testing.T) {
"trigger should only create org-member and org-service-account system roles",
)
}
func TestMigration000498SoftDeleteStaleWorkspaceAgents(t *testing.T) {
t.Parallel()
const migrationVersion = 498
sqlDB := testSQLDB(t)
// Step up to migrationVersion - 1.
next, err := migrations.Stepper(sqlDB)
require.NoError(t, err)
for {
version, more, err := next()
require.NoError(t, err)
if !more {
t.Fatalf("migration %d not found", migrationVersion)
}
if version == migrationVersion-1 {
break
}
}
ctx := testutil.Context(t, testutil.WaitSuperLong)
now := time.Now().UTC().Truncate(time.Microsecond)
// Seed the prerequisite tables. Two workspaces share the same EC2-style
// instance id across several builds; a third workspace has a single
// build on a different instance (baseline, must not be affected).
userID := uuid.New()
orgID := uuid.New()
templateID := uuid.New()
templateVersionID := uuid.New()
fileID := uuid.New()
wsA := uuid.New()
wsB := uuid.New()
wsSingle := uuid.New()
wsDeleted := uuid.New()
instanceAB := "i-shared-ab"
instanceSingle := "i-solo"
instanceDeleted := "i-deleted"
// For workspace A: 3 builds on the same instance.
// For workspace B: 2 builds on the same instance (different workspace,
// same instance id, exercises the cross-workspace scoping case).
// For wsSingle: 1 build, should stay non-deleted after the backfill.
// For wsDeleted: 1 build on a soft-deleted workspace. Agent should be
// marked deleted even though it's on the latest build.
type build struct {
id uuid.UUID
jobID uuid.UUID
resourceID uuid.UUID
agentID uuid.UUID
buildNum int32
wsID uuid.UUID
instanceID string
}
mkBuild := func(ws uuid.UUID, buildNum int32, instance string) build {
return build{
id: uuid.New(),
jobID: uuid.New(),
resourceID: uuid.New(),
agentID: uuid.New(),
buildNum: buildNum,
wsID: ws,
instanceID: instance,
}
}
aBuilds := []build{
mkBuild(wsA, 1, instanceAB),
mkBuild(wsA, 2, instanceAB),
mkBuild(wsA, 3, instanceAB),
}
bBuilds := []build{
mkBuild(wsB, 1, instanceAB),
mkBuild(wsB, 2, instanceAB),
}
singleBuilds := []build{
mkBuild(wsSingle, 1, instanceSingle),
}
deletedBuilds := []build{
mkBuild(wsDeleted, 1, instanceDeleted),
}
allBuilds := append(append(append(append([]build{}, aBuilds...), bBuilds...), singleBuilds...), deletedBuilds...)
tx, err := sqlDB.BeginTx(ctx, nil)
require.NoError(t, err)
defer tx.Rollback()
// Minimal user / org / template / template_version / file.
_, err = tx.ExecContext(ctx,
`INSERT INTO users (id, username, email, hashed_password, created_at, updated_at, status, rbac_roles, login_type)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
userID, "seed", "seed@test.com", []byte{}, now, now, "active", pq.StringArray{}, "password",
)
require.NoError(t, err)
_, err = tx.ExecContext(ctx,
`INSERT INTO organizations (id, name, display_name, description, icon, created_at, updated_at, is_default)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
orgID, "seed-org", "Seed Org", "", "", now, now, false,
)
require.NoError(t, err)
_, err = tx.ExecContext(ctx,
`INSERT INTO files (id, hash, created_at, created_by, mimetype, data) VALUES ($1, $2, $3, $4, $5, $6)`,
fileID, "hash", now, userID, "application/octet-stream", []byte{},
)
require.NoError(t, err)
_, err = tx.ExecContext(ctx,
`INSERT INTO templates (id, created_at, updated_at, organization_id, name, provisioner, active_version_id, description, created_by, group_acl, user_acl, display_name)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)`,
templateID, now, now, orgID, "tpl", "echo", templateVersionID, "", userID, "{}", "{}", "",
)
require.NoError(t, err)
_, err = tx.ExecContext(ctx,
`INSERT INTO template_versions (id, template_id, organization_id, created_at, updated_at, name, readme, job_id, created_by, message)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`,
templateVersionID, templateID, orgID, now, now, "v", "", uuid.New(), userID, "",
)
require.NoError(t, err)
for _, ws := range []uuid.UUID{wsA, wsB, wsSingle} {
_, err = tx.ExecContext(ctx,
`INSERT INTO workspaces (id, created_at, updated_at, owner_id, organization_id, template_id, name, deleted, automatic_updates)
VALUES ($1, $2, $3, $4, $5, $6, $7, false, 'never')`,
ws, now, now, userID, orgID, templateID, "ws-"+ws.String()[:8],
)
require.NoError(t, err)
}
// wsDeleted is a soft-deleted workspace. Its agent is on the latest
// build but must still be soft-deleted by the migration.
_, err = tx.ExecContext(ctx,
`INSERT INTO workspaces (id, created_at, updated_at, owner_id, organization_id, template_id, name, deleted, automatic_updates)
VALUES ($1, $2, $3, $4, $5, $6, $7, true, 'never')`,
wsDeleted, now, now, userID, orgID, templateID, "ws-"+wsDeleted.String()[:8],
)
require.NoError(t, err)
// For every build: provisioner_job -> workspace_build -> workspace_resource -> workspace_agent.
for _, b := range allBuilds {
_, err = tx.ExecContext(ctx,
`INSERT INTO provisioner_jobs (id, created_at, updated_at, organization_id, initiator_id, provisioner, storage_method, type, input, file_id)
VALUES ($1, $2, $3, $4, $5, 'echo', 'file', 'workspace_build', '{}', $6)`,
b.jobID, now, now, orgID, userID, fileID,
)
require.NoError(t, err)
_, err = tx.ExecContext(ctx,
`INSERT INTO workspace_builds (id, created_at, updated_at, workspace_id, template_version_id, build_number, transition, initiator_id, job_id, reason)
VALUES ($1, $2, $3, $4, $5, $6, 'start', $7, $8, 'initiator')`,
b.id, now, now, b.wsID, templateVersionID, b.buildNum, userID, b.jobID,
)
require.NoError(t, err)
_, err = tx.ExecContext(ctx,
`INSERT INTO workspace_resources (id, created_at, job_id, transition, type, name)
VALUES ($1, $2, $3, 'start', 'aws_instance', 'dev')`,
b.resourceID, now, b.jobID,
)
require.NoError(t, err)
_, err = tx.ExecContext(ctx,
`INSERT INTO workspace_agents (id, created_at, updated_at, name, resource_id, auth_token, auth_instance_id, architecture, operating_system, deleted)
VALUES ($1, $2, $3, 'main', $4, $5, $6, 'amd64', 'linux', false)`,
b.agentID, now, now, b.resourceID, uuid.New(), b.instanceID,
)
require.NoError(t, err)
}
require.NoError(t, tx.Commit())
// Sanity check pre-migration: all agents should be deleted=false.
var preDeletedCount int
err = sqlDB.QueryRowContext(ctx,
`SELECT COUNT(*) FROM workspace_agents WHERE deleted = true`).Scan(&preDeletedCount)
require.NoError(t, err)
require.Equal(t, 0, preDeletedCount, "no agents should be deleted pre-migration")
// Run migration 491.
version, more, err := next()
require.NoError(t, err)
require.True(t, more)
require.EqualValues(t, migrationVersion, version)
// Backfill assertions:
// wsA: builds 1,2,3 → keep agent for build 3, delete for 1 and 2.
// wsB: builds 1,2 → keep agent for build 2, delete for 1.
// wsSingle: 1 build → keep.
// Per workspace, exactly one agent remains deleted=false.
check := func(label string, expectDeleted bool, agent uuid.UUID) {
var deleted bool
err := sqlDB.QueryRowContext(ctx,
`SELECT deleted FROM workspace_agents WHERE id = $1`, agent).Scan(&deleted)
require.NoError(t, err, label)
require.Equal(t, expectDeleted, deleted, label)
}
check("wsA build 1 (old) should be deleted", true, aBuilds[0].agentID)
check("wsA build 2 (old) should be deleted", true, aBuilds[1].agentID)
check("wsA build 3 (latest) should be kept", false, aBuilds[2].agentID)
check("wsB build 1 (old) should be deleted", true, bBuilds[0].agentID)
check("wsB build 2 (latest) should be kept", false, bBuilds[1].agentID)
check("wsSingle build 1 (solo latest) should be kept", false, singleBuilds[0].agentID)
check("wsDeleted: agent on deleted workspace should be soft-deleted even though it's the latest build",
true, deletedBuilds[0].agentID)
// The ongoing invariants are enforced by wsbuilder.Builder.Build and
// provisionerdserver.CompleteJob via SoftDeletePriorWorkspaceAgents and
// SoftDeleteWorkspaceAgentsByWorkspaceID. Those paths are covered by
// the querier tests TestSoftDeletePriorWorkspaceAgents and
// TestSoftDeleteWorkspaceAgentsByWorkspaceID, plus integration tests
// under coderd/coderd_test.go; not retested here.
}
+12
View File
@@ -1077,6 +1077,18 @@ type sqlcQuerier interface {
SoftDeleteChatMessageByID(ctx context.Context, id int64) error
SoftDeleteChatMessagesAfterID(ctx context.Context, arg SoftDeleteChatMessagesAfterIDParams) error
SoftDeleteContextFileMessages(ctx context.Context, chatID uuid.UUID) error
// Marks agents from all prior builds of this workspace as deleted,
// preserving only agents belonging to @current_build_id. Called from
// provisionerdserver when a workspace build completes, after the new
// build's agents have been inserted, so running agents are not
// deleted while a build is still queued or provisioning.
SoftDeletePriorWorkspaceAgents(ctx context.Context, arg SoftDeletePriorWorkspaceAgentsParams) error
// Marks every non-deleted agent belonging to the given workspace as
// deleted. Called alongside UpdateWorkspaceDeletedByID when a workspace
// itself is soft-deleted, so the agent instance-identity auth path
// (which filters on workspace_agents.deleted) doesn't keep seeing
// orphaned rows.
SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error
// Overrides updated_at on the parent run without touching any
// other column. Used by tests that need to stamp a run with a
// specific timestamp after the InsertChatDebugStep CTE has
+227
View File
@@ -13765,3 +13765,230 @@ func TestChatHasUnread(t *testing.T) {
insertMsg(database.ChatMessageRoleUser, "user msg")
require.False(t, getHasUnread(), "user messages should not trigger unread")
}
// TestSoftDeletePriorWorkspaceAgents verifies the invariant maintained by
// wsbuilder.Builder.Build: when a new build of a workspace is created, all
// agents belonging to prior builds of that same workspace are soft-deleted,
// and agents belonging to *other* workspaces are untouched.
func TestSoftDeletePriorWorkspaceAgents(t *testing.T) {
t.Parallel()
db, _, sqlDB := dbtestutil.NewDBWithSQLDB(t)
ctx := testutil.Context(t, testutil.WaitShort)
// Helper: create a workspace + one build + its agent. Returns the IDs we
// need to assert on. The agent uses the shared EC2-style auth_instance_id
// so we can prove per-workspace scoping.
type buildBundle struct {
workspaceID uuid.UUID
buildID uuid.UUID
agentID uuid.UUID
}
user := dbgen.User(t, db, database.User{})
org := dbgen.Organization(t, db, database.Organization{})
tpl := dbgen.Template(t, db, database.Template{
OrganizationID: org.ID,
CreatedBy: user.ID,
})
tplVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
OrganizationID: org.ID,
CreatedBy: user.ID,
})
newBuild := func(t *testing.T, wsID uuid.UUID, buildNumber int32, instanceID string) buildBundle {
t.Helper()
job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
OrganizationID: org.ID,
Type: database.ProvisionerJobTypeWorkspaceBuild,
})
build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
WorkspaceID: wsID,
JobID: job.ID,
TemplateVersionID: tplVersion.ID,
BuildNumber: buildNumber,
Transition: database.WorkspaceTransitionStart,
})
resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: job.ID})
agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resource.ID,
AuthInstanceID: sql.NullString{String: instanceID, Valid: true},
})
return buildBundle{workspaceID: wsID, buildID: build.ID, agentID: agent.ID}
}
// Read `deleted` via raw SQL. GetWorkspaceAgentByID filters deleted rows
// out, which is exactly what we want to observe here.
agentDeleted := func(id uuid.UUID) bool {
t.Helper()
var deleted bool
err := sqlDB.QueryRowContext(ctx,
`SELECT deleted FROM workspace_agents WHERE id = $1`, id).Scan(&deleted)
require.NoError(t, err)
return deleted
}
// Two workspaces share a single EC2 instance ID across their lifetimes.
wsA := dbgen.Workspace(t, db, database.WorkspaceTable{
OrganizationID: org.ID,
TemplateID: tpl.ID,
OwnerID: user.ID,
}).ID
wsB := dbgen.Workspace(t, db, database.WorkspaceTable{
OrganizationID: org.ID,
TemplateID: tpl.ID,
OwnerID: user.ID,
}).ID
instance := "i-shared"
a1 := newBuild(t, wsA, 1, instance)
a2 := newBuild(t, wsA, 2, instance)
a3 := newBuild(t, wsA, 3, instance)
b1 := newBuild(t, wsB, 1, instance)
b2 := newBuild(t, wsB, 2, instance)
// Sanity check: all agents start non-deleted.
require.False(t, agentDeleted(a1.agentID))
require.False(t, agentDeleted(a2.agentID))
require.False(t, agentDeleted(a3.agentID))
require.False(t, agentDeleted(b1.agentID))
require.False(t, agentDeleted(b2.agentID))
// Run: "wsA's current build is a3; soft-delete all other wsA agents."
err := db.SoftDeletePriorWorkspaceAgents(ctx, database.SoftDeletePriorWorkspaceAgentsParams{
WorkspaceID: wsA,
CurrentBuildID: a3.buildID,
})
require.NoError(t, err)
assert.True(t, agentDeleted(a1.agentID), "wsA build 1 agent should be soft-deleted")
assert.True(t, agentDeleted(a2.agentID), "wsA build 2 agent should be soft-deleted")
assert.False(t, agentDeleted(a3.agentID), "wsA current build's agent must stay")
assert.False(t, agentDeleted(b1.agentID), "wsB build 1 agent must not be touched")
assert.False(t, agentDeleted(b2.agentID), "wsB build 2 agent must not be touched")
// Idempotency: re-running with the same params is a no-op.
err = db.SoftDeletePriorWorkspaceAgents(ctx, database.SoftDeletePriorWorkspaceAgentsParams{
WorkspaceID: wsA,
CurrentBuildID: a3.buildID,
})
require.NoError(t, err)
assert.False(t, agentDeleted(a3.agentID))
// Now age wsB: new current build is b2; b1's agent should flip.
err = db.SoftDeletePriorWorkspaceAgents(ctx, database.SoftDeletePriorWorkspaceAgentsParams{
WorkspaceID: wsB,
CurrentBuildID: b2.buildID,
})
require.NoError(t, err)
assert.True(t, agentDeleted(b1.agentID))
assert.False(t, agentDeleted(b2.agentID))
}
// TestSoftDeleteWorkspaceAgentsByWorkspaceID verifies the delete-path
// invariant: when a workspace is soft-deleted, every one of its agents
// (across all builds) gets soft-deleted in the same transaction. Agents on
// *other* workspaces, even ones sharing an auth_instance_id, must be
// untouched.
func TestSoftDeleteWorkspaceAgentsByWorkspaceID(t *testing.T) {
t.Parallel()
db, _, sqlDB := dbtestutil.NewDBWithSQLDB(t)
ctx := testutil.Context(t, testutil.WaitShort)
type buildBundle struct {
workspaceID uuid.UUID
buildID uuid.UUID
agentID uuid.UUID
}
user := dbgen.User(t, db, database.User{})
org := dbgen.Organization(t, db, database.Organization{})
tpl := dbgen.Template(t, db, database.Template{
OrganizationID: org.ID,
CreatedBy: user.ID,
})
tplVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
OrganizationID: org.ID,
CreatedBy: user.ID,
})
newBuild := func(t *testing.T, wsID uuid.UUID, buildNumber int32, instanceID string) buildBundle {
t.Helper()
job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
OrganizationID: org.ID,
Type: database.ProvisionerJobTypeWorkspaceBuild,
})
build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
WorkspaceID: wsID,
JobID: job.ID,
TemplateVersionID: tplVersion.ID,
BuildNumber: buildNumber,
Transition: database.WorkspaceTransitionStart,
})
resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: job.ID})
agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resource.ID,
AuthInstanceID: sql.NullString{String: instanceID, Valid: true},
})
return buildBundle{workspaceID: wsID, buildID: build.ID, agentID: agent.ID}
}
agentDeleted := func(id uuid.UUID) bool {
t.Helper()
var deleted bool
err := sqlDB.QueryRowContext(ctx,
`SELECT deleted FROM workspace_agents WHERE id = $1`, id).Scan(&deleted)
require.NoError(t, err)
return deleted
}
// wsA: 3 builds (so multiple agents to sweep on delete).
// wsB: 1 build, same auth_instance_id as wsA (proves scoping).
wsA := dbgen.Workspace(t, db, database.WorkspaceTable{
OrganizationID: org.ID,
TemplateID: tpl.ID,
OwnerID: user.ID,
}).ID
wsB := dbgen.Workspace(t, db, database.WorkspaceTable{
OrganizationID: org.ID,
TemplateID: tpl.ID,
OwnerID: user.ID,
}).ID
instance := "i-shared"
a1 := newBuild(t, wsA, 1, instance)
a2 := newBuild(t, wsA, 2, instance)
a3 := newBuild(t, wsA, 3, instance)
b1 := newBuild(t, wsB, 1, instance)
// Sanity: all 4 agents start non-deleted.
for _, id := range []uuid.UUID{a1.agentID, a2.agentID, a3.agentID, b1.agentID} {
require.False(t, agentDeleted(id))
}
err := db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, wsA)
require.NoError(t, err)
// All wsA agents flipped; wsB's agent untouched.
assert.True(t, agentDeleted(a1.agentID), "wsA build 1 agent")
assert.True(t, agentDeleted(a2.agentID), "wsA build 2 agent")
assert.True(t, agentDeleted(a3.agentID), "wsA build 3 agent")
assert.False(t, agentDeleted(b1.agentID), "wsB agent must not be affected")
// Idempotency: re-running is a no-op.
err = db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, wsA)
require.NoError(t, err)
assert.False(t, agentDeleted(b1.agentID))
// Calling on an empty workspace (no agents) is a no-op and does not error.
wsEmpty := dbgen.Workspace(t, db, database.WorkspaceTable{
OrganizationID: org.ID,
TemplateID: tpl.ID,
OwnerID: user.ID,
}).ID
err = db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, wsEmpty)
require.NoError(t, err)
}
+52
View File
@@ -30641,6 +30641,58 @@ func (q *sqlQuerier) InsertWorkspaceAgentScriptTimings(ctx context.Context, arg
return i, err
}
const softDeletePriorWorkspaceAgents = `-- name: SoftDeletePriorWorkspaceAgents :exec
UPDATE workspace_agents
SET deleted = TRUE
WHERE id IN (
SELECT wa.id
FROM workspace_agents wa
JOIN workspace_resources wr ON wr.id = wa.resource_id
JOIN workspace_builds wb ON wb.job_id = wr.job_id
WHERE wb.workspace_id = $1
AND wb.id <> $2
AND wa.deleted = FALSE
)
`
type SoftDeletePriorWorkspaceAgentsParams struct {
WorkspaceID uuid.UUID `db:"workspace_id" json:"workspace_id"`
CurrentBuildID uuid.UUID `db:"current_build_id" json:"current_build_id"`
}
// Marks agents from all prior builds of this workspace as deleted,
// preserving only agents belonging to @current_build_id. Called from
// provisionerdserver when a workspace build completes, after the new
// build's agents have been inserted, so running agents are not
// deleted while a build is still queued or provisioning.
func (q *sqlQuerier) SoftDeletePriorWorkspaceAgents(ctx context.Context, arg SoftDeletePriorWorkspaceAgentsParams) error {
_, err := q.db.ExecContext(ctx, softDeletePriorWorkspaceAgents, arg.WorkspaceID, arg.CurrentBuildID)
return err
}
const softDeleteWorkspaceAgentsByWorkspaceID = `-- name: SoftDeleteWorkspaceAgentsByWorkspaceID :exec
UPDATE workspace_agents
SET deleted = TRUE
WHERE id IN (
SELECT wa.id
FROM workspace_agents wa
JOIN workspace_resources wr ON wr.id = wa.resource_id
JOIN workspace_builds wb ON wb.job_id = wr.job_id
WHERE wb.workspace_id = $1
AND wa.deleted = FALSE
)
`
// Marks every non-deleted agent belonging to the given workspace as
// deleted. Called alongside UpdateWorkspaceDeletedByID when a workspace
// itself is soft-deleted, so the agent instance-identity auth path
// (which filters on workspace_agents.deleted) doesn't keep seeing
// orphaned rows.
func (q *sqlQuerier) SoftDeleteWorkspaceAgentsByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) error {
_, err := q.db.ExecContext(ctx, softDeleteWorkspaceAgentsByWorkspaceID, workspaceID)
return err
}
const updateWorkspaceAgentConnectionByID = `-- name: UpdateWorkspaceAgentConnectionByID :exec
UPDATE
workspace_agents
@@ -517,3 +517,38 @@ WHERE
AND workspaces.deleted = FALSE
AND users.deleted = FALSE
LIMIT 1;
-- name: SoftDeletePriorWorkspaceAgents :exec
-- Marks agents from all prior builds of this workspace as deleted,
-- preserving only agents belonging to @current_build_id. Called from
-- provisionerdserver when a workspace build completes, after the new
-- build's agents have been inserted, so running agents are not
-- deleted while a build is still queued or provisioning.
UPDATE workspace_agents
SET deleted = TRUE
WHERE id IN (
SELECT wa.id
FROM workspace_agents wa
JOIN workspace_resources wr ON wr.id = wa.resource_id
JOIN workspace_builds wb ON wb.job_id = wr.job_id
WHERE wb.workspace_id = @workspace_id
AND wb.id <> @current_build_id
AND wa.deleted = FALSE
);
-- name: SoftDeleteWorkspaceAgentsByWorkspaceID :exec
-- Marks every non-deleted agent belonging to the given workspace as
-- deleted. Called alongside UpdateWorkspaceDeletedByID when a workspace
-- itself is soft-deleted, so the agent instance-identity auth path
-- (which filters on workspace_agents.deleted) doesn't keep seeing
-- orphaned rows.
UPDATE workspace_agents
SET deleted = TRUE
WHERE id IN (
SELECT wa.id
FROM workspace_agents wa
JOIN workspace_resources wr ON wr.id = wa.resource_id
JOIN workspace_builds wb ON wb.job_id = wr.job_id
WHERE wb.workspace_id = @workspace_id
AND wa.deleted = FALSE
);
@@ -2143,6 +2143,20 @@ func (s *server) completeWorkspaceBuildJob(ctx context.Context, job database.Pro
return xerrors.Errorf("insert provisioner job: %w", err)
}
}
// Soft-delete agents from prior builds now that this build's
// agents have been inserted. Waiting until completion (rather
// than build creation) avoids bricking running workspaces
// whose agents would otherwise be deleted while the new build
// is still queued or provisioning. See #25155.
err = db.SoftDeletePriorWorkspaceAgents(ctx, database.SoftDeletePriorWorkspaceAgentsParams{
WorkspaceID: workspaceBuild.WorkspaceID,
CurrentBuildID: workspaceBuild.ID,
})
if err != nil {
return xerrors.Errorf("soft delete prior workspace agents: %w", err)
}
for _, module := range jobType.WorkspaceBuild.Modules {
if err := InsertWorkspaceModule(ctx, db, job.ID, workspaceBuild.Transition, module, telemetrySnapshot); err != nil {
return xerrors.Errorf("insert provisioner job module: %w", err)
@@ -2391,6 +2405,14 @@ func (s *server) completeWorkspaceBuildJob(ctx context.Context, job database.Pro
return xerrors.Errorf("update workspace deleted: %w", err)
}
// Soft-delete any agents tied to this workspace so the
// aws-instance-identity handler (which filters on
// workspace_agents.deleted) doesn't keep seeing orphaned rows
// after the workspace itself is deleted. See #25155.
if err := db.SoftDeleteWorkspaceAgentsByWorkspaceID(ctx, workspaceBuild.WorkspaceID); err != nil {
return xerrors.Errorf("soft delete workspace agents: %w", err)
}
// A user might delete their task workspace directly, instead of
// deleting the task. To avoid leaving the Task in a scenario where
// it has no workspace, we also attempt to delete the task.
+4 -2
View File
@@ -131,8 +131,10 @@ func TestPostWorkspaceAuthAWSInstanceIdentity(t *testing.T) {
err := agentClient.RefreshToken(ctx)
var apiErr *codersdk.Error
require.ErrorAs(t, err, &apiErr)
require.Equal(t, http.StatusBadRequest, apiErr.StatusCode())
require.Contains(t, apiErr.Message, "isn't registered on the latest history")
// The prior build's agent is soft-deleted when the successor
// build completes (SoftDeletePriorWorkspaceAgents), so the
// auth query finds no candidates at all and returns 404.
require.Equal(t, http.StatusNotFound, apiErr.StatusCode())
})
t.Run("Ambiguous/MultipleAgentsNoSelector", func(t *testing.T) {
+9
View File
@@ -615,6 +615,15 @@ func (b *Builder) buildTx(authFunc func(action policy.Action, object rbac.Object
}); err != nil {
return BuildError{http.StatusInternalServerError, "mark workspace as deleted", err}
}
// Soft-delete any agents tied to this workspace so the
// aws-instance-identity handler doesn't keep seeing
// orphaned rows. Mirrors the path in
// provisionerdserver.CompleteJob. See #25155.
//nolint:gocritic // System-restricted: bookkeeping inside an already-authorized delete transaction.
if err := store.SoftDeleteWorkspaceAgentsByWorkspaceID(dbauthz.AsSystemRestricted(b.ctx), b.workspace.ID); err != nil {
return BuildError{http.StatusInternalServerError, "soft-delete workspace agents on orphan delete", err}
}
}
return nil
+6 -1
View File
@@ -1513,7 +1513,9 @@ func expectUpdateProvisionerJobWithCompleteWithStartedAtByID(assertions func(par
}
// expectUpdateWorkspaceDeletedByID asserts a call to UpdateWorkspaceDeletedByID
// and runs the provided assertions against it.
// and runs the provided assertions against it. It also expects the follow-up
// SoftDeleteWorkspaceAgentsByWorkspaceID call that wsbuilder.Builder.Build now
// issues inside the same orphan-delete transaction.
func expectUpdateWorkspaceDeletedByID(assertions func(params database.UpdateWorkspaceDeletedByIDParams)) func(mTx *dbmock.MockStore) {
return func(mTx *dbmock.MockStore) {
mTx.EXPECT().UpdateWorkspaceDeletedByID(gomock.Any(), gomock.Any()).
@@ -1524,6 +1526,9 @@ func expectUpdateWorkspaceDeletedByID(assertions func(params database.UpdateWork
return nil
},
)
mTx.EXPECT().SoftDeleteWorkspaceAgentsByWorkspaceID(gomock.Any(), gomock.Any()).
Times(1).
Return(nil)
}
}
+2
View File
@@ -741,8 +741,10 @@ func TestTools(t *testing.T) {
})
t.Run("GetWorkspaceAgentLogs", func(t *testing.T) {
_ = testutil.Context(t, testutil.WaitShort)
tb, err := toolsdk.NewDeps(memberClient)
require.NoError(t, err)
logs, err := testTool(t, toolsdk.GetWorkspaceAgentLogs, tb, toolsdk.GetWorkspaceAgentLogsArgs{
WorkspaceAgentID: agentID.String(),
})