fix: prevent db deadlock when workspaces go dormant (#10618)

This commit is contained in:
Jon Ayers
2023-11-13 13:40:20 -06:00
committed by GitHub
parent 76e7a1d06b
commit 75ab16d19a
4 changed files with 113 additions and 34 deletions
+32 -30
View File
@@ -140,42 +140,45 @@ func (e *Executor) runOnce(t time.Time) Stats {
eg.Go(func() error {
var job *database.ProvisionerJob
var auditLog *auditParams
err := e.db.InTx(func(tx database.Store) error {
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
if err != nil {
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
return nil
return xerrors.Errorf("get workspace by id: %w", err)
}
// Determine the workspace state based on its latest build.
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
return nil
return xerrors.Errorf("get latest workspace build: %w", err)
}
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
if err != nil {
return xerrors.Errorf("get latest provisioner job: %w", err)
}
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
if err != nil {
log.Warn(e.ctx, "get template schedule options", slog.Error(err))
return nil
return xerrors.Errorf("get template scheduling options: %w", err)
}
template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID)
if err != nil {
log.Warn(e.ctx, "get template by id", slog.Error(err))
return xerrors.Errorf("get template by ID: %w", err)
}
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
if err != nil {
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
return nil
}
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)
nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
// err is used to indicate that a workspace is not eligible
// so returning nil here is ok although ultimately the distinction
// doesn't matter since the transaction is read-only up to
// this point.
return nil
}
@@ -193,17 +196,16 @@ func (e *Executor) runOnce(t time.Time) Stats {
}
build, job, err = builder.Build(e.ctx, tx, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"})
if err != nil {
log.Error(e.ctx, "unable to transition workspace",
slog.F("transition", nextTransition),
slog.Error(err),
)
return nil
return xerrors.Errorf("build workspace: %w", err)
}
}
// Transition the workspace to dormant if it has breached the template's
// Transition the workspace to dormant if it has breached the template's
// threshold for inactivity.
if reason == database.BuildReasonAutolock {
wsOld := ws
@@ -215,21 +217,15 @@ func (e *Executor) runOnce(t time.Time) Stats {
},
})
auditBuild(e.ctx, e.log, *e.auditor.Load(), auditParams{
Build: build,
Job: latestJob,
Reason: reason,
Old: wsOld,
New: ws,
Success: err == nil,
})
auditLog = &auditParams{
Build: build,
Job: latestJob,
Reason: reason,
Old: wsOld,
New: ws,
}
if err != nil {
log.Error(e.ctx, "unable to transition workspace to dormant",
slog.F("transition", nextTransition),
slog.Error(err),
)
return nil
return xerrors.Errorf("update workspace dormant deleting at: %w", err)
}
log.Info(e.ctx, "dormant workspace",
@@ -267,6 +263,12 @@ func (e *Executor) runOnce(t time.Time) Stats {
if err != nil {
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
}
if auditLog != nil {
// If the transition didn't succeed then updating the workspace
// to indicate dormant didn't either.
auditLog.Success = err == nil
auditBuild(e.ctx, e.log, *e.auditor.Load(), *auditLog)
}
if job != nil && err == nil {
// Note that we can't refactor such that posting the job happens inside wsbuilder because it's called
// with an outer transaction like this, and we need to make sure the outer transaction commits before
-2
View File
@@ -10813,8 +10813,6 @@ SET
FROM
templates
WHERE
workspaces.template_id = templates.id
AND
workspaces.id = $1
RETURNING workspaces.id, workspaces.created_at, workspaces.updated_at, workspaces.owner_id, workspaces.organization_id, workspaces.template_id, workspaces.deleted, workspaces.name, workspaces.autostart_schedule, workspaces.ttl, workspaces.last_used_at, workspaces.dormant_at, workspaces.deleting_at, workspaces.automatic_updates
`
-2
View File
@@ -488,8 +488,6 @@ SET
FROM
templates
WHERE
workspaces.template_id = templates.id
AND
workspaces.id = $1
RETURNING workspaces.*;
+81
View File
@@ -16,11 +16,14 @@ import (
"github.com/coder/coder/v2/coderd/autobuild"
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/rbac"
agplschedule "github.com/coder/coder/v2/coderd/schedule"
"github.com/coder/coder/v2/coderd/schedule/cron"
"github.com/coder/coder/v2/coderd/util/ptr"
"github.com/coder/coder/v2/codersdk"
entaudit "github.com/coder/coder/v2/enterprise/audit"
"github.com/coder/coder/v2/enterprise/audit/backends"
"github.com/coder/coder/v2/enterprise/coderd/coderdenttest"
"github.com/coder/coder/v2/enterprise/coderd/license"
"github.com/coder/coder/v2/enterprise/coderd/schedule"
@@ -309,6 +312,84 @@ func TestWorkspaceAutobuild(t *testing.T) {
require.True(t, ws.LastUsedAt.After(lastUsedAt))
})
// This test serves as a regression prevention for generating
// audit logs in the same transaction the transition workspaces to
// the dormant state. The auditor that is passed to autobuild does
// not use the transaction when inserting an audit log which can
// cause a deadlock.
t.Run("NoDeadlock", func(t *testing.T) {
t.Parallel()
if !dbtestutil.WillUsePostgres() {
t.Skipf("Skipping non-postgres run")
}
var (
ticker = make(chan time.Time)
statCh = make(chan autobuild.Stats)
inactiveTTL = time.Minute
)
const (
maxConns = 3
numWorkspaces = maxConns * 5
)
// This is a bit bizarre but necessary so that we can
// initialize our coderd with a real auditor and limit DB connections
// to simulate deadlock conditions.
db, pubsub, sdb := dbtestutil.NewDBWithSQLDB(t)
// Set MaxOpenConns so we can ensure we aren't inadvertently acquiring
// another connection from within a transaction.
sdb.SetMaxOpenConns(maxConns)
auditor := entaudit.NewAuditor(db, entaudit.DefaultFilter, backends.NewPostgres(db, true))
client, user := coderdenttest.New(t, &coderdenttest.Options{
Options: &coderdtest.Options{
AutobuildTicker: ticker,
AutobuildStats: statCh,
TemplateScheduleStore: schedule.NewEnterpriseTemplateScheduleStore(agplUserQuietHoursScheduleStore()),
Database: db,
Pubsub: pubsub,
Auditor: auditor,
IncludeProvisionerDaemon: true,
},
LicenseOptions: &coderdenttest.LicenseOptions{
Features: license.Features{codersdk.FeatureAdvancedTemplateScheduling: 1},
},
})
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.PlanComplete,
ProvisionApply: echo.ApplyComplete,
})
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) {
ctr.TimeTilDormantMillis = ptr.Ref[int64](inactiveTTL.Milliseconds())
})
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
workspaces := make([]codersdk.Workspace, 0, numWorkspaces)
for i := 0; i < numWorkspaces; i++ {
ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, ws.LatestBuild.ID)
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)
workspaces = append(workspaces, ws)
}
// Simulate being inactive.
ticker <- time.Now().Add(time.Hour)
stats := <-statCh
// Expect workspace to transition to stopped state for breaching
// failure TTL.
require.Len(t, stats.Transitions, numWorkspaces)
for _, ws := range workspaces {
// The workspace should be dormant.
ws = coderdtest.MustWorkspace(t, client, ws.ID)
require.NotNil(t, ws.DormantAt)
}
})
t.Run("InactiveTTLTooEarly", func(t *testing.T) {
t.Parallel()