mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
fix: prevent db deadlock when workspaces go dormant (#10618)
This commit is contained in:
@@ -140,42 +140,45 @@ func (e *Executor) runOnce(t time.Time) Stats {
|
||||
|
||||
eg.Go(func() error {
|
||||
var job *database.ProvisionerJob
|
||||
var auditLog *auditParams
|
||||
err := e.db.InTx(func(tx database.Store) error {
|
||||
// Re-check eligibility since the first check was outside the
|
||||
// transaction and the workspace settings may have changed.
|
||||
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
|
||||
if err != nil {
|
||||
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
|
||||
return nil
|
||||
return xerrors.Errorf("get workspace by id: %w", err)
|
||||
}
|
||||
|
||||
// Determine the workspace state based on its latest build.
|
||||
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
|
||||
if err != nil {
|
||||
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
|
||||
return nil
|
||||
return xerrors.Errorf("get latest workspace build: %w", err)
|
||||
}
|
||||
|
||||
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get latest provisioner job: %w", err)
|
||||
}
|
||||
|
||||
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
|
||||
if err != nil {
|
||||
log.Warn(e.ctx, "get template schedule options", slog.Error(err))
|
||||
return nil
|
||||
return xerrors.Errorf("get template scheduling options: %w", err)
|
||||
}
|
||||
|
||||
template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID)
|
||||
if err != nil {
|
||||
log.Warn(e.ctx, "get template by id", slog.Error(err))
|
||||
return xerrors.Errorf("get template by ID: %w", err)
|
||||
}
|
||||
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)
|
||||
|
||||
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
|
||||
if err != nil {
|
||||
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)
|
||||
|
||||
nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick)
|
||||
if err != nil {
|
||||
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
|
||||
// err is used to indicate that a workspace is not eligible
|
||||
// so returning nil here is ok although ultimately the distinction
|
||||
// doesn't matter since the transaction is read-only up to
|
||||
// this point.
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -193,17 +196,16 @@ func (e *Executor) runOnce(t time.Time) Stats {
|
||||
}
|
||||
|
||||
build, job, err = builder.Build(e.ctx, tx, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"})
|
||||
|
||||
if err != nil {
|
||||
log.Error(e.ctx, "unable to transition workspace",
|
||||
slog.F("transition", nextTransition),
|
||||
slog.Error(err),
|
||||
)
|
||||
return nil
|
||||
return xerrors.Errorf("build workspace: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Transition the workspace to dormant if it has breached the template's
|
||||
// Transition the workspace to dormant if it has breached the template's
|
||||
// threshold for inactivity.
|
||||
if reason == database.BuildReasonAutolock {
|
||||
wsOld := ws
|
||||
@@ -215,21 +217,15 @@ func (e *Executor) runOnce(t time.Time) Stats {
|
||||
},
|
||||
})
|
||||
|
||||
auditBuild(e.ctx, e.log, *e.auditor.Load(), auditParams{
|
||||
Build: build,
|
||||
Job: latestJob,
|
||||
Reason: reason,
|
||||
Old: wsOld,
|
||||
New: ws,
|
||||
Success: err == nil,
|
||||
})
|
||||
|
||||
auditLog = &auditParams{
|
||||
Build: build,
|
||||
Job: latestJob,
|
||||
Reason: reason,
|
||||
Old: wsOld,
|
||||
New: ws,
|
||||
}
|
||||
if err != nil {
|
||||
log.Error(e.ctx, "unable to transition workspace to dormant",
|
||||
slog.F("transition", nextTransition),
|
||||
slog.Error(err),
|
||||
)
|
||||
return nil
|
||||
return xerrors.Errorf("update workspace dormant deleting at: %w", err)
|
||||
}
|
||||
|
||||
log.Info(e.ctx, "dormant workspace",
|
||||
@@ -267,6 +263,12 @@ func (e *Executor) runOnce(t time.Time) Stats {
|
||||
if err != nil {
|
||||
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
|
||||
}
|
||||
if auditLog != nil {
|
||||
// If the transition didn't succeed then updating the workspace
|
||||
// to indicate dormant didn't either.
|
||||
auditLog.Success = err == nil
|
||||
auditBuild(e.ctx, e.log, *e.auditor.Load(), *auditLog)
|
||||
}
|
||||
if job != nil && err == nil {
|
||||
// Note that we can't refactor such that posting the job happens inside wsbuilder because it's called
|
||||
// with an outer transaction like this, and we need to make sure the outer transaction commits before
|
||||
|
||||
@@ -10813,8 +10813,6 @@ SET
|
||||
FROM
|
||||
templates
|
||||
WHERE
|
||||
workspaces.template_id = templates.id
|
||||
AND
|
||||
workspaces.id = $1
|
||||
RETURNING workspaces.id, workspaces.created_at, workspaces.updated_at, workspaces.owner_id, workspaces.organization_id, workspaces.template_id, workspaces.deleted, workspaces.name, workspaces.autostart_schedule, workspaces.ttl, workspaces.last_used_at, workspaces.dormant_at, workspaces.deleting_at, workspaces.automatic_updates
|
||||
`
|
||||
|
||||
@@ -488,8 +488,6 @@ SET
|
||||
FROM
|
||||
templates
|
||||
WHERE
|
||||
workspaces.template_id = templates.id
|
||||
AND
|
||||
workspaces.id = $1
|
||||
RETURNING workspaces.*;
|
||||
|
||||
|
||||
@@ -16,11 +16,14 @@ import (
|
||||
"github.com/coder/coder/v2/coderd/autobuild"
|
||||
"github.com/coder/coder/v2/coderd/coderdtest"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
||||
"github.com/coder/coder/v2/coderd/rbac"
|
||||
agplschedule "github.com/coder/coder/v2/coderd/schedule"
|
||||
"github.com/coder/coder/v2/coderd/schedule/cron"
|
||||
"github.com/coder/coder/v2/coderd/util/ptr"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
entaudit "github.com/coder/coder/v2/enterprise/audit"
|
||||
"github.com/coder/coder/v2/enterprise/audit/backends"
|
||||
"github.com/coder/coder/v2/enterprise/coderd/coderdenttest"
|
||||
"github.com/coder/coder/v2/enterprise/coderd/license"
|
||||
"github.com/coder/coder/v2/enterprise/coderd/schedule"
|
||||
@@ -309,6 +312,84 @@ func TestWorkspaceAutobuild(t *testing.T) {
|
||||
require.True(t, ws.LastUsedAt.After(lastUsedAt))
|
||||
})
|
||||
|
||||
// This test serves as a regression prevention for generating
|
||||
// audit logs in the same transaction the transition workspaces to
|
||||
// the dormant state. The auditor that is passed to autobuild does
|
||||
// not use the transaction when inserting an audit log which can
|
||||
// cause a deadlock.
|
||||
t.Run("NoDeadlock", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if !dbtestutil.WillUsePostgres() {
|
||||
t.Skipf("Skipping non-postgres run")
|
||||
}
|
||||
|
||||
var (
|
||||
ticker = make(chan time.Time)
|
||||
statCh = make(chan autobuild.Stats)
|
||||
inactiveTTL = time.Minute
|
||||
)
|
||||
|
||||
const (
|
||||
maxConns = 3
|
||||
numWorkspaces = maxConns * 5
|
||||
)
|
||||
// This is a bit bizarre but necessary so that we can
|
||||
// initialize our coderd with a real auditor and limit DB connections
|
||||
// to simulate deadlock conditions.
|
||||
db, pubsub, sdb := dbtestutil.NewDBWithSQLDB(t)
|
||||
// Set MaxOpenConns so we can ensure we aren't inadvertently acquiring
|
||||
// another connection from within a transaction.
|
||||
sdb.SetMaxOpenConns(maxConns)
|
||||
auditor := entaudit.NewAuditor(db, entaudit.DefaultFilter, backends.NewPostgres(db, true))
|
||||
|
||||
client, user := coderdenttest.New(t, &coderdenttest.Options{
|
||||
Options: &coderdtest.Options{
|
||||
AutobuildTicker: ticker,
|
||||
AutobuildStats: statCh,
|
||||
TemplateScheduleStore: schedule.NewEnterpriseTemplateScheduleStore(agplUserQuietHoursScheduleStore()),
|
||||
Database: db,
|
||||
Pubsub: pubsub,
|
||||
Auditor: auditor,
|
||||
IncludeProvisionerDaemon: true,
|
||||
},
|
||||
LicenseOptions: &coderdenttest.LicenseOptions{
|
||||
Features: license.Features{codersdk.FeatureAdvancedTemplateScheduling: 1},
|
||||
},
|
||||
})
|
||||
|
||||
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
|
||||
Parse: echo.ParseComplete,
|
||||
ProvisionPlan: echo.PlanComplete,
|
||||
ProvisionApply: echo.ApplyComplete,
|
||||
})
|
||||
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) {
|
||||
ctr.TimeTilDormantMillis = ptr.Ref[int64](inactiveTTL.Milliseconds())
|
||||
})
|
||||
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
|
||||
|
||||
workspaces := make([]codersdk.Workspace, 0, numWorkspaces)
|
||||
for i := 0; i < numWorkspaces; i++ {
|
||||
ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
|
||||
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, ws.LatestBuild.ID)
|
||||
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)
|
||||
workspaces = append(workspaces, ws)
|
||||
}
|
||||
|
||||
// Simulate being inactive.
|
||||
ticker <- time.Now().Add(time.Hour)
|
||||
stats := <-statCh
|
||||
|
||||
// Expect workspace to transition to stopped state for breaching
|
||||
// failure TTL.
|
||||
require.Len(t, stats.Transitions, numWorkspaces)
|
||||
for _, ws := range workspaces {
|
||||
// The workspace should be dormant.
|
||||
ws = coderdtest.MustWorkspace(t, client, ws.ID)
|
||||
require.NotNil(t, ws.DormantAt)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("InactiveTTLTooEarly", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user