From 50ff06cc3c1845f5aecd260d743be5cb0a0f4431 Mon Sep 17 00:00:00 2001 From: Danielle Maywood Date: Fri, 13 Dec 2024 16:59:27 +0000 Subject: [PATCH] chore: acquire lock for individual workspace transition (#15859) When Coder is ran in High Availability mode, each Coder instance has a lifecycle executor. These lifecycle executors are all trying to do the same work, and whilst transactions saves us from this causing an issue, we are still doing extra work that could be prevented. This PR adds a `TryAcquireLock` call for each attempted workspace transition, meaning two Coder instances shouldn't duplicate effort. --- coderd/autobuild/lifecycle_executor.go | 12 +++- coderd/autobuild/lifecycle_executor_test.go | 71 +++++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/coderd/autobuild/lifecycle_executor.go b/coderd/autobuild/lifecycle_executor.go index 5e6f9ae2f0..cc4e48b435 100644 --- a/coderd/autobuild/lifecycle_executor.go +++ b/coderd/autobuild/lifecycle_executor.go @@ -3,6 +3,7 @@ package autobuild import ( "context" "database/sql" + "fmt" "net/http" "sync" "sync/atomic" @@ -177,6 +178,15 @@ func (e *Executor) runOnce(t time.Time) Stats { err := e.db.InTx(func(tx database.Store) error { var err error + ok, err := tx.TryAcquireLock(e.ctx, database.GenLockID(fmt.Sprintf("lifecycle-executor:%s", wsID))) + if err != nil { + return xerrors.Errorf("try acquire lifecycle executor lock: %w", err) + } + if !ok { + log.Debug(e.ctx, "unable to acquire lock for workspace, skipping") + return nil + } + // Re-check eligibility since the first check was outside the // transaction and the workspace settings may have changed. ws, err = tx.GetWorkspaceByID(e.ctx, wsID) @@ -389,7 +399,7 @@ func (e *Executor) runOnce(t time.Time) Stats { } return nil }() - if err != nil { + if err != nil && !xerrors.Is(err, context.Canceled) { log.Error(e.ctx, "failed to transition workspace", slog.Error(err)) statsMu.Lock() stats.Errors[wsID] = err diff --git a/coderd/autobuild/lifecycle_executor_test.go b/coderd/autobuild/lifecycle_executor_test.go index 794d99f778..62bb8b2fd2 100644 --- a/coderd/autobuild/lifecycle_executor_test.go +++ b/coderd/autobuild/lifecycle_executor_test.go @@ -18,6 +18,7 @@ import ( "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" + "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/notifications/notificationstest" "github.com/coder/coder/v2/coderd/schedule" @@ -72,6 +73,76 @@ func TestExecutorAutostartOK(t *testing.T) { require.Equal(t, template.AutostartRequirement.DaysOfWeek, []string{"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"}) } +func TestMultipleLifecycleExecutors(t *testing.T) { + t.Parallel() + + db, ps := dbtestutil.NewDB(t) + + var ( + sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *") + // Create our first client + tickCh = make(chan time.Time, 2) + statsChA = make(chan autobuild.Stats) + clientA = coderdtest.New(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + AutobuildTicker: tickCh, + AutobuildStats: statsChA, + Database: db, + Pubsub: ps, + }) + // ... And then our second client + statsChB = make(chan autobuild.Stats) + _ = coderdtest.New(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + AutobuildTicker: tickCh, + AutobuildStats: statsChB, + Database: db, + Pubsub: ps, + }) + // Now create a workspace (we can use either client, it doesn't matter) + workspace = mustProvisionWorkspace(t, clientA, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.AutostartSchedule = ptr.Ref(sched.String()) + }) + ) + + // Have the workspace stopped so we can perform an autostart + workspace = coderdtest.MustTransitionWorkspace(t, clientA, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop) + + // Get both clients to perform a lifecycle execution tick + next := sched.Next(workspace.LatestBuild.CreatedAt) + + startCh := make(chan struct{}) + go func() { + <-startCh + tickCh <- next + }() + go func() { + <-startCh + tickCh <- next + }() + close(startCh) + + // Now we want to check the stats for both clients + statsA := <-statsChA + statsB := <-statsChB + + // We expect there to be no errors + assert.Len(t, statsA.Errors, 0) + assert.Len(t, statsB.Errors, 0) + + // We also expect there to have been only one transition + require.Equal(t, 1, len(statsA.Transitions)+len(statsB.Transitions)) + + stats := statsA + if len(statsB.Transitions) == 1 { + stats = statsB + } + + // And we expect this transition to have been a start transition + assert.Contains(t, stats.Transitions, workspace.ID) + assert.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID]) +} + func TestExecutorAutostartTemplateUpdated(t *testing.T) { t.Parallel()