From 98d5e7948d88e5a4f0b906efe09e630c54b6d0e4 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Fri, 29 May 2026 17:12:31 +0300 Subject: [PATCH] fix(coderd/autobuild): handle concurrent build number race in lifecycle executor (#25824) The lifecycle executor did not handle unique-violation errors from InsertWorkspaceBuild. When a concurrent actor (API handler, another lifecycle executor, or prebuilds reconciler) inserts a workspace build with the same build number, PostgreSQL returns a unique constraint violation on workspace_builds_workspace_id_build_number_key. The lifecycle executor treated this as a hard error, logging it and storing it in stats.Errors. The per-workspace advisory lock (pg_try_advisory_xact_lock) prevents two lifecycle executors from racing, but does not protect against races with the CreateWorkspaceBuild API handler or the prebuilds reconciler, which use different (or no) locking. Catch the specific unique-violation error after InTx returns (where the transaction is already rolled back) and clear it. The concurrent actor's build takes effect; the lifecycle executor treats the workspace as a no-op for this tick. Closes coder/internal#455 Closes PLAT-290 --- coderd/autobuild/lifecycle_executor.go | 17 ++++ coderd/autobuild/lifecycle_executor_test.go | 88 +++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/coderd/autobuild/lifecycle_executor.go b/coderd/autobuild/lifecycle_executor.go index 84fff375e0..5a141ce8cf 100644 --- a/coderd/autobuild/lifecycle_executor.go +++ b/coderd/autobuild/lifecycle_executor.go @@ -422,6 +422,23 @@ func (e *Executor) runOnce(t time.Time) Stats { Isolation: sql.LevelRepeatableRead, TxIdentifier: "lifecycle", }) + // A concurrent build (e.g. from the API or another lifecycle + // executor) may have already inserted a build with the same + // number. This is a benign race; the other actor's build + // will take effect. Clear the error so downstream checks + // (audit, notification, stats) treat this as a no-op. + if database.IsUniqueViolation(err, database.UniqueWorkspaceBuildsWorkspaceIDBuildNumberKey) { + log.Info(e.ctx, "skipping workspace: concurrent build already inserted", slog.Error(err)) + err = nil + // Reset notification flags set before builder.Build. + // The build was rolled back, so this executor did not + // perform the transition. The concurrent actor handles + // both the build and any notifications. Without these + // resets, downstream code would send duplicate or + // incorrect notifications. + didAutoUpdate = false + shouldNotifyTaskPause = false + } if auditLog != nil { // If the transition didn't succeed then updating the workspace // to indicate dormant didn't either. diff --git a/coderd/autobuild/lifecycle_executor_test.go b/coderd/autobuild/lifecycle_executor_test.go index 345647977d..89805429b9 100644 --- a/coderd/autobuild/lifecycle_executor_test.go +++ b/coderd/autobuild/lifecycle_executor_test.go @@ -4,10 +4,12 @@ import ( "context" "database/sql" "errors" + "sync/atomic" "testing" "time" "github.com/google/uuid" + "github.com/lib/pq" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/goleak" @@ -160,6 +162,92 @@ func TestMultipleLifecycleExecutors(t *testing.T) { assert.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID]) } +// uniqueViolationStore wraps a database.Store and injects a unique violation +// error from InsertWorkspaceBuild after a configurable number of successful +// calls. This simulates a concurrent build race (e.g. an API-driven start +// racing with the lifecycle executor autostart). +type uniqueViolationStore struct { + database.Store + insertCount *atomic.Int32 // pointer: shared across InTx copies + failAfterN int32 +} + +func newUniqueViolationStore(db database.Store, failAfterN int32) *uniqueViolationStore { + return &uniqueViolationStore{ + Store: db, + insertCount: &atomic.Int32{}, + failAfterN: failAfterN, + } +} + +func (s *uniqueViolationStore) InTx(fn func(database.Store) error, opts *database.TxOptions) error { + return s.Store.InTx(func(tx database.Store) error { + return fn(&uniqueViolationStore{ + Store: tx, + insertCount: s.insertCount, // shared pointer + failAfterN: s.failAfterN, + }) + }, opts) +} + +func (s *uniqueViolationStore) InsertWorkspaceBuild(ctx context.Context, arg database.InsertWorkspaceBuildParams) error { + n := s.insertCount.Add(1) + if n > s.failAfterN { + return &pq.Error{ + Code: pq.ErrorCode("23505"), + Constraint: string(database.UniqueWorkspaceBuildsWorkspaceIDBuildNumberKey), + Message: `duplicate key value violates unique constraint "workspace_builds_workspace_id_build_number_key"`, + } + } + return s.Store.InsertWorkspaceBuild(ctx, arg) +} + +func TestExecutorBuildNumberRaceIsHandled(t *testing.T) { + t.Parallel() + + // The lifecycle executor must handle a unique-violation from + // InsertWorkspaceBuild gracefully. This error occurs when a concurrent + // actor (API handler, another executor, prebuilds reconciler) inserts a + // build with the same number before the executor's INSERT lands. + // + // We inject the error via a store wrapper. The first two + // InsertWorkspaceBuild calls succeed (setup builds), then the third + // (the lifecycle executor's autostart build) gets a unique violation. + + realDB, ps := dbtestutil.NewDB(t) + wrappedDB := newUniqueViolationStore(realDB, 2) // Allow builds 1 (start) and 2 (stop); fail build 3 (autostart) + + var ( + sched, _ = cron.Weekly("CRON_TZ=UTC 0 * * * *") + tickCh = make(chan time.Time) + statsCh = make(chan autobuild.Stats) + client = coderdtest.New(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + AutobuildTicker: tickCh, + AutobuildStats: statsCh, + Database: wrappedDB, + Pubsub: ps, + }) + workspace = mustProvisionWorkspace(t, client, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.AutostartSchedule = ptr.Ref(sched.String()) + }) + ) + + workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop) + + p, err := coderdtest.GetProvisionerForTags(realDB, time.Now(), workspace.OrganizationID, nil) + require.NoError(t, err) + next := sched.Next(workspace.LatestBuild.CreatedAt) + coderdtest.UpdateProvisionerLastSeenAt(t, realDB, p.ID, next) + + tickCh <- next + stats := <-statsCh + + // The lifecycle executor should treat the unique violation as a benign + // race, not as a hard error. + assert.Empty(t, stats.Errors, "lifecycle executor should not report unique-violation as error") +} + func TestExecutorAutostartTemplateUpdated(t *testing.T) { t.Parallel()