diff --git a/coderd/autobuild/lifecycle_executor.go b/coderd/autobuild/lifecycle_executor.go index 84fff375e0..5a141ce8cf 100644 --- a/coderd/autobuild/lifecycle_executor.go +++ b/coderd/autobuild/lifecycle_executor.go @@ -422,6 +422,23 @@ func (e *Executor) runOnce(t time.Time) Stats { Isolation: sql.LevelRepeatableRead, TxIdentifier: "lifecycle", }) + // A concurrent build (e.g. from the API or another lifecycle + // executor) may have already inserted a build with the same + // number. This is a benign race; the other actor's build + // will take effect. Clear the error so downstream checks + // (audit, notification, stats) treat this as a no-op. + if database.IsUniqueViolation(err, database.UniqueWorkspaceBuildsWorkspaceIDBuildNumberKey) { + log.Info(e.ctx, "skipping workspace: concurrent build already inserted", slog.Error(err)) + err = nil + // Reset notification flags set before builder.Build. + // The build was rolled back, so this executor did not + // perform the transition. The concurrent actor handles + // both the build and any notifications. Without these + // resets, downstream code would send duplicate or + // incorrect notifications. + didAutoUpdate = false + shouldNotifyTaskPause = false + } if auditLog != nil { // If the transition didn't succeed then updating the workspace // to indicate dormant didn't either. diff --git a/coderd/autobuild/lifecycle_executor_test.go b/coderd/autobuild/lifecycle_executor_test.go index 345647977d..89805429b9 100644 --- a/coderd/autobuild/lifecycle_executor_test.go +++ b/coderd/autobuild/lifecycle_executor_test.go @@ -4,10 +4,12 @@ import ( "context" "database/sql" "errors" + "sync/atomic" "testing" "time" "github.com/google/uuid" + "github.com/lib/pq" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/goleak" @@ -160,6 +162,92 @@ func TestMultipleLifecycleExecutors(t *testing.T) { assert.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID]) } +// uniqueViolationStore wraps a database.Store and injects a unique violation +// error from InsertWorkspaceBuild after a configurable number of successful +// calls. This simulates a concurrent build race (e.g. an API-driven start +// racing with the lifecycle executor autostart). +type uniqueViolationStore struct { + database.Store + insertCount *atomic.Int32 // pointer: shared across InTx copies + failAfterN int32 +} + +func newUniqueViolationStore(db database.Store, failAfterN int32) *uniqueViolationStore { + return &uniqueViolationStore{ + Store: db, + insertCount: &atomic.Int32{}, + failAfterN: failAfterN, + } +} + +func (s *uniqueViolationStore) InTx(fn func(database.Store) error, opts *database.TxOptions) error { + return s.Store.InTx(func(tx database.Store) error { + return fn(&uniqueViolationStore{ + Store: tx, + insertCount: s.insertCount, // shared pointer + failAfterN: s.failAfterN, + }) + }, opts) +} + +func (s *uniqueViolationStore) InsertWorkspaceBuild(ctx context.Context, arg database.InsertWorkspaceBuildParams) error { + n := s.insertCount.Add(1) + if n > s.failAfterN { + return &pq.Error{ + Code: pq.ErrorCode("23505"), + Constraint: string(database.UniqueWorkspaceBuildsWorkspaceIDBuildNumberKey), + Message: `duplicate key value violates unique constraint "workspace_builds_workspace_id_build_number_key"`, + } + } + return s.Store.InsertWorkspaceBuild(ctx, arg) +} + +func TestExecutorBuildNumberRaceIsHandled(t *testing.T) { + t.Parallel() + + // The lifecycle executor must handle a unique-violation from + // InsertWorkspaceBuild gracefully. This error occurs when a concurrent + // actor (API handler, another executor, prebuilds reconciler) inserts a + // build with the same number before the executor's INSERT lands. + // + // We inject the error via a store wrapper. The first two + // InsertWorkspaceBuild calls succeed (setup builds), then the third + // (the lifecycle executor's autostart build) gets a unique violation. + + realDB, ps := dbtestutil.NewDB(t) + wrappedDB := newUniqueViolationStore(realDB, 2) // Allow builds 1 (start) and 2 (stop); fail build 3 (autostart) + + var ( + sched, _ = cron.Weekly("CRON_TZ=UTC 0 * * * *") + tickCh = make(chan time.Time) + statsCh = make(chan autobuild.Stats) + client = coderdtest.New(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + AutobuildTicker: tickCh, + AutobuildStats: statsCh, + Database: wrappedDB, + Pubsub: ps, + }) + workspace = mustProvisionWorkspace(t, client, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.AutostartSchedule = ptr.Ref(sched.String()) + }) + ) + + workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop) + + p, err := coderdtest.GetProvisionerForTags(realDB, time.Now(), workspace.OrganizationID, nil) + require.NoError(t, err) + next := sched.Next(workspace.LatestBuild.CreatedAt) + coderdtest.UpdateProvisionerLastSeenAt(t, realDB, p.ID, next) + + tickCh <- next + stats := <-statsCh + + // The lifecycle executor should treat the unique violation as a benign + // race, not as a hard error. + assert.Empty(t, stats.Errors, "lifecycle executor should not report unique-violation as error") +} + func TestExecutorAutostartTemplateUpdated(t *testing.T) { t.Parallel()