fix(coderd/autobuild): handle concurrent build number race in lifecycle executor (#25824)

The lifecycle executor did not handle unique-violation errors from
InsertWorkspaceBuild. When a concurrent actor (API handler, another
lifecycle executor, or prebuilds reconciler) inserts a workspace build
with the same build number, PostgreSQL returns a unique constraint
violation on workspace_builds_workspace_id_build_number_key. The
lifecycle executor treated this as a hard error, logging it and storing
it in stats.Errors.

The per-workspace advisory lock (pg_try_advisory_xact_lock) prevents
two lifecycle executors from racing, but does not protect against
races with the CreateWorkspaceBuild API handler or the prebuilds
reconciler, which use different (or no) locking.

Catch the specific unique-violation error after InTx returns (where
the transaction is already rolled back) and clear it. The concurrent
actor's build takes effect; the lifecycle executor treats the
workspace as a no-op for this tick.

Closes coder/internal#455
Closes PLAT-290
This commit is contained in:
Mathias Fredriksson
2026-05-29 17:12:31 +03:00
committed by GitHub
parent 1a91d31793
commit 98d5e7948d
2 changed files with 105 additions and 0 deletions
+17
View File
@@ -422,6 +422,23 @@ func (e *Executor) runOnce(t time.Time) Stats {
Isolation: sql.LevelRepeatableRead,
TxIdentifier: "lifecycle",
})
// A concurrent build (e.g. from the API or another lifecycle
// executor) may have already inserted a build with the same
// number. This is a benign race; the other actor's build
// will take effect. Clear the error so downstream checks
// (audit, notification, stats) treat this as a no-op.
if database.IsUniqueViolation(err, database.UniqueWorkspaceBuildsWorkspaceIDBuildNumberKey) {
log.Info(e.ctx, "skipping workspace: concurrent build already inserted", slog.Error(err))
err = nil
// Reset notification flags set before builder.Build.
// The build was rolled back, so this executor did not
// perform the transition. The concurrent actor handles
// both the build and any notifications. Without these
// resets, downstream code would send duplicate or
// incorrect notifications.
didAutoUpdate = false
shouldNotifyTaskPause = false
}
if auditLog != nil {
// If the transition didn't succeed then updating the workspace
// to indicate dormant didn't either.
@@ -4,10 +4,12 @@ import (
"context"
"database/sql"
"errors"
"sync/atomic"
"testing"
"time"
"github.com/google/uuid"
"github.com/lib/pq"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
@@ -160,6 +162,92 @@ func TestMultipleLifecycleExecutors(t *testing.T) {
assert.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID])
}
// uniqueViolationStore wraps a database.Store and injects a unique violation
// error from InsertWorkspaceBuild after a configurable number of successful
// calls. This simulates a concurrent build race (e.g. an API-driven start
// racing with the lifecycle executor autostart).
type uniqueViolationStore struct {
database.Store
insertCount *atomic.Int32 // pointer: shared across InTx copies
failAfterN int32
}
func newUniqueViolationStore(db database.Store, failAfterN int32) *uniqueViolationStore {
return &uniqueViolationStore{
Store: db,
insertCount: &atomic.Int32{},
failAfterN: failAfterN,
}
}
func (s *uniqueViolationStore) InTx(fn func(database.Store) error, opts *database.TxOptions) error {
return s.Store.InTx(func(tx database.Store) error {
return fn(&uniqueViolationStore{
Store: tx,
insertCount: s.insertCount, // shared pointer
failAfterN: s.failAfterN,
})
}, opts)
}
func (s *uniqueViolationStore) InsertWorkspaceBuild(ctx context.Context, arg database.InsertWorkspaceBuildParams) error {
n := s.insertCount.Add(1)
if n > s.failAfterN {
return &pq.Error{
Code: pq.ErrorCode("23505"),
Constraint: string(database.UniqueWorkspaceBuildsWorkspaceIDBuildNumberKey),
Message: `duplicate key value violates unique constraint "workspace_builds_workspace_id_build_number_key"`,
}
}
return s.Store.InsertWorkspaceBuild(ctx, arg)
}
func TestExecutorBuildNumberRaceIsHandled(t *testing.T) {
t.Parallel()
// The lifecycle executor must handle a unique-violation from
// InsertWorkspaceBuild gracefully. This error occurs when a concurrent
// actor (API handler, another executor, prebuilds reconciler) inserts a
// build with the same number before the executor's INSERT lands.
//
// We inject the error via a store wrapper. The first two
// InsertWorkspaceBuild calls succeed (setup builds), then the third
// (the lifecycle executor's autostart build) gets a unique violation.
realDB, ps := dbtestutil.NewDB(t)
wrappedDB := newUniqueViolationStore(realDB, 2) // Allow builds 1 (start) and 2 (stop); fail build 3 (autostart)
var (
sched, _ = cron.Weekly("CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan autobuild.Stats)
client = coderdtest.New(t, &coderdtest.Options{
IncludeProvisionerDaemon: true,
AutobuildTicker: tickCh,
AutobuildStats: statsCh,
Database: wrappedDB,
Pubsub: ps,
})
workspace = mustProvisionWorkspace(t, client, func(cwr *codersdk.CreateWorkspaceRequest) {
cwr.AutostartSchedule = ptr.Ref(sched.String())
})
)
workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop)
p, err := coderdtest.GetProvisionerForTags(realDB, time.Now(), workspace.OrganizationID, nil)
require.NoError(t, err)
next := sched.Next(workspace.LatestBuild.CreatedAt)
coderdtest.UpdateProvisionerLastSeenAt(t, realDB, p.ID, next)
tickCh <- next
stats := <-statsCh
// The lifecycle executor should treat the unique violation as a benign
// race, not as a hard error.
assert.Empty(t, stats.Errors, "lifecycle executor should not report unique-violation as error")
}
func TestExecutorAutostartTemplateUpdated(t *testing.T) {
t.Parallel()