From 321c2b8fceed4558c501464fddbc743fa0224543 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Thu, 28 Aug 2025 12:07:50 -0700 Subject: [PATCH] fix: fix flake in TestExecutorAutostartSkipsWhenNoProvisionersAvailable (#19478) The flake here had two causes: 1. related to usage of time.Now() in MustWaitForProvisionersAvailable and 2. the fact that UpdateProvisionerLastSeenAt can not use a time that is further in the past than the current LastSeenAt time Previously the test here was calling `coderdtest.MustWaitForProvisionersAvailable` which was using `time.Now` rather than the next tick time like the real `hasProvisionersAvailable` function does. Additionally, when using `UpdateProvisionerLastSeenAt` the underlying db query enforces that the time we're trying to set `LastSeenAt` to cannot be older than the current value. I was able to reliably reproduce the flake by executing both the `UpdateProvisionerLastSeenAt` call and `tickCh <- next` in their own goroutines, the former with a small sleep to reliably ensure we'd trigger the autobuild before we set the `LastSeenAt` time. That's when I also noticed that `coderdtest.MustWaitForProvisionersAvailable` was using `time.Now` instead of the tick time. When I updated that function to take in a tick time + added a 2nd call to `UpdateProvisionerLastSeenAt` to set an original non-stale time, we could then never get the test to pass because the later call to set the stale time would not actually modify `LastSeenAt`. On top of that, calling the provisioner daemons closer in the middle of the function doesn't really do anything of value in this test. **The fix for the flake is to keep the go routines, ensuring there would be a flake if there was not a relevant fix, but to include the fix which is to ensure that we explicitly wait for the provisioner to be stale before passing the time to `tickCh`.** --------- Signed-off-by: Callum Styan --- coderd/autobuild/lifecycle_executor_test.go | 32 ++++++++++----- coderd/coderdtest/coderdtest.go | 44 +++++++++++++++++---- enterprise/coderd/workspaces_test.go | 7 ++-- 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/coderd/autobuild/lifecycle_executor_test.go b/coderd/autobuild/lifecycle_executor_test.go index df7a7ad231..1e5f0d431e 100644 --- a/coderd/autobuild/lifecycle_executor_test.go +++ b/coderd/autobuild/lifecycle_executor_test.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "errors" + "sync" "testing" "time" @@ -1720,19 +1721,32 @@ func TestExecutorAutostartSkipsWhenNoProvisionersAvailable(t *testing.T) { // Stop the workspace while provisioner is available workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop) - // Wait for provisioner to be available for this specific workspace - coderdtest.MustWaitForProvisionersAvailable(t, db, workspace) p, err := coderdtest.GetProvisionerForTags(db, time.Now(), workspace.OrganizationID, provisionerDaemonTags) require.NoError(t, err, "Error getting provisioner for workspace") - daemon1Closer.Close() + var wg sync.WaitGroup + wg.Add(2) - // Ensure the provisioner is stale - staleTime := sched.Next(workspace.LatestBuild.CreatedAt).Add((-1 * provisionerdserver.StaleInterval) + -10*time.Second) - coderdtest.UpdateProvisionerLastSeenAt(t, db, p.ID, staleTime) + next := sched.Next(workspace.LatestBuild.CreatedAt) + go func() { + defer wg.Done() + // Ensure the provisioner is stale + staleTime := next.Add(-(provisionerdserver.StaleInterval * 2)) + coderdtest.UpdateProvisionerLastSeenAt(t, db, p.ID, staleTime) + p, err = coderdtest.GetProvisionerForTags(db, time.Now(), workspace.OrganizationID, provisionerDaemonTags) + assert.NoError(t, err, "Error getting provisioner for workspace") + assert.Eventually(t, func() bool { return p.LastSeenAt.Time.UnixNano() == staleTime.UnixNano() }, testutil.WaitMedium, testutil.IntervalFast) + }() - // Trigger autobuild - tickCh <- sched.Next(workspace.LatestBuild.CreatedAt) + go func() { + defer wg.Done() + // Ensure the provisioner is gone or stale before triggering the autobuild + coderdtest.MustWaitForProvisionersUnavailable(t, db, workspace, provisionerDaemonTags, next) + // Trigger autobuild + tickCh <- next + }() + + wg.Wait() stats := <-statsCh @@ -1758,5 +1772,5 @@ func TestExecutorAutostartSkipsWhenNoProvisionersAvailable(t *testing.T) { }() stats = <-statsCh - assert.Len(t, stats.Transitions, 1, "should not create builds when no provisioners available") + assert.Len(t, stats.Transitions, 1, "should create builds when provisioners are available") } diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index f773053c3a..b6aafc53da 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -1649,19 +1649,48 @@ func UpdateProvisionerLastSeenAt(t *testing.T, db database.Store, id uuid.UUID, func MustWaitForAnyProvisioner(t *testing.T, db database.Store) { t.Helper() ctx := ctxWithProvisionerPermissions(testutil.Context(t, testutil.WaitShort)) - require.Eventually(t, func() bool { + // testutil.Eventually(t, func) + testutil.Eventually(ctx, t, func(ctx context.Context) (done bool) { daemons, err := db.GetProvisionerDaemons(ctx) return err == nil && len(daemons) > 0 - }, testutil.WaitShort, testutil.IntervalFast) + }, testutil.IntervalFast, "no provisioner daemons found") +} + +// MustWaitForProvisionersUnavailable waits for provisioners to become unavailable for a specific workspace +func MustWaitForProvisionersUnavailable(t *testing.T, db database.Store, workspace codersdk.Workspace, tags map[string]string, checkTime time.Time) { + t.Helper() + ctx := ctxWithProvisionerPermissions(testutil.Context(t, testutil.WaitMedium)) + + testutil.Eventually(ctx, t, func(ctx context.Context) (done bool) { + // Use the same logic as hasValidProvisioner but expect false + provisionerDaemons, err := db.GetProvisionerDaemonsByOrganization(ctx, database.GetProvisionerDaemonsByOrganizationParams{ + OrganizationID: workspace.OrganizationID, + WantTags: tags, + }) + if err != nil { + return false + } + + // Check if NO provisioners are active (all are stale or gone) + for _, pd := range provisionerDaemons { + if pd.LastSeenAt.Valid { + age := checkTime.Sub(pd.LastSeenAt.Time) + if age <= provisionerdserver.StaleInterval { + return false // Found an active provisioner, keep waiting + } + } + } + return true // No active provisioners found + }, testutil.IntervalFast, "there are still provisioners available for workspace, expected none") } // MustWaitForProvisionersAvailable waits for provisioners to be available for a specific workspace. -func MustWaitForProvisionersAvailable(t *testing.T, db database.Store, workspace codersdk.Workspace) uuid.UUID { +func MustWaitForProvisionersAvailable(t *testing.T, db database.Store, workspace codersdk.Workspace, ts time.Time) uuid.UUID { t.Helper() - ctx := ctxWithProvisionerPermissions(testutil.Context(t, testutil.WaitShort)) + ctx := ctxWithProvisionerPermissions(testutil.Context(t, testutil.WaitLong)) id := uuid.UUID{} // Get the workspace from the database - require.Eventually(t, func() bool { + testutil.Eventually(ctx, t, func(ctx context.Context) (done bool) { ws, err := db.GetWorkspaceByID(ctx, workspace.ID) if err != nil { return false @@ -1689,10 +1718,9 @@ func MustWaitForProvisionersAvailable(t *testing.T, db database.Store, workspace } // Check if any provisioners are active (not stale) - now := time.Now() for _, pd := range provisionerDaemons { if pd.LastSeenAt.Valid { - age := now.Sub(pd.LastSeenAt.Time) + age := ts.Sub(pd.LastSeenAt.Time) if age <= provisionerdserver.StaleInterval { id = pd.ID return true // Found an active provisioner @@ -1700,7 +1728,7 @@ func MustWaitForProvisionersAvailable(t *testing.T, db database.Store, workspace } } return false // No active provisioners found - }, testutil.WaitLong, testutil.IntervalFast) + }, testutil.IntervalFast, "no active provisioners available for workspace, expected at least one (non-stale)") return id } diff --git a/enterprise/coderd/workspaces_test.go b/enterprise/coderd/workspaces_test.go index 31821bb798..555806b623 100644 --- a/enterprise/coderd/workspaces_test.go +++ b/enterprise/coderd/workspaces_test.go @@ -2242,13 +2242,14 @@ func TestPrebuildsAutobuild(t *testing.T) { workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop) coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + p, err := coderdtest.GetProvisionerForTags(db, time.Now(), workspace.OrganizationID, nil) + coderdtest.UpdateProvisionerLastSeenAt(t, db, p.ID, sched.Next(prebuild.LatestBuild.CreatedAt)) + // Wait for provisioner to be available for this specific workspace - coderdtest.MustWaitForProvisionersAvailable(t, db, prebuild) + coderdtest.MustWaitForProvisionersAvailable(t, db, prebuild, sched.Next(prebuild.LatestBuild.CreatedAt)) tickTime := sched.Next(prebuild.LatestBuild.CreatedAt).Add(time.Minute) - p, err := coderdtest.GetProvisionerForTags(db, time.Now(), workspace.OrganizationID, nil) require.NoError(t, err) - coderdtest.UpdateProvisionerLastSeenAt(t, db, p.ID, tickTime) // Tick at the next scheduled time after the prebuild’s LatestBuild.CreatedAt, // since the next allowed autostart is calculated starting from that point.