diff --git a/scaletest/prebuilds/run.go b/scaletest/prebuilds/run.go index bc253bfd3d..246dd5ee68 100644 --- a/scaletest/prebuilds/run.go +++ b/scaletest/prebuilds/run.go @@ -6,6 +6,7 @@ import ( _ "embed" "html/template" "io" + "net/http" "time" "github.com/google/uuid" @@ -17,6 +18,7 @@ import ( "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/scaletest/harness" "github.com/coder/coder/v2/scaletest/loadtestutil" + "github.com/coder/coder/v2/scaletest/workspacebuild" ) type Runner struct { @@ -77,6 +79,31 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error { } templ, err := r.client.CreateTemplate(ctx, r.cfg.OrganizationID, templateReq) if err != nil { + // If the template already exists from a previous failed run, look it up so + // Cleanup() can delete it and the rerun doesn't leave orphaned resources. + var sdkErr *codersdk.Error + if xerrors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusConflict { + existing, listErr := r.client.Templates(ctx, codersdk.TemplateFilter{ + OrganizationID: r.cfg.OrganizationID, + ExactName: templateName, + }) + if listErr == nil && len(existing) > 0 { + r.template = existing[0] + logger.Warn(ctx, "template already exists from a previous run, will be cleaned up", + slog.F("template_name", r.template.Name), + slog.F("template_id", r.template.ID), + ) + // Clear any prebuild config on the orphaned template so the + // reconciler doesn't keep spawning workspaces while Cleanup() + // is trying to delete them. + if clearErr := r.pushEmptyTemplateVersion(ctx); clearErr != nil { + logger.Warn(ctx, "failed to clear prebuilds config on orphaned template", + slog.F("template_id", r.template.ID), + slog.Error(clearErr), + ) + } + } + } r.cfg.Metrics.AddError(templateName, "create_template") return xerrors.Errorf("create template: %w", err) } @@ -105,21 +132,12 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error { r.cfg.DeletionSetupBarrier.Wait() logger.Info(ctx, "prebuilds paused, preparing for deletion") - // Now prepare for deletion by creating an empty template version - // At this point, prebuilds should be paused by the caller + // Now prepare for deletion by creating an empty template version. + // At this point, prebuilds should be paused by the caller. logger.Info(ctx, "creating empty template version for deletion") - emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0) - if err != nil { - r.cfg.Metrics.AddError(r.template.Name, "create_empty_template_version") - return xerrors.Errorf("create empty template version for deletion: %w", err) - } - - err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{ - ID: emptyVersion.ID, - }) - if err != nil { - r.cfg.Metrics.AddError(r.template.Name, "update_active_template_version") - return xerrors.Errorf("update active template version to empty for deletion: %w", err) + if err = r.pushEmptyTemplateVersion(ctx); err != nil { + r.cfg.Metrics.AddError(r.template.Name, "clear_template_prebuilds") + return xerrors.Errorf("clear template prebuilds for deletion: %w", err) } logger.Info(ctx, "waiting for all runners to reach deletion barrier") @@ -358,14 +376,79 @@ func (r *Runner) createTemplateVersion(ctx context.Context, templateID uuid.UUID var errTickerDone = xerrors.New("done") +// pushEmptyTemplateVersion pushes a new empty template version (no presets, no +// prebuilds) and makes it active. This stops the reconciler from spawning new +// prebuild workspaces for the template. +func (r *Runner) pushEmptyTemplateVersion(ctx context.Context) error { + emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0) + if err != nil { + return xerrors.Errorf("create empty template version: %w", err) + } + if err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{ + ID: emptyVersion.ID, + }); err != nil { + return xerrors.Errorf("update active template version: %w", err) + } + return nil +} + func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error { logs = loadtestutil.NewSyncWriter(logs) logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug) - logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name)) + // If Run failed before the template was created, there is nothing to clean up. + if r.template.ID == uuid.Nil { + logger.Info(ctx, "template was never created, skipping cleanup") + return nil + } - err := r.client.DeleteTemplate(ctx, r.template.ID) + // Workspaces must be deleted before the template can be deleted. + workspaces, err := allWorkspacesForTemplate(ctx, r.client, r.template.Name) if err != nil { + return xerrors.Errorf("list workspaces for template %q: %w", r.template.Name, err) + } + + logger.Info(ctx, "deleting workspaces for template", slog.F("count", len(workspaces)), slog.F("template_name", r.template.Name)) + + // Retry failed workspace deletions up to maxDeletionAttempts times to + // handle transient errors (e.g. a delete build that fails due to a + // provisioner hiccup). + const maxDeletionAttempts = 3 + remaining := workspaces + for attempt := range maxDeletionAttempts { + if len(remaining) == 0 { + break + } + logger.Info(ctx, "trying to delete workspaces", + slog.F("attempt", attempt+1), + slog.F("remaining", len(remaining)), + slog.F("template_name", r.template.Name), + ) + var failed []codersdk.Workspace + for _, ws := range remaining { + cr := workspacebuild.NewCleanupRunner(r.client, ws.ID) + if err := cr.Run(ctx, ws.ID.String(), logs); err != nil { + logger.Warn(ctx, "failed to delete workspace", + slog.F("workspace_id", ws.ID), + slog.F("workspace_name", ws.Name), + slog.Error(err), + ) + failed = append(failed, ws) + } + } + remaining = failed + } + + if len(remaining) > 0 { + ids := make([]string, len(remaining)) + for i, ws := range remaining { + ids[i] = ws.ID.String() + } + return xerrors.Errorf("could not delete all workspaces after %d attempts; remaining: %v", maxDeletionAttempts, ids) + } + + logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name)) + if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil { return xerrors.Errorf("delete template: %w", err) } @@ -373,6 +456,28 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error { return nil } +// allWorkspacesForTemplate returns all workspaces belonging to templateName, +// paginating through results until exhausted. +func allWorkspacesForTemplate(ctx context.Context, client *codersdk.Client, templateName string) ([]codersdk.Workspace, error) { + const pageSize = 100 + var workspaces []codersdk.Workspace + for page := 0; ; page++ { + resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{ + Template: templateName, + Offset: page * pageSize, + Limit: pageSize, + }) + if err != nil { + return nil, xerrors.Errorf("list workspaces page %d: %w", page, err) + } + workspaces = append(workspaces, resp.Workspaces...) + if len(resp.Workspaces) < pageSize { + break + } + } + return workspaces, nil +} + //go:embed tf/main.tf.tpl var templateContent string