fix(scaletest/prebuilds): fix Runner.Cleanup() to delete workspaces (#23627)

This commit is contained in:
Callum Styan
2026-04-22 13:11:42 -07:00
committed by GitHub
parent b714fe8e71
commit 7d044fa598
+121 -16
View File
@@ -6,6 +6,7 @@ import (
_ "embed"
"html/template"
"io"
"net/http"
"time"
"github.com/google/uuid"
@@ -17,6 +18,7 @@ import (
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/scaletest/harness"
"github.com/coder/coder/v2/scaletest/loadtestutil"
"github.com/coder/coder/v2/scaletest/workspacebuild"
)
type Runner struct {
@@ -77,6 +79,31 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error {
}
templ, err := r.client.CreateTemplate(ctx, r.cfg.OrganizationID, templateReq)
if err != nil {
// If the template already exists from a previous failed run, look it up so
// Cleanup() can delete it and the rerun doesn't leave orphaned resources.
var sdkErr *codersdk.Error
if xerrors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusConflict {
existing, listErr := r.client.Templates(ctx, codersdk.TemplateFilter{
OrganizationID: r.cfg.OrganizationID,
ExactName: templateName,
})
if listErr == nil && len(existing) > 0 {
r.template = existing[0]
logger.Warn(ctx, "template already exists from a previous run, will be cleaned up",
slog.F("template_name", r.template.Name),
slog.F("template_id", r.template.ID),
)
// Clear any prebuild config on the orphaned template so the
// reconciler doesn't keep spawning workspaces while Cleanup()
// is trying to delete them.
if clearErr := r.pushEmptyTemplateVersion(ctx); clearErr != nil {
logger.Warn(ctx, "failed to clear prebuilds config on orphaned template",
slog.F("template_id", r.template.ID),
slog.Error(clearErr),
)
}
}
}
r.cfg.Metrics.AddError(templateName, "create_template")
return xerrors.Errorf("create template: %w", err)
}
@@ -105,21 +132,12 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error {
r.cfg.DeletionSetupBarrier.Wait()
logger.Info(ctx, "prebuilds paused, preparing for deletion")
// Now prepare for deletion by creating an empty template version
// At this point, prebuilds should be paused by the caller
// Now prepare for deletion by creating an empty template version.
// At this point, prebuilds should be paused by the caller.
logger.Info(ctx, "creating empty template version for deletion")
emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0)
if err != nil {
r.cfg.Metrics.AddError(r.template.Name, "create_empty_template_version")
return xerrors.Errorf("create empty template version for deletion: %w", err)
}
err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{
ID: emptyVersion.ID,
})
if err != nil {
r.cfg.Metrics.AddError(r.template.Name, "update_active_template_version")
return xerrors.Errorf("update active template version to empty for deletion: %w", err)
if err = r.pushEmptyTemplateVersion(ctx); err != nil {
r.cfg.Metrics.AddError(r.template.Name, "clear_template_prebuilds")
return xerrors.Errorf("clear template prebuilds for deletion: %w", err)
}
logger.Info(ctx, "waiting for all runners to reach deletion barrier")
@@ -358,14 +376,79 @@ func (r *Runner) createTemplateVersion(ctx context.Context, templateID uuid.UUID
var errTickerDone = xerrors.New("done")
// pushEmptyTemplateVersion pushes a new empty template version (no presets, no
// prebuilds) and makes it active. This stops the reconciler from spawning new
// prebuild workspaces for the template.
func (r *Runner) pushEmptyTemplateVersion(ctx context.Context) error {
emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0)
if err != nil {
return xerrors.Errorf("create empty template version: %w", err)
}
if err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{
ID: emptyVersion.ID,
}); err != nil {
return xerrors.Errorf("update active template version: %w", err)
}
return nil
}
func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error {
logs = loadtestutil.NewSyncWriter(logs)
logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug)
logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name))
// If Run failed before the template was created, there is nothing to clean up.
if r.template.ID == uuid.Nil {
logger.Info(ctx, "template was never created, skipping cleanup")
return nil
}
err := r.client.DeleteTemplate(ctx, r.template.ID)
// Workspaces must be deleted before the template can be deleted.
workspaces, err := allWorkspacesForTemplate(ctx, r.client, r.template.Name)
if err != nil {
return xerrors.Errorf("list workspaces for template %q: %w", r.template.Name, err)
}
logger.Info(ctx, "deleting workspaces for template", slog.F("count", len(workspaces)), slog.F("template_name", r.template.Name))
// Retry failed workspace deletions up to maxDeletionAttempts times to
// handle transient errors (e.g. a delete build that fails due to a
// provisioner hiccup).
const maxDeletionAttempts = 3
remaining := workspaces
for attempt := range maxDeletionAttempts {
if len(remaining) == 0 {
break
}
logger.Info(ctx, "trying to delete workspaces",
slog.F("attempt", attempt+1),
slog.F("remaining", len(remaining)),
slog.F("template_name", r.template.Name),
)
var failed []codersdk.Workspace
for _, ws := range remaining {
cr := workspacebuild.NewCleanupRunner(r.client, ws.ID)
if err := cr.Run(ctx, ws.ID.String(), logs); err != nil {
logger.Warn(ctx, "failed to delete workspace",
slog.F("workspace_id", ws.ID),
slog.F("workspace_name", ws.Name),
slog.Error(err),
)
failed = append(failed, ws)
}
}
remaining = failed
}
if len(remaining) > 0 {
ids := make([]string, len(remaining))
for i, ws := range remaining {
ids[i] = ws.ID.String()
}
return xerrors.Errorf("could not delete all workspaces after %d attempts; remaining: %v", maxDeletionAttempts, ids)
}
logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name))
if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil {
return xerrors.Errorf("delete template: %w", err)
}
@@ -373,6 +456,28 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error {
return nil
}
// allWorkspacesForTemplate returns all workspaces belonging to templateName,
// paginating through results until exhausted.
func allWorkspacesForTemplate(ctx context.Context, client *codersdk.Client, templateName string) ([]codersdk.Workspace, error) {
const pageSize = 100
var workspaces []codersdk.Workspace
for page := 0; ; page++ {
resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{
Template: templateName,
Offset: page * pageSize,
Limit: pageSize,
})
if err != nil {
return nil, xerrors.Errorf("list workspaces page %d: %w", page, err)
}
workspaces = append(workspaces, resp.Workspaces...)
if len(resp.Workspaces) < pageSize {
break
}
}
return workspaces, nil
}
//go:embed tf/main.tf.tpl
var templateContent string