From ca234f346d4a1ba57721eb57f79f2d455f97d027 Mon Sep 17 00:00:00 2001 From: Susana Ferreira Date: Fri, 27 Feb 2026 14:26:48 +0000 Subject: [PATCH] fix: mark presets as validation_failed to prevent endless prebuild retries (#22085) ## Description - Updates `wsbuilder` to return a `BuildError` with `http.StatusBadRequest` to signify a "validation error" on missing or invalid parameters - Adds a short-circuit in `prebuilds.StoreReconciler` to mark presets for which creating a build returns a "validation error" as "validation failed" and skip further attempts to reconcile. - Adds a test to verify the above - Introduces a new Prometheus metric `coderd_prebuilt_workspaces_preset_validation_failed` to track the above Closes: https://github.com/coder/coder/issues/21237 --------- Co-authored-by: Cian Johnston --- coderd/wsbuilder/builderror_test.go | 64 +++ coderd/wsbuilder/wsbuilder.go | 12 +- docs/admin/integrations/prometheus.md | 1 + .../coderd/prebuilds/metricscollector.go | 60 ++- .../coderd/prebuilds/metricscollector_test.go | 6 +- enterprise/coderd/prebuilds/reconcile.go | 64 +++ enterprise/coderd/prebuilds/reconcile_test.go | 429 +++++++++++++++++- scripts/metricsdocgen/generated_metrics | 3 + 8 files changed, 596 insertions(+), 43 deletions(-) create mode 100644 coderd/wsbuilder/builderror_test.go diff --git a/coderd/wsbuilder/builderror_test.go b/coderd/wsbuilder/builderror_test.go new file mode 100644 index 0000000000..e481491cca --- /dev/null +++ b/coderd/wsbuilder/builderror_test.go @@ -0,0 +1,64 @@ +package wsbuilder_test + +import ( + "net/http" + "testing" + + "github.com/hashicorp/hcl/v2" + "github.com/stretchr/testify/require" + "golang.org/x/xerrors" + + "github.com/coder/coder/v2/coderd/dynamicparameters" + "github.com/coder/coder/v2/coderd/wsbuilder" +) + +func TestBuildErrorResponseDelegation(t *testing.T) { + t.Parallel() + + t.Run("plain_error", func(t *testing.T) { + t.Parallel() + + be := wsbuilder.BuildError{ + Status: http.StatusBadRequest, + Message: "bad", + Wrapped: xerrors.New("oops"), + } + + status, resp := be.Response() + require.Equal(t, http.StatusBadRequest, status) + require.Equal(t, "bad", resp.Message) + require.Contains(t, resp.Detail, "oops") + require.Empty(t, resp.Validations) + }) + + t.Run("responder_error", func(t *testing.T) { + t.Parallel() + + inner := &dynamicparameters.DiagnosticError{ + Message: "resolve parameters", + KeyedDiagnostics: map[string]hcl.Diagnostics{ + "param1": { + { + Severity: hcl.DiagError, + Summary: "required parameter", + }, + }, + }, + } + + be := wsbuilder.BuildError{ + Status: http.StatusBadRequest, + Message: "build error wrapper", + Wrapped: inner, + } + + status, resp := be.Response() + + // Should delegate to the inner DiagnosticError's response. + innerStatus, innerResp := inner.Response() + require.Equal(t, innerStatus, status) + require.Equal(t, innerResp.Message, resp.Message) + require.Len(t, resp.Validations, 1) + require.Equal(t, "param1", resp.Validations[0].Field) + }) +} diff --git a/coderd/wsbuilder/wsbuilder.go b/coderd/wsbuilder/wsbuilder.go index ef688a5eeb..b87806863d 100644 --- a/coderd/wsbuilder/wsbuilder.go +++ b/coderd/wsbuilder/wsbuilder.go @@ -24,6 +24,7 @@ import ( "github.com/coder/coder/v2/coderd/dynamicparameters" "github.com/coder/coder/v2/coderd/files" "github.com/coder/coder/v2/coderd/httpapi" + "github.com/coder/coder/v2/coderd/httpapi/httperror" "github.com/coder/coder/v2/coderd/prebuilds" "github.com/coder/coder/v2/coderd/provisionerdserver" "github.com/coder/coder/v2/coderd/rbac" @@ -280,6 +281,13 @@ func (e BuildError) Unwrap() error { } func (e BuildError) Response() (int, codersdk.Response) { + // If the wrapped error knows how to produce its own response + // (e.g. DiagnosticError with Validations), prefer that over + // the generic BuildError response. + if inner, ok := httperror.IsResponder(e.Wrapped); ok { + return inner.Response() + } + return e.Status, codersdk.Response{ Message: e.Message, Detail: e.Error(), @@ -875,7 +883,7 @@ func (b *Builder) getDynamicParameters() (names, values []string, err error) { b.richParameterValues, presetParameterValues) if err != nil { - return nil, nil, xerrors.Errorf("resolve parameters: %w", err) + return nil, nil, BuildError{http.StatusBadRequest, "resolve parameters", err} } names = make([]string, 0, len(buildValues)) @@ -1112,7 +1120,7 @@ func (b *Builder) getDynamicProvisionerTags() (map[string]string, error) { output, diags := render.Render(b.ctx, b.workspace.OwnerID, vals) tagErr := dynamicparameters.CheckTags(output, diags) if tagErr != nil { - return nil, tagErr + return nil, BuildError{http.StatusBadRequest, "workspace tags validation failed", tagErr} } for k, v := range output.WorkspaceTags.Tags() { diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index ace326019f..7fed2747d3 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -211,6 +211,7 @@ deployment. They will always be available from the agent. | `coderd_prebuilt_workspaces_failed_total` | counter | Total number of prebuilt workspaces that failed to build. | `organization_name` `preset_name` `template_name` | | `coderd_prebuilt_workspaces_metrics_last_updated` | gauge | The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached. | | | `coderd_prebuilt_workspaces_preset_hard_limited` | gauge | Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise. | `organization_name` `preset_name` `template_name` | +| `coderd_prebuilt_workspaces_preset_validation_failed` | gauge | Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise. | `organization_name` `preset_name` `template_name` | | `coderd_prebuilt_workspaces_reconciliation_paused` | gauge | Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused). | | | `coderd_prebuilt_workspaces_resource_replacements_total` | counter | Total number of prebuilt workspaces whose resource(s) got replaced upon being claimed. In Terraform, drift on immutable attributes results in resource replacement. This represents a worst-case scenario for prebuilt workspaces because the pre-provisioned resource would have been recreated when claiming, thus obviating the point of pre-provisioning. See https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement | `organization_name` `preset_name` `template_name` | | `coderd_prebuilt_workspaces_running` | gauge | Current number of prebuilt workspaces that are in a running state. These workspaces have started successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible). | `organization_name` `preset_name` `template_name` | diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 22f1cdff15..a233e7cd92 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -19,16 +19,17 @@ import ( const ( namespace = "coderd_prebuilt_workspaces_" - MetricCreatedCount = namespace + "created_total" - MetricFailedCount = namespace + "failed_total" - MetricClaimedCount = namespace + "claimed_total" - MetricResourceReplacementsCount = namespace + "resource_replacements_total" - MetricDesiredGauge = namespace + "desired" - MetricRunningGauge = namespace + "running" - MetricEligibleGauge = namespace + "eligible" - MetricPresetHardLimitedGauge = namespace + "preset_hard_limited" - MetricLastUpdatedGauge = namespace + "metrics_last_updated" - MetricReconciliationPausedGauge = namespace + "reconciliation_paused" + MetricCreatedCount = namespace + "created_total" + MetricFailedCount = namespace + "failed_total" + MetricClaimedCount = namespace + "claimed_total" + MetricResourceReplacementsCount = namespace + "resource_replacements_total" + MetricDesiredGauge = namespace + "desired" + MetricRunningGauge = namespace + "running" + MetricEligibleGauge = namespace + "eligible" + MetricPresetHardLimitedGauge = namespace + "preset_hard_limited" + MetricPresetValidationFailedGauge = namespace + "preset_validation_failed" + MetricLastUpdatedGauge = namespace + "metrics_last_updated" + MetricReconciliationPausedGauge = namespace + "reconciliation_paused" ) var ( @@ -89,6 +90,12 @@ var ( labels, nil, ) + presetValidationFailedDesc = prometheus.NewDesc( + MetricPresetValidationFailedGauge, + "Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise.", + labels, + nil, + ) lastUpdateDesc = prometheus.NewDesc( MetricLastUpdatedGauge, "The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.", @@ -121,6 +128,9 @@ type MetricsCollector struct { isPresetHardLimited map[hardLimitedPresetKey]bool isPresetHardLimitedMu sync.Mutex + isPresetValidationFailed map[hardLimitedPresetKey]bool + isPresetValidationFailedMu sync.Mutex + reconciliationPaused bool reconciliationPausedMu sync.RWMutex } @@ -131,11 +141,12 @@ func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter preb log := logger.Named("prebuilds_metrics_collector") return &MetricsCollector{ - database: db, - logger: log, - snapshotter: snapshotter, - replacementsCounter: make(map[replacementKey]float64), - isPresetHardLimited: make(map[hardLimitedPresetKey]bool), + database: db, + logger: log, + snapshotter: snapshotter, + replacementsCounter: make(map[replacementKey]float64), + isPresetHardLimited: make(map[hardLimitedPresetKey]bool), + isPresetValidationFailed: make(map[hardLimitedPresetKey]bool), } } @@ -148,6 +159,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { descCh <- runningPrebuildsDesc descCh <- eligiblePrebuildsDesc descCh <- presetHardLimitedDesc + descCh <- presetValidationFailedDesc descCh <- lastUpdateDesc descCh <- reconciliationPausedDesc } @@ -216,6 +228,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { } mc.isPresetHardLimitedMu.Unlock() + mc.isPresetValidationFailedMu.Lock() + for key, isValidationFailed := range mc.isPresetValidationFailed { + var val float64 + if isValidationFailed { + val = 1 + } + + metricsCh <- prometheus.MustNewConstMetric(presetValidationFailedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName) + } + mc.isPresetValidationFailedMu.Unlock() + metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix())) } @@ -306,6 +329,13 @@ func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[h mc.isPresetHardLimited = isPresetHardLimited } +func (mc *MetricsCollector) registerValidationFailedPresets(isPresetValidationFailed map[hardLimitedPresetKey]bool) { + mc.isPresetValidationFailedMu.Lock() + defer mc.isPresetValidationFailedMu.Unlock() + + mc.isPresetValidationFailed = isPresetValidationFailed +} + func (mc *MetricsCollector) setReconciliationPaused(paused bool) { mc.reconciliationPausedMu.Lock() defer mc.reconciliationPausedMu.Unlock() diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go index 606995e1a1..c362946734 100644 --- a/enterprise/coderd/prebuilds/metricscollector_test.go +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -432,6 +432,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, continue } + metricLoop: for _, metric := range metricFamily.GetMetric() { labelPairs := metric.GetLabel() @@ -444,7 +445,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, // Check if all requested labels match for wantName, wantValue := range labels { if metricLabels[wantName] != wantValue { - continue + continue metricLoop } } @@ -458,6 +459,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labels map[string]string) map[string]*prometheus_client.Metric { series := make(map[string]*prometheus_client.Metric) for _, metricFamily := range metricsFamilies { + metricLoop: for _, metric := range metricFamily.GetMetric() { labelPairs := metric.GetLabel() @@ -474,7 +476,7 @@ func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labe // Check if all requested labels match for wantName, wantValue := range labels { if metricLabels[wantName] != wantValue { - continue + continue metricLoop } } diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index 9e8ce20887..6e5977828a 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "math" + "net/http" "strings" "sync" "sync/atomic" @@ -413,6 +414,7 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) (stats prebuilds.Rec } c.reportHardLimitedPresets(snapshot) + c.reportValidationFailedPresets(snapshot) if len(snapshot.Presets) == 0 { logger.Debug(ctx, "no templates found with prebuilds configured") @@ -514,6 +516,42 @@ func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSna c.metrics.registerHardLimitedPresets(isPresetHardLimited) } +func (c *StoreReconciler) reportValidationFailedPresets(snapshot *prebuilds.GlobalSnapshot) { + // presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets. + // Multiple versions of a preset can exist with the same orgName, templateName, and presetName, + // because templates can have multiple versions - or deleted templates can share the same name. + presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow) + for _, preset := range snapshot.Presets { + key := hardLimitedPresetKey{ + orgName: preset.OrganizationName, + templateName: preset.TemplateName, + presetName: preset.Name, + } + + presetsMap[key] = append(presetsMap[key], preset) + } + + // Report a preset as validation-failed only if all the following conditions are met: + // - The preset has PrebuildStatus == PrebuildStatusValidationFailed + // - The preset is using the active version of its template, and the template has not been deleted + // + // The second condition is important because a validation-failed preset that has become outdated is no longer relevant. + // Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it + // as validation-failed to the admin. + isPresetValidationFailed := make(map[hardLimitedPresetKey]bool) + for key, presets := range presetsMap { + for _, preset := range presets { + if preset.UsingActiveVersion && !preset.Deleted && + preset.PrebuildStatus == database.PrebuildStatusValidationFailed { + isPresetValidationFailed[key] = true + break + } + } + } + + c.metrics.registerValidationFailedPresets(isPresetValidationFailed) +} + // SnapshotState captures the current state of all prebuilds across templates. func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) { ctx, span := c.tracer.Start(ctx, "prebuilds.SnapshotState") @@ -783,11 +821,37 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge return nil } + // If preset previously failed validation (e.g. missing required parameter, + // invalid workspace tags), skip creation until the template version is updated. + // The status resets naturally when a new template version is promoted, since + // new presets are created with the default 'healthy' status. + if ps.Preset.PrebuildStatus == database.PrebuildStatusValidationFailed && action.Create > 0 { + logger.Warn(ctx, "skipping preset with validation failure for create operation") + return nil + } + var multiErr multierror.Error for range action.Create { if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil { logger.Error(ctx, "failed to create prebuild", slog.Error(err)) multiErr.Errors = append(multiErr.Errors, err) + + // A 400 BuildError means the build failed due to a validation error + // (e.g. missing parameter, invalid workspace tags). These errors are + // deterministic and will persist until the template is updated, so we + // mark the preset to prevent endless retries on every reconciliation loop. + var buildErr wsbuilder.BuildError + if xerrors.As(err, &buildErr) && buildErr.Status == http.StatusBadRequest { + logger.Warn(ctx, "marking preset as failed validation") + if dbErr := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{ + Status: database.PrebuildStatusValidationFailed, + PresetID: ps.Preset.ID, + }); dbErr != nil { + logger.Error(ctx, "failed to update preset prebuild status", slog.Error(dbErr)) + } + // All prebuilds for this preset will fail the same way, so stop trying. + break + } } } diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index d85760373a..1fb67fd2d4 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -507,6 +507,37 @@ func (*brokenPublisher) Publish(event string, _ []byte) error { return xerrors.Errorf("failed to publish %q", event) } +// prebuildStoreWrapper wraps database.Store to inject errors for testing. +type prebuildStoreWrapper struct { + database.Store + insertProvisionerJobErr error + errorOnTemplateVersionID uuid.UUID +} + +func (s prebuildStoreWrapper) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) { + if s.insertProvisionerJobErr != nil { + return database.ProvisionerJob{}, s.insertProvisionerJobErr + } + return s.Store.InsertProvisionerJob(ctx, arg) +} + +func (s prebuildStoreWrapper) InsertWorkspaceBuild(ctx context.Context, arg database.InsertWorkspaceBuildParams) error { + if s.errorOnTemplateVersionID != uuid.Nil && arg.TemplateVersionID == s.errorOnTemplateVersionID { + return xerrors.Errorf("injected internal server error for template version %s", s.errorOnTemplateVersionID) + } + return s.Store.InsertWorkspaceBuild(ctx, arg) +} + +func (s prebuildStoreWrapper) InTx(fn func(database.Store) error, opts *database.TxOptions) error { + return s.Store.InTx(func(tx database.Store) error { + return fn(prebuildStoreWrapper{ + Store: tx, + insertProvisionerJobErr: s.insertProvisionerJobErr, + errorOnTemplateVersionID: s.errorOnTemplateVersionID, + }) + }, opts) +} + func TestMultiplePresetsPerTemplateVersion(t *testing.T) { t.Parallel() @@ -983,9 +1014,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) { mf, err := registry.Gather() require.NoError(t, err) metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "org_name": org.Name, + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, }) require.Nil(t, metric) @@ -1020,9 +1051,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) { mf, err = registry.Gather() require.NoError(t, err) metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "org_name": org.Name, + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, }) require.Nil(t, metric) return @@ -1036,9 +1067,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) { mf, err = registry.Gather() require.NoError(t, err) metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "org_name": org.Name, + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, }) require.NotNil(t, metric) require.NotNil(t, metric.GetGauge()) @@ -1047,6 +1078,356 @@ func TestSkippingHardLimitedPresets(t *testing.T) { } } +func TestValidationFailedPresets(t *testing.T) { + t.Parallel() + + // This test uses 5 presets sharing one DB to verify validation_failed behavior: + // | Preset | Setup | Expected After Reconcile | + // |--------|-----------------------------------------|-------------------------------------------| + // | A | Already validation_failed, desired=2 | Stays validation_failed, 0 workspaces | + // | B | Healthy, required param missing | Marked validation_failed, 0 workspaces | + // | C | Healthy, desired=3, required param | Marked validation_failed, 0 workspaces | + // | D | Healthy, DB wrapper injects 500 | Stays healthy, 0 workspaces | + // | E | Healthy, desired=1 (control) | Stays healthy, 1 workspaces | + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitMedium) + cfg := codersdk.PrebuildsConfig{} + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{}) + registry := prometheus.NewRegistry() + + // Set up shared test environment. + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org := dbgen.Organization(t, db, database.Organization{}) + _ = dbgen.OrganizationMember(t, db, database.OrganizationMember{ + OrganizationID: org.ID, + UserID: ownerID, + }) + + // Helper to create template + version + optional required param. + createTemplate := func(name string, addRequiredParam bool) (database.Template, database.TemplateVersion) { + // First create the template (with a placeholder ActiveVersionID that we'll update). + tpl := dbgen.Template(t, db, database.Template{ + OrganizationID: org.ID, + CreatedBy: ownerID, + Name: name, + }) + + // Now create the provisioner job and template version linked to the template. + job := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{ + OrganizationID: org.ID, + CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true}, + InitiatorID: ownerID, + }) + tv := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true}, + OrganizationID: org.ID, + JobID: job.ID, + CreatedBy: ownerID, + }) + + // Update template to point to this version as active. + require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{ + ID: tpl.ID, + ActiveVersionID: tv.ID, + })) + + if addRequiredParam { + dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{ + TemplateVersionID: tv.ID, + Name: "required_param", + Description: "required param to trigger validation failure", + Type: "bool", + DefaultValue: "", + Required: true, + }) + } + return tpl, tv + } + + // Create templates. + tplA, tvA := createTemplate("tpl-already-failed", false) + tplB, tvB := createTemplate("tpl-will-400", true) + tplC, tvC := createTemplate("tpl-multi-create", true) + tplD, tvD := createTemplate("tpl-will-500", false) + tplE, tvE := createTemplate("tpl-control", false) + + // Create presets. + presetA := dbgen.Preset(t, db, database.InsertPresetParams{ + TemplateVersionID: tvA.ID, + Name: "preset-already-failed", + DesiredInstances: sql.NullInt32{Int32: 2, Valid: true}, + }) + // Mark preset A as validation_failed from the start. + err := db.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{ + PresetID: presetA.ID, + Status: database.PrebuildStatusValidationFailed, + }) + require.NoError(t, err) + + presetB := dbgen.Preset(t, db, database.InsertPresetParams{ + TemplateVersionID: tvB.ID, + Name: "preset-will-400", + DesiredInstances: sql.NullInt32{Int32: 1, Valid: true}, + }) + presetC := dbgen.Preset(t, db, database.InsertPresetParams{ + TemplateVersionID: tvC.ID, + Name: "preset-multi-create", + DesiredInstances: sql.NullInt32{Int32: 3, Valid: true}, + }) + presetD := dbgen.Preset(t, db, database.InsertPresetParams{ + TemplateVersionID: tvD.ID, + Name: "preset-will-500", + DesiredInstances: sql.NullInt32{Int32: 1, Valid: true}, + }) + presetE := dbgen.Preset(t, db, database.InsertPresetParams{ + TemplateVersionID: tvE.ID, + Name: "preset-control", + DesiredInstances: sql.NullInt32{Int32: 1, Valid: true}, + }) + + // Wrap DB to inject 500 error for template D's version. + wrappedDB := prebuildStoreWrapper{ + Store: db, + errorOnTemplateVersionID: tvD.ID, + } + + controller := prebuilds.NewStoreReconciler( + wrappedDB, pubSub, cache, cfg, logger, + clock, + registry, + newNoopEnqueuer(), + newNoopUsageCheckerPtr(), + noop.NewTracerProvider(), + 10, + nil, + ) + + // First reconcile: marks B, C as validation_failed. + _, err = controller.ReconcileAll(ctx) + require.NoError(t, err) + + // Second reconcile: updates metrics with newly-failed presets + // (metrics are updated based on snapshot taken at the START of ReconcileAll). + _, err = controller.ReconcileAll(ctx) + require.NoError(t, err) + + // Verify preset states. + verifyPreset := func(presetID uuid.UUID, expectedStatus database.PrebuildStatus, templateID uuid.UUID, expectWorkspaces int) { + preset, err := db.GetPresetByID(ctx, presetID) + require.NoError(t, err) + require.Equal(t, expectedStatus, preset.PrebuildStatus, + "preset %s should have status %s", preset.Name, expectedStatus) + + workspaces, err := db.GetWorkspacesByTemplateID(ctx, templateID) + require.NoError(t, err) + require.Len(t, workspaces, expectWorkspaces, + "template %s should have %d workspaces", templateID, expectWorkspaces) + } + + // Preset A: already validation_failed, stays that way, no workspaces. + verifyPreset(presetA.ID, database.PrebuildStatusValidationFailed, tplA.ID, 0) + // Preset B: healthy -> validation_failed due to 400 (missing required param). + verifyPreset(presetB.ID, database.PrebuildStatusValidationFailed, tplB.ID, 0) + // Preset C: healthy -> validation_failed due to 400 (missing required param), even with 3 desired instances. + verifyPreset(presetC.ID, database.PrebuildStatusValidationFailed, tplC.ID, 0) + // Preset D: stays healthy because 500 error does not mark as validation_failed. + verifyPreset(presetD.ID, database.PrebuildStatusHealthy, tplD.ID, 0) + // Preset E: stays healthy (control) + verifyPreset(presetE.ID, database.PrebuildStatusHealthy, tplE.ID, 1) + + // Verify metrics: A, B, C should have validation_failed metric set to 1. + require.NoError(t, controller.ForceMetricsUpdate(ctx)) + mf, err := registry.Gather() + require.NoError(t, err) + + // Helper to check metric value. + checkMetric := func(templateName, presetName string, expectSet bool) { + metric := findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{ + "template_name": templateName, + "preset_name": presetName, + "organization_name": org.Name, + }) + if expectSet { + require.NotNil(t, metric, "metric should be set for preset %s", presetName) + require.NotNil(t, metric.GetGauge()) + require.EqualValues(t, 1, metric.GetGauge().GetValue(), + "metric value should be 1 for preset %s", presetName) + } else { + require.Nil(t, metric, "metric should not be set for preset %s", presetName) + } + } + + checkMetric(tplA.Name, presetA.Name, true) + checkMetric(tplB.Name, presetB.Name, true) + checkMetric(tplC.Name, presetC.Name, true) + checkMetric(tplD.Name, presetD.Name, false) + checkMetric(tplE.Name, presetE.Name, false) +} + +// TestValidationFailedPresetResets verifies that when a preset is marked as +// validation_failed and a new template version is promoted, the new preset +// starts healthy and the validation_failed metric is cleared. +func TestValidationFailedPresetResets(t *testing.T) { + t.Parallel() + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitMedium) + cfg := codersdk.PrebuildsConfig{} + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{}) + registry := prometheus.NewRegistry() + + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org := dbgen.Organization(t, db, database.Organization{}) + _ = dbgen.OrganizationMember(t, db, database.OrganizationMember{ + OrganizationID: org.ID, + UserID: ownerID, + }) + + // Create a template with a required param that will cause validation failure. + tpl := dbgen.Template(t, db, database.Template{ + OrganizationID: org.ID, + CreatedBy: ownerID, + Name: "tpl-version-reset", + }) + + job1 := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{ + OrganizationID: org.ID, + CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true}, + InitiatorID: ownerID, + }) + tv1 := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true}, + OrganizationID: org.ID, + JobID: job1.ID, + CreatedBy: ownerID, + }) + require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{ + ID: tpl.ID, + ActiveVersionID: tv1.ID, + })) + + // Add a required param with no default, this triggers validation failure. + dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{ + TemplateVersionID: tv1.ID, + Name: "required_param", + Description: "required param to trigger validation failure", + Type: "bool", + DefaultValue: "", + Required: true, + }) + + preset1 := dbgen.Preset(t, db, database.InsertPresetParams{ + TemplateVersionID: tv1.ID, + Name: "preset-test", + DesiredInstances: sql.NullInt32{Int32: 1, Valid: true}, + }) + + reconciler := prebuilds.NewStoreReconciler( + db, pubSub, cache, cfg, logger, + clock, + registry, + newNoopEnqueuer(), + newNoopUsageCheckerPtr(), + noop.NewTracerProvider(), + 10, + nil, + ) + + // First reconcile: preset gets marked as validation_failed. + _, err := reconciler.ReconcileAll(ctx) + require.NoError(t, err) + + // Verify preset is marked as validation_failed in the database. + updatedPreset, err := db.GetPresetByID(ctx, preset1.ID) + require.NoError(t, err) + require.Equal(t, database.PrebuildStatusValidationFailed, updatedPreset.PrebuildStatus) + + // Second reconcile: metrics snapshot picks up the validation_failed status. + _, err = reconciler.ReconcileAll(ctx) + require.NoError(t, err) + + // Verify metric is set. + require.NoError(t, reconciler.ForceMetricsUpdate(ctx)) + mf, err := registry.Gather() + require.NoError(t, err) + metric := findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{ + "template_name": tpl.Name, + "preset_name": preset1.Name, + "organization_name": org.Name, + }) + require.NotNil(t, metric) + require.EqualValues(t, 1, metric.GetGauge().GetValue()) + + // Promote a new template version without the problematic param. + job2 := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{ + OrganizationID: org.ID, + CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true}, + InitiatorID: ownerID, + }) + tv2 := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true}, + OrganizationID: org.ID, + JobID: job2.ID, + CreatedBy: ownerID, + }) + require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{ + ID: tpl.ID, + ActiveVersionID: tv2.ID, + })) + + // Create a preset on the new version. + preset2 := dbgen.Preset(t, db, database.InsertPresetParams{ + TemplateVersionID: tv2.ID, + Name: "preset-test", // same name, new version + DesiredInstances: sql.NullInt32{Int32: 1, Valid: true}, + }) + + // Reconcile with the new version active. + _, err = reconciler.ReconcileAll(ctx) + require.NoError(t, err) + + // Old preset stays validation_failed (it's now inactive, won't be reset). + oldPreset, err := db.GetPresetByID(ctx, preset1.ID) + require.NoError(t, err) + require.Equal(t, database.PrebuildStatusValidationFailed, oldPreset.PrebuildStatus) + + // New preset is healthy. + newPreset, err := db.GetPresetByID(ctx, preset2.ID) + require.NoError(t, err) + require.Equal(t, database.PrebuildStatusHealthy, newPreset.PrebuildStatus) + + // Metric should be cleared: the old preset is inactive, so it's no longer reported. + require.NoError(t, reconciler.ForceMetricsUpdate(ctx)) + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{ + "template_name": tpl.Name, + "preset_name": preset1.Name, + "organization_name": org.Name, + }) + require.Nil(t, metric) + + // New preset should have a workspace created. + workspaces, err := db.GetWorkspacesByTemplateID(ctx, tpl.ID) + require.NoError(t, err) + require.Len(t, workspaces, 1) +} + func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { t.Parallel() @@ -1172,9 +1553,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { mf, err := registry.Gather() require.NoError(t, err) metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "org_name": org.Name, + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, }) require.Nil(t, metric) @@ -1212,9 +1593,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { mf, err = registry.Gather() require.NoError(t, err) metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "org_name": org.Name, + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, }) require.NotNil(t, metric) require.NotNil(t, metric.GetGauge()) @@ -1263,9 +1644,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { mf, err = registry.Gather() require.NoError(t, err) metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "org_name": org.Name, + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, }) require.Nil(t, metric) }) @@ -1668,9 +2049,9 @@ func TestTrackResourceReplacement(t *testing.T) { mf, err := registry.Gather() require.NoError(t, err) require.Nil(t, findMetric(mf, prebuilds.MetricResourceReplacementsCount, map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "org_name": org.Name, + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, })) // When: a claim occurred and resource replacements are detected (_how_ is out of scope of this test). @@ -1711,9 +2092,9 @@ func TestTrackResourceReplacement(t *testing.T) { mf, err = registry.Gather() require.NoError(t, err) metric := findMetric(mf, prebuilds.MetricResourceReplacementsCount, map[string]string{ - "template_name": template.Name, - "preset_name": preset.Name, - "org_name": org.Name, + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, }) require.NotNil(t, metric) require.NotNil(t, metric.GetCounter()) diff --git a/scripts/metricsdocgen/generated_metrics b/scripts/metricsdocgen/generated_metrics index 887f8f469c..9eae5000ef 100644 --- a/scripts/metricsdocgen/generated_metrics +++ b/scripts/metricsdocgen/generated_metrics @@ -265,6 +265,9 @@ coderd_prebuilt_workspaces_metrics_last_updated 0 # HELP coderd_prebuilt_workspaces_preset_hard_limited Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise. # TYPE coderd_prebuilt_workspaces_preset_hard_limited gauge coderd_prebuilt_workspaces_preset_hard_limited{template_name="",preset_name="",organization_name=""} 0 +# HELP coderd_prebuilt_workspaces_preset_validation_failed Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise. +# TYPE coderd_prebuilt_workspaces_preset_validation_failed gauge +coderd_prebuilt_workspaces_preset_validation_failed{template_name="",preset_name="",organization_name=""} 0 # HELP coderd_prebuilt_workspaces_reconciliation_paused Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused). # TYPE coderd_prebuilt_workspaces_reconciliation_paused gauge coderd_prebuilt_workspaces_reconciliation_paused 0