fix: mark presets as validation_failed to prevent endless prebuild retries (#22085)

## Description

- Updates `wsbuilder` to return a `BuildError` with
`http.StatusBadRequest` to signify a "validation error" on missing or
invalid parameters
- Adds a short-circuit in `prebuilds.StoreReconciler` to mark presets
for which creating a build returns a "validation error" as "validation
failed" and skip further attempts to reconcile.
- Adds a test to verify the above
- Introduces a new Prometheus metric
`coderd_prebuilt_workspaces_preset_validation_failed` to track the above

Closes: https://github.com/coder/coder/issues/21237

---------

Co-authored-by: Cian Johnston <cian@coder.com>
This commit is contained in:
Susana Ferreira
2026-02-27 14:26:48 +00:00
committed by GitHub
parent dea451de41
commit ca234f346d
8 changed files with 596 additions and 43 deletions
+64
View File
@@ -0,0 +1,64 @@
package wsbuilder_test
import (
"net/http"
"testing"
"github.com/hashicorp/hcl/v2"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/dynamicparameters"
"github.com/coder/coder/v2/coderd/wsbuilder"
)
func TestBuildErrorResponseDelegation(t *testing.T) {
t.Parallel()
t.Run("plain_error", func(t *testing.T) {
t.Parallel()
be := wsbuilder.BuildError{
Status: http.StatusBadRequest,
Message: "bad",
Wrapped: xerrors.New("oops"),
}
status, resp := be.Response()
require.Equal(t, http.StatusBadRequest, status)
require.Equal(t, "bad", resp.Message)
require.Contains(t, resp.Detail, "oops")
require.Empty(t, resp.Validations)
})
t.Run("responder_error", func(t *testing.T) {
t.Parallel()
inner := &dynamicparameters.DiagnosticError{
Message: "resolve parameters",
KeyedDiagnostics: map[string]hcl.Diagnostics{
"param1": {
{
Severity: hcl.DiagError,
Summary: "required parameter",
},
},
},
}
be := wsbuilder.BuildError{
Status: http.StatusBadRequest,
Message: "build error wrapper",
Wrapped: inner,
}
status, resp := be.Response()
// Should delegate to the inner DiagnosticError's response.
innerStatus, innerResp := inner.Response()
require.Equal(t, innerStatus, status)
require.Equal(t, innerResp.Message, resp.Message)
require.Len(t, resp.Validations, 1)
require.Equal(t, "param1", resp.Validations[0].Field)
})
}
+10 -2
View File
@@ -24,6 +24,7 @@ import (
"github.com/coder/coder/v2/coderd/dynamicparameters"
"github.com/coder/coder/v2/coderd/files"
"github.com/coder/coder/v2/coderd/httpapi"
"github.com/coder/coder/v2/coderd/httpapi/httperror"
"github.com/coder/coder/v2/coderd/prebuilds"
"github.com/coder/coder/v2/coderd/provisionerdserver"
"github.com/coder/coder/v2/coderd/rbac"
@@ -280,6 +281,13 @@ func (e BuildError) Unwrap() error {
}
func (e BuildError) Response() (int, codersdk.Response) {
// If the wrapped error knows how to produce its own response
// (e.g. DiagnosticError with Validations), prefer that over
// the generic BuildError response.
if inner, ok := httperror.IsResponder(e.Wrapped); ok {
return inner.Response()
}
return e.Status, codersdk.Response{
Message: e.Message,
Detail: e.Error(),
@@ -875,7 +883,7 @@ func (b *Builder) getDynamicParameters() (names, values []string, err error) {
b.richParameterValues,
presetParameterValues)
if err != nil {
return nil, nil, xerrors.Errorf("resolve parameters: %w", err)
return nil, nil, BuildError{http.StatusBadRequest, "resolve parameters", err}
}
names = make([]string, 0, len(buildValues))
@@ -1112,7 +1120,7 @@ func (b *Builder) getDynamicProvisionerTags() (map[string]string, error) {
output, diags := render.Render(b.ctx, b.workspace.OwnerID, vals)
tagErr := dynamicparameters.CheckTags(output, diags)
if tagErr != nil {
return nil, tagErr
return nil, BuildError{http.StatusBadRequest, "workspace tags validation failed", tagErr}
}
for k, v := range output.WorkspaceTags.Tags() {
+1
View File
@@ -211,6 +211,7 @@ deployment. They will always be available from the agent.
| `coderd_prebuilt_workspaces_failed_total` | counter | Total number of prebuilt workspaces that failed to build. | `organization_name` `preset_name` `template_name` |
| `coderd_prebuilt_workspaces_metrics_last_updated` | gauge | The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached. | |
| `coderd_prebuilt_workspaces_preset_hard_limited` | gauge | Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise. | `organization_name` `preset_name` `template_name` |
| `coderd_prebuilt_workspaces_preset_validation_failed` | gauge | Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise. | `organization_name` `preset_name` `template_name` |
| `coderd_prebuilt_workspaces_reconciliation_paused` | gauge | Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused). | |
| `coderd_prebuilt_workspaces_resource_replacements_total` | counter | Total number of prebuilt workspaces whose resource(s) got replaced upon being claimed. In Terraform, drift on immutable attributes results in resource replacement. This represents a worst-case scenario for prebuilt workspaces because the pre-provisioned resource would have been recreated when claiming, thus obviating the point of pre-provisioning. See https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement | `organization_name` `preset_name` `template_name` |
| `coderd_prebuilt_workspaces_running` | gauge | Current number of prebuilt workspaces that are in a running state. These workspaces have started successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible). | `organization_name` `preset_name` `template_name` |
+45 -15
View File
@@ -19,16 +19,17 @@ import (
const (
namespace = "coderd_prebuilt_workspaces_"
MetricCreatedCount = namespace + "created_total"
MetricFailedCount = namespace + "failed_total"
MetricClaimedCount = namespace + "claimed_total"
MetricResourceReplacementsCount = namespace + "resource_replacements_total"
MetricDesiredGauge = namespace + "desired"
MetricRunningGauge = namespace + "running"
MetricEligibleGauge = namespace + "eligible"
MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
MetricReconciliationPausedGauge = namespace + "reconciliation_paused"
MetricCreatedCount = namespace + "created_total"
MetricFailedCount = namespace + "failed_total"
MetricClaimedCount = namespace + "claimed_total"
MetricResourceReplacementsCount = namespace + "resource_replacements_total"
MetricDesiredGauge = namespace + "desired"
MetricRunningGauge = namespace + "running"
MetricEligibleGauge = namespace + "eligible"
MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
MetricPresetValidationFailedGauge = namespace + "preset_validation_failed"
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
MetricReconciliationPausedGauge = namespace + "reconciliation_paused"
)
var (
@@ -89,6 +90,12 @@ var (
labels,
nil,
)
presetValidationFailedDesc = prometheus.NewDesc(
MetricPresetValidationFailedGauge,
"Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise.",
labels,
nil,
)
lastUpdateDesc = prometheus.NewDesc(
MetricLastUpdatedGauge,
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
@@ -121,6 +128,9 @@ type MetricsCollector struct {
isPresetHardLimited map[hardLimitedPresetKey]bool
isPresetHardLimitedMu sync.Mutex
isPresetValidationFailed map[hardLimitedPresetKey]bool
isPresetValidationFailedMu sync.Mutex
reconciliationPaused bool
reconciliationPausedMu sync.RWMutex
}
@@ -131,11 +141,12 @@ func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter preb
log := logger.Named("prebuilds_metrics_collector")
return &MetricsCollector{
database: db,
logger: log,
snapshotter: snapshotter,
replacementsCounter: make(map[replacementKey]float64),
isPresetHardLimited: make(map[hardLimitedPresetKey]bool),
database: db,
logger: log,
snapshotter: snapshotter,
replacementsCounter: make(map[replacementKey]float64),
isPresetHardLimited: make(map[hardLimitedPresetKey]bool),
isPresetValidationFailed: make(map[hardLimitedPresetKey]bool),
}
}
@@ -148,6 +159,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- runningPrebuildsDesc
descCh <- eligiblePrebuildsDesc
descCh <- presetHardLimitedDesc
descCh <- presetValidationFailedDesc
descCh <- lastUpdateDesc
descCh <- reconciliationPausedDesc
}
@@ -216,6 +228,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
}
mc.isPresetHardLimitedMu.Unlock()
mc.isPresetValidationFailedMu.Lock()
for key, isValidationFailed := range mc.isPresetValidationFailed {
var val float64
if isValidationFailed {
val = 1
}
metricsCh <- prometheus.MustNewConstMetric(presetValidationFailedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName)
}
mc.isPresetValidationFailedMu.Unlock()
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
}
@@ -306,6 +329,13 @@ func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[h
mc.isPresetHardLimited = isPresetHardLimited
}
func (mc *MetricsCollector) registerValidationFailedPresets(isPresetValidationFailed map[hardLimitedPresetKey]bool) {
mc.isPresetValidationFailedMu.Lock()
defer mc.isPresetValidationFailedMu.Unlock()
mc.isPresetValidationFailed = isPresetValidationFailed
}
func (mc *MetricsCollector) setReconciliationPaused(paused bool) {
mc.reconciliationPausedMu.Lock()
defer mc.reconciliationPausedMu.Unlock()
@@ -432,6 +432,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string,
continue
}
metricLoop:
for _, metric := range metricFamily.GetMetric() {
labelPairs := metric.GetLabel()
@@ -444,7 +445,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string,
// Check if all requested labels match
for wantName, wantValue := range labels {
if metricLabels[wantName] != wantValue {
continue
continue metricLoop
}
}
@@ -458,6 +459,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string,
func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labels map[string]string) map[string]*prometheus_client.Metric {
series := make(map[string]*prometheus_client.Metric)
for _, metricFamily := range metricsFamilies {
metricLoop:
for _, metric := range metricFamily.GetMetric() {
labelPairs := metric.GetLabel()
@@ -474,7 +476,7 @@ func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labe
// Check if all requested labels match
for wantName, wantValue := range labels {
if metricLabels[wantName] != wantValue {
continue
continue metricLoop
}
}
+64
View File
@@ -7,6 +7,7 @@ import (
"errors"
"fmt"
"math"
"net/http"
"strings"
"sync"
"sync/atomic"
@@ -413,6 +414,7 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) (stats prebuilds.Rec
}
c.reportHardLimitedPresets(snapshot)
c.reportValidationFailedPresets(snapshot)
if len(snapshot.Presets) == 0 {
logger.Debug(ctx, "no templates found with prebuilds configured")
@@ -514,6 +516,42 @@ func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSna
c.metrics.registerHardLimitedPresets(isPresetHardLimited)
}
func (c *StoreReconciler) reportValidationFailedPresets(snapshot *prebuilds.GlobalSnapshot) {
// presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets.
// Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
// because templates can have multiple versions - or deleted templates can share the same name.
presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow)
for _, preset := range snapshot.Presets {
key := hardLimitedPresetKey{
orgName: preset.OrganizationName,
templateName: preset.TemplateName,
presetName: preset.Name,
}
presetsMap[key] = append(presetsMap[key], preset)
}
// Report a preset as validation-failed only if all the following conditions are met:
// - The preset has PrebuildStatus == PrebuildStatusValidationFailed
// - The preset is using the active version of its template, and the template has not been deleted
//
// The second condition is important because a validation-failed preset that has become outdated is no longer relevant.
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
// as validation-failed to the admin.
isPresetValidationFailed := make(map[hardLimitedPresetKey]bool)
for key, presets := range presetsMap {
for _, preset := range presets {
if preset.UsingActiveVersion && !preset.Deleted &&
preset.PrebuildStatus == database.PrebuildStatusValidationFailed {
isPresetValidationFailed[key] = true
break
}
}
}
c.metrics.registerValidationFailedPresets(isPresetValidationFailed)
}
// SnapshotState captures the current state of all prebuilds across templates.
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
ctx, span := c.tracer.Start(ctx, "prebuilds.SnapshotState")
@@ -783,11 +821,37 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
return nil
}
// If preset previously failed validation (e.g. missing required parameter,
// invalid workspace tags), skip creation until the template version is updated.
// The status resets naturally when a new template version is promoted, since
// new presets are created with the default 'healthy' status.
if ps.Preset.PrebuildStatus == database.PrebuildStatusValidationFailed && action.Create > 0 {
logger.Warn(ctx, "skipping preset with validation failure for create operation")
return nil
}
var multiErr multierror.Error
for range action.Create {
if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
multiErr.Errors = append(multiErr.Errors, err)
// A 400 BuildError means the build failed due to a validation error
// (e.g. missing parameter, invalid workspace tags). These errors are
// deterministic and will persist until the template is updated, so we
// mark the preset to prevent endless retries on every reconciliation loop.
var buildErr wsbuilder.BuildError
if xerrors.As(err, &buildErr) && buildErr.Status == http.StatusBadRequest {
logger.Warn(ctx, "marking preset as failed validation")
if dbErr := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
Status: database.PrebuildStatusValidationFailed,
PresetID: ps.Preset.ID,
}); dbErr != nil {
logger.Error(ctx, "failed to update preset prebuild status", slog.Error(dbErr))
}
// All prebuilds for this preset will fail the same way, so stop trying.
break
}
}
}
+405 -24
View File
@@ -507,6 +507,37 @@ func (*brokenPublisher) Publish(event string, _ []byte) error {
return xerrors.Errorf("failed to publish %q", event)
}
// prebuildStoreWrapper wraps database.Store to inject errors for testing.
type prebuildStoreWrapper struct {
database.Store
insertProvisionerJobErr error
errorOnTemplateVersionID uuid.UUID
}
func (s prebuildStoreWrapper) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) {
if s.insertProvisionerJobErr != nil {
return database.ProvisionerJob{}, s.insertProvisionerJobErr
}
return s.Store.InsertProvisionerJob(ctx, arg)
}
func (s prebuildStoreWrapper) InsertWorkspaceBuild(ctx context.Context, arg database.InsertWorkspaceBuildParams) error {
if s.errorOnTemplateVersionID != uuid.Nil && arg.TemplateVersionID == s.errorOnTemplateVersionID {
return xerrors.Errorf("injected internal server error for template version %s", s.errorOnTemplateVersionID)
}
return s.Store.InsertWorkspaceBuild(ctx, arg)
}
func (s prebuildStoreWrapper) InTx(fn func(database.Store) error, opts *database.TxOptions) error {
return s.Store.InTx(func(tx database.Store) error {
return fn(prebuildStoreWrapper{
Store: tx,
insertProvisionerJobErr: s.insertProvisionerJobErr,
errorOnTemplateVersionID: s.errorOnTemplateVersionID,
})
}, opts)
}
func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
t.Parallel()
@@ -983,9 +1014,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
mf, err := registry.Gather()
require.NoError(t, err)
metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
})
require.Nil(t, metric)
@@ -1020,9 +1051,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
})
require.Nil(t, metric)
return
@@ -1036,9 +1067,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
})
require.NotNil(t, metric)
require.NotNil(t, metric.GetGauge())
@@ -1047,6 +1078,356 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
}
}
func TestValidationFailedPresets(t *testing.T) {
t.Parallel()
// This test uses 5 presets sharing one DB to verify validation_failed behavior:
// | Preset | Setup | Expected After Reconcile |
// |--------|-----------------------------------------|-------------------------------------------|
// | A | Already validation_failed, desired=2 | Stays validation_failed, 0 workspaces |
// | B | Healthy, required param missing | Marked validation_failed, 0 workspaces |
// | C | Healthy, desired=3, required param | Marked validation_failed, 0 workspaces |
// | D | Healthy, DB wrapper injects 500 | Stays healthy, 0 workspaces |
// | E | Healthy, desired=1 (control) | Stays healthy, 1 workspaces |
clock := quartz.NewMock(t)
ctx := testutil.Context(t, testutil.WaitMedium)
cfg := codersdk.PrebuildsConfig{}
logger := slogtest.Make(
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
registry := prometheus.NewRegistry()
// Set up shared test environment.
ownerID := uuid.New()
dbgen.User(t, db, database.User{
ID: ownerID,
})
org := dbgen.Organization(t, db, database.Organization{})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{
OrganizationID: org.ID,
UserID: ownerID,
})
// Helper to create template + version + optional required param.
createTemplate := func(name string, addRequiredParam bool) (database.Template, database.TemplateVersion) {
// First create the template (with a placeholder ActiveVersionID that we'll update).
tpl := dbgen.Template(t, db, database.Template{
OrganizationID: org.ID,
CreatedBy: ownerID,
Name: name,
})
// Now create the provisioner job and template version linked to the template.
job := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{
OrganizationID: org.ID,
CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true},
InitiatorID: ownerID,
})
tv := dbgen.TemplateVersion(t, db, database.TemplateVersion{
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
OrganizationID: org.ID,
JobID: job.ID,
CreatedBy: ownerID,
})
// Update template to point to this version as active.
require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{
ID: tpl.ID,
ActiveVersionID: tv.ID,
}))
if addRequiredParam {
dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{
TemplateVersionID: tv.ID,
Name: "required_param",
Description: "required param to trigger validation failure",
Type: "bool",
DefaultValue: "",
Required: true,
})
}
return tpl, tv
}
// Create templates.
tplA, tvA := createTemplate("tpl-already-failed", false)
tplB, tvB := createTemplate("tpl-will-400", true)
tplC, tvC := createTemplate("tpl-multi-create", true)
tplD, tvD := createTemplate("tpl-will-500", false)
tplE, tvE := createTemplate("tpl-control", false)
// Create presets.
presetA := dbgen.Preset(t, db, database.InsertPresetParams{
TemplateVersionID: tvA.ID,
Name: "preset-already-failed",
DesiredInstances: sql.NullInt32{Int32: 2, Valid: true},
})
// Mark preset A as validation_failed from the start.
err := db.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
PresetID: presetA.ID,
Status: database.PrebuildStatusValidationFailed,
})
require.NoError(t, err)
presetB := dbgen.Preset(t, db, database.InsertPresetParams{
TemplateVersionID: tvB.ID,
Name: "preset-will-400",
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
})
presetC := dbgen.Preset(t, db, database.InsertPresetParams{
TemplateVersionID: tvC.ID,
Name: "preset-multi-create",
DesiredInstances: sql.NullInt32{Int32: 3, Valid: true},
})
presetD := dbgen.Preset(t, db, database.InsertPresetParams{
TemplateVersionID: tvD.ID,
Name: "preset-will-500",
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
})
presetE := dbgen.Preset(t, db, database.InsertPresetParams{
TemplateVersionID: tvE.ID,
Name: "preset-control",
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
})
// Wrap DB to inject 500 error for template D's version.
wrappedDB := prebuildStoreWrapper{
Store: db,
errorOnTemplateVersionID: tvD.ID,
}
controller := prebuilds.NewStoreReconciler(
wrappedDB, pubSub, cache, cfg, logger,
clock,
registry,
newNoopEnqueuer(),
newNoopUsageCheckerPtr(),
noop.NewTracerProvider(),
10,
nil,
)
// First reconcile: marks B, C as validation_failed.
_, err = controller.ReconcileAll(ctx)
require.NoError(t, err)
// Second reconcile: updates metrics with newly-failed presets
// (metrics are updated based on snapshot taken at the START of ReconcileAll).
_, err = controller.ReconcileAll(ctx)
require.NoError(t, err)
// Verify preset states.
verifyPreset := func(presetID uuid.UUID, expectedStatus database.PrebuildStatus, templateID uuid.UUID, expectWorkspaces int) {
preset, err := db.GetPresetByID(ctx, presetID)
require.NoError(t, err)
require.Equal(t, expectedStatus, preset.PrebuildStatus,
"preset %s should have status %s", preset.Name, expectedStatus)
workspaces, err := db.GetWorkspacesByTemplateID(ctx, templateID)
require.NoError(t, err)
require.Len(t, workspaces, expectWorkspaces,
"template %s should have %d workspaces", templateID, expectWorkspaces)
}
// Preset A: already validation_failed, stays that way, no workspaces.
verifyPreset(presetA.ID, database.PrebuildStatusValidationFailed, tplA.ID, 0)
// Preset B: healthy -> validation_failed due to 400 (missing required param).
verifyPreset(presetB.ID, database.PrebuildStatusValidationFailed, tplB.ID, 0)
// Preset C: healthy -> validation_failed due to 400 (missing required param), even with 3 desired instances.
verifyPreset(presetC.ID, database.PrebuildStatusValidationFailed, tplC.ID, 0)
// Preset D: stays healthy because 500 error does not mark as validation_failed.
verifyPreset(presetD.ID, database.PrebuildStatusHealthy, tplD.ID, 0)
// Preset E: stays healthy (control)
verifyPreset(presetE.ID, database.PrebuildStatusHealthy, tplE.ID, 1)
// Verify metrics: A, B, C should have validation_failed metric set to 1.
require.NoError(t, controller.ForceMetricsUpdate(ctx))
mf, err := registry.Gather()
require.NoError(t, err)
// Helper to check metric value.
checkMetric := func(templateName, presetName string, expectSet bool) {
metric := findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{
"template_name": templateName,
"preset_name": presetName,
"organization_name": org.Name,
})
if expectSet {
require.NotNil(t, metric, "metric should be set for preset %s", presetName)
require.NotNil(t, metric.GetGauge())
require.EqualValues(t, 1, metric.GetGauge().GetValue(),
"metric value should be 1 for preset %s", presetName)
} else {
require.Nil(t, metric, "metric should not be set for preset %s", presetName)
}
}
checkMetric(tplA.Name, presetA.Name, true)
checkMetric(tplB.Name, presetB.Name, true)
checkMetric(tplC.Name, presetC.Name, true)
checkMetric(tplD.Name, presetD.Name, false)
checkMetric(tplE.Name, presetE.Name, false)
}
// TestValidationFailedPresetResets verifies that when a preset is marked as
// validation_failed and a new template version is promoted, the new preset
// starts healthy and the validation_failed metric is cleared.
func TestValidationFailedPresetResets(t *testing.T) {
t.Parallel()
clock := quartz.NewMock(t)
ctx := testutil.Context(t, testutil.WaitMedium)
cfg := codersdk.PrebuildsConfig{}
logger := slogtest.Make(
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
registry := prometheus.NewRegistry()
ownerID := uuid.New()
dbgen.User(t, db, database.User{
ID: ownerID,
})
org := dbgen.Organization(t, db, database.Organization{})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{
OrganizationID: org.ID,
UserID: ownerID,
})
// Create a template with a required param that will cause validation failure.
tpl := dbgen.Template(t, db, database.Template{
OrganizationID: org.ID,
CreatedBy: ownerID,
Name: "tpl-version-reset",
})
job1 := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{
OrganizationID: org.ID,
CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true},
InitiatorID: ownerID,
})
tv1 := dbgen.TemplateVersion(t, db, database.TemplateVersion{
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
OrganizationID: org.ID,
JobID: job1.ID,
CreatedBy: ownerID,
})
require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{
ID: tpl.ID,
ActiveVersionID: tv1.ID,
}))
// Add a required param with no default, this triggers validation failure.
dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{
TemplateVersionID: tv1.ID,
Name: "required_param",
Description: "required param to trigger validation failure",
Type: "bool",
DefaultValue: "",
Required: true,
})
preset1 := dbgen.Preset(t, db, database.InsertPresetParams{
TemplateVersionID: tv1.ID,
Name: "preset-test",
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
})
reconciler := prebuilds.NewStoreReconciler(
db, pubSub, cache, cfg, logger,
clock,
registry,
newNoopEnqueuer(),
newNoopUsageCheckerPtr(),
noop.NewTracerProvider(),
10,
nil,
)
// First reconcile: preset gets marked as validation_failed.
_, err := reconciler.ReconcileAll(ctx)
require.NoError(t, err)
// Verify preset is marked as validation_failed in the database.
updatedPreset, err := db.GetPresetByID(ctx, preset1.ID)
require.NoError(t, err)
require.Equal(t, database.PrebuildStatusValidationFailed, updatedPreset.PrebuildStatus)
// Second reconcile: metrics snapshot picks up the validation_failed status.
_, err = reconciler.ReconcileAll(ctx)
require.NoError(t, err)
// Verify metric is set.
require.NoError(t, reconciler.ForceMetricsUpdate(ctx))
mf, err := registry.Gather()
require.NoError(t, err)
metric := findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{
"template_name": tpl.Name,
"preset_name": preset1.Name,
"organization_name": org.Name,
})
require.NotNil(t, metric)
require.EqualValues(t, 1, metric.GetGauge().GetValue())
// Promote a new template version without the problematic param.
job2 := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{
OrganizationID: org.ID,
CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true},
InitiatorID: ownerID,
})
tv2 := dbgen.TemplateVersion(t, db, database.TemplateVersion{
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
OrganizationID: org.ID,
JobID: job2.ID,
CreatedBy: ownerID,
})
require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{
ID: tpl.ID,
ActiveVersionID: tv2.ID,
}))
// Create a preset on the new version.
preset2 := dbgen.Preset(t, db, database.InsertPresetParams{
TemplateVersionID: tv2.ID,
Name: "preset-test", // same name, new version
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
})
// Reconcile with the new version active.
_, err = reconciler.ReconcileAll(ctx)
require.NoError(t, err)
// Old preset stays validation_failed (it's now inactive, won't be reset).
oldPreset, err := db.GetPresetByID(ctx, preset1.ID)
require.NoError(t, err)
require.Equal(t, database.PrebuildStatusValidationFailed, oldPreset.PrebuildStatus)
// New preset is healthy.
newPreset, err := db.GetPresetByID(ctx, preset2.ID)
require.NoError(t, err)
require.Equal(t, database.PrebuildStatusHealthy, newPreset.PrebuildStatus)
// Metric should be cleared: the old preset is inactive, so it's no longer reported.
require.NoError(t, reconciler.ForceMetricsUpdate(ctx))
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{
"template_name": tpl.Name,
"preset_name": preset1.Name,
"organization_name": org.Name,
})
require.Nil(t, metric)
// New preset should have a workspace created.
workspaces, err := db.GetWorkspacesByTemplateID(ctx, tpl.ID)
require.NoError(t, err)
require.Len(t, workspaces, 1)
}
func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
t.Parallel()
@@ -1172,9 +1553,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
mf, err := registry.Gather()
require.NoError(t, err)
metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
})
require.Nil(t, metric)
@@ -1212,9 +1593,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
})
require.NotNil(t, metric)
require.NotNil(t, metric.GetGauge())
@@ -1263,9 +1644,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
})
require.Nil(t, metric)
})
@@ -1668,9 +2049,9 @@ func TestTrackResourceReplacement(t *testing.T) {
mf, err := registry.Gather()
require.NoError(t, err)
require.Nil(t, findMetric(mf, prebuilds.MetricResourceReplacementsCount, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
}))
// When: a claim occurred and resource replacements are detected (_how_ is out of scope of this test).
@@ -1711,9 +2092,9 @@ func TestTrackResourceReplacement(t *testing.T) {
mf, err = registry.Gather()
require.NoError(t, err)
metric := findMetric(mf, prebuilds.MetricResourceReplacementsCount, map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"org_name": org.Name,
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
})
require.NotNil(t, metric)
require.NotNil(t, metric.GetCounter())
+3
View File
@@ -265,6 +265,9 @@ coderd_prebuilt_workspaces_metrics_last_updated 0
# HELP coderd_prebuilt_workspaces_preset_hard_limited Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.
# TYPE coderd_prebuilt_workspaces_preset_hard_limited gauge
coderd_prebuilt_workspaces_preset_hard_limited{template_name="",preset_name="",organization_name=""} 0
# HELP coderd_prebuilt_workspaces_preset_validation_failed Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise.
# TYPE coderd_prebuilt_workspaces_preset_validation_failed gauge
coderd_prebuilt_workspaces_preset_validation_failed{template_name="",preset_name="",organization_name=""} 0
# HELP coderd_prebuilt_workspaces_reconciliation_paused Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused).
# TYPE coderd_prebuilt_workspaces_reconciliation_paused gauge
coderd_prebuilt_workspaces_reconciliation_paused 0