mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
fix: mark presets as validation_failed to prevent endless prebuild retries (#22085)
## Description - Updates `wsbuilder` to return a `BuildError` with `http.StatusBadRequest` to signify a "validation error" on missing or invalid parameters - Adds a short-circuit in `prebuilds.StoreReconciler` to mark presets for which creating a build returns a "validation error" as "validation failed" and skip further attempts to reconcile. - Adds a test to verify the above - Introduces a new Prometheus metric `coderd_prebuilt_workspaces_preset_validation_failed` to track the above Closes: https://github.com/coder/coder/issues/21237 --------- Co-authored-by: Cian Johnston <cian@coder.com>
This commit is contained in:
@@ -0,0 +1,64 @@
|
||||
package wsbuilder_test
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/hcl/v2"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/dynamicparameters"
|
||||
"github.com/coder/coder/v2/coderd/wsbuilder"
|
||||
)
|
||||
|
||||
func TestBuildErrorResponseDelegation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("plain_error", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
be := wsbuilder.BuildError{
|
||||
Status: http.StatusBadRequest,
|
||||
Message: "bad",
|
||||
Wrapped: xerrors.New("oops"),
|
||||
}
|
||||
|
||||
status, resp := be.Response()
|
||||
require.Equal(t, http.StatusBadRequest, status)
|
||||
require.Equal(t, "bad", resp.Message)
|
||||
require.Contains(t, resp.Detail, "oops")
|
||||
require.Empty(t, resp.Validations)
|
||||
})
|
||||
|
||||
t.Run("responder_error", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
inner := &dynamicparameters.DiagnosticError{
|
||||
Message: "resolve parameters",
|
||||
KeyedDiagnostics: map[string]hcl.Diagnostics{
|
||||
"param1": {
|
||||
{
|
||||
Severity: hcl.DiagError,
|
||||
Summary: "required parameter",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
be := wsbuilder.BuildError{
|
||||
Status: http.StatusBadRequest,
|
||||
Message: "build error wrapper",
|
||||
Wrapped: inner,
|
||||
}
|
||||
|
||||
status, resp := be.Response()
|
||||
|
||||
// Should delegate to the inner DiagnosticError's response.
|
||||
innerStatus, innerResp := inner.Response()
|
||||
require.Equal(t, innerStatus, status)
|
||||
require.Equal(t, innerResp.Message, resp.Message)
|
||||
require.Len(t, resp.Validations, 1)
|
||||
require.Equal(t, "param1", resp.Validations[0].Field)
|
||||
})
|
||||
}
|
||||
@@ -24,6 +24,7 @@ import (
|
||||
"github.com/coder/coder/v2/coderd/dynamicparameters"
|
||||
"github.com/coder/coder/v2/coderd/files"
|
||||
"github.com/coder/coder/v2/coderd/httpapi"
|
||||
"github.com/coder/coder/v2/coderd/httpapi/httperror"
|
||||
"github.com/coder/coder/v2/coderd/prebuilds"
|
||||
"github.com/coder/coder/v2/coderd/provisionerdserver"
|
||||
"github.com/coder/coder/v2/coderd/rbac"
|
||||
@@ -280,6 +281,13 @@ func (e BuildError) Unwrap() error {
|
||||
}
|
||||
|
||||
func (e BuildError) Response() (int, codersdk.Response) {
|
||||
// If the wrapped error knows how to produce its own response
|
||||
// (e.g. DiagnosticError with Validations), prefer that over
|
||||
// the generic BuildError response.
|
||||
if inner, ok := httperror.IsResponder(e.Wrapped); ok {
|
||||
return inner.Response()
|
||||
}
|
||||
|
||||
return e.Status, codersdk.Response{
|
||||
Message: e.Message,
|
||||
Detail: e.Error(),
|
||||
@@ -875,7 +883,7 @@ func (b *Builder) getDynamicParameters() (names, values []string, err error) {
|
||||
b.richParameterValues,
|
||||
presetParameterValues)
|
||||
if err != nil {
|
||||
return nil, nil, xerrors.Errorf("resolve parameters: %w", err)
|
||||
return nil, nil, BuildError{http.StatusBadRequest, "resolve parameters", err}
|
||||
}
|
||||
|
||||
names = make([]string, 0, len(buildValues))
|
||||
@@ -1112,7 +1120,7 @@ func (b *Builder) getDynamicProvisionerTags() (map[string]string, error) {
|
||||
output, diags := render.Render(b.ctx, b.workspace.OwnerID, vals)
|
||||
tagErr := dynamicparameters.CheckTags(output, diags)
|
||||
if tagErr != nil {
|
||||
return nil, tagErr
|
||||
return nil, BuildError{http.StatusBadRequest, "workspace tags validation failed", tagErr}
|
||||
}
|
||||
|
||||
for k, v := range output.WorkspaceTags.Tags() {
|
||||
|
||||
@@ -211,6 +211,7 @@ deployment. They will always be available from the agent.
|
||||
| `coderd_prebuilt_workspaces_failed_total` | counter | Total number of prebuilt workspaces that failed to build. | `organization_name` `preset_name` `template_name` |
|
||||
| `coderd_prebuilt_workspaces_metrics_last_updated` | gauge | The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached. | |
|
||||
| `coderd_prebuilt_workspaces_preset_hard_limited` | gauge | Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise. | `organization_name` `preset_name` `template_name` |
|
||||
| `coderd_prebuilt_workspaces_preset_validation_failed` | gauge | Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise. | `organization_name` `preset_name` `template_name` |
|
||||
| `coderd_prebuilt_workspaces_reconciliation_paused` | gauge | Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused). | |
|
||||
| `coderd_prebuilt_workspaces_resource_replacements_total` | counter | Total number of prebuilt workspaces whose resource(s) got replaced upon being claimed. In Terraform, drift on immutable attributes results in resource replacement. This represents a worst-case scenario for prebuilt workspaces because the pre-provisioned resource would have been recreated when claiming, thus obviating the point of pre-provisioning. See https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement | `organization_name` `preset_name` `template_name` |
|
||||
| `coderd_prebuilt_workspaces_running` | gauge | Current number of prebuilt workspaces that are in a running state. These workspaces have started successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible). | `organization_name` `preset_name` `template_name` |
|
||||
|
||||
@@ -19,16 +19,17 @@ import (
|
||||
const (
|
||||
namespace = "coderd_prebuilt_workspaces_"
|
||||
|
||||
MetricCreatedCount = namespace + "created_total"
|
||||
MetricFailedCount = namespace + "failed_total"
|
||||
MetricClaimedCount = namespace + "claimed_total"
|
||||
MetricResourceReplacementsCount = namespace + "resource_replacements_total"
|
||||
MetricDesiredGauge = namespace + "desired"
|
||||
MetricRunningGauge = namespace + "running"
|
||||
MetricEligibleGauge = namespace + "eligible"
|
||||
MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
|
||||
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
|
||||
MetricReconciliationPausedGauge = namespace + "reconciliation_paused"
|
||||
MetricCreatedCount = namespace + "created_total"
|
||||
MetricFailedCount = namespace + "failed_total"
|
||||
MetricClaimedCount = namespace + "claimed_total"
|
||||
MetricResourceReplacementsCount = namespace + "resource_replacements_total"
|
||||
MetricDesiredGauge = namespace + "desired"
|
||||
MetricRunningGauge = namespace + "running"
|
||||
MetricEligibleGauge = namespace + "eligible"
|
||||
MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
|
||||
MetricPresetValidationFailedGauge = namespace + "preset_validation_failed"
|
||||
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
|
||||
MetricReconciliationPausedGauge = namespace + "reconciliation_paused"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -89,6 +90,12 @@ var (
|
||||
labels,
|
||||
nil,
|
||||
)
|
||||
presetValidationFailedDesc = prometheus.NewDesc(
|
||||
MetricPresetValidationFailedGauge,
|
||||
"Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise.",
|
||||
labels,
|
||||
nil,
|
||||
)
|
||||
lastUpdateDesc = prometheus.NewDesc(
|
||||
MetricLastUpdatedGauge,
|
||||
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
|
||||
@@ -121,6 +128,9 @@ type MetricsCollector struct {
|
||||
isPresetHardLimited map[hardLimitedPresetKey]bool
|
||||
isPresetHardLimitedMu sync.Mutex
|
||||
|
||||
isPresetValidationFailed map[hardLimitedPresetKey]bool
|
||||
isPresetValidationFailedMu sync.Mutex
|
||||
|
||||
reconciliationPaused bool
|
||||
reconciliationPausedMu sync.RWMutex
|
||||
}
|
||||
@@ -131,11 +141,12 @@ func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter preb
|
||||
log := logger.Named("prebuilds_metrics_collector")
|
||||
|
||||
return &MetricsCollector{
|
||||
database: db,
|
||||
logger: log,
|
||||
snapshotter: snapshotter,
|
||||
replacementsCounter: make(map[replacementKey]float64),
|
||||
isPresetHardLimited: make(map[hardLimitedPresetKey]bool),
|
||||
database: db,
|
||||
logger: log,
|
||||
snapshotter: snapshotter,
|
||||
replacementsCounter: make(map[replacementKey]float64),
|
||||
isPresetHardLimited: make(map[hardLimitedPresetKey]bool),
|
||||
isPresetValidationFailed: make(map[hardLimitedPresetKey]bool),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,6 +159,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
|
||||
descCh <- runningPrebuildsDesc
|
||||
descCh <- eligiblePrebuildsDesc
|
||||
descCh <- presetHardLimitedDesc
|
||||
descCh <- presetValidationFailedDesc
|
||||
descCh <- lastUpdateDesc
|
||||
descCh <- reconciliationPausedDesc
|
||||
}
|
||||
@@ -216,6 +228,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
|
||||
}
|
||||
mc.isPresetHardLimitedMu.Unlock()
|
||||
|
||||
mc.isPresetValidationFailedMu.Lock()
|
||||
for key, isValidationFailed := range mc.isPresetValidationFailed {
|
||||
var val float64
|
||||
if isValidationFailed {
|
||||
val = 1
|
||||
}
|
||||
|
||||
metricsCh <- prometheus.MustNewConstMetric(presetValidationFailedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName)
|
||||
}
|
||||
mc.isPresetValidationFailedMu.Unlock()
|
||||
|
||||
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
|
||||
}
|
||||
|
||||
@@ -306,6 +329,13 @@ func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[h
|
||||
mc.isPresetHardLimited = isPresetHardLimited
|
||||
}
|
||||
|
||||
func (mc *MetricsCollector) registerValidationFailedPresets(isPresetValidationFailed map[hardLimitedPresetKey]bool) {
|
||||
mc.isPresetValidationFailedMu.Lock()
|
||||
defer mc.isPresetValidationFailedMu.Unlock()
|
||||
|
||||
mc.isPresetValidationFailed = isPresetValidationFailed
|
||||
}
|
||||
|
||||
func (mc *MetricsCollector) setReconciliationPaused(paused bool) {
|
||||
mc.reconciliationPausedMu.Lock()
|
||||
defer mc.reconciliationPausedMu.Unlock()
|
||||
|
||||
@@ -432,6 +432,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string,
|
||||
continue
|
||||
}
|
||||
|
||||
metricLoop:
|
||||
for _, metric := range metricFamily.GetMetric() {
|
||||
labelPairs := metric.GetLabel()
|
||||
|
||||
@@ -444,7 +445,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string,
|
||||
// Check if all requested labels match
|
||||
for wantName, wantValue := range labels {
|
||||
if metricLabels[wantName] != wantValue {
|
||||
continue
|
||||
continue metricLoop
|
||||
}
|
||||
}
|
||||
|
||||
@@ -458,6 +459,7 @@ func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string,
|
||||
func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labels map[string]string) map[string]*prometheus_client.Metric {
|
||||
series := make(map[string]*prometheus_client.Metric)
|
||||
for _, metricFamily := range metricsFamilies {
|
||||
metricLoop:
|
||||
for _, metric := range metricFamily.GetMetric() {
|
||||
labelPairs := metric.GetLabel()
|
||||
|
||||
@@ -474,7 +476,7 @@ func findAllMetricSeries(metricsFamilies []*prometheus_client.MetricFamily, labe
|
||||
// Check if all requested labels match
|
||||
for wantName, wantValue := range labels {
|
||||
if metricLabels[wantName] != wantValue {
|
||||
continue
|
||||
continue metricLoop
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
@@ -413,6 +414,7 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) (stats prebuilds.Rec
|
||||
}
|
||||
|
||||
c.reportHardLimitedPresets(snapshot)
|
||||
c.reportValidationFailedPresets(snapshot)
|
||||
|
||||
if len(snapshot.Presets) == 0 {
|
||||
logger.Debug(ctx, "no templates found with prebuilds configured")
|
||||
@@ -514,6 +516,42 @@ func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSna
|
||||
c.metrics.registerHardLimitedPresets(isPresetHardLimited)
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) reportValidationFailedPresets(snapshot *prebuilds.GlobalSnapshot) {
|
||||
// presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets.
|
||||
// Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
|
||||
// because templates can have multiple versions - or deleted templates can share the same name.
|
||||
presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow)
|
||||
for _, preset := range snapshot.Presets {
|
||||
key := hardLimitedPresetKey{
|
||||
orgName: preset.OrganizationName,
|
||||
templateName: preset.TemplateName,
|
||||
presetName: preset.Name,
|
||||
}
|
||||
|
||||
presetsMap[key] = append(presetsMap[key], preset)
|
||||
}
|
||||
|
||||
// Report a preset as validation-failed only if all the following conditions are met:
|
||||
// - The preset has PrebuildStatus == PrebuildStatusValidationFailed
|
||||
// - The preset is using the active version of its template, and the template has not been deleted
|
||||
//
|
||||
// The second condition is important because a validation-failed preset that has become outdated is no longer relevant.
|
||||
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
|
||||
// as validation-failed to the admin.
|
||||
isPresetValidationFailed := make(map[hardLimitedPresetKey]bool)
|
||||
for key, presets := range presetsMap {
|
||||
for _, preset := range presets {
|
||||
if preset.UsingActiveVersion && !preset.Deleted &&
|
||||
preset.PrebuildStatus == database.PrebuildStatusValidationFailed {
|
||||
isPresetValidationFailed[key] = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.metrics.registerValidationFailedPresets(isPresetValidationFailed)
|
||||
}
|
||||
|
||||
// SnapshotState captures the current state of all prebuilds across templates.
|
||||
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
|
||||
ctx, span := c.tracer.Start(ctx, "prebuilds.SnapshotState")
|
||||
@@ -783,11 +821,37 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
|
||||
return nil
|
||||
}
|
||||
|
||||
// If preset previously failed validation (e.g. missing required parameter,
|
||||
// invalid workspace tags), skip creation until the template version is updated.
|
||||
// The status resets naturally when a new template version is promoted, since
|
||||
// new presets are created with the default 'healthy' status.
|
||||
if ps.Preset.PrebuildStatus == database.PrebuildStatusValidationFailed && action.Create > 0 {
|
||||
logger.Warn(ctx, "skipping preset with validation failure for create operation")
|
||||
return nil
|
||||
}
|
||||
|
||||
var multiErr multierror.Error
|
||||
for range action.Create {
|
||||
if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
|
||||
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
|
||||
multiErr.Errors = append(multiErr.Errors, err)
|
||||
|
||||
// A 400 BuildError means the build failed due to a validation error
|
||||
// (e.g. missing parameter, invalid workspace tags). These errors are
|
||||
// deterministic and will persist until the template is updated, so we
|
||||
// mark the preset to prevent endless retries on every reconciliation loop.
|
||||
var buildErr wsbuilder.BuildError
|
||||
if xerrors.As(err, &buildErr) && buildErr.Status == http.StatusBadRequest {
|
||||
logger.Warn(ctx, "marking preset as failed validation")
|
||||
if dbErr := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
|
||||
Status: database.PrebuildStatusValidationFailed,
|
||||
PresetID: ps.Preset.ID,
|
||||
}); dbErr != nil {
|
||||
logger.Error(ctx, "failed to update preset prebuild status", slog.Error(dbErr))
|
||||
}
|
||||
// All prebuilds for this preset will fail the same way, so stop trying.
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -507,6 +507,37 @@ func (*brokenPublisher) Publish(event string, _ []byte) error {
|
||||
return xerrors.Errorf("failed to publish %q", event)
|
||||
}
|
||||
|
||||
// prebuildStoreWrapper wraps database.Store to inject errors for testing.
|
||||
type prebuildStoreWrapper struct {
|
||||
database.Store
|
||||
insertProvisionerJobErr error
|
||||
errorOnTemplateVersionID uuid.UUID
|
||||
}
|
||||
|
||||
func (s prebuildStoreWrapper) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) {
|
||||
if s.insertProvisionerJobErr != nil {
|
||||
return database.ProvisionerJob{}, s.insertProvisionerJobErr
|
||||
}
|
||||
return s.Store.InsertProvisionerJob(ctx, arg)
|
||||
}
|
||||
|
||||
func (s prebuildStoreWrapper) InsertWorkspaceBuild(ctx context.Context, arg database.InsertWorkspaceBuildParams) error {
|
||||
if s.errorOnTemplateVersionID != uuid.Nil && arg.TemplateVersionID == s.errorOnTemplateVersionID {
|
||||
return xerrors.Errorf("injected internal server error for template version %s", s.errorOnTemplateVersionID)
|
||||
}
|
||||
return s.Store.InsertWorkspaceBuild(ctx, arg)
|
||||
}
|
||||
|
||||
func (s prebuildStoreWrapper) InTx(fn func(database.Store) error, opts *database.TxOptions) error {
|
||||
return s.Store.InTx(func(tx database.Store) error {
|
||||
return fn(prebuildStoreWrapper{
|
||||
Store: tx,
|
||||
insertProvisionerJobErr: s.insertProvisionerJobErr,
|
||||
errorOnTemplateVersionID: s.errorOnTemplateVersionID,
|
||||
})
|
||||
}, opts)
|
||||
}
|
||||
|
||||
func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -983,9 +1014,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
|
||||
mf, err := registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.Nil(t, metric)
|
||||
|
||||
@@ -1020,9 +1051,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
|
||||
mf, err = registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.Nil(t, metric)
|
||||
return
|
||||
@@ -1036,9 +1067,9 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
|
||||
mf, err = registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.NotNil(t, metric)
|
||||
require.NotNil(t, metric.GetGauge())
|
||||
@@ -1047,6 +1078,356 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidationFailedPresets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// This test uses 5 presets sharing one DB to verify validation_failed behavior:
|
||||
// | Preset | Setup | Expected After Reconcile |
|
||||
// |--------|-----------------------------------------|-------------------------------------------|
|
||||
// | A | Already validation_failed, desired=2 | Stays validation_failed, 0 workspaces |
|
||||
// | B | Healthy, required param missing | Marked validation_failed, 0 workspaces |
|
||||
// | C | Healthy, desired=3, required param | Marked validation_failed, 0 workspaces |
|
||||
// | D | Healthy, DB wrapper injects 500 | Stays healthy, 0 workspaces |
|
||||
// | E | Healthy, desired=1 (control) | Stays healthy, 1 workspaces |
|
||||
|
||||
clock := quartz.NewMock(t)
|
||||
ctx := testutil.Context(t, testutil.WaitMedium)
|
||||
cfg := codersdk.PrebuildsConfig{}
|
||||
logger := slogtest.Make(
|
||||
t, &slogtest.Options{IgnoreErrors: true},
|
||||
).Leveled(slog.LevelDebug)
|
||||
db, pubSub := dbtestutil.NewDB(t)
|
||||
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
|
||||
registry := prometheus.NewRegistry()
|
||||
|
||||
// Set up shared test environment.
|
||||
ownerID := uuid.New()
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: ownerID,
|
||||
})
|
||||
org := dbgen.Organization(t, db, database.Organization{})
|
||||
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{
|
||||
OrganizationID: org.ID,
|
||||
UserID: ownerID,
|
||||
})
|
||||
|
||||
// Helper to create template + version + optional required param.
|
||||
createTemplate := func(name string, addRequiredParam bool) (database.Template, database.TemplateVersion) {
|
||||
// First create the template (with a placeholder ActiveVersionID that we'll update).
|
||||
tpl := dbgen.Template(t, db, database.Template{
|
||||
OrganizationID: org.ID,
|
||||
CreatedBy: ownerID,
|
||||
Name: name,
|
||||
})
|
||||
|
||||
// Now create the provisioner job and template version linked to the template.
|
||||
job := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{
|
||||
OrganizationID: org.ID,
|
||||
CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true},
|
||||
InitiatorID: ownerID,
|
||||
})
|
||||
tv := dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
||||
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
|
||||
OrganizationID: org.ID,
|
||||
JobID: job.ID,
|
||||
CreatedBy: ownerID,
|
||||
})
|
||||
|
||||
// Update template to point to this version as active.
|
||||
require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{
|
||||
ID: tpl.ID,
|
||||
ActiveVersionID: tv.ID,
|
||||
}))
|
||||
|
||||
if addRequiredParam {
|
||||
dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{
|
||||
TemplateVersionID: tv.ID,
|
||||
Name: "required_param",
|
||||
Description: "required param to trigger validation failure",
|
||||
Type: "bool",
|
||||
DefaultValue: "",
|
||||
Required: true,
|
||||
})
|
||||
}
|
||||
return tpl, tv
|
||||
}
|
||||
|
||||
// Create templates.
|
||||
tplA, tvA := createTemplate("tpl-already-failed", false)
|
||||
tplB, tvB := createTemplate("tpl-will-400", true)
|
||||
tplC, tvC := createTemplate("tpl-multi-create", true)
|
||||
tplD, tvD := createTemplate("tpl-will-500", false)
|
||||
tplE, tvE := createTemplate("tpl-control", false)
|
||||
|
||||
// Create presets.
|
||||
presetA := dbgen.Preset(t, db, database.InsertPresetParams{
|
||||
TemplateVersionID: tvA.ID,
|
||||
Name: "preset-already-failed",
|
||||
DesiredInstances: sql.NullInt32{Int32: 2, Valid: true},
|
||||
})
|
||||
// Mark preset A as validation_failed from the start.
|
||||
err := db.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
|
||||
PresetID: presetA.ID,
|
||||
Status: database.PrebuildStatusValidationFailed,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
presetB := dbgen.Preset(t, db, database.InsertPresetParams{
|
||||
TemplateVersionID: tvB.ID,
|
||||
Name: "preset-will-400",
|
||||
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
|
||||
})
|
||||
presetC := dbgen.Preset(t, db, database.InsertPresetParams{
|
||||
TemplateVersionID: tvC.ID,
|
||||
Name: "preset-multi-create",
|
||||
DesiredInstances: sql.NullInt32{Int32: 3, Valid: true},
|
||||
})
|
||||
presetD := dbgen.Preset(t, db, database.InsertPresetParams{
|
||||
TemplateVersionID: tvD.ID,
|
||||
Name: "preset-will-500",
|
||||
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
|
||||
})
|
||||
presetE := dbgen.Preset(t, db, database.InsertPresetParams{
|
||||
TemplateVersionID: tvE.ID,
|
||||
Name: "preset-control",
|
||||
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
|
||||
})
|
||||
|
||||
// Wrap DB to inject 500 error for template D's version.
|
||||
wrappedDB := prebuildStoreWrapper{
|
||||
Store: db,
|
||||
errorOnTemplateVersionID: tvD.ID,
|
||||
}
|
||||
|
||||
controller := prebuilds.NewStoreReconciler(
|
||||
wrappedDB, pubSub, cache, cfg, logger,
|
||||
clock,
|
||||
registry,
|
||||
newNoopEnqueuer(),
|
||||
newNoopUsageCheckerPtr(),
|
||||
noop.NewTracerProvider(),
|
||||
10,
|
||||
nil,
|
||||
)
|
||||
|
||||
// First reconcile: marks B, C as validation_failed.
|
||||
_, err = controller.ReconcileAll(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Second reconcile: updates metrics with newly-failed presets
|
||||
// (metrics are updated based on snapshot taken at the START of ReconcileAll).
|
||||
_, err = controller.ReconcileAll(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify preset states.
|
||||
verifyPreset := func(presetID uuid.UUID, expectedStatus database.PrebuildStatus, templateID uuid.UUID, expectWorkspaces int) {
|
||||
preset, err := db.GetPresetByID(ctx, presetID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, expectedStatus, preset.PrebuildStatus,
|
||||
"preset %s should have status %s", preset.Name, expectedStatus)
|
||||
|
||||
workspaces, err := db.GetWorkspacesByTemplateID(ctx, templateID)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, workspaces, expectWorkspaces,
|
||||
"template %s should have %d workspaces", templateID, expectWorkspaces)
|
||||
}
|
||||
|
||||
// Preset A: already validation_failed, stays that way, no workspaces.
|
||||
verifyPreset(presetA.ID, database.PrebuildStatusValidationFailed, tplA.ID, 0)
|
||||
// Preset B: healthy -> validation_failed due to 400 (missing required param).
|
||||
verifyPreset(presetB.ID, database.PrebuildStatusValidationFailed, tplB.ID, 0)
|
||||
// Preset C: healthy -> validation_failed due to 400 (missing required param), even with 3 desired instances.
|
||||
verifyPreset(presetC.ID, database.PrebuildStatusValidationFailed, tplC.ID, 0)
|
||||
// Preset D: stays healthy because 500 error does not mark as validation_failed.
|
||||
verifyPreset(presetD.ID, database.PrebuildStatusHealthy, tplD.ID, 0)
|
||||
// Preset E: stays healthy (control)
|
||||
verifyPreset(presetE.ID, database.PrebuildStatusHealthy, tplE.ID, 1)
|
||||
|
||||
// Verify metrics: A, B, C should have validation_failed metric set to 1.
|
||||
require.NoError(t, controller.ForceMetricsUpdate(ctx))
|
||||
mf, err := registry.Gather()
|
||||
require.NoError(t, err)
|
||||
|
||||
// Helper to check metric value.
|
||||
checkMetric := func(templateName, presetName string, expectSet bool) {
|
||||
metric := findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{
|
||||
"template_name": templateName,
|
||||
"preset_name": presetName,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
if expectSet {
|
||||
require.NotNil(t, metric, "metric should be set for preset %s", presetName)
|
||||
require.NotNil(t, metric.GetGauge())
|
||||
require.EqualValues(t, 1, metric.GetGauge().GetValue(),
|
||||
"metric value should be 1 for preset %s", presetName)
|
||||
} else {
|
||||
require.Nil(t, metric, "metric should not be set for preset %s", presetName)
|
||||
}
|
||||
}
|
||||
|
||||
checkMetric(tplA.Name, presetA.Name, true)
|
||||
checkMetric(tplB.Name, presetB.Name, true)
|
||||
checkMetric(tplC.Name, presetC.Name, true)
|
||||
checkMetric(tplD.Name, presetD.Name, false)
|
||||
checkMetric(tplE.Name, presetE.Name, false)
|
||||
}
|
||||
|
||||
// TestValidationFailedPresetResets verifies that when a preset is marked as
|
||||
// validation_failed and a new template version is promoted, the new preset
|
||||
// starts healthy and the validation_failed metric is cleared.
|
||||
func TestValidationFailedPresetResets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
clock := quartz.NewMock(t)
|
||||
ctx := testutil.Context(t, testutil.WaitMedium)
|
||||
cfg := codersdk.PrebuildsConfig{}
|
||||
logger := slogtest.Make(
|
||||
t, &slogtest.Options{IgnoreErrors: true},
|
||||
).Leveled(slog.LevelDebug)
|
||||
db, pubSub := dbtestutil.NewDB(t)
|
||||
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
|
||||
registry := prometheus.NewRegistry()
|
||||
|
||||
ownerID := uuid.New()
|
||||
dbgen.User(t, db, database.User{
|
||||
ID: ownerID,
|
||||
})
|
||||
org := dbgen.Organization(t, db, database.Organization{})
|
||||
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{
|
||||
OrganizationID: org.ID,
|
||||
UserID: ownerID,
|
||||
})
|
||||
|
||||
// Create a template with a required param that will cause validation failure.
|
||||
tpl := dbgen.Template(t, db, database.Template{
|
||||
OrganizationID: org.ID,
|
||||
CreatedBy: ownerID,
|
||||
Name: "tpl-version-reset",
|
||||
})
|
||||
|
||||
job1 := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{
|
||||
OrganizationID: org.ID,
|
||||
CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true},
|
||||
InitiatorID: ownerID,
|
||||
})
|
||||
tv1 := dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
||||
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
|
||||
OrganizationID: org.ID,
|
||||
JobID: job1.ID,
|
||||
CreatedBy: ownerID,
|
||||
})
|
||||
require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{
|
||||
ID: tpl.ID,
|
||||
ActiveVersionID: tv1.ID,
|
||||
}))
|
||||
|
||||
// Add a required param with no default, this triggers validation failure.
|
||||
dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{
|
||||
TemplateVersionID: tv1.ID,
|
||||
Name: "required_param",
|
||||
Description: "required param to trigger validation failure",
|
||||
Type: "bool",
|
||||
DefaultValue: "",
|
||||
Required: true,
|
||||
})
|
||||
|
||||
preset1 := dbgen.Preset(t, db, database.InsertPresetParams{
|
||||
TemplateVersionID: tv1.ID,
|
||||
Name: "preset-test",
|
||||
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
|
||||
})
|
||||
|
||||
reconciler := prebuilds.NewStoreReconciler(
|
||||
db, pubSub, cache, cfg, logger,
|
||||
clock,
|
||||
registry,
|
||||
newNoopEnqueuer(),
|
||||
newNoopUsageCheckerPtr(),
|
||||
noop.NewTracerProvider(),
|
||||
10,
|
||||
nil,
|
||||
)
|
||||
|
||||
// First reconcile: preset gets marked as validation_failed.
|
||||
_, err := reconciler.ReconcileAll(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify preset is marked as validation_failed in the database.
|
||||
updatedPreset, err := db.GetPresetByID(ctx, preset1.ID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, database.PrebuildStatusValidationFailed, updatedPreset.PrebuildStatus)
|
||||
|
||||
// Second reconcile: metrics snapshot picks up the validation_failed status.
|
||||
_, err = reconciler.ReconcileAll(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify metric is set.
|
||||
require.NoError(t, reconciler.ForceMetricsUpdate(ctx))
|
||||
mf, err := registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric := findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{
|
||||
"template_name": tpl.Name,
|
||||
"preset_name": preset1.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.NotNil(t, metric)
|
||||
require.EqualValues(t, 1, metric.GetGauge().GetValue())
|
||||
|
||||
// Promote a new template version without the problematic param.
|
||||
job2 := dbgen.ProvisionerJob(t, db, pubSub, database.ProvisionerJob{
|
||||
OrganizationID: org.ID,
|
||||
CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true},
|
||||
InitiatorID: ownerID,
|
||||
})
|
||||
tv2 := dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
||||
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
|
||||
OrganizationID: org.ID,
|
||||
JobID: job2.ID,
|
||||
CreatedBy: ownerID,
|
||||
})
|
||||
require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{
|
||||
ID: tpl.ID,
|
||||
ActiveVersionID: tv2.ID,
|
||||
}))
|
||||
|
||||
// Create a preset on the new version.
|
||||
preset2 := dbgen.Preset(t, db, database.InsertPresetParams{
|
||||
TemplateVersionID: tv2.ID,
|
||||
Name: "preset-test", // same name, new version
|
||||
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
|
||||
})
|
||||
|
||||
// Reconcile with the new version active.
|
||||
_, err = reconciler.ReconcileAll(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Old preset stays validation_failed (it's now inactive, won't be reset).
|
||||
oldPreset, err := db.GetPresetByID(ctx, preset1.ID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, database.PrebuildStatusValidationFailed, oldPreset.PrebuildStatus)
|
||||
|
||||
// New preset is healthy.
|
||||
newPreset, err := db.GetPresetByID(ctx, preset2.ID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, database.PrebuildStatusHealthy, newPreset.PrebuildStatus)
|
||||
|
||||
// Metric should be cleared: the old preset is inactive, so it's no longer reported.
|
||||
require.NoError(t, reconciler.ForceMetricsUpdate(ctx))
|
||||
mf, err = registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric = findMetric(mf, prebuilds.MetricPresetValidationFailedGauge, map[string]string{
|
||||
"template_name": tpl.Name,
|
||||
"preset_name": preset1.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.Nil(t, metric)
|
||||
|
||||
// New preset should have a workspace created.
|
||||
workspaces, err := db.GetWorkspacesByTemplateID(ctx, tpl.ID)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, workspaces, 1)
|
||||
}
|
||||
|
||||
func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -1172,9 +1553,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
|
||||
mf, err := registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.Nil(t, metric)
|
||||
|
||||
@@ -1212,9 +1593,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
|
||||
mf, err = registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.NotNil(t, metric)
|
||||
require.NotNil(t, metric.GetGauge())
|
||||
@@ -1263,9 +1644,9 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
|
||||
mf, err = registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.Nil(t, metric)
|
||||
})
|
||||
@@ -1668,9 +2049,9 @@ func TestTrackResourceReplacement(t *testing.T) {
|
||||
mf, err := registry.Gather()
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, findMetric(mf, prebuilds.MetricResourceReplacementsCount, map[string]string{
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"organization_name": org.Name,
|
||||
}))
|
||||
|
||||
// When: a claim occurred and resource replacements are detected (_how_ is out of scope of this test).
|
||||
@@ -1711,9 +2092,9 @@ func TestTrackResourceReplacement(t *testing.T) {
|
||||
mf, err = registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric := findMetric(mf, prebuilds.MetricResourceReplacementsCount, map[string]string{
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
"template_name": template.Name,
|
||||
"preset_name": preset.Name,
|
||||
"organization_name": org.Name,
|
||||
})
|
||||
require.NotNil(t, metric)
|
||||
require.NotNil(t, metric.GetCounter())
|
||||
|
||||
@@ -265,6 +265,9 @@ coderd_prebuilt_workspaces_metrics_last_updated 0
|
||||
# HELP coderd_prebuilt_workspaces_preset_hard_limited Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.
|
||||
# TYPE coderd_prebuilt_workspaces_preset_hard_limited gauge
|
||||
coderd_prebuilt_workspaces_preset_hard_limited{template_name="",preset_name="",organization_name=""} 0
|
||||
# HELP coderd_prebuilt_workspaces_preset_validation_failed Indicates whether a given preset has validation failures (1 = validation failed). Metric is omitted otherwise.
|
||||
# TYPE coderd_prebuilt_workspaces_preset_validation_failed gauge
|
||||
coderd_prebuilt_workspaces_preset_validation_failed{template_name="",preset_name="",organization_name=""} 0
|
||||
# HELP coderd_prebuilt_workspaces_reconciliation_paused Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused).
|
||||
# TYPE coderd_prebuilt_workspaces_reconciliation_paused gauge
|
||||
coderd_prebuilt_workspaces_reconciliation_paused 0
|
||||
|
||||
Reference in New Issue
Block a user