fix: reimplement reporting of preset-hard-limited metric (#18055)

Addresses concerns raised in https://github.com/coder/coder/pull/18045
This commit is contained in:
Yevhenii Shcherbina
2025-05-28 14:18:32 -04:00
committed by GitHub
parent 6a2f22abf7
commit b330c0803c
4 changed files with 72 additions and 45 deletions
+22 -13
View File
@@ -12,11 +12,11 @@ import (
// GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates.
type GlobalSnapshot struct {
Presets []database.GetTemplatePresetsWithPrebuildsRow
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
PrebuildsInProgress []database.CountInProgressPrebuildsRow
Backoffs []database.GetPresetsBackoffRow
HardLimitedPresets []database.GetPresetsAtFailureLimitRow
Presets []database.GetTemplatePresetsWithPrebuildsRow
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
PrebuildsInProgress []database.CountInProgressPrebuildsRow
Backoffs []database.GetPresetsBackoffRow
HardLimitedPresetsMap map[uuid.UUID]database.GetPresetsAtFailureLimitRow
}
func NewGlobalSnapshot(
@@ -26,12 +26,17 @@ func NewGlobalSnapshot(
backoffs []database.GetPresetsBackoffRow,
hardLimitedPresets []database.GetPresetsAtFailureLimitRow,
) GlobalSnapshot {
hardLimitedPresetsMap := make(map[uuid.UUID]database.GetPresetsAtFailureLimitRow, len(hardLimitedPresets))
for _, preset := range hardLimitedPresets {
hardLimitedPresetsMap[preset.PresetID] = preset
}
return GlobalSnapshot{
Presets: presets,
RunningPrebuilds: runningPrebuilds,
PrebuildsInProgress: prebuildsInProgress,
Backoffs: backoffs,
HardLimitedPresets: hardLimitedPresets,
Presets: presets,
RunningPrebuilds: runningPrebuilds,
PrebuildsInProgress: prebuildsInProgress,
Backoffs: backoffs,
HardLimitedPresetsMap: hardLimitedPresetsMap,
}
}
@@ -66,9 +71,7 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
backoffPtr = &backoff
}
_, isHardLimited := slice.Find(s.HardLimitedPresets, func(row database.GetPresetsAtFailureLimitRow) bool {
return row.PresetID == preset.ID
})
_, isHardLimited := s.HardLimitedPresetsMap[preset.ID]
return &PresetSnapshot{
Preset: preset,
@@ -80,6 +83,12 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
}, nil
}
func (s GlobalSnapshot) IsHardLimited(presetID uuid.UUID) bool {
_, isHardLimited := s.HardLimitedPresetsMap[presetID]
return isHardLimited
}
// filterExpiredWorkspaces splits running workspaces into expired and non-expired
// based on the preset's TTL.
// If TTL is missing or zero, all workspaces are considered non-expired.
@@ -280,16 +280,9 @@ func (k hardLimitedPresetKey) String() string {
return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
}
// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[hardLimitedPresetKey]bool) {
mc.isPresetHardLimitedMu.Lock()
defer mc.isPresetHardLimitedMu.Unlock()
key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}
if isHardLimited {
mc.isPresetHardLimited[key] = true
} else {
delete(mc.isPresetHardLimited, key)
}
mc.isPresetHardLimited = isPresetHardLimited
}
+46 -18
View File
@@ -256,6 +256,9 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
if err != nil {
return xerrors.Errorf("determine current snapshot: %w", err)
}
c.reportHardLimitedPresets(snapshot)
if len(snapshot.Presets) == 0 {
logger.Debug(ctx, "no templates found with prebuilds configured")
return nil
@@ -296,6 +299,49 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
return err
}
func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSnapshot) {
// presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets.
// Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
// because templates can have multiple versions — or deleted templates can share the same name.
presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow)
for _, preset := range snapshot.Presets {
key := hardLimitedPresetKey{
orgName: preset.OrganizationName,
templateName: preset.TemplateName,
presetName: preset.Name,
}
presetsMap[key] = append(presetsMap[key], preset)
}
// Report a preset as hard-limited only if all the following conditions are met:
// - The preset is marked as hard-limited
// - The preset is using the active version of its template, and the template has not been deleted
//
// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
// as hard-limited to the admin.
//
// This approach accounts for all relevant scenarios:
// Scenario #1: The admin created a new template version with the same preset names.
// Scenario #2: The admin created a new template version and renamed the presets.
// Scenario #3: The admin deleted a template version that contained hard-limited presets.
//
// In all of these cases, only the latest and non-deleted presets will be reported.
// All other presets will be ignored and eventually removed from Prometheus.
isPresetHardLimited := make(map[hardLimitedPresetKey]bool)
for key, presets := range presetsMap {
for _, preset := range presets {
if preset.UsingActiveVersion && !preset.Deleted && snapshot.IsHardLimited(preset.ID) {
isPresetHardLimited[key] = true
break
}
}
}
c.metrics.registerHardLimitedPresets(isPresetHardLimited)
}
// SnapshotState captures the current state of all prebuilds across templates.
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
if err := ctx.Err(); err != nil {
@@ -361,24 +407,6 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)
// Report a metric only if the preset uses the latest version of the template and the template is not deleted.
// This avoids conflicts between metrics from old and new template versions.
//
// NOTE: Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
// because templates can have multiple versions — or deleted templates can share the same name.
//
// The safest approach is to report the metric only for the latest version of the preset.
// When a new template version is released, the metric for the new preset should overwrite
// the old value in Prometheus.
//
// However, theres one edge case: if an admin creates a template, it becomes hard-limited,
// then deletes the template and never creates another with the same name,
// the old preset will continue to be reported as hard-limited —
// even though its deleted. This will persist until `coderd` is restarted.
if ps.Preset.UsingActiveVersion && !ps.Preset.Deleted {
c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
}
// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
@@ -1034,8 +1034,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)
// The metric is still set to 1, even though the preset has become outdated.
// This happens because the old value hasn't been overwritten by a newer preset yet.
// Metric is deleted after preset became outdated.
mf, err = registry.Gather()
require.NoError(t, err)
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
@@ -1043,9 +1042,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
"preset_name": preset.Name,
"org_name": org.Name,
})
require.NotNil(t, metric)
require.NotNil(t, metric.GetGauge())
require.EqualValues(t, 1, metric.GetGauge().GetValue())
require.Nil(t, metric)
})
}
}