mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
fix: reimplement reporting of preset-hard-limited metric (#18055)
Addresses concerns raised in https://github.com/coder/coder/pull/18045
This commit is contained in:
committed by
GitHub
parent
6a2f22abf7
commit
b330c0803c
@@ -12,11 +12,11 @@ import (
|
||||
|
||||
// GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates.
|
||||
type GlobalSnapshot struct {
|
||||
Presets []database.GetTemplatePresetsWithPrebuildsRow
|
||||
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
|
||||
PrebuildsInProgress []database.CountInProgressPrebuildsRow
|
||||
Backoffs []database.GetPresetsBackoffRow
|
||||
HardLimitedPresets []database.GetPresetsAtFailureLimitRow
|
||||
Presets []database.GetTemplatePresetsWithPrebuildsRow
|
||||
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
|
||||
PrebuildsInProgress []database.CountInProgressPrebuildsRow
|
||||
Backoffs []database.GetPresetsBackoffRow
|
||||
HardLimitedPresetsMap map[uuid.UUID]database.GetPresetsAtFailureLimitRow
|
||||
}
|
||||
|
||||
func NewGlobalSnapshot(
|
||||
@@ -26,12 +26,17 @@ func NewGlobalSnapshot(
|
||||
backoffs []database.GetPresetsBackoffRow,
|
||||
hardLimitedPresets []database.GetPresetsAtFailureLimitRow,
|
||||
) GlobalSnapshot {
|
||||
hardLimitedPresetsMap := make(map[uuid.UUID]database.GetPresetsAtFailureLimitRow, len(hardLimitedPresets))
|
||||
for _, preset := range hardLimitedPresets {
|
||||
hardLimitedPresetsMap[preset.PresetID] = preset
|
||||
}
|
||||
|
||||
return GlobalSnapshot{
|
||||
Presets: presets,
|
||||
RunningPrebuilds: runningPrebuilds,
|
||||
PrebuildsInProgress: prebuildsInProgress,
|
||||
Backoffs: backoffs,
|
||||
HardLimitedPresets: hardLimitedPresets,
|
||||
Presets: presets,
|
||||
RunningPrebuilds: runningPrebuilds,
|
||||
PrebuildsInProgress: prebuildsInProgress,
|
||||
Backoffs: backoffs,
|
||||
HardLimitedPresetsMap: hardLimitedPresetsMap,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,9 +71,7 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
|
||||
backoffPtr = &backoff
|
||||
}
|
||||
|
||||
_, isHardLimited := slice.Find(s.HardLimitedPresets, func(row database.GetPresetsAtFailureLimitRow) bool {
|
||||
return row.PresetID == preset.ID
|
||||
})
|
||||
_, isHardLimited := s.HardLimitedPresetsMap[preset.ID]
|
||||
|
||||
return &PresetSnapshot{
|
||||
Preset: preset,
|
||||
@@ -80,6 +83,12 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s GlobalSnapshot) IsHardLimited(presetID uuid.UUID) bool {
|
||||
_, isHardLimited := s.HardLimitedPresetsMap[presetID]
|
||||
|
||||
return isHardLimited
|
||||
}
|
||||
|
||||
// filterExpiredWorkspaces splits running workspaces into expired and non-expired
|
||||
// based on the preset's TTL.
|
||||
// If TTL is missing or zero, all workspaces are considered non-expired.
|
||||
|
||||
@@ -280,16 +280,9 @@ func (k hardLimitedPresetKey) String() string {
|
||||
return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
|
||||
}
|
||||
|
||||
// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
|
||||
func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
|
||||
func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[hardLimitedPresetKey]bool) {
|
||||
mc.isPresetHardLimitedMu.Lock()
|
||||
defer mc.isPresetHardLimitedMu.Unlock()
|
||||
|
||||
key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}
|
||||
|
||||
if isHardLimited {
|
||||
mc.isPresetHardLimited[key] = true
|
||||
} else {
|
||||
delete(mc.isPresetHardLimited, key)
|
||||
}
|
||||
mc.isPresetHardLimited = isPresetHardLimited
|
||||
}
|
||||
|
||||
@@ -256,6 +256,9 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
|
||||
if err != nil {
|
||||
return xerrors.Errorf("determine current snapshot: %w", err)
|
||||
}
|
||||
|
||||
c.reportHardLimitedPresets(snapshot)
|
||||
|
||||
if len(snapshot.Presets) == 0 {
|
||||
logger.Debug(ctx, "no templates found with prebuilds configured")
|
||||
return nil
|
||||
@@ -296,6 +299,49 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSnapshot) {
|
||||
// presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets.
|
||||
// Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
|
||||
// because templates can have multiple versions — or deleted templates can share the same name.
|
||||
presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow)
|
||||
for _, preset := range snapshot.Presets {
|
||||
key := hardLimitedPresetKey{
|
||||
orgName: preset.OrganizationName,
|
||||
templateName: preset.TemplateName,
|
||||
presetName: preset.Name,
|
||||
}
|
||||
|
||||
presetsMap[key] = append(presetsMap[key], preset)
|
||||
}
|
||||
|
||||
// Report a preset as hard-limited only if all the following conditions are met:
|
||||
// - The preset is marked as hard-limited
|
||||
// - The preset is using the active version of its template, and the template has not been deleted
|
||||
//
|
||||
// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
|
||||
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
|
||||
// as hard-limited to the admin.
|
||||
//
|
||||
// This approach accounts for all relevant scenarios:
|
||||
// Scenario #1: The admin created a new template version with the same preset names.
|
||||
// Scenario #2: The admin created a new template version and renamed the presets.
|
||||
// Scenario #3: The admin deleted a template version that contained hard-limited presets.
|
||||
//
|
||||
// In all of these cases, only the latest and non-deleted presets will be reported.
|
||||
// All other presets will be ignored and eventually removed from Prometheus.
|
||||
isPresetHardLimited := make(map[hardLimitedPresetKey]bool)
|
||||
for key, presets := range presetsMap {
|
||||
for _, preset := range presets {
|
||||
if preset.UsingActiveVersion && !preset.Deleted && snapshot.IsHardLimited(preset.ID) {
|
||||
isPresetHardLimited[key] = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.metrics.registerHardLimitedPresets(isPresetHardLimited)
|
||||
}
|
||||
|
||||
// SnapshotState captures the current state of all prebuilds across templates.
|
||||
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -361,24 +407,6 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
|
||||
slog.F("preset_name", ps.Preset.Name),
|
||||
)
|
||||
|
||||
// Report a metric only if the preset uses the latest version of the template and the template is not deleted.
|
||||
// This avoids conflicts between metrics from old and new template versions.
|
||||
//
|
||||
// NOTE: Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
|
||||
// because templates can have multiple versions — or deleted templates can share the same name.
|
||||
//
|
||||
// The safest approach is to report the metric only for the latest version of the preset.
|
||||
// When a new template version is released, the metric for the new preset should overwrite
|
||||
// the old value in Prometheus.
|
||||
//
|
||||
// However, there’s one edge case: if an admin creates a template, it becomes hard-limited,
|
||||
// then deletes the template and never creates another with the same name,
|
||||
// the old preset will continue to be reported as hard-limited —
|
||||
// even though it’s deleted. This will persist until `coderd` is restarted.
|
||||
if ps.Preset.UsingActiveVersion && !ps.Preset.Deleted {
|
||||
c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
|
||||
}
|
||||
|
||||
// If the preset reached the hard failure limit for the first time during this iteration:
|
||||
// - Mark it as hard-limited in the database
|
||||
// - Send notifications to template admins
|
||||
|
||||
@@ -1034,8 +1034,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
|
||||
require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
|
||||
require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)
|
||||
|
||||
// The metric is still set to 1, even though the preset has become outdated.
|
||||
// This happens because the old value hasn't been overwritten by a newer preset yet.
|
||||
// Metric is deleted after preset became outdated.
|
||||
mf, err = registry.Gather()
|
||||
require.NoError(t, err)
|
||||
metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
|
||||
@@ -1043,9 +1042,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
|
||||
"preset_name": preset.Name,
|
||||
"org_name": org.Name,
|
||||
})
|
||||
require.NotNil(t, metric)
|
||||
require.NotNil(t, metric.GetGauge())
|
||||
require.EqualValues(t, 1, metric.GetGauge().GetValue())
|
||||
require.Nil(t, metric)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user