Files
coder/enterprise/coderd/prebuilds/metricscollector.go
T
Spike Curtis bddb808b25 chore: arrange imports in a standard way (#21452)
Fixes all our Go file imports to match the preferred spec that we've _mostly_ been using. For example:

```
import (
	"context"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"golang.org/x/xerrors"
	"gopkg.in/natefinch/lumberjack.v2"

	"cdr.dev/slog/v3"
	"github.com/coder/coder/v2/codersdk/agentsdk"
	"github.com/coder/serpent"
)
```

3 groups: standard library, 3rd party libs, Coder libs.

This PR makes the change across the codebase. The PR in the stack above modifies our formatting to maintain this state of affairs, and is a separate PR so it's possible to review that one in detail.
2026-01-08 15:24:11 +04:00

315 lines
11 KiB
Go

package prebuilds
import (
"context"
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/prebuilds"
)
// Prometheus metric names for prebuilt workspaces. Each name is the shared
// namespace prefix followed by a metric-specific suffix.
const (
	// namespace is the common prefix of every metric name declared below.
	namespace = "coderd_prebuilt_workspaces_"
	// MetricCreatedCount is the name of the metric described by createdPrebuildsDesc.
	MetricCreatedCount = namespace + "created_total"
	// MetricFailedCount is the name of the metric described by failedPrebuildsDesc.
	MetricFailedCount = namespace + "failed_total"
	// MetricClaimedCount is the name of the metric described by claimedPrebuildsDesc.
	MetricClaimedCount = namespace + "claimed_total"
	// MetricResourceReplacementsCount is the name of the metric described by resourceReplacementsDesc.
	MetricResourceReplacementsCount = namespace + "resource_replacements_total"
	// MetricDesiredGauge is the name of the metric described by desiredPrebuildsDesc.
	MetricDesiredGauge = namespace + "desired"
	// MetricRunningGauge is the name of the metric described by runningPrebuildsDesc.
	MetricRunningGauge = namespace + "running"
	// MetricEligibleGauge is the name of the metric described by eligiblePrebuildsDesc.
	MetricEligibleGauge = namespace + "eligible"
	// MetricPresetHardLimitedGauge is the name of the metric described by presetHardLimitedDesc.
	MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
	// MetricLastUpdatedGauge is the name of the metric described by lastUpdateDesc.
	MetricLastUpdatedGauge = namespace + "metrics_last_updated"
	// MetricReconciliationPausedGauge is the name of the metric described by reconciliationPausedDesc.
	MetricReconciliationPausedGauge = namespace + "reconciliation_paused"
)
// newPresetLabeledDesc builds a descriptor whose variable labels are the
// per-preset label set (labels) and which carries no constant labels.
func newPresetLabeledDesc(name, help string) *prometheus.Desc {
	return prometheus.NewDesc(name, help, labels, nil)
}

// Descriptors for every metric this collector exposes.
var (
	// labels is the variable label set attached to all per-preset metrics.
	labels = []string{"template_name", "preset_name", "organization_name"}

	// Per-preset counters.
	createdPrebuildsDesc = newPresetLabeledDesc(
		MetricCreatedCount,
		"Total number of prebuilt workspaces that have been created to meet the desired instance count of each "+
			"template preset.",
	)
	failedPrebuildsDesc = newPresetLabeledDesc(
		MetricFailedCount,
		"Total number of prebuilt workspaces that failed to build.",
	)
	claimedPrebuildsDesc = newPresetLabeledDesc(
		MetricClaimedCount,
		"Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace "+
			"with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user.",
	)
	resourceReplacementsDesc = newPresetLabeledDesc(
		MetricResourceReplacementsCount,
		"Total number of prebuilt workspaces whose resource(s) got replaced upon being claimed. "+
			"In Terraform, drift on immutable attributes results in resource replacement. "+
			"This represents a worst-case scenario for prebuilt workspaces because the pre-provisioned resource "+
			"would have been recreated when claiming, thus obviating the point of pre-provisioning. "+
			"See https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement",
	)

	// Per-preset gauges.
	desiredPrebuildsDesc = newPresetLabeledDesc(
		MetricDesiredGauge,
		"Target number of prebuilt workspaces that should be available for each template preset.",
	)
	runningPrebuildsDesc = newPresetLabeledDesc(
		MetricRunningGauge,
		"Current number of prebuilt workspaces that are in a running state. These workspaces have started "+
			"successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible).",
	)
	eligiblePrebuildsDesc = newPresetLabeledDesc(
		MetricEligibleGauge,
		"Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that "+
			"have completed their build process with their agent reporting 'ready' status.",
	)
	presetHardLimitedDesc = newPresetLabeledDesc(
		MetricPresetHardLimitedGauge,
		"Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.",
	)

	// Collector-wide gauges; these carry no variable labels.
	lastUpdateDesc = prometheus.NewDesc(
		MetricLastUpdatedGauge,
		"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
		[]string{},
		nil,
	)
	reconciliationPausedDesc = prometheus.NewDesc(
		MetricReconciliationPausedGauge,
		"Indicates whether prebuilds reconciliation is currently paused (1 = paused, 0 = not paused).",
		[]string{},
		nil,
	)
)
// Timing constants for refreshing the cached metrics state.
// NOTE(review): neither constant is referenced in the visible part of this file;
// presumably callers pass them to BackgroundFetch — confirm against the call site.
const (
	// metricsUpdateInterval is 60 seconds.
	metricsUpdateInterval = time.Second * 60
	// metricsUpdateTimeout is 10 seconds.
	metricsUpdateTimeout = time.Second * 10
)
// MetricsCollector is a prometheus.Collector for prebuilt workspace metrics.
// Collect reads from state cached by UpdateState plus a few values pushed in
// through the unexported setters on this type.
type MetricsCollector struct {
	// database is queried by UpdateState and handed to the snapshotter.
	database database.Store
	// logger is the collector's named logger.
	logger slog.Logger
	// snapshotter produces the global snapshot stored in the cached state.
	snapshotter prebuilds.StateSnapshotter
	// latestState holds the most recent cached state; it is nil until the first
	// successful UpdateState.
	latestState atomic.Pointer[metricsState]
	// replacementsCounter counts tracked resource replacements per preset key.
	// Guarded by replacementsCounterMu.
	replacementsCounter map[replacementKey]float64
	replacementsCounterMu sync.Mutex
	// isPresetHardLimited records the hard-limited flag per preset key.
	// Guarded by isPresetHardLimitedMu.
	isPresetHardLimited map[hardLimitedPresetKey]bool
	isPresetHardLimitedMu sync.Mutex
	// reconciliationPaused is reported as the reconciliation-paused gauge.
	// Guarded by reconciliationPausedMu.
	reconciliationPaused bool
	reconciliationPausedMu sync.RWMutex
}
// Compile-time check that *MetricsCollector implements prometheus.Collector.
var _ prometheus.Collector = (*MetricsCollector)(nil)
// NewMetricsCollector returns a collector backed by db and snapshotter.
// The supplied logger is scoped with the name "prebuilds_metrics_collector",
// and both per-preset maps start out empty. No metrics state is cached until
// UpdateState succeeds for the first time.
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
	collector := &MetricsCollector{
		database:            db,
		logger:              logger.Named("prebuilds_metrics_collector"),
		snapshotter:         snapshotter,
		replacementsCounter: map[replacementKey]float64{},
		isPresetHardLimited: map[hardLimitedPresetKey]bool{},
	}
	return collector
}
// Describe sends the descriptor of every metric this collector can emit on
// descCh, in a fixed order.
func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
	all := []*prometheus.Desc{
		createdPrebuildsDesc,
		failedPrebuildsDesc,
		claimedPrebuildsDesc,
		resourceReplacementsDesc,
		desiredPrebuildsDesc,
		runningPrebuildsDesc,
		eligiblePrebuildsDesc,
		presetHardLimitedDesc,
		lastUpdateDesc,
		reconciliationPausedDesc,
	}
	for _, desc := range all {
		descCh <- desc
	}
}
// Collect emits the prebuilds metrics on metricsCh using the cached state.
// Nothing is fetched here: this function can be called multiple times per second and retrieving the current
// state is an expensive operation, so the values come from whatever UpdateState stored last.
// When no state has been cached yet, only the reconciliation-paused gauge and a zero last-updated gauge are sent.
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
	// send builds a const metric and pushes it onto the output channel.
	send := func(desc *prometheus.Desc, valueType prometheus.ValueType, value float64, labelValues ...string) {
		metricsCh <- prometheus.MustNewConstMetric(desc, valueType, value, labelValues...)
	}
	// boolGauge converts a flag to the 1/0 value used by the boolean gauges.
	boolGauge := func(set bool) float64 {
		if set {
			return 1
		}
		return 0
	}

	mc.reconciliationPausedMu.RLock()
	paused := mc.reconciliationPaused
	mc.reconciliationPausedMu.RUnlock()
	send(reconciliationPausedDesc, prometheus.GaugeValue, boolGauge(paused))

	// Work from a single loaded pointer; it's fine if a newer state is stored while we are emitting.
	cached := mc.latestState.Load()
	if cached == nil {
		mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set")
		send(lastUpdateDesc, prometheus.GaugeValue, 0)
		return
	}

	// Counters taken from the database rows.
	for _, row := range cached.prebuildMetrics {
		send(createdPrebuildsDesc, prometheus.CounterValue, float64(row.CreatedCount), row.TemplateName, row.PresetName, row.OrganizationName)
		send(failedPrebuildsDesc, prometheus.CounterValue, float64(row.FailedCount), row.TemplateName, row.PresetName, row.OrganizationName)
		send(claimedPrebuildsDesc, prometheus.CounterValue, float64(row.ClaimedCount), row.TemplateName, row.PresetName, row.OrganizationName)
	}

	// Resource replacement counters tracked in memory.
	mc.replacementsCounterMu.Lock()
	for key, count := range mc.replacementsCounter {
		send(resourceReplacementsDesc, prometheus.CounterValue, count, key.templateName, key.presetName, key.orgName)
	}
	mc.replacementsCounterMu.Unlock()

	// Gauges derived from the snapshot; presets that are deleted or not on the active version are skipped.
	for _, preset := range cached.snapshot.Presets {
		if !preset.UsingActiveVersion || preset.Deleted {
			continue
		}

		perPreset, err := cached.snapshot.FilterByPreset(preset.ID)
		if err != nil {
			mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err))
			continue
		}

		counts := perPreset.CalculateState()
		send(desiredPrebuildsDesc, prometheus.GaugeValue, float64(counts.Desired), preset.TemplateName, preset.Name, preset.OrganizationName)
		send(runningPrebuildsDesc, prometheus.GaugeValue, float64(counts.Actual), preset.TemplateName, preset.Name, preset.OrganizationName)
		send(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(counts.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
	}

	// Hard-limited flags registered via registerHardLimitedPresets.
	mc.isPresetHardLimitedMu.Lock()
	for key, limited := range mc.isPresetHardLimited {
		send(presetHardLimitedDesc, prometheus.GaugeValue, boolGauge(limited), key.templateName, key.presetName, key.orgName)
	}
	mc.isPresetHardLimitedMu.Unlock()

	send(lastUpdateDesc, prometheus.GaugeValue, float64(cached.createdAt.Unix()))
}
// metricsState is the cached data that Collect reads; UpdateState builds a new
// value and swaps it in as a whole.
type metricsState struct {
	// prebuildMetrics holds the rows returned by GetPrebuildMetrics.
	prebuildMetrics []database.GetPrebuildMetricsRow
	// snapshot is the global snapshot returned by the snapshotter.
	snapshot *prebuilds.GlobalSnapshot
	// createdAt is when this state was stored; Collect reports it as a unix timestamp.
	createdAt time.Time
}
// BackgroundFetch refreshes the cached metrics state once immediately and then
// once every updateInterval, until ctx is done. Each refresh is bounded by
// updateTimeout. It blocks, so callers typically run it in its own goroutine.
//
// A non-positive updateInterval or updateTimeout is replaced by
// metricsUpdateInterval or metricsUpdateTimeout respectively (with a warning):
// time.Ticker.Reset panics on a non-positive duration, and a non-positive
// timeout would make every fetch context expire before it could be used.
//
// Refresh errors are logged and do not stop the loop.
func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) {
	if updateInterval <= 0 {
		mc.logger.Warn(ctx, "invalid prebuilds metrics update interval; falling back to default",
			slog.F("interval", updateInterval.String()),
			slog.F("default", metricsUpdateInterval.String()))
		updateInterval = metricsUpdateInterval
	}
	if updateTimeout <= 0 {
		mc.logger.Warn(ctx, "invalid prebuilds metrics update timeout; falling back to default",
			slog.F("timeout", updateTimeout.String()),
			slog.F("default", metricsUpdateTimeout.String()))
		updateTimeout = metricsUpdateTimeout
	}

	// Start with a near-zero period so the first tick fires right away.
	tick := time.NewTicker(time.Nanosecond)
	defer tick.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
			// Tick immediately, then set regular interval.
			tick.Reset(updateInterval)

			if err := mc.UpdateState(ctx, updateTimeout); err != nil {
				mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err))
			}
		}
	}
}
// UpdateState fetches the prebuild metric rows and a fresh snapshot, then
// stores them as the state that Collect reads. Both fetches share one context
// derived from ctx and limited by timeout. If either fetch fails, the wrapped
// error is returned and the previously cached state is left in place.
func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error {
	began := time.Now()

	boundedCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	rows, err := mc.database.GetPrebuildMetrics(boundedCtx)
	if err != nil {
		return xerrors.Errorf("fetch prebuild metrics: %w", err)
	}

	snap, err := mc.snapshotter.SnapshotState(boundedCtx, mc.database)
	if err != nil {
		return xerrors.Errorf("snapshot state: %w", err)
	}

	elapsed := fmt.Sprintf("%.2f", time.Since(began).Seconds())
	mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", elapsed))

	next := &metricsState{
		prebuildMetrics: rows,
		snapshot:        snap,
		createdAt:       dbtime.Now(),
	}
	mc.latestState.Store(next)
	return nil
}
// replacementKey identifies a preset by organization, template and preset
// name; it is the map key for the resource replacement counters.
type replacementKey struct {
	orgName      string
	templateName string
	presetName   string
}

// String renders the key as "<org>:<template>:<preset>".
func (k replacementKey) String() string {
	const layout = "%s:%s:%s"
	return fmt.Sprintf(layout, k.orgName, k.templateName, k.presetName)
}
// trackResourceReplacement bumps the replacement counter of the given preset
// by one. It is safe to call while Collect is running, since both take
// replacementsCounterMu.
func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, presetName string) {
	affected := replacementKey{
		orgName:      orgName,
		templateName: templateName,
		presetName:   presetName,
	}

	mc.replacementsCounterMu.Lock()
	defer mc.replacementsCounterMu.Unlock()

	// Each call records _that_ a replacement happened, not how many resources were replaced.
	// A single replacement is enough to ruin a prebuild, and we can't know up front which one would do so:
	// given, say, a docker_container and a null_resource being replaced, either (or neither) might be the
	// culprit, so the occurrence itself is all that gets tracked.
	mc.replacementsCounter[affected]++
}
// hardLimitedPresetKey identifies a preset by organization, template and
// preset name; it is the map key for the hard-limited flags.
type hardLimitedPresetKey struct {
	orgName      string
	templateName string
	presetName   string
}

// String renders the key as "<org>:<template>:<preset>".
func (k hardLimitedPresetKey) String() string {
	const layout = "%s:%s:%s"
	return fmt.Sprintf(layout, k.orgName, k.templateName, k.presetName)
}
// registerHardLimitedPresets replaces the whole set of hard-limited flags
// reported by Collect. The map is stored as-is rather than copied, so the
// caller must not modify it after handing it over.
func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[hardLimitedPresetKey]bool) {
	mc.isPresetHardLimitedMu.Lock()
	mc.isPresetHardLimited = isPresetHardLimited
	mc.isPresetHardLimitedMu.Unlock()
}
// setReconciliationPaused records whether reconciliation is paused; Collect
// reports the flag as the reconciliation-paused gauge (1 when paused, else 0).
func (mc *MetricsCollector) setReconciliationPaused(paused bool) {
	mc.reconciliationPausedMu.Lock()
	mc.reconciliationPaused = paused
	mc.reconciliationPausedMu.Unlock()
}