mirror of
https://github.com/coder/coder.git
synced 2026-06-04 13:38:21 +00:00
5f3be6b288
This PR adds some metrics to help identify job enqueue rates and latencies. This work was initiated as a way to help reduce the cost of the observation/measurement itself for autostart scaletests, which impacts our ability to identify/reason about the load caused by autostart. See: https://github.com/coder/internal/issues/1209 I've extended the metrics here to account for regular user initiated builds, prebuilds, autostarts, etc. IMO there is still the question here of whether we want to include or need the `transition` label, which is only present on workspace builds. Including it does lead to an increase in cardinality, and in the case of the histogram (when not using native histograms) that's at least a few extra series for every bucket. We could remove the transition label there but keep it on the counter. Additionally, the histogram is currently observing latencies for other jobs, such as template builds/version imports, those do not have a transition type associated with them. Tested briefly in a workspace, can see metric values like the following: - `coderd_workspace_builds_enqueued_total{build_reason="autostart",provisioner_type="terraform",status="success",transition="start"} 1` - `coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="autostart",job_type="workspace_build",provisioner_type="terraform",transition="start",le="0.025"} 1` --------- Signed-off-by: Callum Styan <callumstyan@gmail.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1258 lines
46 KiB
Go
1258 lines
46 KiB
Go
package prebuilds
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/hashicorp/go-multierror"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/trace"
|
|
"golang.org/x/sync/errgroup"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/coderd/audit"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
|
|
"github.com/coder/coder/v2/coderd/database/pubsub"
|
|
"github.com/coder/coder/v2/coderd/files"
|
|
"github.com/coder/coder/v2/coderd/notifications"
|
|
"github.com/coder/coder/v2/coderd/prebuilds"
|
|
"github.com/coder/coder/v2/coderd/rbac"
|
|
"github.com/coder/coder/v2/coderd/rbac/policy"
|
|
"github.com/coder/coder/v2/coderd/tracing"
|
|
"github.com/coder/coder/v2/coderd/wsbuilder"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
type StoreReconciler struct {
|
|
store database.Store
|
|
cfg codersdk.PrebuildsConfig
|
|
pubsub pubsub.Pubsub
|
|
fileCache *files.Cache
|
|
logger slog.Logger
|
|
clock quartz.Clock
|
|
registerer prometheus.Registerer
|
|
notifEnq notifications.Enqueuer
|
|
buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker]
|
|
tracer trace.Tracer
|
|
|
|
// mu protects the reconciler's lifecycle state.
|
|
mu sync.Mutex
|
|
running bool
|
|
stopped bool
|
|
cancelFn context.CancelCauseFunc
|
|
|
|
done chan struct{}
|
|
provisionNotifyCh chan database.ProvisionerJob
|
|
|
|
reconciliationConcurrency int
|
|
|
|
// Prebuild state metrics
|
|
metrics *MetricsCollector
|
|
// Operational metrics
|
|
reconciliationDuration prometheus.Histogram
|
|
workspaceBuilderMetrics *wsbuilder.Metrics
|
|
}
|
|
|
|
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
|
|
|
|
// DeprovisionMode enumerates the ways a prebuilt workspace can be deprovisioned.
type DeprovisionMode int

const (
	// DeprovisionModeNormal is the zero value; it renders as "normal".
	DeprovisionModeNormal DeprovisionMode = iota
	// DeprovisionModeOrphan renders as "orphan".
	DeprovisionModeOrphan
)

// String returns the lowercase name of the mode, or "unknown" for any value
// that is not one of the declared constants.
func (d DeprovisionMode) String() string {
	if d == DeprovisionModeNormal {
		return "normal"
	}
	if d == DeprovisionModeOrphan {
		return "orphan"
	}
	return "unknown"
}
|
|
|
|
func NewStoreReconciler(store database.Store,
|
|
ps pubsub.Pubsub,
|
|
fileCache *files.Cache,
|
|
cfg codersdk.PrebuildsConfig,
|
|
logger slog.Logger,
|
|
clock quartz.Clock,
|
|
registerer prometheus.Registerer,
|
|
notifEnq notifications.Enqueuer,
|
|
buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker],
|
|
tracerProvider trace.TracerProvider,
|
|
maxDBConnections int,
|
|
workspaceBuilderMetrics *wsbuilder.Metrics,
|
|
) *StoreReconciler {
|
|
reconciliationConcurrency := calculateReconciliationConcurrency(maxDBConnections)
|
|
|
|
logger.Debug(context.Background(), "reconciler initialized",
|
|
slog.F("reconciliation_concurrency", reconciliationConcurrency),
|
|
slog.F("max_db_connections", maxDBConnections))
|
|
|
|
reconciler := &StoreReconciler{
|
|
store: store,
|
|
pubsub: ps,
|
|
fileCache: fileCache,
|
|
logger: logger,
|
|
cfg: cfg,
|
|
clock: clock,
|
|
registerer: registerer,
|
|
notifEnq: notifEnq,
|
|
buildUsageChecker: buildUsageChecker,
|
|
tracer: tracerProvider.Tracer(tracing.TracerName),
|
|
done: make(chan struct{}, 1),
|
|
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
|
|
reconciliationConcurrency: reconciliationConcurrency,
|
|
workspaceBuilderMetrics: workspaceBuilderMetrics,
|
|
}
|
|
|
|
if registerer != nil {
|
|
reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
|
|
if err := registerer.Register(reconciler.metrics); err != nil {
|
|
// If the registerer fails to register the metrics collector, it's not fatal.
|
|
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
|
|
}
|
|
|
|
factory := promauto.With(registerer)
|
|
reconciler.reconciliationDuration = factory.NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "prebuilds",
|
|
Name: "reconciliation_duration_seconds",
|
|
Help: "Duration of each prebuilds reconciliation cycle.",
|
|
Buckets: prometheus.DefBuckets,
|
|
})
|
|
}
|
|
|
|
return reconciler
|
|
}
|
|
|
|
// calculateReconciliationConcurrency returns how many presets may be
// reconciled concurrently. Every preset reconciliation can issue several
// database operations (creates/deletes), so the value is kept well below the
// connection pool size to avoid exhausting it while still allowing some
// parallelism.
//
// The result is half of maxDBConnections, clamped to the range [1, 5]. Zero or
// negative pool sizes yield 1.
// TODO(ssncferreira): If this becomes a bottleneck, consider adding a configuration option.
func calculateReconciliationConcurrency(maxDBConnections int) int {
	const (
		lowerBound = 1
		upperBound = 5
	)

	switch half := maxDBConnections / 2; {
	case half < lowerBound:
		// Also covers maxDBConnections <= 0, for which half is never positive.
		return lowerBound
	case half > upperBound:
		return upperBound
	default:
		return half
	}
}
|
|
|
|
func (c *StoreReconciler) Run(ctx context.Context) {
|
|
reconciliationInterval := c.cfg.ReconciliationInterval.Value()
|
|
if reconciliationInterval <= 0 { // avoids a panic
|
|
reconciliationInterval = 5 * time.Minute
|
|
}
|
|
|
|
c.logger.Info(ctx, "starting reconciler",
|
|
slog.F("interval", reconciliationInterval),
|
|
slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()),
|
|
slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String()),
|
|
slog.F("preset_concurrency", c.reconciliationConcurrency))
|
|
|
|
// Create a child context that will be canceled when:
|
|
// 1. The parent context is canceled, OR
|
|
// 2. c.cancelFn() is called to trigger shutdown
|
|
// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.
|
|
ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx))
|
|
|
|
// If the reconciler was already stopped, exit early and release the context.
|
|
// Otherwise, mark it as running and store the cancel function for shutdown.
|
|
c.mu.Lock()
|
|
if c.stopped || c.running {
|
|
c.mu.Unlock()
|
|
cancel(nil)
|
|
return
|
|
}
|
|
c.running = true
|
|
c.cancelFn = cancel
|
|
c.mu.Unlock()
|
|
|
|
ticker := c.clock.NewTicker(reconciliationInterval)
|
|
defer ticker.Stop()
|
|
// Wait for all background goroutines to exit before signaling completion.
|
|
var wg sync.WaitGroup
|
|
defer func() {
|
|
wg.Wait()
|
|
c.done <- struct{}{}
|
|
}()
|
|
|
|
// Start updating metrics in the background.
|
|
if c.metrics != nil {
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout)
|
|
}()
|
|
}
|
|
|
|
// Publish provisioning jobs outside of database transactions.
|
|
// A connection is held while a database transaction is active; PGPubsub also tries to acquire a new connection on
|
|
// Publish, so we can exhaust available connections.
|
|
//
|
|
// A single worker dequeues from the channel, which should be sufficient.
|
|
// If any messages are missed due to congestion or errors, provisionerdserver has a backup polling mechanism which
|
|
// will periodically pick up any queued jobs (see poll(time.Duration) in coderd/provisionerdserver/acquirer.go).
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case job := <-c.provisionNotifyCh:
|
|
err := provisionerjobs.PostJob(c.pubsub, job)
|
|
if err != nil {
|
|
c.logger.Error(ctx, "failed to post provisioner job to pubsub", slog.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
|
|
for {
|
|
select {
|
|
// TODO: implement pubsub listener to allow reconciling a specific template imperatively once it has been changed,
|
|
// instead of waiting for the next reconciliation interval
|
|
case <-ticker.C:
|
|
// Trigger a new iteration on each tick.
|
|
stats, err := c.ReconcileAll(ctx)
|
|
if err != nil {
|
|
c.logger.Error(context.Background(), "reconciliation failed", slog.Error(err))
|
|
}
|
|
|
|
if c.reconciliationDuration != nil {
|
|
c.reconciliationDuration.Observe(stats.Elapsed.Seconds())
|
|
}
|
|
c.logger.Info(ctx, "reconciliation stats",
|
|
slog.F("elapsed", stats.Elapsed),
|
|
slog.F("presets_total", stats.PresetsTotal),
|
|
slog.F("presets_reconciled", stats.PresetsReconciled),
|
|
)
|
|
case <-ctx.Done():
|
|
// nolint:gocritic // it's okay to use slog.F() for an error in this case
|
|
// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
|
|
c.logger.Warn(
|
|
context.Background(),
|
|
"reconciliation loop exited",
|
|
slog.Error(ctx.Err()),
|
|
slog.F("cause", context.Cause(ctx)),
|
|
)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stop triggers reconciler shutdown and waits for it to complete.
|
|
// The ctx parameter provides a timeout, if cleanup doesn't finish within
|
|
// this timeout, Stop() logs an error and returns.
|
|
func (c *StoreReconciler) Stop(ctx context.Context, cause error) {
|
|
if cause != nil {
|
|
c.logger.Info(context.Background(), "stopping reconciler", slog.F("cause", cause.Error()))
|
|
} else {
|
|
c.logger.Info(context.Background(), "stopping reconciler")
|
|
}
|
|
|
|
// Mark the reconciler as stopped. If it was already stopped, return early.
|
|
// If the reconciler is running, we'll proceed to shut it down.
|
|
//
|
|
// NOTE: we need to *prospectively* mark this as stopped to prevent the
|
|
// reconciler from being stopped multiple times and causing problems.
|
|
c.mu.Lock()
|
|
if c.stopped {
|
|
c.mu.Unlock()
|
|
return
|
|
}
|
|
c.stopped = true
|
|
running := c.running
|
|
c.mu.Unlock()
|
|
|
|
// Unregister prebuilds state and operational metrics.
|
|
if c.metrics != nil && c.registerer != nil {
|
|
if !c.registerer.Unregister(c.metrics) {
|
|
// The API doesn't allow us to know why the de-registration failed, but it's not very consequential.
|
|
// The only time this would be an issue is if the premium license is removed, leading to the feature being
|
|
// disabled (and consequently this Stop method being called), and then adding a new license which enables the
|
|
// feature again. If the metrics cannot be registered, it'll log an error from NewStoreReconciler.
|
|
c.logger.Warn(context.Background(), "failed to unregister metrics collector")
|
|
}
|
|
if c.reconciliationDuration != nil {
|
|
if !c.registerer.Unregister(c.reconciliationDuration) {
|
|
c.logger.Warn(context.Background(), "failed to unregister reconciliation duration histogram")
|
|
}
|
|
}
|
|
}
|
|
|
|
// If the reconciler is not running, there's nothing else to do.
|
|
if !running {
|
|
return
|
|
}
|
|
|
|
// Trigger reconciler shutdown by canceling its internal context.
|
|
if c.cancelFn != nil {
|
|
c.cancelFn(cause)
|
|
}
|
|
|
|
// Wait for the reconciler to signal that it has fully exited and cleaned up.
|
|
select {
|
|
// Timeout: reconciler didn't finish cleanup within the timeout period.
|
|
case <-ctx.Done():
|
|
// nolint:gocritic // it's okay to use slog.F() for an error in this case
|
|
// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
|
|
c.logger.Error(
|
|
context.Background(),
|
|
"reconciler stop exited prematurely",
|
|
slog.Error(ctx.Err()),
|
|
slog.F("cause", context.Cause(ctx)),
|
|
)
|
|
// Happy path: reconciler has successfully exited.
|
|
case <-c.done:
|
|
c.logger.Info(context.Background(), "reconciler stopped")
|
|
}
|
|
}
|
|
|
|
// ReconcileAll attempts to reconcile the desired vs actual state of all prebuilds for each
|
|
// (organization, template, template version, preset) tuple.
|
|
//
|
|
// The result is a set of provisioning actions for each preset. These actions are fire-and-forget:
|
|
// the reconciliation loop does not wait for prebuilt workspaces to complete provisioning.
|
|
//
|
|
// An outer read-only transaction holds an advisory lock ensuring only one replica reconciles at a time.
|
|
// This transaction remains open throughout the entire reconciliation cycle. Goroutines responsible for
|
|
// preset reconciliation use separate, independent write transactions (via c.store). In the rare case
|
|
// of the lock transaction failing mid-reconciliation, goroutines may continue while another replica
|
|
// acquires the lock, potentially causing temporary under/over-provisioning. Since the reconciliation
|
|
// loop is eventually consistent, subsequent cycles will converge to the desired state.
|
|
//
|
|
// NOTE: Read operations must use db (the lock transaction) while write operations must use c.store.
|
|
func (c *StoreReconciler) ReconcileAll(ctx context.Context) (stats prebuilds.ReconcileStats, err error) {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.ReconcileAll")
|
|
defer span.End()
|
|
|
|
start := c.clock.Now()
|
|
defer func() {
|
|
stats.Elapsed = c.clock.Since(start)
|
|
}()
|
|
|
|
logger := c.logger.With(slog.F("reconcile_context", "all"))
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
logger.Warn(context.Background(), "reconcile exiting prematurely; context done", slog.Error(ctx.Err()))
|
|
return stats, nil
|
|
default:
|
|
}
|
|
|
|
logger.Debug(ctx, "starting reconciliation")
|
|
|
|
err = c.WithReconciliationLock(ctx, logger, func(ctx context.Context, db database.Store) error {
|
|
// Check if prebuilds reconciliation is paused
|
|
// Use db (lock tx) for read-only operations
|
|
settingsJSON, err := db.GetPrebuildsSettings(ctx)
|
|
if err != nil {
|
|
return xerrors.Errorf("get prebuilds settings: %w", err)
|
|
}
|
|
|
|
var settings codersdk.PrebuildsSettings
|
|
if len(settingsJSON) > 0 {
|
|
if err := json.Unmarshal([]byte(settingsJSON), &settings); err != nil {
|
|
return xerrors.Errorf("unmarshal prebuilds settings: %w", err)
|
|
}
|
|
}
|
|
|
|
if c.metrics != nil {
|
|
c.metrics.setReconciliationPaused(settings.ReconciliationPaused)
|
|
}
|
|
|
|
if settings.ReconciliationPaused {
|
|
logger.Info(ctx, "prebuilds reconciliation is paused, skipping reconciliation")
|
|
return nil
|
|
}
|
|
|
|
// MembershipReconciler performs write operations, therefore it needs to use c.store
|
|
// directly, since the lock transaction db is read-only.
|
|
membershipReconciler := NewStoreMembershipReconciler(c.store, c.clock, logger)
|
|
err = membershipReconciler.ReconcileAll(ctx, database.PrebuildsSystemUserID, PrebuiltWorkspacesGroupName)
|
|
if err != nil {
|
|
return xerrors.Errorf("reconcile prebuild membership: %w", err)
|
|
}
|
|
|
|
// Use db (lock tx) for read-only operations
|
|
snapshot, err := c.SnapshotState(ctx, db)
|
|
if err != nil {
|
|
return xerrors.Errorf("determine current snapshot: %w", err)
|
|
}
|
|
|
|
c.reportHardLimitedPresets(snapshot)
|
|
|
|
if len(snapshot.Presets) == 0 {
|
|
logger.Debug(ctx, "no templates found with prebuilds configured")
|
|
return nil
|
|
}
|
|
|
|
var eg errgroup.Group
|
|
// Limit concurrency to avoid exhausting the coderd database connection pool.
|
|
eg.SetLimit(c.reconciliationConcurrency)
|
|
|
|
presetsReconciled := 0
|
|
|
|
// Reconcile presets in parallel. Each preset in its own goroutine.
|
|
for _, preset := range snapshot.Presets {
|
|
ps, err := snapshot.FilterByPreset(preset.ID)
|
|
if err != nil {
|
|
logger.Warn(ctx, "failed to find preset snapshot", slog.Error(err), slog.F("preset_id", preset.ID.String()))
|
|
continue
|
|
}
|
|
|
|
// Performance optimization: Skip presets that won't need any database operations.
|
|
// This avoids holding a slot in the errgroup limiter, reserving capacity for
|
|
// presets that actually need database connections.
|
|
if ps.CanSkipReconciliation() {
|
|
continue
|
|
}
|
|
|
|
presetsReconciled++
|
|
|
|
eg.Go(func() error {
|
|
// Pass outer context.
|
|
err = c.ReconcilePreset(ctx, *ps)
|
|
if err != nil {
|
|
logger.Error(
|
|
ctx,
|
|
"failed to reconcile prebuilds for preset",
|
|
slog.Error(err),
|
|
slog.F("preset_id", preset.ID),
|
|
)
|
|
}
|
|
// DO NOT return error otherwise the tx will end.
|
|
return nil
|
|
})
|
|
}
|
|
|
|
stats.PresetsTotal = len(snapshot.Presets)
|
|
stats.PresetsReconciled = presetsReconciled
|
|
|
|
// Release lock only when all preset reconciliation goroutines are finished.
|
|
return eg.Wait()
|
|
})
|
|
if err != nil {
|
|
logger.Error(ctx, "failed to reconcile", slog.Error(err))
|
|
}
|
|
|
|
return stats, err
|
|
}
|
|
|
|
func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSnapshot) {
|
|
// presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets.
|
|
// Multiple versions of a preset can exist with the same orgName, templateName, and presetName,
|
|
// because templates can have multiple versions — or deleted templates can share the same name.
|
|
presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow)
|
|
for _, preset := range snapshot.Presets {
|
|
key := hardLimitedPresetKey{
|
|
orgName: preset.OrganizationName,
|
|
templateName: preset.TemplateName,
|
|
presetName: preset.Name,
|
|
}
|
|
|
|
presetsMap[key] = append(presetsMap[key], preset)
|
|
}
|
|
|
|
// Report a preset as hard-limited only if all the following conditions are met:
|
|
// - The preset is marked as hard-limited
|
|
// - The preset is using the active version of its template, and the template has not been deleted
|
|
//
|
|
// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
|
|
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
|
|
// as hard-limited to the admin.
|
|
//
|
|
// This approach accounts for all relevant scenarios:
|
|
// Scenario #1: The admin created a new template version with the same preset names.
|
|
// Scenario #2: The admin created a new template version and renamed the presets.
|
|
// Scenario #3: The admin deleted a template version that contained hard-limited presets.
|
|
//
|
|
// In all of these cases, only the latest and non-deleted presets will be reported.
|
|
// All other presets will be ignored and eventually removed from Prometheus.
|
|
isPresetHardLimited := make(map[hardLimitedPresetKey]bool)
|
|
for key, presets := range presetsMap {
|
|
for _, preset := range presets {
|
|
if preset.UsingActiveVersion && !preset.Deleted && snapshot.IsHardLimited(preset.ID) {
|
|
isPresetHardLimited[key] = true
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
c.metrics.registerHardLimitedPresets(isPresetHardLimited)
|
|
}
|
|
|
|
// SnapshotState captures the current state of all prebuilds across templates.
|
|
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.SnapshotState")
|
|
defer span.End()
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var state prebuilds.GlobalSnapshot
|
|
|
|
// If called with a store that is already in a transaction,
|
|
// InTx will reuse that transaction rather than creating a new one.
|
|
err := store.InTx(func(db database.Store) error {
|
|
// TODO: implement template-specific reconciliations later
|
|
presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, uuid.NullUUID{})
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to get template presets with prebuilds: %w", err)
|
|
}
|
|
if len(presetsWithPrebuilds) == 0 {
|
|
return nil
|
|
}
|
|
|
|
presetPrebuildSchedules, err := db.GetActivePresetPrebuildSchedules(ctx)
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to get preset prebuild schedules: %w", err)
|
|
}
|
|
|
|
// Get results from both original and optimized queries for comparison
|
|
allRunningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx)
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to get running prebuilds: %w", err)
|
|
}
|
|
|
|
allPrebuildsInProgress, err := db.CountInProgressPrebuilds(ctx)
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to get prebuilds in progress: %w", err)
|
|
}
|
|
|
|
allPendingPrebuilds, err := db.CountPendingNonActivePrebuilds(ctx)
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to get pending prebuilds: %w", err)
|
|
}
|
|
|
|
presetsBackoff, err := db.GetPresetsBackoff(ctx, c.clock.Now().Add(-c.cfg.ReconciliationBackoffLookback.Value()))
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to get backoffs for presets: %w", err)
|
|
}
|
|
|
|
hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, c.cfg.FailureHardLimit.Value())
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to get hard limited presets: %w", err)
|
|
}
|
|
|
|
state = prebuilds.NewGlobalSnapshot(
|
|
presetsWithPrebuilds,
|
|
presetPrebuildSchedules,
|
|
allRunningPrebuilds,
|
|
allPrebuildsInProgress,
|
|
allPendingPrebuilds,
|
|
presetsBackoff,
|
|
hardLimitedPresets,
|
|
c.clock,
|
|
c.logger,
|
|
)
|
|
return nil
|
|
}, &database.TxOptions{
|
|
Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs
|
|
ReadOnly: true,
|
|
TxIdentifier: "prebuilds.SnapshotState",
|
|
})
|
|
|
|
return &state, err
|
|
}
|
|
|
|
func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.PresetSnapshot) error {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.ReconcilePreset", trace.WithAttributes(
|
|
attribute.String("preset_id", ps.Preset.ID.String()),
|
|
attribute.String("preset_name", ps.Preset.Name),
|
|
attribute.String("template_id", ps.Preset.TemplateID.String()),
|
|
attribute.String("template_name", ps.Preset.TemplateName),
|
|
))
|
|
defer span.End()
|
|
|
|
logger := c.logger.With(
|
|
slog.F("template_id", ps.Preset.TemplateID.String()),
|
|
slog.F("template_name", ps.Preset.TemplateName),
|
|
slog.F("template_version_id", ps.Preset.TemplateVersionID),
|
|
slog.F("template_version_name", ps.Preset.TemplateVersionName),
|
|
slog.F("preset_id", ps.Preset.ID),
|
|
slog.F("preset_name", ps.Preset.Name),
|
|
)
|
|
|
|
// If the preset reached the hard failure limit for the first time during this iteration:
|
|
// - Mark it as hard-limited in the database
|
|
// - Continue execution, we disallow only creation operation for hard-limited presets. Deletion is allowed.
|
|
if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited {
|
|
logger.Warn(ctx, "preset is hard limited, notifying template admins")
|
|
|
|
err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
|
|
Status: database.PrebuildStatusHardLimited,
|
|
PresetID: ps.Preset.ID,
|
|
})
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to update preset prebuild status: %w", err)
|
|
}
|
|
}
|
|
|
|
state := ps.CalculateState()
|
|
actions, err := c.CalculateActions(ctx, ps)
|
|
if err != nil {
|
|
logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err))
|
|
return err
|
|
}
|
|
|
|
fields := []slog.Field{
|
|
slog.F("desired", state.Desired), slog.F("actual", state.Actual),
|
|
slog.F("extraneous", state.Extraneous), slog.F("starting", state.Starting),
|
|
slog.F("stopping", state.Stopping), slog.F("deleting", state.Deleting),
|
|
slog.F("eligible", state.Eligible),
|
|
}
|
|
|
|
levelFn := logger.Debug
|
|
levelFn(ctx, "calculated reconciliation state for preset", fields...)
|
|
|
|
var multiErr multierror.Error
|
|
for _, action := range actions {
|
|
err = c.executeReconciliationAction(ctx, logger, ps, action)
|
|
if err != nil {
|
|
logger.Error(ctx, "failed to execute action", slog.F("type", action.ActionType), slog.Error(err))
|
|
multiErr.Errors = append(multiErr.Errors, err)
|
|
}
|
|
}
|
|
return multiErr.ErrorOrNil()
|
|
}
|
|
|
|
func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) ([]*prebuilds.ReconciliationActions, error) {
|
|
if ctx.Err() != nil {
|
|
return nil, ctx.Err()
|
|
}
|
|
|
|
return snapshot.CalculateActions(c.cfg.ReconciliationBackoffInterval.Value())
|
|
}
|
|
|
|
func (c *StoreReconciler) WithReconciliationLock(
|
|
ctx context.Context,
|
|
logger slog.Logger,
|
|
fn func(ctx context.Context, db database.Store) error,
|
|
) error {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.WithReconciliationLock")
|
|
defer span.End()
|
|
|
|
// This tx holds a global lock, which prevents any other coderd replica from starting a reconciliation and
|
|
// possibly getting an inconsistent view of the state.
|
|
//
|
|
// The lock MUST be held until ALL modifications have been effected.
|
|
//
|
|
// It is run with RepeatableRead isolation, so it's effectively snapshotting the data at the start of the tx.
|
|
//
|
|
// This is a read-only tx, so returning an error (i.e. causing a rollback) has no impact.
|
|
return c.store.InTx(func(db database.Store) error {
|
|
start := c.clock.Now()
|
|
|
|
// Try to acquire the lock. If we can't get it, another replica is handling reconciliation.
|
|
acquired, err := db.TryAcquireLock(ctx, database.LockIDReconcilePrebuilds)
|
|
if err != nil {
|
|
// This is a real database error, not just lock contention
|
|
logger.Error(ctx, "failed to acquire reconciliation lock due to database error", slog.Error(err))
|
|
return err
|
|
}
|
|
if !acquired {
|
|
// Normal case: another replica has the lock
|
|
span.SetAttributes(attribute.Bool("lock_acquired", false))
|
|
return nil
|
|
}
|
|
span.SetAttributes(attribute.Bool("lock_acquired", true))
|
|
|
|
logger.Debug(ctx,
|
|
"acquired top-level reconciliation lock",
|
|
slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", c.clock.Since(start).Seconds())),
|
|
)
|
|
|
|
return fn(ctx, db)
|
|
}, &database.TxOptions{
|
|
Isolation: sql.LevelRepeatableRead,
|
|
ReadOnly: true,
|
|
TxIdentifier: "prebuilds.WithReconciliationLock",
|
|
})
|
|
}
|
|
|
|
// executeReconciliationAction executes a reconciliation action on the given preset snapshot.
|
|
//
|
|
// The action can be of different types (create, delete, backoff), and may internally include
|
|
// multiple items to process, for example, a delete action can contain multiple prebuild IDs to delete,
|
|
// and a create action includes a count of prebuilds to create.
|
|
//
|
|
// This method handles logging at appropriate levels and performs the necessary operations
|
|
// according to the action type. It returns an error if any part of the action fails.
|
|
func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logger slog.Logger, ps prebuilds.PresetSnapshot, action *prebuilds.ReconciliationActions) error {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.executeReconciliationAction", trace.WithAttributes(
|
|
attribute.Int("action_type", int(action.ActionType)),
|
|
attribute.Int("create_count", int(action.Create)),
|
|
attribute.Int("delete_count", len(action.DeleteIDs)),
|
|
))
|
|
defer span.End()
|
|
|
|
levelFn := logger.Debug
|
|
|
|
// Nothing has to be done.
|
|
if !ps.Preset.UsingActiveVersion && action.IsNoop() {
|
|
logger.Debug(ctx, "skipping reconciliation for preset - nothing has to be done",
|
|
slog.F("template_id", ps.Preset.TemplateID.String()), slog.F("template_name", ps.Preset.TemplateName),
|
|
slog.F("template_version_id", ps.Preset.TemplateVersionID.String()), slog.F("template_version_name", ps.Preset.TemplateVersionName),
|
|
slog.F("preset_id", ps.Preset.ID.String()), slog.F("preset_name", ps.Preset.Name))
|
|
return nil
|
|
}
|
|
|
|
// nolint:gocritic // ReconcilePreset needs Prebuilds Orchestrator permissions.
|
|
prebuildsCtx := dbauthz.AsPrebuildsOrchestrator(ctx)
|
|
|
|
fields := []slog.Field{
|
|
slog.F("action_type", action.ActionType), slog.F("create_count", action.Create),
|
|
slog.F("delete_count", len(action.DeleteIDs)), slog.F("to_delete", action.DeleteIDs),
|
|
}
|
|
levelFn(ctx, "calculated reconciliation action for preset", fields...)
|
|
|
|
switch {
|
|
case action.ActionType == prebuilds.ActionTypeBackoff:
|
|
levelFn = logger.Warn
|
|
// Log at info level when there's a change to be effected.
|
|
case action.ActionType == prebuilds.ActionTypeCreate && action.Create > 0:
|
|
levelFn = logger.Info
|
|
case action.ActionType == prebuilds.ActionTypeDelete && len(action.DeleteIDs) > 0:
|
|
levelFn = logger.Info
|
|
case action.ActionType == prebuilds.ActionTypeCancelPending:
|
|
levelFn = logger.Info
|
|
}
|
|
|
|
switch action.ActionType {
|
|
case prebuilds.ActionTypeBackoff:
|
|
// If there is anything to backoff for (usually a cycle of failed prebuilds), then log and bail out.
|
|
levelFn(ctx, "template prebuild state retrieved, backing off",
|
|
append(fields,
|
|
slog.F("backoff_until", action.BackoffUntil.Format(time.RFC3339)),
|
|
slog.F("backoff_secs", math.Round(action.BackoffUntil.Sub(c.clock.Now()).Seconds())),
|
|
)...)
|
|
|
|
return nil
|
|
|
|
case prebuilds.ActionTypeCreate:
|
|
// Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
|
|
// See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
|
|
// This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
|
|
desired := ps.CalculateDesiredInstances(c.clock.Now())
|
|
|
|
if action.Create > desired {
|
|
logger.Critical(ctx, "determined excessive count of prebuilds to create; clamping to desired count",
|
|
slog.F("create_count", action.Create), slog.F("desired_count", desired))
|
|
|
|
action.Create = desired
|
|
}
|
|
|
|
// If preset is hard-limited, and it's a create operation, log it and exit early.
|
|
// Creation operation is disallowed for hard-limited preset.
|
|
if ps.IsHardLimited && action.Create > 0 {
|
|
logger.Warn(ctx, "skipping hard limited preset for create operation")
|
|
return nil
|
|
}
|
|
|
|
var multiErr multierror.Error
|
|
for range action.Create {
|
|
if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
|
|
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
|
|
multiErr.Errors = append(multiErr.Errors, err)
|
|
}
|
|
}
|
|
|
|
return multiErr.ErrorOrNil()
|
|
|
|
case prebuilds.ActionTypeDelete:
|
|
var multiErr multierror.Error
|
|
for _, id := range action.DeleteIDs {
|
|
if err := c.deletePrebuiltWorkspace(prebuildsCtx, id, ps.Preset.TemplateID, ps.Preset.ID); err != nil {
|
|
logger.Error(ctx, "failed to delete prebuild", slog.Error(err))
|
|
multiErr.Errors = append(multiErr.Errors, err)
|
|
}
|
|
}
|
|
|
|
return multiErr.ErrorOrNil()
|
|
|
|
case prebuilds.ActionTypeCancelPending:
|
|
return c.cancelAndOrphanDeletePendingPrebuilds(ctx, ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID)
|
|
|
|
default:
|
|
return xerrors.Errorf("unknown action type: %v", action.ActionType)
|
|
}
|
|
}
|
|
|
|
func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.createPrebuiltWorkspace", trace.WithAttributes(
|
|
attribute.String("prebuild_id", prebuiltWorkspaceID.String()),
|
|
attribute.String("template_id", templateID.String()),
|
|
attribute.String("preset_id", presetID.String()),
|
|
))
|
|
defer span.End()
|
|
|
|
name, err := prebuilds.GenerateName()
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to generate unique prebuild ID: %w", err)
|
|
}
|
|
|
|
var provisionerJob *database.ProvisionerJob
|
|
err = c.store.InTx(func(db database.Store) error {
|
|
template, err := db.GetTemplateByID(ctx, templateID)
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to get template: %w", err)
|
|
}
|
|
|
|
now := c.clock.Now()
|
|
|
|
minimumWorkspace, err := db.InsertWorkspace(ctx, database.InsertWorkspaceParams{
|
|
ID: prebuiltWorkspaceID,
|
|
CreatedAt: now,
|
|
UpdatedAt: now,
|
|
OwnerID: database.PrebuildsSystemUserID,
|
|
OrganizationID: template.OrganizationID,
|
|
TemplateID: template.ID,
|
|
Name: name,
|
|
LastUsedAt: c.clock.Now(),
|
|
AutomaticUpdates: database.AutomaticUpdatesNever,
|
|
AutostartSchedule: sql.NullString{},
|
|
Ttl: sql.NullInt64{},
|
|
NextStartAt: sql.NullTime{},
|
|
})
|
|
if err != nil {
|
|
return xerrors.Errorf("insert workspace: %w", err)
|
|
}
|
|
|
|
// We have to refetch the workspace for the joined in fields.
|
|
workspace, err := db.GetWorkspaceByID(ctx, minimumWorkspace.ID)
|
|
if err != nil {
|
|
return xerrors.Errorf("get workspace by ID: %w", err)
|
|
}
|
|
|
|
c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name),
|
|
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
|
|
|
|
provisionerJob, err = c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace, DeprovisionModeNormal)
|
|
return err
|
|
}, &database.TxOptions{
|
|
Isolation: sql.LevelRepeatableRead,
|
|
ReadOnly: false,
|
|
TxIdentifier: "prebuilds.createPrebuiltWorkspace",
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Publish provisioner job event to notify the acquirer that a new job was posted
|
|
c.publishProvisionerJob(ctx, provisionerJob, prebuiltWorkspaceID)
|
|
|
|
return nil
|
|
}
|
|
|
|
// provisionDelete provisions a delete transition for a prebuilt workspace.
|
|
//
|
|
// If mode is DeprovisionModeOrphan, the builder will not send Terraform state to the provisioner.
|
|
// This allows the workspace to be deleted even when no provisioners are available, and is safe
|
|
// when no Terraform resources were actually created (e.g., for pending prebuilds that were canceled
|
|
// before provisioning started).
|
|
//
|
|
// IMPORTANT: This function must be called within a database transaction. It does not create its own transaction.
|
|
// The caller is responsible for managing the transaction boundary via db.InTx().
|
|
func (c *StoreReconciler) provisionDelete(ctx context.Context, db database.Store, workspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID, mode DeprovisionMode) (*database.ProvisionerJob, error) {
|
|
workspace, err := db.GetWorkspaceByID(ctx, workspaceID)
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("get workspace by ID: %w", err)
|
|
}
|
|
|
|
template, err := db.GetTemplateByID(ctx, templateID)
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("failed to get template: %w", err)
|
|
}
|
|
|
|
if workspace.OwnerID != database.PrebuildsSystemUserID {
|
|
return nil, xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed")
|
|
}
|
|
|
|
c.logger.Info(ctx, "attempting to delete prebuild", slog.F("orphan", mode.String()),
|
|
slog.F("name", workspace.Name), slog.F("workspace_id", workspaceID.String()), slog.F("preset_id", presetID.String()))
|
|
|
|
return c.provision(ctx, db, workspaceID, template, presetID, database.WorkspaceTransitionDelete, workspace, mode)
|
|
}
|
|
|
|
// cancelAndOrphanDeletePendingPrebuilds cancels pending prebuild jobs from inactive template versions
|
|
// and orphan-deletes their associated workspaces.
|
|
//
|
|
// The cancel operation uses a criteria-based update to ensure only jobs that are still pending at
|
|
// execution time are canceled, avoiding race conditions where jobs may have transitioned to running.
|
|
//
|
|
// Since these jobs were never processed by a provisioner, no Terraform resources were created,
|
|
// making it safe to orphan-delete the workspaces (skipping Terraform destroy).
|
|
func (c *StoreReconciler) cancelAndOrphanDeletePendingPrebuilds(ctx context.Context, templateID uuid.UUID, templateVersionID uuid.UUID, presetID uuid.UUID) error {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.cancelAndOrphanDeletePendingPrebuilds", trace.WithAttributes(
|
|
attribute.String("template_id", templateID.String()),
|
|
attribute.String("template_version_id", templateVersionID.String()),
|
|
attribute.String("preset_id", presetID.String()),
|
|
))
|
|
defer span.End()
|
|
|
|
var canceledProvisionerJob *database.ProvisionerJob
|
|
var canceledWorkspaceID uuid.UUID
|
|
err := c.store.InTx(func(db database.Store) error {
|
|
canceledJobs, err := db.UpdatePrebuildProvisionerJobWithCancel(
|
|
ctx,
|
|
database.UpdatePrebuildProvisionerJobWithCancelParams{
|
|
Now: c.clock.Now(),
|
|
PresetID: uuid.NullUUID{
|
|
UUID: presetID,
|
|
Valid: true,
|
|
},
|
|
})
|
|
if err != nil {
|
|
c.logger.Error(ctx, "failed to cancel pending prebuild jobs",
|
|
slog.F("template_id", templateID.String()),
|
|
slog.F("template_version_id", templateVersionID.String()),
|
|
slog.F("preset_id", presetID.String()),
|
|
slog.Error(err))
|
|
return err
|
|
}
|
|
|
|
if len(canceledJobs) > 0 {
|
|
c.logger.Info(ctx, "canceled pending prebuild jobs for inactive version",
|
|
slog.F("template_id", templateID.String()),
|
|
slog.F("template_version_id", templateVersionID.String()),
|
|
slog.F("preset_id", presetID.String()),
|
|
slog.F("count", len(canceledJobs)))
|
|
}
|
|
|
|
var multiErr multierror.Error
|
|
for _, job := range canceledJobs {
|
|
provisionerJob, err := c.provisionDelete(ctx, db, job.WorkspaceID, job.TemplateID, presetID, DeprovisionModeOrphan)
|
|
if err != nil {
|
|
c.logger.Error(ctx, "failed to orphan delete canceled prebuild",
|
|
slog.F("workspace_id", job.WorkspaceID.String()), slog.Error(err))
|
|
multiErr.Errors = append(multiErr.Errors, err)
|
|
} else if canceledProvisionerJob == nil {
|
|
canceledProvisionerJob = provisionerJob
|
|
canceledWorkspaceID = job.WorkspaceID
|
|
}
|
|
}
|
|
|
|
return multiErr.ErrorOrNil()
|
|
}, &database.TxOptions{
|
|
Isolation: sql.LevelRepeatableRead,
|
|
ReadOnly: false,
|
|
TxIdentifier: "prebuilds.cancelAndOrphanDeletePendingPrebuilds",
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Job event notifications contain organization, provisioner type, and tags.
|
|
// Since all canceled jobs have the same values, we only send one notification
|
|
// for the first successfully canceled job, which is sufficient to trigger the
|
|
// provisioner chain that processes all remaining jobs.
|
|
if canceledProvisionerJob != nil {
|
|
c.publishProvisionerJob(ctx, canceledProvisionerJob, canceledWorkspaceID)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.deletePrebuiltWorkspace", trace.WithAttributes(
|
|
attribute.String("prebuild_id", prebuiltWorkspaceID.String()),
|
|
attribute.String("template_id", templateID.String()),
|
|
attribute.String("preset_id", presetID.String()),
|
|
))
|
|
defer span.End()
|
|
|
|
var provisionerJob *database.ProvisionerJob
|
|
err := c.store.InTx(func(db database.Store) (err error) {
|
|
provisionerJob, err = c.provisionDelete(ctx, db, prebuiltWorkspaceID, templateID, presetID, DeprovisionModeNormal)
|
|
return err
|
|
}, &database.TxOptions{
|
|
Isolation: sql.LevelRepeatableRead,
|
|
ReadOnly: false,
|
|
TxIdentifier: "prebuilds.deletePrebuiltWorkspace",
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Publish provisioner job event to notify the acquirer that a new job was posted
|
|
c.publishProvisionerJob(ctx, provisionerJob, prebuiltWorkspaceID)
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *StoreReconciler) provision(
|
|
ctx context.Context,
|
|
db database.Store,
|
|
prebuildID uuid.UUID,
|
|
template database.Template,
|
|
presetID uuid.UUID,
|
|
transition database.WorkspaceTransition,
|
|
workspace database.Workspace,
|
|
mode DeprovisionMode,
|
|
) (*database.ProvisionerJob, error) {
|
|
ctx, span := c.tracer.Start(ctx, "prebuilds.provision", trace.WithAttributes(
|
|
attribute.String("prebuild_id", prebuildID.String()),
|
|
attribute.String("template_id", template.ID.String()),
|
|
attribute.String("preset_id", presetID.String()),
|
|
attribute.String("transition", string(transition)),
|
|
attribute.String("workspace_id", workspace.ID.String()),
|
|
attribute.String("mode", mode.String()),
|
|
))
|
|
defer span.End()
|
|
|
|
tvp, err := db.GetPresetParametersByTemplateVersionID(ctx, template.ActiveVersionID)
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("fetch preset details: %w", err)
|
|
}
|
|
|
|
var params []codersdk.WorkspaceBuildParameter
|
|
for _, param := range tvp {
|
|
// TODO: don't fetch in the first place.
|
|
if param.TemplateVersionPresetID != presetID {
|
|
continue
|
|
}
|
|
|
|
params = append(params, codersdk.WorkspaceBuildParameter{
|
|
Name: param.Name,
|
|
Value: param.Value,
|
|
})
|
|
}
|
|
|
|
builder := wsbuilder.New(workspace, transition, *c.buildUsageChecker.Load()).
|
|
Reason(database.BuildReasonInitiator).
|
|
Initiator(database.PrebuildsSystemUserID).
|
|
MarkPrebuild().
|
|
BuildMetrics(c.workspaceBuilderMetrics)
|
|
|
|
if transition != database.WorkspaceTransitionDelete {
|
|
// We don't specify the version for a delete transition,
|
|
// because the prebuilt workspace may have been created using an older template version.
|
|
// If the version isn't explicitly set, the builder will automatically use the version
|
|
// from the last workspace build — which is the desired behavior.
|
|
builder = builder.VersionID(template.ActiveVersionID)
|
|
|
|
// We only inject the required params when the prebuild is being created.
|
|
// This mirrors the behavior of regular workspace deletion (see cli/delete.go).
|
|
builder = builder.TemplateVersionPresetID(presetID)
|
|
builder = builder.RichParameterValues(params)
|
|
}
|
|
|
|
// Use orphan mode for deletes when no Terraform resources exist
|
|
if transition == database.WorkspaceTransitionDelete && mode == DeprovisionModeOrphan {
|
|
builder = builder.Orphan()
|
|
}
|
|
|
|
// Strip trace context - provisionerd is a separate service and should
|
|
// start its own trace rather than continuing the prebuilds trace.
|
|
buildCtx := trace.ContextWithSpan(ctx, tracing.NoopSpan)
|
|
|
|
_, provisionerJob, _, err := builder.Build(
|
|
buildCtx,
|
|
db,
|
|
c.fileCache,
|
|
func(_ policy.Action, _ rbac.Objecter) bool {
|
|
return true // TODO: harden?
|
|
},
|
|
audit.WorkspaceBuildBaggage{},
|
|
)
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("provision workspace: %w", err)
|
|
}
|
|
if provisionerJob == nil {
|
|
// This should not happen, builder.Build() should either return a job or an error.
|
|
// Returning an error to fail fast if we hit this unexpected case.
|
|
return nil, xerrors.Errorf("provision succeeded but returned no job")
|
|
}
|
|
|
|
c.logger.Info(ctx, "prebuild job scheduled", slog.F("transition", transition),
|
|
slog.F("prebuild_id", prebuildID.String()), slog.F("preset_id", presetID.String()),
|
|
slog.F("job_id", provisionerJob.ID))
|
|
|
|
return provisionerJob, nil
|
|
}
|
|
|
|
// publishProvisionerJob publishes a provisioner job event to notify the acquirer that a new job has been created.
|
|
// This must be called after the database transaction that creates the job has committed to ensure
|
|
// the job is visible to provisioners when they query the database.
|
|
func (c *StoreReconciler) publishProvisionerJob(ctx context.Context, provisionerJob *database.ProvisionerJob, workspaceID uuid.UUID) {
|
|
if provisionerJob == nil {
|
|
return
|
|
}
|
|
select {
|
|
case c.provisionNotifyCh <- *provisionerJob:
|
|
default: // channel full, drop the message; provisioner will pick this job up later with its periodic check
|
|
c.logger.Warn(ctx, "provisioner job notification queue full, dropping",
|
|
slog.F("job_id", provisionerJob.ID), slog.F("prebuild_id", workspaceID.String()))
|
|
}
|
|
}
|
|
|
|
// ForceMetricsUpdate forces the metrics collector, if defined, to update its state (we cache the metrics state to
|
|
// reduce load on the database).
|
|
func (c *StoreReconciler) ForceMetricsUpdate(ctx context.Context) error {
|
|
if c.metrics == nil {
|
|
return nil
|
|
}
|
|
|
|
return c.metrics.UpdateState(ctx, time.Second*10)
|
|
}
|
|
|
|
func (c *StoreReconciler) TrackResourceReplacement(ctx context.Context, workspaceID, buildID uuid.UUID, replacements []*sdkproto.ResourceReplacement) {
|
|
// nolint:gocritic // Necessary to query all the required data.
|
|
ctx = dbauthz.AsSystemRestricted(ctx)
|
|
// Since this may be called in a fire-and-forget fashion, we need to give up at some point.
|
|
trackCtx, trackCancel := context.WithTimeout(ctx, time.Minute)
|
|
defer trackCancel()
|
|
|
|
if err := c.trackResourceReplacement(trackCtx, workspaceID, buildID, replacements); err != nil {
|
|
c.logger.Error(ctx, "failed to track resource replacement", slog.Error(err))
|
|
}
|
|
}
|
|
|
|
// nolint:revive // Shut up it's fine.
|
|
func (c *StoreReconciler) trackResourceReplacement(ctx context.Context, workspaceID, buildID uuid.UUID, replacements []*sdkproto.ResourceReplacement) error {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
workspace, err := c.store.GetWorkspaceByID(ctx, workspaceID)
|
|
if err != nil {
|
|
return xerrors.Errorf("fetch workspace %q: %w", workspaceID.String(), err)
|
|
}
|
|
|
|
build, err := c.store.GetWorkspaceBuildByID(ctx, buildID)
|
|
if err != nil {
|
|
return xerrors.Errorf("fetch workspace build %q: %w", buildID.String(), err)
|
|
}
|
|
|
|
// The first build will always be the prebuild.
|
|
prebuild, err := c.store.GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx, database.GetWorkspaceBuildByWorkspaceIDAndBuildNumberParams{
|
|
WorkspaceID: workspaceID, BuildNumber: 1,
|
|
})
|
|
if err != nil {
|
|
return xerrors.Errorf("fetch prebuild: %w", err)
|
|
}
|
|
|
|
// This should not be possible, but defend against it.
|
|
if !prebuild.TemplateVersionPresetID.Valid || prebuild.TemplateVersionPresetID.UUID == uuid.Nil {
|
|
return xerrors.Errorf("no preset used in prebuild for workspace %q", workspaceID.String())
|
|
}
|
|
|
|
prebuildPreset, err := c.store.GetPresetByID(ctx, prebuild.TemplateVersionPresetID.UUID)
|
|
if err != nil {
|
|
return xerrors.Errorf("fetch template preset for template version ID %q: %w", prebuild.TemplateVersionID.String(), err)
|
|
}
|
|
|
|
claimant, err := c.store.GetUserByID(ctx, workspace.OwnerID) // At this point, the workspace is owned by the new owner.
|
|
if err != nil {
|
|
return xerrors.Errorf("fetch claimant %q: %w", workspace.OwnerID.String(), err)
|
|
}
|
|
|
|
// Use the claiming build here (not prebuild) because both should be equivalent, and we might as well spot inconsistencies now.
|
|
templateVersion, err := c.store.GetTemplateVersionByID(ctx, build.TemplateVersionID)
|
|
if err != nil {
|
|
return xerrors.Errorf("fetch template version %q: %w", build.TemplateVersionID.String(), err)
|
|
}
|
|
|
|
org, err := c.store.GetOrganizationByID(ctx, workspace.OrganizationID)
|
|
if err != nil {
|
|
return xerrors.Errorf("fetch org %q: %w", workspace.OrganizationID.String(), err)
|
|
}
|
|
|
|
// Track resource replacement in Prometheus metric.
|
|
if c.metrics != nil {
|
|
c.metrics.trackResourceReplacement(org.Name, workspace.TemplateName, prebuildPreset.Name)
|
|
}
|
|
|
|
// Send notification to template admins.
|
|
if c.notifEnq == nil {
|
|
c.logger.Warn(ctx, "notification enqueuer not set, cannot send resource replacement notification(s)")
|
|
return nil
|
|
}
|
|
|
|
repls := make(map[string]string, len(replacements))
|
|
for _, repl := range replacements {
|
|
repls[repl.GetResource()] = strings.Join(repl.GetPaths(), ", ")
|
|
}
|
|
|
|
templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{
|
|
RbacRole: []string{codersdk.RoleTemplateAdmin},
|
|
})
|
|
if err != nil {
|
|
return xerrors.Errorf("fetch template admins: %w", err)
|
|
}
|
|
|
|
var notifErr error
|
|
for _, templateAdmin := range templateAdmins {
|
|
if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.TemplateWorkspaceResourceReplaced,
|
|
map[string]string{
|
|
"org": org.Name,
|
|
"workspace": workspace.Name,
|
|
"template": workspace.TemplateName,
|
|
"template_version": templateVersion.Name,
|
|
"preset": prebuildPreset.Name,
|
|
"workspace_build_num": fmt.Sprintf("%d", build.BuildNumber),
|
|
"claimant": claimant.Username,
|
|
},
|
|
map[string]any{
|
|
"replacements": repls,
|
|
}, "prebuilds_reconciler",
|
|
// Associate this notification with all the related entities.
|
|
workspace.ID, workspace.OwnerID, workspace.TemplateID, templateVersion.ID, prebuildPreset.ID, workspace.OrganizationID,
|
|
); err != nil {
|
|
notifErr = errors.Join(xerrors.Errorf("send notification to %q: %w", templateAdmin.ID.String(), err))
|
|
continue
|
|
}
|
|
}
|
|
|
|
return notifErr
|
|
}
|
|
|
|
// Settings is the prebuilds configuration document that is serialized to JSON
// and stored via UpsertPrebuildsSettings.
type Settings struct {
	// ReconciliationPaused is encoded under the JSON key "reconciliation_paused".
	ReconciliationPaused bool `json:"reconciliation_paused"`
}
|
|
|
|
func SetPrebuildsReconciliationPaused(ctx context.Context, db database.Store, paused bool) error {
|
|
settings := Settings{
|
|
ReconciliationPaused: paused,
|
|
}
|
|
settingsJSON, err := json.Marshal(settings)
|
|
if err != nil {
|
|
return xerrors.Errorf("marshal settings: %w", err)
|
|
}
|
|
return db.UpsertPrebuildsSettings(ctx, string(settingsJSON))
|
|
}
|