mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: implement reconciliation loop (#17261)
Closes https://github.com/coder/internal/issues/510 <details> <summary> Refactoring Summary </summary> ### 1) `CalculateActions` Function #### Issues Before Refactoring: - Large function (~150 lines), making it difficult to read and maintain. - The control flow is hard to follow due to complex conditional logic. - The `ReconciliationActions` struct was partially initialized early, then mutated in multiple places, making the flow error-prone. Original source: https://github.com/coder/coder/blob/fe60b569ad754245e28bac71e0ef3c83536631bb/coderd/prebuilds/state.go#L13-L167 #### Improvements After Refactoring: - Simplified and broken down into smaller, focused helper methods. - The flow of the function is now more linear and easier to understand. - Struct initialization is cleaner, avoiding partial and incremental mutations. Refactored function: https://github.com/coder/coder/blob/eeb0407d783cdda71ec2418c113f325542c47b1c/coderd/prebuilds/state.go#L67-L84 --- ### 2) `ReconciliationActions` Struct #### Issues Before Refactoring: - The struct mixed both actionable decisions and diagnostic state, which blurred its purpose. - It was unclear which fields were necessary for reconciliation logic, and which were purely for logging/observability. #### Improvements After Refactoring: - Split into two clear, purpose-specific structs: - **`ReconciliationActions`** — defines the intended reconciliation action. - **`ReconciliationState`** — captures runtime state and metadata, primarily for logging and diagnostics. Original struct: https://github.com/coder/coder/blob/fe60b569ad754245e28bac71e0ef3c83536631bb/coderd/prebuilds/reconcile.go#L29-L41 </details> --------- Signed-off-by: Danny Kopping <dannykopping@gmail.com> Co-authored-by: Sas Swart <sas.swart.cdk@gmail.com> Co-authored-by: Danny Kopping <dannykopping@gmail.com> Co-authored-by: Dean Sheather <dean@deansheather.com> Co-authored-by: Spike Curtis <spike@coder.com> Co-authored-by: Danny Kopping <danny@coder.com>
This commit is contained in:
committed by
GitHub
parent
6a79965948
commit
27bc60d1b9
@@ -12,8 +12,7 @@ const (
|
||||
LockIDDBPurge
|
||||
LockIDNotificationsReportGenerator
|
||||
LockIDCryptoKeyRotation
|
||||
LockIDReconcileTemplatePrebuilds
|
||||
LockIDDeterminePrebuildsState
|
||||
LockIDReconcilePrebuilds
|
||||
)
|
||||
|
||||
// GenLockID generates a unique and consistent lock ID from a given string.
|
||||
|
||||
@@ -64,7 +64,7 @@ type sqlcQuerier interface {
|
||||
CleanTailnetCoordinators(ctx context.Context) error
|
||||
CleanTailnetLostPeers(ctx context.Context) error
|
||||
CleanTailnetTunnels(ctx context.Context) error
|
||||
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition.
|
||||
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
|
||||
// Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state.
|
||||
CountInProgressPrebuilds(ctx context.Context) ([]CountInProgressPrebuildsRow, error)
|
||||
CountUnreadInboxNotificationsByUserID(ctx context.Context, userID uuid.UUID) (int64, error)
|
||||
|
||||
@@ -5938,7 +5938,7 @@ func (q *sqlQuerier) ClaimPrebuiltWorkspace(ctx context.Context, arg ClaimPrebui
|
||||
}
|
||||
|
||||
const countInProgressPrebuilds = `-- name: CountInProgressPrebuilds :many
|
||||
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count
|
||||
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count, wlb.template_version_preset_id as preset_id
|
||||
FROM workspace_latest_builds wlb
|
||||
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
|
||||
-- We only need these counts for active template versions.
|
||||
@@ -5949,7 +5949,7 @@ FROM workspace_latest_builds wlb
|
||||
-- prebuilds that are still building.
|
||||
INNER JOIN templates t ON t.active_version_id = wlb.template_version_id
|
||||
WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status)
|
||||
GROUP BY t.id, wpb.template_version_id, wpb.transition
|
||||
GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id
|
||||
`
|
||||
|
||||
type CountInProgressPrebuildsRow struct {
|
||||
@@ -5957,9 +5957,10 @@ type CountInProgressPrebuildsRow struct {
|
||||
TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"`
|
||||
Transition WorkspaceTransition `db:"transition" json:"transition"`
|
||||
Count int32 `db:"count" json:"count"`
|
||||
PresetID uuid.NullUUID `db:"preset_id" json:"preset_id"`
|
||||
}
|
||||
|
||||
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition.
|
||||
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
|
||||
// Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state.
|
||||
func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInProgressPrebuildsRow, error) {
|
||||
rows, err := q.db.QueryContext(ctx, countInProgressPrebuilds)
|
||||
@@ -5975,6 +5976,7 @@ func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInPro
|
||||
&i.TemplateVersionID,
|
||||
&i.Transition,
|
||||
&i.Count,
|
||||
&i.PresetID,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -57,9 +57,9 @@ WHERE (b.transition = 'start'::workspace_transition
|
||||
AND b.job_status = 'succeeded'::provisioner_job_status);
|
||||
|
||||
-- name: CountInProgressPrebuilds :many
|
||||
-- CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition.
|
||||
-- CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
|
||||
-- Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state.
|
||||
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count
|
||||
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count, wlb.template_version_preset_id as preset_id
|
||||
FROM workspace_latest_builds wlb
|
||||
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
|
||||
-- We only need these counts for active template versions.
|
||||
@@ -70,7 +70,7 @@ FROM workspace_latest_builds wlb
|
||||
-- prebuilds that are still building.
|
||||
INNER JOIN templates t ON t.active_version_id = wlb.template_version_id
|
||||
WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status)
|
||||
GROUP BY t.id, wpb.template_version_id, wpb.transition;
|
||||
GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id;
|
||||
|
||||
-- GetPresetsBackoff groups workspace builds by preset ID.
|
||||
-- Each preset is associated with exactly one template version ID.
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
package prebuilds
|
||||
|
||||
import (
|
||||
"context"
|
||||
)
|
||||
|
||||
// ReconciliationOrchestrator manages the lifecycle of prebuild reconciliation.
|
||||
// It runs a continuous loop to check and reconcile prebuild states, and can be stopped gracefully.
|
||||
type ReconciliationOrchestrator interface {
|
||||
Reconciler
|
||||
|
||||
// RunLoop starts a continuous reconciliation loop that periodically calls ReconcileAll
|
||||
// to ensure all prebuilds are in their desired states. The loop runs until the context
|
||||
// is canceled or Stop is called.
|
||||
RunLoop(ctx context.Context)
|
||||
|
||||
// Stop gracefully shuts down the orchestrator with the given cause.
|
||||
// The cause is used for logging and error reporting.
|
||||
Stop(ctx context.Context, cause error)
|
||||
}
|
||||
|
||||
type Reconciler interface {
|
||||
// ReconcileAll orchestrates the reconciliation of all prebuilds across all templates.
|
||||
// It takes a global snapshot of the system state and then reconciles each preset
|
||||
// in parallel, creating or deleting prebuilds as needed to reach their desired states.
|
||||
ReconcileAll(ctx context.Context) error
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
package prebuilds
|
||||
|
||||
import (
|
||||
"github.com/google/uuid"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/util/slice"
|
||||
)
|
||||
|
||||
// GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates.
|
||||
type GlobalSnapshot struct {
|
||||
Presets []database.GetTemplatePresetsWithPrebuildsRow
|
||||
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
|
||||
PrebuildsInProgress []database.CountInProgressPrebuildsRow
|
||||
Backoffs []database.GetPresetsBackoffRow
|
||||
}
|
||||
|
||||
func NewGlobalSnapshot(
|
||||
presets []database.GetTemplatePresetsWithPrebuildsRow,
|
||||
runningPrebuilds []database.GetRunningPrebuiltWorkspacesRow,
|
||||
prebuildsInProgress []database.CountInProgressPrebuildsRow,
|
||||
backoffs []database.GetPresetsBackoffRow,
|
||||
) GlobalSnapshot {
|
||||
return GlobalSnapshot{
|
||||
Presets: presets,
|
||||
RunningPrebuilds: runningPrebuilds,
|
||||
PrebuildsInProgress: prebuildsInProgress,
|
||||
Backoffs: backoffs,
|
||||
}
|
||||
}
|
||||
|
||||
func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, error) {
|
||||
preset, found := slice.Find(s.Presets, func(preset database.GetTemplatePresetsWithPrebuildsRow) bool {
|
||||
return preset.ID == presetID
|
||||
})
|
||||
if !found {
|
||||
return nil, xerrors.Errorf("no preset found with ID %q", presetID)
|
||||
}
|
||||
|
||||
running := slice.Filter(s.RunningPrebuilds, func(prebuild database.GetRunningPrebuiltWorkspacesRow) bool {
|
||||
if !prebuild.CurrentPresetID.Valid {
|
||||
return false
|
||||
}
|
||||
return prebuild.CurrentPresetID.UUID == preset.ID
|
||||
})
|
||||
|
||||
inProgress := slice.Filter(s.PrebuildsInProgress, func(prebuild database.CountInProgressPrebuildsRow) bool {
|
||||
return prebuild.PresetID.UUID == preset.ID
|
||||
})
|
||||
|
||||
var backoffPtr *database.GetPresetsBackoffRow
|
||||
backoff, found := slice.Find(s.Backoffs, func(row database.GetPresetsBackoffRow) bool {
|
||||
return row.PresetID == preset.ID
|
||||
})
|
||||
if found {
|
||||
backoffPtr = &backoff
|
||||
}
|
||||
|
||||
return &PresetSnapshot{
|
||||
Preset: preset,
|
||||
Running: running,
|
||||
InProgress: inProgress,
|
||||
Backoff: backoffPtr,
|
||||
}, nil
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package prebuilds
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
)
|
||||
|
||||
type NoopReconciler struct{}
|
||||
|
||||
func NewNoopReconciler() *NoopReconciler {
|
||||
return &NoopReconciler{}
|
||||
}
|
||||
|
||||
func (NoopReconciler) RunLoop(context.Context) {}
|
||||
|
||||
func (NoopReconciler) Stop(context.Context, error) {}
|
||||
|
||||
func (NoopReconciler) ReconcileAll(context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (NoopReconciler) SnapshotState(context.Context, database.Store) (*GlobalSnapshot, error) {
|
||||
return &GlobalSnapshot{}, nil
|
||||
}
|
||||
|
||||
func (NoopReconciler) ReconcilePreset(context.Context, PresetSnapshot) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (NoopReconciler) CalculateActions(context.Context, PresetSnapshot) (*ReconciliationActions, error) {
|
||||
return &ReconciliationActions{}, nil
|
||||
}
|
||||
|
||||
var _ ReconciliationOrchestrator = NoopReconciler{}
|
||||
@@ -0,0 +1,254 @@
|
||||
package prebuilds
|
||||
|
||||
import (
|
||||
"slices"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/coder/quartz"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
)
|
||||
|
||||
// ActionType represents the type of action needed to reconcile prebuilds.
|
||||
type ActionType int
|
||||
|
||||
const (
|
||||
// ActionTypeUndefined represents an uninitialized or invalid action type.
|
||||
ActionTypeUndefined ActionType = iota
|
||||
|
||||
// ActionTypeCreate indicates that new prebuilds should be created.
|
||||
ActionTypeCreate
|
||||
|
||||
// ActionTypeDelete indicates that existing prebuilds should be deleted.
|
||||
ActionTypeDelete
|
||||
|
||||
// ActionTypeBackoff indicates that prebuild creation should be delayed.
|
||||
ActionTypeBackoff
|
||||
)
|
||||
|
||||
// PresetSnapshot is a filtered view of GlobalSnapshot focused on a single preset.
|
||||
// It contains the raw data needed to calculate the current state of a preset's prebuilds,
|
||||
// including running prebuilds, in-progress builds, and backoff information.
|
||||
type PresetSnapshot struct {
|
||||
Preset database.GetTemplatePresetsWithPrebuildsRow
|
||||
Running []database.GetRunningPrebuiltWorkspacesRow
|
||||
InProgress []database.CountInProgressPrebuildsRow
|
||||
Backoff *database.GetPresetsBackoffRow
|
||||
}
|
||||
|
||||
// ReconciliationState represents the processed state of a preset's prebuilds,
|
||||
// calculated from a PresetSnapshot. While PresetSnapshot contains raw data,
|
||||
// ReconciliationState contains derived metrics that are directly used to
|
||||
// determine what actions are needed (create, delete, or backoff).
|
||||
// For example, it calculates how many prebuilds are eligible, how many are
|
||||
// extraneous, and how many are in various transition states.
|
||||
type ReconciliationState struct {
|
||||
Actual int32 // Number of currently running prebuilds
|
||||
Desired int32 // Number of prebuilds desired as defined in the preset
|
||||
Eligible int32 // Number of prebuilds that are ready to be claimed
|
||||
Extraneous int32 // Number of extra running prebuilds beyond the desired count
|
||||
|
||||
// Counts of prebuilds in various transition states
|
||||
Starting int32
|
||||
Stopping int32
|
||||
Deleting int32
|
||||
}
|
||||
|
||||
// ReconciliationActions represents actions needed to reconcile the current state with the desired state.
|
||||
// Based on ActionType, exactly one of Create, DeleteIDs, or BackoffUntil will be set.
|
||||
type ReconciliationActions struct {
|
||||
// ActionType determines which field is set and what action should be taken
|
||||
ActionType ActionType
|
||||
|
||||
// Create is set when ActionType is ActionTypeCreate and indicates the number of prebuilds to create
|
||||
Create int32
|
||||
|
||||
// DeleteIDs is set when ActionType is ActionTypeDelete and contains the IDs of prebuilds to delete
|
||||
DeleteIDs []uuid.UUID
|
||||
|
||||
// BackoffUntil is set when ActionType is ActionTypeBackoff and indicates when to retry creating prebuilds
|
||||
BackoffUntil time.Time
|
||||
}
|
||||
|
||||
// CalculateState computes the current state of prebuilds for a preset, including:
|
||||
// - Actual: Number of currently running prebuilds
|
||||
// - Desired: Number of prebuilds desired as defined in the preset
|
||||
// - Eligible: Number of prebuilds that are ready to be claimed
|
||||
// - Extraneous: Number of extra running prebuilds beyond the desired count
|
||||
// - Starting/Stopping/Deleting: Counts of prebuilds in various transition states
|
||||
//
|
||||
// The function takes into account whether the preset is active (using the active template version)
|
||||
// and calculates appropriate counts based on the current state of running prebuilds and
|
||||
// in-progress transitions. This state information is used to determine what reconciliation
|
||||
// actions are needed to reach the desired state.
|
||||
func (p PresetSnapshot) CalculateState() *ReconciliationState {
|
||||
var (
|
||||
actual int32
|
||||
desired int32
|
||||
eligible int32
|
||||
extraneous int32
|
||||
)
|
||||
|
||||
if p.isActive() {
|
||||
// #nosec G115 - Safe conversion as p.Running slice length is expected to be within int32 range
|
||||
actual = int32(len(p.Running))
|
||||
desired = p.Preset.DesiredInstances.Int32
|
||||
eligible = p.countEligible()
|
||||
extraneous = max(actual-desired, 0)
|
||||
}
|
||||
|
||||
starting, stopping, deleting := p.countInProgress()
|
||||
|
||||
return &ReconciliationState{
|
||||
Actual: actual,
|
||||
Desired: desired,
|
||||
Eligible: eligible,
|
||||
Extraneous: extraneous,
|
||||
|
||||
Starting: starting,
|
||||
Stopping: stopping,
|
||||
Deleting: deleting,
|
||||
}
|
||||
}
|
||||
|
||||
// CalculateActions determines what actions are needed to reconcile the current state with the desired state.
|
||||
// The function:
|
||||
// 1. First checks if a backoff period is needed (if previous builds failed)
|
||||
// 2. If the preset is inactive (template version is not active), it will delete all running prebuilds
|
||||
// 3. For active presets, it calculates the number of prebuilds to create or delete based on:
|
||||
// - The desired number of instances
|
||||
// - Currently running prebuilds
|
||||
// - Prebuilds in transition states (starting/stopping/deleting)
|
||||
// - Any extraneous prebuilds that need to be removed
|
||||
//
|
||||
// The function returns a ReconciliationActions struct that will have exactly one action type set:
|
||||
// - ActionTypeBackoff: Only BackoffUntil is set, indicating when to retry
|
||||
// - ActionTypeCreate: Only Create is set, indicating how many prebuilds to create
|
||||
// - ActionTypeDelete: Only DeleteIDs is set, containing IDs of prebuilds to delete
|
||||
func (p PresetSnapshot) CalculateActions(clock quartz.Clock, backoffInterval time.Duration) (*ReconciliationActions, error) {
|
||||
// TODO: align workspace states with how we represent them on the FE and the CLI
|
||||
// right now there's some slight differences which can lead to additional prebuilds being created
|
||||
|
||||
// TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is
|
||||
// about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races!
|
||||
|
||||
actions, needsBackoff := p.needsBackoffPeriod(clock, backoffInterval)
|
||||
if needsBackoff {
|
||||
return actions, nil
|
||||
}
|
||||
|
||||
if !p.isActive() {
|
||||
return p.handleInactiveTemplateVersion()
|
||||
}
|
||||
|
||||
return p.handleActiveTemplateVersion()
|
||||
}
|
||||
|
||||
// isActive returns true if the preset's template version is the active version, and it is neither deleted nor deprecated.
|
||||
// This determines whether we should maintain prebuilds for this preset or delete them.
|
||||
func (p PresetSnapshot) isActive() bool {
|
||||
return p.Preset.UsingActiveVersion && !p.Preset.Deleted && !p.Preset.Deprecated
|
||||
}
|
||||
|
||||
// handleActiveTemplateVersion deletes excess prebuilds if there are too many,
|
||||
// otherwise creates new ones to reach the desired count.
|
||||
func (p PresetSnapshot) handleActiveTemplateVersion() (*ReconciliationActions, error) {
|
||||
state := p.CalculateState()
|
||||
|
||||
// If we have more prebuilds than desired, delete the oldest ones
|
||||
if state.Extraneous > 0 {
|
||||
return &ReconciliationActions{
|
||||
ActionType: ActionTypeDelete,
|
||||
DeleteIDs: p.getOldestPrebuildIDs(int(state.Extraneous)),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Calculate how many new prebuilds we need to create
|
||||
// We subtract starting prebuilds since they're already being created
|
||||
prebuildsToCreate := max(state.Desired-state.Actual-state.Starting, 0)
|
||||
|
||||
return &ReconciliationActions{
|
||||
ActionType: ActionTypeCreate,
|
||||
Create: prebuildsToCreate,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// handleInactiveTemplateVersion deletes all running prebuilds except those already being deleted
|
||||
// to avoid duplicate deletion attempts.
|
||||
func (p PresetSnapshot) handleInactiveTemplateVersion() (*ReconciliationActions, error) {
|
||||
prebuildsToDelete := len(p.Running)
|
||||
deleteIDs := p.getOldestPrebuildIDs(prebuildsToDelete)
|
||||
|
||||
return &ReconciliationActions{
|
||||
ActionType: ActionTypeDelete,
|
||||
DeleteIDs: deleteIDs,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// needsBackoffPeriod checks if we should delay prebuild creation due to recent failures.
|
||||
// If there were failures, it calculates a backoff period based on the number of failures
|
||||
// and returns true if we're still within that period.
|
||||
func (p PresetSnapshot) needsBackoffPeriod(clock quartz.Clock, backoffInterval time.Duration) (*ReconciliationActions, bool) {
|
||||
if p.Backoff == nil || p.Backoff.NumFailed == 0 {
|
||||
return nil, false
|
||||
}
|
||||
backoffUntil := p.Backoff.LastBuildAt.Add(time.Duration(p.Backoff.NumFailed) * backoffInterval)
|
||||
if clock.Now().After(backoffUntil) {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
return &ReconciliationActions{
|
||||
ActionType: ActionTypeBackoff,
|
||||
BackoffUntil: backoffUntil,
|
||||
}, true
|
||||
}
|
||||
|
||||
// countEligible returns the number of prebuilds that are ready to be claimed.
|
||||
// A prebuild is eligible if it's running and its agents are in ready state.
|
||||
func (p PresetSnapshot) countEligible() int32 {
|
||||
var count int32
|
||||
for _, prebuild := range p.Running {
|
||||
if prebuild.Ready {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// countInProgress returns counts of prebuilds in transition states (starting, stopping, deleting).
|
||||
// These counts are tracked at the template level, so all presets sharing the same template see the same values.
|
||||
func (p PresetSnapshot) countInProgress() (starting int32, stopping int32, deleting int32) {
|
||||
for _, progress := range p.InProgress {
|
||||
num := progress.Count
|
||||
switch progress.Transition {
|
||||
case database.WorkspaceTransitionStart:
|
||||
starting += num
|
||||
case database.WorkspaceTransitionStop:
|
||||
stopping += num
|
||||
case database.WorkspaceTransitionDelete:
|
||||
deleting += num
|
||||
}
|
||||
}
|
||||
|
||||
return starting, stopping, deleting
|
||||
}
|
||||
|
||||
// getOldestPrebuildIDs returns the IDs of the N oldest prebuilds, sorted by creation time.
|
||||
// This is used when we need to delete prebuilds, ensuring we remove the oldest ones first.
|
||||
func (p PresetSnapshot) getOldestPrebuildIDs(n int) []uuid.UUID {
|
||||
// Sort by creation time, oldest first
|
||||
slices.SortFunc(p.Running, func(a, b database.GetRunningPrebuiltWorkspacesRow) int {
|
||||
return a.CreatedAt.Compare(b.CreatedAt)
|
||||
})
|
||||
|
||||
// Take the first N IDs
|
||||
n = min(n, len(p.Running))
|
||||
ids := make([]uuid.UUID, n)
|
||||
for i := 0; i < n; i++ {
|
||||
ids[i] = p.Running[i].ID
|
||||
}
|
||||
|
||||
return ids
|
||||
}
|
||||
@@ -0,0 +1,758 @@
|
||||
package prebuilds_test
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/coder/quartz"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/prebuilds"
|
||||
)
|
||||
|
||||
type options struct {
|
||||
templateID uuid.UUID
|
||||
templateVersionID uuid.UUID
|
||||
presetID uuid.UUID
|
||||
presetName string
|
||||
prebuiltWorkspaceID uuid.UUID
|
||||
workspaceName string
|
||||
}
|
||||
|
||||
// templateID is common across all option sets.
|
||||
var templateID = uuid.UUID{1}
|
||||
|
||||
const (
|
||||
backoffInterval = time.Second * 5
|
||||
|
||||
optionSet0 = iota
|
||||
optionSet1
|
||||
optionSet2
|
||||
)
|
||||
|
||||
var opts = map[uint]options{
|
||||
optionSet0: {
|
||||
templateID: templateID,
|
||||
templateVersionID: uuid.UUID{11},
|
||||
presetID: uuid.UUID{12},
|
||||
presetName: "my-preset",
|
||||
prebuiltWorkspaceID: uuid.UUID{13},
|
||||
workspaceName: "prebuilds0",
|
||||
},
|
||||
optionSet1: {
|
||||
templateID: templateID,
|
||||
templateVersionID: uuid.UUID{21},
|
||||
presetID: uuid.UUID{22},
|
||||
presetName: "my-preset",
|
||||
prebuiltWorkspaceID: uuid.UUID{23},
|
||||
workspaceName: "prebuilds1",
|
||||
},
|
||||
optionSet2: {
|
||||
templateID: templateID,
|
||||
templateVersionID: uuid.UUID{31},
|
||||
presetID: uuid.UUID{32},
|
||||
presetName: "my-preset",
|
||||
prebuiltWorkspaceID: uuid.UUID{33},
|
||||
workspaceName: "prebuilds2",
|
||||
},
|
||||
}
|
||||
|
||||
// A new template version with a preset without prebuilds configured should result in no prebuilds being created.
|
||||
func TestNoPrebuilds(t *testing.T) {
|
||||
t.Parallel()
|
||||
current := opts[optionSet0]
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
preset(true, 0, current),
|
||||
}
|
||||
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
|
||||
validateState(t, prebuilds.ReconciliationState{ /*all zero values*/ }, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 0,
|
||||
}, *actions)
|
||||
}
|
||||
|
||||
// A new template version with a preset with prebuilds configured should result in a new prebuild being created.
|
||||
func TestNetNew(t *testing.T) {
|
||||
t.Parallel()
|
||||
current := opts[optionSet0]
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
preset(true, 1, current),
|
||||
}
|
||||
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
|
||||
validateState(t, prebuilds.ReconciliationState{
|
||||
Desired: 1,
|
||||
}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 1,
|
||||
}, *actions)
|
||||
}
|
||||
|
||||
// A new template version is created with a preset with prebuilds configured; this outdates the older version and
|
||||
// requires the old prebuilds to be destroyed and new prebuilds to be created.
|
||||
func TestOutdatedPrebuilds(t *testing.T) {
|
||||
t.Parallel()
|
||||
outdated := opts[optionSet0]
|
||||
current := opts[optionSet1]
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
// GIVEN: 2 presets, one outdated and one new.
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
preset(false, 1, outdated),
|
||||
preset(true, 1, current),
|
||||
}
|
||||
|
||||
// GIVEN: a running prebuild for the outdated preset.
|
||||
running := []database.GetRunningPrebuiltWorkspacesRow{
|
||||
prebuiltWorkspace(outdated, clock),
|
||||
}
|
||||
|
||||
// GIVEN: no in-progress builds.
|
||||
var inProgress []database.CountInProgressPrebuildsRow
|
||||
|
||||
// WHEN: calculating the outdated preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
ps, err := snapshot.FilterByPreset(outdated.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// THEN: we should identify that this prebuild is outdated and needs to be deleted.
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
validateState(t, prebuilds.ReconciliationState{}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeDelete,
|
||||
DeleteIDs: []uuid.UUID{outdated.prebuiltWorkspaceID},
|
||||
}, *actions)
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
ps, err = snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// THEN: we should not be blocked from creating a new prebuild while the outdate one deletes.
|
||||
state = ps.CalculateState()
|
||||
actions, err = ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
validateState(t, prebuilds.ReconciliationState{Desired: 1}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 1,
|
||||
}, *actions)
|
||||
}
|
||||
|
||||
// Make sure that outdated prebuild will be deleted, even if deletion of another outdated prebuild is already in progress.
|
||||
func TestDeleteOutdatedPrebuilds(t *testing.T) {
|
||||
t.Parallel()
|
||||
outdated := opts[optionSet0]
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
// GIVEN: 1 outdated preset.
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
preset(false, 1, outdated),
|
||||
}
|
||||
|
||||
// GIVEN: one running prebuild for the outdated preset.
|
||||
running := []database.GetRunningPrebuiltWorkspacesRow{
|
||||
prebuiltWorkspace(outdated, clock),
|
||||
}
|
||||
|
||||
// GIVEN: one deleting prebuild for the outdated preset.
|
||||
inProgress := []database.CountInProgressPrebuildsRow{
|
||||
{
|
||||
TemplateID: outdated.templateID,
|
||||
TemplateVersionID: outdated.templateVersionID,
|
||||
Transition: database.WorkspaceTransitionDelete,
|
||||
Count: 1,
|
||||
PresetID: uuid.NullUUID{
|
||||
UUID: outdated.presetID,
|
||||
Valid: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// WHEN: calculating the outdated preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
ps, err := snapshot.FilterByPreset(outdated.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// THEN: we should identify that this prebuild is outdated and needs to be deleted.
|
||||
// Despite the fact that deletion of another outdated prebuild is already in progress.
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
validateState(t, prebuilds.ReconciliationState{
|
||||
Deleting: 1,
|
||||
}, *state)
|
||||
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeDelete,
|
||||
DeleteIDs: []uuid.UUID{outdated.prebuiltWorkspaceID},
|
||||
}, *actions)
|
||||
}
|
||||
|
||||
// A new template version is created with a preset with prebuilds configured; while a prebuild is provisioning up or down,
|
||||
// the calculated actions should indicate the state correctly.
|
||||
func TestInProgressActions(t *testing.T) {
|
||||
t.Parallel()
|
||||
current := opts[optionSet0]
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
transition database.WorkspaceTransition
|
||||
desired int32
|
||||
running int32
|
||||
inProgress int32
|
||||
checkFn func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions)
|
||||
}{
|
||||
// With no running prebuilds and one starting, no creations/deletions should take place.
|
||||
{
|
||||
name: fmt.Sprintf("%s-short", database.WorkspaceTransitionStart),
|
||||
transition: database.WorkspaceTransitionStart,
|
||||
desired: 1,
|
||||
running: 0,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Desired: 1, Starting: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With one running prebuild and one starting, no creations/deletions should occur since we're approaching the correct state.
|
||||
{
|
||||
name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionStart),
|
||||
transition: database.WorkspaceTransitionStart,
|
||||
desired: 2,
|
||||
running: 1,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 2, Starting: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With one running prebuild and one starting, no creations/deletions should occur
|
||||
// SIDE-NOTE: once the starting prebuild completes, the older of the two will be considered extraneous since we only desire 2.
|
||||
{
|
||||
name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionStart),
|
||||
transition: database.WorkspaceTransitionStart,
|
||||
desired: 2,
|
||||
running: 2,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 2, Starting: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With one prebuild desired and one stopping, a new prebuild will be created.
|
||||
{
|
||||
name: fmt.Sprintf("%s-short", database.WorkspaceTransitionStop),
|
||||
transition: database.WorkspaceTransitionStop,
|
||||
desired: 1,
|
||||
running: 0,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Desired: 1, Stopping: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 1,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With 3 prebuilds desired, 2 running, and 1 stopping, a new prebuild will be created.
|
||||
{
|
||||
name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionStop),
|
||||
transition: database.WorkspaceTransitionStop,
|
||||
desired: 3,
|
||||
running: 2,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 3, Stopping: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 1,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With 3 prebuilds desired, 3 running, and 1 stopping, no creations/deletions should occur since the desired state is already achieved.
|
||||
{
|
||||
name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionStop),
|
||||
transition: database.WorkspaceTransitionStop,
|
||||
desired: 3,
|
||||
running: 3,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Actual: 3, Desired: 3, Stopping: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With one prebuild desired and one deleting, a new prebuild will be created.
|
||||
{
|
||||
name: fmt.Sprintf("%s-short", database.WorkspaceTransitionDelete),
|
||||
transition: database.WorkspaceTransitionDelete,
|
||||
desired: 1,
|
||||
running: 0,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Desired: 1, Deleting: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 1,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With 2 prebuilds desired, 1 running, and 1 deleting, a new prebuild will be created.
|
||||
{
|
||||
name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionDelete),
|
||||
transition: database.WorkspaceTransitionDelete,
|
||||
desired: 2,
|
||||
running: 1,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 2, Deleting: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 1,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With 2 prebuilds desired, 2 running, and 1 deleting, no creations/deletions should occur since the desired state is already achieved.
|
||||
{
|
||||
name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionDelete),
|
||||
transition: database.WorkspaceTransitionDelete,
|
||||
desired: 2,
|
||||
running: 2,
|
||||
inProgress: 1,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 2, Deleting: 1}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
}, actions)
|
||||
},
|
||||
},
|
||||
// With 3 prebuilds desired, 1 running, and 2 starting, no creations should occur since the builds are in progress.
|
||||
{
|
||||
name: fmt.Sprintf("%s-inhibit", database.WorkspaceTransitionStart),
|
||||
transition: database.WorkspaceTransitionStart,
|
||||
desired: 3,
|
||||
running: 1,
|
||||
inProgress: 2,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 3, Starting: 2}, state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{ActionType: prebuilds.ActionTypeCreate, Create: 0}, actions)
|
||||
},
|
||||
},
|
||||
// With 3 prebuilds desired, 5 running, and 2 deleting, no deletions should occur since the builds are in progress.
|
||||
{
|
||||
name: fmt.Sprintf("%s-inhibit", database.WorkspaceTransitionDelete),
|
||||
transition: database.WorkspaceTransitionDelete,
|
||||
desired: 3,
|
||||
running: 5,
|
||||
inProgress: 2,
|
||||
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
|
||||
expectedState := prebuilds.ReconciliationState{Actual: 5, Desired: 3, Deleting: 2, Extraneous: 2}
|
||||
expectedActions := prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeDelete,
|
||||
}
|
||||
|
||||
validateState(t, expectedState, state)
|
||||
assert.EqualValuesf(t, expectedActions.ActionType, actions.ActionType, "'ActionType' did not match expectation")
|
||||
assert.Len(t, actions.DeleteIDs, 2, "'deleteIDs' did not match expectation")
|
||||
assert.EqualValuesf(t, expectedActions.Create, actions.Create, "'create' did not match expectation")
|
||||
assert.EqualValuesf(t, expectedActions.BackoffUntil, actions.BackoffUntil, "'BackoffUntil' did not match expectation")
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// GIVEN: a preset.
|
||||
defaultPreset := preset(true, tc.desired, current)
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
defaultPreset,
|
||||
}
|
||||
|
||||
// GIVEN: running prebuilt workspaces for the preset.
|
||||
running := make([]database.GetRunningPrebuiltWorkspacesRow, 0, tc.running)
|
||||
for range tc.running {
|
||||
name, err := prebuilds.GenerateName()
|
||||
require.NoError(t, err)
|
||||
running = append(running, database.GetRunningPrebuiltWorkspacesRow{
|
||||
ID: uuid.New(),
|
||||
Name: name,
|
||||
TemplateID: current.templateID,
|
||||
TemplateVersionID: current.templateVersionID,
|
||||
CurrentPresetID: uuid.NullUUID{UUID: current.presetID, Valid: true},
|
||||
Ready: false,
|
||||
CreatedAt: clock.Now(),
|
||||
})
|
||||
}
|
||||
|
||||
// GIVEN: some prebuilds for the preset which are currently transitioning.
|
||||
inProgress := []database.CountInProgressPrebuildsRow{
|
||||
{
|
||||
TemplateID: current.templateID,
|
||||
TemplateVersionID: current.templateVersionID,
|
||||
Transition: tc.transition,
|
||||
Count: tc.inProgress,
|
||||
PresetID: uuid.NullUUID{
|
||||
UUID: defaultPreset.ID,
|
||||
Valid: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// THEN: we should identify that this prebuild is in progress.
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
tc.checkFn(*state, *actions)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Additional prebuilds exist for a given preset configuration; these must be deleted.
|
||||
func TestExtraneous(t *testing.T) {
|
||||
t.Parallel()
|
||||
current := opts[optionSet0]
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
// GIVEN: a preset with 1 desired prebuild.
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
preset(true, 1, current),
|
||||
}
|
||||
|
||||
var older uuid.UUID
|
||||
// GIVEN: 2 running prebuilds for the preset.
|
||||
running := []database.GetRunningPrebuiltWorkspacesRow{
|
||||
prebuiltWorkspace(current, clock, func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow {
|
||||
// The older of the running prebuilds will be deleted in order to maintain freshness.
|
||||
row.CreatedAt = clock.Now().Add(-time.Hour)
|
||||
older = row.ID
|
||||
return row
|
||||
}),
|
||||
prebuiltWorkspace(current, clock, func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow {
|
||||
row.CreatedAt = clock.Now()
|
||||
return row
|
||||
}),
|
||||
}
|
||||
|
||||
// GIVEN: NO prebuilds in progress.
|
||||
var inProgress []database.CountInProgressPrebuildsRow
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// THEN: an extraneous prebuild is detected and marked for deletion.
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
validateState(t, prebuilds.ReconciliationState{
|
||||
Actual: 2, Desired: 1, Extraneous: 1, Eligible: 2,
|
||||
}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeDelete,
|
||||
DeleteIDs: []uuid.UUID{older},
|
||||
}, *actions)
|
||||
}
|
||||
|
||||
// A template marked as deprecated will not have prebuilds running.
|
||||
func TestDeprecated(t *testing.T) {
|
||||
t.Parallel()
|
||||
current := opts[optionSet0]
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
// GIVEN: a preset with 1 desired prebuild.
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
preset(true, 1, current, func(row database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow {
|
||||
row.Deprecated = true
|
||||
return row
|
||||
}),
|
||||
}
|
||||
|
||||
// GIVEN: 1 running prebuilds for the preset.
|
||||
running := []database.GetRunningPrebuiltWorkspacesRow{
|
||||
prebuiltWorkspace(current, clock),
|
||||
}
|
||||
|
||||
// GIVEN: NO prebuilds in progress.
|
||||
var inProgress []database.CountInProgressPrebuildsRow
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
|
||||
ps, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// THEN: all running prebuilds should be deleted because the template is deprecated.
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
validateState(t, prebuilds.ReconciliationState{}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeDelete,
|
||||
DeleteIDs: []uuid.UUID{current.prebuiltWorkspaceID},
|
||||
}, *actions)
|
||||
}
|
||||
|
||||
// If the latest build failed, backoff exponentially with the given interval.
|
||||
func TestLatestBuildFailed(t *testing.T) {
|
||||
t.Parallel()
|
||||
current := opts[optionSet0]
|
||||
other := opts[optionSet1]
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
// GIVEN: two presets.
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
preset(true, 1, current),
|
||||
preset(true, 1, other),
|
||||
}
|
||||
|
||||
// GIVEN: running prebuilds only for one preset (the other will be failing, as evidenced by the backoffs below).
|
||||
running := []database.GetRunningPrebuiltWorkspacesRow{
|
||||
prebuiltWorkspace(other, clock),
|
||||
}
|
||||
|
||||
// GIVEN: NO prebuilds in progress.
|
||||
var inProgress []database.CountInProgressPrebuildsRow
|
||||
|
||||
// GIVEN: a backoff entry.
|
||||
lastBuildTime := clock.Now()
|
||||
numFailed := 1
|
||||
backoffs := []database.GetPresetsBackoffRow{
|
||||
{
|
||||
TemplateVersionID: current.templateVersionID,
|
||||
PresetID: current.presetID,
|
||||
NumFailed: int32(numFailed),
|
||||
LastBuildAt: lastBuildTime,
|
||||
},
|
||||
}
|
||||
|
||||
// WHEN: calculating the current preset's state.
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, backoffs)
|
||||
psCurrent, err := snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// THEN: reconciliation should backoff.
|
||||
state := psCurrent.CalculateState()
|
||||
actions, err := psCurrent.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
validateState(t, prebuilds.ReconciliationState{
|
||||
Actual: 0, Desired: 1,
|
||||
}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeBackoff,
|
||||
BackoffUntil: lastBuildTime.Add(time.Duration(numFailed) * backoffInterval),
|
||||
}, *actions)
|
||||
|
||||
// WHEN: calculating the other preset's state.
|
||||
psOther, err := snapshot.FilterByPreset(other.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
// THEN: it should NOT be in backoff because all is OK.
|
||||
state = psOther.CalculateState()
|
||||
actions, err = psOther.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
validateState(t, prebuilds.ReconciliationState{
|
||||
Actual: 1, Desired: 1, Eligible: 1,
|
||||
}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
BackoffUntil: time.Time{},
|
||||
}, *actions)
|
||||
|
||||
// WHEN: the clock is advanced a backoff interval.
|
||||
clock.Advance(backoffInterval + time.Microsecond)
|
||||
|
||||
// THEN: a new prebuild should be created.
|
||||
psCurrent, err = snapshot.FilterByPreset(current.presetID)
|
||||
require.NoError(t, err)
|
||||
state = psCurrent.CalculateState()
|
||||
actions, err = psCurrent.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
validateState(t, prebuilds.ReconciliationState{
|
||||
Actual: 0, Desired: 1,
|
||||
}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 1, // <--- NOTE: we're now able to create a new prebuild because the interval has elapsed.
|
||||
|
||||
}, *actions)
|
||||
}
|
||||
|
||||
func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
templateID := uuid.New()
|
||||
templateVersionID := uuid.New()
|
||||
presetOpts1 := options{
|
||||
templateID: templateID,
|
||||
templateVersionID: templateVersionID,
|
||||
presetID: uuid.New(),
|
||||
presetName: "my-preset-1",
|
||||
prebuiltWorkspaceID: uuid.New(),
|
||||
workspaceName: "prebuilds1",
|
||||
}
|
||||
presetOpts2 := options{
|
||||
templateID: templateID,
|
||||
templateVersionID: templateVersionID,
|
||||
presetID: uuid.New(),
|
||||
presetName: "my-preset-2",
|
||||
prebuiltWorkspaceID: uuid.New(),
|
||||
workspaceName: "prebuilds2",
|
||||
}
|
||||
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
presets := []database.GetTemplatePresetsWithPrebuildsRow{
|
||||
preset(true, 1, presetOpts1),
|
||||
preset(true, 1, presetOpts2),
|
||||
}
|
||||
|
||||
inProgress := []database.CountInProgressPrebuildsRow{
|
||||
{
|
||||
TemplateID: templateID,
|
||||
TemplateVersionID: templateVersionID,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
Count: 1,
|
||||
PresetID: uuid.NullUUID{
|
||||
UUID: presetOpts1.presetID,
|
||||
Valid: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, inProgress, nil)
|
||||
|
||||
// Nothing has to be created for preset 1.
|
||||
{
|
||||
ps, err := snapshot.FilterByPreset(presetOpts1.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
|
||||
validateState(t, prebuilds.ReconciliationState{
|
||||
Starting: 1,
|
||||
Desired: 1,
|
||||
}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 0,
|
||||
}, *actions)
|
||||
}
|
||||
|
||||
// One prebuild has to be created for preset 2. Make sure preset 1 doesn't block preset 2.
|
||||
{
|
||||
ps, err := snapshot.FilterByPreset(presetOpts2.presetID)
|
||||
require.NoError(t, err)
|
||||
|
||||
state := ps.CalculateState()
|
||||
actions, err := ps.CalculateActions(clock, backoffInterval)
|
||||
require.NoError(t, err)
|
||||
|
||||
validateState(t, prebuilds.ReconciliationState{
|
||||
Starting: 0,
|
||||
Desired: 1,
|
||||
}, *state)
|
||||
validateActions(t, prebuilds.ReconciliationActions{
|
||||
ActionType: prebuilds.ActionTypeCreate,
|
||||
Create: 1,
|
||||
}, *actions)
|
||||
}
|
||||
}
|
||||
|
||||
func preset(active bool, instances int32, opts options, muts ...func(row database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow {
|
||||
entry := database.GetTemplatePresetsWithPrebuildsRow{
|
||||
TemplateID: opts.templateID,
|
||||
TemplateVersionID: opts.templateVersionID,
|
||||
ID: opts.presetID,
|
||||
UsingActiveVersion: active,
|
||||
Name: opts.presetName,
|
||||
DesiredInstances: sql.NullInt32{
|
||||
Valid: true,
|
||||
Int32: instances,
|
||||
},
|
||||
Deleted: false,
|
||||
Deprecated: false,
|
||||
}
|
||||
|
||||
for _, mut := range muts {
|
||||
entry = mut(entry)
|
||||
}
|
||||
return entry
|
||||
}
|
||||
|
||||
func prebuiltWorkspace(
|
||||
opts options,
|
||||
clock quartz.Clock,
|
||||
muts ...func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow,
|
||||
) database.GetRunningPrebuiltWorkspacesRow {
|
||||
entry := database.GetRunningPrebuiltWorkspacesRow{
|
||||
ID: opts.prebuiltWorkspaceID,
|
||||
Name: opts.workspaceName,
|
||||
TemplateID: opts.templateID,
|
||||
TemplateVersionID: opts.templateVersionID,
|
||||
CurrentPresetID: uuid.NullUUID{UUID: opts.presetID, Valid: true},
|
||||
Ready: true,
|
||||
CreatedAt: clock.Now(),
|
||||
}
|
||||
|
||||
for _, mut := range muts {
|
||||
entry = mut(entry)
|
||||
}
|
||||
return entry
|
||||
}
|
||||
|
||||
func validateState(t *testing.T, expected, actual prebuilds.ReconciliationState) {
|
||||
require.Equal(t, expected, actual)
|
||||
}
|
||||
|
||||
// validateActions is a convenience func to make tests more readable; it exploits the fact that the default states for
|
||||
// prebuilds align with zero values.
|
||||
func validateActions(t *testing.T, expected, actual prebuilds.ReconciliationActions) {
|
||||
require.Equal(t, expected, actual)
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
package prebuilds
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/base32"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GenerateName generates a 20-byte prebuild name which should safe to use without truncation in most situations.
|
||||
// UUIDs may be too long for a resource name in cloud providers (since this ID will be used in the prebuild's name).
|
||||
//
|
||||
// We're generating a 9-byte suffix (72 bits of entropy):
|
||||
// 1 - e^(-1e9^2 / (2 * 2^72)) = ~0.01% likelihood of collision in 1 billion IDs.
|
||||
// See https://en.wikipedia.org/wiki/Birthday_attack.
|
||||
func GenerateName() (string, error) {
|
||||
b := make([]byte, 9)
|
||||
|
||||
_, err := rand.Read(b)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Encode the bytes to Base32 (A-Z2-7), strip any '=' padding
|
||||
return fmt.Sprintf("prebuild-%s", strings.ToLower(base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(b))), nil
|
||||
}
|
||||
@@ -90,6 +90,17 @@ func Find[T any](haystack []T, cond func(T) bool) (T, bool) {
|
||||
return empty, false
|
||||
}
|
||||
|
||||
// Filter returns all elements that satisfy the condition.
|
||||
func Filter[T any](haystack []T, cond func(T) bool) []T {
|
||||
out := make([]T, 0, len(haystack))
|
||||
for _, hay := range haystack {
|
||||
if cond(hay) {
|
||||
out = append(out, hay)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// Overlap returns if the 2 sets have any overlap (element(s) in common)
|
||||
func Overlap[T comparable](a []T, b []T) bool {
|
||||
return OverlapCompare(a, b, func(a, b T) bool {
|
||||
|
||||
@@ -2,6 +2,7 @@ package slice_test
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@@ -82,6 +83,64 @@ func TestContains(t *testing.T) {
|
||||
)
|
||||
}
|
||||
|
||||
func TestFilter(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
type testCase[T any] struct {
|
||||
haystack []T
|
||||
cond func(T) bool
|
||||
expected []T
|
||||
}
|
||||
|
||||
{
|
||||
testCases := []*testCase[int]{
|
||||
{
|
||||
haystack: []int{1, 2, 3, 4, 5},
|
||||
cond: func(num int) bool {
|
||||
return num%2 == 1
|
||||
},
|
||||
expected: []int{1, 3, 5},
|
||||
},
|
||||
{
|
||||
haystack: []int{1, 2, 3, 4, 5},
|
||||
cond: func(num int) bool {
|
||||
return num%2 == 0
|
||||
},
|
||||
expected: []int{2, 4},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
actual := slice.Filter(tc.haystack, tc.cond)
|
||||
require.Equal(t, tc.expected, actual)
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
testCases := []*testCase[string]{
|
||||
{
|
||||
haystack: []string{"hello", "hi", "bye"},
|
||||
cond: func(str string) bool {
|
||||
return strings.HasPrefix(str, "h")
|
||||
},
|
||||
expected: []string{"hello", "hi"},
|
||||
},
|
||||
{
|
||||
haystack: []string{"hello", "hi", "bye"},
|
||||
cond: func(str string) bool {
|
||||
return strings.HasPrefix(str, "b")
|
||||
},
|
||||
expected: []string{"bye"},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
actual := slice.Filter(tc.haystack, tc.cond)
|
||||
require.Equal(t, tc.expected, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestOverlap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -791,6 +791,19 @@ type NotificationsWebhookConfig struct {
|
||||
Endpoint serpent.URL `json:"endpoint" typescript:",notnull"`
|
||||
}
|
||||
|
||||
type PrebuildsConfig struct {
|
||||
// ReconciliationInterval defines how often the workspace prebuilds state should be reconciled.
|
||||
ReconciliationInterval serpent.Duration `json:"reconciliation_interval" typescript:",notnull"`
|
||||
|
||||
// ReconciliationBackoffInterval specifies the amount of time to increase the backoff interval
|
||||
// when errors occur during reconciliation.
|
||||
ReconciliationBackoffInterval serpent.Duration `json:"reconciliation_backoff_interval" typescript:",notnull"`
|
||||
|
||||
// ReconciliationBackoffLookback determines the time window to look back when calculating
|
||||
// the number of failed prebuilds, which influences the backoff strategy.
|
||||
ReconciliationBackoffLookback serpent.Duration `json:"reconciliation_backoff_lookback" typescript:",notnull"`
|
||||
}
|
||||
|
||||
const (
|
||||
annotationFormatDuration = "format_duration"
|
||||
annotationEnterpriseKey = "enterprise"
|
||||
|
||||
@@ -0,0 +1,541 @@
|
||||
package prebuilds
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"math"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/go-multierror"
|
||||
|
||||
"github.com/coder/quartz"
|
||||
|
||||
"github.com/coder/coder/v2/coderd/audit"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
||||
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
|
||||
"github.com/coder/coder/v2/coderd/database/pubsub"
|
||||
"github.com/coder/coder/v2/coderd/prebuilds"
|
||||
"github.com/coder/coder/v2/coderd/rbac"
|
||||
"github.com/coder/coder/v2/coderd/rbac/policy"
|
||||
"github.com/coder/coder/v2/coderd/wsbuilder"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
|
||||
"cdr.dev/slog"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"golang.org/x/xerrors"
|
||||
)
|
||||
|
||||
type StoreReconciler struct {
|
||||
store database.Store
|
||||
cfg codersdk.PrebuildsConfig
|
||||
pubsub pubsub.Pubsub
|
||||
logger slog.Logger
|
||||
clock quartz.Clock
|
||||
|
||||
cancelFn context.CancelCauseFunc
|
||||
stopped atomic.Bool
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
|
||||
|
||||
func NewStoreReconciler(
|
||||
store database.Store,
|
||||
ps pubsub.Pubsub,
|
||||
cfg codersdk.PrebuildsConfig,
|
||||
logger slog.Logger,
|
||||
clock quartz.Clock,
|
||||
) *StoreReconciler {
|
||||
return &StoreReconciler{
|
||||
store: store,
|
||||
pubsub: ps,
|
||||
logger: logger,
|
||||
cfg: cfg,
|
||||
clock: clock,
|
||||
done: make(chan struct{}, 1),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) RunLoop(ctx context.Context) {
|
||||
reconciliationInterval := c.cfg.ReconciliationInterval.Value()
|
||||
if reconciliationInterval <= 0 { // avoids a panic
|
||||
reconciliationInterval = 5 * time.Minute
|
||||
}
|
||||
|
||||
c.logger.Info(ctx, "starting reconciler",
|
||||
slog.F("interval", reconciliationInterval),
|
||||
slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()),
|
||||
slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String()))
|
||||
|
||||
ticker := c.clock.NewTicker(reconciliationInterval)
|
||||
defer ticker.Stop()
|
||||
defer func() {
|
||||
c.done <- struct{}{}
|
||||
}()
|
||||
|
||||
// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.
|
||||
ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx))
|
||||
c.cancelFn = cancel
|
||||
|
||||
for {
|
||||
select {
|
||||
// TODO: implement pubsub listener to allow reconciling a specific template imperatively once it has been changed,
|
||||
// instead of waiting for the next reconciliation interval
|
||||
case <-ticker.C:
|
||||
// Trigger a new iteration on each tick.
|
||||
err := c.ReconcileAll(ctx)
|
||||
if err != nil {
|
||||
c.logger.Error(context.Background(), "reconciliation failed", slog.Error(err))
|
||||
}
|
||||
case <-ctx.Done():
|
||||
// nolint:gocritic // it's okay to use slog.F() for an error in this case
|
||||
// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
|
||||
c.logger.Warn(
|
||||
context.Background(),
|
||||
"reconciliation loop exited",
|
||||
slog.Error(ctx.Err()),
|
||||
slog.F("cause", context.Cause(ctx)),
|
||||
)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) Stop(ctx context.Context, cause error) {
|
||||
if cause != nil {
|
||||
c.logger.Error(context.Background(), "stopping reconciler due to an error", slog.Error(cause))
|
||||
} else {
|
||||
c.logger.Info(context.Background(), "gracefully stopping reconciler")
|
||||
}
|
||||
|
||||
if c.isStopped() {
|
||||
return
|
||||
}
|
||||
c.stopped.Store(true)
|
||||
if c.cancelFn != nil {
|
||||
c.cancelFn(cause)
|
||||
}
|
||||
|
||||
select {
|
||||
// Give up waiting for control loop to exit.
|
||||
case <-ctx.Done():
|
||||
// nolint:gocritic // it's okay to use slog.F() for an error in this case
|
||||
// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
|
||||
c.logger.Error(
|
||||
context.Background(),
|
||||
"reconciler stop exited prematurely",
|
||||
slog.Error(ctx.Err()),
|
||||
slog.F("cause", context.Cause(ctx)),
|
||||
)
|
||||
// Wait for the control loop to exit.
|
||||
case <-c.done:
|
||||
c.logger.Info(context.Background(), "reconciler stopped")
|
||||
}
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) isStopped() bool {
|
||||
return c.stopped.Load()
|
||||
}
|
||||
|
||||
// ReconcileAll will attempt to resolve the desired vs actual state of all templates which have presets with prebuilds configured.
|
||||
//
|
||||
// NOTE:
|
||||
//
|
||||
// This function will kick of n provisioner jobs, based on the calculated state modifications.
|
||||
//
|
||||
// These provisioning jobs are fire-and-forget. We DO NOT wait for the prebuilt workspaces to complete their
|
||||
// provisioning. As a consequence, it's possible that another reconciliation run will occur, which will mean that
|
||||
// multiple preset versions could be reconciling at once. This may mean some temporary over-provisioning, but the
|
||||
// reconciliation loop will bring these resources back into their desired numbers in an EVENTUALLY-consistent way.
|
||||
//
|
||||
// For example: we could decide to provision 1 new instance in this reconciliation.
|
||||
// While that workspace is being provisioned, another template version is created which means this same preset will
|
||||
// be reconciled again, leading to another workspace being provisioned. Two workspace builds will be occurring
|
||||
// simultaneously for the same preset, but once both jobs have completed the reconciliation loop will notice the
|
||||
// extraneous instance and delete it.
|
||||
func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
|
||||
logger := c.logger.With(slog.F("reconcile_context", "all"))
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Warn(context.Background(), "reconcile exiting prematurely; context done", slog.Error(ctx.Err()))
|
||||
return nil
|
||||
default:
|
||||
}
|
||||
|
||||
logger.Debug(ctx, "starting reconciliation")
|
||||
|
||||
err := c.WithReconciliationLock(ctx, logger, func(ctx context.Context, db database.Store) error {
|
||||
snapshot, err := c.SnapshotState(ctx, db)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("determine current snapshot: %w", err)
|
||||
}
|
||||
if len(snapshot.Presets) == 0 {
|
||||
logger.Debug(ctx, "no templates found with prebuilds configured")
|
||||
return nil
|
||||
}
|
||||
|
||||
var eg errgroup.Group
|
||||
// Reconcile presets in parallel. Each preset in its own goroutine.
|
||||
for _, preset := range snapshot.Presets {
|
||||
ps, err := snapshot.FilterByPreset(preset.ID)
|
||||
if err != nil {
|
||||
logger.Warn(ctx, "failed to find preset snapshot", slog.Error(err), slog.F("preset_id", preset.ID.String()))
|
||||
continue
|
||||
}
|
||||
|
||||
eg.Go(func() error {
|
||||
// Pass outer context.
|
||||
err = c.ReconcilePreset(ctx, *ps)
|
||||
if err != nil {
|
||||
logger.Error(
|
||||
ctx,
|
||||
"failed to reconcile prebuilds for preset",
|
||||
slog.Error(err),
|
||||
slog.F("preset_id", preset.ID),
|
||||
)
|
||||
}
|
||||
// DO NOT return error otherwise the tx will end.
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
// Release lock only when all preset reconciliation goroutines are finished.
|
||||
return eg.Wait()
|
||||
})
|
||||
if err != nil {
|
||||
logger.Error(ctx, "failed to reconcile", slog.Error(err))
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// SnapshotState captures the current state of all prebuilds across templates.
|
||||
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var state prebuilds.GlobalSnapshot
|
||||
|
||||
err := store.InTx(func(db database.Store) error {
|
||||
// TODO: implement template-specific reconciliations later
|
||||
presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, uuid.NullUUID{})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to get template presets with prebuilds: %w", err)
|
||||
}
|
||||
if len(presetsWithPrebuilds) == 0 {
|
||||
return nil
|
||||
}
|
||||
allRunningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to get running prebuilds: %w", err)
|
||||
}
|
||||
|
||||
allPrebuildsInProgress, err := db.CountInProgressPrebuilds(ctx)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to get prebuilds in progress: %w", err)
|
||||
}
|
||||
|
||||
presetsBackoff, err := db.GetPresetsBackoff(ctx, c.clock.Now().Add(-c.cfg.ReconciliationBackoffLookback.Value()))
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to get backoffs for presets: %w", err)
|
||||
}
|
||||
|
||||
state = prebuilds.NewGlobalSnapshot(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress, presetsBackoff)
|
||||
return nil
|
||||
}, &database.TxOptions{
|
||||
Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs
|
||||
ReadOnly: true,
|
||||
TxIdentifier: "prebuilds_state_determination",
|
||||
})
|
||||
|
||||
return &state, err
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.PresetSnapshot) error {
|
||||
logger := c.logger.With(
|
||||
slog.F("template_id", ps.Preset.TemplateID.String()),
|
||||
slog.F("template_name", ps.Preset.TemplateName),
|
||||
slog.F("template_version_id", ps.Preset.TemplateVersionID),
|
||||
slog.F("template_version_name", ps.Preset.TemplateVersionName),
|
||||
slog.F("preset_id", ps.Preset.ID),
|
||||
slog.F("preset_name", ps.Preset.Name),
|
||||
)
|
||||
|
||||
state := ps.CalculateState()
|
||||
actions, err := c.CalculateActions(ctx, ps)
|
||||
if err != nil {
|
||||
logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err), slog.F("preset_id", ps.Preset.ID))
|
||||
return nil
|
||||
}
|
||||
|
||||
// nolint:gocritic // ReconcilePreset needs Prebuilds Orchestrator permissions.
|
||||
prebuildsCtx := dbauthz.AsPrebuildsOrchestrator(ctx)
|
||||
|
||||
levelFn := logger.Debug
|
||||
switch {
|
||||
case actions.ActionType == prebuilds.ActionTypeBackoff:
|
||||
levelFn = logger.Warn
|
||||
// Log at info level when there's a change to be effected.
|
||||
case actions.ActionType == prebuilds.ActionTypeCreate && actions.Create > 0:
|
||||
levelFn = logger.Info
|
||||
case actions.ActionType == prebuilds.ActionTypeDelete && len(actions.DeleteIDs) > 0:
|
||||
levelFn = logger.Info
|
||||
}
|
||||
|
||||
fields := []any{
|
||||
slog.F("action_type", actions.ActionType),
|
||||
slog.F("create_count", actions.Create), slog.F("delete_count", len(actions.DeleteIDs)),
|
||||
slog.F("to_delete", actions.DeleteIDs),
|
||||
slog.F("desired", state.Desired), slog.F("actual", state.Actual),
|
||||
slog.F("extraneous", state.Extraneous), slog.F("starting", state.Starting),
|
||||
slog.F("stopping", state.Stopping), slog.F("deleting", state.Deleting),
|
||||
slog.F("eligible", state.Eligible),
|
||||
}
|
||||
|
||||
levelFn(ctx, "calculated reconciliation actions for preset", fields...)
|
||||
|
||||
switch actions.ActionType {
|
||||
case prebuilds.ActionTypeBackoff:
|
||||
// If there is anything to backoff for (usually a cycle of failed prebuilds), then log and bail out.
|
||||
levelFn(ctx, "template prebuild state retrieved, backing off",
|
||||
append(fields,
|
||||
slog.F("backoff_until", actions.BackoffUntil.Format(time.RFC3339)),
|
||||
slog.F("backoff_secs", math.Round(actions.BackoffUntil.Sub(c.clock.Now()).Seconds())),
|
||||
)...)
|
||||
|
||||
return nil
|
||||
|
||||
case prebuilds.ActionTypeCreate:
|
||||
// Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
|
||||
// See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
|
||||
// This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
|
||||
desired := ps.Preset.DesiredInstances.Int32
|
||||
if actions.Create > desired {
|
||||
logger.Critical(ctx, "determined excessive count of prebuilds to create; clamping to desired count",
|
||||
slog.F("create_count", actions.Create), slog.F("desired_count", desired))
|
||||
|
||||
actions.Create = desired
|
||||
}
|
||||
|
||||
var multiErr multierror.Error
|
||||
|
||||
for range actions.Create {
|
||||
if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
|
||||
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
|
||||
multiErr.Errors = append(multiErr.Errors, err)
|
||||
}
|
||||
}
|
||||
|
||||
return multiErr.ErrorOrNil()
|
||||
|
||||
case prebuilds.ActionTypeDelete:
|
||||
var multiErr multierror.Error
|
||||
|
||||
for _, id := range actions.DeleteIDs {
|
||||
if err := c.deletePrebuiltWorkspace(prebuildsCtx, id, ps.Preset.TemplateID, ps.Preset.ID); err != nil {
|
||||
logger.Error(ctx, "failed to delete prebuild", slog.Error(err))
|
||||
multiErr.Errors = append(multiErr.Errors, err)
|
||||
}
|
||||
}
|
||||
|
||||
return multiErr.ErrorOrNil()
|
||||
|
||||
default:
|
||||
return xerrors.Errorf("unknown action type: %v", actions.ActionType)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) (*prebuilds.ReconciliationActions, error) {
|
||||
if ctx.Err() != nil {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
|
||||
return snapshot.CalculateActions(c.clock, c.cfg.ReconciliationBackoffInterval.Value())
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) WithReconciliationLock(
|
||||
ctx context.Context,
|
||||
logger slog.Logger,
|
||||
fn func(ctx context.Context, db database.Store) error,
|
||||
) error {
|
||||
// This tx holds a global lock, which prevents any other coderd replica from starting a reconciliation and
|
||||
// possibly getting an inconsistent view of the state.
|
||||
//
|
||||
// The lock MUST be held until ALL modifications have been effected.
|
||||
//
|
||||
// It is run with RepeatableRead isolation, so it's effectively snapshotting the data at the start of the tx.
|
||||
//
|
||||
// This is a read-only tx, so returning an error (i.e. causing a rollback) has no impact.
|
||||
return c.store.InTx(func(db database.Store) error {
|
||||
start := c.clock.Now()
|
||||
|
||||
// Try to acquire the lock. If we can't get it, another replica is handling reconciliation.
|
||||
acquired, err := db.TryAcquireLock(ctx, database.LockIDReconcilePrebuilds)
|
||||
if err != nil {
|
||||
// This is a real database error, not just lock contention
|
||||
logger.Error(ctx, "failed to acquire reconciliation lock due to database error", slog.Error(err))
|
||||
return err
|
||||
}
|
||||
if !acquired {
|
||||
// Normal case: another replica has the lock
|
||||
return nil
|
||||
}
|
||||
|
||||
logger.Debug(ctx,
|
||||
"acquired top-level reconciliation lock",
|
||||
slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", c.clock.Since(start).Seconds())),
|
||||
)
|
||||
|
||||
return fn(ctx, db)
|
||||
}, &database.TxOptions{
|
||||
Isolation: sql.LevelRepeatableRead,
|
||||
ReadOnly: true,
|
||||
TxIdentifier: "prebuilds",
|
||||
})
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
|
||||
name, err := prebuilds.GenerateName()
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to generate unique prebuild ID: %w", err)
|
||||
}
|
||||
|
||||
return c.store.InTx(func(db database.Store) error {
|
||||
template, err := db.GetTemplateByID(ctx, templateID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to get template: %w", err)
|
||||
}
|
||||
|
||||
now := c.clock.Now()
|
||||
|
||||
minimumWorkspace, err := db.InsertWorkspace(ctx, database.InsertWorkspaceParams{
|
||||
ID: prebuiltWorkspaceID,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
OwnerID: prebuilds.SystemUserID,
|
||||
OrganizationID: template.OrganizationID,
|
||||
TemplateID: template.ID,
|
||||
Name: name,
|
||||
LastUsedAt: c.clock.Now(),
|
||||
AutomaticUpdates: database.AutomaticUpdatesNever,
|
||||
AutostartSchedule: sql.NullString{},
|
||||
Ttl: sql.NullInt64{},
|
||||
NextStartAt: sql.NullTime{},
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("insert workspace: %w", err)
|
||||
}
|
||||
|
||||
// We have to refetch the workspace for the joined in fields.
|
||||
workspace, err := db.GetWorkspaceByID(ctx, minimumWorkspace.ID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get workspace by ID: %w", err)
|
||||
}
|
||||
|
||||
c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name),
|
||||
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
|
||||
|
||||
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace)
|
||||
}, &database.TxOptions{
|
||||
Isolation: sql.LevelRepeatableRead,
|
||||
ReadOnly: false,
|
||||
})
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
|
||||
return c.store.InTx(func(db database.Store) error {
|
||||
workspace, err := db.GetWorkspaceByID(ctx, prebuiltWorkspaceID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get workspace by ID: %w", err)
|
||||
}
|
||||
|
||||
template, err := db.GetTemplateByID(ctx, templateID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to get template: %w", err)
|
||||
}
|
||||
|
||||
if workspace.OwnerID != prebuilds.SystemUserID {
|
||||
return xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed")
|
||||
}
|
||||
|
||||
c.logger.Info(ctx, "attempting to delete prebuild",
|
||||
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
|
||||
|
||||
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionDelete, workspace)
|
||||
}, &database.TxOptions{
|
||||
Isolation: sql.LevelRepeatableRead,
|
||||
ReadOnly: false,
|
||||
})
|
||||
}
|
||||
|
||||
func (c *StoreReconciler) provision(
|
||||
ctx context.Context,
|
||||
db database.Store,
|
||||
prebuildID uuid.UUID,
|
||||
template database.Template,
|
||||
presetID uuid.UUID,
|
||||
transition database.WorkspaceTransition,
|
||||
workspace database.Workspace,
|
||||
) error {
|
||||
tvp, err := db.GetPresetParametersByTemplateVersionID(ctx, template.ActiveVersionID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("fetch preset details: %w", err)
|
||||
}
|
||||
|
||||
var params []codersdk.WorkspaceBuildParameter
|
||||
for _, param := range tvp {
|
||||
// TODO: don't fetch in the first place.
|
||||
if param.TemplateVersionPresetID != presetID {
|
||||
continue
|
||||
}
|
||||
|
||||
params = append(params, codersdk.WorkspaceBuildParameter{
|
||||
Name: param.Name,
|
||||
Value: param.Value,
|
||||
})
|
||||
}
|
||||
|
||||
builder := wsbuilder.New(workspace, transition).
|
||||
Reason(database.BuildReasonInitiator).
|
||||
Initiator(prebuilds.SystemUserID).
|
||||
VersionID(template.ActiveVersionID).
|
||||
MarkPrebuild().
|
||||
TemplateVersionPresetID(presetID)
|
||||
|
||||
// We only inject the required params when the prebuild is being created.
|
||||
// This mirrors the behavior of regular workspace deletion (see cli/delete.go).
|
||||
if transition != database.WorkspaceTransitionDelete {
|
||||
builder = builder.RichParameterValues(params)
|
||||
}
|
||||
|
||||
_, provisionerJob, _, err := builder.Build(
|
||||
ctx,
|
||||
db,
|
||||
func(_ policy.Action, _ rbac.Objecter) bool {
|
||||
return true // TODO: harden?
|
||||
},
|
||||
audit.WorkspaceBuildBaggage{},
|
||||
)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("provision workspace: %w", err)
|
||||
}
|
||||
|
||||
err = provisionerjobs.PostJob(c.pubsub, *provisionerJob)
|
||||
if err != nil {
|
||||
// Client probably doesn't care about this error, so just log it.
|
||||
c.logger.Error(ctx, "failed to post provisioner job to pubsub", slog.Error(err))
|
||||
}
|
||||
|
||||
c.logger.Info(ctx, "prebuild job scheduled", slog.F("transition", transition),
|
||||
slog.F("prebuild_id", prebuildID.String()), slog.F("preset_id", presetID.String()),
|
||||
slog.F("job_id", provisionerJob.ID))
|
||||
|
||||
return nil
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
Generated
+7
@@ -1695,6 +1695,13 @@ export interface PprofConfig {
|
||||
readonly address: string;
|
||||
}
|
||||
|
||||
// From codersdk/deployment.go
|
||||
export interface PrebuildsConfig {
|
||||
readonly reconciliation_interval: number;
|
||||
readonly reconciliation_backoff_interval: number;
|
||||
readonly reconciliation_backoff_lookback: number;
|
||||
}
|
||||
|
||||
// From codersdk/presets.go
|
||||
export interface Preset {
|
||||
readonly ID: string;
|
||||
|
||||
Reference in New Issue
Block a user