feat: implement reconciliation loop (#17261)

Closes https://github.com/coder/internal/issues/510

<details>
<summary> Refactoring Summary </summary>

### 1) `CalculateActions` Function

#### Issues Before Refactoring:

- Large function (~150 lines), making it difficult to read and maintain.
- The control flow is hard to follow due to complex conditional logic.
- The `ReconciliationActions` struct was partially initialized early,
then mutated in multiple places, making the flow error-prone.

Original source:  

https://github.com/coder/coder/blob/fe60b569ad754245e28bac71e0ef3c83536631bb/coderd/prebuilds/state.go#L13-L167

#### Improvements After Refactoring:

- Simplified and broken down into smaller, focused helper methods.
- The flow of the function is now more linear and easier to understand.
- Struct initialization is cleaner, avoiding partial and incremental
mutations.

Refactored function:  

https://github.com/coder/coder/blob/eeb0407d783cdda71ec2418c113f325542c47b1c/coderd/prebuilds/state.go#L67-L84

---

### 2) `ReconciliationActions` Struct

#### Issues Before Refactoring:

- The struct mixed both actionable decisions and diagnostic state, which
blurred its purpose.
- It was unclear which fields were necessary for reconciliation logic,
and which were purely for logging/observability.

#### Improvements After Refactoring:

- Split into two clear, purpose-specific structs:
- **`ReconciliationActions`** — defines the intended reconciliation
action.
- **`ReconciliationState`** — captures runtime state and metadata,
primarily for logging and diagnostics.

Original struct:  

https://github.com/coder/coder/blob/fe60b569ad754245e28bac71e0ef3c83536631bb/coderd/prebuilds/reconcile.go#L29-L41

</details>

---------

Signed-off-by: Danny Kopping <dannykopping@gmail.com>
Co-authored-by: Sas Swart <sas.swart.cdk@gmail.com>
Co-authored-by: Danny Kopping <dannykopping@gmail.com>
Co-authored-by: Dean Sheather <dean@deansheather.com>
Co-authored-by: Spike Curtis <spike@coder.com>
Co-authored-by: Danny Kopping <danny@coder.com>
This commit is contained in:
Yevhenii Shcherbina
2025-04-17 09:29:29 -04:00
committed by GitHub
parent 6a79965948
commit 27bc60d1b9
16 changed files with 2834 additions and 9 deletions
+1 -2
View File
@@ -12,8 +12,7 @@ const (
LockIDDBPurge
LockIDNotificationsReportGenerator
LockIDCryptoKeyRotation
LockIDReconcileTemplatePrebuilds
LockIDDeterminePrebuildsState
LockIDReconcilePrebuilds
)
// GenLockID generates a unique and consistent lock ID from a given string.
+1 -1
View File
@@ -64,7 +64,7 @@ type sqlcQuerier interface {
CleanTailnetCoordinators(ctx context.Context) error
CleanTailnetLostPeers(ctx context.Context) error
CleanTailnetTunnels(ctx context.Context) error
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition.
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
// Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state.
CountInProgressPrebuilds(ctx context.Context) ([]CountInProgressPrebuildsRow, error)
CountUnreadInboxNotificationsByUserID(ctx context.Context, userID uuid.UUID) (int64, error)
+5 -3
View File
@@ -5938,7 +5938,7 @@ func (q *sqlQuerier) ClaimPrebuiltWorkspace(ctx context.Context, arg ClaimPrebui
}
const countInProgressPrebuilds = `-- name: CountInProgressPrebuilds :many
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count, wlb.template_version_preset_id as preset_id
FROM workspace_latest_builds wlb
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
-- We only need these counts for active template versions.
@@ -5949,7 +5949,7 @@ FROM workspace_latest_builds wlb
-- prebuilds that are still building.
INNER JOIN templates t ON t.active_version_id = wlb.template_version_id
WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status)
GROUP BY t.id, wpb.template_version_id, wpb.transition
GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id
`
type CountInProgressPrebuildsRow struct {
@@ -5957,9 +5957,10 @@ type CountInProgressPrebuildsRow struct {
TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"`
Transition WorkspaceTransition `db:"transition" json:"transition"`
Count int32 `db:"count" json:"count"`
PresetID uuid.NullUUID `db:"preset_id" json:"preset_id"`
}
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition.
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
// Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state.
func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInProgressPrebuildsRow, error) {
rows, err := q.db.QueryContext(ctx, countInProgressPrebuilds)
@@ -5975,6 +5976,7 @@ func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInPro
&i.TemplateVersionID,
&i.Transition,
&i.Count,
&i.PresetID,
); err != nil {
return nil, err
}
+3 -3
View File
@@ -57,9 +57,9 @@ WHERE (b.transition = 'start'::workspace_transition
AND b.job_status = 'succeeded'::provisioner_job_status);
-- name: CountInProgressPrebuilds :many
-- CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition.
-- CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
-- Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state.
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count, wlb.template_version_preset_id as preset_id
FROM workspace_latest_builds wlb
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
-- We only need these counts for active template versions.
@@ -70,7 +70,7 @@ FROM workspace_latest_builds wlb
-- prebuilds that are still building.
INNER JOIN templates t ON t.active_version_id = wlb.template_version_id
WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status)
GROUP BY t.id, wpb.template_version_id, wpb.transition;
GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id;
-- GetPresetsBackoff groups workspace builds by preset ID.
-- Each preset is associated with exactly one template version ID.
+27
View File
@@ -0,0 +1,27 @@
package prebuilds
import (
"context"
)
// ReconciliationOrchestrator manages the lifecycle of prebuild reconciliation.
// It runs a continuous loop to check and reconcile prebuild states, and can be stopped gracefully.
type ReconciliationOrchestrator interface {
Reconciler
// RunLoop starts a continuous reconciliation loop that periodically calls ReconcileAll
// to ensure all prebuilds are in their desired states. The loop runs until the context
// is canceled or Stop is called.
RunLoop(ctx context.Context)
// Stop gracefully shuts down the orchestrator with the given cause.
// The cause is used for logging and error reporting.
Stop(ctx context.Context, cause error)
}
type Reconciler interface {
// ReconcileAll orchestrates the reconciliation of all prebuilds across all templates.
// It takes a global snapshot of the system state and then reconciles each preset
// in parallel, creating or deleting prebuilds as needed to reach their desired states.
ReconcileAll(ctx context.Context) error
}
+66
View File
@@ -0,0 +1,66 @@
package prebuilds
import (
"github.com/google/uuid"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/util/slice"
)
// GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates.
type GlobalSnapshot struct {
Presets []database.GetTemplatePresetsWithPrebuildsRow
RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow
PrebuildsInProgress []database.CountInProgressPrebuildsRow
Backoffs []database.GetPresetsBackoffRow
}
func NewGlobalSnapshot(
presets []database.GetTemplatePresetsWithPrebuildsRow,
runningPrebuilds []database.GetRunningPrebuiltWorkspacesRow,
prebuildsInProgress []database.CountInProgressPrebuildsRow,
backoffs []database.GetPresetsBackoffRow,
) GlobalSnapshot {
return GlobalSnapshot{
Presets: presets,
RunningPrebuilds: runningPrebuilds,
PrebuildsInProgress: prebuildsInProgress,
Backoffs: backoffs,
}
}
func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, error) {
preset, found := slice.Find(s.Presets, func(preset database.GetTemplatePresetsWithPrebuildsRow) bool {
return preset.ID == presetID
})
if !found {
return nil, xerrors.Errorf("no preset found with ID %q", presetID)
}
running := slice.Filter(s.RunningPrebuilds, func(prebuild database.GetRunningPrebuiltWorkspacesRow) bool {
if !prebuild.CurrentPresetID.Valid {
return false
}
return prebuild.CurrentPresetID.UUID == preset.ID
})
inProgress := slice.Filter(s.PrebuildsInProgress, func(prebuild database.CountInProgressPrebuildsRow) bool {
return prebuild.PresetID.UUID == preset.ID
})
var backoffPtr *database.GetPresetsBackoffRow
backoff, found := slice.Find(s.Backoffs, func(row database.GetPresetsBackoffRow) bool {
return row.PresetID == preset.ID
})
if found {
backoffPtr = &backoff
}
return &PresetSnapshot{
Preset: preset,
Running: running,
InProgress: inProgress,
Backoff: backoffPtr,
}, nil
}
+35
View File
@@ -0,0 +1,35 @@
package prebuilds
import (
"context"
"github.com/coder/coder/v2/coderd/database"
)
type NoopReconciler struct{}
func NewNoopReconciler() *NoopReconciler {
return &NoopReconciler{}
}
func (NoopReconciler) RunLoop(context.Context) {}
func (NoopReconciler) Stop(context.Context, error) {}
func (NoopReconciler) ReconcileAll(context.Context) error {
return nil
}
func (NoopReconciler) SnapshotState(context.Context, database.Store) (*GlobalSnapshot, error) {
return &GlobalSnapshot{}, nil
}
func (NoopReconciler) ReconcilePreset(context.Context, PresetSnapshot) error {
return nil
}
func (NoopReconciler) CalculateActions(context.Context, PresetSnapshot) (*ReconciliationActions, error) {
return &ReconciliationActions{}, nil
}
var _ ReconciliationOrchestrator = NoopReconciler{}
+254
View File
@@ -0,0 +1,254 @@
package prebuilds
import (
"slices"
"time"
"github.com/google/uuid"
"github.com/coder/quartz"
"github.com/coder/coder/v2/coderd/database"
)
// ActionType represents the type of action needed to reconcile prebuilds.
type ActionType int
const (
// ActionTypeUndefined represents an uninitialized or invalid action type.
ActionTypeUndefined ActionType = iota
// ActionTypeCreate indicates that new prebuilds should be created.
ActionTypeCreate
// ActionTypeDelete indicates that existing prebuilds should be deleted.
ActionTypeDelete
// ActionTypeBackoff indicates that prebuild creation should be delayed.
ActionTypeBackoff
)
// PresetSnapshot is a filtered view of GlobalSnapshot focused on a single preset.
// It contains the raw data needed to calculate the current state of a preset's prebuilds,
// including running prebuilds, in-progress builds, and backoff information.
type PresetSnapshot struct {
Preset database.GetTemplatePresetsWithPrebuildsRow
Running []database.GetRunningPrebuiltWorkspacesRow
InProgress []database.CountInProgressPrebuildsRow
Backoff *database.GetPresetsBackoffRow
}
// ReconciliationState represents the processed state of a preset's prebuilds,
// calculated from a PresetSnapshot. While PresetSnapshot contains raw data,
// ReconciliationState contains derived metrics that are directly used to
// determine what actions are needed (create, delete, or backoff).
// For example, it calculates how many prebuilds are eligible, how many are
// extraneous, and how many are in various transition states.
type ReconciliationState struct {
Actual int32 // Number of currently running prebuilds
Desired int32 // Number of prebuilds desired as defined in the preset
Eligible int32 // Number of prebuilds that are ready to be claimed
Extraneous int32 // Number of extra running prebuilds beyond the desired count
// Counts of prebuilds in various transition states
Starting int32
Stopping int32
Deleting int32
}
// ReconciliationActions represents actions needed to reconcile the current state with the desired state.
// Based on ActionType, exactly one of Create, DeleteIDs, or BackoffUntil will be set.
type ReconciliationActions struct {
// ActionType determines which field is set and what action should be taken
ActionType ActionType
// Create is set when ActionType is ActionTypeCreate and indicates the number of prebuilds to create
Create int32
// DeleteIDs is set when ActionType is ActionTypeDelete and contains the IDs of prebuilds to delete
DeleteIDs []uuid.UUID
// BackoffUntil is set when ActionType is ActionTypeBackoff and indicates when to retry creating prebuilds
BackoffUntil time.Time
}
// CalculateState computes the current state of prebuilds for a preset, including:
// - Actual: Number of currently running prebuilds
// - Desired: Number of prebuilds desired as defined in the preset
// - Eligible: Number of prebuilds that are ready to be claimed
// - Extraneous: Number of extra running prebuilds beyond the desired count
// - Starting/Stopping/Deleting: Counts of prebuilds in various transition states
//
// The function takes into account whether the preset is active (using the active template version)
// and calculates appropriate counts based on the current state of running prebuilds and
// in-progress transitions. This state information is used to determine what reconciliation
// actions are needed to reach the desired state.
func (p PresetSnapshot) CalculateState() *ReconciliationState {
var (
actual int32
desired int32
eligible int32
extraneous int32
)
if p.isActive() {
// #nosec G115 - Safe conversion as p.Running slice length is expected to be within int32 range
actual = int32(len(p.Running))
desired = p.Preset.DesiredInstances.Int32
eligible = p.countEligible()
extraneous = max(actual-desired, 0)
}
starting, stopping, deleting := p.countInProgress()
return &ReconciliationState{
Actual: actual,
Desired: desired,
Eligible: eligible,
Extraneous: extraneous,
Starting: starting,
Stopping: stopping,
Deleting: deleting,
}
}
// CalculateActions determines what actions are needed to reconcile the current state with the desired state.
// The function:
// 1. First checks if a backoff period is needed (if previous builds failed)
// 2. If the preset is inactive (template version is not active), it will delete all running prebuilds
// 3. For active presets, it calculates the number of prebuilds to create or delete based on:
// - The desired number of instances
// - Currently running prebuilds
// - Prebuilds in transition states (starting/stopping/deleting)
// - Any extraneous prebuilds that need to be removed
//
// The function returns a ReconciliationActions struct that will have exactly one action type set:
// - ActionTypeBackoff: Only BackoffUntil is set, indicating when to retry
// - ActionTypeCreate: Only Create is set, indicating how many prebuilds to create
// - ActionTypeDelete: Only DeleteIDs is set, containing IDs of prebuilds to delete
func (p PresetSnapshot) CalculateActions(clock quartz.Clock, backoffInterval time.Duration) (*ReconciliationActions, error) {
// TODO: align workspace states with how we represent them on the FE and the CLI
// right now there's some slight differences which can lead to additional prebuilds being created
// TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is
// about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races!
actions, needsBackoff := p.needsBackoffPeriod(clock, backoffInterval)
if needsBackoff {
return actions, nil
}
if !p.isActive() {
return p.handleInactiveTemplateVersion()
}
return p.handleActiveTemplateVersion()
}
// isActive returns true if the preset's template version is the active version, and it is neither deleted nor deprecated.
// This determines whether we should maintain prebuilds for this preset or delete them.
func (p PresetSnapshot) isActive() bool {
return p.Preset.UsingActiveVersion && !p.Preset.Deleted && !p.Preset.Deprecated
}
// handleActiveTemplateVersion deletes excess prebuilds if there are too many,
// otherwise creates new ones to reach the desired count.
func (p PresetSnapshot) handleActiveTemplateVersion() (*ReconciliationActions, error) {
state := p.CalculateState()
// If we have more prebuilds than desired, delete the oldest ones
if state.Extraneous > 0 {
return &ReconciliationActions{
ActionType: ActionTypeDelete,
DeleteIDs: p.getOldestPrebuildIDs(int(state.Extraneous)),
}, nil
}
// Calculate how many new prebuilds we need to create
// We subtract starting prebuilds since they're already being created
prebuildsToCreate := max(state.Desired-state.Actual-state.Starting, 0)
return &ReconciliationActions{
ActionType: ActionTypeCreate,
Create: prebuildsToCreate,
}, nil
}
// handleInactiveTemplateVersion deletes all running prebuilds except those already being deleted
// to avoid duplicate deletion attempts.
func (p PresetSnapshot) handleInactiveTemplateVersion() (*ReconciliationActions, error) {
prebuildsToDelete := len(p.Running)
deleteIDs := p.getOldestPrebuildIDs(prebuildsToDelete)
return &ReconciliationActions{
ActionType: ActionTypeDelete,
DeleteIDs: deleteIDs,
}, nil
}
// needsBackoffPeriod checks if we should delay prebuild creation due to recent failures.
// If there were failures, it calculates a backoff period based on the number of failures
// and returns true if we're still within that period.
func (p PresetSnapshot) needsBackoffPeriod(clock quartz.Clock, backoffInterval time.Duration) (*ReconciliationActions, bool) {
if p.Backoff == nil || p.Backoff.NumFailed == 0 {
return nil, false
}
backoffUntil := p.Backoff.LastBuildAt.Add(time.Duration(p.Backoff.NumFailed) * backoffInterval)
if clock.Now().After(backoffUntil) {
return nil, false
}
return &ReconciliationActions{
ActionType: ActionTypeBackoff,
BackoffUntil: backoffUntil,
}, true
}
// countEligible returns the number of prebuilds that are ready to be claimed.
// A prebuild is eligible if it's running and its agents are in ready state.
func (p PresetSnapshot) countEligible() int32 {
var count int32
for _, prebuild := range p.Running {
if prebuild.Ready {
count++
}
}
return count
}
// countInProgress returns counts of prebuilds in transition states (starting, stopping, deleting).
// These counts are tracked at the template level, so all presets sharing the same template see the same values.
func (p PresetSnapshot) countInProgress() (starting int32, stopping int32, deleting int32) {
for _, progress := range p.InProgress {
num := progress.Count
switch progress.Transition {
case database.WorkspaceTransitionStart:
starting += num
case database.WorkspaceTransitionStop:
stopping += num
case database.WorkspaceTransitionDelete:
deleting += num
}
}
return starting, stopping, deleting
}
// getOldestPrebuildIDs returns the IDs of the N oldest prebuilds, sorted by creation time.
// This is used when we need to delete prebuilds, ensuring we remove the oldest ones first.
func (p PresetSnapshot) getOldestPrebuildIDs(n int) []uuid.UUID {
// Sort by creation time, oldest first
slices.SortFunc(p.Running, func(a, b database.GetRunningPrebuiltWorkspacesRow) int {
return a.CreatedAt.Compare(b.CreatedAt)
})
// Take the first N IDs
n = min(n, len(p.Running))
ids := make([]uuid.UUID, n)
for i := 0; i < n; i++ {
ids[i] = p.Running[i].ID
}
return ids
}
+758
View File
@@ -0,0 +1,758 @@
package prebuilds_test
import (
"database/sql"
"fmt"
"testing"
"time"
"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/coder/quartz"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/prebuilds"
)
type options struct {
templateID uuid.UUID
templateVersionID uuid.UUID
presetID uuid.UUID
presetName string
prebuiltWorkspaceID uuid.UUID
workspaceName string
}
// templateID is common across all option sets.
var templateID = uuid.UUID{1}
const (
backoffInterval = time.Second * 5
optionSet0 = iota
optionSet1
optionSet2
)
var opts = map[uint]options{
optionSet0: {
templateID: templateID,
templateVersionID: uuid.UUID{11},
presetID: uuid.UUID{12},
presetName: "my-preset",
prebuiltWorkspaceID: uuid.UUID{13},
workspaceName: "prebuilds0",
},
optionSet1: {
templateID: templateID,
templateVersionID: uuid.UUID{21},
presetID: uuid.UUID{22},
presetName: "my-preset",
prebuiltWorkspaceID: uuid.UUID{23},
workspaceName: "prebuilds1",
},
optionSet2: {
templateID: templateID,
templateVersionID: uuid.UUID{31},
presetID: uuid.UUID{32},
presetName: "my-preset",
prebuiltWorkspaceID: uuid.UUID{33},
workspaceName: "prebuilds2",
},
}
// A new template version with a preset without prebuilds configured should result in no prebuilds being created.
func TestNoPrebuilds(t *testing.T) {
t.Parallel()
current := opts[optionSet0]
clock := quartz.NewMock(t)
presets := []database.GetTemplatePresetsWithPrebuildsRow{
preset(true, 0, current),
}
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil)
ps, err := snapshot.FilterByPreset(current.presetID)
require.NoError(t, err)
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{ /*all zero values*/ }, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 0,
}, *actions)
}
// A new template version with a preset with prebuilds configured should result in a new prebuild being created.
func TestNetNew(t *testing.T) {
t.Parallel()
current := opts[optionSet0]
clock := quartz.NewMock(t)
presets := []database.GetTemplatePresetsWithPrebuildsRow{
preset(true, 1, current),
}
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil)
ps, err := snapshot.FilterByPreset(current.presetID)
require.NoError(t, err)
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{
Desired: 1,
}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 1,
}, *actions)
}
// A new template version is created with a preset with prebuilds configured; this outdates the older version and
// requires the old prebuilds to be destroyed and new prebuilds to be created.
func TestOutdatedPrebuilds(t *testing.T) {
t.Parallel()
outdated := opts[optionSet0]
current := opts[optionSet1]
clock := quartz.NewMock(t)
// GIVEN: 2 presets, one outdated and one new.
presets := []database.GetTemplatePresetsWithPrebuildsRow{
preset(false, 1, outdated),
preset(true, 1, current),
}
// GIVEN: a running prebuild for the outdated preset.
running := []database.GetRunningPrebuiltWorkspacesRow{
prebuiltWorkspace(outdated, clock),
}
// GIVEN: no in-progress builds.
var inProgress []database.CountInProgressPrebuildsRow
// WHEN: calculating the outdated preset's state.
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
ps, err := snapshot.FilterByPreset(outdated.presetID)
require.NoError(t, err)
// THEN: we should identify that this prebuild is outdated and needs to be deleted.
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeDelete,
DeleteIDs: []uuid.UUID{outdated.prebuiltWorkspaceID},
}, *actions)
// WHEN: calculating the current preset's state.
ps, err = snapshot.FilterByPreset(current.presetID)
require.NoError(t, err)
// THEN: we should not be blocked from creating a new prebuild while the outdate one deletes.
state = ps.CalculateState()
actions, err = ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{Desired: 1}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 1,
}, *actions)
}
// Make sure that outdated prebuild will be deleted, even if deletion of another outdated prebuild is already in progress.
func TestDeleteOutdatedPrebuilds(t *testing.T) {
t.Parallel()
outdated := opts[optionSet0]
clock := quartz.NewMock(t)
// GIVEN: 1 outdated preset.
presets := []database.GetTemplatePresetsWithPrebuildsRow{
preset(false, 1, outdated),
}
// GIVEN: one running prebuild for the outdated preset.
running := []database.GetRunningPrebuiltWorkspacesRow{
prebuiltWorkspace(outdated, clock),
}
// GIVEN: one deleting prebuild for the outdated preset.
inProgress := []database.CountInProgressPrebuildsRow{
{
TemplateID: outdated.templateID,
TemplateVersionID: outdated.templateVersionID,
Transition: database.WorkspaceTransitionDelete,
Count: 1,
PresetID: uuid.NullUUID{
UUID: outdated.presetID,
Valid: true,
},
},
}
// WHEN: calculating the outdated preset's state.
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
ps, err := snapshot.FilterByPreset(outdated.presetID)
require.NoError(t, err)
// THEN: we should identify that this prebuild is outdated and needs to be deleted.
// Despite the fact that deletion of another outdated prebuild is already in progress.
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{
Deleting: 1,
}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeDelete,
DeleteIDs: []uuid.UUID{outdated.prebuiltWorkspaceID},
}, *actions)
}
// A new template version is created with a preset with prebuilds configured; while a prebuild is provisioning up or down,
// the calculated actions should indicate the state correctly.
func TestInProgressActions(t *testing.T) {
t.Parallel()
current := opts[optionSet0]
clock := quartz.NewMock(t)
cases := []struct {
name string
transition database.WorkspaceTransition
desired int32
running int32
inProgress int32
checkFn func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions)
}{
// With no running prebuilds and one starting, no creations/deletions should take place.
{
name: fmt.Sprintf("%s-short", database.WorkspaceTransitionStart),
transition: database.WorkspaceTransitionStart,
desired: 1,
running: 0,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Desired: 1, Starting: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
}, actions)
},
},
// With one running prebuild and one starting, no creations/deletions should occur since we're approaching the correct state.
{
name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionStart),
transition: database.WorkspaceTransitionStart,
desired: 2,
running: 1,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 2, Starting: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
}, actions)
},
},
// With one running prebuild and one starting, no creations/deletions should occur
// SIDE-NOTE: once the starting prebuild completes, the older of the two will be considered extraneous since we only desire 2.
{
name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionStart),
transition: database.WorkspaceTransitionStart,
desired: 2,
running: 2,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 2, Starting: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
}, actions)
},
},
// With one prebuild desired and one stopping, a new prebuild will be created.
{
name: fmt.Sprintf("%s-short", database.WorkspaceTransitionStop),
transition: database.WorkspaceTransitionStop,
desired: 1,
running: 0,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Desired: 1, Stopping: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 1,
}, actions)
},
},
// With 3 prebuilds desired, 2 running, and 1 stopping, a new prebuild will be created.
{
name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionStop),
transition: database.WorkspaceTransitionStop,
desired: 3,
running: 2,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 3, Stopping: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 1,
}, actions)
},
},
// With 3 prebuilds desired, 3 running, and 1 stopping, no creations/deletions should occur since the desired state is already achieved.
{
name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionStop),
transition: database.WorkspaceTransitionStop,
desired: 3,
running: 3,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Actual: 3, Desired: 3, Stopping: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
}, actions)
},
},
// With one prebuild desired and one deleting, a new prebuild will be created.
{
name: fmt.Sprintf("%s-short", database.WorkspaceTransitionDelete),
transition: database.WorkspaceTransitionDelete,
desired: 1,
running: 0,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Desired: 1, Deleting: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 1,
}, actions)
},
},
// With 2 prebuilds desired, 1 running, and 1 deleting, a new prebuild will be created.
{
name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionDelete),
transition: database.WorkspaceTransitionDelete,
desired: 2,
running: 1,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 2, Deleting: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 1,
}, actions)
},
},
// With 2 prebuilds desired, 2 running, and 1 deleting, no creations/deletions should occur since the desired state is already achieved.
{
name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionDelete),
transition: database.WorkspaceTransitionDelete,
desired: 2,
running: 2,
inProgress: 1,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 2, Deleting: 1}, state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
}, actions)
},
},
// With 3 prebuilds desired, 1 running, and 2 starting, no creations should occur since the builds are in progress.
{
name: fmt.Sprintf("%s-inhibit", database.WorkspaceTransitionStart),
transition: database.WorkspaceTransitionStart,
desired: 3,
running: 1,
inProgress: 2,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 3, Starting: 2}, state)
validateActions(t, prebuilds.ReconciliationActions{ActionType: prebuilds.ActionTypeCreate, Create: 0}, actions)
},
},
// With 3 prebuilds desired, 5 running, and 2 deleting, no deletions should occur since the builds are in progress.
{
name: fmt.Sprintf("%s-inhibit", database.WorkspaceTransitionDelete),
transition: database.WorkspaceTransitionDelete,
desired: 3,
running: 5,
inProgress: 2,
checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) {
expectedState := prebuilds.ReconciliationState{Actual: 5, Desired: 3, Deleting: 2, Extraneous: 2}
expectedActions := prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeDelete,
}
validateState(t, expectedState, state)
assert.EqualValuesf(t, expectedActions.ActionType, actions.ActionType, "'ActionType' did not match expectation")
assert.Len(t, actions.DeleteIDs, 2, "'deleteIDs' did not match expectation")
assert.EqualValuesf(t, expectedActions.Create, actions.Create, "'create' did not match expectation")
assert.EqualValuesf(t, expectedActions.BackoffUntil, actions.BackoffUntil, "'BackoffUntil' did not match expectation")
},
},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
// GIVEN: a preset.
defaultPreset := preset(true, tc.desired, current)
presets := []database.GetTemplatePresetsWithPrebuildsRow{
defaultPreset,
}
// GIVEN: running prebuilt workspaces for the preset.
running := make([]database.GetRunningPrebuiltWorkspacesRow, 0, tc.running)
for range tc.running {
name, err := prebuilds.GenerateName()
require.NoError(t, err)
running = append(running, database.GetRunningPrebuiltWorkspacesRow{
ID: uuid.New(),
Name: name,
TemplateID: current.templateID,
TemplateVersionID: current.templateVersionID,
CurrentPresetID: uuid.NullUUID{UUID: current.presetID, Valid: true},
Ready: false,
CreatedAt: clock.Now(),
})
}
// GIVEN: some prebuilds for the preset which are currently transitioning.
inProgress := []database.CountInProgressPrebuildsRow{
{
TemplateID: current.templateID,
TemplateVersionID: current.templateVersionID,
Transition: tc.transition,
Count: tc.inProgress,
PresetID: uuid.NullUUID{
UUID: defaultPreset.ID,
Valid: true,
},
},
}
// WHEN: calculating the current preset's state.
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
ps, err := snapshot.FilterByPreset(current.presetID)
require.NoError(t, err)
// THEN: we should identify that this prebuild is in progress.
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
tc.checkFn(*state, *actions)
})
}
}
// Additional prebuilds exist for a given preset configuration; these must be deleted.
func TestExtraneous(t *testing.T) {
t.Parallel()
current := opts[optionSet0]
clock := quartz.NewMock(t)
// GIVEN: a preset with 1 desired prebuild.
presets := []database.GetTemplatePresetsWithPrebuildsRow{
preset(true, 1, current),
}
var older uuid.UUID
// GIVEN: 2 running prebuilds for the preset.
running := []database.GetRunningPrebuiltWorkspacesRow{
prebuiltWorkspace(current, clock, func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow {
// The older of the running prebuilds will be deleted in order to maintain freshness.
row.CreatedAt = clock.Now().Add(-time.Hour)
older = row.ID
return row
}),
prebuiltWorkspace(current, clock, func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow {
row.CreatedAt = clock.Now()
return row
}),
}
// GIVEN: NO prebuilds in progress.
var inProgress []database.CountInProgressPrebuildsRow
// WHEN: calculating the current preset's state.
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
ps, err := snapshot.FilterByPreset(current.presetID)
require.NoError(t, err)
// THEN: an extraneous prebuild is detected and marked for deletion.
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{
Actual: 2, Desired: 1, Extraneous: 1, Eligible: 2,
}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeDelete,
DeleteIDs: []uuid.UUID{older},
}, *actions)
}
// A template marked as deprecated will not have prebuilds running.
func TestDeprecated(t *testing.T) {
t.Parallel()
current := opts[optionSet0]
clock := quartz.NewMock(t)
// GIVEN: a preset with 1 desired prebuild.
presets := []database.GetTemplatePresetsWithPrebuildsRow{
preset(true, 1, current, func(row database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow {
row.Deprecated = true
return row
}),
}
// GIVEN: 1 running prebuilds for the preset.
running := []database.GetRunningPrebuiltWorkspacesRow{
prebuiltWorkspace(current, clock),
}
// GIVEN: NO prebuilds in progress.
var inProgress []database.CountInProgressPrebuildsRow
// WHEN: calculating the current preset's state.
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil)
ps, err := snapshot.FilterByPreset(current.presetID)
require.NoError(t, err)
// THEN: all running prebuilds should be deleted because the template is deprecated.
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeDelete,
DeleteIDs: []uuid.UUID{current.prebuiltWorkspaceID},
}, *actions)
}
// If the latest build failed, backoff exponentially with the given interval.
func TestLatestBuildFailed(t *testing.T) {
t.Parallel()
current := opts[optionSet0]
other := opts[optionSet1]
clock := quartz.NewMock(t)
// GIVEN: two presets.
presets := []database.GetTemplatePresetsWithPrebuildsRow{
preset(true, 1, current),
preset(true, 1, other),
}
// GIVEN: running prebuilds only for one preset (the other will be failing, as evidenced by the backoffs below).
running := []database.GetRunningPrebuiltWorkspacesRow{
prebuiltWorkspace(other, clock),
}
// GIVEN: NO prebuilds in progress.
var inProgress []database.CountInProgressPrebuildsRow
// GIVEN: a backoff entry.
lastBuildTime := clock.Now()
numFailed := 1
backoffs := []database.GetPresetsBackoffRow{
{
TemplateVersionID: current.templateVersionID,
PresetID: current.presetID,
NumFailed: int32(numFailed),
LastBuildAt: lastBuildTime,
},
}
// WHEN: calculating the current preset's state.
snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, backoffs)
psCurrent, err := snapshot.FilterByPreset(current.presetID)
require.NoError(t, err)
// THEN: reconciliation should backoff.
state := psCurrent.CalculateState()
actions, err := psCurrent.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{
Actual: 0, Desired: 1,
}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeBackoff,
BackoffUntil: lastBuildTime.Add(time.Duration(numFailed) * backoffInterval),
}, *actions)
// WHEN: calculating the other preset's state.
psOther, err := snapshot.FilterByPreset(other.presetID)
require.NoError(t, err)
// THEN: it should NOT be in backoff because all is OK.
state = psOther.CalculateState()
actions, err = psOther.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{
Actual: 1, Desired: 1, Eligible: 1,
}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
BackoffUntil: time.Time{},
}, *actions)
// WHEN: the clock is advanced a backoff interval.
clock.Advance(backoffInterval + time.Microsecond)
// THEN: a new prebuild should be created.
psCurrent, err = snapshot.FilterByPreset(current.presetID)
require.NoError(t, err)
state = psCurrent.CalculateState()
actions, err = psCurrent.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{
Actual: 0, Desired: 1,
}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 1, // <--- NOTE: we're now able to create a new prebuild because the interval has elapsed.
}, *actions)
}
func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
t.Parallel()
templateID := uuid.New()
templateVersionID := uuid.New()
presetOpts1 := options{
templateID: templateID,
templateVersionID: templateVersionID,
presetID: uuid.New(),
presetName: "my-preset-1",
prebuiltWorkspaceID: uuid.New(),
workspaceName: "prebuilds1",
}
presetOpts2 := options{
templateID: templateID,
templateVersionID: templateVersionID,
presetID: uuid.New(),
presetName: "my-preset-2",
prebuiltWorkspaceID: uuid.New(),
workspaceName: "prebuilds2",
}
clock := quartz.NewMock(t)
presets := []database.GetTemplatePresetsWithPrebuildsRow{
preset(true, 1, presetOpts1),
preset(true, 1, presetOpts2),
}
inProgress := []database.CountInProgressPrebuildsRow{
{
TemplateID: templateID,
TemplateVersionID: templateVersionID,
Transition: database.WorkspaceTransitionStart,
Count: 1,
PresetID: uuid.NullUUID{
UUID: presetOpts1.presetID,
Valid: true,
},
},
}
snapshot := prebuilds.NewGlobalSnapshot(presets, nil, inProgress, nil)
// Nothing has to be created for preset 1.
{
ps, err := snapshot.FilterByPreset(presetOpts1.presetID)
require.NoError(t, err)
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{
Starting: 1,
Desired: 1,
}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 0,
}, *actions)
}
// One prebuild has to be created for preset 2. Make sure preset 1 doesn't block preset 2.
{
ps, err := snapshot.FilterByPreset(presetOpts2.presetID)
require.NoError(t, err)
state := ps.CalculateState()
actions, err := ps.CalculateActions(clock, backoffInterval)
require.NoError(t, err)
validateState(t, prebuilds.ReconciliationState{
Starting: 0,
Desired: 1,
}, *state)
validateActions(t, prebuilds.ReconciliationActions{
ActionType: prebuilds.ActionTypeCreate,
Create: 1,
}, *actions)
}
}
func preset(active bool, instances int32, opts options, muts ...func(row database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow {
entry := database.GetTemplatePresetsWithPrebuildsRow{
TemplateID: opts.templateID,
TemplateVersionID: opts.templateVersionID,
ID: opts.presetID,
UsingActiveVersion: active,
Name: opts.presetName,
DesiredInstances: sql.NullInt32{
Valid: true,
Int32: instances,
},
Deleted: false,
Deprecated: false,
}
for _, mut := range muts {
entry = mut(entry)
}
return entry
}
func prebuiltWorkspace(
opts options,
clock quartz.Clock,
muts ...func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow,
) database.GetRunningPrebuiltWorkspacesRow {
entry := database.GetRunningPrebuiltWorkspacesRow{
ID: opts.prebuiltWorkspaceID,
Name: opts.workspaceName,
TemplateID: opts.templateID,
TemplateVersionID: opts.templateVersionID,
CurrentPresetID: uuid.NullUUID{UUID: opts.presetID, Valid: true},
Ready: true,
CreatedAt: clock.Now(),
}
for _, mut := range muts {
entry = mut(entry)
}
return entry
}
func validateState(t *testing.T, expected, actual prebuilds.ReconciliationState) {
require.Equal(t, expected, actual)
}
// validateActions is a convenience func to make tests more readable; it exploits the fact that the default states for
// prebuilds align with zero values.
func validateActions(t *testing.T, expected, actual prebuilds.ReconciliationActions) {
require.Equal(t, expected, actual)
}
+26
View File
@@ -0,0 +1,26 @@
package prebuilds
import (
"crypto/rand"
"encoding/base32"
"fmt"
"strings"
)
// GenerateName generates a 20-byte prebuild name which should safe to use without truncation in most situations.
// UUIDs may be too long for a resource name in cloud providers (since this ID will be used in the prebuild's name).
//
// We're generating a 9-byte suffix (72 bits of entropy):
// 1 - e^(-1e9^2 / (2 * 2^72)) = ~0.01% likelihood of collision in 1 billion IDs.
// See https://en.wikipedia.org/wiki/Birthday_attack.
func GenerateName() (string, error) {
b := make([]byte, 9)
_, err := rand.Read(b)
if err != nil {
return "", err
}
// Encode the bytes to Base32 (A-Z2-7), strip any '=' padding
return fmt.Sprintf("prebuild-%s", strings.ToLower(base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(b))), nil
}
+11
View File
@@ -90,6 +90,17 @@ func Find[T any](haystack []T, cond func(T) bool) (T, bool) {
return empty, false
}
// Filter returns all elements that satisfy the condition.
func Filter[T any](haystack []T, cond func(T) bool) []T {
out := make([]T, 0, len(haystack))
for _, hay := range haystack {
if cond(hay) {
out = append(out, hay)
}
}
return out
}
// Overlap returns if the 2 sets have any overlap (element(s) in common)
func Overlap[T comparable](a []T, b []T) bool {
return OverlapCompare(a, b, func(a, b T) bool {
+59
View File
@@ -2,6 +2,7 @@ package slice_test
import (
"math/rand"
"strings"
"testing"
"github.com/google/uuid"
@@ -82,6 +83,64 @@ func TestContains(t *testing.T) {
)
}
func TestFilter(t *testing.T) {
t.Parallel()
type testCase[T any] struct {
haystack []T
cond func(T) bool
expected []T
}
{
testCases := []*testCase[int]{
{
haystack: []int{1, 2, 3, 4, 5},
cond: func(num int) bool {
return num%2 == 1
},
expected: []int{1, 3, 5},
},
{
haystack: []int{1, 2, 3, 4, 5},
cond: func(num int) bool {
return num%2 == 0
},
expected: []int{2, 4},
},
}
for _, tc := range testCases {
actual := slice.Filter(tc.haystack, tc.cond)
require.Equal(t, tc.expected, actual)
}
}
{
testCases := []*testCase[string]{
{
haystack: []string{"hello", "hi", "bye"},
cond: func(str string) bool {
return strings.HasPrefix(str, "h")
},
expected: []string{"hello", "hi"},
},
{
haystack: []string{"hello", "hi", "bye"},
cond: func(str string) bool {
return strings.HasPrefix(str, "b")
},
expected: []string{"bye"},
},
}
for _, tc := range testCases {
actual := slice.Filter(tc.haystack, tc.cond)
require.Equal(t, tc.expected, actual)
}
}
}
func TestOverlap(t *testing.T) {
t.Parallel()
+13
View File
@@ -791,6 +791,19 @@ type NotificationsWebhookConfig struct {
Endpoint serpent.URL `json:"endpoint" typescript:",notnull"`
}
type PrebuildsConfig struct {
// ReconciliationInterval defines how often the workspace prebuilds state should be reconciled.
ReconciliationInterval serpent.Duration `json:"reconciliation_interval" typescript:",notnull"`
// ReconciliationBackoffInterval specifies the amount of time to increase the backoff interval
// when errors occur during reconciliation.
ReconciliationBackoffInterval serpent.Duration `json:"reconciliation_backoff_interval" typescript:",notnull"`
// ReconciliationBackoffLookback determines the time window to look back when calculating
// the number of failed prebuilds, which influences the backoff strategy.
ReconciliationBackoffLookback serpent.Duration `json:"reconciliation_backoff_lookback" typescript:",notnull"`
}
const (
annotationFormatDuration = "format_duration"
annotationEnterpriseKey = "enterprise"
+541
View File
@@ -0,0 +1,541 @@
package prebuilds
import (
"context"
"database/sql"
"fmt"
"math"
"sync/atomic"
"time"
"github.com/hashicorp/go-multierror"
"github.com/coder/quartz"
"github.com/coder/coder/v2/coderd/audit"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/prebuilds"
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/coderd/rbac/policy"
"github.com/coder/coder/v2/coderd/wsbuilder"
"github.com/coder/coder/v2/codersdk"
"cdr.dev/slog"
"github.com/google/uuid"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
)
type StoreReconciler struct {
store database.Store
cfg codersdk.PrebuildsConfig
pubsub pubsub.Pubsub
logger slog.Logger
clock quartz.Clock
cancelFn context.CancelCauseFunc
stopped atomic.Bool
done chan struct{}
}
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
func NewStoreReconciler(
store database.Store,
ps pubsub.Pubsub,
cfg codersdk.PrebuildsConfig,
logger slog.Logger,
clock quartz.Clock,
) *StoreReconciler {
return &StoreReconciler{
store: store,
pubsub: ps,
logger: logger,
cfg: cfg,
clock: clock,
done: make(chan struct{}, 1),
}
}
func (c *StoreReconciler) RunLoop(ctx context.Context) {
reconciliationInterval := c.cfg.ReconciliationInterval.Value()
if reconciliationInterval <= 0 { // avoids a panic
reconciliationInterval = 5 * time.Minute
}
c.logger.Info(ctx, "starting reconciler",
slog.F("interval", reconciliationInterval),
slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()),
slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String()))
ticker := c.clock.NewTicker(reconciliationInterval)
defer ticker.Stop()
defer func() {
c.done <- struct{}{}
}()
// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.
ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx))
c.cancelFn = cancel
for {
select {
// TODO: implement pubsub listener to allow reconciling a specific template imperatively once it has been changed,
// instead of waiting for the next reconciliation interval
case <-ticker.C:
// Trigger a new iteration on each tick.
err := c.ReconcileAll(ctx)
if err != nil {
c.logger.Error(context.Background(), "reconciliation failed", slog.Error(err))
}
case <-ctx.Done():
// nolint:gocritic // it's okay to use slog.F() for an error in this case
// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
c.logger.Warn(
context.Background(),
"reconciliation loop exited",
slog.Error(ctx.Err()),
slog.F("cause", context.Cause(ctx)),
)
return
}
}
}
func (c *StoreReconciler) Stop(ctx context.Context, cause error) {
if cause != nil {
c.logger.Error(context.Background(), "stopping reconciler due to an error", slog.Error(cause))
} else {
c.logger.Info(context.Background(), "gracefully stopping reconciler")
}
if c.isStopped() {
return
}
c.stopped.Store(true)
if c.cancelFn != nil {
c.cancelFn(cause)
}
select {
// Give up waiting for control loop to exit.
case <-ctx.Done():
// nolint:gocritic // it's okay to use slog.F() for an error in this case
// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
c.logger.Error(
context.Background(),
"reconciler stop exited prematurely",
slog.Error(ctx.Err()),
slog.F("cause", context.Cause(ctx)),
)
// Wait for the control loop to exit.
case <-c.done:
c.logger.Info(context.Background(), "reconciler stopped")
}
}
func (c *StoreReconciler) isStopped() bool {
return c.stopped.Load()
}
// ReconcileAll will attempt to resolve the desired vs actual state of all templates which have presets with prebuilds configured.
//
// NOTE:
//
// This function will kick of n provisioner jobs, based on the calculated state modifications.
//
// These provisioning jobs are fire-and-forget. We DO NOT wait for the prebuilt workspaces to complete their
// provisioning. As a consequence, it's possible that another reconciliation run will occur, which will mean that
// multiple preset versions could be reconciling at once. This may mean some temporary over-provisioning, but the
// reconciliation loop will bring these resources back into their desired numbers in an EVENTUALLY-consistent way.
//
// For example: we could decide to provision 1 new instance in this reconciliation.
// While that workspace is being provisioned, another template version is created which means this same preset will
// be reconciled again, leading to another workspace being provisioned. Two workspace builds will be occurring
// simultaneously for the same preset, but once both jobs have completed the reconciliation loop will notice the
// extraneous instance and delete it.
func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
logger := c.logger.With(slog.F("reconcile_context", "all"))
select {
case <-ctx.Done():
logger.Warn(context.Background(), "reconcile exiting prematurely; context done", slog.Error(ctx.Err()))
return nil
default:
}
logger.Debug(ctx, "starting reconciliation")
err := c.WithReconciliationLock(ctx, logger, func(ctx context.Context, db database.Store) error {
snapshot, err := c.SnapshotState(ctx, db)
if err != nil {
return xerrors.Errorf("determine current snapshot: %w", err)
}
if len(snapshot.Presets) == 0 {
logger.Debug(ctx, "no templates found with prebuilds configured")
return nil
}
var eg errgroup.Group
// Reconcile presets in parallel. Each preset in its own goroutine.
for _, preset := range snapshot.Presets {
ps, err := snapshot.FilterByPreset(preset.ID)
if err != nil {
logger.Warn(ctx, "failed to find preset snapshot", slog.Error(err), slog.F("preset_id", preset.ID.String()))
continue
}
eg.Go(func() error {
// Pass outer context.
err = c.ReconcilePreset(ctx, *ps)
if err != nil {
logger.Error(
ctx,
"failed to reconcile prebuilds for preset",
slog.Error(err),
slog.F("preset_id", preset.ID),
)
}
// DO NOT return error otherwise the tx will end.
return nil
})
}
// Release lock only when all preset reconciliation goroutines are finished.
return eg.Wait()
})
if err != nil {
logger.Error(ctx, "failed to reconcile", slog.Error(err))
}
return err
}
// SnapshotState captures the current state of all prebuilds across templates.
func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) {
if err := ctx.Err(); err != nil {
return nil, err
}
var state prebuilds.GlobalSnapshot
err := store.InTx(func(db database.Store) error {
// TODO: implement template-specific reconciliations later
presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, uuid.NullUUID{})
if err != nil {
return xerrors.Errorf("failed to get template presets with prebuilds: %w", err)
}
if len(presetsWithPrebuilds) == 0 {
return nil
}
allRunningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx)
if err != nil {
return xerrors.Errorf("failed to get running prebuilds: %w", err)
}
allPrebuildsInProgress, err := db.CountInProgressPrebuilds(ctx)
if err != nil {
return xerrors.Errorf("failed to get prebuilds in progress: %w", err)
}
presetsBackoff, err := db.GetPresetsBackoff(ctx, c.clock.Now().Add(-c.cfg.ReconciliationBackoffLookback.Value()))
if err != nil {
return xerrors.Errorf("failed to get backoffs for presets: %w", err)
}
state = prebuilds.NewGlobalSnapshot(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress, presetsBackoff)
return nil
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs
ReadOnly: true,
TxIdentifier: "prebuilds_state_determination",
})
return &state, err
}
func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.PresetSnapshot) error {
logger := c.logger.With(
slog.F("template_id", ps.Preset.TemplateID.String()),
slog.F("template_name", ps.Preset.TemplateName),
slog.F("template_version_id", ps.Preset.TemplateVersionID),
slog.F("template_version_name", ps.Preset.TemplateVersionName),
slog.F("preset_id", ps.Preset.ID),
slog.F("preset_name", ps.Preset.Name),
)
state := ps.CalculateState()
actions, err := c.CalculateActions(ctx, ps)
if err != nil {
logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err), slog.F("preset_id", ps.Preset.ID))
return nil
}
// nolint:gocritic // ReconcilePreset needs Prebuilds Orchestrator permissions.
prebuildsCtx := dbauthz.AsPrebuildsOrchestrator(ctx)
levelFn := logger.Debug
switch {
case actions.ActionType == prebuilds.ActionTypeBackoff:
levelFn = logger.Warn
// Log at info level when there's a change to be effected.
case actions.ActionType == prebuilds.ActionTypeCreate && actions.Create > 0:
levelFn = logger.Info
case actions.ActionType == prebuilds.ActionTypeDelete && len(actions.DeleteIDs) > 0:
levelFn = logger.Info
}
fields := []any{
slog.F("action_type", actions.ActionType),
slog.F("create_count", actions.Create), slog.F("delete_count", len(actions.DeleteIDs)),
slog.F("to_delete", actions.DeleteIDs),
slog.F("desired", state.Desired), slog.F("actual", state.Actual),
slog.F("extraneous", state.Extraneous), slog.F("starting", state.Starting),
slog.F("stopping", state.Stopping), slog.F("deleting", state.Deleting),
slog.F("eligible", state.Eligible),
}
levelFn(ctx, "calculated reconciliation actions for preset", fields...)
switch actions.ActionType {
case prebuilds.ActionTypeBackoff:
// If there is anything to backoff for (usually a cycle of failed prebuilds), then log and bail out.
levelFn(ctx, "template prebuild state retrieved, backing off",
append(fields,
slog.F("backoff_until", actions.BackoffUntil.Format(time.RFC3339)),
slog.F("backoff_secs", math.Round(actions.BackoffUntil.Sub(c.clock.Now()).Seconds())),
)...)
return nil
case prebuilds.ActionTypeCreate:
// Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
// See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
// This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
desired := ps.Preset.DesiredInstances.Int32
if actions.Create > desired {
logger.Critical(ctx, "determined excessive count of prebuilds to create; clamping to desired count",
slog.F("create_count", actions.Create), slog.F("desired_count", desired))
actions.Create = desired
}
var multiErr multierror.Error
for range actions.Create {
if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
multiErr.Errors = append(multiErr.Errors, err)
}
}
return multiErr.ErrorOrNil()
case prebuilds.ActionTypeDelete:
var multiErr multierror.Error
for _, id := range actions.DeleteIDs {
if err := c.deletePrebuiltWorkspace(prebuildsCtx, id, ps.Preset.TemplateID, ps.Preset.ID); err != nil {
logger.Error(ctx, "failed to delete prebuild", slog.Error(err))
multiErr.Errors = append(multiErr.Errors, err)
}
}
return multiErr.ErrorOrNil()
default:
return xerrors.Errorf("unknown action type: %v", actions.ActionType)
}
}
func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) (*prebuilds.ReconciliationActions, error) {
if ctx.Err() != nil {
return nil, ctx.Err()
}
return snapshot.CalculateActions(c.clock, c.cfg.ReconciliationBackoffInterval.Value())
}
func (c *StoreReconciler) WithReconciliationLock(
ctx context.Context,
logger slog.Logger,
fn func(ctx context.Context, db database.Store) error,
) error {
// This tx holds a global lock, which prevents any other coderd replica from starting a reconciliation and
// possibly getting an inconsistent view of the state.
//
// The lock MUST be held until ALL modifications have been effected.
//
// It is run with RepeatableRead isolation, so it's effectively snapshotting the data at the start of the tx.
//
// This is a read-only tx, so returning an error (i.e. causing a rollback) has no impact.
return c.store.InTx(func(db database.Store) error {
start := c.clock.Now()
// Try to acquire the lock. If we can't get it, another replica is handling reconciliation.
acquired, err := db.TryAcquireLock(ctx, database.LockIDReconcilePrebuilds)
if err != nil {
// This is a real database error, not just lock contention
logger.Error(ctx, "failed to acquire reconciliation lock due to database error", slog.Error(err))
return err
}
if !acquired {
// Normal case: another replica has the lock
return nil
}
logger.Debug(ctx,
"acquired top-level reconciliation lock",
slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", c.clock.Since(start).Seconds())),
)
return fn(ctx, db)
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead,
ReadOnly: true,
TxIdentifier: "prebuilds",
})
}
func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
name, err := prebuilds.GenerateName()
if err != nil {
return xerrors.Errorf("failed to generate unique prebuild ID: %w", err)
}
return c.store.InTx(func(db database.Store) error {
template, err := db.GetTemplateByID(ctx, templateID)
if err != nil {
return xerrors.Errorf("failed to get template: %w", err)
}
now := c.clock.Now()
minimumWorkspace, err := db.InsertWorkspace(ctx, database.InsertWorkspaceParams{
ID: prebuiltWorkspaceID,
CreatedAt: now,
UpdatedAt: now,
OwnerID: prebuilds.SystemUserID,
OrganizationID: template.OrganizationID,
TemplateID: template.ID,
Name: name,
LastUsedAt: c.clock.Now(),
AutomaticUpdates: database.AutomaticUpdatesNever,
AutostartSchedule: sql.NullString{},
Ttl: sql.NullInt64{},
NextStartAt: sql.NullTime{},
})
if err != nil {
return xerrors.Errorf("insert workspace: %w", err)
}
// We have to refetch the workspace for the joined in fields.
workspace, err := db.GetWorkspaceByID(ctx, minimumWorkspace.ID)
if err != nil {
return xerrors.Errorf("get workspace by ID: %w", err)
}
c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name),
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace)
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead,
ReadOnly: false,
})
}
func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
return c.store.InTx(func(db database.Store) error {
workspace, err := db.GetWorkspaceByID(ctx, prebuiltWorkspaceID)
if err != nil {
return xerrors.Errorf("get workspace by ID: %w", err)
}
template, err := db.GetTemplateByID(ctx, templateID)
if err != nil {
return xerrors.Errorf("failed to get template: %w", err)
}
if workspace.OwnerID != prebuilds.SystemUserID {
return xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed")
}
c.logger.Info(ctx, "attempting to delete prebuild",
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionDelete, workspace)
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead,
ReadOnly: false,
})
}
func (c *StoreReconciler) provision(
ctx context.Context,
db database.Store,
prebuildID uuid.UUID,
template database.Template,
presetID uuid.UUID,
transition database.WorkspaceTransition,
workspace database.Workspace,
) error {
tvp, err := db.GetPresetParametersByTemplateVersionID(ctx, template.ActiveVersionID)
if err != nil {
return xerrors.Errorf("fetch preset details: %w", err)
}
var params []codersdk.WorkspaceBuildParameter
for _, param := range tvp {
// TODO: don't fetch in the first place.
if param.TemplateVersionPresetID != presetID {
continue
}
params = append(params, codersdk.WorkspaceBuildParameter{
Name: param.Name,
Value: param.Value,
})
}
builder := wsbuilder.New(workspace, transition).
Reason(database.BuildReasonInitiator).
Initiator(prebuilds.SystemUserID).
VersionID(template.ActiveVersionID).
MarkPrebuild().
TemplateVersionPresetID(presetID)
// We only inject the required params when the prebuild is being created.
// This mirrors the behavior of regular workspace deletion (see cli/delete.go).
if transition != database.WorkspaceTransitionDelete {
builder = builder.RichParameterValues(params)
}
_, provisionerJob, _, err := builder.Build(
ctx,
db,
func(_ policy.Action, _ rbac.Objecter) bool {
return true // TODO: harden?
},
audit.WorkspaceBuildBaggage{},
)
if err != nil {
return xerrors.Errorf("provision workspace: %w", err)
}
err = provisionerjobs.PostJob(c.pubsub, *provisionerJob)
if err != nil {
// Client probably doesn't care about this error, so just log it.
c.logger.Error(ctx, "failed to post provisioner job to pubsub", slog.Error(err))
}
c.logger.Info(ctx, "prebuild job scheduled", slog.F("transition", transition),
slog.F("prebuild_id", prebuildID.String()), slog.F("preset_id", presetID.String()),
slog.F("job_id", provisionerJob.ID))
return nil
}
File diff suppressed because it is too large Load Diff
+7
View File
@@ -1695,6 +1695,13 @@ export interface PprofConfig {
readonly address: string;
}
// From codersdk/deployment.go
export interface PrebuildsConfig {
readonly reconciliation_interval: number;
readonly reconciliation_backoff_interval: number;
readonly reconciliation_backoff_lookback: number;
}
// From codersdk/presets.go
export interface Preset {
readonly ID: string;