mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: cancel pending prebuilds from non-active template versions (#20387)
## Description

This PR introduces an optimization to automatically cancel pending prebuild-related jobs from non-active template versions in the reconciliation loop.

## Problem

Currently, when a template is configured with more prebuild instances than available provisioners, the provisioner queue can become flooded with pending prebuild jobs. This issue is worsened when provisioning/deprovisioning operations take a long time. When the prebuild reconciliation loop generates jobs faster than provisioners can process them, pending jobs accumulate in the queue. Since prebuilt workspaces should always run the latest active template version, pending prebuild jobs from non-active versions become obsolete once a new version is promoted.

## Solution

The reconciliation loop cancels pending prebuild-related jobs from non-active template versions that match all of the following criteria:

* Build number: 1 (initial build created by the reconciliation loop)
* Job status: `pending`
* Not yet picked up by a provisioner (`worker_id` is `NULL`)
* Owned by the prebuilds system user
* Workspace transition: `start`

This prevents the queue from being cluttered with stale prebuild jobs. Such jobs would provision workspaces on an outdated template version, and those workspaces would then need to be deprovisioned.

## Changes

* Added new SQL query `CountPendingNonActivePrebuilds` to identify presets with pending jobs from non-active versions
* Added new SQL query `UpdatePrebuildProvisionerJobWithCancel` to cancel jobs for a specific preset
* New reconciliation action type `ActionTypeCancelPending` handles the cancellation logic
* Cancellation is non-blocking: failures to cancel prebuild jobs are logged as errors and don't prevent other reconciliation actions

## Follow-up PR

Canceling pending prebuild jobs leaves workspaces in a Canceled state. While no Terraform resources need to be destroyed (since jobs were canceled before provisioning started), these database records should still be cleaned up.
This will be addressed in a follow-up PR.

Closes: https://github.com/coder/coder/issues/20242
This commit is contained in:
@@ -7924,7 +7924,7 @@ type CountInProgressPrebuildsRow struct {
|
||||
}
|
||||
|
||||
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
|
||||
// Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state.
|
||||
// Prebuild considered in-progress if it's in the "pending", "starting", "stopping", or "deleting" state.
|
||||
func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInProgressPrebuildsRow, error) {
|
||||
rows, err := q.db.QueryContext(ctx, countInProgressPrebuilds)
|
||||
if err != nil {
|
||||
@@ -7954,6 +7954,58 @@ func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInPro
|
||||
return items, nil
|
||||
}
|
||||
|
||||
// countPendingNonActivePrebuilds is the SQL text executed by
// CountPendingNonActivePrebuilds. Grouped by preset, it counts rows of
// workspace_prebuild_builds whose template version differs from the template's
// active version, restricted to build number 1 with a 'start' transition whose
// provisioner job is still pending: no worker assigned, not canceled and not
// completed.
const countPendingNonActivePrebuilds = `-- name: CountPendingNonActivePrebuilds :many
SELECT
wpb.template_version_preset_id AS preset_id,
COUNT(*)::int AS count
FROM workspace_prebuild_builds wpb
INNER JOIN provisioner_jobs pj ON pj.id = wpb.job_id
INNER JOIN workspaces w ON w.id = wpb.workspace_id
INNER JOIN templates t ON t.id = w.template_id
WHERE
wpb.template_version_id != t.active_version_id
-- Only considers initial builds, i.e. created by the reconciliation loop
AND wpb.build_number = 1
-- Only consider 'start' transitions (provisioning), not 'stop'/'delete' (deprovisioning)
-- Deprovisioning jobs should complete naturally as they're already cleaning up resources
AND wpb.transition = 'start'::workspace_transition
-- Pending jobs that have not yet been picked up by a provisioner
AND pj.job_status = 'pending'::provisioner_job_status
AND pj.worker_id IS NULL
AND pj.canceled_at IS NULL
AND pj.completed_at IS NULL
GROUP BY wpb.template_version_preset_id
`
|
||||
|
||||
// CountPendingNonActivePrebuildsRow is a single result row of the
// CountPendingNonActivePrebuilds query: one preset and its pending-job count.
type CountPendingNonActivePrebuildsRow struct {
	// PresetID is scanned from the preset_id column. It is a uuid.NullUUID,
	// so it may carry no value.
	PresetID uuid.NullUUID `db:"preset_id" json:"preset_id"`
	// Count is scanned from the count column (COUNT(*) cast to int in the query).
	Count int32 `db:"count" json:"count"`
}
|
||||
|
||||
// CountPendingNonActivePrebuilds returns the number of pending prebuilds for non-active template versions
|
||||
func (q *sqlQuerier) CountPendingNonActivePrebuilds(ctx context.Context) ([]CountPendingNonActivePrebuildsRow, error) {
|
||||
rows, err := q.db.QueryContext(ctx, countPendingNonActivePrebuilds)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []CountPendingNonActivePrebuildsRow
|
||||
for rows.Next() {
|
||||
var i CountPendingNonActivePrebuildsRow
|
||||
if err := rows.Scan(&i.PresetID, &i.Count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const findMatchingPresetID = `-- name: FindMatchingPresetID :one
|
||||
WITH provided_params AS (
|
||||
SELECT
|
||||
@@ -8396,6 +8448,65 @@ func (q *sqlQuerier) GetTemplatePresetsWithPrebuilds(ctx context.Context, templa
|
||||
return items, nil
|
||||
}
|
||||
|
||||
// updatePrebuildProvisionerJobWithCancel is the SQL text executed by
// UpdatePrebuildProvisionerJobWithCancel. It sets canceled_at and completed_at
// to $1 on every provisioner job selected by the subquery and returns the IDs
// of the updated jobs. The subquery picks jobs of prebuild builds for preset $2
// whose template version differs from the template's active version,
// restricted to build number 1 with a 'start' transition whose job is still
// pending: no worker assigned, not canceled and not completed.
const updatePrebuildProvisionerJobWithCancel = `-- name: UpdatePrebuildProvisionerJobWithCancel :many
UPDATE provisioner_jobs
SET
canceled_at = $1::timestamptz,
completed_at = $1::timestamptz
WHERE id IN (
SELECT pj.id
FROM provisioner_jobs pj
INNER JOIN workspace_prebuild_builds wpb ON wpb.job_id = pj.id
INNER JOIN workspaces w ON w.id = wpb.workspace_id
INNER JOIN templates t ON t.id = w.template_id
WHERE
wpb.template_version_id != t.active_version_id
AND wpb.template_version_preset_id = $2
-- Only considers initial builds, i.e. created by the reconciliation loop
AND wpb.build_number = 1
-- Only consider 'start' transitions (provisioning), not 'stop'/'delete' (deprovisioning)
-- Deprovisioning jobs should complete naturally as they're already cleaning up resources
AND wpb.transition = 'start'::workspace_transition
-- Pending jobs that have not yet been picked up by a provisioner
AND pj.job_status = 'pending'::provisioner_job_status
AND pj.worker_id IS NULL
AND pj.canceled_at IS NULL
AND pj.completed_at IS NULL
)
RETURNING id
`
|
||||
|
||||
// UpdatePrebuildProvisionerJobWithCancelParams carries the arguments bound to
// the updatePrebuildProvisionerJobWithCancel query.
type UpdatePrebuildProvisionerJobWithCancelParams struct {
	// Now is bound to $1 and written to both canceled_at and completed_at.
	Now time.Time `db:"now" json:"now"`
	// PresetID is bound to $2 and compared against
	// wpb.template_version_preset_id to select the preset whose jobs are canceled.
	PresetID uuid.NullUUID `db:"preset_id" json:"preset_id"`
}
|
||||
|
||||
// Cancels all pending provisioner jobs for prebuilt workspaces on a specific preset from an
|
||||
// inactive template version.
|
||||
// This is an optimization to clean up stale pending jobs.
|
||||
func (q *sqlQuerier) UpdatePrebuildProvisionerJobWithCancel(ctx context.Context, arg UpdatePrebuildProvisionerJobWithCancelParams) ([]uuid.UUID, error) {
|
||||
rows, err := q.db.QueryContext(ctx, updatePrebuildProvisionerJobWithCancel, arg.Now, arg.PresetID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []uuid.UUID
|
||||
for rows.Next() {
|
||||
var id uuid.UUID
|
||||
if err := rows.Scan(&id); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, id)
|
||||
}
|
||||
if err := rows.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const getActivePresetPrebuildSchedules = `-- name: GetActivePresetPrebuildSchedules :many
|
||||
SELECT
|
||||
tvpps.id, tvpps.preset_id, tvpps.cron_expression, tvpps.desired_instances
|
||||
|
||||
Reference in New Issue
Block a user