feat: cancel pending prebuilds from non-active template versions (#20387)

## Description

This PR introduces an optimization to automatically cancel pending
prebuild-related jobs from non-active template versions in the
reconciliation loop.

## Problem

Currently, when a template is configured with more prebuild instances
than available provisioners, the provisioner queue can become flooded
with pending prebuild jobs. This issue is worsened when
provisioning/deprovisioning operations take a long time.

When the prebuild reconciliation loop generates jobs faster than
provisioners can process them, pending jobs accumulate in the queue.
Since prebuilt workspaces should always run the latest active template
version, pending prebuild jobs from non-active versions become obsolete
once a new version is promoted.

## Solution

The reconciliation loop cancels pending prebuild-related jobs from
non-active template versions that match the following criteria:

* Build number: 1 (initial build created by the reconciliation loop)
* Job status: `pending`
* Not yet picked up by a provisioner (`worker_id` is `NULL`)
* Owned by the prebuilds system user
* Workspace transition: `start`

This prevents the queue from being cluttered with stale prebuild jobs.
Such jobs would provision workspaces on an outdated template version,
and those workspaces would then need to be deprovisioned.

## Changes

* Added new SQL query `CountPendingNonActivePrebuilds` to identify
presets with pending jobs from non-active versions
* Added new SQL query `UpdatePrebuildProvisionerJobWithCancel` to cancel
jobs for a specific preset
* New reconciliation action type `ActionTypeCancelPending` handles the
cancellation logic
* Cancellation is non-blocking: failures to cancel prebuild jobs are
logged as errors and don't prevent other reconciliation actions

## Follow-up PR

Canceling pending prebuild jobs leaves workspaces in a Canceled state.
While no Terraform resources need to be destroyed (since jobs were
canceled before provisioning started), these database records should
still be cleaned up. This will be addressed in a follow-up PR.

Closes: https://github.com/coder/coder/issues/20242
This commit is contained in:
Susana Ferreira
2025-10-24 15:27:49 +01:00
committed by GitHub
parent c301a0d804
commit f6e86c6fdb
13 changed files with 1016 additions and 63 deletions
+112 -1
View File
@@ -7924,7 +7924,7 @@ type CountInProgressPrebuildsRow struct {
}
// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
// Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state.
// Prebuild considered in-progress if it's in the "pending", "starting", "stopping", or "deleting" state.
func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInProgressPrebuildsRow, error) {
rows, err := q.db.QueryContext(ctx, countInProgressPrebuilds)
if err != nil {
@@ -7954,6 +7954,58 @@ func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInPro
return items, nil
}
// countPendingNonActivePrebuilds is the SQL text for the
// CountPendingNonActivePrebuilds query. It takes no parameters and yields one
// row per template_version_preset_id with the number of matching prebuild
// builds. A build matches when its template version differs from the
// template's active version, it is build number 1 with a 'start' transition,
// and its provisioner job is 'pending' with no worker assigned and neither
// canceled_at nor completed_at set.
const countPendingNonActivePrebuilds = `-- name: CountPendingNonActivePrebuilds :many
SELECT
wpb.template_version_preset_id AS preset_id,
COUNT(*)::int AS count
FROM workspace_prebuild_builds wpb
INNER JOIN provisioner_jobs pj ON pj.id = wpb.job_id
INNER JOIN workspaces w ON w.id = wpb.workspace_id
INNER JOIN templates t ON t.id = w.template_id
WHERE
wpb.template_version_id != t.active_version_id
-- Only considers initial builds, i.e. created by the reconciliation loop
AND wpb.build_number = 1
-- Only consider 'start' transitions (provisioning), not 'stop'/'delete' (deprovisioning)
-- Deprovisioning jobs should complete naturally as they're already cleaning up resources
AND wpb.transition = 'start'::workspace_transition
-- Pending jobs that have not yet been picked up by a provisioner
AND pj.job_status = 'pending'::provisioner_job_status
AND pj.worker_id IS NULL
AND pj.canceled_at IS NULL
AND pj.completed_at IS NULL
GROUP BY wpb.template_version_preset_id
`
// CountPendingNonActivePrebuildsRow is a single result row of the
// CountPendingNonActivePrebuilds query: a preset and how many pending
// prebuild jobs from non-active template versions were counted for it.
type CountPendingNonActivePrebuildsRow struct {
// PresetID is the preset_id column (template_version_preset_id of the build).
PresetID uuid.NullUUID `db:"preset_id" json:"preset_id"`
// Count is the COUNT(*) of matching builds grouped under PresetID.
Count int32 `db:"count" json:"count"`
}
// CountPendingNonActivePrebuilds runs the countPendingNonActivePrebuilds query
// and collects every result row, i.e. the per-preset count of pending
// prebuilds that belong to non-active template versions.
//
// It returns a nil slice when the query yields no rows. Any error from
// running the query, scanning a row, closing the result set, or iterating is
// returned unchanged together with a nil slice.
func (q *sqlQuerier) CountPendingNonActivePrebuilds(ctx context.Context) ([]CountPendingNonActivePrebuildsRow, error) {
	result, queryErr := q.db.QueryContext(ctx, countPendingNonActivePrebuilds)
	if queryErr != nil {
		return nil, queryErr
	}
	// Covers the early-return paths; the explicit Close further down is the
	// one whose error is reported to the caller.
	defer result.Close()

	var counts []CountPendingNonActivePrebuildsRow
	for result.Next() {
		var row CountPendingNonActivePrebuildsRow
		scanErr := result.Scan(&row.PresetID, &row.Count)
		if scanErr != nil {
			return nil, scanErr
		}
		counts = append(counts, row)
	}

	// Close is checked before Err, in that order.
	closeErr := result.Close()
	if closeErr != nil {
		return nil, closeErr
	}
	iterErr := result.Err()
	if iterErr != nil {
		return nil, iterErr
	}
	return counts, nil
}
const findMatchingPresetID = `-- name: FindMatchingPresetID :one
WITH provided_params AS (
SELECT
@@ -8396,6 +8448,65 @@ func (q *sqlQuerier) GetTemplatePresetsWithPrebuilds(ctx context.Context, templa
return items, nil
}
// updatePrebuildProvisionerJobWithCancel is the SQL text for the
// UpdatePrebuildProvisionerJobWithCancel query. Parameter $1 is the timestamp
// written to both canceled_at and completed_at; parameter $2 is the preset ID
// to restrict the update to. The jobs selected are those of prebuild builds
// whose template version differs from the template's active version, with
// build number 1 and a 'start' transition, whose job is 'pending' with no
// worker assigned and neither canceled_at nor completed_at set. The IDs of
// the updated provisioner jobs are returned.
const updatePrebuildProvisionerJobWithCancel = `-- name: UpdatePrebuildProvisionerJobWithCancel :many
UPDATE provisioner_jobs
SET
canceled_at = $1::timestamptz,
completed_at = $1::timestamptz
WHERE id IN (
SELECT pj.id
FROM provisioner_jobs pj
INNER JOIN workspace_prebuild_builds wpb ON wpb.job_id = pj.id
INNER JOIN workspaces w ON w.id = wpb.workspace_id
INNER JOIN templates t ON t.id = w.template_id
WHERE
wpb.template_version_id != t.active_version_id
AND wpb.template_version_preset_id = $2
-- Only considers initial builds, i.e. created by the reconciliation loop
AND wpb.build_number = 1
-- Only consider 'start' transitions (provisioning), not 'stop'/'delete' (deprovisioning)
-- Deprovisioning jobs should complete naturally as they're already cleaning up resources
AND wpb.transition = 'start'::workspace_transition
-- Pending jobs that have not yet been picked up by a provisioner
AND pj.job_status = 'pending'::provisioner_job_status
AND pj.worker_id IS NULL
AND pj.canceled_at IS NULL
AND pj.completed_at IS NULL
)
RETURNING id
`
// UpdatePrebuildProvisionerJobWithCancelParams holds the arguments for the
// UpdatePrebuildProvisionerJobWithCancel query.
type UpdatePrebuildProvisionerJobWithCancelParams struct {
// Now is bound to $1: the timestamp stored in canceled_at and completed_at.
Now time.Time `db:"now" json:"now"`
// PresetID is bound to $2: only jobs for builds of this preset are canceled.
PresetID uuid.NullUUID `db:"preset_id" json:"preset_id"`
}
// UpdatePrebuildProvisionerJobWithCancel cancels every pending provisioner job
// of prebuilt workspaces for the given preset when the job belongs to an
// inactive template version. It is an optimization that clears stale pending
// jobs out of the queue.
//
// arg.Now is the timestamp recorded on the canceled jobs and arg.PresetID
// selects the preset. The IDs of the canceled jobs are returned; the slice is
// nil when nothing matched. Any error from running the query, scanning a row,
// closing the result set, or iterating is returned unchanged with a nil slice.
func (q *sqlQuerier) UpdatePrebuildProvisionerJobWithCancel(ctx context.Context, arg UpdatePrebuildProvisionerJobWithCancelParams) ([]uuid.UUID, error) {
	result, queryErr := q.db.QueryContext(
		ctx,
		updatePrebuildProvisionerJobWithCancel,
		arg.Now,
		arg.PresetID,
	)
	if queryErr != nil {
		return nil, queryErr
	}
	// Covers the early-return paths; the explicit Close further down is the
	// one whose error is reported to the caller.
	defer result.Close()

	var canceledJobIDs []uuid.UUID
	for result.Next() {
		var jobID uuid.UUID
		scanErr := result.Scan(&jobID)
		if scanErr != nil {
			return nil, scanErr
		}
		canceledJobIDs = append(canceledJobIDs, jobID)
	}

	// Close is checked before Err, in that order.
	closeErr := result.Close()
	if closeErr != nil {
		return nil, closeErr
	}
	iterErr := result.Err()
	if iterErr != nil {
		return nil, iterErr
	}
	return canceledJobIDs, nil
}
const getActivePresetPrebuildSchedules = `-- name: GetActivePresetPrebuildSchedules :many
SELECT
tvpps.id, tvpps.preset_id, tvpps.cron_expression, tvpps.desired_instances