feat: add provisioner job queue wait time histogram and jobs enqueued counter (#21869)

This PR adds some metrics to help identify job enqueue rates and
latencies. This work was initiated as a way to help reduce the cost of
the observation/measurement itself for autostart scaletests, which
impacts our ability to identify/reason about the load caused by
autostart. See: https://github.com/coder/internal/issues/1209

I've extended the metrics here to account for regular user initiated
builds, prebuilds, autostarts, etc. IMO there is still the question here
of whether we want to include or need the `transition` label, which is
only present on workspace builds. Including it does lead to an increase
in cardinality, and in the case of the histogram (when not using native
histograms) that's at least a few extra series for every bucket. We
could remove the transition label there but keep it on the counter.

Additionally, the histogram is currently observing latencies for other
jobs, such as template builds/version imports, those do not have a
transition type associated with them.

Tested briefly in a workspace, can see metric values like the following:
-
`coderd_workspace_builds_enqueued_total{build_reason="autostart",provisioner_type="terraform",status="success",transition="start"}
1`
-
`coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="autostart",job_type="workspace_build",provisioner_type="terraform",transition="start",le="0.025"}
1`

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Callum Styan
2026-02-12 13:40:47 -08:00
committed by GitHub
parent b1f48f8d47
commit 5f3be6b288
20 changed files with 363 additions and 24 deletions
+21 -18
View File
@@ -48,9 +48,10 @@ type Executor struct {
tick <-chan time.Time
statsCh chan<- Stats
// NotificationsEnqueuer handles enqueueing notifications for delivery by SMTP, webhook, etc.
notificationsEnqueuer notifications.Enqueuer
reg prometheus.Registerer
experiments codersdk.Experiments
notificationsEnqueuer notifications.Enqueuer
reg prometheus.Registerer
experiments codersdk.Experiments
workspaceBuilderMetrics *wsbuilder.Metrics
metrics executorMetrics
}
@@ -67,23 +68,24 @@ type Stats struct {
}
// New returns a new wsactions executor.
func NewExecutor(ctx context.Context, db database.Store, ps pubsub.Pubsub, fc *files.Cache, reg prometheus.Registerer, tss *atomic.Pointer[schedule.TemplateScheduleStore], auditor *atomic.Pointer[audit.Auditor], acs *atomic.Pointer[dbauthz.AccessControlStore], buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker], log slog.Logger, tick <-chan time.Time, enqueuer notifications.Enqueuer, exp codersdk.Experiments) *Executor {
func NewExecutor(ctx context.Context, db database.Store, ps pubsub.Pubsub, fc *files.Cache, reg prometheus.Registerer, tss *atomic.Pointer[schedule.TemplateScheduleStore], auditor *atomic.Pointer[audit.Auditor], acs *atomic.Pointer[dbauthz.AccessControlStore], buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker], log slog.Logger, tick <-chan time.Time, enqueuer notifications.Enqueuer, exp codersdk.Experiments, workspaceBuilderMetrics *wsbuilder.Metrics) *Executor {
factory := promauto.With(reg)
le := &Executor{
//nolint:gocritic // Autostart has a limited set of permissions.
ctx: dbauthz.AsAutostart(ctx),
db: db,
ps: ps,
fileCache: fc,
templateScheduleStore: tss,
tick: tick,
log: log.Named("autobuild"),
auditor: auditor,
accessControlStore: acs,
buildUsageChecker: buildUsageChecker,
notificationsEnqueuer: enqueuer,
reg: reg,
experiments: exp,
ctx: dbauthz.AsAutostart(ctx),
db: db,
ps: ps,
fileCache: fc,
templateScheduleStore: tss,
tick: tick,
log: log.Named("autobuild"),
auditor: auditor,
accessControlStore: acs,
buildUsageChecker: buildUsageChecker,
notificationsEnqueuer: enqueuer,
reg: reg,
experiments: exp,
workspaceBuilderMetrics: workspaceBuilderMetrics,
metrics: executorMetrics{
autobuildExecutionDuration: factory.NewHistogram(prometheus.HistogramOpts{
Namespace: "coderd",
@@ -335,7 +337,8 @@ func (e *Executor) runOnce(t time.Time) Stats {
SetLastWorkspaceBuildInTx(&latestBuild).
SetLastWorkspaceBuildJobInTx(&latestJob).
Experiments(e.experiments).
Reason(reason)
Reason(reason).
BuildMetrics(e.workspaceBuilderMetrics)
log.Debug(e.ctx, "auto building workspace", slog.F("transition", nextTransition))
if nextTransition == database.WorkspaceTransitionStart &&
useActiveVersion(accessControl, ws) {