feat: add provisioner job queue wait time histogram and jobs enqueued counter (#21869)

This PR adds some metrics to help identify job enqueue rates and latencies.

This work was initiated as a way to help reduce the cost of the observation/measurement itself for autostart scaletests, which impacts our ability to identify/reason about the load caused by autostart. See: https://github.com/coder/internal/issues/1209

I've extended the metrics here to account for regular user-initiated builds, prebuilds, autostarts, etc.

IMO there is still the question here of whether we want to include or need the `transition` label, which is only present on workspace builds. Including it does lead to an increase in cardinality, and in the case of the histogram (when not using native histograms) that's at least a few extra series for every bucket. We could remove the transition label there but keep it on the counter.

Additionally, the histogram is currently observing latencies for other jobs, such as template builds/version imports; those do not have a transition type associated with them.

Tested briefly in a workspace; metric values like the following can be seen:

- `coderd_workspace_builds_enqueued_total{build_reason="autostart",provisioner_type="terraform",status="success",transition="start"} 1`
- `coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="autostart",job_type="workspace_build",provisioner_type="terraform",transition="start",le="0.025"} 1`

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-06-02 20:48:20 +00:00 · 2026-02-12 13:40:47 -08:00
parent b1f48f8d47
commit 5f3be6b288
20 changed files with 363 additions and 24 deletions
@@ -13,6 +13,7 @@ type Metrics struct {
 	logger                   slog.Logger
 	workspaceCreationTimings *prometheus.HistogramVec
 	workspaceClaimTimings    *prometheus.HistogramVec
+	jobQueueWait             *prometheus.HistogramVec
 }

 type WorkspaceTimingType int
@@ -29,6 +30,12 @@ const (
 	workspaceTypePrebuild = "prebuild"
 )

+// BuildReasonPrebuild is the build_reason metric label value for prebuild
+// operations. This is distinct from database.BuildReason values since prebuilds
+// use BuildReasonInitiator in the database but we want to track them separately
+// in metrics. This is also used as a label value by the metrics in wsbuilder.
+const BuildReasonPrebuild = workspaceTypePrebuild
+
 type WorkspaceTimingFlags struct {
 	IsPrebuild   bool
 	IsClaim      bool
@@ -90,6 +97,30 @@ func NewMetrics(logger slog.Logger) *Metrics {
 			NativeHistogramZeroThreshold:    0,
 			NativeHistogramMaxZeroThreshold: 0,
 		}, []string{"organization_name", "template_name", "preset_name"}),
+		jobQueueWait: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Namespace: "coderd",
+			Name:      "provisioner_job_queue_wait_seconds",
+			Help:      "Time from job creation to acquisition by a provisioner daemon.",
+			Buckets: []float64{
+				0.1,  // 100ms
+				0.5,  // 500ms
+				1,    // 1s
+				5,    // 5s
+				10,   // 10s
+				30,   // 30s
+				60,   // 1m
+				120,  // 2m
+				300,  // 5m
+				600,  // 10m
+				900,  // 15m
+				1800, // 30m
+			},
+			NativeHistogramBucketFactor:     1.1,
+			NativeHistogramMaxBucketNumber:  100,
+			NativeHistogramMinResetDuration: time.Hour,
+			NativeHistogramZeroThreshold:    0,
+			NativeHistogramMaxZeroThreshold: 0,
+		}, []string{"provisioner_type", "job_type", "transition", "build_reason"}),
 	}
 }

@@ -97,7 +128,10 @@ func (m *Metrics) Register(reg prometheus.Registerer) error {
 	if err := reg.Register(m.workspaceCreationTimings); err != nil {
 		return err
 	}
-	return reg.Register(m.workspaceClaimTimings)
+	if err := reg.Register(m.workspaceClaimTimings); err != nil {
+		return err
+	}
+	return reg.Register(m.jobQueueWait)
 }

 // IsTrackable returns true if the workspace build should be tracked in metrics.
@@ -162,3 +196,9 @@ func (m *Metrics) UpdateWorkspaceTimingsMetrics(
 		// Not a trackable build type (e.g. restart, stop, subsequent builds)
 	}
 }
+
+// ObserveJobQueueWait records waitSeconds, the time in seconds a provisioner job spent waiting in the queue, in the
+// jobQueueWait histogram under the given label values. For non-workspace-build jobs, transition and buildReason should be empty strings.
+func (m *Metrics) ObserveJobQueueWait(provisionerType, jobType, transition, buildReason string, waitSeconds float64) {
+	m.jobQueueWait.WithLabelValues(provisionerType, jobType, transition, buildReason).Observe(waitSeconds)
+}
@@ -478,6 +478,10 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo
 		TraceMetadata: jobTraceMetadata,
 	}

+	// jobTransition and jobBuildReason are used for metrics; only set for workspace builds.
+	var jobTransition string
+	var jobBuildReason string
+
 	switch job.Type {
 	case database.ProvisionerJobTypeWorkspaceBuild:
 		var input WorkspaceProvisionJob
@@ -584,6 +588,15 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo
 		if err != nil {
 			return nil, failJob(fmt.Sprintf("convert workspace transition: %s", err))
 		}
+		jobTransition = string(workspaceBuild.Transition)
+		// Prebuilds use BuildReasonInitiator in the database but we want to
+		// track them separately in metrics. Check the initiator ID to detect
+		// prebuild jobs.
+		if job.InitiatorID == database.PrebuildsSystemUserID {
+			jobBuildReason = BuildReasonPrebuild
+		} else {
+			jobBuildReason = string(workspaceBuild.Reason)
+		}

 		// A previous workspace build exists
 		var lastWorkspaceBuildParameters []database.WorkspaceBuildParameter
@@ -825,6 +838,12 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo
 		return nil, failJob(fmt.Sprintf("payload was too big: %d > %d", protobuf.Size(protoJob), drpcsdk.MaxMessageSize))
 	}

+	// Record the time the job spent waiting in the queue.
+	if s.metrics != nil && job.StartedAt.Valid && job.Provisioner.Valid() {
+		queueWaitSeconds := job.StartedAt.Time.Sub(job.CreatedAt).Seconds()
+		s.metrics.ObserveJobQueueWait(string(job.Provisioner), string(job.Type), jobTransition, jobBuildReason, queueWaitSeconds)
+	}
+
 	return protoJob, err
 }