mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: add provisioner job queue wait time histogram and jobs enqueued counter (#21869)
This PR adds some metrics to help identify job enqueue rates and latencies. This work was initiated as a way to reduce the cost of the observation/measurement itself for autostart scaletests, which impacts our ability to identify and reason about the load caused by autostart. See: https://github.com/coder/internal/issues/1209

I've extended the metrics here to account for regular user-initiated builds, prebuilds, autostarts, etc.

IMO there is still the question of whether we want or need the `transition` label, which is only present on workspace builds. Including it does increase cardinality, and in the case of the histogram (when not using native histograms) that's at least a few extra series for every bucket. We could remove the transition label there but keep it on the counter. Additionally, the histogram currently observes latencies for other jobs, such as template builds/version imports, which do not have a transition type associated with them.

Tested briefly in a workspace; metric values like the following are visible:

- `coderd_workspace_builds_enqueued_total{build_reason="autostart",provisioner_type="terraform",status="success",transition="start"} 1`
- `coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="autostart",job_type="workspace_build",provisioner_type="terraform",transition="start",le="0.025"} 1`

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,42 @@
|
||||
package wsbuilder
|
||||
|
||||
import "github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
// Metrics holds metrics related to workspace build creation.
|
||||
type Metrics struct {
|
||||
workspaceBuildsEnqueued *prometheus.CounterVec
|
||||
}
|
||||
|
||||
// Values used for the "status" metric label on build enqueue attempts.
const (
	// BuildStatusSuccess labels an enqueue attempt that returned no error.
	BuildStatusSuccess = "success"
	// BuildStatusFailed labels an enqueue attempt that returned an error.
	BuildStatusFailed = "failed"
)
|
||||
|
||||
func NewMetrics(reg prometheus.Registerer) (*Metrics, error) {
|
||||
m := &Metrics{
|
||||
workspaceBuildsEnqueued: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "coderd",
|
||||
Name: "workspace_builds_enqueued_total",
|
||||
Help: "Total number of workspace build enqueue attempts.",
|
||||
}, []string{"provisioner_type", "build_reason", "transition", "status"}),
|
||||
}
|
||||
|
||||
if reg != nil {
|
||||
if err := reg.Register(m.workspaceBuildsEnqueued); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// RecordBuildEnqueued records a workspace build enqueue attempt. It determines
|
||||
// the status based on whether an error occurred and increments the counter.
|
||||
func (m *Metrics) RecordBuildEnqueued(provisionerType, buildReason, transition string, err error) {
|
||||
status := BuildStatusSuccess
|
||||
if err != nil {
|
||||
status = BuildStatusFailed
|
||||
}
|
||||
m.workspaceBuildsEnqueued.WithLabelValues(provisionerType, buildReason, transition, status).Inc()
|
||||
}
|
||||
@@ -90,6 +90,8 @@ type Builder struct {
|
||||
|
||||
prebuiltWorkspaceBuildStage sdkproto.PrebuiltWorkspaceBuildStage
|
||||
verifyNoLegacyParametersOnce bool
|
||||
|
||||
buildMetrics *Metrics
|
||||
}
|
||||
|
||||
type UsageChecker interface {
|
||||
@@ -253,6 +255,12 @@ func (b Builder) TemplateVersionPresetID(id uuid.UUID) Builder {
|
||||
return b
|
||||
}
|
||||
|
||||
func (b Builder) BuildMetrics(m *Metrics) Builder {
|
||||
// nolint: revive
|
||||
b.buildMetrics = m
|
||||
return b
|
||||
}
|
||||
|
||||
type BuildError struct {
|
||||
// Status is a suitable HTTP status code
|
||||
Status int
|
||||
@@ -313,11 +321,34 @@ func (b *Builder) Build(
|
||||
return err
|
||||
})
|
||||
if err != nil {
|
||||
b.recordBuildMetrics(provisionerJob, err)
|
||||
return nil, nil, nil, xerrors.Errorf("build tx: %w", err)
|
||||
}
|
||||
b.recordBuildMetrics(provisionerJob, nil)
|
||||
return workspaceBuild, provisionerJob, provisionerDaemons, nil
|
||||
}
|
||||
|
||||
// recordBuildMetrics records the workspace build enqueue metric if metrics are
|
||||
// configured. It determines the appropriate build reason label, using "prebuild"
|
||||
// for prebuild operations instead of the database reason.
|
||||
func (b *Builder) recordBuildMetrics(job *database.ProvisionerJob, err error) {
|
||||
if b.buildMetrics == nil {
|
||||
return
|
||||
}
|
||||
if job == nil || !job.Provisioner.Valid() {
|
||||
return
|
||||
}
|
||||
|
||||
// Determine the build reason for metrics. Prebuilds use BuildReasonInitiator
|
||||
// in the database but we want to track them separately in metrics.
|
||||
buildReason := string(b.reason)
|
||||
if b.prebuiltWorkspaceBuildStage == sdkproto.PrebuiltWorkspaceBuildStage_CREATE {
|
||||
buildReason = provisionerdserver.BuildReasonPrebuild
|
||||
}
|
||||
|
||||
b.buildMetrics.RecordBuildEnqueued(string(job.Provisioner), buildReason, string(b.trans), err)
|
||||
}
|
||||
|
||||
// buildTx contains the business logic of computing a new build. Attributes of the new database objects are computed
|
||||
// in a functional style, rather than imperative, to emphasize the logic of how they are defined. A simple cache
|
||||
// of database-fetched objects is stored on the struct to ensure we only fetch things once, even if they are used in
|
||||
|
||||
Reference in New Issue
Block a user