mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
5f3be6b288
This PR adds some metrics to help identify job enqueue rates and latencies. This work was initiated as a way to help reduce the cost of the observation/measurement itself for autostart scaletests, which impacts our ability to identify/reason about the load caused by autostart. See: https://github.com/coder/internal/issues/1209

I've extended the metrics here to account for regular user-initiated builds, prebuilds, autostarts, etc.

IMO there is still the question of whether we want or need to include the `transition` label, which is only present on workspace builds. Including it does lead to an increase in cardinality, and in the case of the histogram (when not using native histograms) that's at least a few extra series for every bucket. We could remove the transition label there but keep it on the counter. Additionally, the histogram currently observes latencies for other jobs as well, such as template builds/version imports; those do not have a transition type associated with them.

Tested briefly in a workspace; metric values like the following are visible:

- `coderd_workspace_builds_enqueued_total{build_reason="autostart",provisioner_type="terraform",status="success",transition="start"} 1`
- `coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="autostart",job_type="workspace_build",provisioner_type="terraform",transition="start",le="0.025"} 1`

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
205 lines
6.7 KiB
Go
205 lines
6.7 KiB
Go
package provisionerdserver
|
||
|
||
import (
|
||
"context"
|
||
"time"
|
||
|
||
"github.com/prometheus/client_golang/prometheus"
|
||
|
||
"cdr.dev/slog/v3"
|
||
)
|
||
|
||
type Metrics struct {
|
||
logger slog.Logger
|
||
workspaceCreationTimings *prometheus.HistogramVec
|
||
workspaceClaimTimings *prometheus.HistogramVec
|
||
jobQueueWait *prometheus.HistogramVec
|
||
}
|
||
|
||
// WorkspaceTimingType identifies which workspace timing metric, if any, a
// workspace build contributes to.
type WorkspaceTimingType int

const (
	// Unsupported is the zero value: the build is not recorded by any
	// workspace timing metric.
	Unsupported WorkspaceTimingType = iota
	// WorkspaceCreation is the first build of a regular (non-prebuilt)
	// workspace.
	WorkspaceCreation
	// PrebuildCreation is the creation of a prebuilt workspace.
	PrebuildCreation
	// PrebuildClaim is the claim of an existing prebuilt workspace.
	PrebuildClaim
)
|
||
|
||
// Values used for the "type" label of the workspace creation duration
// histogram.
const (
	workspaceTypeRegular  = "regular"
	workspaceTypePrebuild = "prebuild"
)

// BuildReasonPrebuild is the build_reason metric label value for prebuild
// operations. This is distinct from database.BuildReason values since prebuilds
// use BuildReasonInitiator in the database but we want to track them separately
// in metrics. This is also used as a label value by the metrics in wsbuilder.
const BuildReasonPrebuild = workspaceTypePrebuild
|
||
|
||
// WorkspaceTimingFlags describes a workspace build for the purpose of choosing
// which workspace timing metric, if any, should record it. The zero value
// describes a build that is not tracked.
type WorkspaceTimingFlags struct {
	// IsPrebuild marks the creation of a prebuilt workspace.
	IsPrebuild bool
	// IsClaim marks the claim of an existing prebuilt workspace.
	IsClaim bool
	// IsFirstBuild marks the first build of a workspace.
	IsFirstBuild bool
}
|
||
|
||
func NewMetrics(logger slog.Logger) *Metrics {
|
||
log := logger.Named("provisionerd_server_metrics")
|
||
|
||
return &Metrics{
|
||
logger: log,
|
||
workspaceCreationTimings: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||
Namespace: "coderd",
|
||
Name: "workspace_creation_duration_seconds",
|
||
Help: "Time to create a workspace by organization, template, preset, and type (regular or prebuild).",
|
||
Buckets: []float64{
|
||
1, // 1s
|
||
10,
|
||
30,
|
||
60, // 1min
|
||
60 * 5,
|
||
60 * 10,
|
||
60 * 30, // 30min
|
||
60 * 60, // 1hr
|
||
},
|
||
NativeHistogramBucketFactor: 1.1,
|
||
// Max number of native buckets kept at once to bound memory.
|
||
NativeHistogramMaxBucketNumber: 100,
|
||
// Merge/flush small buckets periodically to control churn.
|
||
NativeHistogramMinResetDuration: time.Hour,
|
||
// Treat tiny values as zero (helps with noisy near-zero latencies).
|
||
NativeHistogramZeroThreshold: 0,
|
||
NativeHistogramMaxZeroThreshold: 0,
|
||
}, []string{"organization_name", "template_name", "preset_name", "type"}),
|
||
workspaceClaimTimings: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||
Namespace: "coderd",
|
||
Name: "prebuilt_workspace_claim_duration_seconds",
|
||
Help: "Time to claim a prebuilt workspace by organization, template, and preset.",
|
||
// Higher resolution between 1–5m to show typical prebuild claim times.
|
||
// Cap at 5m since longer claims diminish prebuild value.
|
||
Buckets: []float64{
|
||
1, // 1s
|
||
5,
|
||
10,
|
||
20,
|
||
30,
|
||
60, // 1m
|
||
120, // 2m
|
||
180, // 3m
|
||
240, // 4m
|
||
300, // 5m
|
||
},
|
||
NativeHistogramBucketFactor: 1.1,
|
||
// Max number of native buckets kept at once to bound memory.
|
||
NativeHistogramMaxBucketNumber: 100,
|
||
// Merge/flush small buckets periodically to control churn.
|
||
NativeHistogramMinResetDuration: time.Hour,
|
||
// Treat tiny values as zero (helps with noisy near-zero latencies).
|
||
NativeHistogramZeroThreshold: 0,
|
||
NativeHistogramMaxZeroThreshold: 0,
|
||
}, []string{"organization_name", "template_name", "preset_name"}),
|
||
jobQueueWait: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||
Namespace: "coderd",
|
||
Name: "provisioner_job_queue_wait_seconds",
|
||
Help: "Time from job creation to acquisition by a provisioner daemon.",
|
||
Buckets: []float64{
|
||
0.1, // 100ms
|
||
0.5, // 500ms
|
||
1, // 1s
|
||
5, // 5s
|
||
10, // 10s
|
||
30, // 30s
|
||
60, // 1m
|
||
120, // 2m
|
||
300, // 5m
|
||
600, // 10m
|
||
900, // 15m
|
||
1800, // 30m
|
||
},
|
||
NativeHistogramBucketFactor: 1.1,
|
||
NativeHistogramMaxBucketNumber: 100,
|
||
NativeHistogramMinResetDuration: time.Hour,
|
||
NativeHistogramZeroThreshold: 0,
|
||
NativeHistogramMaxZeroThreshold: 0,
|
||
}, []string{"provisioner_type", "job_type", "transition", "build_reason"}),
|
||
}
|
||
}
|
||
|
||
func (m *Metrics) Register(reg prometheus.Registerer) error {
|
||
if err := reg.Register(m.workspaceCreationTimings); err != nil {
|
||
return err
|
||
}
|
||
if err := reg.Register(m.workspaceClaimTimings); err != nil {
|
||
return err
|
||
}
|
||
return reg.Register(m.jobQueueWait)
|
||
}
|
||
|
||
// IsTrackable returns true if the workspace build should be tracked in metrics.
|
||
// This includes workspace creation, prebuild creation, and prebuild claims.
|
||
func (f WorkspaceTimingFlags) IsTrackable() bool {
|
||
return f.IsPrebuild || f.IsClaim || f.IsFirstBuild
|
||
}
|
||
|
||
// getWorkspaceTimingType classifies a workspace build:
|
||
// - PrebuildCreation: creation of a prebuilt workspace
|
||
// - PrebuildClaim: claim of an existing prebuilt workspace
|
||
// - WorkspaceCreation: first build of a regular (non-prebuilt) workspace
|
||
//
|
||
// Note: order matters. Creating a prebuilt workspace is also a first build
|
||
// (IsPrebuild && IsFirstBuild). We check IsPrebuild before IsFirstBuild so
|
||
// prebuilds take precedence. This is the only case where two flags can be true.
|
||
func getWorkspaceTimingType(flags WorkspaceTimingFlags) WorkspaceTimingType {
|
||
switch {
|
||
case flags.IsPrebuild:
|
||
return PrebuildCreation
|
||
case flags.IsClaim:
|
||
return PrebuildClaim
|
||
case flags.IsFirstBuild:
|
||
return WorkspaceCreation
|
||
default:
|
||
return Unsupported
|
||
}
|
||
}
|
||
|
||
// UpdateWorkspaceTimingsMetrics updates the workspace timing metrics based on the workspace build type
|
||
func (m *Metrics) UpdateWorkspaceTimingsMetrics(
|
||
ctx context.Context,
|
||
flags WorkspaceTimingFlags,
|
||
organizationName string,
|
||
templateName string,
|
||
presetName string,
|
||
buildTime float64,
|
||
) {
|
||
m.logger.Debug(ctx, "update workspace timings metrics",
|
||
slog.F("organization_name", organizationName),
|
||
slog.F("template_name", templateName),
|
||
slog.F("preset_name", presetName),
|
||
slog.F("is_prebuild", flags.IsPrebuild),
|
||
slog.F("is_claim", flags.IsClaim),
|
||
slog.F("is_workspace_first_build", flags.IsFirstBuild))
|
||
|
||
workspaceTimingType := getWorkspaceTimingType(flags)
|
||
switch workspaceTimingType {
|
||
case WorkspaceCreation:
|
||
// Regular workspace creation (without prebuild pool)
|
||
m.workspaceCreationTimings.
|
||
WithLabelValues(organizationName, templateName, presetName, workspaceTypeRegular).Observe(buildTime)
|
||
case PrebuildCreation:
|
||
// Prebuilt workspace creation duration
|
||
m.workspaceCreationTimings.
|
||
WithLabelValues(organizationName, templateName, presetName, workspaceTypePrebuild).Observe(buildTime)
|
||
case PrebuildClaim:
|
||
// Prebuilt workspace claim duration
|
||
m.workspaceClaimTimings.
|
||
WithLabelValues(organizationName, templateName, presetName).Observe(buildTime)
|
||
default:
|
||
// Not a trackable build type (e.g. restart, stop, subsequent builds)
|
||
}
|
||
}
|
||
|
||
// ObserveJobQueueWait records the time a provisioner job spent waiting in the queue.
|
||
// For non-workspace-build jobs, transition and buildReason should be empty strings.
|
||
func (m *Metrics) ObserveJobQueueWait(provisionerType, jobType, transition, buildReason string, waitSeconds float64) {
|
||
m.jobQueueWait.WithLabelValues(provisionerType, jobType, transition, buildReason).Observe(waitSeconds)
|
||
}
|