From 5f3be6b288e649ed4b44950aa4cb68453d6e04d1 Mon Sep 17 00:00:00 2001
From: Callum Styan
Date: Thu, 12 Feb 2026 13:40:47 -0800
Subject: [PATCH] feat: add provisioner job queue wait time histogram and jobs enqueued counter (#21869)

This PR adds some metrics to help identify job enqueue rates and latencies. This work was initiated as a way to help reduce the cost of the observation/measurement itself for autostart scaletests, which impacts our ability to identify/reason about the load caused by autostart. See: https://github.com/coder/internal/issues/1209

I've extended the metrics here to account for regular user-initiated builds, prebuilds, autostarts, etc.

IMO there is still an open question of whether we want or need the `transition` label, which is only present on workspace builds. Including it does lead to an increase in cardinality, and in the case of the histogram (when not using native histograms) that's at least a few extra series for every bucket. We could remove the transition label there but keep it on the counter.

Additionally, the histogram currently also observes latencies for other job types, such as template builds/version imports. Those jobs do not have a transition type associated with them, so their `transition` and `build_reason` labels are recorded as empty strings.
Tested briefly in a workspace, can see metric values like the following: - `coderd_workspace_builds_enqueued_total{build_reason="autostart",provisioner_type="terraform",status="success",transition="start"} 1` - `coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="autostart",job_type="workspace_build",provisioner_type="terraform",transition="start",le="0.025"} 1` --------- Signed-off-by: Callum Styan Co-authored-by: Claude Opus 4.5 --- cli/server.go | 9 +- coderd/autobuild/lifecycle_executor.go | 39 ++--- coderd/coderd.go | 1 + coderd/coderdtest/coderdtest.go | 3 + coderd/provisionerdserver/metrics.go | 42 +++++- .../provisionerdserver/provisionerdserver.go | 19 +++ coderd/workspacebuilds.go | 3 +- coderd/workspaces.go | 3 +- coderd/workspaces_test.go | 137 ++++++++++++++++++ coderd/wsbuilder/metrics.go | 42 ++++++ coderd/wsbuilder/wsbuilder.go | 31 ++++ docs/admin/integrations/prometheus.md | 2 + enterprise/cli/create_test.go | 2 + enterprise/coderd/coderd.go | 1 + enterprise/coderd/prebuilds/claim_test.go | 1 + .../coderd/prebuilds/metricscollector_test.go | 5 + enterprise/coderd/prebuilds/reconcile.go | 8 +- enterprise/coderd/prebuilds/reconcile_test.go | 23 +++ enterprise/coderd/workspaces_test.go | 6 + scripts/metricsdocgen/metrics | 10 ++ 20 files changed, 363 insertions(+), 24 deletions(-) create mode 100644 coderd/wsbuilder/metrics.go diff --git a/cli/server.go b/cli/server.go index fbee77b0bf..06d7feaaed 100644 --- a/cli/server.go +++ b/cli/server.go @@ -95,6 +95,7 @@ import ( "github.com/coder/coder/v2/coderd/webpush" "github.com/coder/coder/v2/coderd/workspaceapps/appurl" "github.com/coder/coder/v2/coderd/workspacestats" + "github.com/coder/coder/v2/coderd/wsbuilder" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/drpcsdk" "github.com/coder/coder/v2/cryptorand" @@ -935,6 +936,12 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. 
options.StatsBatcher = batcher defer closeBatcher() + wsBuilderMetrics, err := wsbuilder.NewMetrics(options.PrometheusRegistry) + if err != nil { + return xerrors.Errorf("failed to register workspace builder metrics: %w", err) + } + options.WorkspaceBuilderMetrics = wsBuilderMetrics + // Manage notifications. var ( notificationsCfg = options.DeploymentValues.Notifications @@ -1118,7 +1125,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. autobuildTicker := time.NewTicker(vals.AutobuildPollInterval.Value()) defer autobuildTicker.Stop() autobuildExecutor := autobuild.NewExecutor( - ctx, options.Database, options.Pubsub, coderAPI.FileCache, options.PrometheusRegistry, coderAPI.TemplateScheduleStore, &coderAPI.Auditor, coderAPI.AccessControlStore, coderAPI.BuildUsageChecker, logger, autobuildTicker.C, options.NotificationsEnqueuer, coderAPI.Experiments) + ctx, options.Database, options.Pubsub, coderAPI.FileCache, options.PrometheusRegistry, coderAPI.TemplateScheduleStore, &coderAPI.Auditor, coderAPI.AccessControlStore, coderAPI.BuildUsageChecker, logger, autobuildTicker.C, options.NotificationsEnqueuer, coderAPI.Experiments, coderAPI.WorkspaceBuilderMetrics) autobuildExecutor.Run() jobReaperTicker := time.NewTicker(vals.JobReaperDetectorInterval.Value()) diff --git a/coderd/autobuild/lifecycle_executor.go b/coderd/autobuild/lifecycle_executor.go index c3a5873dbf..d26e9f47ca 100644 --- a/coderd/autobuild/lifecycle_executor.go +++ b/coderd/autobuild/lifecycle_executor.go @@ -48,9 +48,10 @@ type Executor struct { tick <-chan time.Time statsCh chan<- Stats // NotificationsEnqueuer handles enqueueing notifications for delivery by SMTP, webhook, etc. 
- notificationsEnqueuer notifications.Enqueuer - reg prometheus.Registerer - experiments codersdk.Experiments + notificationsEnqueuer notifications.Enqueuer + reg prometheus.Registerer + experiments codersdk.Experiments + workspaceBuilderMetrics *wsbuilder.Metrics metrics executorMetrics } @@ -67,23 +68,24 @@ type Stats struct { } // New returns a new wsactions executor. -func NewExecutor(ctx context.Context, db database.Store, ps pubsub.Pubsub, fc *files.Cache, reg prometheus.Registerer, tss *atomic.Pointer[schedule.TemplateScheduleStore], auditor *atomic.Pointer[audit.Auditor], acs *atomic.Pointer[dbauthz.AccessControlStore], buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker], log slog.Logger, tick <-chan time.Time, enqueuer notifications.Enqueuer, exp codersdk.Experiments) *Executor { +func NewExecutor(ctx context.Context, db database.Store, ps pubsub.Pubsub, fc *files.Cache, reg prometheus.Registerer, tss *atomic.Pointer[schedule.TemplateScheduleStore], auditor *atomic.Pointer[audit.Auditor], acs *atomic.Pointer[dbauthz.AccessControlStore], buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker], log slog.Logger, tick <-chan time.Time, enqueuer notifications.Enqueuer, exp codersdk.Experiments, workspaceBuilderMetrics *wsbuilder.Metrics) *Executor { factory := promauto.With(reg) le := &Executor{ //nolint:gocritic // Autostart has a limited set of permissions. 
- ctx: dbauthz.AsAutostart(ctx), - db: db, - ps: ps, - fileCache: fc, - templateScheduleStore: tss, - tick: tick, - log: log.Named("autobuild"), - auditor: auditor, - accessControlStore: acs, - buildUsageChecker: buildUsageChecker, - notificationsEnqueuer: enqueuer, - reg: reg, - experiments: exp, + ctx: dbauthz.AsAutostart(ctx), + db: db, + ps: ps, + fileCache: fc, + templateScheduleStore: tss, + tick: tick, + log: log.Named("autobuild"), + auditor: auditor, + accessControlStore: acs, + buildUsageChecker: buildUsageChecker, + notificationsEnqueuer: enqueuer, + reg: reg, + experiments: exp, + workspaceBuilderMetrics: workspaceBuilderMetrics, metrics: executorMetrics{ autobuildExecutionDuration: factory.NewHistogram(prometheus.HistogramOpts{ Namespace: "coderd", @@ -335,7 +337,8 @@ func (e *Executor) runOnce(t time.Time) Stats { SetLastWorkspaceBuildInTx(&latestBuild). SetLastWorkspaceBuildJobInTx(&latestJob). Experiments(e.experiments). - Reason(reason) + Reason(reason). + BuildMetrics(e.workspaceBuilderMetrics) log.Debug(e.ctx, "auto building workspace", slog.F("transition", nextTransition)) if nextTransition == database.WorkspaceTransitionStart && useActiveVersion(accessControl, ws) { diff --git a/coderd/coderd.go b/coderd/coderd.go index 6a85df1c3d..fffea6a19e 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -245,6 +245,7 @@ type Options struct { MetadataBatcherOptions []metadatabatcher.Option ProvisionerdServerMetrics *provisionerdserver.Metrics + WorkspaceBuilderMetrics *wsbuilder.Metrics // WorkspaceAppAuditSessionTimeout allows changing the timeout for audit // sessions. 
Raising or lowering this value will directly affect the write diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 29ab950b23..338be47c27 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -191,6 +191,7 @@ type Options struct { TelemetryReporter telemetry.Reporter ProvisionerdServerMetrics *provisionerdserver.Metrics + WorkspaceBuilderMetrics *wsbuilder.Metrics UsageInserter usage.Inserter } @@ -399,6 +400,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can options.AutobuildTicker, options.NotificationsEnqueuer, experiments, + options.WorkspaceBuilderMetrics, ).WithStatsChannel(options.AutobuildStats) lifecycleExecutor.Run() @@ -620,6 +622,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can AppEncryptionKeyCache: options.APIKeyEncryptionCache, OIDCConvertKeyCache: options.OIDCConvertKeyCache, ProvisionerdServerMetrics: options.ProvisionerdServerMetrics, + WorkspaceBuilderMetrics: options.WorkspaceBuilderMetrics, } } diff --git a/coderd/provisionerdserver/metrics.go b/coderd/provisionerdserver/metrics.go index 1acc67a28d..b1fc925a86 100644 --- a/coderd/provisionerdserver/metrics.go +++ b/coderd/provisionerdserver/metrics.go @@ -13,6 +13,7 @@ type Metrics struct { logger slog.Logger workspaceCreationTimings *prometheus.HistogramVec workspaceClaimTimings *prometheus.HistogramVec + jobQueueWait *prometheus.HistogramVec } type WorkspaceTimingType int @@ -29,6 +30,12 @@ const ( workspaceTypePrebuild = "prebuild" ) +// BuildReasonPrebuild is the build_reason metric label value for prebuild +// operations. This is distinct from database.BuildReason values since prebuilds +// use BuildReasonInitiator in the database but we want to track them separately +// in metrics. This is also used as a label value by the metrics in wsbuilder. 
+const BuildReasonPrebuild = workspaceTypePrebuild + type WorkspaceTimingFlags struct { IsPrebuild bool IsClaim bool @@ -90,6 +97,30 @@ func NewMetrics(logger slog.Logger) *Metrics { NativeHistogramZeroThreshold: 0, NativeHistogramMaxZeroThreshold: 0, }, []string{"organization_name", "template_name", "preset_name"}), + jobQueueWait: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Name: "provisioner_job_queue_wait_seconds", + Help: "Time from job creation to acquisition by a provisioner daemon.", + Buckets: []float64{ + 0.1, // 100ms + 0.5, // 500ms + 1, // 1s + 5, // 5s + 10, // 10s + 30, // 30s + 60, // 1m + 120, // 2m + 300, // 5m + 600, // 10m + 900, // 15m + 1800, // 30m + }, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, + NativeHistogramZeroThreshold: 0, + NativeHistogramMaxZeroThreshold: 0, + }, []string{"provisioner_type", "job_type", "transition", "build_reason"}), } } @@ -97,7 +128,10 @@ func (m *Metrics) Register(reg prometheus.Registerer) error { if err := reg.Register(m.workspaceCreationTimings); err != nil { return err } - return reg.Register(m.workspaceClaimTimings) + if err := reg.Register(m.workspaceClaimTimings); err != nil { + return err + } + return reg.Register(m.jobQueueWait) } // IsTrackable returns true if the workspace build should be tracked in metrics. @@ -162,3 +196,9 @@ func (m *Metrics) UpdateWorkspaceTimingsMetrics( // Not a trackable build type (e.g. restart, stop, subsequent builds) } } + +// ObserveJobQueueWait records the time a provisioner job spent waiting in the queue. +// For non-workspace-build jobs, transition and buildReason should be empty strings. 
+func (m *Metrics) ObserveJobQueueWait(provisionerType, jobType, transition, buildReason string, waitSeconds float64) { + m.jobQueueWait.WithLabelValues(provisionerType, jobType, transition, buildReason).Observe(waitSeconds) +} diff --git a/coderd/provisionerdserver/provisionerdserver.go b/coderd/provisionerdserver/provisionerdserver.go index 6b47ecb7bf..1f527a2998 100644 --- a/coderd/provisionerdserver/provisionerdserver.go +++ b/coderd/provisionerdserver/provisionerdserver.go @@ -478,6 +478,10 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo TraceMetadata: jobTraceMetadata, } + // jobTransition and jobBuildReason are used for metrics; only set for workspace builds. + var jobTransition string + var jobBuildReason string + switch job.Type { case database.ProvisionerJobTypeWorkspaceBuild: var input WorkspaceProvisionJob @@ -584,6 +588,15 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo if err != nil { return nil, failJob(fmt.Sprintf("convert workspace transition: %s", err)) } + jobTransition = string(workspaceBuild.Transition) + // Prebuilds use BuildReasonInitiator in the database but we want to + // track them separately in metrics. Check the initiator ID to detect + // prebuild jobs. + if job.InitiatorID == database.PrebuildsSystemUserID { + jobBuildReason = BuildReasonPrebuild + } else { + jobBuildReason = string(workspaceBuild.Reason) + } // A previous workspace build exists var lastWorkspaceBuildParameters []database.WorkspaceBuildParameter @@ -825,6 +838,12 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo return nil, failJob(fmt.Sprintf("payload was too big: %d > %d", protobuf.Size(protoJob), drpcsdk.MaxMessageSize)) } + // Record the time the job spent waiting in the queue. 
+ if s.metrics != nil && job.StartedAt.Valid && job.Provisioner.Valid() { + queueWaitSeconds := job.StartedAt.Time.Sub(job.CreatedAt).Seconds() + s.metrics.ObserveJobQueueWait(string(job.Provisioner), string(job.Type), jobTransition, jobBuildReason, queueWaitSeconds) + } + return protoJob, err } diff --git a/coderd/workspacebuilds.go b/coderd/workspacebuilds.go index 482740f362..b9bc5d8d71 100644 --- a/coderd/workspacebuilds.go +++ b/coderd/workspacebuilds.go @@ -382,7 +382,8 @@ func (api *API) postWorkspaceBuildsInternal( LogLevel(string(createBuild.LogLevel)). DeploymentValues(api.Options.DeploymentValues). Experiments(api.Experiments). - TemplateVersionPresetID(createBuild.TemplateVersionPresetID) + TemplateVersionPresetID(createBuild.TemplateVersionPresetID). + BuildMetrics(api.WorkspaceBuilderMetrics) if (transition == database.WorkspaceTransitionStart || transition == database.WorkspaceTransitionStop) && createBuild.Reason != "" { builder = builder.Reason(database.BuildReason(createBuild.Reason)) diff --git a/coderd/workspaces.go b/coderd/workspaces.go index c0dc00ed80..c4461fefd0 100644 --- a/coderd/workspaces.go +++ b/coderd/workspaces.go @@ -787,7 +787,8 @@ func createWorkspace( ActiveVersion(). Experiments(api.Experiments). DeploymentValues(api.DeploymentValues). - RichParameterValues(req.RichParameterValues) + RichParameterValues(req.RichParameterValues). 
+ BuildMetrics(api.WorkspaceBuilderMetrics) if req.TemplateVersionID != uuid.Nil { builder = builder.VersionID(req.TemplateVersionID) } diff --git a/coderd/workspaces_test.go b/coderd/workspaces_test.go index 7078770852..5a6626cdc3 100644 --- a/coderd/workspaces_test.go +++ b/coderd/workspaces_test.go @@ -14,6 +14,7 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -21,7 +22,9 @@ import ( "github.com/coder/coder/v2/agent/agenttest" "github.com/coder/coder/v2/coderd" "github.com/coder/coder/v2/coderd/audit" + "github.com/coder/coder/v2/coderd/autobuild" "github.com/coder/coder/v2/coderd/coderdtest" + "github.com/coder/coder/v2/coderd/coderdtest/promhelp" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbfake" @@ -30,6 +33,7 @@ import ( "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/notifications/notificationstest" + "github.com/coder/coder/v2/coderd/provisionerdserver" "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/rbac/policy" "github.com/coder/coder/v2/coderd/render" @@ -37,6 +41,7 @@ import ( "github.com/coder/coder/v2/coderd/schedule/cron" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/util/slice" + "github.com/coder/coder/v2/coderd/wsbuilder" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/cryptorand" "github.com/coder/coder/v2/provisioner/echo" @@ -5901,3 +5906,135 @@ func TestWorkspaceCreateWithImplicitPreset(t *testing.T) { require.Equal(t, preset2ID, *ws2.LatestBuild.TemplateVersionPresetID) }) } + +func TestProvisionerJobQueueWaitMetric(t *testing.T) { + t.Parallel() + + logger := testutil.Logger(t) + reg := prometheus.NewRegistry() + metrics := provisionerdserver.NewMetrics(logger) + err 
:= metrics.Register(reg) + require.NoError(t, err) + + client := coderdtest.New(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + ProvisionerdServerMetrics: metrics, + }) + user := coderdtest.CreateFirstUser(t, client) + + // Create a template version - this triggers a template_version_import job. + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, nil) + coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID) + + // Check that the queue wait metric was recorded for the template_version_import job. + importMetric := promhelp.MetricValue(t, reg, "coderd_provisioner_job_queue_wait_seconds", prometheus.Labels{ + "provisioner_type": string(database.ProvisionerTypeEcho), + "job_type": string(database.ProvisionerJobTypeTemplateVersionImport), + "transition": "", + "build_reason": "", + }) + require.NotNil(t, importMetric, "import job metric should be recorded") + importHistogram := importMetric.GetHistogram() + require.NotNil(t, importHistogram) + require.Equal(t, uint64(1), importHistogram.GetSampleCount(), "import job should have 1 sample") + require.Greater(t, importHistogram.GetSampleSum(), 0.0, "import job queue wait should be non-zero") + + // Create a template and workspace - this triggers a workspace_build job. + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, template.ID) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + + // Check that the queue wait metric was recorded for the workspace_build job. 
+ buildMetric := promhelp.MetricValue(t, reg, "coderd_provisioner_job_queue_wait_seconds", prometheus.Labels{ + "provisioner_type": string(database.ProvisionerTypeEcho), + "job_type": string(database.ProvisionerJobTypeWorkspaceBuild), + "transition": string(database.WorkspaceTransitionStart), + "build_reason": string(database.BuildReasonInitiator), + }) + require.NotNil(t, buildMetric, "workspace build job metric should be recorded") + buildHistogram := buildMetric.GetHistogram() + require.NotNil(t, buildHistogram) + require.Equal(t, uint64(1), buildHistogram.GetSampleCount(), "workspace build job should have 1 sample") + require.Greater(t, buildHistogram.GetSampleSum(), 0.0, "workspace build job queue wait should be non-zero") +} + +func TestWorkspaceBuildsEnqueuedMetric(t *testing.T) { + t.Parallel() + + var ( + logger = testutil.Logger(t) + reg = prometheus.NewRegistry() + metrics = provisionerdserver.NewMetrics(logger) + + sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *") + tickCh = make(chan time.Time) + statsCh = make(chan autobuild.Stats) + ) + + err := metrics.Register(reg) + require.NoError(t, err) + + wsBuilderMetrics, err := wsbuilder.NewMetrics(reg) + require.NoError(t, err) + + client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + ProvisionerdServerMetrics: metrics, + WorkspaceBuilderMetrics: wsBuilderMetrics, + AutobuildTicker: tickCh, + AutobuildStats: statsCh, + }) + user := coderdtest.CreateFirstUser(t, client) + + // Create a template and workspace with autostart schedule. 
+ version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, nil) + coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.AutostartSchedule = ptr.Ref(sched.String()) + }) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + + // Stop the workspace to prepare for autostart. + workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop) + + // Trigger an autostart build via the autobuild ticker. This verifies that + // autostart builds are recorded with build_reason="autostart". + p, err := coderdtest.GetProvisionerForTags(db, time.Now(), workspace.OrganizationID, map[string]string{}) + require.NoError(t, err) + + go func() { + tickTime := sched.Next(workspace.LatestBuild.CreatedAt) + coderdtest.UpdateProvisionerLastSeenAt(t, db, p.ID, tickTime) + tickCh <- tickTime + close(tickCh) + }() + + // Wait for the autostart to complete. + stats := <-statsCh + require.Len(t, stats.Errors, 0) + require.Len(t, stats.Transitions, 1) + require.Contains(t, stats.Transitions, workspace.ID) + require.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID]) + + // Verify the workspace was autostarted. + workspace = coderdtest.MustWorkspace(t, client, workspace.ID) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + require.Equal(t, codersdk.BuildReasonAutostart, workspace.LatestBuild.Reason) + + // Now check the autostart metric was recorded. 
+ autostartCount := promhelp.CounterValue(t, reg, "coderd_workspace_builds_enqueued_total", prometheus.Labels{ + "provisioner_type": string(database.ProvisionerTypeEcho), + "build_reason": string(database.BuildReasonAutostart), + "transition": string(database.WorkspaceTransitionStart), + "status": wsbuilder.BuildStatusSuccess, + }) + require.Equal(t, 1, autostartCount, "autostart should record 1 enqueue with build_reason=autostart") +} + +func mustSchedule(t *testing.T, s string) *cron.Schedule { + t.Helper() + sched, err := cron.Weekly(s) + require.NoError(t, err) + return sched +} diff --git a/coderd/wsbuilder/metrics.go b/coderd/wsbuilder/metrics.go new file mode 100644 index 0000000000..f3e0dedbc9 --- /dev/null +++ b/coderd/wsbuilder/metrics.go @@ -0,0 +1,42 @@ +package wsbuilder + +import "github.com/prometheus/client_golang/prometheus" + +// Metrics holds metrics related to workspace build creation. +type Metrics struct { + workspaceBuildsEnqueued *prometheus.CounterVec +} + +// Metric label values for build status. +const ( + BuildStatusSuccess = "success" + BuildStatusFailed = "failed" +) + +func NewMetrics(reg prometheus.Registerer) (*Metrics, error) { + m := &Metrics{ + workspaceBuildsEnqueued: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coderd", + Name: "workspace_builds_enqueued_total", + Help: "Total number of workspace build enqueue attempts.", + }, []string{"provisioner_type", "build_reason", "transition", "status"}), + } + + if reg != nil { + if err := reg.Register(m.workspaceBuildsEnqueued); err != nil { + return nil, err + } + } + + return m, nil +} + +// RecordBuildEnqueued records a workspace build enqueue attempt. It determines +// the status based on whether an error occurred and increments the counter. 
+func (m *Metrics) RecordBuildEnqueued(provisionerType, buildReason, transition string, err error) { + status := BuildStatusSuccess + if err != nil { + status = BuildStatusFailed + } + m.workspaceBuildsEnqueued.WithLabelValues(provisionerType, buildReason, transition, status).Inc() +} diff --git a/coderd/wsbuilder/wsbuilder.go b/coderd/wsbuilder/wsbuilder.go index 917030029a..a72127f1f0 100644 --- a/coderd/wsbuilder/wsbuilder.go +++ b/coderd/wsbuilder/wsbuilder.go @@ -90,6 +90,8 @@ type Builder struct { prebuiltWorkspaceBuildStage sdkproto.PrebuiltWorkspaceBuildStage verifyNoLegacyParametersOnce bool + + buildMetrics *Metrics } type UsageChecker interface { @@ -253,6 +255,12 @@ func (b Builder) TemplateVersionPresetID(id uuid.UUID) Builder { return b } +func (b Builder) BuildMetrics(m *Metrics) Builder { + // nolint: revive + b.buildMetrics = m + return b +} + type BuildError struct { // Status is a suitable HTTP status code Status int @@ -313,11 +321,34 @@ func (b *Builder) Build( return err }) if err != nil { + b.recordBuildMetrics(provisionerJob, err) return nil, nil, nil, xerrors.Errorf("build tx: %w", err) } + b.recordBuildMetrics(provisionerJob, nil) return workspaceBuild, provisionerJob, provisionerDaemons, nil } +// recordBuildMetrics records the workspace build enqueue metric if metrics are +// configured. It determines the appropriate build reason label, using "prebuild" +// for prebuild operations instead of the database reason. +func (b *Builder) recordBuildMetrics(job *database.ProvisionerJob, err error) { + if b.buildMetrics == nil { + return + } + if job == nil || !job.Provisioner.Valid() { + return + } + + // Determine the build reason for metrics. Prebuilds use BuildReasonInitiator + // in the database but we want to track them separately in metrics. 
+ buildReason := string(b.reason) + if b.prebuiltWorkspaceBuildStage == sdkproto.PrebuiltWorkspaceBuildStage_CREATE { + buildReason = provisionerdserver.BuildReasonPrebuild + } + + b.buildMetrics.RecordBuildEnqueued(string(job.Provisioner), buildReason, string(b.trans), err) +} + // buildTx contains the business logic of computing a new build. Attributes of the new database objects are computed // in a functional style, rather than imperative, to emphasize the logic of how they are defined. A simple cache // of database-fetched objects is stored on the struct to ensure we only fetch things once, even if they are used in diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index 5c3822a349..ba71040c01 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -158,11 +158,13 @@ deployment. They will always be available from the agent. | `coderd_oauth2_external_requests_rate_limit_used` | gauge | The number of requests made in this interval. | `name` `resource` | | `coderd_oauth2_external_requests_total` | counter | The total number of api calls made to external oauth2 providers. 'status_code' will be 0 if the request failed with no response. | `name` `source` `status_code` | | `coderd_prebuilt_workspace_claim_duration_seconds` | histogram | Time to claim a prebuilt workspace by organization, template, and preset. | `organization_name` `preset_name` `template_name` | +| `coderd_provisioner_job_queue_wait_seconds` | histogram | Time from job creation to acquisition by a provisioner daemon. | `build_reason` `job_type` `provisioner_type` `transition` | | `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | | `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | | `coderd_provisionerd_num_daemons` | gauge | The number of provisioner daemons. 
| | | `coderd_provisionerd_workspace_build_timings_seconds` | histogram | The time taken for a workspace to build. | `status` `template_name` `template_version` `workspace_transition` | | `coderd_template_workspace_build_duration_seconds` | histogram | Duration from workspace build creation to agent ready, by template. | `is_prebuild` `organization_name` `status` `template_name` `transition` | +| `coderd_workspace_builds_enqueued_total` | counter | Total number of workspace build enqueue attempts. | `build_reason` `provisioner_type` `status` `transition` | | `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | | `coderd_workspace_creation_duration_seconds` | histogram | Time to create a workspace by organization, template, preset, and type (regular or prebuild). | `organization_name` `preset_name` `template_name` `type` | | `coderd_workspace_creation_total` | counter | Total regular (non-prebuilt) workspace creations by organization, template, and preset. 
| `organization_name` `preset_name` `template_name` | diff --git a/enterprise/cli/create_test.go b/enterprise/cli/create_test.go index f14e901e45..9c1fa8bcf6 100644 --- a/enterprise/cli/create_test.go +++ b/enterprise/cli/create_test.go @@ -370,6 +370,7 @@ func TestEnterpriseCreateWithPreset(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -483,6 +484,7 @@ func TestEnterpriseCreateWithPreset(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index f6e386a76e..6b66adacda 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -1331,6 +1331,7 @@ func (api *API) setupPrebuilds(featureEnabled bool) (agplprebuilds.Reconciliatio api.AGPL.BuildUsageChecker, api.TracerProvider, int(api.DeploymentValues.PostgresConnMaxOpen.Value()), + api.AGPL.WorkspaceBuilderMetrics, ) return reconciler, prebuilds.NewEnterpriseClaimer() } diff --git a/enterprise/coderd/prebuilds/claim_test.go b/enterprise/coderd/prebuilds/claim_test.go index aa711f14e0..e58913ed40 100644 --- a/enterprise/coderd/prebuilds/claim_test.go +++ b/enterprise/coderd/prebuilds/claim_test.go @@ -174,6 +174,7 @@ func TestClaimPrebuild(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go index 2ea9667076..606995e1a1 100644 --- a/enterprise/coderd/prebuilds/metricscollector_test.go +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -204,6 +204,7 @@ func TestMetricsCollector(t 
*testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) @@ -344,6 +345,7 @@ func TestMetricsCollector_DuplicateTemplateNames(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) @@ -500,6 +502,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) @@ -537,6 +540,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) @@ -574,6 +578,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index b41b5541fc..9e8ce20887 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -65,7 +65,8 @@ type StoreReconciler struct { // Prebuild state metrics metrics *MetricsCollector // Operational metrics - reconciliationDuration prometheus.Histogram + reconciliationDuration prometheus.Histogram + workspaceBuilderMetrics *wsbuilder.Metrics } var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{} @@ -99,6 +100,7 @@ func NewStoreReconciler(store database.Store, buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker], tracerProvider trace.TracerProvider, maxDBConnections int, + workspaceBuilderMetrics *wsbuilder.Metrics, ) *StoreReconciler { reconciliationConcurrency := calculateReconciliationConcurrency(maxDBConnections) @@ -120,6 +122,7 @@ func NewStoreReconciler(store database.Store, done: make(chan struct{}, 1), provisionNotifyCh: make(chan database.ProvisionerJob, 10), reconciliationConcurrency: 
reconciliationConcurrency, + workspaceBuilderMetrics: workspaceBuilderMetrics, } if registerer != nil { @@ -1052,7 +1055,8 @@ func (c *StoreReconciler) provision( builder := wsbuilder.New(workspace, transition, *c.buildUsageChecker.Load()). Reason(database.BuildReasonInitiator). Initiator(database.PrebuildsSystemUserID). - MarkPrebuild() + MarkPrebuild(). + BuildMetrics(c.workspaceBuilderMetrics) if transition != database.WorkspaceTransitionDelete { // We don't specify the version for a delete transition, diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index d0d40666ac..d85760373a 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -61,6 +61,7 @@ func TestNoReconciliationActionsIfNoPresets(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // given a template version with no presets @@ -112,6 +113,7 @@ func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // given there are presets, but no prebuilds @@ -450,6 +452,7 @@ func (tc testCase) run(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Run the reconciliation multiple times to ensure idempotency @@ -527,6 +530,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -658,6 +662,7 @@ func TestPrebuildScheduling(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -767,6 +772,7 @@ func TestInvalidPreset(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -837,6 +843,7 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -939,6 +946,7 @@ func 
TestSkippingHardLimitedPresets(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Set up test environment with a template, version, and preset. @@ -1090,6 +1098,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Set up test environment with a template, version, and preset. @@ -1291,6 +1300,7 @@ func TestRunLoop(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -1423,6 +1433,7 @@ func TestReconcilerLifecycle(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // When: the reconciler is stopped (simulating the prebuilds feature being disabled) @@ -1438,6 +1449,7 @@ func TestReconcilerLifecycle(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Gracefully stop the reconciliation loop @@ -1471,6 +1483,7 @@ func TestFailedBuildBackoff(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Given: an active template version with presets and prebuilds configured. @@ -1595,6 +1608,7 @@ func TestReconciliationLock(t *testing.T) { newNoopEnqueuer(), newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) reconciler.WithReconciliationLock(ctx, logger, func(_ context.Context, _ database.Store) error { lockObtained := mutex.TryLock() @@ -1633,6 +1647,7 @@ func TestTrackResourceReplacement(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Given: a template admin to receive a notification. 
@@ -1793,6 +1808,7 @@ func TestExpiredPrebuildsMultipleActions(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Set up test environment with a template, version, and preset @@ -2258,6 +2274,7 @@ func TestCancelPendingPrebuilds(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) owner := coderdtest.CreateFirstUser(t, client) @@ -2503,6 +2520,7 @@ func TestCancelPendingPrebuilds(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) owner := coderdtest.CreateFirstUser(t, client) @@ -2576,6 +2594,7 @@ func TestReconciliationStats(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) owner := coderdtest.CreateFirstUser(t, client) @@ -3066,6 +3085,7 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Setup a template with a preset that should create prebuilds @@ -3172,6 +3192,7 @@ func BenchmarkReconcileAll_NoOps(b *testing.B) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), maxOpenConns, + nil, ) org := dbgen.Organization(b, db, database.Organization{}) @@ -3283,6 +3304,7 @@ func BenchmarkReconcileAll_ConnectionContention(b *testing.B) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), maxOpenConns, + nil, ) // Create presets from active template versions that need reconciliation actions @@ -3402,6 +3424,7 @@ func BenchmarkReconcileAll_Mix(b *testing.B) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), maxOpenConns, + nil, ) org := dbgen.Organization(b, db, database.Organization{}) diff --git a/enterprise/coderd/workspaces_test.go b/enterprise/coderd/workspaces_test.go index 59b390fb1b..49d5694a5c 100644 --- a/enterprise/coderd/workspaces_test.go +++ b/enterprise/coderd/workspaces_test.go @@ -1991,6 +1991,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = 
prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2115,6 +2116,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2239,6 +2241,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2385,6 +2388,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2532,6 +2536,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2979,6 +2984,7 @@ func TestWorkspaceProvisionerdServerMetrics(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 2788e4800d..bae5f6eda4 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -990,3 +990,13 @@ coderd_license_warnings 0 # HELP coderd_license_errors The number of active license errors. # TYPE coderd_license_errors gauge coderd_license_errors 0 +# HELP coderd_provisioner_job_queue_wait_seconds Time from job creation to acquisition by a provisioner daemon. 
+# TYPE coderd_provisioner_job_queue_wait_seconds histogram +coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="initiator",job_type="workspace_build",provisioner_type="terraform",transition="stop",le="300"} 1 +coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="initiator",job_type="workspace_build",provisioner_type="terraform",transition="stop",le="+Inf"} 1 +coderd_provisioner_job_queue_wait_seconds_sum{build_reason="initiator",job_type="workspace_build",provisioner_type="terraform",transition="stop"} 0.01191 +coderd_provisioner_job_queue_wait_seconds_count{build_reason="initiator",job_type="workspace_build",provisioner_type="terraform",transition="stop"} 1 +# HELP coderd_workspace_builds_enqueued_total Total number of workspace build enqueue attempts. +# TYPE coderd_workspace_builds_enqueued_total counter +coderd_workspace_builds_enqueued_total{build_reason="dashboard",provisioner_type="terraform",status="success",transition="start"} 1 +coderd_workspace_builds_enqueued_total{build_reason="initiator",provisioner_type="terraform",status="success",transition="stop"} 1