diff --git a/cli/server.go b/cli/server.go index fbee77b0bf..06d7feaaed 100644 --- a/cli/server.go +++ b/cli/server.go @@ -95,6 +95,7 @@ import ( "github.com/coder/coder/v2/coderd/webpush" "github.com/coder/coder/v2/coderd/workspaceapps/appurl" "github.com/coder/coder/v2/coderd/workspacestats" + "github.com/coder/coder/v2/coderd/wsbuilder" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/drpcsdk" "github.com/coder/coder/v2/cryptorand" @@ -935,6 +936,12 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.StatsBatcher = batcher defer closeBatcher() + wsBuilderMetrics, err := wsbuilder.NewMetrics(options.PrometheusRegistry) + if err != nil { + return xerrors.Errorf("failed to register workspace builder metrics: %w", err) + } + options.WorkspaceBuilderMetrics = wsBuilderMetrics + // Manage notifications. var ( notificationsCfg = options.DeploymentValues.Notifications @@ -1118,7 +1125,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. autobuildTicker := time.NewTicker(vals.AutobuildPollInterval.Value()) defer autobuildTicker.Stop() autobuildExecutor := autobuild.NewExecutor( - ctx, options.Database, options.Pubsub, coderAPI.FileCache, options.PrometheusRegistry, coderAPI.TemplateScheduleStore, &coderAPI.Auditor, coderAPI.AccessControlStore, coderAPI.BuildUsageChecker, logger, autobuildTicker.C, options.NotificationsEnqueuer, coderAPI.Experiments) + ctx, options.Database, options.Pubsub, coderAPI.FileCache, options.PrometheusRegistry, coderAPI.TemplateScheduleStore, &coderAPI.Auditor, coderAPI.AccessControlStore, coderAPI.BuildUsageChecker, logger, autobuildTicker.C, options.NotificationsEnqueuer, coderAPI.Experiments, coderAPI.WorkspaceBuilderMetrics) autobuildExecutor.Run() jobReaperTicker := time.NewTicker(vals.JobReaperDetectorInterval.Value()) diff --git a/coderd/autobuild/lifecycle_executor.go b/coderd/autobuild/lifecycle_executor.go index c3a5873dbf..d26e9f47ca 100644 --- a/coderd/autobuild/lifecycle_executor.go +++ b/coderd/autobuild/lifecycle_executor.go @@ -48,9 +48,10 @@ type Executor struct { tick <-chan time.Time statsCh chan<- Stats // NotificationsEnqueuer handles enqueueing notifications for delivery by SMTP, webhook, etc. - notificationsEnqueuer notifications.Enqueuer - reg prometheus.Registerer - experiments codersdk.Experiments + notificationsEnqueuer notifications.Enqueuer + reg prometheus.Registerer + experiments codersdk.Experiments + workspaceBuilderMetrics *wsbuilder.Metrics metrics executorMetrics } @@ -67,23 +68,24 @@ type Stats struct { } // New returns a new wsactions executor. -func NewExecutor(ctx context.Context, db database.Store, ps pubsub.Pubsub, fc *files.Cache, reg prometheus.Registerer, tss *atomic.Pointer[schedule.TemplateScheduleStore], auditor *atomic.Pointer[audit.Auditor], acs *atomic.Pointer[dbauthz.AccessControlStore], buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker], log slog.Logger, tick <-chan time.Time, enqueuer notifications.Enqueuer, exp codersdk.Experiments) *Executor { +func NewExecutor(ctx context.Context, db database.Store, ps pubsub.Pubsub, fc *files.Cache, reg prometheus.Registerer, tss *atomic.Pointer[schedule.TemplateScheduleStore], auditor *atomic.Pointer[audit.Auditor], acs *atomic.Pointer[dbauthz.AccessControlStore], buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker], log slog.Logger, tick <-chan time.Time, enqueuer notifications.Enqueuer, exp codersdk.Experiments, workspaceBuilderMetrics *wsbuilder.Metrics) *Executor { factory := promauto.With(reg) le := &Executor{ //nolint:gocritic // Autostart has a limited set of permissions. - ctx: dbauthz.AsAutostart(ctx), - db: db, - ps: ps, - fileCache: fc, - templateScheduleStore: tss, - tick: tick, - log: log.Named("autobuild"), - auditor: auditor, - accessControlStore: acs, - buildUsageChecker: buildUsageChecker, - notificationsEnqueuer: enqueuer, - reg: reg, - experiments: exp, + ctx: dbauthz.AsAutostart(ctx), + db: db, + ps: ps, + fileCache: fc, + templateScheduleStore: tss, + tick: tick, + log: log.Named("autobuild"), + auditor: auditor, + accessControlStore: acs, + buildUsageChecker: buildUsageChecker, + notificationsEnqueuer: enqueuer, + reg: reg, + experiments: exp, + workspaceBuilderMetrics: workspaceBuilderMetrics, metrics: executorMetrics{ autobuildExecutionDuration: factory.NewHistogram(prometheus.HistogramOpts{ Namespace: "coderd", @@ -335,7 +337,8 @@ func (e *Executor) runOnce(t time.Time) Stats { SetLastWorkspaceBuildInTx(&latestBuild). SetLastWorkspaceBuildJobInTx(&latestJob). Experiments(e.experiments). - Reason(reason) + Reason(reason). + BuildMetrics(e.workspaceBuilderMetrics) log.Debug(e.ctx, "auto building workspace", slog.F("transition", nextTransition)) if nextTransition == database.WorkspaceTransitionStart && useActiveVersion(accessControl, ws) { diff --git a/coderd/coderd.go b/coderd/coderd.go index 6a85df1c3d..fffea6a19e 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -245,6 +245,7 @@ type Options struct { MetadataBatcherOptions []metadatabatcher.Option ProvisionerdServerMetrics *provisionerdserver.Metrics + WorkspaceBuilderMetrics *wsbuilder.Metrics // WorkspaceAppAuditSessionTimeout allows changing the timeout for audit // sessions. Raising or lowering this value will directly affect the write diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 29ab950b23..338be47c27 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -191,6 +191,7 @@ type Options struct { TelemetryReporter telemetry.Reporter ProvisionerdServerMetrics *provisionerdserver.Metrics + WorkspaceBuilderMetrics *wsbuilder.Metrics UsageInserter usage.Inserter } @@ -399,6 +400,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can options.AutobuildTicker, options.NotificationsEnqueuer, experiments, + options.WorkspaceBuilderMetrics, ).WithStatsChannel(options.AutobuildStats) lifecycleExecutor.Run() @@ -620,6 +622,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can AppEncryptionKeyCache: options.APIKeyEncryptionCache, OIDCConvertKeyCache: options.OIDCConvertKeyCache, ProvisionerdServerMetrics: options.ProvisionerdServerMetrics, + WorkspaceBuilderMetrics: options.WorkspaceBuilderMetrics, } } diff --git a/coderd/provisionerdserver/metrics.go b/coderd/provisionerdserver/metrics.go index 1acc67a28d..b1fc925a86 100644 --- a/coderd/provisionerdserver/metrics.go +++ b/coderd/provisionerdserver/metrics.go @@ -13,6 +13,7 @@ type Metrics struct { logger slog.Logger workspaceCreationTimings *prometheus.HistogramVec workspaceClaimTimings *prometheus.HistogramVec + jobQueueWait *prometheus.HistogramVec } type WorkspaceTimingType int @@ -29,6 +30,12 @@ const ( workspaceTypePrebuild = "prebuild" ) +// BuildReasonPrebuild is the build_reason metric label value for prebuild +// operations. This is distinct from database.BuildReason values since prebuilds +// use BuildReasonInitiator in the database but we want to track them separately +// in metrics. This is also used as a label value by the metrics in wsbuilder. +const BuildReasonPrebuild = workspaceTypePrebuild + type WorkspaceTimingFlags struct { IsPrebuild bool IsClaim bool @@ -90,6 +97,30 @@ func NewMetrics(logger slog.Logger) *Metrics { NativeHistogramZeroThreshold: 0, NativeHistogramMaxZeroThreshold: 0, }, []string{"organization_name", "template_name", "preset_name"}), + jobQueueWait: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Name: "provisioner_job_queue_wait_seconds", + Help: "Time from job creation to acquisition by a provisioner daemon.", + Buckets: []float64{ + 0.1, // 100ms + 0.5, // 500ms + 1, // 1s + 5, // 5s + 10, // 10s + 30, // 30s + 60, // 1m + 120, // 2m + 300, // 5m + 600, // 10m + 900, // 15m + 1800, // 30m + }, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, + NativeHistogramZeroThreshold: 0, + NativeHistogramMaxZeroThreshold: 0, + }, []string{"provisioner_type", "job_type", "transition", "build_reason"}), } } @@ -97,7 +128,10 @@ func (m *Metrics) Register(reg prometheus.Registerer) error { if err := reg.Register(m.workspaceCreationTimings); err != nil { return err } - return reg.Register(m.workspaceClaimTimings) + if err := reg.Register(m.workspaceClaimTimings); err != nil { + return err + } + return reg.Register(m.jobQueueWait) } // IsTrackable returns true if the workspace build should be tracked in metrics. @@ -162,3 +196,9 @@ func (m *Metrics) UpdateWorkspaceTimingsMetrics( // Not a trackable build type (e.g. restart, stop, subsequent builds) } } + +// ObserveJobQueueWait records the time a provisioner job spent waiting in the queue. +// For non-workspace-build jobs, transition and buildReason should be empty strings. +func (m *Metrics) ObserveJobQueueWait(provisionerType, jobType, transition, buildReason string, waitSeconds float64) { + m.jobQueueWait.WithLabelValues(provisionerType, jobType, transition, buildReason).Observe(waitSeconds) +} diff --git a/coderd/provisionerdserver/provisionerdserver.go b/coderd/provisionerdserver/provisionerdserver.go index 6b47ecb7bf..1f527a2998 100644 --- a/coderd/provisionerdserver/provisionerdserver.go +++ b/coderd/provisionerdserver/provisionerdserver.go @@ -478,6 +478,10 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo TraceMetadata: jobTraceMetadata, } + // jobTransition and jobBuildReason are used for metrics; only set for workspace builds. + var jobTransition string + var jobBuildReason string + switch job.Type { case database.ProvisionerJobTypeWorkspaceBuild: var input WorkspaceProvisionJob @@ -584,6 +588,15 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo if err != nil { return nil, failJob(fmt.Sprintf("convert workspace transition: %s", err)) } + jobTransition = string(workspaceBuild.Transition) + // Prebuilds use BuildReasonInitiator in the database but we want to + // track them separately in metrics. Check the initiator ID to detect + // prebuild jobs. + if job.InitiatorID == database.PrebuildsSystemUserID { + jobBuildReason = BuildReasonPrebuild + } else { + jobBuildReason = string(workspaceBuild.Reason) + } // A previous workspace build exists var lastWorkspaceBuildParameters []database.WorkspaceBuildParameter @@ -825,6 +838,12 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo return nil, failJob(fmt.Sprintf("payload was too big: %d > %d", protobuf.Size(protoJob), drpcsdk.MaxMessageSize)) } + // Record the time the job spent waiting in the queue. + if s.metrics != nil && job.StartedAt.Valid && job.Provisioner.Valid() { + queueWaitSeconds := job.StartedAt.Time.Sub(job.CreatedAt).Seconds() + s.metrics.ObserveJobQueueWait(string(job.Provisioner), string(job.Type), jobTransition, jobBuildReason, queueWaitSeconds) + } + return protoJob, err } diff --git a/coderd/workspacebuilds.go b/coderd/workspacebuilds.go index 482740f362..b9bc5d8d71 100644 --- a/coderd/workspacebuilds.go +++ b/coderd/workspacebuilds.go @@ -382,7 +382,8 @@ func (api *API) postWorkspaceBuildsInternal( LogLevel(string(createBuild.LogLevel)). DeploymentValues(api.Options.DeploymentValues). Experiments(api.Experiments). - TemplateVersionPresetID(createBuild.TemplateVersionPresetID) + TemplateVersionPresetID(createBuild.TemplateVersionPresetID). + BuildMetrics(api.WorkspaceBuilderMetrics) if (transition == database.WorkspaceTransitionStart || transition == database.WorkspaceTransitionStop) && createBuild.Reason != "" { builder = builder.Reason(database.BuildReason(createBuild.Reason)) diff --git a/coderd/workspaces.go b/coderd/workspaces.go index c0dc00ed80..c4461fefd0 100644 --- a/coderd/workspaces.go +++ b/coderd/workspaces.go @@ -787,7 +787,8 @@ func createWorkspace( ActiveVersion(). Experiments(api.Experiments). DeploymentValues(api.DeploymentValues). - RichParameterValues(req.RichParameterValues) + RichParameterValues(req.RichParameterValues). + BuildMetrics(api.WorkspaceBuilderMetrics) if req.TemplateVersionID != uuid.Nil { builder = builder.VersionID(req.TemplateVersionID) } diff --git a/coderd/workspaces_test.go b/coderd/workspaces_test.go index 7078770852..5a6626cdc3 100644 --- a/coderd/workspaces_test.go +++ b/coderd/workspaces_test.go @@ -14,6 +14,7 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -21,7 +22,9 @@ import ( "github.com/coder/coder/v2/agent/agenttest" "github.com/coder/coder/v2/coderd" "github.com/coder/coder/v2/coderd/audit" + "github.com/coder/coder/v2/coderd/autobuild" "github.com/coder/coder/v2/coderd/coderdtest" + "github.com/coder/coder/v2/coderd/coderdtest/promhelp" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbfake" @@ -30,6 +33,7 @@ import ( "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/notifications/notificationstest" + "github.com/coder/coder/v2/coderd/provisionerdserver" "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/rbac/policy" "github.com/coder/coder/v2/coderd/render" @@ -37,6 +41,7 @@ import ( "github.com/coder/coder/v2/coderd/schedule/cron" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/util/slice" + "github.com/coder/coder/v2/coderd/wsbuilder" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/cryptorand" "github.com/coder/coder/v2/provisioner/echo" @@ -5901,3 +5906,135 @@ func TestWorkspaceCreateWithImplicitPreset(t *testing.T) { require.Equal(t, preset2ID, *ws2.LatestBuild.TemplateVersionPresetID) }) } + +func TestProvisionerJobQueueWaitMetric(t *testing.T) { + t.Parallel() + + logger := testutil.Logger(t) + reg := prometheus.NewRegistry() + metrics := provisionerdserver.NewMetrics(logger) + err := metrics.Register(reg) + require.NoError(t, err) + + client := coderdtest.New(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + ProvisionerdServerMetrics: metrics, + }) + user := coderdtest.CreateFirstUser(t, client) + + // Create a template version - this triggers a template_version_import job. + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, nil) + coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID) + + // Check that the queue wait metric was recorded for the template_version_import job. + importMetric := promhelp.MetricValue(t, reg, "coderd_provisioner_job_queue_wait_seconds", prometheus.Labels{ + "provisioner_type": string(database.ProvisionerTypeEcho), + "job_type": string(database.ProvisionerJobTypeTemplateVersionImport), + "transition": "", + "build_reason": "", + }) + require.NotNil(t, importMetric, "import job metric should be recorded") + importHistogram := importMetric.GetHistogram() + require.NotNil(t, importHistogram) + require.Equal(t, uint64(1), importHistogram.GetSampleCount(), "import job should have 1 sample") + require.Greater(t, importHistogram.GetSampleSum(), 0.0, "import job queue wait should be non-zero") + + // Create a template and workspace - this triggers a workspace_build job. + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, template.ID) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + + // Check that the queue wait metric was recorded for the workspace_build job. + buildMetric := promhelp.MetricValue(t, reg, "coderd_provisioner_job_queue_wait_seconds", prometheus.Labels{ + "provisioner_type": string(database.ProvisionerTypeEcho), + "job_type": string(database.ProvisionerJobTypeWorkspaceBuild), + "transition": string(database.WorkspaceTransitionStart), + "build_reason": string(database.BuildReasonInitiator), + }) + require.NotNil(t, buildMetric, "workspace build job metric should be recorded") + buildHistogram := buildMetric.GetHistogram() + require.NotNil(t, buildHistogram) + require.Equal(t, uint64(1), buildHistogram.GetSampleCount(), "workspace build job should have 1 sample") + require.Greater(t, buildHistogram.GetSampleSum(), 0.0, "workspace build job queue wait should be non-zero") +} + +func TestWorkspaceBuildsEnqueuedMetric(t *testing.T) { + t.Parallel() + + var ( + logger = testutil.Logger(t) + reg = prometheus.NewRegistry() + metrics = provisionerdserver.NewMetrics(logger) + + sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *") + tickCh = make(chan time.Time) + statsCh = make(chan autobuild.Stats) + ) + + err := metrics.Register(reg) + require.NoError(t, err) + + wsBuilderMetrics, err := wsbuilder.NewMetrics(reg) + require.NoError(t, err) + + client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{ + IncludeProvisionerDaemon: true, + ProvisionerdServerMetrics: metrics, + WorkspaceBuilderMetrics: wsBuilderMetrics, + AutobuildTicker: tickCh, + AutobuildStats: statsCh, + }) + user := coderdtest.CreateFirstUser(t, client) + + // Create a template and workspace with autostart schedule. + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, nil) + coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.AutostartSchedule = ptr.Ref(sched.String()) + }) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + + // Stop the workspace to prepare for autostart. + workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop) + + // Trigger an autostart build via the autobuild ticker. This verifies that + // autostart builds are recorded with build_reason="autostart". + p, err := coderdtest.GetProvisionerForTags(db, time.Now(), workspace.OrganizationID, map[string]string{}) + require.NoError(t, err) + + go func() { + tickTime := sched.Next(workspace.LatestBuild.CreatedAt) + coderdtest.UpdateProvisionerLastSeenAt(t, db, p.ID, tickTime) + tickCh <- tickTime + close(tickCh) + }() + + // Wait for the autostart to complete. + stats := <-statsCh + require.Len(t, stats.Errors, 0) + require.Len(t, stats.Transitions, 1) + require.Contains(t, stats.Transitions, workspace.ID) + require.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID]) + + // Verify the workspace was autostarted. + workspace = coderdtest.MustWorkspace(t, client, workspace.ID) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID) + require.Equal(t, codersdk.BuildReasonAutostart, workspace.LatestBuild.Reason) + + // Now check the autostart metric was recorded. + autostartCount := promhelp.CounterValue(t, reg, "coderd_workspace_builds_enqueued_total", prometheus.Labels{ + "provisioner_type": string(database.ProvisionerTypeEcho), + "build_reason": string(database.BuildReasonAutostart), + "transition": string(database.WorkspaceTransitionStart), + "status": wsbuilder.BuildStatusSuccess, + }) + require.Equal(t, 1, autostartCount, "autostart should record 1 enqueue with build_reason=autostart") +} + +func mustSchedule(t *testing.T, s string) *cron.Schedule { + t.Helper() + sched, err := cron.Weekly(s) + require.NoError(t, err) + return sched +} diff --git a/coderd/wsbuilder/metrics.go b/coderd/wsbuilder/metrics.go new file mode 100644 index 0000000000..f3e0dedbc9 --- /dev/null +++ b/coderd/wsbuilder/metrics.go @@ -0,0 +1,42 @@ +package wsbuilder + +import "github.com/prometheus/client_golang/prometheus" + +// Metrics holds metrics related to workspace build creation. +type Metrics struct { + workspaceBuildsEnqueued *prometheus.CounterVec +} + +// Metric label values for build status. +const ( + BuildStatusSuccess = "success" + BuildStatusFailed = "failed" +) + +func NewMetrics(reg prometheus.Registerer) (*Metrics, error) { + m := &Metrics{ + workspaceBuildsEnqueued: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coderd", + Name: "workspace_builds_enqueued_total", + Help: "Total number of workspace build enqueue attempts.", + }, []string{"provisioner_type", "build_reason", "transition", "status"}), + } + + if reg != nil { + if err := reg.Register(m.workspaceBuildsEnqueued); err != nil { + return nil, err + } + } + + return m, nil +} + +// RecordBuildEnqueued records a workspace build enqueue attempt. It determines +// the status based on whether an error occurred and increments the counter. +func (m *Metrics) RecordBuildEnqueued(provisionerType, buildReason, transition string, err error) { + status := BuildStatusSuccess + if err != nil { + status = BuildStatusFailed + } + m.workspaceBuildsEnqueued.WithLabelValues(provisionerType, buildReason, transition, status).Inc() +} diff --git a/coderd/wsbuilder/wsbuilder.go b/coderd/wsbuilder/wsbuilder.go index 917030029a..a72127f1f0 100644 --- a/coderd/wsbuilder/wsbuilder.go +++ b/coderd/wsbuilder/wsbuilder.go @@ -90,6 +90,8 @@ type Builder struct { prebuiltWorkspaceBuildStage sdkproto.PrebuiltWorkspaceBuildStage verifyNoLegacyParametersOnce bool + + buildMetrics *Metrics } type UsageChecker interface { @@ -253,6 +255,12 @@ func (b Builder) TemplateVersionPresetID(id uuid.UUID) Builder { return b } +func (b Builder) BuildMetrics(m *Metrics) Builder { + // nolint: revive + b.buildMetrics = m + return b +} + type BuildError struct { // Status is a suitable HTTP status code Status int @@ -313,11 +321,34 @@ func (b *Builder) Build( return err }) if err != nil { + b.recordBuildMetrics(provisionerJob, err) return nil, nil, nil, xerrors.Errorf("build tx: %w", err) } + b.recordBuildMetrics(provisionerJob, nil) return workspaceBuild, provisionerJob, provisionerDaemons, nil } +// recordBuildMetrics records the workspace build enqueue metric if metrics are +// configured. It determines the appropriate build reason label, using "prebuild" +// for prebuild operations instead of the database reason. +func (b *Builder) recordBuildMetrics(job *database.ProvisionerJob, err error) { + if b.buildMetrics == nil { + return + } + if job == nil || !job.Provisioner.Valid() { + return + } + + // Determine the build reason for metrics. Prebuilds use BuildReasonInitiator + // in the database but we want to track them separately in metrics. + buildReason := string(b.reason) + if b.prebuiltWorkspaceBuildStage == sdkproto.PrebuiltWorkspaceBuildStage_CREATE { + buildReason = provisionerdserver.BuildReasonPrebuild + } + + b.buildMetrics.RecordBuildEnqueued(string(job.Provisioner), buildReason, string(b.trans), err) +} + // buildTx contains the business logic of computing a new build. Attributes of the new database objects are computed // in a functional style, rather than imperative, to emphasize the logic of how they are defined. A simple cache // of database-fetched objects is stored on the struct to ensure we only fetch things once, even if they are used in diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index 5c3822a349..ba71040c01 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -158,11 +158,13 @@ deployment. They will always be available from the agent. | `coderd_oauth2_external_requests_rate_limit_used` | gauge | The number of requests made in this interval. | `name` `resource` | | `coderd_oauth2_external_requests_total` | counter | The total number of api calls made to external oauth2 providers. 'status_code' will be 0 if the request failed with no response. | `name` `source` `status_code` | | `coderd_prebuilt_workspace_claim_duration_seconds` | histogram | Time to claim a prebuilt workspace by organization, template, and preset. | `organization_name` `preset_name` `template_name` | +| `coderd_provisioner_job_queue_wait_seconds` | histogram | Time from job creation to acquisition by a provisioner daemon. | `build_reason` `job_type` `provisioner_type` `transition` | | `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | | `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | | `coderd_provisionerd_num_daemons` | gauge | The number of provisioner daemons. | | | `coderd_provisionerd_workspace_build_timings_seconds` | histogram | The time taken for a workspace to build. | `status` `template_name` `template_version` `workspace_transition` | | `coderd_template_workspace_build_duration_seconds` | histogram | Duration from workspace build creation to agent ready, by template. | `is_prebuild` `organization_name` `status` `template_name` `transition` | +| `coderd_workspace_builds_enqueued_total` | counter | Total number of workspace build enqueue attempts. | `build_reason` `provisioner_type` `status` `transition` | | `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | | `coderd_workspace_creation_duration_seconds` | histogram | Time to create a workspace by organization, template, preset, and type (regular or prebuild). | `organization_name` `preset_name` `template_name` `type` | | `coderd_workspace_creation_total` | counter | Total regular (non-prebuilt) workspace creations by organization, template, and preset. | `organization_name` `preset_name` `template_name` | diff --git a/enterprise/cli/create_test.go b/enterprise/cli/create_test.go index f14e901e45..9c1fa8bcf6 100644 --- a/enterprise/cli/create_test.go +++ b/enterprise/cli/create_test.go @@ -370,6 +370,7 @@ func TestEnterpriseCreateWithPreset(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -483,6 +484,7 @@ func TestEnterpriseCreateWithPreset(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index f6e386a76e..6b66adacda 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -1331,6 +1331,7 @@ func (api *API) setupPrebuilds(featureEnabled bool) (agplprebuilds.Reconciliatio api.AGPL.BuildUsageChecker, api.TracerProvider, int(api.DeploymentValues.PostgresConnMaxOpen.Value()), + api.AGPL.WorkspaceBuilderMetrics, ) return reconciler, prebuilds.NewEnterpriseClaimer() } diff --git a/enterprise/coderd/prebuilds/claim_test.go b/enterprise/coderd/prebuilds/claim_test.go index aa711f14e0..e58913ed40 100644 --- a/enterprise/coderd/prebuilds/claim_test.go +++ b/enterprise/coderd/prebuilds/claim_test.go @@ -174,6 +174,7 @@ func TestClaimPrebuild(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go index 2ea9667076..606995e1a1 100644 --- a/enterprise/coderd/prebuilds/metricscollector_test.go +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -204,6 +204,7 @@ func TestMetricsCollector(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) @@ -344,6 +345,7 @@ func TestMetricsCollector_DuplicateTemplateNames(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) @@ -500,6 +502,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) @@ -537,6 +540,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) @@ -574,6 +578,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ctx := testutil.Context(t, testutil.WaitLong) diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index b41b5541fc..9e8ce20887 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -65,7 +65,8 @@ type StoreReconciler struct { // Prebuild state metrics metrics *MetricsCollector // Operational metrics - reconciliationDuration prometheus.Histogram + reconciliationDuration prometheus.Histogram + workspaceBuilderMetrics *wsbuilder.Metrics } var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{} @@ -99,6 +100,7 @@ func NewStoreReconciler(store database.Store, buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker], tracerProvider trace.TracerProvider, maxDBConnections int, + workspaceBuilderMetrics *wsbuilder.Metrics, ) *StoreReconciler { reconciliationConcurrency := calculateReconciliationConcurrency(maxDBConnections) @@ -120,6 +122,7 @@ func NewStoreReconciler(store database.Store, done: make(chan struct{}, 1), provisionNotifyCh: make(chan database.ProvisionerJob, 10), reconciliationConcurrency: reconciliationConcurrency, + workspaceBuilderMetrics: workspaceBuilderMetrics, } if registerer != nil { @@ -1052,7 +1055,8 @@ func (c *StoreReconciler) provision( builder := wsbuilder.New(workspace, transition, *c.buildUsageChecker.Load()). Reason(database.BuildReasonInitiator). Initiator(database.PrebuildsSystemUserID). - MarkPrebuild() + MarkPrebuild(). + BuildMetrics(c.workspaceBuilderMetrics) if transition != database.WorkspaceTransitionDelete { // We don't specify the version for a delete transition, diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index d0d40666ac..d85760373a 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -61,6 +61,7 @@ func TestNoReconciliationActionsIfNoPresets(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // given a template version with no presets @@ -112,6 +113,7 @@ func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // given there are presets, but no prebuilds @@ -450,6 +452,7 @@ func (tc testCase) run(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Run the reconciliation multiple times to ensure idempotency @@ -527,6 +530,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -658,6 +662,7 @@ func TestPrebuildScheduling(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -767,6 +772,7 @@ func TestInvalidPreset(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -837,6 +843,7 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -939,6 +946,7 @@ func TestSkippingHardLimitedPresets(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Set up test environment with a template, version, and preset. @@ -1090,6 +1098,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Set up test environment with a template, version, and preset. @@ -1291,6 +1300,7 @@ func TestRunLoop(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) ownerID := uuid.New() @@ -1423,6 +1433,7 @@ func TestReconcilerLifecycle(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // When: the reconciler is stopped (simulating the prebuilds feature being disabled) @@ -1438,6 +1449,7 @@ func TestReconcilerLifecycle(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Gracefully stop the reconciliation loop @@ -1471,6 +1483,7 @@ func TestFailedBuildBackoff(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Given: an active template version with presets and prebuilds configured. @@ -1595,6 +1608,7 @@ func TestReconciliationLock(t *testing.T) { newNoopEnqueuer(), newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) reconciler.WithReconciliationLock(ctx, logger, func(_ context.Context, _ database.Store) error { lockObtained := mutex.TryLock() @@ -1633,6 +1647,7 @@ func TestTrackResourceReplacement(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Given: a template admin to receive a notification. @@ -1793,6 +1808,7 @@ func TestExpiredPrebuildsMultipleActions(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Set up test environment with a template, version, and preset @@ -2258,6 +2274,7 @@ func TestCancelPendingPrebuilds(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) owner := coderdtest.CreateFirstUser(t, client) @@ -2503,6 +2520,7 @@ func TestCancelPendingPrebuilds(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) owner := coderdtest.CreateFirstUser(t, client) @@ -2576,6 +2594,7 @@ func TestReconciliationStats(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) owner := coderdtest.CreateFirstUser(t, client) @@ -3066,6 +3085,7 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), 10, + nil, ) // Setup a template with a preset that should create prebuilds @@ -3172,6 +3192,7 @@ func BenchmarkReconcileAll_NoOps(b *testing.B) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), maxOpenConns, + nil, ) org := dbgen.Organization(b, db, database.Organization{}) @@ -3283,6 +3304,7 @@ func BenchmarkReconcileAll_ConnectionContention(b *testing.B) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), maxOpenConns, + nil, ) // Create presets from active template versions that need reconciliation actions @@ -3402,6 +3424,7 @@ func BenchmarkReconcileAll_Mix(b *testing.B) { newNoopUsageCheckerPtr(), noop.NewTracerProvider(), maxOpenConns, + nil, ) org := dbgen.Organization(b, db, database.Organization{}) diff --git a/enterprise/coderd/workspaces_test.go b/enterprise/coderd/workspaces_test.go index 59b390fb1b..49d5694a5c 100644 --- a/enterprise/coderd/workspaces_test.go +++ b/enterprise/coderd/workspaces_test.go @@ -1991,6 +1991,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2115,6 +2116,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2239,6 +2241,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2385,6 +2388,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2532,6 +2536,7 @@ func TestPrebuildsAutobuild(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -2979,6 +2984,7 @@ func TestWorkspaceProvisionerdServerMetrics(t *testing.T) { api.AGPL.BuildUsageChecker, noop.NewTracerProvider(), 10, + nil, ) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer() api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 2788e4800d..bae5f6eda4 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -990,3 +990,13 @@ coderd_license_warnings 0 # HELP coderd_license_errors The number of active license errors. # TYPE coderd_license_errors gauge coderd_license_errors 0 +# HELP coderd_provisioner_job_queue_wait_seconds Time from job creation to acquisition by a provisioner daemon. +# TYPE coderd_provisioner_job_queue_wait_seconds histogram +coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="initiator",job_type="workspace_build",provisioner_type="terraform",transition="stop",le="300"} 1 +coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="initiator",job_type="workspace_build",provisioner_type="terraform",transition="stop",le="+Inf"} 1 +coderd_provisioner_job_queue_wait_seconds_sum{build_reason="initiator",job_type="workspace_build",provisioner_type="terraform",transition="stop"} 0.01191 +coderd_provisioner_job_queue_wait_seconds_count{build_reason="initiator",job_type="workspace_build",provisioner_type="terraform",transition="stop"} 1 +# HELP coderd_workspace_builds_enqueued_total Total number of workspace build enqueue attempts. +# TYPE coderd_workspace_builds_enqueued_total counter +coderd_workspace_builds_enqueued_total{build_reason="dashboard",provisioner_type="terraform",status="success",transition="start"} 1 +coderd_workspace_builds_enqueued_total{build_reason="initiator",provisioner_type="terraform",status="success",transition="stop"} 1