diff --git a/coderd/agentapi/api.go b/coderd/agentapi/api.go index 9dc3811796..0c343a4e9d 100644 --- a/coderd/agentapi/api.go +++ b/coderd/agentapi/api.go @@ -89,6 +89,7 @@ type Options struct { PublishWorkspaceAgentLogsUpdateFn func(ctx context.Context, workspaceAgentID uuid.UUID, msg agentsdk.LogsNotifyMessage) NetworkTelemetryHandler func(batch []*tailnetproto.TelemetryEvent) BoundaryUsageTracker *boundaryusage.Tracker + LifecycleMetrics *LifecycleMetrics AccessURL *url.URL AppHostname string @@ -170,6 +171,7 @@ func New(opts Options, workspace database.Workspace) *API { Database: opts.Database, Log: opts.Log, PublishWorkspaceUpdateFn: api.publishWorkspaceUpdate, + Metrics: opts.LifecycleMetrics, } api.AppsAPI = &AppsAPI{ diff --git a/coderd/agentapi/lifecycle.go b/coderd/agentapi/lifecycle.go index 06d3097187..61d9d5e37c 100644 --- a/coderd/agentapi/lifecycle.go +++ b/coderd/agentapi/lifecycle.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "slices" + "sync" "time" "github.com/google/uuid" @@ -31,7 +32,9 @@ type LifecycleAPI struct { Log slog.Logger PublishWorkspaceUpdateFn func(context.Context, *database.WorkspaceAgent, wspubsub.WorkspaceEventKind) error - TimeNowFn func() time.Time // defaults to dbtime.Now() + TimeNowFn func() time.Time // defaults to dbtime.Now() + Metrics *LifecycleMetrics + emitMetricsOnce sync.Once } func (a *LifecycleAPI) now() time.Time { @@ -125,6 +128,17 @@ func (a *LifecycleAPI) UpdateLifecycle(ctx context.Context, req *agentproto.Upda } } + // Emit build duration metric when agent transitions to a terminal startup state. + // We only emit once per agent connection to avoid duplicate metrics. 
+ switch lifecycleState { + case database.WorkspaceAgentLifecycleStateReady, + database.WorkspaceAgentLifecycleStateStartTimeout, + database.WorkspaceAgentLifecycleStateStartError: + a.emitMetricsOnce.Do(func() { + a.emitBuildDurationMetric(ctx, workspaceAgent.ResourceID) + }) + } + return req.Lifecycle, nil } diff --git a/coderd/agentapi/lifecycle_test.go b/coderd/agentapi/lifecycle_test.go index f9962dd79c..2457af8a22 100644 --- a/coderd/agentapi/lifecycle_test.go +++ b/coderd/agentapi/lifecycle_test.go @@ -9,12 +9,14 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" "go.uber.org/mock/gomock" "google.golang.org/protobuf/types/known/timestamppb" agentproto "github.com/coder/coder/v2/agent/proto" "github.com/coder/coder/v2/coderd/agentapi" + "github.com/coder/coder/v2/coderd/coderdtest/promhelp" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbmock" "github.com/coder/coder/v2/coderd/database/dbtime" @@ -22,6 +24,10 @@ import ( "github.com/coder/coder/v2/testutil" ) +// fullMetricName is the fully-qualified Prometheus metric name +// (namespace + name) used for gathering in tests. +const fullMetricName = "coderd_" + agentapi.BuildDurationMetricName + func TestUpdateLifecycle(t *testing.T) { t.Parallel() @@ -30,6 +36,12 @@ func TestUpdateLifecycle(t *testing.T) { someTime = dbtime.Time(someTime) now := dbtime.Now() + // Fixed times for build duration metric assertions. + // The expected duration is exactly 90 seconds. 
+ buildCreatedAt := dbtime.Time(time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)) + agentReadyAt := dbtime.Time(time.Date(2025, 1, 1, 0, 1, 30, 0, time.UTC)) + expectedDuration := agentReadyAt.Sub(buildCreatedAt).Seconds() // 90.0 + var ( workspaceID = uuid.New() agentCreated = database.WorkspaceAgent{ @@ -105,6 +117,19 @@ func TestUpdateLifecycle(t *testing.T) { Valid: true, }, }).Return(nil) + dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{ + CreatedAt: buildCreatedAt, + Transition: database.WorkspaceTransitionStart, + TemplateName: "test-template", + OrganizationName: "test-org", + IsPrebuild: false, + AllAgentsReady: true, + LastAgentReadyAt: agentReadyAt, + WorstStatus: "success", + }, nil) + + reg := prometheus.NewRegistry() + metrics := agentapi.NewLifecycleMetrics(reg) api := &agentapi.LifecycleAPI{ AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) { @@ -113,6 +138,7 @@ func TestUpdateLifecycle(t *testing.T) { WorkspaceID: workspaceID, Database: dbM, Log: testutil.Logger(t), + Metrics: metrics, // Test that nil publish fn works. PublishWorkspaceUpdateFn: nil, } @@ -122,6 +148,16 @@ func TestUpdateLifecycle(t *testing.T) { }) require.NoError(t, err) require.Equal(t, lifecycle, resp) + + got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{ + "template_name": "test-template", + "organization_name": "test-org", + "transition": "start", + "status": "success", + "is_prebuild": "false", + }) + require.Equal(t, uint64(1), got.GetSampleCount()) + require.Equal(t, expectedDuration, got.GetSampleSum()) }) // This test jumps from CREATING to READY, skipping STARTED. 
Both the @@ -147,8 +183,21 @@ func TestUpdateLifecycle(t *testing.T) { Valid: true, }, }).Return(nil) + dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentCreated.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{ + CreatedAt: buildCreatedAt, + Transition: database.WorkspaceTransitionStart, + TemplateName: "test-template", + OrganizationName: "test-org", + IsPrebuild: false, + AllAgentsReady: true, + LastAgentReadyAt: agentReadyAt, + WorstStatus: "success", + }, nil) publishCalled := false + reg := prometheus.NewRegistry() + metrics := agentapi.NewLifecycleMetrics(reg) + api := &agentapi.LifecycleAPI{ AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) { return agentCreated, nil @@ -156,6 +205,7 @@ func TestUpdateLifecycle(t *testing.T) { WorkspaceID: workspaceID, Database: dbM, Log: testutil.Logger(t), + Metrics: metrics, PublishWorkspaceUpdateFn: func(ctx context.Context, agent *database.WorkspaceAgent, kind wspubsub.WorkspaceEventKind) error { publishCalled = true return nil @@ -168,6 +218,16 @@ func TestUpdateLifecycle(t *testing.T) { require.NoError(t, err) require.Equal(t, lifecycle, resp) require.True(t, publishCalled) + + got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{ + "template_name": "test-template", + "organization_name": "test-org", + "transition": "start", + "status": "success", + "is_prebuild": "false", + }) + require.Equal(t, uint64(1), got.GetSampleCount()) + require.Equal(t, expectedDuration, got.GetSampleSum()) }) t.Run("NoTimeSpecified", func(t *testing.T) { @@ -194,6 +254,19 @@ func TestUpdateLifecycle(t *testing.T) { Valid: true, }, }) + dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentCreated.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{ + CreatedAt: buildCreatedAt, + Transition: database.WorkspaceTransitionStart, + TemplateName: "test-template", + OrganizationName: "test-org", + IsPrebuild: false, + AllAgentsReady: true, 
+ LastAgentReadyAt: agentReadyAt, + WorstStatus: "success", + }, nil) + + reg := prometheus.NewRegistry() + metrics := agentapi.NewLifecycleMetrics(reg) api := &agentapi.LifecycleAPI{ AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) { @@ -202,6 +275,7 @@ func TestUpdateLifecycle(t *testing.T) { WorkspaceID: workspaceID, Database: dbM, Log: testutil.Logger(t), + Metrics: metrics, PublishWorkspaceUpdateFn: nil, TimeNowFn: func() time.Time { return now @@ -213,6 +287,16 @@ func TestUpdateLifecycle(t *testing.T) { }) require.NoError(t, err) require.Equal(t, lifecycle, resp) + + got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{ + "template_name": "test-template", + "organization_name": "test-org", + "transition": "start", + "status": "success", + "is_prebuild": "false", + }) + require.Equal(t, uint64(1), got.GetSampleCount()) + require.Equal(t, expectedDuration, got.GetSampleSum()) }) t.Run("AllStates", func(t *testing.T) { @@ -228,6 +312,9 @@ func TestUpdateLifecycle(t *testing.T) { dbM := dbmock.NewMockStore(gomock.NewController(t)) var publishCalled int64 + reg := prometheus.NewRegistry() + metrics := agentapi.NewLifecycleMetrics(reg) + api := &agentapi.LifecycleAPI{ AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) { return agent, nil @@ -235,6 +322,7 @@ func TestUpdateLifecycle(t *testing.T) { WorkspaceID: workspaceID, Database: dbM, Log: testutil.Logger(t), + Metrics: metrics, PublishWorkspaceUpdateFn: func(ctx context.Context, agent *database.WorkspaceAgent, kind wspubsub.WorkspaceEventKind) error { atomic.AddInt64(&publishCalled, 1) return nil @@ -277,6 +365,20 @@ func TestUpdateLifecycle(t *testing.T) { ReadyAt: expectedReadyAt, }).Times(1).Return(nil) + // The first ready state triggers the build duration metric query. 
+ if state == agentproto.Lifecycle_READY || state == agentproto.Lifecycle_START_TIMEOUT || state == agentproto.Lifecycle_START_ERROR { + dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agent.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{ + CreatedAt: someTime, + Transition: database.WorkspaceTransitionStart, + TemplateName: "test-template", + OrganizationName: "test-org", + IsPrebuild: false, + AllAgentsReady: true, + LastAgentReadyAt: stateNow, + WorstStatus: "success", + }, nil).MaxTimes(1) + } + resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{ Lifecycle: lifecycle, }) @@ -322,6 +424,164 @@ func TestUpdateLifecycle(t *testing.T) { require.Nil(t, resp) require.False(t, publishCalled) }) + + // Test that metric is NOT emitted when not all agents are ready (multi-agent case). + t.Run("MetricNotEmittedWhenNotAllAgentsReady", func(t *testing.T) { + t.Parallel() + + lifecycle := &agentproto.Lifecycle{ + State: agentproto.Lifecycle_READY, + ChangedAt: timestamppb.New(now), + } + + dbM := dbmock.NewMockStore(gomock.NewController(t)) + dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil) + // Return AllAgentsReady = false to simulate multi-agent case where not all are ready. 
+ dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{ + CreatedAt: someTime, + Transition: database.WorkspaceTransitionStart, + TemplateName: "test-template", + OrganizationName: "test-org", + IsPrebuild: false, + AllAgentsReady: false, // Not all agents ready yet + LastAgentReadyAt: time.Time{}, // No ready time yet + WorstStatus: "success", + }, nil) + + reg := prometheus.NewRegistry() + metrics := agentapi.NewLifecycleMetrics(reg) + + api := &agentapi.LifecycleAPI{ + AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) { + return agentStarting, nil + }, + WorkspaceID: workspaceID, + Database: dbM, + Log: testutil.Logger(t), + Metrics: metrics, + PublishWorkspaceUpdateFn: nil, + } + + resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{ + Lifecycle: lifecycle, + }) + require.NoError(t, err) + require.Equal(t, lifecycle, resp) + + require.Nil(t, promhelp.MetricValue(t, reg, fullMetricName, prometheus.Labels{ + "template_name": "test-template", + "organization_name": "test-org", + "transition": "start", + "status": "success", + "is_prebuild": "false", + }), "metric should not be emitted when not all agents are ready") + }) + + // Test that prebuild label is "true" when owner is prebuild system user. 
+ t.Run("PrebuildLabelTrue", func(t *testing.T) { + t.Parallel() + + lifecycle := &agentproto.Lifecycle{ + State: agentproto.Lifecycle_READY, + ChangedAt: timestamppb.New(now), + } + + dbM := dbmock.NewMockStore(gomock.NewController(t)) + dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil) + dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{ + CreatedAt: buildCreatedAt, + Transition: database.WorkspaceTransitionStart, + TemplateName: "test-template", + OrganizationName: "test-org", + IsPrebuild: true, // Prebuild workspace + AllAgentsReady: true, + LastAgentReadyAt: agentReadyAt, + WorstStatus: "success", + }, nil) + + reg := prometheus.NewRegistry() + metrics := agentapi.NewLifecycleMetrics(reg) + + api := &agentapi.LifecycleAPI{ + AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) { + return agentStarting, nil + }, + WorkspaceID: workspaceID, + Database: dbM, + Log: testutil.Logger(t), + Metrics: metrics, + PublishWorkspaceUpdateFn: nil, + } + + resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{ + Lifecycle: lifecycle, + }) + require.NoError(t, err) + require.Equal(t, lifecycle, resp) + + got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{ + "template_name": "test-template", + "organization_name": "test-org", + "transition": "start", + "status": "success", + "is_prebuild": "true", + }) + require.Equal(t, uint64(1), got.GetSampleCount()) + require.Equal(t, expectedDuration, got.GetSampleSum()) + }) + + // Test worst status is used when one agent has an error. 
+ t.Run("WorstStatusError", func(t *testing.T) { + t.Parallel() + + lifecycle := &agentproto.Lifecycle{ + State: agentproto.Lifecycle_READY, + ChangedAt: timestamppb.New(now), + } + + dbM := dbmock.NewMockStore(gomock.NewController(t)) + dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil) + dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{ + CreatedAt: buildCreatedAt, + Transition: database.WorkspaceTransitionStart, + TemplateName: "test-template", + OrganizationName: "test-org", + IsPrebuild: false, + AllAgentsReady: true, + LastAgentReadyAt: agentReadyAt, + WorstStatus: "error", // One agent had an error + }, nil) + + reg := prometheus.NewRegistry() + metrics := agentapi.NewLifecycleMetrics(reg) + + api := &agentapi.LifecycleAPI{ + AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) { + return agentStarting, nil + }, + WorkspaceID: workspaceID, + Database: dbM, + Log: testutil.Logger(t), + Metrics: metrics, + PublishWorkspaceUpdateFn: nil, + } + + resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{ + Lifecycle: lifecycle, + }) + require.NoError(t, err) + require.Equal(t, lifecycle, resp) + + got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{ + "template_name": "test-template", + "organization_name": "test-org", + "transition": "start", + "status": "error", + "is_prebuild": "false", + }) + require.Equal(t, uint64(1), got.GetSampleCount()) + require.Equal(t, expectedDuration, got.GetSampleSum()) + }) } func TestUpdateStartup(t *testing.T) { diff --git a/coderd/agentapi/metrics.go b/coderd/agentapi/metrics.go new file mode 100644 index 0000000000..16dba69dec --- /dev/null +++ b/coderd/agentapi/metrics.go @@ -0,0 +1,97 @@ +package agentapi + +import ( + "context" + "strconv" + "time" + + "github.com/google/uuid" + 
"github.com/prometheus/client_golang/prometheus" + + "cdr.dev/slog/v3" +) + +// BuildDurationMetricName is the short name for the end-to-end +// workspace build duration histogram. The full metric name is +// prefixed with the namespace "coderd_". +const BuildDurationMetricName = "template_workspace_build_duration_seconds" + +// LifecycleMetrics contains Prometheus metrics for the lifecycle API. +type LifecycleMetrics struct { + BuildDuration *prometheus.HistogramVec +} + +// NewLifecycleMetrics creates and registers all lifecycle-related +// Prometheus metrics. +// +// The build duration histogram tracks the end-to-end duration from +// workspace build creation to agent ready, by template. It is +// recorded by the coderd replica handling the agent's connection +// when the last agent reports ready. In multi-replica deployments, +// each replica only has observations for agents it handles. +// +// The "is_prebuild" label distinguishes prebuild creation (background, +// no user waiting) from user-initiated builds (regular workspace +// creation or prebuild claims). +func NewLifecycleMetrics(reg prometheus.Registerer) *LifecycleMetrics { + m := &LifecycleMetrics{ + BuildDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Name: BuildDurationMetricName, + Help: "Duration from workspace build creation to agent ready, by template.", + Buckets: []float64{ + 1, // 1s + 10, + 30, + 60, // 1min + 60 * 5, + 60 * 10, + 60 * 30, // 30min + 60 * 60, // 1hr + }, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, + }, []string{"template_name", "organization_name", "transition", "status", "is_prebuild"}), + } + reg.MustRegister(m.BuildDuration) + return m +} + +// emitBuildDurationMetric records the end-to-end workspace build +// duration from build creation to when all agents are ready. 
+func (a *LifecycleAPI) emitBuildDurationMetric(ctx context.Context, resourceID uuid.UUID) { + if a.Metrics == nil { + return + } + + buildInfo, err := a.Database.GetWorkspaceBuildMetricsByResourceID(ctx, resourceID) + if err != nil { + a.Log.Warn(ctx, "failed to get build info for metrics", slog.Error(err)) + return + } + + // Wait until all agents have reached a terminal startup state. + if !buildInfo.AllAgentsReady { + return + } + + // LastAgentReadyAt is the MAX(ready_at) across all agents. Since + // we only get here when AllAgentsReady is true, this should always + // be valid. + if buildInfo.LastAgentReadyAt.IsZero() { + a.Log.Warn(ctx, "last_agent_ready_at is unexpectedly zero", + slog.F("last_agent_ready_at", buildInfo.LastAgentReadyAt)) + return + } + + duration := buildInfo.LastAgentReadyAt.Sub(buildInfo.CreatedAt).Seconds() + + a.Metrics.BuildDuration.WithLabelValues( + buildInfo.TemplateName, + buildInfo.OrganizationName, + string(buildInfo.Transition), + buildInfo.WorstStatus, + strconv.FormatBool(buildInfo.IsPrebuild), + ).Observe(duration) +} diff --git a/coderd/coderd.go b/coderd/coderd.go index 5e39c302c7..942e785d36 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -42,6 +42,7 @@ import ( "cdr.dev/slog/v3" agentproto "github.com/coder/coder/v2/agent/proto" "github.com/coder/coder/v2/buildinfo" + "github.com/coder/coder/v2/coderd/agentapi" "github.com/coder/coder/v2/coderd/agentapi/metadatabatcher" _ "github.com/coder/coder/v2/coderd/apidoc" // Used for swagger docs. 
"github.com/coder/coder/v2/coderd/appearance" @@ -754,6 +755,7 @@ func New(options *Options) *API { api.agentProvider = stn if options.DeploymentValues.Prometheus.Enable { options.PrometheusRegistry.MustRegister(stn) + api.lifecycleMetrics = agentapi.NewLifecycleMetrics(options.PrometheusRegistry) } api.NetworkTelemetryBatcher = tailnet.NewNetworkTelemetryBatcher( quartz.NewReal(), @@ -1888,8 +1890,9 @@ type API struct { healthCheckCache atomic.Pointer[healthsdk.HealthcheckReport] healthCheckProgress healthcheck.Progress - statsReporter *workspacestats.Reporter - metadataBatcher *metadatabatcher.Batcher + statsReporter *workspacestats.Reporter + metadataBatcher *metadatabatcher.Batcher + lifecycleMetrics *agentapi.LifecycleMetrics Acquirer *provisionerdserver.Acquirer // dbRolluper rolls up template usage stats from raw agent and app diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 5695123a98..629f249534 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -3886,6 +3886,14 @@ func (q *querier) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Conte return q.db.GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx, arg) } +func (q *querier) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) { + // Verify access to the resource first. + if _, err := q.GetWorkspaceResourceByID(ctx, id); err != nil { + return database.GetWorkspaceBuildMetricsByResourceIDRow{}, err + } + return q.db.GetWorkspaceBuildMetricsByResourceID(ctx, id) +} + func (q *querier) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) { // Authorized call to get the workspace build. If we can read the build, // we can read the params. 
diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index f443f41b6f..1fb1c81f7a 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -2036,6 +2036,18 @@ func (s *MethodTestSuite) TestWorkspace() { dbm.EXPECT().GetWorkspaceByID(gomock.Any(), build.WorkspaceID).Return(ws, nil).AnyTimes() check.Args(res.ID).Asserts(ws, policy.ActionRead).Returns(res) })) + s.Run("GetWorkspaceBuildMetricsByResourceID", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) { + ws := testutil.Fake(s.T(), faker, database.Workspace{}) + build := testutil.Fake(s.T(), faker, database.WorkspaceBuild{WorkspaceID: ws.ID}) + job := testutil.Fake(s.T(), faker, database.ProvisionerJob{ID: build.JobID, Type: database.ProvisionerJobTypeWorkspaceBuild}) + res := testutil.Fake(s.T(), faker, database.WorkspaceResource{JobID: build.JobID}) + dbm.EXPECT().GetWorkspaceResourceByID(gomock.Any(), res.ID).Return(res, nil).AnyTimes() + dbm.EXPECT().GetProvisionerJobByID(gomock.Any(), res.JobID).Return(job, nil).AnyTimes() + dbm.EXPECT().GetWorkspaceBuildByJobID(gomock.Any(), res.JobID).Return(build, nil).AnyTimes() + dbm.EXPECT().GetWorkspaceByID(gomock.Any(), build.WorkspaceID).Return(ws, nil).AnyTimes() + dbm.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), res.ID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{}, nil).AnyTimes() + check.Args(res.ID).Asserts(ws, policy.ActionRead).Returns(database.GetWorkspaceBuildMetricsByResourceIDRow{}) + })) s.Run("Build/GetWorkspaceResourcesByJobID", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) { ws := testutil.Fake(s.T(), faker, database.Workspace{}) build := testutil.Fake(s.T(), faker, database.WorkspaceBuild{WorkspaceID: ws.ID}) diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 8c7eab5246..00bddf3688 100644 --- 
a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -2406,6 +2406,14 @@ func (m queryMetricsStore) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx cont return r0, r1 } +func (m queryMetricsStore) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) { + start := time.Now() + r0, r1 := m.s.GetWorkspaceBuildMetricsByResourceID(ctx, id) + m.queryLatencies.WithLabelValues("GetWorkspaceBuildMetricsByResourceID").Observe(time.Since(start).Seconds()) + m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "GetWorkspaceBuildMetricsByResourceID").Inc() + return r0, r1 +} + func (m queryMetricsStore) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) { start := time.Now() r0, r1 := m.s.GetWorkspaceBuildParameters(ctx, workspaceBuildID) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 774a7b36cc..15d0ac6cea 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -4499,6 +4499,21 @@ func (mr *MockStoreMockRecorder) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ct return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspaceBuildByWorkspaceIDAndBuildNumber", reflect.TypeOf((*MockStore)(nil).GetWorkspaceBuildByWorkspaceIDAndBuildNumber), ctx, arg) } +// GetWorkspaceBuildMetricsByResourceID mocks base method. +func (m *MockStore) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetWorkspaceBuildMetricsByResourceID", ctx, id) + ret0, _ := ret[0].(database.GetWorkspaceBuildMetricsByResourceIDRow) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetWorkspaceBuildMetricsByResourceID indicates an expected call of GetWorkspaceBuildMetricsByResourceID. 
+func (mr *MockStoreMockRecorder) GetWorkspaceBuildMetricsByResourceID(ctx, id any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspaceBuildMetricsByResourceID", reflect.TypeOf((*MockStore)(nil).GetWorkspaceBuildMetricsByResourceID), ctx, id) +} + // GetWorkspaceBuildParameters mocks base method. func (m *MockStore) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 91b440fb09..4ecc0c5017 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -501,6 +501,9 @@ type sqlcQuerier interface { GetWorkspaceBuildByID(ctx context.Context, id uuid.UUID) (WorkspaceBuild, error) GetWorkspaceBuildByJobID(ctx context.Context, jobID uuid.UUID) (WorkspaceBuild, error) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Context, arg GetWorkspaceBuildByWorkspaceIDAndBuildNumberParams) (WorkspaceBuild, error) + // Returns build metadata for e2e workspace build duration metrics. + // Also checks if all agents are ready and returns the worst status. 
+ GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (GetWorkspaceBuildMetricsByResourceIDRow, error) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]WorkspaceBuildParameter, error) GetWorkspaceBuildParametersByBuildIDs(ctx context.Context, workspaceBuildIds []uuid.UUID) ([]WorkspaceBuildParameter, error) GetWorkspaceBuildStatsByTemplates(ctx context.Context, since time.Time) ([]GetWorkspaceBuildStatsByTemplatesRow, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 4837cf5d8e..99cd7eae5d 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -21344,6 +21344,62 @@ func (q *sqlQuerier) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Co return i, err } +const getWorkspaceBuildMetricsByResourceID = `-- name: GetWorkspaceBuildMetricsByResourceID :one +SELECT + wb.created_at, + wb.transition, + t.name AS template_name, + o.name AS organization_name, + (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0') AS is_prebuild, + -- All agents must have ready_at set (terminal startup state) + COUNT(*) FILTER (WHERE wa.ready_at IS NULL) = 0 AS all_agents_ready, + -- Latest ready_at across all agents (for duration calculation) + MAX(wa.ready_at)::timestamptz AS last_agent_ready_at, + -- Worst status: error > timeout > ready + CASE + WHEN bool_or(wa.lifecycle_state = 'start_error') THEN 'error' + WHEN bool_or(wa.lifecycle_state = 'start_timeout') THEN 'timeout' + ELSE 'success' + END AS worst_status +FROM workspace_builds wb +JOIN workspaces w ON wb.workspace_id = w.id +JOIN templates t ON w.template_id = t.id +JOIN organizations o ON t.organization_id = o.id +JOIN workspace_resources wr ON wr.job_id = wb.job_id +JOIN workspace_agents wa ON wa.resource_id = wr.id +WHERE wb.job_id = (SELECT job_id FROM workspace_resources WHERE workspace_resources.id = $1) +GROUP BY wb.created_at, wb.transition, t.name, o.name, w.owner_id +` + +type 
GetWorkspaceBuildMetricsByResourceIDRow struct { + CreatedAt time.Time `db:"created_at" json:"created_at"` + Transition WorkspaceTransition `db:"transition" json:"transition"` + TemplateName string `db:"template_name" json:"template_name"` + OrganizationName string `db:"organization_name" json:"organization_name"` + IsPrebuild bool `db:"is_prebuild" json:"is_prebuild"` + AllAgentsReady bool `db:"all_agents_ready" json:"all_agents_ready"` + LastAgentReadyAt time.Time `db:"last_agent_ready_at" json:"last_agent_ready_at"` + WorstStatus string `db:"worst_status" json:"worst_status"` +} + +// Returns build metadata for e2e workspace build duration metrics. +// Also checks if all agents are ready and returns the worst status. +func (q *sqlQuerier) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (GetWorkspaceBuildMetricsByResourceIDRow, error) { + row := q.db.QueryRowContext(ctx, getWorkspaceBuildMetricsByResourceID, id) + var i GetWorkspaceBuildMetricsByResourceIDRow + err := row.Scan( + &i.CreatedAt, + &i.Transition, + &i.TemplateName, + &i.OrganizationName, + &i.IsPrebuild, + &i.AllAgentsReady, + &i.LastAgentReadyAt, + &i.WorstStatus, + ) + return i, err +} + const getWorkspaceBuildStatsByTemplates = `-- name: GetWorkspaceBuildStatsByTemplates :many SELECT w.template_id, diff --git a/coderd/database/queries/workspacebuilds.sql b/coderd/database/queries/workspacebuilds.sql index cf13b30758..aee50f7edf 100644 --- a/coderd/database/queries/workspacebuilds.sql +++ b/coderd/database/queries/workspacebuilds.sql @@ -243,3 +243,31 @@ SET has_external_agent = @has_external_agent, updated_at = @updated_at::timestamptz WHERE id = @id::uuid; + +-- name: GetWorkspaceBuildMetricsByResourceID :one +-- Returns build metadata for e2e workspace build duration metrics. +-- Also checks if all agents are ready and returns the worst status. 
+SELECT + wb.created_at, + wb.transition, + t.name AS template_name, + o.name AS organization_name, + (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0') AS is_prebuild, + -- All agents must have ready_at set (terminal startup state) + COUNT(*) FILTER (WHERE wa.ready_at IS NULL) = 0 AS all_agents_ready, + -- Latest ready_at across all agents (for duration calculation) + MAX(wa.ready_at)::timestamptz AS last_agent_ready_at, + -- Worst status: error > timeout > ready + CASE + WHEN bool_or(wa.lifecycle_state = 'start_error') THEN 'error' + WHEN bool_or(wa.lifecycle_state = 'start_timeout') THEN 'timeout' + ELSE 'success' + END AS worst_status +FROM workspace_builds wb +JOIN workspaces w ON wb.workspace_id = w.id +JOIN templates t ON w.template_id = t.id +JOIN organizations o ON t.organization_id = o.id +JOIN workspace_resources wr ON wr.job_id = wb.job_id +JOIN workspace_agents wa ON wa.resource_id = wr.id +WHERE wb.job_id = (SELECT job_id FROM workspace_resources WHERE workspace_resources.id = $1) +GROUP BY wb.created_at, wb.transition, t.name, o.name, w.owner_id; diff --git a/coderd/workspaceagentsrpc.go b/coderd/workspaceagentsrpc.go index ae8682cc59..4e4cbce1ea 100644 --- a/coderd/workspaceagentsrpc.go +++ b/coderd/workspaceagentsrpc.go @@ -158,6 +158,7 @@ func (api *API) workspaceAgentRPC(rw http.ResponseWriter, r *http.Request) { DerpMapUpdateFrequency: api.Options.DERPMapUpdateFrequency, ExternalAuthConfigs: api.ExternalAuthConfigs, Experiments: api.Experiments, + LifecycleMetrics: api.lifecycleMetrics, // Optional: UpdateAgentMetricsFn: api.UpdateAgentMetrics, diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index aa2b9056fd..5c3822a349 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -162,6 +162,7 @@ deployment. They will always be available from the agent. | `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. 
| `provisioner` | | `coderd_provisionerd_num_daemons` | gauge | The number of provisioner daemons. | | | `coderd_provisionerd_workspace_build_timings_seconds` | histogram | The time taken for a workspace to build. | `status` `template_name` `template_version` `workspace_transition` | +| `coderd_template_workspace_build_duration_seconds` | histogram | Duration from workspace build creation to agent ready, by template. | `is_prebuild` `organization_name` `status` `template_name` `transition` | | `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | | `coderd_workspace_creation_duration_seconds` | histogram | Time to create a workspace by organization, template, preset, and type (regular or prebuild). | `organization_name` `preset_name` `template_name` `type` | | `coderd_workspace_creation_total` | counter | Total regular (non-prebuilt) workspace creations by organization, template, and preset. | `organization_name` `preset_name` `template_name` | @@ -211,6 +212,7 @@ The following metrics support native histograms: * `coderd_workspace_creation_duration_seconds` * `coderd_prebuilt_workspace_claim_duration_seconds` +* `coderd_template_workspace_build_duration_seconds` Native histograms are an **experimental** Prometheus feature that removes the need to predefine bucket boundaries and allows higher-resolution buckets that adapt to deployment characteristics. 
Whether a metric is exposed as classic or native depends entirely on the Prometheus server configuration (see [Prometheus docs](https://prometheus.io/docs/specs/native_histograms/) for details): diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index c57cd12cbb..2788e4800d 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -740,6 +740,19 @@ coderd_workspace_creation_duration_seconds_bucket{organization_name="{organizati coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="+Inf"} 1 coderd_workspace_creation_duration_seconds_sum{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild"} 4.406214 coderd_workspace_creation_duration_seconds_count{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild"} 1 +# HELP coderd_template_workspace_build_duration_seconds Duration from workspace build creation to agent ready, by template. 
+# TYPE coderd_template_workspace_build_duration_seconds histogram +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="1"} 0 +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="10"} 1 +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="30"} 1 +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="60"} 1 +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="300"} 1 +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="600"} 1 +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="1800"} 1 +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="3600"} 1 +coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="+Inf"} 1 +coderd_template_workspace_build_duration_seconds_sum{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start"} 7.241532 
+coderd_template_workspace_build_duration_seconds_count{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start"} 1 # HELP coderd_prebuilt_workspace_claim_duration_seconds Time to claim a prebuilt workspace by organization, template, and preset. # TYPE coderd_prebuilt_workspace_claim_duration_seconds histogram coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="1"} 0