feat: add e2e workspace build duration metric (#21739)

Adds coderd_template_workspace_build_duration_seconds histogram that
tracks the full duration from workspace build creation to agent ready.
This captures the complete user-perceived build time including
provisioning and agent startup.

The metric is emitted when the agent reports ready/error/timeout via the
lifecycle API, ensuring each build is counted exactly once per replica.
This commit is contained in:
Jon Ayers
2026-02-06 16:26:02 -06:00
committed by GitHub
parent a31e476623
commit 6035e45cb8
15 changed files with 525 additions and 3 deletions
+2
View File
@@ -89,6 +89,7 @@ type Options struct {
PublishWorkspaceAgentLogsUpdateFn func(ctx context.Context, workspaceAgentID uuid.UUID, msg agentsdk.LogsNotifyMessage)
NetworkTelemetryHandler func(batch []*tailnetproto.TelemetryEvent)
BoundaryUsageTracker *boundaryusage.Tracker
LifecycleMetrics *LifecycleMetrics
AccessURL *url.URL
AppHostname string
@@ -170,6 +171,7 @@ func New(opts Options, workspace database.Workspace) *API {
Database: opts.Database,
Log: opts.Log,
PublishWorkspaceUpdateFn: api.publishWorkspaceUpdate,
Metrics: opts.LifecycleMetrics,
}
api.AppsAPI = &AppsAPI{
+15 -1
View File
@@ -4,6 +4,7 @@ import (
"context"
"database/sql"
"slices"
"sync"
"time"
"github.com/google/uuid"
@@ -31,7 +32,9 @@ type LifecycleAPI struct {
Log slog.Logger
PublishWorkspaceUpdateFn func(context.Context, *database.WorkspaceAgent, wspubsub.WorkspaceEventKind) error
TimeNowFn func() time.Time // defaults to dbtime.Now()
TimeNowFn func() time.Time // defaults to dbtime.Now()
Metrics *LifecycleMetrics
emitMetricsOnce sync.Once
}
func (a *LifecycleAPI) now() time.Time {
@@ -125,6 +128,17 @@ func (a *LifecycleAPI) UpdateLifecycle(ctx context.Context, req *agentproto.Upda
}
}
// Emit build duration metric when agent transitions to a terminal startup state.
// We only emit once per agent connection to avoid duplicate metrics.
switch lifecycleState {
case database.WorkspaceAgentLifecycleStateReady,
database.WorkspaceAgentLifecycleStateStartTimeout,
database.WorkspaceAgentLifecycleStateStartError:
a.emitMetricsOnce.Do(func() {
a.emitBuildDurationMetric(ctx, workspaceAgent.ResourceID)
})
}
return req.Lifecycle, nil
}
+260
View File
@@ -9,12 +9,14 @@ import (
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
"google.golang.org/protobuf/types/known/timestamppb"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi"
"github.com/coder/coder/v2/coderd/coderdtest/promhelp"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbmock"
"github.com/coder/coder/v2/coderd/database/dbtime"
@@ -22,6 +24,10 @@ import (
"github.com/coder/coder/v2/testutil"
)
// fullMetricName is the fully-qualified Prometheus metric name
// (namespace + name) used for gathering in tests.
const fullMetricName = "coderd_" + agentapi.BuildDurationMetricName
func TestUpdateLifecycle(t *testing.T) {
t.Parallel()
@@ -30,6 +36,12 @@ func TestUpdateLifecycle(t *testing.T) {
someTime = dbtime.Time(someTime)
now := dbtime.Now()
// Fixed times for build duration metric assertions.
// The expected duration is exactly 90 seconds.
buildCreatedAt := dbtime.Time(time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC))
agentReadyAt := dbtime.Time(time.Date(2025, 1, 1, 0, 1, 30, 0, time.UTC))
expectedDuration := agentReadyAt.Sub(buildCreatedAt).Seconds() // 90.0
var (
workspaceID = uuid.New()
agentCreated = database.WorkspaceAgent{
@@ -105,6 +117,19 @@ func TestUpdateLifecycle(t *testing.T) {
Valid: true,
},
}).Return(nil)
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
CreatedAt: buildCreatedAt,
Transition: database.WorkspaceTransitionStart,
TemplateName: "test-template",
OrganizationName: "test-org",
IsPrebuild: false,
AllAgentsReady: true,
LastAgentReadyAt: agentReadyAt,
WorstStatus: "success",
}, nil)
reg := prometheus.NewRegistry()
metrics := agentapi.NewLifecycleMetrics(reg)
api := &agentapi.LifecycleAPI{
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
@@ -113,6 +138,7 @@ func TestUpdateLifecycle(t *testing.T) {
WorkspaceID: workspaceID,
Database: dbM,
Log: testutil.Logger(t),
Metrics: metrics,
// Test that nil publish fn works.
PublishWorkspaceUpdateFn: nil,
}
@@ -122,6 +148,16 @@ func TestUpdateLifecycle(t *testing.T) {
})
require.NoError(t, err)
require.Equal(t, lifecycle, resp)
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
"template_name": "test-template",
"organization_name": "test-org",
"transition": "start",
"status": "success",
"is_prebuild": "false",
})
require.Equal(t, uint64(1), got.GetSampleCount())
require.Equal(t, expectedDuration, got.GetSampleSum())
})
// This test jumps from CREATING to READY, skipping STARTED. Both the
@@ -147,8 +183,21 @@ func TestUpdateLifecycle(t *testing.T) {
Valid: true,
},
}).Return(nil)
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentCreated.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
CreatedAt: buildCreatedAt,
Transition: database.WorkspaceTransitionStart,
TemplateName: "test-template",
OrganizationName: "test-org",
IsPrebuild: false,
AllAgentsReady: true,
LastAgentReadyAt: agentReadyAt,
WorstStatus: "success",
}, nil)
publishCalled := false
reg := prometheus.NewRegistry()
metrics := agentapi.NewLifecycleMetrics(reg)
api := &agentapi.LifecycleAPI{
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
return agentCreated, nil
@@ -156,6 +205,7 @@ func TestUpdateLifecycle(t *testing.T) {
WorkspaceID: workspaceID,
Database: dbM,
Log: testutil.Logger(t),
Metrics: metrics,
PublishWorkspaceUpdateFn: func(ctx context.Context, agent *database.WorkspaceAgent, kind wspubsub.WorkspaceEventKind) error {
publishCalled = true
return nil
@@ -168,6 +218,16 @@ func TestUpdateLifecycle(t *testing.T) {
require.NoError(t, err)
require.Equal(t, lifecycle, resp)
require.True(t, publishCalled)
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
"template_name": "test-template",
"organization_name": "test-org",
"transition": "start",
"status": "success",
"is_prebuild": "false",
})
require.Equal(t, uint64(1), got.GetSampleCount())
require.Equal(t, expectedDuration, got.GetSampleSum())
})
t.Run("NoTimeSpecified", func(t *testing.T) {
@@ -194,6 +254,19 @@ func TestUpdateLifecycle(t *testing.T) {
Valid: true,
},
})
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentCreated.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
CreatedAt: buildCreatedAt,
Transition: database.WorkspaceTransitionStart,
TemplateName: "test-template",
OrganizationName: "test-org",
IsPrebuild: false,
AllAgentsReady: true,
LastAgentReadyAt: agentReadyAt,
WorstStatus: "success",
}, nil)
reg := prometheus.NewRegistry()
metrics := agentapi.NewLifecycleMetrics(reg)
api := &agentapi.LifecycleAPI{
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
@@ -202,6 +275,7 @@ func TestUpdateLifecycle(t *testing.T) {
WorkspaceID: workspaceID,
Database: dbM,
Log: testutil.Logger(t),
Metrics: metrics,
PublishWorkspaceUpdateFn: nil,
TimeNowFn: func() time.Time {
return now
@@ -213,6 +287,16 @@ func TestUpdateLifecycle(t *testing.T) {
})
require.NoError(t, err)
require.Equal(t, lifecycle, resp)
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
"template_name": "test-template",
"organization_name": "test-org",
"transition": "start",
"status": "success",
"is_prebuild": "false",
})
require.Equal(t, uint64(1), got.GetSampleCount())
require.Equal(t, expectedDuration, got.GetSampleSum())
})
t.Run("AllStates", func(t *testing.T) {
@@ -228,6 +312,9 @@ func TestUpdateLifecycle(t *testing.T) {
dbM := dbmock.NewMockStore(gomock.NewController(t))
var publishCalled int64
reg := prometheus.NewRegistry()
metrics := agentapi.NewLifecycleMetrics(reg)
api := &agentapi.LifecycleAPI{
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
return agent, nil
@@ -235,6 +322,7 @@ func TestUpdateLifecycle(t *testing.T) {
WorkspaceID: workspaceID,
Database: dbM,
Log: testutil.Logger(t),
Metrics: metrics,
PublishWorkspaceUpdateFn: func(ctx context.Context, agent *database.WorkspaceAgent, kind wspubsub.WorkspaceEventKind) error {
atomic.AddInt64(&publishCalled, 1)
return nil
@@ -277,6 +365,20 @@ func TestUpdateLifecycle(t *testing.T) {
ReadyAt: expectedReadyAt,
}).Times(1).Return(nil)
// The first ready state triggers the build duration metric query.
if state == agentproto.Lifecycle_READY || state == agentproto.Lifecycle_START_TIMEOUT || state == agentproto.Lifecycle_START_ERROR {
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agent.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
CreatedAt: someTime,
Transition: database.WorkspaceTransitionStart,
TemplateName: "test-template",
OrganizationName: "test-org",
IsPrebuild: false,
AllAgentsReady: true,
LastAgentReadyAt: stateNow,
WorstStatus: "success",
}, nil).MaxTimes(1)
}
resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{
Lifecycle: lifecycle,
})
@@ -322,6 +424,164 @@ func TestUpdateLifecycle(t *testing.T) {
require.Nil(t, resp)
require.False(t, publishCalled)
})
// Test that metric is NOT emitted when not all agents are ready (multi-agent case).
t.Run("MetricNotEmittedWhenNotAllAgentsReady", func(t *testing.T) {
t.Parallel()
lifecycle := &agentproto.Lifecycle{
State: agentproto.Lifecycle_READY,
ChangedAt: timestamppb.New(now),
}
dbM := dbmock.NewMockStore(gomock.NewController(t))
dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil)
// Return AllAgentsReady = false to simulate multi-agent case where not all are ready.
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
CreatedAt: someTime,
Transition: database.WorkspaceTransitionStart,
TemplateName: "test-template",
OrganizationName: "test-org",
IsPrebuild: false,
AllAgentsReady: false, // Not all agents ready yet
LastAgentReadyAt: time.Time{}, // No ready time yet
WorstStatus: "success",
}, nil)
reg := prometheus.NewRegistry()
metrics := agentapi.NewLifecycleMetrics(reg)
api := &agentapi.LifecycleAPI{
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
return agentStarting, nil
},
WorkspaceID: workspaceID,
Database: dbM,
Log: testutil.Logger(t),
Metrics: metrics,
PublishWorkspaceUpdateFn: nil,
}
resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{
Lifecycle: lifecycle,
})
require.NoError(t, err)
require.Equal(t, lifecycle, resp)
require.Nil(t, promhelp.MetricValue(t, reg, fullMetricName, prometheus.Labels{
"template_name": "test-template",
"organization_name": "test-org",
"transition": "start",
"status": "success",
"is_prebuild": "false",
}), "metric should not be emitted when not all agents are ready")
})
// Test that prebuild label is "true" when owner is prebuild system user.
t.Run("PrebuildLabelTrue", func(t *testing.T) {
t.Parallel()
lifecycle := &agentproto.Lifecycle{
State: agentproto.Lifecycle_READY,
ChangedAt: timestamppb.New(now),
}
dbM := dbmock.NewMockStore(gomock.NewController(t))
dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil)
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
CreatedAt: buildCreatedAt,
Transition: database.WorkspaceTransitionStart,
TemplateName: "test-template",
OrganizationName: "test-org",
IsPrebuild: true, // Prebuild workspace
AllAgentsReady: true,
LastAgentReadyAt: agentReadyAt,
WorstStatus: "success",
}, nil)
reg := prometheus.NewRegistry()
metrics := agentapi.NewLifecycleMetrics(reg)
api := &agentapi.LifecycleAPI{
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
return agentStarting, nil
},
WorkspaceID: workspaceID,
Database: dbM,
Log: testutil.Logger(t),
Metrics: metrics,
PublishWorkspaceUpdateFn: nil,
}
resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{
Lifecycle: lifecycle,
})
require.NoError(t, err)
require.Equal(t, lifecycle, resp)
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
"template_name": "test-template",
"organization_name": "test-org",
"transition": "start",
"status": "success",
"is_prebuild": "true",
})
require.Equal(t, uint64(1), got.GetSampleCount())
require.Equal(t, expectedDuration, got.GetSampleSum())
})
// Test worst status is used when one agent has an error.
t.Run("WorstStatusError", func(t *testing.T) {
t.Parallel()
lifecycle := &agentproto.Lifecycle{
State: agentproto.Lifecycle_READY,
ChangedAt: timestamppb.New(now),
}
dbM := dbmock.NewMockStore(gomock.NewController(t))
dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil)
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
CreatedAt: buildCreatedAt,
Transition: database.WorkspaceTransitionStart,
TemplateName: "test-template",
OrganizationName: "test-org",
IsPrebuild: false,
AllAgentsReady: true,
LastAgentReadyAt: agentReadyAt,
WorstStatus: "error", // One agent had an error
}, nil)
reg := prometheus.NewRegistry()
metrics := agentapi.NewLifecycleMetrics(reg)
api := &agentapi.LifecycleAPI{
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
return agentStarting, nil
},
WorkspaceID: workspaceID,
Database: dbM,
Log: testutil.Logger(t),
Metrics: metrics,
PublishWorkspaceUpdateFn: nil,
}
resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{
Lifecycle: lifecycle,
})
require.NoError(t, err)
require.Equal(t, lifecycle, resp)
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
"template_name": "test-template",
"organization_name": "test-org",
"transition": "start",
"status": "error",
"is_prebuild": "false",
})
require.Equal(t, uint64(1), got.GetSampleCount())
require.Equal(t, expectedDuration, got.GetSampleSum())
})
}
func TestUpdateStartup(t *testing.T) {
+97
View File
@@ -0,0 +1,97 @@
package agentapi
import (
"context"
"strconv"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"cdr.dev/slog/v3"
)
// BuildDurationMetricName is the short name for the end-to-end
// workspace build duration histogram. The full metric name is
// prefixed with the namespace "coderd_".
const BuildDurationMetricName = "template_workspace_build_duration_seconds"
// LifecycleMetrics contains Prometheus metrics for the lifecycle API.
type LifecycleMetrics struct {
BuildDuration *prometheus.HistogramVec
}
// NewLifecycleMetrics creates and registers all lifecycle-related
// Prometheus metrics.
//
// The build duration histogram tracks the end-to-end duration from
// workspace build creation to agent ready, by template. It is
// recorded by the coderd replica handling the agent's connection
// when the last agent reports ready. In multi-replica deployments,
// each replica only has observations for agents it handles.
//
// The "is_prebuild" label distinguishes prebuild creation (background,
// no user waiting) from user-initiated builds (regular workspace
// creation or prebuild claims).
func NewLifecycleMetrics(reg prometheus.Registerer) *LifecycleMetrics {
m := &LifecycleMetrics{
BuildDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "coderd",
Name: BuildDurationMetricName,
Help: "Duration from workspace build creation to agent ready, by template.",
Buckets: []float64{
1, // 1s
10,
30,
60, // 1min
60 * 5,
60 * 10,
60 * 30, // 30min
60 * 60, // 1hr
},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: time.Hour,
}, []string{"template_name", "organization_name", "transition", "status", "is_prebuild"}),
}
reg.MustRegister(m.BuildDuration)
return m
}
// emitBuildDurationMetric records the end-to-end workspace build
// duration from build creation to when all agents are ready.
func (a *LifecycleAPI) emitBuildDurationMetric(ctx context.Context, resourceID uuid.UUID) {
if a.Metrics == nil {
return
}
buildInfo, err := a.Database.GetWorkspaceBuildMetricsByResourceID(ctx, resourceID)
if err != nil {
a.Log.Warn(ctx, "failed to get build info for metrics", slog.Error(err))
return
}
// Wait until all agents have reached a terminal startup state.
if !buildInfo.AllAgentsReady {
return
}
// LastAgentReadyAt is the MAX(ready_at) across all agents. Since
// we only get here when AllAgentsReady is true, this should always
// be valid.
if buildInfo.LastAgentReadyAt.IsZero() {
a.Log.Warn(ctx, "last_agent_ready_at is unexpectedly zero",
slog.F("last_agent_ready_at", buildInfo.LastAgentReadyAt))
return
}
duration := buildInfo.LastAgentReadyAt.Sub(buildInfo.CreatedAt).Seconds()
a.Metrics.BuildDuration.WithLabelValues(
buildInfo.TemplateName,
buildInfo.OrganizationName,
string(buildInfo.Transition),
buildInfo.WorstStatus,
strconv.FormatBool(buildInfo.IsPrebuild),
).Observe(duration)
}