feat: add e2e workspace build duration metric (#21739)

Adds coderd_template_workspace_build_duration_seconds histogram that
tracks the full duration from workspace build creation to agent ready.
This captures the complete user-perceived build time including
provisioning and agent startup.

The metric is emitted when the agent reports ready/error/timeout via the
lifecycle API, ensuring each build is counted exactly once per replica.
This commit is contained in:
Jon Ayers
2026-02-06 16:26:02 -06:00
committed by GitHub
parent a31e476623
commit 6035e45cb8
15 changed files with 525 additions and 3 deletions
+8
View File
@@ -3886,6 +3886,14 @@ func (q *querier) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Conte
return q.db.GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx, arg)
}
func (q *querier) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) {
// Verify access to the resource first.
if _, err := q.GetWorkspaceResourceByID(ctx, id); err != nil {
return database.GetWorkspaceBuildMetricsByResourceIDRow{}, err
}
return q.db.GetWorkspaceBuildMetricsByResourceID(ctx, id)
}
func (q *querier) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) {
// Authorized call to get the workspace build. If we can read the build,
// we can read the params.
+12
View File
@@ -2036,6 +2036,18 @@ func (s *MethodTestSuite) TestWorkspace() {
dbm.EXPECT().GetWorkspaceByID(gomock.Any(), build.WorkspaceID).Return(ws, nil).AnyTimes()
check.Args(res.ID).Asserts(ws, policy.ActionRead).Returns(res)
}))
s.Run("GetWorkspaceBuildMetricsByResourceID", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
ws := testutil.Fake(s.T(), faker, database.Workspace{})
build := testutil.Fake(s.T(), faker, database.WorkspaceBuild{WorkspaceID: ws.ID})
job := testutil.Fake(s.T(), faker, database.ProvisionerJob{ID: build.JobID, Type: database.ProvisionerJobTypeWorkspaceBuild})
res := testutil.Fake(s.T(), faker, database.WorkspaceResource{JobID: build.JobID})
dbm.EXPECT().GetWorkspaceResourceByID(gomock.Any(), res.ID).Return(res, nil).AnyTimes()
dbm.EXPECT().GetProvisionerJobByID(gomock.Any(), res.JobID).Return(job, nil).AnyTimes()
dbm.EXPECT().GetWorkspaceBuildByJobID(gomock.Any(), res.JobID).Return(build, nil).AnyTimes()
dbm.EXPECT().GetWorkspaceByID(gomock.Any(), build.WorkspaceID).Return(ws, nil).AnyTimes()
dbm.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), res.ID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{}, nil).AnyTimes()
check.Args(res.ID).Asserts(ws, policy.ActionRead).Returns(database.GetWorkspaceBuildMetricsByResourceIDRow{})
}))
s.Run("Build/GetWorkspaceResourcesByJobID", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
ws := testutil.Fake(s.T(), faker, database.Workspace{})
build := testutil.Fake(s.T(), faker, database.WorkspaceBuild{WorkspaceID: ws.ID})
@@ -2406,6 +2406,14 @@ func (m queryMetricsStore) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx cont
return r0, r1
}
func (m queryMetricsStore) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) {
start := time.Now()
r0, r1 := m.s.GetWorkspaceBuildMetricsByResourceID(ctx, id)
m.queryLatencies.WithLabelValues("GetWorkspaceBuildMetricsByResourceID").Observe(time.Since(start).Seconds())
m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "GetWorkspaceBuildMetricsByResourceID").Inc()
return r0, r1
}
func (m queryMetricsStore) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) {
start := time.Now()
r0, r1 := m.s.GetWorkspaceBuildParameters(ctx, workspaceBuildID)
+15
View File
@@ -4499,6 +4499,21 @@ func (mr *MockStoreMockRecorder) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ct
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspaceBuildByWorkspaceIDAndBuildNumber", reflect.TypeOf((*MockStore)(nil).GetWorkspaceBuildByWorkspaceIDAndBuildNumber), ctx, arg)
}
// GetWorkspaceBuildMetricsByResourceID mocks base method.
func (m *MockStore) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "GetWorkspaceBuildMetricsByResourceID", ctx, id)
ret0, _ := ret[0].(database.GetWorkspaceBuildMetricsByResourceIDRow)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// GetWorkspaceBuildMetricsByResourceID indicates an expected call of GetWorkspaceBuildMetricsByResourceID.
func (mr *MockStoreMockRecorder) GetWorkspaceBuildMetricsByResourceID(ctx, id any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspaceBuildMetricsByResourceID", reflect.TypeOf((*MockStore)(nil).GetWorkspaceBuildMetricsByResourceID), ctx, id)
}
// GetWorkspaceBuildParameters mocks base method.
func (m *MockStore) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) {
m.ctrl.T.Helper()
+3
View File
@@ -501,6 +501,9 @@ type sqlcQuerier interface {
GetWorkspaceBuildByID(ctx context.Context, id uuid.UUID) (WorkspaceBuild, error)
GetWorkspaceBuildByJobID(ctx context.Context, jobID uuid.UUID) (WorkspaceBuild, error)
GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Context, arg GetWorkspaceBuildByWorkspaceIDAndBuildNumberParams) (WorkspaceBuild, error)
// Returns build metadata for e2e workspace build duration metrics.
// Also checks if all agents are ready and returns the worst status.
GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (GetWorkspaceBuildMetricsByResourceIDRow, error)
GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]WorkspaceBuildParameter, error)
GetWorkspaceBuildParametersByBuildIDs(ctx context.Context, workspaceBuildIds []uuid.UUID) ([]WorkspaceBuildParameter, error)
GetWorkspaceBuildStatsByTemplates(ctx context.Context, since time.Time) ([]GetWorkspaceBuildStatsByTemplatesRow, error)
+56
View File
@@ -21344,6 +21344,62 @@ func (q *sqlQuerier) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Co
return i, err
}
const getWorkspaceBuildMetricsByResourceID = `-- name: GetWorkspaceBuildMetricsByResourceID :one
SELECT
wb.created_at,
wb.transition,
t.name AS template_name,
o.name AS organization_name,
(w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0') AS is_prebuild,
-- All agents must have ready_at set (terminal startup state)
COUNT(*) FILTER (WHERE wa.ready_at IS NULL) = 0 AS all_agents_ready,
-- Latest ready_at across all agents (for duration calculation)
MAX(wa.ready_at)::timestamptz AS last_agent_ready_at,
-- Worst status: error > timeout > ready
CASE
WHEN bool_or(wa.lifecycle_state = 'start_error') THEN 'error'
WHEN bool_or(wa.lifecycle_state = 'start_timeout') THEN 'timeout'
ELSE 'success'
END AS worst_status
FROM workspace_builds wb
JOIN workspaces w ON wb.workspace_id = w.id
JOIN templates t ON w.template_id = t.id
JOIN organizations o ON t.organization_id = o.id
JOIN workspace_resources wr ON wr.job_id = wb.job_id
JOIN workspace_agents wa ON wa.resource_id = wr.id
WHERE wb.job_id = (SELECT job_id FROM workspace_resources WHERE workspace_resources.id = $1)
GROUP BY wb.created_at, wb.transition, t.name, o.name, w.owner_id
`
type GetWorkspaceBuildMetricsByResourceIDRow struct {
CreatedAt time.Time `db:"created_at" json:"created_at"`
Transition WorkspaceTransition `db:"transition" json:"transition"`
TemplateName string `db:"template_name" json:"template_name"`
OrganizationName string `db:"organization_name" json:"organization_name"`
IsPrebuild bool `db:"is_prebuild" json:"is_prebuild"`
AllAgentsReady bool `db:"all_agents_ready" json:"all_agents_ready"`
LastAgentReadyAt time.Time `db:"last_agent_ready_at" json:"last_agent_ready_at"`
WorstStatus string `db:"worst_status" json:"worst_status"`
}
// Returns build metadata for e2e workspace build duration metrics.
// Also checks if all agents are ready and returns the worst status.
func (q *sqlQuerier) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (GetWorkspaceBuildMetricsByResourceIDRow, error) {
row := q.db.QueryRowContext(ctx, getWorkspaceBuildMetricsByResourceID, id)
var i GetWorkspaceBuildMetricsByResourceIDRow
err := row.Scan(
&i.CreatedAt,
&i.Transition,
&i.TemplateName,
&i.OrganizationName,
&i.IsPrebuild,
&i.AllAgentsReady,
&i.LastAgentReadyAt,
&i.WorstStatus,
)
return i, err
}
const getWorkspaceBuildStatsByTemplates = `-- name: GetWorkspaceBuildStatsByTemplates :many
SELECT
w.template_id,
@@ -243,3 +243,31 @@ SET
has_external_agent = @has_external_agent,
updated_at = @updated_at::timestamptz
WHERE id = @id::uuid;
-- name: GetWorkspaceBuildMetricsByResourceID :one
-- Returns build metadata for e2e workspace build duration metrics.
-- Also checks if all agents are ready and returns the worst status.
SELECT
wb.created_at,
wb.transition,
t.name AS template_name,
o.name AS organization_name,
(w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0') AS is_prebuild,
-- All agents must have ready_at set (terminal startup state)
COUNT(*) FILTER (WHERE wa.ready_at IS NULL) = 0 AS all_agents_ready,
-- Latest ready_at across all agents (for duration calculation)
MAX(wa.ready_at)::timestamptz AS last_agent_ready_at,
-- Worst status: error > timeout > ready
CASE
WHEN bool_or(wa.lifecycle_state = 'start_error') THEN 'error'
WHEN bool_or(wa.lifecycle_state = 'start_timeout') THEN 'timeout'
ELSE 'success'
END AS worst_status
FROM workspace_builds wb
JOIN workspaces w ON wb.workspace_id = w.id
JOIN templates t ON w.template_id = t.id
JOIN organizations o ON t.organization_id = o.id
JOIN workspace_resources wr ON wr.job_id = wb.job_id
JOIN workspace_agents wa ON wa.resource_id = wr.id
WHERE wb.job_id = (SELECT job_id FROM workspace_resources WHERE workspace_resources.id = $1)
GROUP BY wb.created_at, wb.transition, t.name, o.name, w.owner_id;