mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: add e2e workspace build duration metric (#21739)
Adds coderd_template_workspace_build_duration_seconds histogram that tracks the full duration from workspace build creation to agent ready. This captures the complete user-perceived build time including provisioning and agent startup. The metric is emitted when the agent reports ready/error/timeout via the lifecycle API, ensuring each build is counted exactly once per replica.
This commit is contained in:
@@ -89,6 +89,7 @@ type Options struct {
|
||||
PublishWorkspaceAgentLogsUpdateFn func(ctx context.Context, workspaceAgentID uuid.UUID, msg agentsdk.LogsNotifyMessage)
|
||||
NetworkTelemetryHandler func(batch []*tailnetproto.TelemetryEvent)
|
||||
BoundaryUsageTracker *boundaryusage.Tracker
|
||||
LifecycleMetrics *LifecycleMetrics
|
||||
|
||||
AccessURL *url.URL
|
||||
AppHostname string
|
||||
@@ -170,6 +171,7 @@ func New(opts Options, workspace database.Workspace) *API {
|
||||
Database: opts.Database,
|
||||
Log: opts.Log,
|
||||
PublishWorkspaceUpdateFn: api.publishWorkspaceUpdate,
|
||||
Metrics: opts.LifecycleMetrics,
|
||||
}
|
||||
|
||||
api.AppsAPI = &AppsAPI{
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"slices"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@@ -31,7 +32,9 @@ type LifecycleAPI struct {
|
||||
Log slog.Logger
|
||||
PublishWorkspaceUpdateFn func(context.Context, *database.WorkspaceAgent, wspubsub.WorkspaceEventKind) error
|
||||
|
||||
TimeNowFn func() time.Time // defaults to dbtime.Now()
|
||||
TimeNowFn func() time.Time // defaults to dbtime.Now()
|
||||
Metrics *LifecycleMetrics
|
||||
emitMetricsOnce sync.Once
|
||||
}
|
||||
|
||||
func (a *LifecycleAPI) now() time.Time {
|
||||
@@ -125,6 +128,17 @@ func (a *LifecycleAPI) UpdateLifecycle(ctx context.Context, req *agentproto.Upda
|
||||
}
|
||||
}
|
||||
|
||||
// Emit build duration metric when agent transitions to a terminal startup state.
|
||||
// We only emit once per agent connection to avoid duplicate metrics.
|
||||
switch lifecycleState {
|
||||
case database.WorkspaceAgentLifecycleStateReady,
|
||||
database.WorkspaceAgentLifecycleStateStartTimeout,
|
||||
database.WorkspaceAgentLifecycleStateStartError:
|
||||
a.emitMetricsOnce.Do(func() {
|
||||
a.emitBuildDurationMetric(ctx, workspaceAgent.ResourceID)
|
||||
})
|
||||
}
|
||||
|
||||
return req.Lifecycle, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -9,12 +9,14 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/mock/gomock"
|
||||
"google.golang.org/protobuf/types/known/timestamppb"
|
||||
|
||||
agentproto "github.com/coder/coder/v2/agent/proto"
|
||||
"github.com/coder/coder/v2/coderd/agentapi"
|
||||
"github.com/coder/coder/v2/coderd/coderdtest/promhelp"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbmock"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||
@@ -22,6 +24,10 @@ import (
|
||||
"github.com/coder/coder/v2/testutil"
|
||||
)
|
||||
|
||||
// fullMetricName is the fully-qualified Prometheus metric name
|
||||
// (namespace + name) used for gathering in tests.
|
||||
const fullMetricName = "coderd_" + agentapi.BuildDurationMetricName
|
||||
|
||||
func TestUpdateLifecycle(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -30,6 +36,12 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
someTime = dbtime.Time(someTime)
|
||||
now := dbtime.Now()
|
||||
|
||||
// Fixed times for build duration metric assertions.
|
||||
// The expected duration is exactly 90 seconds.
|
||||
buildCreatedAt := dbtime.Time(time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC))
|
||||
agentReadyAt := dbtime.Time(time.Date(2025, 1, 1, 0, 1, 30, 0, time.UTC))
|
||||
expectedDuration := agentReadyAt.Sub(buildCreatedAt).Seconds() // 90.0
|
||||
|
||||
var (
|
||||
workspaceID = uuid.New()
|
||||
agentCreated = database.WorkspaceAgent{
|
||||
@@ -105,6 +117,19 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
Valid: true,
|
||||
},
|
||||
}).Return(nil)
|
||||
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
|
||||
CreatedAt: buildCreatedAt,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
TemplateName: "test-template",
|
||||
OrganizationName: "test-org",
|
||||
IsPrebuild: false,
|
||||
AllAgentsReady: true,
|
||||
LastAgentReadyAt: agentReadyAt,
|
||||
WorstStatus: "success",
|
||||
}, nil)
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := agentapi.NewLifecycleMetrics(reg)
|
||||
|
||||
api := &agentapi.LifecycleAPI{
|
||||
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
|
||||
@@ -113,6 +138,7 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
WorkspaceID: workspaceID,
|
||||
Database: dbM,
|
||||
Log: testutil.Logger(t),
|
||||
Metrics: metrics,
|
||||
// Test that nil publish fn works.
|
||||
PublishWorkspaceUpdateFn: nil,
|
||||
}
|
||||
@@ -122,6 +148,16 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, lifecycle, resp)
|
||||
|
||||
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
|
||||
"template_name": "test-template",
|
||||
"organization_name": "test-org",
|
||||
"transition": "start",
|
||||
"status": "success",
|
||||
"is_prebuild": "false",
|
||||
})
|
||||
require.Equal(t, uint64(1), got.GetSampleCount())
|
||||
require.Equal(t, expectedDuration, got.GetSampleSum())
|
||||
})
|
||||
|
||||
// This test jumps from CREATING to READY, skipping STARTED. Both the
|
||||
@@ -147,8 +183,21 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
Valid: true,
|
||||
},
|
||||
}).Return(nil)
|
||||
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentCreated.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
|
||||
CreatedAt: buildCreatedAt,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
TemplateName: "test-template",
|
||||
OrganizationName: "test-org",
|
||||
IsPrebuild: false,
|
||||
AllAgentsReady: true,
|
||||
LastAgentReadyAt: agentReadyAt,
|
||||
WorstStatus: "success",
|
||||
}, nil)
|
||||
|
||||
publishCalled := false
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := agentapi.NewLifecycleMetrics(reg)
|
||||
|
||||
api := &agentapi.LifecycleAPI{
|
||||
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
|
||||
return agentCreated, nil
|
||||
@@ -156,6 +205,7 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
WorkspaceID: workspaceID,
|
||||
Database: dbM,
|
||||
Log: testutil.Logger(t),
|
||||
Metrics: metrics,
|
||||
PublishWorkspaceUpdateFn: func(ctx context.Context, agent *database.WorkspaceAgent, kind wspubsub.WorkspaceEventKind) error {
|
||||
publishCalled = true
|
||||
return nil
|
||||
@@ -168,6 +218,16 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, lifecycle, resp)
|
||||
require.True(t, publishCalled)
|
||||
|
||||
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
|
||||
"template_name": "test-template",
|
||||
"organization_name": "test-org",
|
||||
"transition": "start",
|
||||
"status": "success",
|
||||
"is_prebuild": "false",
|
||||
})
|
||||
require.Equal(t, uint64(1), got.GetSampleCount())
|
||||
require.Equal(t, expectedDuration, got.GetSampleSum())
|
||||
})
|
||||
|
||||
t.Run("NoTimeSpecified", func(t *testing.T) {
|
||||
@@ -194,6 +254,19 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
Valid: true,
|
||||
},
|
||||
})
|
||||
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentCreated.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
|
||||
CreatedAt: buildCreatedAt,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
TemplateName: "test-template",
|
||||
OrganizationName: "test-org",
|
||||
IsPrebuild: false,
|
||||
AllAgentsReady: true,
|
||||
LastAgentReadyAt: agentReadyAt,
|
||||
WorstStatus: "success",
|
||||
}, nil)
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := agentapi.NewLifecycleMetrics(reg)
|
||||
|
||||
api := &agentapi.LifecycleAPI{
|
||||
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
|
||||
@@ -202,6 +275,7 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
WorkspaceID: workspaceID,
|
||||
Database: dbM,
|
||||
Log: testutil.Logger(t),
|
||||
Metrics: metrics,
|
||||
PublishWorkspaceUpdateFn: nil,
|
||||
TimeNowFn: func() time.Time {
|
||||
return now
|
||||
@@ -213,6 +287,16 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, lifecycle, resp)
|
||||
|
||||
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
|
||||
"template_name": "test-template",
|
||||
"organization_name": "test-org",
|
||||
"transition": "start",
|
||||
"status": "success",
|
||||
"is_prebuild": "false",
|
||||
})
|
||||
require.Equal(t, uint64(1), got.GetSampleCount())
|
||||
require.Equal(t, expectedDuration, got.GetSampleSum())
|
||||
})
|
||||
|
||||
t.Run("AllStates", func(t *testing.T) {
|
||||
@@ -228,6 +312,9 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
dbM := dbmock.NewMockStore(gomock.NewController(t))
|
||||
|
||||
var publishCalled int64
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := agentapi.NewLifecycleMetrics(reg)
|
||||
|
||||
api := &agentapi.LifecycleAPI{
|
||||
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
|
||||
return agent, nil
|
||||
@@ -235,6 +322,7 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
WorkspaceID: workspaceID,
|
||||
Database: dbM,
|
||||
Log: testutil.Logger(t),
|
||||
Metrics: metrics,
|
||||
PublishWorkspaceUpdateFn: func(ctx context.Context, agent *database.WorkspaceAgent, kind wspubsub.WorkspaceEventKind) error {
|
||||
atomic.AddInt64(&publishCalled, 1)
|
||||
return nil
|
||||
@@ -277,6 +365,20 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
ReadyAt: expectedReadyAt,
|
||||
}).Times(1).Return(nil)
|
||||
|
||||
// The first ready state triggers the build duration metric query.
|
||||
if state == agentproto.Lifecycle_READY || state == agentproto.Lifecycle_START_TIMEOUT || state == agentproto.Lifecycle_START_ERROR {
|
||||
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agent.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
|
||||
CreatedAt: someTime,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
TemplateName: "test-template",
|
||||
OrganizationName: "test-org",
|
||||
IsPrebuild: false,
|
||||
AllAgentsReady: true,
|
||||
LastAgentReadyAt: stateNow,
|
||||
WorstStatus: "success",
|
||||
}, nil).MaxTimes(1)
|
||||
}
|
||||
|
||||
resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{
|
||||
Lifecycle: lifecycle,
|
||||
})
|
||||
@@ -322,6 +424,164 @@ func TestUpdateLifecycle(t *testing.T) {
|
||||
require.Nil(t, resp)
|
||||
require.False(t, publishCalled)
|
||||
})
|
||||
|
||||
// Test that metric is NOT emitted when not all agents are ready (multi-agent case).
|
||||
t.Run("MetricNotEmittedWhenNotAllAgentsReady", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
lifecycle := &agentproto.Lifecycle{
|
||||
State: agentproto.Lifecycle_READY,
|
||||
ChangedAt: timestamppb.New(now),
|
||||
}
|
||||
|
||||
dbM := dbmock.NewMockStore(gomock.NewController(t))
|
||||
dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil)
|
||||
// Return AllAgentsReady = false to simulate multi-agent case where not all are ready.
|
||||
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
|
||||
CreatedAt: someTime,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
TemplateName: "test-template",
|
||||
OrganizationName: "test-org",
|
||||
IsPrebuild: false,
|
||||
AllAgentsReady: false, // Not all agents ready yet
|
||||
LastAgentReadyAt: time.Time{}, // No ready time yet
|
||||
WorstStatus: "success",
|
||||
}, nil)
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := agentapi.NewLifecycleMetrics(reg)
|
||||
|
||||
api := &agentapi.LifecycleAPI{
|
||||
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
|
||||
return agentStarting, nil
|
||||
},
|
||||
WorkspaceID: workspaceID,
|
||||
Database: dbM,
|
||||
Log: testutil.Logger(t),
|
||||
Metrics: metrics,
|
||||
PublishWorkspaceUpdateFn: nil,
|
||||
}
|
||||
|
||||
resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{
|
||||
Lifecycle: lifecycle,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, lifecycle, resp)
|
||||
|
||||
require.Nil(t, promhelp.MetricValue(t, reg, fullMetricName, prometheus.Labels{
|
||||
"template_name": "test-template",
|
||||
"organization_name": "test-org",
|
||||
"transition": "start",
|
||||
"status": "success",
|
||||
"is_prebuild": "false",
|
||||
}), "metric should not be emitted when not all agents are ready")
|
||||
})
|
||||
|
||||
// Test that prebuild label is "true" when owner is prebuild system user.
|
||||
t.Run("PrebuildLabelTrue", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
lifecycle := &agentproto.Lifecycle{
|
||||
State: agentproto.Lifecycle_READY,
|
||||
ChangedAt: timestamppb.New(now),
|
||||
}
|
||||
|
||||
dbM := dbmock.NewMockStore(gomock.NewController(t))
|
||||
dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil)
|
||||
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
|
||||
CreatedAt: buildCreatedAt,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
TemplateName: "test-template",
|
||||
OrganizationName: "test-org",
|
||||
IsPrebuild: true, // Prebuild workspace
|
||||
AllAgentsReady: true,
|
||||
LastAgentReadyAt: agentReadyAt,
|
||||
WorstStatus: "success",
|
||||
}, nil)
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := agentapi.NewLifecycleMetrics(reg)
|
||||
|
||||
api := &agentapi.LifecycleAPI{
|
||||
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
|
||||
return agentStarting, nil
|
||||
},
|
||||
WorkspaceID: workspaceID,
|
||||
Database: dbM,
|
||||
Log: testutil.Logger(t),
|
||||
Metrics: metrics,
|
||||
PublishWorkspaceUpdateFn: nil,
|
||||
}
|
||||
|
||||
resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{
|
||||
Lifecycle: lifecycle,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, lifecycle, resp)
|
||||
|
||||
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
|
||||
"template_name": "test-template",
|
||||
"organization_name": "test-org",
|
||||
"transition": "start",
|
||||
"status": "success",
|
||||
"is_prebuild": "true",
|
||||
})
|
||||
require.Equal(t, uint64(1), got.GetSampleCount())
|
||||
require.Equal(t, expectedDuration, got.GetSampleSum())
|
||||
})
|
||||
|
||||
// Test worst status is used when one agent has an error.
|
||||
t.Run("WorstStatusError", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
lifecycle := &agentproto.Lifecycle{
|
||||
State: agentproto.Lifecycle_READY,
|
||||
ChangedAt: timestamppb.New(now),
|
||||
}
|
||||
|
||||
dbM := dbmock.NewMockStore(gomock.NewController(t))
|
||||
dbM.EXPECT().UpdateWorkspaceAgentLifecycleStateByID(gomock.Any(), gomock.Any()).Return(nil)
|
||||
dbM.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), agentStarting.ResourceID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{
|
||||
CreatedAt: buildCreatedAt,
|
||||
Transition: database.WorkspaceTransitionStart,
|
||||
TemplateName: "test-template",
|
||||
OrganizationName: "test-org",
|
||||
IsPrebuild: false,
|
||||
AllAgentsReady: true,
|
||||
LastAgentReadyAt: agentReadyAt,
|
||||
WorstStatus: "error", // One agent had an error
|
||||
}, nil)
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := agentapi.NewLifecycleMetrics(reg)
|
||||
|
||||
api := &agentapi.LifecycleAPI{
|
||||
AgentFn: func(ctx context.Context) (database.WorkspaceAgent, error) {
|
||||
return agentStarting, nil
|
||||
},
|
||||
WorkspaceID: workspaceID,
|
||||
Database: dbM,
|
||||
Log: testutil.Logger(t),
|
||||
Metrics: metrics,
|
||||
PublishWorkspaceUpdateFn: nil,
|
||||
}
|
||||
|
||||
resp, err := api.UpdateLifecycle(context.Background(), &agentproto.UpdateLifecycleRequest{
|
||||
Lifecycle: lifecycle,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, lifecycle, resp)
|
||||
|
||||
got := promhelp.HistogramValue(t, reg, fullMetricName, prometheus.Labels{
|
||||
"template_name": "test-template",
|
||||
"organization_name": "test-org",
|
||||
"transition": "start",
|
||||
"status": "error",
|
||||
"is_prebuild": "false",
|
||||
})
|
||||
require.Equal(t, uint64(1), got.GetSampleCount())
|
||||
require.Equal(t, expectedDuration, got.GetSampleSum())
|
||||
})
|
||||
}
|
||||
|
||||
func TestUpdateStartup(t *testing.T) {
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
package agentapi
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
"cdr.dev/slog/v3"
|
||||
)
|
||||
|
||||
// BuildDurationMetricName is the short name for the end-to-end
|
||||
// workspace build duration histogram. The full metric name is
|
||||
// prefixed with the namespace "coderd_".
|
||||
const BuildDurationMetricName = "template_workspace_build_duration_seconds"
|
||||
|
||||
// LifecycleMetrics contains Prometheus metrics for the lifecycle API.
|
||||
type LifecycleMetrics struct {
|
||||
BuildDuration *prometheus.HistogramVec
|
||||
}
|
||||
|
||||
// NewLifecycleMetrics creates and registers all lifecycle-related
|
||||
// Prometheus metrics.
|
||||
//
|
||||
// The build duration histogram tracks the end-to-end duration from
|
||||
// workspace build creation to agent ready, by template. It is
|
||||
// recorded by the coderd replica handling the agent's connection
|
||||
// when the last agent reports ready. In multi-replica deployments,
|
||||
// each replica only has observations for agents it handles.
|
||||
//
|
||||
// The "is_prebuild" label distinguishes prebuild creation (background,
|
||||
// no user waiting) from user-initiated builds (regular workspace
|
||||
// creation or prebuild claims).
|
||||
func NewLifecycleMetrics(reg prometheus.Registerer) *LifecycleMetrics {
|
||||
m := &LifecycleMetrics{
|
||||
BuildDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: "coderd",
|
||||
Name: BuildDurationMetricName,
|
||||
Help: "Duration from workspace build creation to agent ready, by template.",
|
||||
Buckets: []float64{
|
||||
1, // 1s
|
||||
10,
|
||||
30,
|
||||
60, // 1min
|
||||
60 * 5,
|
||||
60 * 10,
|
||||
60 * 30, // 30min
|
||||
60 * 60, // 1hr
|
||||
},
|
||||
NativeHistogramBucketFactor: 1.1,
|
||||
NativeHistogramMaxBucketNumber: 100,
|
||||
NativeHistogramMinResetDuration: time.Hour,
|
||||
}, []string{"template_name", "organization_name", "transition", "status", "is_prebuild"}),
|
||||
}
|
||||
reg.MustRegister(m.BuildDuration)
|
||||
return m
|
||||
}
|
||||
|
||||
// emitBuildDurationMetric records the end-to-end workspace build
|
||||
// duration from build creation to when all agents are ready.
|
||||
func (a *LifecycleAPI) emitBuildDurationMetric(ctx context.Context, resourceID uuid.UUID) {
|
||||
if a.Metrics == nil {
|
||||
return
|
||||
}
|
||||
|
||||
buildInfo, err := a.Database.GetWorkspaceBuildMetricsByResourceID(ctx, resourceID)
|
||||
if err != nil {
|
||||
a.Log.Warn(ctx, "failed to get build info for metrics", slog.Error(err))
|
||||
return
|
||||
}
|
||||
|
||||
// Wait until all agents have reached a terminal startup state.
|
||||
if !buildInfo.AllAgentsReady {
|
||||
return
|
||||
}
|
||||
|
||||
// LastAgentReadyAt is the MAX(ready_at) across all agents. Since
|
||||
// we only get here when AllAgentsReady is true, this should always
|
||||
// be valid.
|
||||
if buildInfo.LastAgentReadyAt.IsZero() {
|
||||
a.Log.Warn(ctx, "last_agent_ready_at is unexpectedly zero",
|
||||
slog.F("last_agent_ready_at", buildInfo.LastAgentReadyAt))
|
||||
return
|
||||
}
|
||||
|
||||
duration := buildInfo.LastAgentReadyAt.Sub(buildInfo.CreatedAt).Seconds()
|
||||
|
||||
a.Metrics.BuildDuration.WithLabelValues(
|
||||
buildInfo.TemplateName,
|
||||
buildInfo.OrganizationName,
|
||||
string(buildInfo.Transition),
|
||||
buildInfo.WorstStatus,
|
||||
strconv.FormatBool(buildInfo.IsPrebuild),
|
||||
).Observe(duration)
|
||||
}
|
||||
+5
-2
@@ -42,6 +42,7 @@ import (
|
||||
"cdr.dev/slog/v3"
|
||||
agentproto "github.com/coder/coder/v2/agent/proto"
|
||||
"github.com/coder/coder/v2/buildinfo"
|
||||
"github.com/coder/coder/v2/coderd/agentapi"
|
||||
"github.com/coder/coder/v2/coderd/agentapi/metadatabatcher"
|
||||
_ "github.com/coder/coder/v2/coderd/apidoc" // Used for swagger docs.
|
||||
"github.com/coder/coder/v2/coderd/appearance"
|
||||
@@ -754,6 +755,7 @@ func New(options *Options) *API {
|
||||
api.agentProvider = stn
|
||||
if options.DeploymentValues.Prometheus.Enable {
|
||||
options.PrometheusRegistry.MustRegister(stn)
|
||||
api.lifecycleMetrics = agentapi.NewLifecycleMetrics(options.PrometheusRegistry)
|
||||
}
|
||||
api.NetworkTelemetryBatcher = tailnet.NewNetworkTelemetryBatcher(
|
||||
quartz.NewReal(),
|
||||
@@ -1888,8 +1890,9 @@ type API struct {
|
||||
healthCheckCache atomic.Pointer[healthsdk.HealthcheckReport]
|
||||
healthCheckProgress healthcheck.Progress
|
||||
|
||||
statsReporter *workspacestats.Reporter
|
||||
metadataBatcher *metadatabatcher.Batcher
|
||||
statsReporter *workspacestats.Reporter
|
||||
metadataBatcher *metadatabatcher.Batcher
|
||||
lifecycleMetrics *agentapi.LifecycleMetrics
|
||||
|
||||
Acquirer *provisionerdserver.Acquirer
|
||||
// dbRolluper rolls up template usage stats from raw agent and app
|
||||
|
||||
@@ -3886,6 +3886,14 @@ func (q *querier) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Conte
|
||||
return q.db.GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx, arg)
|
||||
}
|
||||
|
||||
func (q *querier) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) {
|
||||
// Verify access to the resource first.
|
||||
if _, err := q.GetWorkspaceResourceByID(ctx, id); err != nil {
|
||||
return database.GetWorkspaceBuildMetricsByResourceIDRow{}, err
|
||||
}
|
||||
return q.db.GetWorkspaceBuildMetricsByResourceID(ctx, id)
|
||||
}
|
||||
|
||||
func (q *querier) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) {
|
||||
// Authorized call to get the workspace build. If we can read the build,
|
||||
// we can read the params.
|
||||
|
||||
@@ -2036,6 +2036,18 @@ func (s *MethodTestSuite) TestWorkspace() {
|
||||
dbm.EXPECT().GetWorkspaceByID(gomock.Any(), build.WorkspaceID).Return(ws, nil).AnyTimes()
|
||||
check.Args(res.ID).Asserts(ws, policy.ActionRead).Returns(res)
|
||||
}))
|
||||
s.Run("GetWorkspaceBuildMetricsByResourceID", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
|
||||
ws := testutil.Fake(s.T(), faker, database.Workspace{})
|
||||
build := testutil.Fake(s.T(), faker, database.WorkspaceBuild{WorkspaceID: ws.ID})
|
||||
job := testutil.Fake(s.T(), faker, database.ProvisionerJob{ID: build.JobID, Type: database.ProvisionerJobTypeWorkspaceBuild})
|
||||
res := testutil.Fake(s.T(), faker, database.WorkspaceResource{JobID: build.JobID})
|
||||
dbm.EXPECT().GetWorkspaceResourceByID(gomock.Any(), res.ID).Return(res, nil).AnyTimes()
|
||||
dbm.EXPECT().GetProvisionerJobByID(gomock.Any(), res.JobID).Return(job, nil).AnyTimes()
|
||||
dbm.EXPECT().GetWorkspaceBuildByJobID(gomock.Any(), res.JobID).Return(build, nil).AnyTimes()
|
||||
dbm.EXPECT().GetWorkspaceByID(gomock.Any(), build.WorkspaceID).Return(ws, nil).AnyTimes()
|
||||
dbm.EXPECT().GetWorkspaceBuildMetricsByResourceID(gomock.Any(), res.ID).Return(database.GetWorkspaceBuildMetricsByResourceIDRow{}, nil).AnyTimes()
|
||||
check.Args(res.ID).Asserts(ws, policy.ActionRead).Returns(database.GetWorkspaceBuildMetricsByResourceIDRow{})
|
||||
}))
|
||||
s.Run("Build/GetWorkspaceResourcesByJobID", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
|
||||
ws := testutil.Fake(s.T(), faker, database.Workspace{})
|
||||
build := testutil.Fake(s.T(), faker, database.WorkspaceBuild{WorkspaceID: ws.ID})
|
||||
|
||||
@@ -2406,6 +2406,14 @@ func (m queryMetricsStore) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx cont
|
||||
return r0, r1
|
||||
}
|
||||
|
||||
func (m queryMetricsStore) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) {
|
||||
start := time.Now()
|
||||
r0, r1 := m.s.GetWorkspaceBuildMetricsByResourceID(ctx, id)
|
||||
m.queryLatencies.WithLabelValues("GetWorkspaceBuildMetricsByResourceID").Observe(time.Since(start).Seconds())
|
||||
m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "GetWorkspaceBuildMetricsByResourceID").Inc()
|
||||
return r0, r1
|
||||
}
|
||||
|
||||
func (m queryMetricsStore) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) {
|
||||
start := time.Now()
|
||||
r0, r1 := m.s.GetWorkspaceBuildParameters(ctx, workspaceBuildID)
|
||||
|
||||
@@ -4499,6 +4499,21 @@ func (mr *MockStoreMockRecorder) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ct
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspaceBuildByWorkspaceIDAndBuildNumber", reflect.TypeOf((*MockStore)(nil).GetWorkspaceBuildByWorkspaceIDAndBuildNumber), ctx, arg)
|
||||
}
|
||||
|
||||
// GetWorkspaceBuildMetricsByResourceID mocks base method.
|
||||
func (m *MockStore) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (database.GetWorkspaceBuildMetricsByResourceIDRow, error) {
|
||||
m.ctrl.T.Helper()
|
||||
ret := m.ctrl.Call(m, "GetWorkspaceBuildMetricsByResourceID", ctx, id)
|
||||
ret0, _ := ret[0].(database.GetWorkspaceBuildMetricsByResourceIDRow)
|
||||
ret1, _ := ret[1].(error)
|
||||
return ret0, ret1
|
||||
}
|
||||
|
||||
// GetWorkspaceBuildMetricsByResourceID indicates an expected call of GetWorkspaceBuildMetricsByResourceID.
|
||||
func (mr *MockStoreMockRecorder) GetWorkspaceBuildMetricsByResourceID(ctx, id any) *gomock.Call {
|
||||
mr.mock.ctrl.T.Helper()
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetWorkspaceBuildMetricsByResourceID", reflect.TypeOf((*MockStore)(nil).GetWorkspaceBuildMetricsByResourceID), ctx, id)
|
||||
}
|
||||
|
||||
// GetWorkspaceBuildParameters mocks base method.
|
||||
func (m *MockStore) GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]database.WorkspaceBuildParameter, error) {
|
||||
m.ctrl.T.Helper()
|
||||
|
||||
@@ -501,6 +501,9 @@ type sqlcQuerier interface {
|
||||
GetWorkspaceBuildByID(ctx context.Context, id uuid.UUID) (WorkspaceBuild, error)
|
||||
GetWorkspaceBuildByJobID(ctx context.Context, jobID uuid.UUID) (WorkspaceBuild, error)
|
||||
GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Context, arg GetWorkspaceBuildByWorkspaceIDAndBuildNumberParams) (WorkspaceBuild, error)
|
||||
// Returns build metadata for e2e workspace build duration metrics.
|
||||
// Also checks if all agents are ready and returns the worst status.
|
||||
GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (GetWorkspaceBuildMetricsByResourceIDRow, error)
|
||||
GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]WorkspaceBuildParameter, error)
|
||||
GetWorkspaceBuildParametersByBuildIDs(ctx context.Context, workspaceBuildIds []uuid.UUID) ([]WorkspaceBuildParameter, error)
|
||||
GetWorkspaceBuildStatsByTemplates(ctx context.Context, since time.Time) ([]GetWorkspaceBuildStatsByTemplatesRow, error)
|
||||
|
||||
@@ -21344,6 +21344,62 @@ func (q *sqlQuerier) GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx context.Co
|
||||
return i, err
|
||||
}
|
||||
|
||||
const getWorkspaceBuildMetricsByResourceID = `-- name: GetWorkspaceBuildMetricsByResourceID :one
|
||||
SELECT
|
||||
wb.created_at,
|
||||
wb.transition,
|
||||
t.name AS template_name,
|
||||
o.name AS organization_name,
|
||||
(w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0') AS is_prebuild,
|
||||
-- All agents must have ready_at set (terminal startup state)
|
||||
COUNT(*) FILTER (WHERE wa.ready_at IS NULL) = 0 AS all_agents_ready,
|
||||
-- Latest ready_at across all agents (for duration calculation)
|
||||
MAX(wa.ready_at)::timestamptz AS last_agent_ready_at,
|
||||
-- Worst status: error > timeout > ready
|
||||
CASE
|
||||
WHEN bool_or(wa.lifecycle_state = 'start_error') THEN 'error'
|
||||
WHEN bool_or(wa.lifecycle_state = 'start_timeout') THEN 'timeout'
|
||||
ELSE 'success'
|
||||
END AS worst_status
|
||||
FROM workspace_builds wb
|
||||
JOIN workspaces w ON wb.workspace_id = w.id
|
||||
JOIN templates t ON w.template_id = t.id
|
||||
JOIN organizations o ON t.organization_id = o.id
|
||||
JOIN workspace_resources wr ON wr.job_id = wb.job_id
|
||||
JOIN workspace_agents wa ON wa.resource_id = wr.id
|
||||
WHERE wb.job_id = (SELECT job_id FROM workspace_resources WHERE workspace_resources.id = $1)
|
||||
GROUP BY wb.created_at, wb.transition, t.name, o.name, w.owner_id
|
||||
`
|
||||
|
||||
type GetWorkspaceBuildMetricsByResourceIDRow struct {
|
||||
CreatedAt time.Time `db:"created_at" json:"created_at"`
|
||||
Transition WorkspaceTransition `db:"transition" json:"transition"`
|
||||
TemplateName string `db:"template_name" json:"template_name"`
|
||||
OrganizationName string `db:"organization_name" json:"organization_name"`
|
||||
IsPrebuild bool `db:"is_prebuild" json:"is_prebuild"`
|
||||
AllAgentsReady bool `db:"all_agents_ready" json:"all_agents_ready"`
|
||||
LastAgentReadyAt time.Time `db:"last_agent_ready_at" json:"last_agent_ready_at"`
|
||||
WorstStatus string `db:"worst_status" json:"worst_status"`
|
||||
}
|
||||
|
||||
// Returns build metadata for e2e workspace build duration metrics.
|
||||
// Also checks if all agents are ready and returns the worst status.
|
||||
func (q *sqlQuerier) GetWorkspaceBuildMetricsByResourceID(ctx context.Context, id uuid.UUID) (GetWorkspaceBuildMetricsByResourceIDRow, error) {
|
||||
row := q.db.QueryRowContext(ctx, getWorkspaceBuildMetricsByResourceID, id)
|
||||
var i GetWorkspaceBuildMetricsByResourceIDRow
|
||||
err := row.Scan(
|
||||
&i.CreatedAt,
|
||||
&i.Transition,
|
||||
&i.TemplateName,
|
||||
&i.OrganizationName,
|
||||
&i.IsPrebuild,
|
||||
&i.AllAgentsReady,
|
||||
&i.LastAgentReadyAt,
|
||||
&i.WorstStatus,
|
||||
)
|
||||
return i, err
|
||||
}
|
||||
|
||||
const getWorkspaceBuildStatsByTemplates = `-- name: GetWorkspaceBuildStatsByTemplates :many
|
||||
SELECT
|
||||
w.template_id,
|
||||
|
||||
@@ -243,3 +243,31 @@ SET
|
||||
has_external_agent = @has_external_agent,
|
||||
updated_at = @updated_at::timestamptz
|
||||
WHERE id = @id::uuid;
|
||||
|
||||
-- name: GetWorkspaceBuildMetricsByResourceID :one
|
||||
-- Returns build metadata for e2e workspace build duration metrics.
|
||||
-- Also checks if all agents are ready and returns the worst status.
|
||||
SELECT
|
||||
wb.created_at,
|
||||
wb.transition,
|
||||
t.name AS template_name,
|
||||
o.name AS organization_name,
|
||||
(w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0') AS is_prebuild,
|
||||
-- All agents must have ready_at set (terminal startup state)
|
||||
COUNT(*) FILTER (WHERE wa.ready_at IS NULL) = 0 AS all_agents_ready,
|
||||
-- Latest ready_at across all agents (for duration calculation)
|
||||
MAX(wa.ready_at)::timestamptz AS last_agent_ready_at,
|
||||
-- Worst status: error > timeout > ready
|
||||
CASE
|
||||
WHEN bool_or(wa.lifecycle_state = 'start_error') THEN 'error'
|
||||
WHEN bool_or(wa.lifecycle_state = 'start_timeout') THEN 'timeout'
|
||||
ELSE 'success'
|
||||
END AS worst_status
|
||||
FROM workspace_builds wb
|
||||
JOIN workspaces w ON wb.workspace_id = w.id
|
||||
JOIN templates t ON w.template_id = t.id
|
||||
JOIN organizations o ON t.organization_id = o.id
|
||||
JOIN workspace_resources wr ON wr.job_id = wb.job_id
|
||||
JOIN workspace_agents wa ON wa.resource_id = wr.id
|
||||
WHERE wb.job_id = (SELECT job_id FROM workspace_resources WHERE workspace_resources.id = $1)
|
||||
GROUP BY wb.created_at, wb.transition, t.name, o.name, w.owner_id;
|
||||
|
||||
@@ -158,6 +158,7 @@ func (api *API) workspaceAgentRPC(rw http.ResponseWriter, r *http.Request) {
|
||||
DerpMapUpdateFrequency: api.Options.DERPMapUpdateFrequency,
|
||||
ExternalAuthConfigs: api.ExternalAuthConfigs,
|
||||
Experiments: api.Experiments,
|
||||
LifecycleMetrics: api.lifecycleMetrics,
|
||||
|
||||
// Optional:
|
||||
UpdateAgentMetricsFn: api.UpdateAgentMetrics,
|
||||
|
||||
@@ -162,6 +162,7 @@ deployment. They will always be available from the agent.
|
||||
| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` |
|
||||
| `coderd_provisionerd_num_daemons` | gauge | The number of provisioner daemons. | |
|
||||
| `coderd_provisionerd_workspace_build_timings_seconds` | histogram | The time taken for a workspace to build. | `status` `template_name` `template_version` `workspace_transition` |
|
||||
| `coderd_template_workspace_build_duration_seconds` | histogram | Duration from workspace build creation to agent ready, by template. | `is_prebuild` `organization_name` `status` `template_name` `transition` |
|
||||
| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` |
|
||||
| `coderd_workspace_creation_duration_seconds` | histogram | Time to create a workspace by organization, template, preset, and type (regular or prebuild). | `organization_name` `preset_name` `template_name` `type` |
|
||||
| `coderd_workspace_creation_total` | counter | Total regular (non-prebuilt) workspace creations by organization, template, and preset. | `organization_name` `preset_name` `template_name` |
|
||||
@@ -211,6 +212,7 @@ The following metrics support native histograms:
|
||||
|
||||
* `coderd_workspace_creation_duration_seconds`
|
||||
* `coderd_prebuilt_workspace_claim_duration_seconds`
|
||||
* `coderd_template_coderd_template_workspace_build_duration_seconds`
|
||||
|
||||
Native histograms are an **experimental** Prometheus feature that removes the need to predefine bucket boundaries and allows higher-resolution buckets that adapt to deployment characteristics.
|
||||
Whether a metric is exposed as classic or native depends entirely on the Prometheus server configuration (see [Prometheus docs](https://prometheus.io/docs/specs/native_histograms/) for details):
|
||||
|
||||
@@ -740,6 +740,19 @@ coderd_workspace_creation_duration_seconds_bucket{organization_name="{organizati
|
||||
coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="+Inf"} 1
|
||||
coderd_workspace_creation_duration_seconds_sum{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild"} 4.406214
|
||||
coderd_workspace_creation_duration_seconds_count{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild"} 1
|
||||
# HELP coderd_template_workspace_build_duration_seconds Duration from workspace build creation to agent ready, by template.
|
||||
# TYPE coderd_template_workspace_build_duration_seconds histogram
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="1"} 0
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="10"} 1
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="30"} 1
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="60"} 1
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="300"} 1
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="600"} 1
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="1800"} 1
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="3600"} 1
|
||||
coderd_template_workspace_build_duration_seconds_bucket{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start",le="+Inf"} 1
|
||||
coderd_template_workspace_build_duration_seconds_sum{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start"} 7.241532
|
||||
coderd_template_workspace_build_duration_seconds_count{is_prebuild="false",organization_name="{organization}",status="success",template_name="docker",transition="start"} 1
|
||||
# HELP coderd_prebuilt_workspace_claim_duration_seconds Time to claim a prebuilt workspace by organization, template, and preset.
|
||||
# TYPE coderd_prebuilt_workspace_claim_duration_seconds histogram
|
||||
coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="1"} 0
|
||||
|
||||
Reference in New Issue
Block a user