mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: add provisioner job queue wait time histogram and jobs enqueued counter (#21869)
This PR adds some metrics to help identify job enqueue rates and latencies. This work was initiated as a way to help reduce the cost of the observation/measurement itself for autostart scaletests, which impacts our ability to identify/reason about the load caused by autostart. See: https://github.com/coder/internal/issues/1209

I've extended the metrics here to account for regular user-initiated builds, prebuilds, autostarts, etc. IMO there is still an open question of whether we want or need the `transition` label, which is only present on workspace builds. Including it does lead to an increase in cardinality, and in the case of the histogram (when not using native histograms) that's at least a few extra series for every bucket. We could remove the transition label there but keep it on the counter. Additionally, the histogram currently observes latencies for other jobs, such as template builds/version imports, which do not have a transition type associated with them.

Tested briefly in a workspace; metric values like the following are visible:
- `coderd_workspace_builds_enqueued_total{build_reason="autostart",provisioner_type="terraform",status="success",transition="start"} 1`
- `coderd_provisioner_job_queue_wait_seconds_bucket{build_reason="autostart",job_type="workspace_build",provisioner_type="terraform",transition="start",le="0.025"} 1`

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
@@ -21,7 +22,9 @@ import (
|
||||
"github.com/coder/coder/v2/agent/agenttest"
|
||||
"github.com/coder/coder/v2/coderd"
|
||||
"github.com/coder/coder/v2/coderd/audit"
|
||||
"github.com/coder/coder/v2/coderd/autobuild"
|
||||
"github.com/coder/coder/v2/coderd/coderdtest"
|
||||
"github.com/coder/coder/v2/coderd/coderdtest/promhelp"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
||||
"github.com/coder/coder/v2/coderd/database/dbfake"
|
||||
@@ -30,6 +33,7 @@ import (
|
||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||
"github.com/coder/coder/v2/coderd/notifications"
|
||||
"github.com/coder/coder/v2/coderd/notifications/notificationstest"
|
||||
"github.com/coder/coder/v2/coderd/provisionerdserver"
|
||||
"github.com/coder/coder/v2/coderd/rbac"
|
||||
"github.com/coder/coder/v2/coderd/rbac/policy"
|
||||
"github.com/coder/coder/v2/coderd/render"
|
||||
@@ -37,6 +41,7 @@ import (
|
||||
"github.com/coder/coder/v2/coderd/schedule/cron"
|
||||
"github.com/coder/coder/v2/coderd/util/ptr"
|
||||
"github.com/coder/coder/v2/coderd/util/slice"
|
||||
"github.com/coder/coder/v2/coderd/wsbuilder"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
"github.com/coder/coder/v2/cryptorand"
|
||||
"github.com/coder/coder/v2/provisioner/echo"
|
||||
@@ -5901,3 +5906,135 @@ func TestWorkspaceCreateWithImplicitPreset(t *testing.T) {
|
||||
require.Equal(t, preset2ID, *ws2.LatestBuild.TemplateVersionPresetID)
|
||||
})
|
||||
}
|
||||
|
||||
func TestProvisionerJobQueueWaitMetric(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
logger := testutil.Logger(t)
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := provisionerdserver.NewMetrics(logger)
|
||||
err := metrics.Register(reg)
|
||||
require.NoError(t, err)
|
||||
|
||||
client := coderdtest.New(t, &coderdtest.Options{
|
||||
IncludeProvisionerDaemon: true,
|
||||
ProvisionerdServerMetrics: metrics,
|
||||
})
|
||||
user := coderdtest.CreateFirstUser(t, client)
|
||||
|
||||
// Create a template version - this triggers a template_version_import job.
|
||||
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, nil)
|
||||
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
|
||||
|
||||
// Check that the queue wait metric was recorded for the template_version_import job.
|
||||
importMetric := promhelp.MetricValue(t, reg, "coderd_provisioner_job_queue_wait_seconds", prometheus.Labels{
|
||||
"provisioner_type": string(database.ProvisionerTypeEcho),
|
||||
"job_type": string(database.ProvisionerJobTypeTemplateVersionImport),
|
||||
"transition": "",
|
||||
"build_reason": "",
|
||||
})
|
||||
require.NotNil(t, importMetric, "import job metric should be recorded")
|
||||
importHistogram := importMetric.GetHistogram()
|
||||
require.NotNil(t, importHistogram)
|
||||
require.Equal(t, uint64(1), importHistogram.GetSampleCount(), "import job should have 1 sample")
|
||||
require.Greater(t, importHistogram.GetSampleSum(), 0.0, "import job queue wait should be non-zero")
|
||||
|
||||
// Create a template and workspace - this triggers a workspace_build job.
|
||||
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
|
||||
workspace := coderdtest.CreateWorkspace(t, client, template.ID)
|
||||
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
|
||||
|
||||
// Check that the queue wait metric was recorded for the workspace_build job.
|
||||
buildMetric := promhelp.MetricValue(t, reg, "coderd_provisioner_job_queue_wait_seconds", prometheus.Labels{
|
||||
"provisioner_type": string(database.ProvisionerTypeEcho),
|
||||
"job_type": string(database.ProvisionerJobTypeWorkspaceBuild),
|
||||
"transition": string(database.WorkspaceTransitionStart),
|
||||
"build_reason": string(database.BuildReasonInitiator),
|
||||
})
|
||||
require.NotNil(t, buildMetric, "workspace build job metric should be recorded")
|
||||
buildHistogram := buildMetric.GetHistogram()
|
||||
require.NotNil(t, buildHistogram)
|
||||
require.Equal(t, uint64(1), buildHistogram.GetSampleCount(), "workspace build job should have 1 sample")
|
||||
require.Greater(t, buildHistogram.GetSampleSum(), 0.0, "workspace build job queue wait should be non-zero")
|
||||
}
|
||||
|
||||
func TestWorkspaceBuildsEnqueuedMetric(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var (
|
||||
logger = testutil.Logger(t)
|
||||
reg = prometheus.NewRegistry()
|
||||
metrics = provisionerdserver.NewMetrics(logger)
|
||||
|
||||
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
|
||||
tickCh = make(chan time.Time)
|
||||
statsCh = make(chan autobuild.Stats)
|
||||
)
|
||||
|
||||
err := metrics.Register(reg)
|
||||
require.NoError(t, err)
|
||||
|
||||
wsBuilderMetrics, err := wsbuilder.NewMetrics(reg)
|
||||
require.NoError(t, err)
|
||||
|
||||
client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{
|
||||
IncludeProvisionerDaemon: true,
|
||||
ProvisionerdServerMetrics: metrics,
|
||||
WorkspaceBuilderMetrics: wsBuilderMetrics,
|
||||
AutobuildTicker: tickCh,
|
||||
AutobuildStats: statsCh,
|
||||
})
|
||||
user := coderdtest.CreateFirstUser(t, client)
|
||||
|
||||
// Create a template and workspace with autostart schedule.
|
||||
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, nil)
|
||||
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
|
||||
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
|
||||
workspace := coderdtest.CreateWorkspace(t, client, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) {
|
||||
cwr.AutostartSchedule = ptr.Ref(sched.String())
|
||||
})
|
||||
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
|
||||
|
||||
// Stop the workspace to prepare for autostart.
|
||||
workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID, codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop)
|
||||
|
||||
// Trigger an autostart build via the autobuild ticker. This verifies that
|
||||
// autostart builds are recorded with build_reason="autostart".
|
||||
p, err := coderdtest.GetProvisionerForTags(db, time.Now(), workspace.OrganizationID, map[string]string{})
|
||||
require.NoError(t, err)
|
||||
|
||||
go func() {
|
||||
tickTime := sched.Next(workspace.LatestBuild.CreatedAt)
|
||||
coderdtest.UpdateProvisionerLastSeenAt(t, db, p.ID, tickTime)
|
||||
tickCh <- tickTime
|
||||
close(tickCh)
|
||||
}()
|
||||
|
||||
// Wait for the autostart to complete.
|
||||
stats := <-statsCh
|
||||
require.Len(t, stats.Errors, 0)
|
||||
require.Len(t, stats.Transitions, 1)
|
||||
require.Contains(t, stats.Transitions, workspace.ID)
|
||||
require.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID])
|
||||
|
||||
// Verify the workspace was autostarted.
|
||||
workspace = coderdtest.MustWorkspace(t, client, workspace.ID)
|
||||
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
|
||||
require.Equal(t, codersdk.BuildReasonAutostart, workspace.LatestBuild.Reason)
|
||||
|
||||
// Now check the autostart metric was recorded.
|
||||
autostartCount := promhelp.CounterValue(t, reg, "coderd_workspace_builds_enqueued_total", prometheus.Labels{
|
||||
"provisioner_type": string(database.ProvisionerTypeEcho),
|
||||
"build_reason": string(database.BuildReasonAutostart),
|
||||
"transition": string(database.WorkspaceTransitionStart),
|
||||
"status": wsbuilder.BuildStatusSuccess,
|
||||
})
|
||||
require.Equal(t, 1, autostartCount, "autostart should record 1 enqueue with build_reason=autostart")
|
||||
}
|
||||
|
||||
func mustSchedule(t *testing.T, s string) *cron.Schedule {
|
||||
t.Helper()
|
||||
sched, err := cron.Weekly(s)
|
||||
require.NoError(t, err)
|
||||
return sched
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user