diff --git a/agent/agent.go b/agent/agent.go index 1b5247331b..4a7b9a827b 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -35,6 +35,8 @@ import ( "tailscale.com/types/netlogtype" "cdr.dev/slog" + "github.com/coder/retry" + "github.com/coder/coder/v2/agent/agentproc" "github.com/coder/coder/v2/agent/agentscripts" "github.com/coder/coder/v2/agent/agentssh" @@ -45,7 +47,6 @@ import ( "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/agentsdk" "github.com/coder/coder/v2/tailnet" - "github.com/coder/retry" ) const ( @@ -222,8 +223,10 @@ type agent struct { connCountReconnectingPTY atomic.Int64 prometheusRegistry *prometheus.Registry - metrics *agentMetrics - syscaller agentproc.Syscaller + // metrics are prometheus registered metrics that will be collected and + // labeled in Coder with the agent + workspace. + metrics *agentMetrics + syscaller agentproc.Syscaller // modifiedProcs is used for testing process priority management. modifiedProcs chan []*agentproc.Process @@ -252,6 +255,9 @@ func (a *agent) init(ctx context.Context) { Filesystem: a.filesystem, PatchLogs: a.client.PatchLogs, }) + // Register runner metrics. If the prom registry is nil, the metrics + // will not report anywhere. + a.scriptRunner.RegisterMetrics(a.prometheusRegistry) go a.runLoop(ctx) } @@ -745,9 +751,12 @@ func (a *agent) run(ctx context.Context) error { return xerrors.Errorf("init script runner: %w", err) } err = a.trackConnGoroutine(func() { + start := time.Now() err := a.scriptRunner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool { return script.RunOnStart }) + // Measure the time immediately after the script has finished + dur := time.Since(start).Seconds() if err != nil { a.logger.Warn(ctx, "startup script(s) failed", slog.Error(err)) if errors.Is(err, agentscripts.ErrTimeout) { @@ -758,6 +767,12 @@ func (a *agent) run(ctx context.Context) error { } else { a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleReady) } + + label := "false" + if err == nil { + label = "true" + } + a.metrics.startupScriptSeconds.WithLabelValues(label).Set(dur) a.scriptRunner.StartCron() }) if err != nil { diff --git a/agent/agent_test.go b/agent/agent_test.go index 19e28346ad..69a4a1ac91 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -46,6 +46,7 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/v2/agent" "github.com/coder/coder/v2/agent/agentproc" "github.com/coder/coder/v2/agent/agentproc/agentproctest" @@ -2235,6 +2236,17 @@ func TestAgent_Metrics_SSH(t *testing.T) { Type: agentsdk.AgentMetricTypeCounter, Value: 0, }, + { + Name: "coderd_agentstats_startup_script_seconds", + Type: agentsdk.AgentMetricTypeGauge, + Value: 0, + Labels: []agentsdk.AgentMetricLabel{ + { + Name: "success", + Value: "true", + }, + }, + }, } var actual []*promgo.MetricFamily diff --git a/agent/agentscripts/agentscripts.go b/agent/agentscripts/agentscripts.go index 3acc48b0a1..8b3aaf9a22 100644 --- a/agent/agentscripts/agentscripts.go +++ b/agent/agentscripts/agentscripts.go @@ -13,12 +13,14 @@ import ( "sync/atomic" "time" + "github.com/prometheus/client_golang/prometheus" "github.com/robfig/cron/v3" "github.com/spf13/afero" "golang.org/x/sync/errgroup" "golang.org/x/xerrors" "cdr.dev/slog" + "github.com/coder/coder/v2/agent/agentssh" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/agentsdk" @@ -57,6 +59,11 @@ func New(opts Options) *Runner { cronCtxCancel: cronCtxCancel, cron: cron.New(cron.WithParser(parser)), closed: make(chan struct{}), + scriptsExecuted: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "scripts", + Name: "executed_total", + }, []string{"success"}), } } @@ -71,6 +78,19 @@ type Runner struct { cron *cron.Cron initialized atomic.Bool scripts []codersdk.WorkspaceAgentScript + + // scriptsExecuted includes all scripts executed by the workspace agent. Agents + // execute startup scripts, and scripts on a cron schedule. Both will increment + // this counter. + scriptsExecuted *prometheus.CounterVec +} + +func (r *Runner) RegisterMetrics(reg prometheus.Registerer) { + if reg == nil { + // If no registry, do nothing. + return + } + reg.MustRegister(r.scriptsExecuted) } // Init initializes the runner with the provided scripts. @@ -90,7 +110,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error { } script := script _, err := r.cron.AddFunc(script.Cron, func() { - err := r.run(r.cronCtx, script) + err := r.trackRun(r.cronCtx, script) if err != nil { r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err)) } @@ -131,7 +151,7 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp } script := script eg.Go(func() error { - err := r.run(ctx, script) + err := r.trackRun(ctx, script) if err != nil { return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err) } @@ -141,6 +161,17 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp return eg.Wait() } +// trackRun wraps "run" with metrics. +func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error { + err := r.run(ctx, script) + if err != nil { + r.scriptsExecuted.WithLabelValues("false").Add(1) + } else { + r.scriptsExecuted.WithLabelValues("true").Add(1) + } + return err +} + // run executes the provided script with the timeout. // If the timeout is exceeded, the process is sent an interrupt signal. // If the process does not exit after a few seconds, it is forcefully killed. diff --git a/agent/metrics.go b/agent/metrics.go index ddbe6f49be..d987bad9a5 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -17,6 +17,9 @@ import ( type agentMetrics struct { connectionsTotal prometheus.Counter reconnectingPTYErrors *prometheus.CounterVec + // startupScriptSeconds is the time in seconds that the start script(s) + // took to run. This is reported once per agent. + startupScriptSeconds *prometheus.GaugeVec } func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { @@ -35,9 +38,18 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { ) registerer.MustRegister(reconnectingPTYErrors) + startupScriptSeconds := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "startup_script_seconds", + Help: "Amount of time taken to run the startup script in seconds.", + }, []string{"success"}) + registerer.MustRegister(startupScriptSeconds) + return &agentMetrics{ connectionsTotal: connectionsTotal, reconnectingPTYErrors: reconnectingPTYErrors, + startupScriptSeconds: startupScriptSeconds, } } diff --git a/coderd/coderd.go b/coderd/coderd.go index 4475eaf3fa..31a0ec2af2 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -38,8 +38,10 @@ import ( _ "github.com/coder/coder/v2/coderd/apidoc" "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/healthcheck/derphealth" + "github.com/coder/coder/v2/coderd/prometheusmetrics" "cdr.dev/slog" + "github.com/coder/coder/v2/buildinfo" "github.com/coder/coder/v2/cli/clibase" "github.com/coder/coder/v2/coderd/audit" @@ -168,7 +170,7 @@ type Options struct { HTTPClient *http.Client - UpdateAgentMetrics func(ctx context.Context, username, workspaceName, agentName string, metrics []agentsdk.AgentMetric) + UpdateAgentMetrics func(ctx context.Context, labels prometheusmetrics.AgentMetricLabels, metrics []agentsdk.AgentMetric) StatsBatcher *batchstats.Batcher WorkspaceAppsStatsCollectorOptions workspaceapps.StatsCollectorOptions diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index db36b9e320..42e33ddb89 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1918,7 +1918,7 @@ func (q *querier) GetWorkspaceBuildsCreatedAfter(ctx context.Context, createdAt return q.db.GetWorkspaceBuildsCreatedAfter(ctx, createdAt) } -func (q *querier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.Workspace, error) { +func (q *querier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) { return fetch(q.log, q.auth, q.db.GetWorkspaceByAgentID)(ctx, agentID) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index c210aa00da..9a2b3f3483 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -1065,21 +1065,30 @@ func (s *MethodTestSuite) TestWorkspace() { check.Args(ws.ID).Asserts(ws, rbac.ActionRead).Returns(b) })) s.Run("GetWorkspaceAgentByID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(agt) })) s.Run("GetWorkspaceAgentByInstanceID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) check.Args(agt.AuthInstanceID.String).Asserts(ws, rbac.ActionRead).Returns(agt) })) s.Run("UpdateWorkspaceAgentLifecycleStateByID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1089,7 +1098,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionUpdate).Returns() })) s.Run("UpdateWorkspaceAgentLogOverflowByID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1099,7 +1111,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionUpdate).Returns() })) s.Run("UpdateWorkspaceAgentStartupByID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1111,7 +1126,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionUpdate).Returns() })) s.Run("GetWorkspaceAgentLogsAfter", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1120,7 +1138,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionRead).Returns([]database.WorkspaceAgentLog{}) })) s.Run("GetWorkspaceAppByAgentIDAndSlug", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1132,7 +1153,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionRead).Returns(app) })) s.Run("GetWorkspaceAppsByAgentID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1173,11 +1197,17 @@ func (s *MethodTestSuite) TestWorkspace() { check.Args(database.GetWorkspaceBuildsByWorkspaceIDParams{WorkspaceID: ws.ID}).Asserts(ws, rbac.ActionRead) // ordering })) s.Run("GetWorkspaceByAgentID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) - check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(ws) + check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(database.GetWorkspaceByAgentIDRow{ + Workspace: ws, + TemplateName: tpl.Name, + }) })) s.Run("GetWorkspaceByOwnerIDAndName", s.Subtest(func(db database.Store, check *expects) { ws := dbgen.Workspace(s.T(), db, database.Workspace{}) diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index f733bdfd1b..5442367245 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -4293,11 +4293,24 @@ func (q *FakeQuerier) GetWorkspaceBuildsCreatedAfter(_ context.Context, after ti return workspaceBuilds, nil } -func (q *FakeQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.Workspace, error) { +func (q *FakeQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) { q.mutex.RLock() defer q.mutex.RUnlock() - return q.getWorkspaceByAgentIDNoLock(ctx, agentID) + w, err := q.getWorkspaceByAgentIDNoLock(ctx, agentID) + if err != nil { + return database.GetWorkspaceByAgentIDRow{}, err + } + + tpl, err := q.getTemplateByIDNoLock(ctx, w.TemplateID) + if err != nil { + return database.GetWorkspaceByAgentIDRow{}, err + } + + return database.GetWorkspaceByAgentIDRow{ + Workspace: w, + TemplateName: tpl.Name, + }, nil } func (q *FakeQuerier) GetWorkspaceByID(ctx context.Context, id uuid.UUID) (database.Workspace, error) { diff --git a/coderd/database/dbmetrics/dbmetrics.go b/coderd/database/dbmetrics/dbmetrics.go index 55145d3529..81a1cff902 100644 --- a/coderd/database/dbmetrics/dbmetrics.go +++ b/coderd/database/dbmetrics/dbmetrics.go @@ -1124,7 +1124,7 @@ func (m metricsStore) GetWorkspaceBuildsCreatedAfter(ctx context.Context, create return builds, err } -func (m metricsStore) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.Workspace, error) { +func (m metricsStore) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) { start := time.Now() workspace, err := m.s.GetWorkspaceByAgentID(ctx, agentID) m.queryLatencies.WithLabelValues("GetWorkspaceByAgentID").Observe(time.Since(start).Seconds()) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 8c2b828370..a44eb50343 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -2344,10 +2344,10 @@ func (mr *MockStoreMockRecorder) GetWorkspaceBuildsCreatedAfter(arg0, arg1 inter } // GetWorkspaceByAgentID mocks base method. -func (m *MockStore) GetWorkspaceByAgentID(arg0 context.Context, arg1 uuid.UUID) (database.Workspace, error) { +func (m *MockStore) GetWorkspaceByAgentID(arg0 context.Context, arg1 uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetWorkspaceByAgentID", arg0, arg1) - ret0, _ := ret[0].(database.Workspace) + ret0, _ := ret[0].(database.GetWorkspaceByAgentIDRow) ret1, _ := ret[1].(error) return ret0, ret1 } diff --git a/coderd/database/modelmethods.go b/coderd/database/modelmethods.go index 8d15af65aa..685c138c95 100644 --- a/coderd/database/modelmethods.go +++ b/coderd/database/modelmethods.go @@ -148,6 +148,10 @@ func (g Group) RBACObject() rbac.Object { InOrg(g.OrganizationID) } +func (w GetWorkspaceByAgentIDRow) RBACObject() rbac.Object { + return w.Workspace.RBACObject() +} + func (w Workspace) RBACObject() rbac.Object { return rbac.ResourceWorkspace.WithID(w.ID). InOrg(w.OrganizationID). diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 73638c8206..43a0181b2d 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -231,7 +231,7 @@ type sqlcQuerier interface { GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]WorkspaceBuildParameter, error) GetWorkspaceBuildsByWorkspaceID(ctx context.Context, arg GetWorkspaceBuildsByWorkspaceIDParams) ([]WorkspaceBuild, error) GetWorkspaceBuildsCreatedAfter(ctx context.Context, createdAt time.Time) ([]WorkspaceBuild, error) - GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (Workspace, error) + GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (GetWorkspaceByAgentIDRow, error) GetWorkspaceByID(ctx context.Context, id uuid.UUID) (Workspace, error) GetWorkspaceByOwnerIDAndName(ctx context.Context, arg GetWorkspaceByOwnerIDAndNameParams) (Workspace, error) GetWorkspaceByWorkspaceAppID(ctx context.Context, workspaceAppID uuid.UUID) (Workspace, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 16d3de793a..03d595d9ec 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -10539,9 +10539,12 @@ func (q *sqlQuerier) GetDeploymentWorkspaceStats(ctx context.Context) (GetDeploy const getWorkspaceByAgentID = `-- name: GetWorkspaceByAgentID :one SELECT - id, created_at, updated_at, owner_id, organization_id, template_id, deleted, name, autostart_schedule, ttl, last_used_at, dormant_at, deleting_at, automatic_updates + workspaces.id, workspaces.created_at, workspaces.updated_at, workspaces.owner_id, workspaces.organization_id, workspaces.template_id, workspaces.deleted, workspaces.name, workspaces.autostart_schedule, workspaces.ttl, workspaces.last_used_at, workspaces.dormant_at, workspaces.deleting_at, workspaces.automatic_updates, + templates.name as template_name FROM workspaces +INNER JOIN + templates ON workspaces.template_id = templates.id WHERE workspaces.id = ( SELECT @@ -10567,24 +10570,30 @@ WHERE ) ` -func (q *sqlQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (Workspace, error) { +type GetWorkspaceByAgentIDRow struct { + Workspace Workspace `db:"workspace" json:"workspace"` + TemplateName string `db:"template_name" json:"template_name"` +} + +func (q *sqlQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (GetWorkspaceByAgentIDRow, error) { row := q.db.QueryRowContext(ctx, getWorkspaceByAgentID, agentID) - var i Workspace + var i GetWorkspaceByAgentIDRow err := row.Scan( - &i.ID, - &i.CreatedAt, - &i.UpdatedAt, - &i.OwnerID, - &i.OrganizationID, - &i.TemplateID, - &i.Deleted, - &i.Name, - &i.AutostartSchedule, - &i.Ttl, - &i.LastUsedAt, - &i.DormantAt, - &i.DeletingAt, - &i.AutomaticUpdates, + &i.Workspace.ID, + &i.Workspace.CreatedAt, + &i.Workspace.UpdatedAt, + &i.Workspace.OwnerID, + &i.Workspace.OrganizationID, + &i.Workspace.TemplateID, + &i.Workspace.Deleted, + &i.Workspace.Name, + &i.Workspace.AutostartSchedule, + &i.Workspace.Ttl, + &i.Workspace.LastUsedAt, + &i.Workspace.DormantAt, + &i.Workspace.DeletingAt, + &i.Workspace.AutomaticUpdates, + &i.TemplateName, ) return i, err } diff --git a/coderd/database/queries/workspaces.sql b/coderd/database/queries/workspaces.sql index 7862497ebf..d9ff657fd2 100644 --- a/coderd/database/queries/workspaces.sql +++ b/coderd/database/queries/workspaces.sql @@ -46,9 +46,12 @@ WHERE -- name: GetWorkspaceByAgentID :one SELECT - * + sqlc.embed(workspaces), + templates.name as template_name FROM workspaces +INNER JOIN + templates ON workspaces.template_id = templates.id WHERE workspaces.id = ( SELECT diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index b1091b2451..d3d19bf239 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -47,6 +47,7 @@ type updateRequest struct { username string workspaceName string agentName string + templateName string metrics []agentsdk.AgentMetric @@ -59,6 +60,7 @@ type annotatedMetric struct { username string workspaceName string agentName string + templateName string expiryDate time.Time } @@ -74,7 +76,7 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) labels = append(labels, agentMetricsLabels...) - labelValues = append(labelValues, am.username, am.workspaceName, am.agentName) + labelValues = append(labelValues, am.username, am.workspaceName, am.agentName, am.templateName) for _, l := range am.Labels { labels = append(labels, l.Name) @@ -160,6 +162,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { username: req.username, workspaceName: req.workspaceName, agentName: req.agentName, + templateName: req.templateName, AgentMetric: m, @@ -227,7 +230,16 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { } -var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel} +var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel} + +// AgentMetricLabels are the labels used to decorate an agent's metrics. +// This list should match the list of labels in agentMetricsLabels. +type AgentMetricLabels struct { + Username string + WorkspaceName string + AgentName string + TemplateName string +} func (ma *MetricsAggregator) Collect(ch chan<- prometheus.Metric) { output := make(chan []prometheus.Metric, 1) @@ -246,12 +258,13 @@ func (ma *MetricsAggregator) Collect(ch chan<- prometheus.Metric) { } } -func (ma *MetricsAggregator) Update(ctx context.Context, username, workspaceName, agentName string, metrics []agentsdk.AgentMetric) { +func (ma *MetricsAggregator) Update(ctx context.Context, labels AgentMetricLabels, metrics []agentsdk.AgentMetric) { select { case ma.updateCh <- updateRequest{ - username: username, - workspaceName: workspaceName, - agentName: agentName, + username: labels.Username, + workspaceName: labels.WorkspaceName, + agentName: labels.AgentName, + templateName: labels.TemplateName, metrics: metrics, timestamp: time.Now(), diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 45f0de1485..ec305b9d44 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -2,6 +2,7 @@ package prometheusmetrics_test import ( "context" + "sort" "sync/atomic" "testing" "time" @@ -12,6 +13,7 @@ import ( "github.com/stretchr/testify/require" "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/codersdk/agentsdk" "github.com/coder/coder/v2/cryptorand" @@ -22,8 +24,16 @@ const ( testWorkspaceName = "yogi-workspace" testUsername = "yogi-bear" testAgentName = "main-agent" + testTemplateName = "main-template" ) +var testLabels = prometheusmetrics.AgentMetricLabels{ + Username: testUsername, + WorkspaceName: testWorkspaceName, + AgentName: testAgentName, + TemplateName: testTemplateName, +} + func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { t.Parallel() @@ -58,6 +68,7 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { {Name: "agent_name", Value: testAgentName}, {Name: "username", Value: testUsername}, {Name: "workspace_name", Value: testWorkspaceName}, + {Name: "template_name", Value: testTemplateName}, } expected := []agentsdk.AgentMetric{ {Name: "a_counter_one", Type: agentsdk.AgentMetricTypeCounter, Value: 1, Labels: commonLabels}, @@ -69,13 +80,14 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { {Name: "hello", Value: "world"}, {Name: "username", Value: testUsername}, {Name: "workspace_name", Value: testWorkspaceName}, + {Name: "template_name", Value: testTemplateName}, }}, {Name: "d_gauge_four", Type: agentsdk.AgentMetricTypeGauge, Value: 6, Labels: commonLabels}, } // when - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, given1) - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, given2) + metricsAggregator.Update(ctx, testLabels, given1) + metricsAggregator.Update(ctx, testLabels, given2) // then require.Eventually(t, func() bool { @@ -119,6 +131,10 @@ func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actua } dtoLabels := asMetricAgentLabels(d.GetLabel()) + // dto labels are sorted in alphabetical order. + sort.Slice(e.Labels, func(i, j int) bool { + return e.Labels[i].Name < e.Labels[j].Name + }) require.Equal(t, e.Labels, dtoLabels, d.String()) } return true @@ -154,7 +170,7 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { } // when - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, given) + metricsAggregator.Update(ctx, testLabels, given) time.Sleep(time.Millisecond * 10) // Ensure that metric is expired @@ -220,7 +236,7 @@ func Benchmark_MetricsAggregator_Run(b *testing.B) { b.Logf("N=%d sending %d metrics", b.N, numMetrics) var nGot atomic.Int64 b.StartTimer() - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, metrics) + metricsAggregator.Update(ctx, testLabels, metrics) for i := 0; i < numMetrics; i++ { select { case <-ctx.Done(): diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 7145c2afa3..e1928fec5f 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -17,6 +17,7 @@ import ( "tailscale.com/tailcfg" "cdr.dev/slog" + "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbtime" @@ -24,6 +25,7 @@ import ( ) const ( + templateNameLabel = "template_name" agentNameLabel = "agent_name" usernameLabel = "username" workspaceNameLabel = "workspace_name" @@ -154,7 +156,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", - }, []string{usernameLabel, workspaceNameLabel, "template_name", "template_version"})) + }, []string{usernameLabel, workspaceNameLabel, templateNameLabel, "template_version"})) err := registerer.Register(agentsGauge) if err != nil { return nil, err diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 988fc125b9..fb1ce86203 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -37,6 +37,7 @@ import ( "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/httpapi" "github.com/coder/coder/v2/coderd/httpmw" + "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/codersdk" @@ -572,7 +573,7 @@ func (api *API) workspaceAgentLogs(rw http.ResponseWriter, r *http.Request) { return } - workspace, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) + row, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error fetching workspace by agent id.", @@ -580,6 +581,7 @@ func (api *API) workspaceAgentLogs(rw http.ResponseWriter, r *http.Request) { }) return } + workspace := row.Workspace api.WebsocketWaitMutex.Lock() api.WebsocketWaitGroup.Add(1) @@ -1648,7 +1650,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) - workspace, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) + row, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) if err != nil { httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ Message: "Failed to get workspace.", @@ -1656,6 +1658,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques }) return } + workspace := row.Workspace var req agentsdk.Stats if !httpapi.Read(ctx, rw, r, &req) { @@ -1681,7 +1684,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques var nextAutostart time.Time if workspace.AutostartSchedule.String != "" { templateSchedule, err := (*(api.TemplateScheduleStore.Load())).Get(ctx, api.Database, workspace.TemplateID) - // If the template schedule fails to load, just default to bumping without the next trasition and log it. + // If the template schedule fails to load, just default to bumping without the next transition and log it. if err != nil { api.Logger.Warn(ctx, "failed to load template schedule bumping activity, defaulting to bumping by 60min", slog.F("workspace_id", workspace.ID), @@ -1727,7 +1730,12 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques return xerrors.Errorf("can't get user: %w", err) } - api.Options.UpdateAgentMetrics(ctx, user.Username, workspace.Name, workspaceAgent.Name, req.Metrics) + api.Options.UpdateAgentMetrics(ctx, prometheusmetrics.AgentMetricLabels{ + Username: user.Username, + WorkspaceName: workspace.Name, + AgentName: workspaceAgent.Name, + TemplateName: row.TemplateName, + }, req.Metrics) return nil }) } @@ -2103,7 +2111,7 @@ func (api *API) workspaceAgentReportLifecycle(rw http.ResponseWriter, r *http.Re ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) - workspace, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) + row, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) if err != nil { httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ Message: "Failed to get workspace.", @@ -2111,6 +2119,7 @@ func (api *API) workspaceAgentReportLifecycle(rw http.ResponseWriter, r *http.Re }) return } + workspace := row.Workspace var req agentsdk.PostLifecycleRequest if !httpapi.Read(ctx, rw, r, &req) { diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index f5faf6c0d0..06bed3bd22 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -78,72 +78,74 @@ spec: -| Name | Type | Description | Labels | -| ----------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | -| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | -| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | -| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | -| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | -| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_insights_applications_usage_seconds` | gauge | The application usage per template. | `application_name` `slug` `template_name` | -| `coderd_insights_parameters` | gauge | The parameter usage per template. | `parameter_name` `parameter_type` `parameter_value` `template_name` | -| `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `template_name` | -| `coderd_license_active_users` | gauge | The number of active users. | | -| `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | | -| `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | | -| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. | | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | +| Name | Type | Description | Labels | +| ----------------------------------------------------- | --------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | +| `agent_scripts_executed_total` | counter | Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. | `agent_name` `success` `template_name` `username` `workspace_name` | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` | +| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_startup_script_seconds` | gauge | The number of seconds the startup script took to execute. | `agent_name` `success` `template_name` `username` `workspace_name` | +| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | +| `coderd_insights_applications_usage_seconds` | gauge | The application usage per template. | `application_name` `slug` `template_name` | +| `coderd_insights_parameters` | gauge | The parameter usage per template. | `parameter_name` `parameter_type` `parameter_value` `template_name` | +| `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `template_name` | +| `coderd_license_active_users` | gauge | The number of active users. | | +| `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | | +| `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | | +| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index d55e5cd966..06889bce35 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -15,9 +15,15 @@ coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",stat coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1 # HELP coderd_agents_up The number of active agents per workspace. # TYPE coderd_agents_up gauge -coderd_agents_up{username="admin",workspace_name="workspace-1"} 1 -coderd_agents_up{username="admin",workspace_name="workspace-2"} 1 -coderd_agents_up{username="admin",workspace_name="workspace-3"} 1 +coderd_agents_up{template_name="docker", username="admin",workspace_name="workspace-1"} 1 +coderd_agents_up{template_name="docker", username="admin",workspace_name="workspace-2"} 1 +coderd_agents_up{template_name="gcp", username="admin",workspace_name="workspace-3"} 1 +# HELP coderd_agentstats_startup_script_seconds The number of seconds the startup script took to execute. +# TYPE coderd_agentstats_startup_script_seconds gauge +coderd_agentstats_startup_script_seconds{agent_name="main",success="true",template_name="docker",username="admin",workspace_name="workspace-1"} 1.969900304 +# HELP agent_scripts_executed_total Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. +# TYPE agent_scripts_executed_total counter +agent_scripts_executed_total{agent_name="main",success="true",template_name="docker",username="admin",workspace_name="workspace-1"} 1 # HELP coderd_agentstats_connection_count The number of established connections by agent # TYPE coderd_agentstats_connection_count gauge coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_name="workspace1"} 2