diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index cf1dedc602..4da842ffb2 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -3166,6 +3166,13 @@ func (q *querier) GetTelemetryItems(ctx context.Context) ([]database.TelemetryIt return q.db.GetTelemetryItems(ctx) } +func (q *querier) GetTelemetryTaskEvents(ctx context.Context, arg database.GetTelemetryTaskEventsParams) ([]database.GetTelemetryTaskEventsRow, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceTask.All()); err != nil { + return nil, err + } + return q.db.GetTelemetryTaskEvents(ctx, arg) +} + func (q *querier) GetTemplateAppInsights(ctx context.Context, arg database.GetTemplateAppInsightsParams) ([]database.GetTemplateAppInsightsRow, error) { if err := q.authorizeTemplateInsights(ctx, arg.TemplateIDs); err != nil { return nil, err diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index faf4749262..5215955901 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -1326,6 +1326,11 @@ func (s *MethodTestSuite) TestTemplate() { dbm.EXPECT().GetTemplateInsightsByTemplate(gomock.Any(), arg).Return([]database.GetTemplateInsightsByTemplateRow{}, nil).AnyTimes() check.Args(arg).Asserts(rbac.ResourceTemplate, policy.ActionViewInsights) })) + s.Run("GetTelemetryTaskEvents", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) { + arg := database.GetTelemetryTaskEventsParams{} + dbm.EXPECT().GetTelemetryTaskEvents(gomock.Any(), arg).Return([]database.GetTelemetryTaskEventsRow{}, nil).AnyTimes() + check.Args(arg).Asserts(rbac.ResourceTask.All(), policy.ActionRead) + })) s.Run("GetTemplateAppInsights", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) { arg := database.GetTemplateAppInsightsParams{} dbm.EXPECT().GetTemplateAppInsights(gomock.Any(), arg).Return([]database.GetTemplateAppInsightsRow{}, nil).AnyTimes() diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 416b0b8ce2..ac694e9b8d 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -1790,6 +1790,14 @@ func (m queryMetricsStore) GetTelemetryItems(ctx context.Context) ([]database.Te return r0, r1 } +func (m queryMetricsStore) GetTelemetryTaskEvents(ctx context.Context, createdAfter database.GetTelemetryTaskEventsParams) ([]database.GetTelemetryTaskEventsRow, error) { + start := time.Now() + r0, r1 := m.s.GetTelemetryTaskEvents(ctx, createdAfter) + m.queryLatencies.WithLabelValues("GetTelemetryTaskEvents").Observe(time.Since(start).Seconds()) + m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "GetTelemetryTaskEvents").Inc() + return r0, r1 +} + func (m queryMetricsStore) GetTemplateAppInsights(ctx context.Context, arg database.GetTemplateAppInsightsParams) ([]database.GetTemplateAppInsightsRow, error) { start := time.Now() r0, r1 := m.s.GetTemplateAppInsights(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 09198081ae..6bee7705b8 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -3314,6 +3314,21 @@ func (mr *MockStoreMockRecorder) GetTelemetryItems(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTelemetryItems", reflect.TypeOf((*MockStore)(nil).GetTelemetryItems), ctx) } +// GetTelemetryTaskEvents mocks base method. +func (m *MockStore) GetTelemetryTaskEvents(ctx context.Context, arg database.GetTelemetryTaskEventsParams) ([]database.GetTelemetryTaskEventsRow, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetTelemetryTaskEvents", ctx, arg) + ret0, _ := ret[0].([]database.GetTelemetryTaskEventsRow) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetTelemetryTaskEvents indicates an expected call of GetTelemetryTaskEvents. +func (mr *MockStoreMockRecorder) GetTelemetryTaskEvents(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTelemetryTaskEvents", reflect.TypeOf((*MockStore)(nil).GetTelemetryTaskEvents), ctx, arg) +} + // GetTemplateAppInsights mocks base method. func (m *MockStore) GetTemplateAppInsights(ctx context.Context, arg database.GetTemplateAppInsightsParams) ([]database.GetTemplateAppInsightsRow, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index c698c0d93c..3b6c07dd05 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -361,6 +361,23 @@ type sqlcQuerier interface { GetTaskSnapshot(ctx context.Context, taskID uuid.UUID) (TaskSnapshot, error) GetTelemetryItem(ctx context.Context, key string) (TelemetryItem, error) GetTelemetryItems(ctx context.Context) ([]TelemetryItem, error) + // Returns all data needed to build task lifecycle events for telemetry + // in a single round-trip. For each task whose workspace is in the + // given set, fetches: + // - the latest workspace app binding (task_workspace_apps) + // - the most recent stop and start builds (workspace_builds) + // - the last "working" app status (workspace_app_statuses) + // - the first app status after resume, for active workspaces + // + // Assumptions: + // - 1:1 relationship between tasks and workspaces. All builds on the + // workspace are considered task-related. + // - Idle duration approximation: If the agent reports "working", does + // work, then reports "done", we miss that working time. + // - lws and active_dur join across all historical app IDs for the task, + // because each resume cycle provisions a new app ID. This ensures + // pre-pause statuses contribute to idle duration and active duration. + GetTelemetryTaskEvents(ctx context.Context, arg GetTelemetryTaskEventsParams) ([]GetTelemetryTaskEventsRow, error) // GetTemplateAppInsights returns the aggregate usage of each app in a given // timeframe. The result can be filtered on template_ids, meaning only user data // from workspaces based on those templates will be included. diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 1ffbf9ce77..02c93a4742 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -13334,6 +13334,203 @@ func (q *sqlQuerier) GetTaskSnapshot(ctx context.Context, taskID uuid.UUID) (Tas return i, err } +const getTelemetryTaskEvents = `-- name: GetTelemetryTaskEvents :many +WITH task_app_ids AS ( + SELECT task_id, workspace_app_id + FROM task_workspace_apps +), +task_status_timeline AS ( + -- All app statuses across every historical app for each task, + -- plus synthetic "boundary" rows at each stop/start build transition. + -- This allows us to correctly take gaps due to pause/resume into account. + SELECT tai.task_id, was.created_at, was.state::text AS state + FROM workspace_app_statuses was + JOIN task_app_ids tai ON tai.workspace_app_id = was.app_id + UNION ALL + SELECT t.id AS task_id, wb.created_at, '_boundary' AS state + FROM tasks t + JOIN workspace_builds wb ON wb.workspace_id = t.workspace_id + WHERE t.deleted_at IS NULL + AND t.workspace_id IS NOT NULL + AND wb.build_number > 1 +), +task_event_data AS ( + SELECT + t.id AS task_id, + t.workspace_id, + twa.workspace_app_id, + -- Latest stop build. + stop_build.created_at AS stop_build_created_at, + stop_build.reason AS stop_build_reason, + -- Latest start build (task_resume only). + start_build.created_at AS start_build_created_at, + start_build.reason AS start_build_reason, + start_build.build_number AS start_build_number, + -- Last "working" app status (for idle duration). + lws.created_at AS last_working_status_at, + -- First app status after resume (for resume-to-status duration). + -- Only populated for workspaces in an active phase (started more + -- recently than stopped). + fsar.created_at AS first_status_after_resume_at, + -- Cumulative time spent in "working" state. + active_dur.total_working_ms AS active_duration_ms + FROM tasks t + LEFT JOIN LATERAL ( + SELECT task_app.workspace_app_id + FROM task_workspace_apps task_app + WHERE task_app.task_id = t.id + ORDER BY task_app.workspace_build_number DESC + LIMIT 1 + ) twa ON TRUE + LEFT JOIN LATERAL ( + SELECT wb.created_at, wb.reason, wb.build_number + FROM workspace_builds wb + WHERE wb.workspace_id = t.workspace_id + AND wb.transition = 'stop' + ORDER BY wb.build_number DESC + LIMIT 1 + ) stop_build ON TRUE + LEFT JOIN LATERAL ( + SELECT wb.created_at, wb.reason, wb.build_number + FROM workspace_builds wb + WHERE wb.workspace_id = t.workspace_id + AND wb.transition = 'start' + ORDER BY wb.build_number DESC + LIMIT 1 + ) start_build ON TRUE + LEFT JOIN LATERAL ( + SELECT tst.created_at + FROM task_status_timeline tst + WHERE tst.task_id = t.id + AND tst.state = 'working' + -- Only consider status before the latest pause so that + -- post-resume statuses don't mask pre-pause idle time. + AND (stop_build.created_at IS NULL + OR tst.created_at <= stop_build.created_at) + ORDER BY tst.created_at DESC + LIMIT 1 + ) lws ON TRUE + LEFT JOIN LATERAL ( + SELECT was.created_at + FROM workspace_app_statuses was + WHERE was.app_id = twa.workspace_app_id + AND was.created_at > start_build.created_at + ORDER BY was.created_at ASC + LIMIT 1 + ) fsar ON twa.workspace_app_id IS NOT NULL + AND start_build.created_at IS NOT NULL + AND (stop_build.created_at IS NULL + OR start_build.created_at > stop_build.created_at) + -- Active duration: cumulative time spent in "working" state across all + -- historical app IDs for this task. Uses LEAD() to convert point-in-time + -- statuses into intervals, then sums intervals where state='working'. For + -- the last status, falls back to stop_build time (if paused) or @now (if + -- still running). + LEFT JOIN LATERAL ( + SELECT COALESCE( + SUM(EXTRACT(EPOCH FROM (interval_end - interval_start)) * 1000)::bigint, + 0 + )::bigint AS total_working_ms + FROM ( + SELECT + tst.created_at AS interval_start, + COALESCE( + LEAD(tst.created_at) OVER (ORDER BY tst.created_at ASC, CASE WHEN tst.state = '_boundary' THEN 1 ELSE 0 END ASC), + CASE WHEN stop_build.created_at IS NOT NULL + AND (start_build.created_at IS NULL + OR stop_build.created_at > start_build.created_at) + THEN stop_build.created_at + ELSE $1::timestamptz + END + ) AS interval_end, + tst.state + FROM task_status_timeline tst + WHERE tst.task_id = t.id + ) intervals + WHERE intervals.state = 'working' + ) active_dur ON TRUE + WHERE t.deleted_at IS NULL + AND t.workspace_id IS NOT NULL + AND EXISTS ( + SELECT 1 FROM workspace_builds wb + WHERE wb.workspace_id = t.workspace_id + AND wb.created_at > $2 + ) +) +SELECT task_id, workspace_id, workspace_app_id, stop_build_created_at, stop_build_reason, start_build_created_at, start_build_reason, start_build_number, last_working_status_at, first_status_after_resume_at, active_duration_ms FROM task_event_data +ORDER BY task_id +` + +type GetTelemetryTaskEventsParams struct { + Now time.Time `db:"now" json:"now"` + CreatedAfter time.Time `db:"created_after" json:"created_after"` +} + +type GetTelemetryTaskEventsRow struct { + TaskID uuid.UUID `db:"task_id" json:"task_id"` + WorkspaceID uuid.NullUUID `db:"workspace_id" json:"workspace_id"` + WorkspaceAppID uuid.NullUUID `db:"workspace_app_id" json:"workspace_app_id"` + StopBuildCreatedAt sql.NullTime `db:"stop_build_created_at" json:"stop_build_created_at"` + StopBuildReason NullBuildReason `db:"stop_build_reason" json:"stop_build_reason"` + StartBuildCreatedAt sql.NullTime `db:"start_build_created_at" json:"start_build_created_at"` + StartBuildReason NullBuildReason `db:"start_build_reason" json:"start_build_reason"` + StartBuildNumber sql.NullInt32 `db:"start_build_number" json:"start_build_number"` + LastWorkingStatusAt sql.NullTime `db:"last_working_status_at" json:"last_working_status_at"` + FirstStatusAfterResumeAt sql.NullTime `db:"first_status_after_resume_at" json:"first_status_after_resume_at"` + ActiveDurationMs int64 `db:"active_duration_ms" json:"active_duration_ms"` +} + +// Returns all data needed to build task lifecycle events for telemetry +// in a single round-trip. For each task whose workspace is in the +// given set, fetches: +// - the latest workspace app binding (task_workspace_apps) +// - the most recent stop and start builds (workspace_builds) +// - the last "working" app status (workspace_app_statuses) +// - the first app status after resume, for active workspaces +// +// Assumptions: +// - 1:1 relationship between tasks and workspaces. All builds on the +// workspace are considered task-related. +// - Idle duration approximation: If the agent reports "working", does +// work, then reports "done", we miss that working time. +// - lws and active_dur join across all historical app IDs for the task, +// because each resume cycle provisions a new app ID. This ensures +// pre-pause statuses contribute to idle duration and active duration. +func (q *sqlQuerier) GetTelemetryTaskEvents(ctx context.Context, arg GetTelemetryTaskEventsParams) ([]GetTelemetryTaskEventsRow, error) { + rows, err := q.db.QueryContext(ctx, getTelemetryTaskEvents, arg.Now, arg.CreatedAfter) + if err != nil { + return nil, err + } + defer rows.Close() + var items []GetTelemetryTaskEventsRow + for rows.Next() { + var i GetTelemetryTaskEventsRow + if err := rows.Scan( + &i.TaskID, + &i.WorkspaceID, + &i.WorkspaceAppID, + &i.StopBuildCreatedAt, + &i.StopBuildReason, + &i.StartBuildCreatedAt, + &i.StartBuildReason, + &i.StartBuildNumber, + &i.LastWorkingStatusAt, + &i.FirstStatusAfterResumeAt, + &i.ActiveDurationMs, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const insertTask = `-- name: InsertTask :one INSERT INTO tasks (id, organization_id, owner_id, name, display_name, workspace_id, template_version_id, template_parameters, prompt, created_at) diff --git a/coderd/database/queries/tasks.sql b/coderd/database/queries/tasks.sql index 012acbd976..0673c78cc3 100644 --- a/coderd/database/queries/tasks.sql +++ b/coderd/database/queries/tasks.sql @@ -100,3 +100,146 @@ FROM task_snapshots WHERE task_id = $1; + +-- name: GetTelemetryTaskEvents :many +-- Returns all data needed to build task lifecycle events for telemetry +-- in a single round-trip. For each task whose workspace is in the +-- given set, fetches: +-- - the latest workspace app binding (task_workspace_apps) +-- - the most recent stop and start builds (workspace_builds) +-- - the last "working" app status (workspace_app_statuses) +-- - the first app status after resume, for active workspaces +-- +-- Assumptions: +-- - 1:1 relationship between tasks and workspaces. All builds on the +-- workspace are considered task-related. +-- - Idle duration approximation: If the agent reports "working", does +-- work, then reports "done", we miss that working time. +-- - lws and active_dur join across all historical app IDs for the task, +-- because each resume cycle provisions a new app ID. This ensures +-- pre-pause statuses contribute to idle duration and active duration. +WITH task_app_ids AS ( + SELECT task_id, workspace_app_id + FROM task_workspace_apps +), +task_status_timeline AS ( + -- All app statuses across every historical app for each task, + -- plus synthetic "boundary" rows at each stop/start build transition. + -- This allows us to correctly take gaps due to pause/resume into account. + SELECT tai.task_id, was.created_at, was.state::text AS state + FROM workspace_app_statuses was + JOIN task_app_ids tai ON tai.workspace_app_id = was.app_id + UNION ALL + SELECT t.id AS task_id, wb.created_at, '_boundary' AS state + FROM tasks t + JOIN workspace_builds wb ON wb.workspace_id = t.workspace_id + WHERE t.deleted_at IS NULL + AND t.workspace_id IS NOT NULL + AND wb.build_number > 1 +), +task_event_data AS ( + SELECT + t.id AS task_id, + t.workspace_id, + twa.workspace_app_id, + -- Latest stop build. + stop_build.created_at AS stop_build_created_at, + stop_build.reason AS stop_build_reason, + -- Latest start build (task_resume only). + start_build.created_at AS start_build_created_at, + start_build.reason AS start_build_reason, + start_build.build_number AS start_build_number, + -- Last "working" app status (for idle duration). + lws.created_at AS last_working_status_at, + -- First app status after resume (for resume-to-status duration). + -- Only populated for workspaces in an active phase (started more + -- recently than stopped). + fsar.created_at AS first_status_after_resume_at, + -- Cumulative time spent in "working" state. + active_dur.total_working_ms AS active_duration_ms + FROM tasks t + LEFT JOIN LATERAL ( + SELECT task_app.workspace_app_id + FROM task_workspace_apps task_app + WHERE task_app.task_id = t.id + ORDER BY task_app.workspace_build_number DESC + LIMIT 1 + ) twa ON TRUE + LEFT JOIN LATERAL ( + SELECT wb.created_at, wb.reason, wb.build_number + FROM workspace_builds wb + WHERE wb.workspace_id = t.workspace_id + AND wb.transition = 'stop' + ORDER BY wb.build_number DESC + LIMIT 1 + ) stop_build ON TRUE + LEFT JOIN LATERAL ( + SELECT wb.created_at, wb.reason, wb.build_number + FROM workspace_builds wb + WHERE wb.workspace_id = t.workspace_id + AND wb.transition = 'start' + ORDER BY wb.build_number DESC + LIMIT 1 + ) start_build ON TRUE + LEFT JOIN LATERAL ( + SELECT tst.created_at + FROM task_status_timeline tst + WHERE tst.task_id = t.id + AND tst.state = 'working' + -- Only consider status before the latest pause so that + -- post-resume statuses don't mask pre-pause idle time. + AND (stop_build.created_at IS NULL + OR tst.created_at <= stop_build.created_at) + ORDER BY tst.created_at DESC + LIMIT 1 + ) lws ON TRUE + LEFT JOIN LATERAL ( + SELECT was.created_at + FROM workspace_app_statuses was + WHERE was.app_id = twa.workspace_app_id + AND was.created_at > start_build.created_at + ORDER BY was.created_at ASC + LIMIT 1 + ) fsar ON twa.workspace_app_id IS NOT NULL + AND start_build.created_at IS NOT NULL + AND (stop_build.created_at IS NULL + OR start_build.created_at > stop_build.created_at) + -- Active duration: cumulative time spent in "working" state across all + -- historical app IDs for this task. Uses LEAD() to convert point-in-time + -- statuses into intervals, then sums intervals where state='working'. For + -- the last status, falls back to stop_build time (if paused) or @now (if + -- still running). + LEFT JOIN LATERAL ( + SELECT COALESCE( + SUM(EXTRACT(EPOCH FROM (interval_end - interval_start)) * 1000)::bigint, + 0 + )::bigint AS total_working_ms + FROM ( + SELECT + tst.created_at AS interval_start, + COALESCE( + LEAD(tst.created_at) OVER (ORDER BY tst.created_at ASC, CASE WHEN tst.state = '_boundary' THEN 1 ELSE 0 END ASC), + CASE WHEN stop_build.created_at IS NOT NULL + AND (start_build.created_at IS NULL + OR stop_build.created_at > start_build.created_at) + THEN stop_build.created_at + ELSE @now::timestamptz + END + ) AS interval_end, + tst.state + FROM task_status_timeline tst + WHERE tst.task_id = t.id + ) intervals + WHERE intervals.state = 'working' + ) active_dur ON TRUE + WHERE t.deleted_at IS NULL + AND t.workspace_id IS NOT NULL + AND EXISTS ( + SELECT 1 FROM workspace_builds wb + WHERE wb.workspace_id = t.workspace_id + AND wb.created_at > @created_after + ) +) +SELECT * FROM task_event_data +ORDER BY task_id; + diff --git a/coderd/database/queries/workspaceapps.sql b/coderd/database/queries/workspaceapps.sql index bf605f2cce..5f826d2985 100644 --- a/coderd/database/queries/workspaceapps.sql +++ b/coderd/database/queries/workspaceapps.sql @@ -87,3 +87,4 @@ SELECT DISTINCT ON (workspace_id) FROM workspace_app_statuses WHERE workspace_id = ANY(@ids :: uuid[]) ORDER BY workspace_id, created_at DESC; + diff --git a/coderd/database/sqlc.yaml b/coderd/database/sqlc.yaml index d6a2269845..78d20574c6 100644 --- a/coderd/database/sqlc.yaml +++ b/coderd/database/sqlc.yaml @@ -124,6 +124,24 @@ sql: - column: "tasks_with_status.workspace_app_health" go_type: type: "NullWorkspaceAppHealth" + # Workaround for sqlc not interpreting the left join correctly + # in the combined telemetry query. + - column: "task_event_data.start_build_number" + go_type: "database/sql.NullInt32" + - column: "task_event_data.stop_build_created_at" + go_type: "database/sql.NullTime" + - column: "task_event_data.stop_build_reason" + go_type: + type: "NullBuildReason" + - column: "task_event_data.start_build_created_at" + go_type: "database/sql.NullTime" + - column: "task_event_data.start_build_reason" + go_type: + type: "NullBuildReason" + - column: "task_event_data.last_working_status_at" + go_type: "database/sql.NullTime" + - column: "task_event_data.first_status_after_resume_at" + go_type: "database/sql.NullTime" rename: group_member: GroupMemberTable group_members_expanded: GroupMember diff --git a/coderd/telemetry/telemetry.go b/coderd/telemetry/telemetry.go index 40b07d56d9..b39ad95fc5 100644 --- a/coderd/telemetry/telemetry.go +++ b/coderd/telemetry/telemetry.go @@ -416,9 +416,10 @@ func checkIDPOrgSync(ctx context.Context, db database.Store, values *codersdk.De func (r *remoteReporter) createSnapshot() (*Snapshot, error) { var ( ctx = r.ctx + now = r.options.Clock.Now() // For resources that grow in size very quickly (like workspace builds), // we only report events that occurred within the past hour. - createdAfter = dbtime.Time(r.options.Clock.Now().Add(-1 * time.Hour)).UTC() + createdAfter = dbtime.Time(now.Add(-1 * time.Hour)).UTC() eg errgroup.Group snapshot = &Snapshot{ DeploymentID: r.options.DeploymentID, @@ -740,17 +741,19 @@ func (r *remoteReporter) createSnapshot() (*Snapshot, error) { return nil }) eg.Go(func() error { - dbTasks, err := r.options.Database.ListTasks(ctx, database.ListTasksParams{ - OwnerID: uuid.Nil, - OrganizationID: uuid.Nil, - Status: "", - }) + tasks, err := CollectTasks(ctx, r.options.Database) if err != nil { - return err + return xerrors.Errorf("collect tasks telemetry: %w", err) } - for _, dbTask := range dbTasks { - snapshot.Tasks = append(snapshot.Tasks, ConvertTask(dbTask)) + snapshot.Tasks = tasks + return nil + }) + eg.Go(func() error { + events, err := CollectTaskEvents(ctx, r.options.Database, createdAfter, now) + if err != nil { + return xerrors.Errorf("collect task events telemetry: %w", err) } + snapshot.TaskEvents = events return nil }) eg.Go(func() error { @@ -902,6 +905,129 @@ func (r *remoteReporter) collectBoundaryUsageSummary(ctx context.Context) (*Boun }, nil } +func CollectTasks(ctx context.Context, db database.Store) ([]Task, error) { + dbTasks, err := db.ListTasks(ctx, database.ListTasksParams{ + OwnerID: uuid.Nil, + OrganizationID: uuid.Nil, + Status: "", + }) + if err != nil { + return nil, xerrors.Errorf("list tasks: %w", err) + } + if len(dbTasks) == 0 { + return []Task{}, nil + } + + tasks := make([]Task, 0, len(dbTasks)) + for _, dbTask := range dbTasks { + tasks = append(tasks, ConvertTask(dbTask)) + } + return tasks, nil +} + +// buildTaskEvent constructs a TaskEvent from the combined query row. +func buildTaskEvent( + row database.GetTelemetryTaskEventsRow, + createdAfter time.Time, + now time.Time, +) TaskEvent { + event := TaskEvent{ + TaskID: row.TaskID.String(), + } + + var ( + hasStartBuild = row.StartBuildCreatedAt.Valid + isResumed = hasStartBuild && row.StartBuildNumber.Valid && row.StartBuildNumber.Int32 > 1 + hasStopBuild = row.StopBuildCreatedAt.Valid + startedAfterStop = hasStartBuild && hasStopBuild && row.StartBuildCreatedAt.Time.After(row.StopBuildCreatedAt.Time) + currentlyPaused = hasStopBuild && !startedAfterStop + ) + + // Pause-related fields (requires a stop build). + if hasStopBuild { + event.LastPausedAt = &row.StopBuildCreatedAt.Time + switch { + case row.StopBuildReason.Valid && row.StopBuildReason.BuildReason == database.BuildReasonTaskAutoPause: + event.PauseReason = ptr.Ref("auto") + case row.StopBuildReason.Valid && row.StopBuildReason.BuildReason == database.BuildReasonTaskManualPause: + event.PauseReason = ptr.Ref("manual") + default: + event.PauseReason = ptr.Ref("other") + } + + // Idle duration: time between last working status and the pause. + if row.LastWorkingStatusAt.Valid && + row.StopBuildCreatedAt.Time.After(row.LastWorkingStatusAt.Time) { + idle := row.StopBuildCreatedAt.Time.Sub(row.LastWorkingStatusAt.Time) + event.IdleDurationMS = ptr.Ref(idle.Milliseconds()) + } + } + + // Resume-related fields (requires task_resume start after stop). + if startedAfterStop { + // Paused duration: time between pause and resume. + if row.StartBuildCreatedAt.Time.After(createdAfter) { + paused := row.StartBuildCreatedAt.Time.Sub(row.StopBuildCreatedAt.Time) + event.PausedDurationMS = ptr.Ref(paused.Milliseconds()) + } + + // Below only relevant for "resumed" tasks, not when initially created. + if isResumed { + event.LastResumedAt = &row.StartBuildCreatedAt.Time + switch { + // TODO(Cian): will this exist? Future readers may know better than I. + // case row.StartBuildReason == database.BuildReasonTaskAutoResume: + // event.ResumeReason = ptr.Ref("auto") + case row.StartBuildReason.BuildReason == database.BuildReasonTaskResume: + event.ResumeReason = ptr.Ref("manual") + default: // Task resumed by starting workspace? + event.ResumeReason = ptr.Ref("other") + } + } + } + + // Unresolved pause: report current paused duration. + if currentlyPaused { + paused := now.Sub(row.StopBuildCreatedAt.Time) + event.PausedDurationMS = ptr.Ref(paused.Milliseconds()) + } + + // Resume-to-status duration. + if row.FirstStatusAfterResumeAt.Valid && isResumed { + delta := row.FirstStatusAfterResumeAt.Time.Sub(row.StartBuildCreatedAt.Time) + event.ResumeToStatusMS = ptr.Ref(delta.Milliseconds()) + } + + // Active duration: from SQL calculation. + if row.ActiveDurationMs > 0 { + event.ActiveDurationMS = ptr.Ref(row.ActiveDurationMs) + } + + return event +} + +// CollectTaskEvents collects lifecycle events for tasks with recent activity. +func CollectTaskEvents(ctx context.Context, db database.Store, createdAfter, now time.Time) ([]TaskEvent, error) { + rows, err := db.GetTelemetryTaskEvents(ctx, database.GetTelemetryTaskEventsParams{ + CreatedAfter: createdAfter, + Now: now, + }) + if err != nil { + return nil, xerrors.Errorf("get telemetry task events: %w", err) + } + events := make([]TaskEvent, 0, len(rows)) + for _, row := range rows { + events = append(events, buildTaskEvent(row, createdAfter, now)) + } + return events, nil +} + +// HashContent returns a SHA256 hash of the content as a hex string. +// This is useful for hashing sensitive content like prompts for telemetry. +func HashContent(content string) string { + return fmt.Sprintf("%x", sha256.Sum256([]byte(content))) +} + // ConvertAPIKey anonymizes an API key. func ConvertAPIKey(apiKey database.APIKey) APIKey { a := APIKey{ @@ -1370,6 +1496,7 @@ type Snapshot struct { NetworkEvents []NetworkEvent `json:"network_events"` Organizations []Organization `json:"organizations"` Tasks []Task `json:"tasks"` + TaskEvents []TaskEvent `json:"task_events"` TelemetryItems []TelemetryItem `json:"telemetry_items"` UserTailnetConnections []UserTailnetConnection `json:"user_tailnet_connections"` PrebuiltWorkspaces []PrebuiltWorkspace `json:"prebuilt_workspaces"` @@ -1931,25 +2058,36 @@ type Task struct { WorkspaceAppID *string `json:"workspace_app_id"` TemplateVersionID string `json:"template_version_id"` PromptHash string `json:"prompt_hash"` // Prompt is hashed for privacy. - CreatedAt time.Time `json:"created_at"` Status string `json:"status"` + CreatedAt time.Time `json:"created_at"` } -// ConvertTask anonymizes a Task. +// TaskEvent represents lifecycle events for a task (pause/resume +// cycles). The createdAfter parameter gates PausedDurationMS so +// that only recent pause/resume pairs are reported. +type TaskEvent struct { + TaskID string `json:"task_id"` + LastPausedAt *time.Time `json:"last_paused_at"` + LastResumedAt *time.Time `json:"last_resumed_at"` + PauseReason *string `json:"pause_reason"` + ResumeReason *string `json:"resume_reason"` + IdleDurationMS *int64 `json:"idle_duration_ms"` + PausedDurationMS *int64 `json:"paused_duration_ms"` + ResumeToStatusMS *int64 `json:"resume_to_status_ms"` + ActiveDurationMS *int64 `json:"active_duration_ms"` +} + +// ConvertTask converts a database Task to a telemetry Task. func ConvertTask(task database.Task) Task { - t := &Task{ - ID: task.ID.String(), - OrganizationID: task.OrganizationID.String(), - OwnerID: task.OwnerID.String(), - Name: task.Name, - WorkspaceID: nil, - WorkspaceBuildNumber: nil, - WorkspaceAgentID: nil, - WorkspaceAppID: nil, - TemplateVersionID: task.TemplateVersionID.String(), - PromptHash: fmt.Sprintf("%x", sha256.Sum256([]byte(task.Prompt))), - CreatedAt: task.CreatedAt, - Status: string(task.Status), + t := Task{ + ID: task.ID.String(), + OrganizationID: task.OrganizationID.String(), + OwnerID: task.OwnerID.String(), + Name: task.Name, + TemplateVersionID: task.TemplateVersionID.String(), + PromptHash: HashContent(task.Prompt), + Status: string(task.Status), + CreatedAt: task.CreatedAt, } if task.WorkspaceID.Valid { t.WorkspaceID = ptr.Ref(task.WorkspaceID.UUID.String()) @@ -1963,7 +2101,7 @@ func ConvertTask(task database.Task) Task { if task.WorkspaceAppID.Valid { t.WorkspaceAppID = ptr.Ref(task.WorkspaceAppID.UUID.String()) } - return *t + return t } type telemetryItemKey string diff --git a/coderd/telemetry/telemetry_test.go b/coderd/telemetry/telemetry_test.go index 249546ec3d..f679dfee9d 100644 --- a/coderd/telemetry/telemetry_test.go +++ b/coderd/telemetry/telemetry_test.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "encoding/json" + "fmt" "net/http" "net/http/httptest" "net/url" @@ -13,6 +14,7 @@ import ( "time" "github.com/go-chi/chi/v5" + "github.com/google/go-cmp/cmp" "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -21,12 +23,14 @@ import ( "github.com/coder/coder/v2/buildinfo" "github.com/coder/coder/v2/coderd/boundaryusage" "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbfake" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/idpsync" "github.com/coder/coder/v2/coderd/runtimeconfig" "github.com/coder/coder/v2/coderd/telemetry" + "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/testutil" "github.com/coder/quartz" @@ -313,6 +317,17 @@ func TestTelemetry(t *testing.T) { require.Equal(t, string(database.WorkspaceAgentSubsystemEnvbox), wsa.Subsystems[0]) require.Equal(t, string(database.WorkspaceAgentSubsystemExectrace), wsa.Subsystems[1]) require.Len(t, snapshot.Tasks, 1) + require.Len(t, snapshot.TaskEvents, 1) + taskEvent := snapshot.TaskEvents[0] + assert.Equal(t, task.ID.String(), taskEvent.TaskID) + assert.Nil(t, taskEvent.LastResumedAt) + assert.Nil(t, taskEvent.LastPausedAt) + assert.Nil(t, taskEvent.PauseReason) + assert.Nil(t, taskEvent.ResumeReason) + assert.Nil(t, taskEvent.IdleDurationMS) + assert.Nil(t, taskEvent.PausedDurationMS) + assert.Nil(t, taskEvent.ResumeToStatusMS) + assert.Nil(t, taskEvent.ActiveDurationMS) for _, snapTask := range snapshot.Tasks { assert.Equal(t, task.ID.String(), snapTask.ID) assert.Equal(t, task.OrganizationID.String(), snapTask.OrganizationID) @@ -326,6 +341,7 @@ func TestTelemetry(t *testing.T) { assert.Equal(t, taskWA.WorkspaceAppID.UUID.String(), *snapTask.WorkspaceAppID) assert.Equal(t, task.TemplateVersionID.String(), snapTask.TemplateVersionID) assert.Equal(t, "e196fe22e61cfa32d8c38749e0ce348108bb4cae29e2c36cdcce7e77faa9eb5f", snapTask.PromptHash) + assert.Equal(t, string(task.Status), snapTask.Status) assert.Equal(t, task.CreatedAt.UTC(), snapTask.CreatedAt.UTC()) } @@ -675,6 +691,573 @@ func TestPrebuiltWorkspacesTelemetry(t *testing.T) { } } +// taskTelemetryHelper is a grab bag of stuff useful in task telemetry test cases +type taskTelemetryHelper struct { + t *testing.T + ctx context.Context + db database.Store + org database.Organization + user database.User +} + +// createBuild creates a workspace build with the given parameters, +// handling provisioner job creation automatically. +func (h *taskTelemetryHelper) createBuild( + resp dbfake.WorkspaceResponse, + buildNumber int32, + createdAt time.Time, + transition database.WorkspaceTransition, + reason database.BuildReason, +) (database.WorkspaceBuild, *database.WorkspaceApp) { + job := dbgen.ProvisionerJob(h.t, h.db, nil, database.ProvisionerJob{ + Provisioner: database.ProvisionerTypeTerraform, + StorageMethod: database.ProvisionerStorageMethodFile, + Type: database.ProvisionerJobTypeWorkspaceBuild, + OrganizationID: h.org.ID, + }) + bld := dbgen.WorkspaceBuild(h.t, h.db, database.WorkspaceBuild{ + WorkspaceID: resp.Workspace.ID, + TemplateVersionID: resp.TemplateVersion.ID, + JobID: job.ID, + Transition: transition, + Reason: reason, + BuildNumber: buildNumber, + CreatedAt: createdAt, + HasAITask: sql.NullBool{ + Bool: true, + Valid: true, + }, + }) + if transition == database.WorkspaceTransitionStart { + require.NotEmpty(h.t, resp.Agents, "need at least one agent") + agt := resp.Agents[0] + // App IDs are regenerated by provisionerd each build. + app := dbgen.WorkspaceApp(h.t, h.db, database.WorkspaceApp{ + AgentID: agt.ID, + }) + _, err := h.db.UpsertTaskWorkspaceApp(h.ctx, database.UpsertTaskWorkspaceAppParams{ + TaskID: resp.Task.ID, + WorkspaceBuildNumber: buildNumber, + WorkspaceAgentID: uuid.NullUUID{UUID: agt.ID, Valid: true}, + WorkspaceAppID: uuid.NullUUID{UUID: app.ID, Valid: true}, + }) + require.NoError(h.t, err, "failed to upsert task app") + return bld, &app + } + return bld, nil +} + +// nolint: dupl // Test code is better WET than DRY. +func TestTasksTelemetry(t *testing.T) { + t.Parallel() + + // Define a fixed reference time for deterministic testing. + now := time.Date(2025, 1, 15, 12, 0, 0, 0, time.UTC) + + createAppStatus := func(ctx context.Context, db database.Store, wsID uuid.UUID, agentID, appID uuid.UUID, state database.WorkspaceAppStatusState, message string, createdAt time.Time) { + _, err := db.InsertWorkspaceAppStatus(ctx, database.InsertWorkspaceAppStatusParams{ + ID: uuid.New(), + CreatedAt: createdAt, + WorkspaceID: wsID, + AgentID: agentID, + AppID: appID, + State: state, + Message: message, + }) + require.NoError(t, err) + } + + getApp := func(ctx context.Context, db database.Store, agentID uuid.UUID) database.WorkspaceApp { + apps, err := db.GetWorkspaceAppsByAgentID(ctx, agentID) + require.NoError(t, err) + require.NotEmpty(t, apps, "expected at least one app") + return apps[0] + } + + type statusSpec struct { + state database.WorkspaceAppStatusState + message string + offset time.Duration + } + + type buildSpec struct { + buildNumber int32 + offset time.Duration + transition database.WorkspaceTransition + reason database.BuildReason + statuses []statusSpec // created after this build, using this build's app + } + + tests := []struct { + name string + + // Input: DB setup. + skipWorkspace bool + createdOffset time.Duration + buildOffset *time.Duration + extraBuilds []buildSpec + appStatuses []statusSpec + + // Expected output. + expectEvent bool + lastPausedOffset *time.Duration + lastResumedOffset *time.Duration + pauseReason *string + resumeReason *string + idleDurationMS *int64 + pausedDurationMS *int64 + resumeToStatusMS *int64 + activeDurationMS *int64 + }{ + { + name: "no workspace - all lifecycle fields nil", + skipWorkspace: true, + createdOffset: -1 * time.Hour, + }, + { + name: "running workspace - no pause/resume events", + createdOffset: -45 * time.Minute, + buildOffset: ptr.Ref(-30 * time.Minute), + expectEvent: true, + }, + { + name: "with app status - no lifecycle events", + createdOffset: -90 * time.Minute, + buildOffset: ptr.Ref(-45 * time.Minute), + appStatuses: []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "Task started", -40 * time.Minute}, + }, + expectEvent: true, + // ResumeToStatusMS is nil because initial start (BuildReasonInitiator) + // doesn't count - only task_resume starts are considered. + activeDurationMS: ptr.Ref(int64(40 * time.Minute / time.Millisecond)), + }, + { + name: "auto paused - LastPausedAt and PauseReason=auto", + createdOffset: -3 * time.Hour, + extraBuilds: []buildSpec{ + {2, -20 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-20 * time.Minute), + pauseReason: ptr.Ref("auto"), + pausedDurationMS: ptr.Ref(20 * time.Minute.Milliseconds()), // Ongoing pause. + }, + { + name: "manual paused - LastPausedAt and PauseReason=manual", + createdOffset: -4 * time.Hour, + extraBuilds: []buildSpec{ + {2, -15 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskManualPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-15 * time.Minute), + pauseReason: ptr.Ref("manual"), + pausedDurationMS: ptr.Ref(15 * time.Minute.Milliseconds()), // Ongoing pause. + }, + { + name: "paused with idle time - IdleDurationMS calculated", + createdOffset: -5 * time.Hour, + appStatuses: []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "Working on something", -40 * time.Minute}, + {database.WorkspaceAppStatusStateIdle, "Idle now", -35 * time.Minute}, + }, + extraBuilds: []buildSpec{ + {2, -25 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-25 * time.Minute), + pauseReason: ptr.Ref("auto"), + idleDurationMS: ptr.Ref(15 * time.Minute.Milliseconds()), // Last working (-40) to stop (-25). + activeDurationMS: ptr.Ref(5 * time.Minute.Milliseconds()), // -40 min (working) to -35 min (idle). + pausedDurationMS: ptr.Ref(25 * time.Minute.Milliseconds()), // Ongoing pause: now - (-25min). + }, + { + name: "paused with working status after pause - IdleDurationMS nil", + createdOffset: -5 * time.Hour, + appStatuses: []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "Working after pause", -20 * time.Minute}, + }, + extraBuilds: []buildSpec{ + {2, -25 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-25 * time.Minute), + pauseReason: ptr.Ref("auto"), + pausedDurationMS: ptr.Ref(25 * time.Minute.Milliseconds()), // Ongoing pause. + // IdleDurationMS is nil because "last working" is after pause. + // ActiveDurationMS is nil because working→stop interval is negative. + }, + { + name: "recently resumed - PausedDurationMS calculated", + createdOffset: -6 * time.Hour, + extraBuilds: []buildSpec{ + {2, -50 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + {3, -10 * time.Minute, database.WorkspaceTransitionStart, database.BuildReasonTaskResume, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-50 * time.Minute), + lastResumedOffset: ptr.Ref(-10 * time.Minute), + pauseReason: ptr.Ref("auto"), + resumeReason: ptr.Ref("manual"), + pausedDurationMS: ptr.Ref(40 * time.Minute.Milliseconds()), + }, + { + // This test verifies that we do not double-report task events outside of the window. + name: "resumed long ago - PausedDurationMS nil", + createdOffset: -10 * time.Hour, + extraBuilds: []buildSpec{ + {2, -5 * time.Hour, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + {3, -2 * time.Hour, database.WorkspaceTransitionStart, database.BuildReasonTaskResume, nil}, + }, + expectEvent: false, + }, + { + name: "multiple cycles - captures latest pause/resume", + createdOffset: -8 * time.Hour, + extraBuilds: []buildSpec{ + {2, -3 * time.Hour, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + {3, -150 * time.Minute, database.WorkspaceTransitionStart, database.BuildReasonTaskResume, nil}, + {4, -30 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskManualPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-30 * time.Minute), + pauseReason: ptr.Ref("manual"), + pausedDurationMS: ptr.Ref(30 * time.Minute.Milliseconds()), // Ongoing pause: now - (-30min). + }, + { + name: "currently paused after recent resume - reports ongoing pause", + createdOffset: -6 * time.Hour, + extraBuilds: []buildSpec{ + {2, -50 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + {3, -30 * time.Minute, database.WorkspaceTransitionStart, database.BuildReasonTaskResume, nil}, + {4, -10 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskManualPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-10 * time.Minute), + pauseReason: ptr.Ref("manual"), + pausedDurationMS: ptr.Ref(10 * time.Minute.Milliseconds()), // Ongoing pause: now - pause time. + }, + { + name: "multiple cycles with recent resume - pairs with preceding pause", + createdOffset: -6 * time.Hour, + appStatuses: []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "started work", -6 * time.Hour}, + }, + extraBuilds: []buildSpec{ + {2, -50 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + {3, -30 * time.Minute, database.WorkspaceTransitionStart, database.BuildReasonTaskResume, []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "resumed work", -25 * time.Minute}, + }}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-50 * time.Minute), + lastResumedOffset: ptr.Ref(-30 * time.Minute), + pauseReason: ptr.Ref("auto"), + resumeReason: ptr.Ref("manual"), + pausedDurationMS: ptr.Ref(20 * time.Minute.Milliseconds()), + resumeToStatusMS: ptr.Ref((5 * time.Minute).Milliseconds()), + // Build 1 ("started work") -> Build 2 (stop) (5h10m) + Build 3 ("resumed work") -> now (25m) + // TODO(cian): We define IdleDurationMS as "the time from the last working status to pause". + // We know that the task has reported working since T-6h and got auto-paused at T-50m. + // We can reasonably assume that it has been 'idle' from when it was stopped (T-30m) to + // its next report at T-25m. This is covered by ResumeToStatusMS. + // But do we consider the time since its last report (T-6h) to its being auto-paused + // as truly "idle"? + idleDurationMS: ptr.Ref(310 * time.Minute.Milliseconds()), + activeDurationMS: ptr.Ref((5*time.Hour + 10*time.Minute + 25*time.Minute).Milliseconds()), + }, + { + name: "all fields populated - full lifecycle", + createdOffset: -7 * time.Hour, + appStatuses: []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "Started working", -390 * time.Minute}, + {database.WorkspaceAppStatusStateWorking, "Still working", -45 * time.Minute}, + }, + extraBuilds: []buildSpec{ + {2, -35 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + {3, -5 * time.Minute, database.WorkspaceTransitionStart, database.BuildReasonTaskResume, []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "Resumed work", -3 * time.Minute}, + {database.WorkspaceAppStatusStateIdle, "Finished work", -2 * time.Minute}, + }}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-35 * time.Minute), + lastResumedOffset: ptr.Ref(-5 * time.Minute), + pauseReason: ptr.Ref("auto"), + resumeReason: ptr.Ref("manual"), + idleDurationMS: ptr.Ref(10 * time.Minute.Milliseconds()), + pausedDurationMS: ptr.Ref(30 * time.Minute.Milliseconds()), + resumeToStatusMS: ptr.Ref((2 * time.Minute).Milliseconds()), + // Active duration: (-390 to -35) + (-3 to -2) = 355 + 1 = 356 min. + activeDurationMS: ptr.Ref(356 * time.Minute.Milliseconds()), + }, + { + name: "non-task_resume builds are tracked as other", + createdOffset: -4 * time.Hour, + extraBuilds: []buildSpec{ + {2, -60 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + {3, -30 * time.Minute, database.WorkspaceTransitionStart, database.BuildReasonInitiator, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-60 * time.Minute), + pauseReason: ptr.Ref("auto"), + resumeReason: ptr.Ref("other"), + // LastResumedAt is set because isResumed is true (build_number > 1) + // even though the start reason isn't task_resume. + lastResumedOffset: ptr.Ref(-30 * time.Minute), + // PausedDurationMS reports ongoing pause: now - (-60min) = 60min. + pausedDurationMS: ptr.Ref(30 * time.Minute.Milliseconds()), + }, + { + name: "simple ongoing pause reports duration", + createdOffset: -3 * time.Hour, + extraBuilds: []buildSpec{ + {2, -45 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-45 * time.Minute), + pauseReason: ptr.Ref("auto"), + // No resume, so ongoing pause: now - (-45min) = 45min. + pausedDurationMS: ptr.Ref(45 * time.Minute.Milliseconds()), + }, + { + name: "active duration with paused task", + createdOffset: -2 * time.Hour, + buildOffset: ptr.Ref(-2 * time.Hour), + appStatuses: []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "Started", -90 * time.Minute}, + {database.WorkspaceAppStatusStateIdle, "Thinking", -60 * time.Minute}, // 30min working + {database.WorkspaceAppStatusStateWorking, "Resumed", -45 * time.Minute}, + {database.WorkspaceAppStatusStateComplete, "Done", -30 * time.Minute}, // 15min working + }, + extraBuilds: []buildSpec{ + {2, -25 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-25 * time.Minute), + pauseReason: ptr.Ref("auto"), + idleDurationMS: ptr.Ref(20 * time.Minute.Milliseconds()), // Last working (-45) to stop (-25). + activeDurationMS: ptr.Ref(45 * time.Minute.Milliseconds()), // 30 + 15 = 45min of "working". + pausedDurationMS: ptr.Ref(25 * time.Minute.Milliseconds()), // Ongoing pause. + }, + { + // When a workspace_app_status and a workspace_build share + // the exact same created_at timestamp, the ordering inside + // task_status_timeline is ambiguous. The boundary row must + // sort after real statuses so that LEAD() and the lws + // lateral join produce deterministic results. + name: "status and build at same timestamp - deterministic ordering", + createdOffset: -3 * time.Hour, + buildOffset: ptr.Ref(-2 * time.Hour), + appStatuses: []statusSpec{ + {database.WorkspaceAppStatusStateWorking, "Started work", -90 * time.Minute}, + // This status has the exact same timestamp as the + // stop build below, exercising the tiebreaker. + {database.WorkspaceAppStatusStateWorking, "Last update before pause", -30 * time.Minute}, + }, + extraBuilds: []buildSpec{ + {2, -30 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-30 * time.Minute), + pauseReason: ptr.Ref("auto"), + // IdleDurationMS is nil: the Go code requires + // stop.After(lastWorking), which is false when equal. + // Active: -90m (working) → -30m (boundary/stop) = 60 min. + activeDurationMS: ptr.Ref(60 * time.Minute.Milliseconds()), + pausedDurationMS: ptr.Ref(30 * time.Minute.Milliseconds()), + }, + { + // SQL filter: EXISTS (workspace_builds.created_at > createdAfter). + // This task has only old builds (7 days ago), so it won't match + // the 1-hour createdAfter filter and should not return an event. + name: "old task with no recent builds - not returned", + createdOffset: -7 * 24 * time.Hour, + buildOffset: ptr.Ref(-7 * 24 * time.Hour), + expectEvent: false, + }, + { + // SQL filter: EXISTS (workspace_builds.created_at > createdAfter). + // This task was created 7 days ago, but has a recent stop build, + // so it should match the filter and return an event. + name: "old task with recent build - returned", + createdOffset: -7 * 24 * time.Hour, + buildOffset: ptr.Ref(-7 * 24 * time.Hour), + extraBuilds: []buildSpec{ + {2, -30 * time.Minute, database.WorkspaceTransitionStop, database.BuildReasonTaskAutoPause, nil}, + }, + expectEvent: true, + lastPausedOffset: ptr.Ref(-30 * time.Minute), + pauseReason: ptr.Ref("auto"), + pausedDurationMS: ptr.Ref(30 * time.Minute.Milliseconds()), // Ongoing pause. + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitMedium) + db, _ := dbtestutil.NewDB(t) + org, err := db.GetDefaultOrganization(ctx) + require.NoError(t, err) + user := dbgen.User(t, db, database.User{}) + _ = dbgen.OrganizationMember(t, db, database.OrganizationMember{ + UserID: user.ID, + OrganizationID: org.ID, + }) + h := &taskTelemetryHelper{ + t: t, + ctx: ctx, + db: db, + org: org, + user: user, + } + + // Create a deleted task. This is a test antagonist that should never show up in results. + deletedTaskResp := dbfake.WorkspaceBuild(h.t, h.db, database.WorkspaceTable{ + OrganizationID: h.org.ID, + OwnerID: h.user.ID, + }).WithTask(database.TaskTable{ + Prompt: fmt.Sprintf("deleted-task-%s", t.Name()), + CreatedAt: now.Add(-100 * time.Hour), + }, nil).Seed(database.WorkspaceBuild{ + Transition: database.WorkspaceTransitionStart, + Reason: database.BuildReasonInitiator, + BuildNumber: 1, + CreatedAt: now.Add(-100 * time.Hour), + }).Succeeded().Do() + _, err = db.DeleteTask(h.ctx, database.DeleteTaskParams{ + DeletedAt: now.Add(-99 * time.Hour), + ID: deletedTaskResp.Task.ID, + }) + require.NoError(h.t, err, "creating deleted task antagonist") + + var expectedTask telemetry.Task + + if tt.skipWorkspace { + tv := dbgen.TemplateVersion(t, h.db, database.TemplateVersion{ + OrganizationID: h.org.ID, + CreatedBy: h.user.ID, + HasAITask: sql.NullBool{Bool: true, Valid: true}, + }) + task := dbgen.Task(h.t, h.db, database.TaskTable{ + OwnerID: h.user.ID, + OrganizationID: h.org.ID, + WorkspaceID: uuid.NullUUID{}, + TemplateVersionID: tv.ID, + Prompt: fmt.Sprintf("pending-task-%s", t.Name()), + CreatedAt: now.Add(tt.createdOffset), + }) + expectedTask = telemetry.Task{ + ID: task.ID.String(), + OrganizationID: h.org.ID.String(), + OwnerID: h.user.ID.String(), + Name: task.Name, + TemplateVersionID: tv.ID.String(), + PromptHash: telemetry.HashContent(task.Prompt), + Status: "pending", + CreatedAt: task.CreatedAt, + } + } else { + buildCreatedAt := now.Add(tt.createdOffset) + if tt.buildOffset != nil { + buildCreatedAt = now.Add(*tt.buildOffset) + } + + resp := dbfake.WorkspaceBuild(h.t, h.db, database.WorkspaceTable{ + OrganizationID: h.org.ID, + OwnerID: h.user.ID, + }).WithTask(database.TaskTable{ + Prompt: fmt.Sprintf("task-%s", t.Name()), + CreatedAt: now.Add(tt.createdOffset), + }, nil).Seed(database.WorkspaceBuild{ + Transition: database.WorkspaceTransitionStart, + Reason: database.BuildReasonInitiator, + BuildNumber: 1, + CreatedAt: buildCreatedAt, + }).Succeeded().Do() + + app := getApp(h.ctx, h.db, resp.Agents[0].ID) + + for _, s := range tt.appStatuses { + createAppStatus(h.ctx, h.db, resp.Workspace.ID, resp.Agents[0].ID, app.ID, s.state, s.message, now.Add(s.offset)) + } + + for _, b := range tt.extraBuilds { + bld, bldApp := h.createBuild(resp, b.buildNumber, now.Add(b.offset), b.transition, b.reason) + _ = bld + if bldApp != nil { + for _, s := range b.statuses { + createAppStatus(h.ctx, h.db, resp.Workspace.ID, resp.Agents[0].ID, bldApp.ID, s.state, s.message, now.Add(s.offset)) + } + } + } + + // Refresh the task + updated, err := h.db.GetTaskByID(ctx, resp.Task.ID) + require.NoError(t, err, "fetching updated task") + expectedTask = telemetry.Task{ + ID: updated.ID.String(), + OrganizationID: updated.OrganizationID.String(), + OwnerID: updated.OwnerID.String(), + Name: updated.Name, + WorkspaceID: ptr.Ref(updated.WorkspaceID.UUID.String()), + WorkspaceBuildNumber: ptr.Ref(int64(updated.WorkspaceBuildNumber.Int32)), + WorkspaceAgentID: ptr.Ref(updated.WorkspaceAgentID.UUID.String()), + WorkspaceAppID: ptr.Ref(updated.WorkspaceAppID.UUID.String()), + TemplateVersionID: updated.TemplateVersionID.String(), + PromptHash: telemetry.HashContent(updated.Prompt), + Status: string(updated.Status), + CreatedAt: updated.CreatedAt, + } + } + + actualTasks, err := telemetry.CollectTasks(h.ctx, h.db) + require.NoError(t, err, "unexpected error collecting tasks telemetry") + // Invariant: deleted tasks should NEVER appear in results. + require.Len(t, actualTasks, 1, "expected exactly one task") + + if diff := cmp.Diff(expectedTask, actualTasks[0]); diff != "" { + t.Fatalf("test case %q: task diff (-want +got):\n%s", tt.name, diff) + } + + actualEvents, err := telemetry.CollectTaskEvents(h.ctx, h.db, now.Add(-1*time.Hour), now) + require.NoError(t, err) + if !tt.expectEvent { + require.Empty(t, actualEvents) + } else { + expectedEvent := telemetry.TaskEvent{ + TaskID: expectedTask.ID, + } + if tt.lastPausedOffset != nil { + t := now.Add(*tt.lastPausedOffset) + expectedEvent.LastPausedAt = &t + } + if tt.lastResumedOffset != nil { + t := now.Add(*tt.lastResumedOffset) + expectedEvent.LastResumedAt = &t + } + expectedEvent.PauseReason = tt.pauseReason + expectedEvent.ResumeReason = tt.resumeReason + expectedEvent.IdleDurationMS = tt.idleDurationMS + expectedEvent.PausedDurationMS = tt.pausedDurationMS + expectedEvent.ResumeToStatusMS = tt.resumeToStatusMS + expectedEvent.ActiveDurationMS = tt.activeDurationMS + + // Each test case creates exactly one workspace with lifecycle + // activity, so we expect exactly one event. + require.Len(t, actualEvents, 1) + actual := actualEvents[0] + + if diff := cmp.Diff(expectedEvent, actual); diff != "" { + t.Fatalf("test case %q: event diff (-want +got):\n%s", tt.name, diff) + } + } + }) + } +} + type mockDB struct { database.Store } @@ -767,7 +1350,7 @@ func TestRecordTelemetryStatus(t *testing.T) { require.Nil(t, snapshot1) } - for i := 0; i < 3; i++ { + for range 3 { // Whatever happens, subsequent calls should not report if telemetryEnabled didn't change snapshot2, err := telemetry.RecordTelemetryStatus(ctx, logger, db, testCase.telemetryEnabled) require.NoError(t, err)