From 39bf3ba6282733a88ebaa8fe8a1af045da57c36b Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Sat, 30 Aug 2025 03:39:37 +1000 Subject: [PATCH] chore: replace GetManagedAgentCount query with aggregate table (#19636) - Removes GetManagedAgentCount query - Adds new table `usage_events_daily` which stores aggregated usage events by the type and UTC day - Adds trigger to update the values in this table when a new row is inserted into `usage_events` - Adds a migration that adds `usage_events_daily` rows for existing data in `usage_events` - Adds tests for the trigger - Adds tests for the backfill query in the migration Since the `usage_events` table is unreleased currently, this migration will do nothing on real deployments and will only affect preview deployments such as dogfood. Closes https://github.com/coder/internal/issues/943 --- coderd/database/dbauthz/dbauthz.go | 15 +- coderd/database/dbauthz/dbauthz_test.go | 14 +- coderd/database/dbmetrics/querymetrics.go | 14 +- coderd/database/dbmock/dbmock.go | 30 ++-- coderd/database/dump.sql | 47 +++++++ .../000362_aggregate_usage_events.down.sql | 3 + .../000362_aggregate_usage_events.up.sql | 65 +++++++++ coderd/database/migrations/migrate_test.go | 106 +++++++++++++++ coderd/database/models.go | 8 ++ coderd/database/querier.go | 11 +- coderd/database/querier_test.go | 128 ++++++++++++++++++ coderd/database/queries.sql.go | 76 +++++------ coderd/database/queries/licenses.sql | 25 ---- coderd/database/queries/usageevents.sql | 25 +++- coderd/database/unique_constraint.go | 1 + enterprise/coderd/coderd.go | 8 +- enterprise/coderd/license/license.go | 15 +- enterprise/coderd/license/license_test.go | 13 +- 18 files changed, 488 insertions(+), 116 deletions(-) create mode 100644 coderd/database/migrations/000362_aggregate_usage_events.down.sql create mode 100644 coderd/database/migrations/000362_aggregate_usage_events.up.sql diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 53c58a5de1..a87e49ef2d 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2252,14 +2252,6 @@ func (q *querier) GetLogoURL(ctx context.Context) (string, error) { return q.db.GetLogoURL(ctx) } -func (q *querier) GetManagedAgentCount(ctx context.Context, arg database.GetManagedAgentCountParams) (int64, error) { - // Must be able to read all workspaces to check usage. - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceWorkspace); err != nil { - return 0, xerrors.Errorf("authorize read all workspaces: %w", err) - } - return q.db.GetManagedAgentCount(ctx, arg) -} - func (q *querier) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceNotificationMessage); err != nil { return nil, err @@ -3058,6 +3050,13 @@ func (q *querier) GetTemplatesWithFilter(ctx context.Context, arg database.GetTe return q.db.GetAuthorizedTemplates(ctx, arg, prep) } +func (q *querier) GetTotalUsageDCManagedAgentsV1(ctx context.Context, arg database.GetTotalUsageDCManagedAgentsV1Params) (int64, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceUsageEvent); err != nil { + return 0, err + } + return q.db.GetTotalUsageDCManagedAgentsV1(ctx, arg) +} + func (q *querier) GetUnexpiredLicenses(ctx context.Context) ([]database.License, error) { if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceLicense); err != nil { return nil, err diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 40caad0818..a51fdd397a 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -723,12 +723,6 @@ func (s *MethodTestSuite) TestLicense() { dbm.EXPECT().GetAnnouncementBanners(gomock.Any()).Return("value", nil).AnyTimes() check.Args().Asserts().Returns("value") })) - s.Run("GetManagedAgentCount", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) { - start := dbtime.Now() - end := start.Add(time.Hour) - dbm.EXPECT().GetManagedAgentCount(gomock.Any(), database.GetManagedAgentCountParams{StartTime: start, EndTime: end}).Return(int64(0), nil).AnyTimes() - check.Args(database.GetManagedAgentCountParams{StartTime: start, EndTime: end}).Asserts(rbac.ResourceWorkspace, policy.ActionRead).Returns(int64(0)) - })) } func (s *MethodTestSuite) TestOrganization() { @@ -4284,4 +4278,12 @@ func (s *MethodTestSuite) TestUsageEvents() { db.EXPECT().UpdateUsageEventsPostPublish(gomock.Any(), params).Return(nil) check.Args(params).Asserts(rbac.ResourceUsageEvent, policy.ActionUpdate) })) + + s.Run("GetTotalUsageDCManagedAgentsV1", s.Mocked(func(db *dbmock.MockStore, faker *gofakeit.Faker, check *expects) { + db.EXPECT().GetTotalUsageDCManagedAgentsV1(gomock.Any(), gomock.Any()).Return(int64(1), nil) + check.Args(database.GetTotalUsageDCManagedAgentsV1Params{ + StartDate: time.Time{}, + EndDate: time.Time{}, + }).Asserts(rbac.ResourceUsageEvent, policy.ActionRead) + })) } diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 3f729acdcc..c1943e8e7a 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -978,13 +978,6 @@ func (m queryMetricsStore) GetLogoURL(ctx context.Context) (string, error) { return url, err } -func (m queryMetricsStore) GetManagedAgentCount(ctx context.Context, arg database.GetManagedAgentCountParams) (int64, error) { - start := time.Now() - r0, r1 := m.s.GetManagedAgentCount(ctx, arg) - m.queryLatencies.WithLabelValues("GetManagedAgentCount").Observe(time.Since(start).Seconds()) - return r0, r1 -} - func (m queryMetricsStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { start := time.Now() r0, r1 := m.s.GetNotificationMessagesByStatus(ctx, arg) @@ -1615,6 +1608,13 @@ func (m queryMetricsStore) GetTemplatesWithFilter(ctx context.Context, arg datab return templates, err } +func (m queryMetricsStore) GetTotalUsageDCManagedAgentsV1(ctx context.Context, arg database.GetTotalUsageDCManagedAgentsV1Params) (int64, error) { + start := time.Now() + r0, r1 := m.s.GetTotalUsageDCManagedAgentsV1(ctx, arg) + m.queryLatencies.WithLabelValues("GetTotalUsageDCManagedAgentsV1").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetUnexpiredLicenses(ctx context.Context) ([]database.License, error) { start := time.Now() licenses, err := m.s.GetUnexpiredLicenses(ctx) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 4f01933baf..f16d72899c 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -2041,21 +2041,6 @@ func (mr *MockStoreMockRecorder) GetLogoURL(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetLogoURL", reflect.TypeOf((*MockStore)(nil).GetLogoURL), ctx) } -// GetManagedAgentCount mocks base method. -func (m *MockStore) GetManagedAgentCount(ctx context.Context, arg database.GetManagedAgentCountParams) (int64, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetManagedAgentCount", ctx, arg) - ret0, _ := ret[0].(int64) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetManagedAgentCount indicates an expected call of GetManagedAgentCount. -func (mr *MockStoreMockRecorder) GetManagedAgentCount(ctx, arg any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetManagedAgentCount", reflect.TypeOf((*MockStore)(nil).GetManagedAgentCount), ctx, arg) -} - // GetNotificationMessagesByStatus mocks base method. func (m *MockStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { m.ctrl.T.Helper() @@ -3436,6 +3421,21 @@ func (mr *MockStoreMockRecorder) GetTemplatesWithFilter(ctx, arg any) *gomock.Ca return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTemplatesWithFilter", reflect.TypeOf((*MockStore)(nil).GetTemplatesWithFilter), ctx, arg) } +// GetTotalUsageDCManagedAgentsV1 mocks base method. +func (m *MockStore) GetTotalUsageDCManagedAgentsV1(ctx context.Context, arg database.GetTotalUsageDCManagedAgentsV1Params) (int64, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetTotalUsageDCManagedAgentsV1", ctx, arg) + ret0, _ := ret[0].(int64) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetTotalUsageDCManagedAgentsV1 indicates an expected call of GetTotalUsageDCManagedAgentsV1. +func (mr *MockStoreMockRecorder) GetTotalUsageDCManagedAgentsV1(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTotalUsageDCManagedAgentsV1", reflect.TypeOf((*MockStore)(nil).GetTotalUsageDCManagedAgentsV1), ctx, arg) +} + // GetUnexpiredLicenses mocks base method. func (m *MockStore) GetUnexpiredLicenses(ctx context.Context) ([]database.License, error) { m.ctrl.T.Helper() diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index 066fe0b1b8..273ef55b96 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -361,6 +361,38 @@ CREATE TYPE workspace_transition AS ENUM ( 'delete' ); +CREATE FUNCTION aggregate_usage_event() RETURNS trigger + LANGUAGE plpgsql + AS $$ +BEGIN + -- Check for supported event types and throw error for unknown types + IF NEW.event_type NOT IN ('dc_managed_agents_v1') THEN + RAISE EXCEPTION 'Unhandled usage event type in aggregate_usage_event: %', NEW.event_type; + END IF; + + INSERT INTO usage_events_daily (day, event_type, usage_data) + VALUES ( + -- Extract the date from the created_at timestamp, always using UTC for + -- consistency + date_trunc('day', NEW.created_at AT TIME ZONE 'UTC')::date, + NEW.event_type, + NEW.event_data + ) + ON CONFLICT (day, event_type) DO UPDATE SET + usage_data = CASE + -- Handle simple counter events by summing the count + WHEN NEW.event_type IN ('dc_managed_agents_v1') THEN + jsonb_build_object( + 'count', + COALESCE((usage_events_daily.usage_data->>'count')::bigint, 0) + + COALESCE((NEW.event_data->>'count')::bigint, 0) + ) + END; + + RETURN NEW; +END; +$$; + CREATE FUNCTION check_workspace_agent_name_unique() RETURNS trigger LANGUAGE plpgsql AS $$ @@ -1860,6 +1892,16 @@ COMMENT ON COLUMN usage_events.published_at IS 'Set to a timestamp when the even COMMENT ON COLUMN usage_events.failure_message IS 'Set to an error message when the event is temporarily or permanently unsuccessfully published to the usage collector service.'; +CREATE TABLE usage_events_daily ( + day date NOT NULL, + event_type text NOT NULL, + usage_data jsonb NOT NULL +); + +COMMENT ON TABLE usage_events_daily IS 'usage_events_daily is a daily rollup of usage events. It stores the total usage for each event type by day.'; + +COMMENT ON COLUMN usage_events_daily.day IS 'The date of the summed usage events, always in UTC.'; + CREATE TABLE user_configs ( user_id uuid NOT NULL, key character varying(256) NOT NULL, @@ -2711,6 +2753,9 @@ ALTER TABLE ONLY template_versions ALTER TABLE ONLY templates ADD CONSTRAINT templates_pkey PRIMARY KEY (id); +ALTER TABLE ONLY usage_events_daily + ADD CONSTRAINT usage_events_daily_pkey PRIMARY KEY (day, event_type); + ALTER TABLE ONLY usage_events ADD CONSTRAINT usage_events_pkey PRIMARY KEY (id); @@ -3034,6 +3079,8 @@ CREATE TRIGGER tailnet_notify_peer_change AFTER INSERT OR DELETE OR UPDATE ON ta CREATE TRIGGER tailnet_notify_tunnel_change AFTER INSERT OR DELETE OR UPDATE ON tailnet_tunnels FOR EACH ROW EXECUTE FUNCTION tailnet_notify_tunnel_change(); +CREATE TRIGGER trigger_aggregate_usage_event AFTER INSERT ON usage_events FOR EACH ROW EXECUTE FUNCTION aggregate_usage_event(); + CREATE TRIGGER trigger_delete_group_members_on_org_member_delete BEFORE DELETE ON organization_members FOR EACH ROW EXECUTE FUNCTION delete_group_members_on_org_member_delete(); CREATE TRIGGER trigger_delete_oauth2_provider_app_token AFTER DELETE ON oauth2_provider_app_tokens FOR EACH ROW EXECUTE FUNCTION delete_deleted_oauth2_provider_app_token_api_key(); diff --git a/coderd/database/migrations/000362_aggregate_usage_events.down.sql b/coderd/database/migrations/000362_aggregate_usage_events.down.sql new file mode 100644 index 0000000000..ca49a1a3a2 --- /dev/null +++ b/coderd/database/migrations/000362_aggregate_usage_events.down.sql @@ -0,0 +1,3 @@ +DROP TRIGGER IF EXISTS trigger_aggregate_usage_event ON usage_events; +DROP FUNCTION IF EXISTS aggregate_usage_event(); +DROP TABLE IF EXISTS usage_events_daily; diff --git a/coderd/database/migrations/000362_aggregate_usage_events.up.sql b/coderd/database/migrations/000362_aggregate_usage_events.up.sql new file mode 100644 index 0000000000..58af0398eb --- /dev/null +++ b/coderd/database/migrations/000362_aggregate_usage_events.up.sql @@ -0,0 +1,65 @@ +CREATE TABLE usage_events_daily ( + day date NOT NULL, -- always grouped by day in UTC + event_type text NOT NULL, + usage_data jsonb NOT NULL, + PRIMARY KEY (day, event_type) +); + +COMMENT ON TABLE usage_events_daily IS 'usage_events_daily is a daily rollup of usage events. It stores the total usage for each event type by day.'; +COMMENT ON COLUMN usage_events_daily.day IS 'The date of the summed usage events, always in UTC.'; + +-- Function to handle usage event aggregation +CREATE OR REPLACE FUNCTION aggregate_usage_event() +RETURNS TRIGGER AS $$ +BEGIN + -- Check for supported event types and throw error for unknown types + IF NEW.event_type NOT IN ('dc_managed_agents_v1') THEN + RAISE EXCEPTION 'Unhandled usage event type in aggregate_usage_event: %', NEW.event_type; + END IF; + + INSERT INTO usage_events_daily (day, event_type, usage_data) + VALUES ( + -- Extract the date from the created_at timestamp, always using UTC for + -- consistency + date_trunc('day', NEW.created_at AT TIME ZONE 'UTC')::date, + NEW.event_type, + NEW.event_data + ) + ON CONFLICT (day, event_type) DO UPDATE SET + usage_data = CASE + -- Handle simple counter events by summing the count + WHEN NEW.event_type IN ('dc_managed_agents_v1') THEN + jsonb_build_object( + 'count', + COALESCE((usage_events_daily.usage_data->>'count')::bigint, 0) + + COALESCE((NEW.event_data->>'count')::bigint, 0) + ) + END; + + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Create trigger to automatically aggregate usage events +CREATE TRIGGER trigger_aggregate_usage_event + AFTER INSERT ON usage_events + FOR EACH ROW + EXECUTE FUNCTION aggregate_usage_event(); + +-- Populate usage_events_daily with existing data +INSERT INTO + usage_events_daily (day, event_type, usage_data) +SELECT + date_trunc('day', created_at AT TIME ZONE 'UTC')::date AS day, + event_type, + jsonb_build_object('count', SUM((event_data->>'count')::bigint)) AS usage_data +FROM + usage_events +WHERE + -- The only event type we currently support is dc_managed_agents_v1 + event_type = 'dc_managed_agents_v1' +GROUP BY + date_trunc('day', created_at AT TIME ZONE 'UTC')::date, + event_type +ON CONFLICT (day, event_type) DO UPDATE SET + usage_data = EXCLUDED.usage_data; diff --git a/coderd/database/migrations/migrate_test.go b/coderd/database/migrations/migrate_test.go index f5d84e6532..f31a3adb0e 100644 --- a/coderd/database/migrations/migrate_test.go +++ b/coderd/database/migrations/migrate_test.go @@ -9,17 +9,20 @@ import ( "slices" "sync" "testing" + "time" "github.com/golang-migrate/migrate/v4" migratepostgres "github.com/golang-migrate/migrate/v4/database/postgres" "github.com/golang-migrate/migrate/v4/source" "github.com/golang-migrate/migrate/v4/source/iofs" "github.com/golang-migrate/migrate/v4/source/stub" + "github.com/google/uuid" "github.com/lib/pq" "github.com/stretchr/testify/require" "go.uber.org/goleak" "golang.org/x/sync/errgroup" + "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/database/migrations" "github.com/coder/coder/v2/testutil" @@ -363,3 +366,106 @@ func TestMigrateUpWithFixtures(t *testing.T) { }) } } + +// TestMigration000362AggregateUsageEvents tests the migration that aggregates +// usage events into daily rows correctly. +func TestMigration000362AggregateUsageEvents(t *testing.T) { + t.Parallel() + + const migrationVersion = 362 + + // Similarly to the other test, this test will probably time out in CI. + ctx := testutil.Context(t, testutil.WaitSuperLong) + + sqlDB := testSQLDB(t) + db := database.New(sqlDB) + + // Migrate up to the migration before the one that aggregates usage events. + next, err := migrations.Stepper(sqlDB) + require.NoError(t, err) + for { + version, more, err := next() + require.NoError(t, err) + if !more { + t.Fatalf("migration %d not found", migrationVersion) + } + if version == migrationVersion-1 { + break + } + } + + locSydney, err := time.LoadLocation("Australia/Sydney") + require.NoError(t, err) + + usageEvents := []struct { + // The only possible event type is dc_managed_agents_v1 when this + // migration gets applied. + eventData []byte + createdAt time.Time + }{ + { + eventData: []byte(`{"count": 41}`), + createdAt: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + }, + { + eventData: []byte(`{"count": 1}`), + // 2025-01-01 in UTC + createdAt: time.Date(2025, 1, 2, 8, 38, 57, 0, locSydney), + }, + { + eventData: []byte(`{"count": 1}`), + createdAt: time.Date(2025, 1, 2, 0, 0, 0, 0, time.UTC), + }, + } + expectedDailyRows := []struct { + day time.Time + usageData []byte + }{ + { + day: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + usageData: []byte(`{"count": 42}`), + }, + { + day: time.Date(2025, 1, 2, 0, 0, 0, 0, time.UTC), + usageData: []byte(`{"count": 1}`), + }, + } + + for _, usageEvent := range usageEvents { + err := db.InsertUsageEvent(ctx, database.InsertUsageEventParams{ + ID: uuid.New().String(), + EventType: "dc_managed_agents_v1", + EventData: usageEvent.eventData, + CreatedAt: usageEvent.createdAt, + }) + require.NoError(t, err) + } + + // Migrate up to the migration that aggregates usage events. + version, _, err := next() + require.NoError(t, err) + require.EqualValues(t, migrationVersion, version) + + // Get all of the newly created daily rows. This query is not exposed in the + // querier interface intentionally. + rows, err := sqlDB.QueryContext(ctx, "SELECT day, event_type, usage_data FROM usage_events_daily ORDER BY day ASC") + require.NoError(t, err, "perform query") + defer rows.Close() + var out []database.UsageEventsDaily + for rows.Next() { + var row database.UsageEventsDaily + err := rows.Scan(&row.Day, &row.EventType, &row.UsageData) + require.NoError(t, err, "scan row") + out = append(out, row) + } + + // Verify that the daily rows match our expectations. + require.Len(t, out, len(expectedDailyRows)) + for i, row := range out { + require.Equal(t, "dc_managed_agents_v1", row.EventType) + // The read row might be `+0000` rather than `UTC` specifically, so just + // ensure it's within 1 second of the expected time. + require.WithinDuration(t, expectedDailyRows[i].day, row.Day, time.Second) + require.JSONEq(t, string(expectedDailyRows[i].usageData), string(row.UsageData)) + } +} diff --git a/coderd/database/models.go b/coderd/database/models.go index effd436f4d..99107713b0 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -3778,6 +3778,14 @@ type UsageEvent struct { FailureMessage sql.NullString `db:"failure_message" json:"failure_message"` } +// usage_events_daily is a daily rollup of usage events. It stores the total usage for each event type by day. +type UsageEventsDaily struct { + // The date of the summed usage events, always in UTC. + Day time.Time `db:"day" json:"day"` + EventType string `db:"event_type" json:"event_type"` + UsageData json.RawMessage `db:"usage_data" json:"usage_data"` +} + type User struct { ID uuid.UUID `db:"id" json:"id"` Email string `db:"email" json:"email"` diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 6e955b82b0..f0b5cb6db4 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -222,8 +222,6 @@ type sqlcQuerier interface { GetLicenseByID(ctx context.Context, id int32) (License, error) GetLicenses(ctx context.Context) ([]License, error) GetLogoURL(ctx context.Context) (string, error) - // This isn't strictly a license query, but it's related to license enforcement. - GetManagedAgentCount(ctx context.Context, arg GetManagedAgentCountParams) (int64, error) GetNotificationMessagesByStatus(ctx context.Context, arg GetNotificationMessagesByStatusParams) ([]NotificationMessage, error) // Fetch the notification report generator log indicating recent activity. GetNotificationReportGeneratorLogByTemplate(ctx context.Context, templateID uuid.UUID) (NotificationReportGeneratorLog, error) @@ -372,6 +370,15 @@ type sqlcQuerier interface { GetTemplateVersionsCreatedAfter(ctx context.Context, createdAt time.Time) ([]TemplateVersion, error) GetTemplates(ctx context.Context) ([]Template, error) GetTemplatesWithFilter(ctx context.Context, arg GetTemplatesWithFilterParams) ([]Template, error) + // Gets the total number of managed agents created between two dates. Uses the + // aggregate table to avoid large scans or a complex index on the usage_events + // table. + // + // This has the trade off that we can't count accurately between two exact + // timestamps. The provided timestamps will be converted to UTC and truncated to + // the events that happened on and between the two dates. Both dates are + // inclusive. + GetTotalUsageDCManagedAgentsV1(ctx context.Context, arg GetTotalUsageDCManagedAgentsV1Params) (int64, error) GetUnexpiredLicenses(ctx context.Context) ([]License, error) // GetUserActivityInsights returns the ranking with top active users. // The result can be filtered on template_ids, meaning only user data diff --git a/coderd/database/querier_test.go b/coderd/database/querier_test.go index a8b3c186ed..c7daaaed35 100644 --- a/coderd/database/querier_test.go +++ b/coderd/database/querier_test.go @@ -6652,3 +6652,131 @@ func TestGetLatestWorkspaceBuildsByWorkspaceIDs(t *testing.T) { require.Equal(t, expB.BuildNumber, b.BuildNumber, "unexpected build number") } } + +func TestUsageEventsTrigger(t *testing.T) { + t.Parallel() + + // This is not exposed in the querier interface intentionally. + getDailyRows := func(ctx context.Context, sqlDB *sql.DB) []database.UsageEventsDaily { + t.Helper() + rows, err := sqlDB.QueryContext(ctx, "SELECT day, event_type, usage_data FROM usage_events_daily ORDER BY day ASC") + require.NoError(t, err, "perform query") + defer rows.Close() + + var out []database.UsageEventsDaily + for rows.Next() { + var row database.UsageEventsDaily + err := rows.Scan(&row.Day, &row.EventType, &row.UsageData) + require.NoError(t, err, "scan row") + out = append(out, row) + } + return out + } + + t.Run("OK", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitLong) + db, _, sqlDB := dbtestutil.NewDBWithSQLDB(t) + + // Assert there are no daily rows. + rows := getDailyRows(ctx, sqlDB) + require.Len(t, rows, 0) + + // Insert a usage event. + err := db.InsertUsageEvent(ctx, database.InsertUsageEventParams{ + ID: "1", + EventType: "dc_managed_agents_v1", + EventData: []byte(`{"count": 41}`), + CreatedAt: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + }) + require.NoError(t, err) + + // Assert there is one daily row that contains the correct data. + rows = getDailyRows(ctx, sqlDB) + require.Len(t, rows, 1) + require.Equal(t, "dc_managed_agents_v1", rows[0].EventType) + require.JSONEq(t, `{"count": 41}`, string(rows[0].UsageData)) + // The read row might be `+0000` rather than `UTC` specifically, so just + // ensure it's within 1 second of the expected time. + require.WithinDuration(t, time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), rows[0].Day, time.Second) + + // Insert a new usage event on the same UTC day, should increment the count. + locSydney, err := time.LoadLocation("Australia/Sydney") + require.NoError(t, err) + err = db.InsertUsageEvent(ctx, database.InsertUsageEventParams{ + ID: "2", + EventType: "dc_managed_agents_v1", + EventData: []byte(`{"count": 1}`), + // Insert it at a random point during the same day. Sydney is +1000 or + // +1100, so 8am in Sydney is the previous day in UTC. + CreatedAt: time.Date(2025, 1, 2, 8, 38, 57, 0, locSydney), + }) + require.NoError(t, err) + + // There should still be only one daily row with the incremented count. + rows = getDailyRows(ctx, sqlDB) + require.Len(t, rows, 1) + require.Equal(t, "dc_managed_agents_v1", rows[0].EventType) + require.JSONEq(t, `{"count": 42}`, string(rows[0].UsageData)) + require.WithinDuration(t, time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), rows[0].Day, time.Second) + + // TODO: when we have a new event type, we should test that adding an + // event with a different event type on the same day creates a new daily + // row. + + // Insert a new usage event on a different day, should create a new daily + // row. + err = db.InsertUsageEvent(ctx, database.InsertUsageEventParams{ + ID: "3", + EventType: "dc_managed_agents_v1", + EventData: []byte(`{"count": 1}`), + CreatedAt: time.Date(2025, 1, 2, 0, 0, 0, 0, time.UTC), + }) + require.NoError(t, err) + + // There should now be two daily rows. + rows = getDailyRows(ctx, sqlDB) + require.Len(t, rows, 2) + // Output is sorted by day ascending, so the first row should be the + // previous day's row. + require.Equal(t, "dc_managed_agents_v1", rows[0].EventType) + require.JSONEq(t, `{"count": 42}`, string(rows[0].UsageData)) + require.WithinDuration(t, time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), rows[0].Day, time.Second) + require.Equal(t, "dc_managed_agents_v1", rows[1].EventType) + require.JSONEq(t, `{"count": 1}`, string(rows[1].UsageData)) + require.WithinDuration(t, time.Date(2025, 1, 2, 0, 0, 0, 0, time.UTC), rows[1].Day, time.Second) + }) + + t.Run("UnknownEventType", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitLong) + db, _, sqlDB := dbtestutil.NewDBWithSQLDB(t) + + // Relax the usage_events.event_type check constraint to see what + // happens when we insert a usage event that the trigger doesn't know + // about. + _, err := sqlDB.ExecContext(ctx, "ALTER TABLE usage_events DROP CONSTRAINT usage_event_type_check") + require.NoError(t, err) + + // Insert a usage event with an unknown event type. + err = db.InsertUsageEvent(ctx, database.InsertUsageEventParams{ + ID: "broken", + EventType: "dean's cool event", + EventData: []byte(`{"my": "cool json"}`), + CreatedAt: time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC), + }) + require.ErrorContains(t, err, "Unhandled usage event type in aggregate_usage_event") + + // The event should've been blocked. + var count int + err = sqlDB.QueryRowContext(ctx, "SELECT COUNT(*) FROM usage_events WHERE id = 'broken'").Scan(&count) + require.NoError(t, err) + require.Equal(t, 0, count) + + // We should not have any daily rows. + rows := getDailyRows(ctx, sqlDB) + require.Len(t, rows, 0) + }) +} diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index d5495c4df5..78f61ee59e 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -4334,44 +4334,6 @@ func (q *sqlQuerier) GetLicenses(ctx context.Context) ([]License, error) { return items, nil } -const getManagedAgentCount = `-- name: GetManagedAgentCount :one -SELECT - COUNT(DISTINCT wb.id) AS count -FROM - workspace_builds AS wb -JOIN - provisioner_jobs AS pj -ON - wb.job_id = pj.id -WHERE - wb.transition = 'start'::workspace_transition - AND wb.has_ai_task = true - -- Only count jobs that are pending, running or succeeded. Other statuses - -- like cancel(ed|ing), failed or unknown are not considered as managed - -- agent usage. These workspace builds are typically unusable anyway. - AND pj.job_status IN ( - 'pending'::provisioner_job_status, - 'running'::provisioner_job_status, - 'succeeded'::provisioner_job_status - ) - -- Jobs are counted at the time they are created, not when they are - -- completed, as pending jobs haven't completed yet. - AND wb.created_at BETWEEN $1::timestamptz AND $2::timestamptz -` - -type GetManagedAgentCountParams struct { - StartTime time.Time `db:"start_time" json:"start_time"` - EndTime time.Time `db:"end_time" json:"end_time"` -} - -// This isn't strictly a license query, but it's related to license enforcement. -func (q *sqlQuerier) GetManagedAgentCount(ctx context.Context, arg GetManagedAgentCountParams) (int64, error) { - row := q.db.QueryRowContext(ctx, getManagedAgentCount, arg.StartTime, arg.EndTime) - var count int64 - err := row.Scan(&count) - return count, err -} - const getUnexpiredLicenses = `-- name: GetUnexpiredLicenses :many SELECT id, uploaded_at, jwt, exp, uuid FROM licenses @@ -13634,6 +13596,40 @@ func (q *sqlQuerier) DisableForeignKeysAndTriggers(ctx context.Context) error { return err } +const getTotalUsageDCManagedAgentsV1 = `-- name: GetTotalUsageDCManagedAgentsV1 :one +SELECT + -- The first cast is necessary since you can't sum strings, and the second + -- cast is necessary to make sqlc happy. + COALESCE(SUM((usage_data->>'count')::bigint), 0)::bigint AS total_count +FROM + usage_events_daily +WHERE + event_type = 'dc_managed_agents_v1' + -- Parentheses are necessary to avoid sqlc from generating an extra + -- argument. + AND day BETWEEN date_trunc('day', ($1::timestamptz) AT TIME ZONE 'UTC')::date AND date_trunc('day', ($2::timestamptz) AT TIME ZONE 'UTC')::date +` + +type GetTotalUsageDCManagedAgentsV1Params struct { + StartDate time.Time `db:"start_date" json:"start_date"` + EndDate time.Time `db:"end_date" json:"end_date"` +} + +// Gets the total number of managed agents created between two dates. Uses the +// aggregate table to avoid large scans or a complex index on the usage_events +// table. +// +// This has the trade off that we can't count accurately between two exact +// timestamps. The provided timestamps will be converted to UTC and truncated to +// the events that happened on and between the two dates. Both dates are +// inclusive. +func (q *sqlQuerier) GetTotalUsageDCManagedAgentsV1(ctx context.Context, arg GetTotalUsageDCManagedAgentsV1Params) (int64, error) { + row := q.db.QueryRowContext(ctx, getTotalUsageDCManagedAgentsV1, arg.StartDate, arg.EndDate) + var total_count int64 + err := row.Scan(&total_count) + return total_count, err +} + const insertUsageEvent = `-- name: InsertUsageEvent :exec INSERT INTO usage_events ( @@ -13693,7 +13689,7 @@ WITH usage_events AS ( -- than an hour ago. This is so we can retry publishing -- events where the replica exited or couldn't update the -- row. - -- The parenthesis around @now::timestamptz are necessary to + -- The parentheses around @now::timestamptz are necessary to -- avoid sqlc from generating an extra argument. OR potential_event.publish_started_at < ($1::timestamptz) - INTERVAL '1 hour' ) @@ -13701,7 +13697,7 @@ WITH usage_events AS ( -- always permanently reject these events anyways. This is to -- avoid duplicate events being billed to customers, as -- Metronome will only deduplicate events within 34 days. - -- Also, the same parenthesis thing here as above. + -- Also, the same parentheses thing here as above. AND potential_event.created_at > ($1::timestamptz) - INTERVAL '30 days' ORDER BY potential_event.created_at ASC FOR UPDATE SKIP LOCKED diff --git a/coderd/database/queries/licenses.sql b/coderd/database/queries/licenses.sql index ac864a94d1..3512a46514 100644 --- a/coderd/database/queries/licenses.sql +++ b/coderd/database/queries/licenses.sql @@ -35,28 +35,3 @@ DELETE FROM licenses WHERE id = $1 RETURNING id; - --- name: GetManagedAgentCount :one --- This isn't strictly a license query, but it's related to license enforcement. -SELECT - COUNT(DISTINCT wb.id) AS count -FROM - workspace_builds AS wb -JOIN - provisioner_jobs AS pj -ON - wb.job_id = pj.id -WHERE - wb.transition = 'start'::workspace_transition - AND wb.has_ai_task = true - -- Only count jobs that are pending, running or succeeded. Other statuses - -- like cancel(ed|ing), failed or unknown are not considered as managed - -- agent usage. These workspace builds are typically unusable anyway. - AND pj.job_status IN ( - 'pending'::provisioner_job_status, - 'running'::provisioner_job_status, - 'succeeded'::provisioner_job_status - ) - -- Jobs are counted at the time they are created, not when they are - -- completed, as pending jobs haven't completed yet. - AND wb.created_at BETWEEN @start_time::timestamptz AND @end_time::timestamptz; diff --git a/coderd/database/queries/usageevents.sql b/coderd/database/queries/usageevents.sql index 85b53e04fd..291e275c60 100644 --- a/coderd/database/queries/usageevents.sql +++ b/coderd/database/queries/usageevents.sql @@ -39,7 +39,7 @@ WITH usage_events AS ( -- than an hour ago. This is so we can retry publishing -- events where the replica exited or couldn't update the -- row. - -- The parenthesis around @now::timestamptz are necessary to + -- The parentheses around @now::timestamptz are necessary to -- avoid sqlc from generating an extra argument. OR potential_event.publish_started_at < (@now::timestamptz) - INTERVAL '1 hour' ) @@ -47,7 +47,7 @@ WITH usage_events AS ( -- always permanently reject these events anyways. This is to -- avoid duplicate events being billed to customers, as -- Metronome will only deduplicate events within 34 days. - -- Also, the same parenthesis thing here as above. + -- Also, the same parentheses thing here as above. AND potential_event.created_at > (@now::timestamptz) - INTERVAL '30 days' ORDER BY potential_event.created_at ASC FOR UPDATE SKIP LOCKED @@ -84,3 +84,24 @@ WHERE -- zero, so this is the best we can do. AND cardinality(@ids::text[]) = cardinality(@failure_messages::text[]) AND cardinality(@ids::text[]) = cardinality(@set_published_ats::boolean[]); + +-- name: GetTotalUsageDCManagedAgentsV1 :one +-- Gets the total number of managed agents created between two dates. Uses the +-- aggregate table to avoid large scans or a complex index on the usage_events +-- table. +-- +-- This has the trade off that we can't count accurately between two exact +-- timestamps. The provided timestamps will be converted to UTC and truncated to +-- the events that happened on and between the two dates. Both dates are +-- inclusive. +SELECT + -- The first cast is necessary since you can't sum strings, and the second + -- cast is necessary to make sqlc happy. + COALESCE(SUM((usage_data->>'count')::bigint), 0)::bigint AS total_count +FROM + usage_events_daily +WHERE + event_type = 'dc_managed_agents_v1' + -- Parentheses are necessary to avoid sqlc from generating an extra + -- argument. + AND day BETWEEN date_trunc('day', (@start_date::timestamptz) AT TIME ZONE 'UTC')::date AND date_trunc('day', (@end_date::timestamptz) AT TIME ZONE 'UTC')::date; diff --git a/coderd/database/unique_constraint.go b/coderd/database/unique_constraint.go index 1b0b13ea2b..ddb83a339f 100644 --- a/coderd/database/unique_constraint.go +++ b/coderd/database/unique_constraint.go @@ -67,6 +67,7 @@ const ( UniqueTemplateVersionsPkey UniqueConstraint = "template_versions_pkey" // ALTER TABLE ONLY template_versions ADD CONSTRAINT template_versions_pkey PRIMARY KEY (id); UniqueTemplateVersionsTemplateIDNameKey UniqueConstraint = "template_versions_template_id_name_key" // ALTER TABLE ONLY template_versions ADD CONSTRAINT template_versions_template_id_name_key UNIQUE (template_id, name); UniqueTemplatesPkey UniqueConstraint = "templates_pkey" // ALTER TABLE ONLY templates ADD CONSTRAINT templates_pkey PRIMARY KEY (id); + UniqueUsageEventsDailyPkey UniqueConstraint = "usage_events_daily_pkey" // ALTER TABLE ONLY usage_events_daily ADD CONSTRAINT usage_events_daily_pkey PRIMARY KEY (day, event_type); UniqueUsageEventsPkey UniqueConstraint = "usage_events_pkey" // ALTER TABLE ONLY usage_events ADD CONSTRAINT usage_events_pkey PRIMARY KEY (id); UniqueUserConfigsPkey UniqueConstraint = "user_configs_pkey" // ALTER TABLE ONLY user_configs ADD CONSTRAINT user_configs_pkey PRIMARY KEY (user_id, key); UniqueUserDeletedPkey UniqueConstraint = "user_deleted_pkey" // ALTER TABLE ONLY user_deleted ADD CONSTRAINT user_deleted_pkey PRIMARY KEY (id); diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index a81e165854..0d276eef86 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -984,10 +984,10 @@ func (api *API) CheckBuildUsage(ctx context.Context, store database.Store, templ // This check is intentionally not committed to the database. It's fine if // it's not 100% accurate or allows for minor breaches due to build races. - // nolint:gocritic // Requires permission to read all workspaces to read managed agent count. - managedAgentCount, err := store.GetManagedAgentCount(agpldbauthz.AsSystemRestricted(ctx), database.GetManagedAgentCountParams{ - StartTime: managedAgentLimit.UsagePeriod.Start, - EndTime: managedAgentLimit.UsagePeriod.End, + // nolint:gocritic // Requires permission to read all usage events. + managedAgentCount, err := store.GetTotalUsageDCManagedAgentsV1(agpldbauthz.AsSystemRestricted(ctx), database.GetTotalUsageDCManagedAgentsV1Params{ + StartDate: managedAgentLimit.UsagePeriod.Start, + EndDate: managedAgentLimit.UsagePeriod.End, }) if err != nil { return wsbuilder.UsageCheckResponse{}, xerrors.Errorf("get managed agent count: %w", err) diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index d2913f7e0e..5d0fc9b9fb 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -125,10 +125,19 @@ func Entitlements( ExternalWorkspaceCount: int64(len(externalWorkspaces)), ExternalTemplateCount: int64(len(externalTemplates)), ManagedAgentCountFn: func(ctx context.Context, startTime time.Time, endTime time.Time) (int64, error) { + // This is not super accurate, as the start and end times will be + // truncated to the date in UTC timezone. This is an optimization + // so we can use an aggregate table instead of scanning the usage + // events table. + // + // High accuracy is not super necessary, as we give buffers in our + // licenses (e.g. higher hard limit) to account for additional + // usage. + // // nolint:gocritic // Requires permission to read all workspaces to read managed agent count. - return db.GetManagedAgentCount(dbauthz.AsSystemRestricted(ctx), database.GetManagedAgentCountParams{ - StartTime: startTime, - EndTime: endTime, + return db.GetTotalUsageDCManagedAgentsV1(dbauthz.AsSystemRestricted(ctx), database.GetTotalUsageDCManagedAgentsV1Params{ + StartDate: startTime, + EndDate: endTime, }) }, }) diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index c457b7f076..1889cb7105 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -827,12 +827,17 @@ func TestEntitlements(t *testing.T) { GetActiveUserCount(gomock.Any(), false). Return(int64(1), nil) mDB.EXPECT(). - GetManagedAgentCount(gomock.Any(), gomock.Cond(func(params database.GetManagedAgentCountParams) bool { - // gomock doesn't seem to compare times very nicely. - if !assert.WithinDuration(t, licenseOpts.NotBefore, params.StartTime, time.Second) { + GetTotalUsageDCManagedAgentsV1(gomock.Any(), gomock.Cond(func(params database.GetTotalUsageDCManagedAgentsV1Params) bool { + // gomock doesn't seem to compare times very nicely, so check + // them manually. + // + // The query truncates these times to the date in UTC timezone, + // but we still check that we're passing in the correct + // timestamp in the first place. + if !assert.WithinDuration(t, licenseOpts.NotBefore, params.StartDate, time.Second) { return false } - if !assert.WithinDuration(t, licenseOpts.ExpiresAt, params.EndTime, time.Second) { + if !assert.WithinDuration(t, licenseOpts.ExpiresAt, params.EndDate, time.Second) { return false } return true