Files
Zach a31e476623 fix: make boundary usage telemetry collection atomic (#21907)
Previously, UpsertBoundaryUsageStats (INSERT...ON CONFLICT DO UPDATE) and
GetAndResetBoundaryUsageSummary (DELETE...RETURNING) could race during
telemetry period cutover. Without serialization, an upsert concurrent with the
delete could lose data (deleted right after being written) or commit after the
delete (miscounted in the next period). Both operations now acquire
LockIDBoundaryUsageStats within a transaction to ensure a clean cutover.
2026-02-06 09:52:17 -07:00

154 lines
5.4 KiB
Go

package boundaryusage
import (
"context"
"sync"
"time"
"github.com/google/uuid"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
)
// Tracker tracks boundary usage for telemetry reporting.
//
// Unique user/workspace counts are tracked both cumulatively and as deltas since
// the last flush. The delta is needed because when a new telemetry period starts
// (the DB row is deleted), we must only insert data accumulated since the last
// flush. If we used cumulative values, stale data from the previous period would
// be written to the new row and then lost when subsequent updates overwrite it.
//
// Request counts are tracked as deltas and accumulated in the database.
type Tracker struct {
mu sync.Mutex
// Cumulative unique counts for the current period (used on UPDATE to
// replace the DB value with accurate totals).
workspaces map[uuid.UUID]struct{}
users map[uuid.UUID]struct{}
// Delta unique counts since last flush (used on INSERT to avoid writing
// stale data from the previous period).
workspacesDelta map[uuid.UUID]struct{}
usersDelta map[uuid.UUID]struct{}
// Request deltas (always reset when flushing, accumulated in DB).
allowedRequests int64
deniedRequests int64
usageSinceLastFlush bool
}
// NewTracker creates a new boundary usage tracker.
func NewTracker() *Tracker {
return &Tracker{
workspaces: make(map[uuid.UUID]struct{}),
users: make(map[uuid.UUID]struct{}),
workspacesDelta: make(map[uuid.UUID]struct{}),
usersDelta: make(map[uuid.UUID]struct{}),
}
}
// Track records boundary usage for a workspace.
func (t *Tracker) Track(workspaceID, ownerID uuid.UUID, allowed, denied int64) {
t.mu.Lock()
defer t.mu.Unlock()
t.workspaces[workspaceID] = struct{}{}
t.users[ownerID] = struct{}{}
t.workspacesDelta[workspaceID] = struct{}{}
t.usersDelta[ownerID] = struct{}{}
t.allowedRequests += allowed
t.deniedRequests += denied
t.usageSinceLastFlush = true
}
// FlushToDB writes stats to the database. For unique counts, cumulative values
// are used on UPDATE (replacing the DB value) while delta values are used on
// INSERT (starting fresh). Request counts are always deltas, accumulated in DB.
// All deltas are reset immediately after snapshot so Track() calls during the
// DB operation are preserved for the next flush.
func (t *Tracker) FlushToDB(ctx context.Context, db database.Store, replicaID uuid.UUID) error {
t.mu.Lock()
if !t.usageSinceLastFlush {
t.mu.Unlock()
return nil
}
// Snapshot all values.
workspaceCount := int64(len(t.workspaces)) // cumulative, for UPDATE
userCount := int64(len(t.users)) // cumulative, for UPDATE
workspaceDelta := int64(len(t.workspacesDelta)) // delta, for INSERT
userDelta := int64(len(t.usersDelta)) // delta, for INSERT
allowed := t.allowedRequests // delta, accumulated in DB
denied := t.deniedRequests // delta, accumulated in DB
// Reset all deltas immediately so Track() calls during the DB operation
// below are preserved for the next flush.
t.workspacesDelta = make(map[uuid.UUID]struct{})
t.usersDelta = make(map[uuid.UUID]struct{})
t.allowedRequests = 0
t.deniedRequests = 0
t.usageSinceLastFlush = false
t.mu.Unlock()
//nolint:gocritic // This is the actual package doing boundary usage tracking.
authCtx := dbauthz.AsBoundaryUsageTracker(ctx)
err := db.InTx(func(tx database.Store) error {
// The advisory lock ensures a clean period cutover by preventing
// this upsert from racing with the aggregate+delete in
// GetAndResetBoundaryUsageSummary. Without it, upserted data
// could be lost or miscounted across periods.
if err := tx.AcquireLock(authCtx, database.LockIDBoundaryUsageStats); err != nil {
return err
}
_, err := tx.UpsertBoundaryUsageStats(authCtx, database.UpsertBoundaryUsageStatsParams{
ReplicaID: replicaID,
UniqueWorkspacesCount: workspaceCount, // cumulative, for UPDATE
UniqueUsersCount: userCount, // cumulative, for UPDATE
UniqueWorkspacesDelta: workspaceDelta, // delta, for INSERT
UniqueUsersDelta: userDelta, // delta, for INSERT
AllowedRequests: allowed,
DeniedRequests: denied,
})
return err
}, nil)
// Always reset cumulative counts to prevent unbounded memory growth (e.g.
// if the DB is unreachable). Copy delta maps to preserve any Track() calls
// that occurred during the DB operation above.
t.mu.Lock()
t.workspaces = make(map[uuid.UUID]struct{})
t.users = make(map[uuid.UUID]struct{})
for id := range t.workspacesDelta {
t.workspaces[id] = struct{}{}
}
for id := range t.usersDelta {
t.users[id] = struct{}{}
}
t.mu.Unlock()
return err
}
// StartFlushLoop begins the periodic flush loop that writes accumulated stats
// to the database. It blocks until the context is canceled. Flushes every
// minute to keep stats reasonably fresh for telemetry collection (which runs
// every 30 minutes by default) without excessive DB writes.
func (t *Tracker) StartFlushLoop(ctx context.Context, log slog.Logger, db database.Store, replicaID uuid.UUID) {
ticker := time.NewTicker(time.Minute)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
if err := t.FlushToDB(ctx, db, replicaID); err != nil {
log.Warn(ctx, "failed to flush boundary usage stats", slog.Error(err))
}
}
}
}