mirror of
https://github.com/coder/coder.git
synced 2026-06-03 04:58:23 +00:00
a31e476623
Previously, UpsertBoundaryUsageStats (INSERT...ON CONFLICT DO UPDATE) and GetAndResetBoundaryUsageSummary (DELETE...RETURNING) could race during telemetry period cutover. Without serialization, an upsert concurrent with the delete could lose data (deleted right after being written) or commit after the delete (miscounted in the next period). Both operations now acquire LockIDBoundaryUsageStats within a transaction to ensure a clean cutover.
154 lines
5.4 KiB
Go
154 lines
5.4 KiB
Go
package boundaryusage
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
)
|
|
|
|
// Tracker tracks boundary usage for telemetry reporting.
|
|
//
|
|
// Unique user/workspace counts are tracked both cumulatively and as deltas since
|
|
// the last flush. The delta is needed because when a new telemetry period starts
|
|
// (the DB row is deleted), we must only insert data accumulated since the last
|
|
// flush. If we used cumulative values, stale data from the previous period would
|
|
// be written to the new row and then lost when subsequent updates overwrite it.
|
|
//
|
|
// Request counts are tracked as deltas and accumulated in the database.
|
|
type Tracker struct {
|
|
mu sync.Mutex
|
|
|
|
// Cumulative unique counts for the current period (used on UPDATE to
|
|
// replace the DB value with accurate totals).
|
|
workspaces map[uuid.UUID]struct{}
|
|
users map[uuid.UUID]struct{}
|
|
|
|
// Delta unique counts since last flush (used on INSERT to avoid writing
|
|
// stale data from the previous period).
|
|
workspacesDelta map[uuid.UUID]struct{}
|
|
usersDelta map[uuid.UUID]struct{}
|
|
|
|
// Request deltas (always reset when flushing, accumulated in DB).
|
|
allowedRequests int64
|
|
deniedRequests int64
|
|
|
|
usageSinceLastFlush bool
|
|
}
|
|
|
|
// NewTracker creates a new boundary usage tracker.
|
|
func NewTracker() *Tracker {
|
|
return &Tracker{
|
|
workspaces: make(map[uuid.UUID]struct{}),
|
|
users: make(map[uuid.UUID]struct{}),
|
|
workspacesDelta: make(map[uuid.UUID]struct{}),
|
|
usersDelta: make(map[uuid.UUID]struct{}),
|
|
}
|
|
}
|
|
|
|
// Track records boundary usage for a workspace.
|
|
func (t *Tracker) Track(workspaceID, ownerID uuid.UUID, allowed, denied int64) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
t.workspaces[workspaceID] = struct{}{}
|
|
t.users[ownerID] = struct{}{}
|
|
t.workspacesDelta[workspaceID] = struct{}{}
|
|
t.usersDelta[ownerID] = struct{}{}
|
|
t.allowedRequests += allowed
|
|
t.deniedRequests += denied
|
|
t.usageSinceLastFlush = true
|
|
}
|
|
|
|
// FlushToDB writes stats to the database. For unique counts, cumulative values
|
|
// are used on UPDATE (replacing the DB value) while delta values are used on
|
|
// INSERT (starting fresh). Request counts are always deltas, accumulated in DB.
|
|
// All deltas are reset immediately after snapshot so Track() calls during the
|
|
// DB operation are preserved for the next flush.
|
|
func (t *Tracker) FlushToDB(ctx context.Context, db database.Store, replicaID uuid.UUID) error {
|
|
t.mu.Lock()
|
|
if !t.usageSinceLastFlush {
|
|
t.mu.Unlock()
|
|
return nil
|
|
}
|
|
|
|
// Snapshot all values.
|
|
workspaceCount := int64(len(t.workspaces)) // cumulative, for UPDATE
|
|
userCount := int64(len(t.users)) // cumulative, for UPDATE
|
|
workspaceDelta := int64(len(t.workspacesDelta)) // delta, for INSERT
|
|
userDelta := int64(len(t.usersDelta)) // delta, for INSERT
|
|
allowed := t.allowedRequests // delta, accumulated in DB
|
|
denied := t.deniedRequests // delta, accumulated in DB
|
|
|
|
// Reset all deltas immediately so Track() calls during the DB operation
|
|
// below are preserved for the next flush.
|
|
t.workspacesDelta = make(map[uuid.UUID]struct{})
|
|
t.usersDelta = make(map[uuid.UUID]struct{})
|
|
t.allowedRequests = 0
|
|
t.deniedRequests = 0
|
|
t.usageSinceLastFlush = false
|
|
t.mu.Unlock()
|
|
|
|
//nolint:gocritic // This is the actual package doing boundary usage tracking.
|
|
authCtx := dbauthz.AsBoundaryUsageTracker(ctx)
|
|
err := db.InTx(func(tx database.Store) error {
|
|
// The advisory lock ensures a clean period cutover by preventing
|
|
// this upsert from racing with the aggregate+delete in
|
|
// GetAndResetBoundaryUsageSummary. Without it, upserted data
|
|
// could be lost or miscounted across periods.
|
|
if err := tx.AcquireLock(authCtx, database.LockIDBoundaryUsageStats); err != nil {
|
|
return err
|
|
}
|
|
_, err := tx.UpsertBoundaryUsageStats(authCtx, database.UpsertBoundaryUsageStatsParams{
|
|
ReplicaID: replicaID,
|
|
UniqueWorkspacesCount: workspaceCount, // cumulative, for UPDATE
|
|
UniqueUsersCount: userCount, // cumulative, for UPDATE
|
|
UniqueWorkspacesDelta: workspaceDelta, // delta, for INSERT
|
|
UniqueUsersDelta: userDelta, // delta, for INSERT
|
|
AllowedRequests: allowed,
|
|
DeniedRequests: denied,
|
|
})
|
|
return err
|
|
}, nil)
|
|
|
|
// Always reset cumulative counts to prevent unbounded memory growth (e.g.
|
|
// if the DB is unreachable). Copy delta maps to preserve any Track() calls
|
|
// that occurred during the DB operation above.
|
|
t.mu.Lock()
|
|
t.workspaces = make(map[uuid.UUID]struct{})
|
|
t.users = make(map[uuid.UUID]struct{})
|
|
for id := range t.workspacesDelta {
|
|
t.workspaces[id] = struct{}{}
|
|
}
|
|
for id := range t.usersDelta {
|
|
t.users[id] = struct{}{}
|
|
}
|
|
t.mu.Unlock()
|
|
|
|
return err
|
|
}
|
|
|
|
// StartFlushLoop begins the periodic flush loop that writes accumulated stats
|
|
// to the database. It blocks until the context is canceled. Flushes every
|
|
// minute to keep stats reasonably fresh for telemetry collection (which runs
|
|
// every 30 minutes by default) without excessive DB writes.
|
|
func (t *Tracker) StartFlushLoop(ctx context.Context, log slog.Logger, db database.Store, replicaID uuid.UUID) {
|
|
ticker := time.NewTicker(time.Minute)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
if err := t.FlushToDB(ctx, db, replicaID); err != nil {
|
|
log.Warn(ctx, "failed to flush boundary usage stats", slog.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}
|