mirror of
https://github.com/coder/coder.git
synced 2026-06-03 04:58:23 +00:00
90aeea5649
Previously, two issues could cause incorrect boundary usage telemetry data. 1. Bad handling across snapshot intervals: after the telemetry snapshot deleted the DB row, the next flush would INSERT the stale cumulative data (which included already-reported usage). That row would then be overwritten by subsequent UPDATE flushes, causing the delta between the last snapshot and the reset to be lost (under-reporting usage). Additionally, if there was no new usage after the reset, the tracker would carry over all usage from the previous period into the next period (over-reporting usage). 2. Missed usage from a race condition: Track() calls made between the first mutex unlock and the second mutex lock in FlushToDB() were lost. Their data wasn't included in the current flush (the snapshot had already been taken) and was wiped by the subsequent reset. This likely has a low impact on overall usage numbers in the real world. The fix tracks unique workspace/user deltas separately from cumulative values and always tracks allowed/denied requests as deltas. Deltas are used for INSERT (fresh start after reset), cumulative values for UPDATE (accurate unique counts within a period). All counters are reset atomically before the DB operation so that Track() calls made during the operation are preserved for the next flush.
143 lines
4.9 KiB
Go
143 lines
4.9 KiB
Go
package boundaryusage
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
)
|
|
|
|
// Tracker tracks boundary usage for telemetry reporting.
|
|
//
|
|
// Unique user/workspace counts are tracked both cumulatively and as deltas since
|
|
// the last flush. The delta is needed because when a new telemetry period starts
|
|
// (the DB row is deleted), we must only insert data accumulated since the last
|
|
// flush. If we used cumulative values, stale data from the previous period would
|
|
// be written to the new row and then lost when subsequent updates overwrite it.
|
|
//
|
|
// Request counts are tracked as deltas and accumulated in the database.
|
|
type Tracker struct {
|
|
mu sync.Mutex
|
|
|
|
// Cumulative unique counts for the current period (used on UPDATE to
|
|
// replace the DB value with accurate totals).
|
|
workspaces map[uuid.UUID]struct{}
|
|
users map[uuid.UUID]struct{}
|
|
|
|
// Delta unique counts since last flush (used on INSERT to avoid writing
|
|
// stale data from the previous period).
|
|
workspacesDelta map[uuid.UUID]struct{}
|
|
usersDelta map[uuid.UUID]struct{}
|
|
|
|
// Request deltas (always reset when flushing, accumulated in DB).
|
|
allowedRequests int64
|
|
deniedRequests int64
|
|
|
|
usageSinceLastFlush bool
|
|
}
|
|
|
|
// NewTracker creates a new boundary usage tracker.
|
|
func NewTracker() *Tracker {
|
|
return &Tracker{
|
|
workspaces: make(map[uuid.UUID]struct{}),
|
|
users: make(map[uuid.UUID]struct{}),
|
|
workspacesDelta: make(map[uuid.UUID]struct{}),
|
|
usersDelta: make(map[uuid.UUID]struct{}),
|
|
}
|
|
}
|
|
|
|
// Track records boundary usage for a workspace.
|
|
func (t *Tracker) Track(workspaceID, ownerID uuid.UUID, allowed, denied int64) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
t.workspaces[workspaceID] = struct{}{}
|
|
t.users[ownerID] = struct{}{}
|
|
t.workspacesDelta[workspaceID] = struct{}{}
|
|
t.usersDelta[ownerID] = struct{}{}
|
|
t.allowedRequests += allowed
|
|
t.deniedRequests += denied
|
|
t.usageSinceLastFlush = true
|
|
}
|
|
|
|
// FlushToDB writes stats to the database. For unique counts, cumulative values
|
|
// are used on UPDATE (replacing the DB value) while delta values are used on
|
|
// INSERT (starting fresh). Request counts are always deltas, accumulated in DB.
|
|
// All deltas are reset immediately after snapshot so Track() calls during the
|
|
// DB operation are preserved for the next flush.
|
|
func (t *Tracker) FlushToDB(ctx context.Context, db database.Store, replicaID uuid.UUID) error {
|
|
t.mu.Lock()
|
|
if !t.usageSinceLastFlush {
|
|
t.mu.Unlock()
|
|
return nil
|
|
}
|
|
|
|
// Snapshot all values.
|
|
workspaceCount := int64(len(t.workspaces)) // cumulative, for UPDATE
|
|
userCount := int64(len(t.users)) // cumulative, for UPDATE
|
|
workspaceDelta := int64(len(t.workspacesDelta)) // delta, for INSERT
|
|
userDelta := int64(len(t.usersDelta)) // delta, for INSERT
|
|
allowed := t.allowedRequests // delta, accumulated in DB
|
|
denied := t.deniedRequests // delta, accumulated in DB
|
|
|
|
// Reset all deltas immediately so Track() calls during the DB operation
|
|
// below are preserved for the next flush.
|
|
t.workspacesDelta = make(map[uuid.UUID]struct{})
|
|
t.usersDelta = make(map[uuid.UUID]struct{})
|
|
t.allowedRequests = 0
|
|
t.deniedRequests = 0
|
|
t.usageSinceLastFlush = false
|
|
t.mu.Unlock()
|
|
|
|
//nolint:gocritic // This is the actual package doing boundary usage tracking.
|
|
_, err := db.UpsertBoundaryUsageStats(dbauthz.AsBoundaryUsageTracker(ctx), database.UpsertBoundaryUsageStatsParams{
|
|
ReplicaID: replicaID,
|
|
UniqueWorkspacesCount: workspaceCount, // cumulative, for UPDATE
|
|
UniqueUsersCount: userCount, // cumulative, for UPDATE
|
|
UniqueWorkspacesDelta: workspaceDelta, // delta, for INSERT
|
|
UniqueUsersDelta: userDelta, // delta, for INSERT
|
|
AllowedRequests: allowed,
|
|
DeniedRequests: denied,
|
|
})
|
|
|
|
// Always reset cumulative counts to prevent unbounded memory growth (e.g.
|
|
// if the DB is unreachable). Copy delta maps to preserve any Track() calls
|
|
// that occurred during the DB operation above.
|
|
t.mu.Lock()
|
|
t.workspaces = make(map[uuid.UUID]struct{})
|
|
t.users = make(map[uuid.UUID]struct{})
|
|
for id := range t.workspacesDelta {
|
|
t.workspaces[id] = struct{}{}
|
|
}
|
|
for id := range t.usersDelta {
|
|
t.users[id] = struct{}{}
|
|
}
|
|
t.mu.Unlock()
|
|
|
|
return err
|
|
}
|
|
|
|
// StartFlushLoop begins the periodic flush loop that writes accumulated stats
|
|
// to the database. It blocks until the context is canceled. Flushes every
|
|
// minute to keep stats reasonably fresh for telemetry collection (which runs
|
|
// every 30 minutes by default) without excessive DB writes.
|
|
func (t *Tracker) StartFlushLoop(ctx context.Context, log slog.Logger, db database.Store, replicaID uuid.UUID) {
|
|
ticker := time.NewTicker(time.Minute)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
if err := t.FlushToDB(ctx, db, replicaID); err != nil {
|
|
log.Warn(ctx, "failed to flush boundary usage stats", slog.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}
|