coder/coderd/workspacestats/tracker.go

package workspacestats

import (
	"bytes"
	"context"
	"flag"
	"os"
	"sort"
	"sync"
	"time"

	"github.com/google/uuid"

	"cdr.dev/slog/v3"
	"cdr.dev/slog/v3/sloggers/sloghuman"
	"github.com/coder/coder/v2/coderd/database"
	"github.com/coder/coder/v2/coderd/database/dbauthz"
)

// DefaultFlushInterval is the flush interval NewTracker falls back to when
// no option has configured a tick source.
var DefaultFlushInterval = 60 * time.Second

// Store is the subset of database.Store that UsageTracker needs.
type Store interface {
	// BatchUpdateWorkspaceLastUsedAt is called by the tracker on every
	// non-empty flush, with the flush time and the workspace IDs to update.
	BatchUpdateWorkspaceLastUsedAt(context.Context, database.BatchUpdateWorkspaceLastUsedAtParams) error
}

// UsageTracker tracks and de-bounces updates to workspace usage activity.
// It keeps an internal set of workspace IDs that have been used and
// periodically flushes this to its configured Store. Construct it with
// NewTracker, which also starts the background flush loop.
type UsageTracker struct {
	log         slog.Logger      // destination for the tracker's log output
	flushLock   sync.Mutex       // held around the Store call in flush; guards flushErrors (m has its own lock)
	flushErrors int              // number of consecutive failed flushes; reset on success
	m           *uuidSet         // workspace IDs recorded since the last flush
	s           Store            // where flushes are written
	tickCh      <-chan time.Time // each received value triggers a flush
	stopTick    func()           // stops whatever feeds tickCh; called from Close
	stopCh      chan struct{}    // signals the loop to stop
	stopOnce    sync.Once        // ensures the shutdown sequence in Close runs only once
	doneCh      chan struct{}    // closed by the loop when it exits
	flushCh     chan int         // tests only: receives the ID count after each flush; nil otherwise
}

// NewTracker builds a UsageTracker that flushes to s and starts its
// background flush loop in a new goroutine before returning.
//
// Options are applied in the order given. Unless an option overrides them,
// logs are written to os.Stderr through the sloghuman sink, and a ticker
// firing every DefaultFlushInterval drives the flushes.
//
// It is the caller's responsibility to call Close().
func NewTracker(s Store, opts ...TrackerOption) *UsageTracker {
	tracker := &UsageTracker{
		log:    slog.Make(sloghuman.Sink(os.Stderr)),
		m:      new(uuidSet),
		s:      s,
		stopCh: make(chan struct{}),
		doneCh: make(chan struct{}),
	}
	for i := range opts {
		opts[i](tracker)
	}
	// Install the default ticker only when the options left both the tick
	// channel and its stop function unset.
	if tracker.tickCh == nil && tracker.stopTick == nil {
		defaultTicker := time.NewTicker(DefaultFlushInterval)
		tracker.tickCh, tracker.stopTick = defaultTicker.C, defaultTicker.Stop
	}
	go tracker.loop()
	return tracker
}

// TrackerOption customizes a UsageTracker. NewTracker applies options in the
// order they are passed, before the flush loop is started.
type TrackerOption func(*UsageTracker)

// TrackerWithLogger returns an option that replaces the tracker's logger
// with log.
func TrackerWithLogger(log slog.Logger) TrackerOption {
	return func(tr *UsageTracker) {
		tr.log = log
	}
}

// TrackerWithFlushInterval returns an option that makes the tracker flush
// once every d. The underlying ticker is created when the option is applied
// to a tracker, not when this function is called.
func TrackerWithFlushInterval(d time.Duration) TrackerOption {
	return func(tr *UsageTracker) {
		t := time.NewTicker(d)
		tr.tickCh, tr.stopTick = t.C, t.Stop
	}
}

// TrackerWithTickFlush returns an option that lets a test drive the tracker
// by hand: every value received on tickCh triggers a flush, and after each
// flush the number of workspace IDs it handled is sent on flushCh.
//
// For testing only. The check runs when this function is called (not when
// the option is applied) and panics if the "test.v" flag is not registered.
func TrackerWithTickFlush(tickCh <-chan time.Time, flushCh chan int) TrackerOption {
	inTest := flag.Lookup("test.v") != nil
	if !inTest {
		panic("developer error: WithTickFlush is not to be used outside of tests.")
	}
	// The caller owns tickCh, so there is nothing for the tracker to stop.
	noop := func() {}
	return func(tr *UsageTracker) {
		tr.tickCh, tr.stopTick, tr.flushCh = tickCh, noop, flushCh
	}
}

// Add marks the workspace with the given ID as having been used recently.
// The ID is only recorded in the tracker's in-memory set here; the Store is
// updated later, when the flush loop drains that set. Adding the same ID
// more than once between flushes has no additional effect.
func (tr *UsageTracker) Add(workspaceID uuid.UUID) {
	tr.m.Add(workspaceID)
}

// flush drains the set of recently used workspace IDs and writes now as
// their last_used_at through the Store.
//
// When the tracker has a flushCh (tests only), the number of drained IDs is
// sent on it as the very last step, including when there was nothing to
// flush or the Store call failed. IDs from a failed flush are not re-queued.
//
// The Store call runs under flushLock, so a flush that is called while a
// previous one is still in progress blocks until the previous one completes.
func (tr *UsageTracker) flush(now time.Time) {
	// Take ownership of everything recorded since the previous flush.
	workspaceIDs := tr.m.UniqueAndClear()
	count := len(workspaceIDs)
	if tr.flushCh != nil { // only used for testing
		// Deferred first so that it runs after every other deferred call.
		defer func() { tr.flushCh <- count }()
	}
	if count == 0 {
		tr.log.Debug(context.Background(), "nothing to flush")
		return
	}

	// Bound the Store call with a short-ish timeout so it cannot hang forever.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	// nolint: gocritic // system function
	authCtx := dbauthz.AsSystemRestricted(ctx)

	tr.flushLock.Lock()
	defer tr.flushLock.Unlock()

	params := database.BatchUpdateWorkspaceLastUsedAtParams{
		LastUsedAt: now,
		IDs:        workspaceIDs,
	}
	err := tr.s.BatchUpdateWorkspaceLastUsedAt(authCtx, params)
	if err == nil {
		tr.flushErrors = 0
		tr.log.Info(ctx, "updated workspaces last_used_at", slog.F("count", count), slog.F("now", now))
		return
	}

	// A single failure to flush is likely not a huge problem. If the workspace is still connected at
	// the next iteration, either another coderd instance will likely have this data or the CLI
	// will tell us again that the workspace is in use.
	tr.flushErrors++
	if tr.flushErrors <= 1 {
		tr.log.Warn(ctx, "failed updating workspaces last_used_at", slog.F("count", count), slog.Error(err))
		return
	}
	tr.log.Error(ctx, "multiple failures updating workspaces last_used_at", slog.F("count", count), slog.F("consecutive_errors", tr.flushErrors), slog.Error(err))
	// TODO: if this keeps failing, it indicates a fundamental problem with the database connection.
	// How to surface it correctly to admins besides just screaming into the logs?
}

// loop is the body of the tracker's background goroutine. It flushes once
// for every value received on tickCh, and exits when stopCh is signalled or
// tickCh is closed. On exit it closes doneCh so that Close can return.
//
// If doneCh is already closed on entry (loop started after Close), it logs
// an error and returns immediately.
func (tr *UsageTracker) loop() {
	select {
	case <-tr.doneCh:
		tr.log.Error(context.Background(), "developer error: Loop called after Close")
		return
	default:
		// Not stopped yet; carry on.
	}

	defer func() {
		close(tr.doneCh)
		tr.log.Debug(context.Background(), "workspace usage tracker loop exited")
	}()

	for {
		select {
		case tick, open := <-tr.tickCh:
			if !open {
				return
			}
			// NOTE: last_used_at is set to the time of the flush, not the time
			// at which each workspace was added. Recording exact usage times
			// would need BatchUpdateWorkspacesLastUsedAt to accept a timestamp
			// per ID. Because flushes happen at a regular interval, the flush
			// time is 'close enough' for dormancy (and for autostop, in
			// future).
			tr.flush(tick.UTC())
		case <-tr.stopCh:
			return
		}
	}
}

// Close stops Tracker and returns once Loop has exited.
// After calling Close(), Loop must not be called.
//
// Only the first call performs the shutdown; it is guarded by stopOnce.
// IDs added since the last flush are not flushed on Close. Close always
// returns nil.
func (tr *UsageTracker) Close() error {
	tr.stopOnce.Do(func() {
		// Signal by closing rather than sending: a send on the unbuffered
		// stopCh needs the loop to be receiving, so it would block forever
		// if the loop had already exited because tickCh was closed. A closed
		// channel is always ready to receive from, so the loop still sees
		// the stop request if it is running.
		close(tr.stopCh)
		tr.stopTick()
		<-tr.doneCh
	})
	return nil
}

// uuidSet is a set of UUIDs. Safe for concurrent usage.
// The zero value can be used.
type uuidSet struct {
	// l guards m.
	l sync.Mutex
	// m holds the set members. It is allocated lazily, so it may be nil.
	m map[uuid.UUID]struct{}
}

// Add inserts id into the set, allocating the underlying map on first use.
// Inserting an ID that is already present is a no-op.
func (s *uuidSet) Add(id uuid.UUID) {
	s.l.Lock()
	defer s.l.Unlock()
	if s.m == nil {
		// First insertion into a zero-value set: create the map with id in it.
		s.m = map[uuid.UUID]struct{}{id: {}}
		return
	}
	s.m[id] = struct{}{}
}

// UniqueAndClear returns the unique set of entries in s, sorted in ascending
// byte order, and empties the set. The returned slice is never nil, even
// when s holds no entries.
func (s *uuidSet) UniqueAndClear() []uuid.UUID {
	s.l.Lock()
	defer s.l.Unlock()
	if s.m == nil {
		s.m = make(map[uuid.UUID]struct{})
		return []uuid.UUID{}
	}
	// The final length is known up front, so allocate the backing array once
	// instead of growing it while copying.
	ids := make([]uuid.UUID, 0, len(s.m))
	for id := range s.m {
		ids = append(ids, id)
	}
	// For ease of testing, sort the IDs lexically.
	sort.Slice(ids, func(i, j int) bool {
		// Go arrays support == but not the ordering operators, so compare
		// the raw bytes. See https://github.com/golang/go/issues/61004
		return bytes.Compare(ids[i][:], ids[j][:]) < 0
	})
	// Empty the existing map in place rather than allocating a new one.
	clear(s.m)
	return ids
}