mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
d0e71d20fd
The stats reporter only installed the connstats callback on the TUN device after the report loop negotiated an interval with the server. Traffic that flowed before that point (e.g. an SSH handshake) was silently dropped from tracking because the TUN wrapper's stats.Load() returned nil. On macOS CI under scheduler pressure, this window was wide enough for the entire SSH handshake to complete untracked. Install the connstats callback immediately in newStatsReporter using the interval from agent.Options.StatsReportInterval. The default (5 minutes) matches the server-side default, so the report loop does not replace the tracker on the common path. Tests set it to the agenttest.StatsInterval (500ms) to match the fake server, also avoiding replacement. If the server returns a different interval, the existing reconciliation in reportLocked handles it. Add a StatsCallbackRace subtest to TestAgent_Stats_SSH that uses a channel barrier to prove the fix is load-bearing: on unfixed code the barrier blocks reportLoop while SSH connects (traffic lost), on fixed code the callback is already installed at creation (traffic captured). Fixes flaky TestAgent_Stats_SSH, TestAgent_Stats_ReconnectingPTY, and TestAgent_Stats_Magic by ensuring the connstats callback is always installed before network traffic can flow. Closes https://github.com/coder/internal/issues/505
148 lines
4.5 KiB
Go
148 lines
4.5 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"maps"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/xerrors"
|
|
"tailscale.com/types/netlogtype"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/agent/proto"
|
|
)
|
|
|
|
// maxConns is the connection limit handed to the networkStatsSource every time
// the stats callback is installed via SetConnStatsCallback.
const maxConns = 2048
|
|
|
|
type networkStatsSource interface {
|
|
SetConnStatsCallback(maxPeriod time.Duration, maxConns int, dump func(start, end time.Time, virtual, physical map[netlogtype.Connection]netlogtype.Counts))
|
|
}
|
|
|
|
type statsCollector interface {
|
|
Collect(ctx context.Context, networkStats map[netlogtype.Connection]netlogtype.Counts) *proto.Stats
|
|
}
|
|
|
|
type statsDest interface {
|
|
UpdateStats(ctx context.Context, req *proto.UpdateStatsRequest) (*proto.UpdateStatsResponse, error)
|
|
}
|
|
|
|
// statsReporter is a subcomponent of the agent that handles registering the stats callback on the
|
|
// networkStatsSource (tailnet.Conn in prod), handling the callback, calling back to the
|
|
// statsCollector (agent in prod) to collect additional stats, then sending the update to the
|
|
// statsDest (agent API in prod)
|
|
type statsReporter struct {
|
|
*sync.Cond
|
|
networkStats map[netlogtype.Connection]netlogtype.Counts
|
|
unreported bool
|
|
lastInterval time.Duration
|
|
|
|
source networkStatsSource
|
|
collector statsCollector
|
|
logger slog.Logger
|
|
}
|
|
|
|
// DefaultStatsReportInterval is the default period between stats reports. It
// matches coderd.Options.AgentStatsRefreshInterval.
const DefaultStatsReportInterval = 5 * time.Minute
|
|
|
|
func newStatsReporter(logger slog.Logger, source networkStatsSource, collector statsCollector, interval time.Duration) *statsReporter {
|
|
s := &statsReporter{
|
|
Cond: sync.NewCond(&sync.Mutex{}),
|
|
logger: logger,
|
|
source: source,
|
|
collector: collector,
|
|
lastInterval: interval,
|
|
}
|
|
// Install the callback immediately so traffic is tracked before
|
|
// reportLoop starts. reportLoop replaces it only if the
|
|
// server-negotiated interval differs.
|
|
source.SetConnStatsCallback(interval, maxConns, s.callback)
|
|
return s
|
|
}
|
|
|
|
func (s *statsReporter) callback(_, _ time.Time, virtual, _ map[netlogtype.Connection]netlogtype.Counts) {
|
|
s.L.Lock()
|
|
defer s.L.Unlock()
|
|
s.logger.Debug(context.Background(), "got stats callback")
|
|
// Accumulate stats until they've been reported.
|
|
if s.unreported && len(s.networkStats) > 0 {
|
|
for k, v := range virtual {
|
|
s.networkStats[k] = s.networkStats[k].Add(v)
|
|
}
|
|
} else {
|
|
s.networkStats = maps.Clone(virtual)
|
|
s.unreported = true
|
|
}
|
|
s.Broadcast()
|
|
}
|
|
|
|
// reportLoop reports collected stats to the server.
|
|
//
|
|
// The connstats callback is already installed by newStatsReporter;
|
|
// reportLoop only replaces it if the server returns a different interval.
|
|
//
|
|
// It's intended to be called within the larger retry loop that establishes a
|
|
// connection to the agent API, then passes that connection to go routines like
|
|
// this that use it. There is no retry and we fail on the first error since
|
|
// this will be inside a larger retry loop.
|
|
func (s *statsReporter) reportLoop(ctx context.Context, dest statsDest) error {
|
|
// send an initial, blank report to get the interval
|
|
resp, err := dest.UpdateStats(ctx, &proto.UpdateStatsRequest{})
|
|
if err != nil {
|
|
return xerrors.Errorf("initial update: %w", err)
|
|
}
|
|
interval := resp.ReportInterval.AsDuration()
|
|
if interval != s.lastInterval {
|
|
s.lastInterval = interval
|
|
s.source.SetConnStatsCallback(s.lastInterval, maxConns, s.callback)
|
|
}
|
|
|
|
// use a separate goroutine to monitor the context so that we notice immediately, rather than
|
|
// waiting for the next callback (which might never come if we are closing!)
|
|
ctxDone := false
|
|
go func() {
|
|
<-ctx.Done()
|
|
s.L.Lock()
|
|
defer s.L.Unlock()
|
|
ctxDone = true
|
|
s.Broadcast()
|
|
}()
|
|
defer s.logger.Debug(ctx, "reportLoop exiting")
|
|
|
|
s.L.Lock()
|
|
defer s.L.Unlock()
|
|
for {
|
|
for !s.unreported && !ctxDone {
|
|
s.Wait()
|
|
}
|
|
if ctxDone {
|
|
return nil
|
|
}
|
|
s.unreported = false
|
|
if err = s.reportLocked(ctx, dest, s.networkStats); err != nil {
|
|
return xerrors.Errorf("report stats: %w", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (s *statsReporter) reportLocked(
|
|
ctx context.Context, dest statsDest, networkStats map[netlogtype.Connection]netlogtype.Counts,
|
|
) error {
|
|
// here we want to do our collecting/reporting while it is unlocked, but then relock
|
|
// when we return to reportLoop.
|
|
s.L.Unlock()
|
|
defer s.L.Lock()
|
|
stats := s.collector.Collect(ctx, networkStats)
|
|
resp, err := dest.UpdateStats(ctx, &proto.UpdateStatsRequest{Stats: stats})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
interval := resp.GetReportInterval().AsDuration()
|
|
if interval != s.lastInterval {
|
|
s.logger.Info(ctx, "new stats report interval", slog.F("interval", interval))
|
|
s.lastInterval = interval
|
|
s.source.SetConnStatsCallback(s.lastInterval, maxConns, s.callback)
|
|
}
|
|
return nil
|
|
}
|