fix: wait for server tailnet background routines to exit on Close (#15183)

fixes https://github.com/coder/internal/issues/114

We need to wait for the ServerTailnet background goroutines to finish when closing down; otherwise, we can race with the shutdown of coderd and the coordinator, which causes errors.
This commit is contained in:
Spike Curtis
2024-10-23 10:09:56 +04:00
committed by GitHub
parent 343f8ec9ab
commit 32d5875fa4
2 changed files with 24 additions and 9 deletions
+22 -9
View File
@@ -91,13 +91,15 @@ func NewServerTailnet(
})
}
derpMapUpdaterClosed := make(chan struct{})
bgRoutines := &sync.WaitGroup{}
originalDerpMap := derpMapFn()
// it's important to set the DERPRegionDialer above _before_ we set the DERP map so that if
// there is an embedded relay, we use the local in-memory dialer.
conn.SetDERPMap(originalDerpMap)
bgRoutines.Add(1)
go func() {
defer close(derpMapUpdaterClosed)
defer bgRoutines.Done()
defer logger.Debug(ctx, "polling DERPMap exited")
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
@@ -120,7 +122,7 @@ func NewServerTailnet(
tn := &ServerTailnet{
ctx: serverCtx,
cancel: cancel,
derpMapUpdaterClosed: derpMapUpdaterClosed,
bgRoutines: bgRoutines,
logger: logger,
tracer: traceProvider.Tracer(tracing.TracerName),
conn: conn,
@@ -170,8 +172,15 @@ func NewServerTailnet(
// registering the callback also triggers send of the initial node
tn.coordinatee.SetNodeCallback(tn.nodeCallback)
go tn.watchAgentUpdates()
go tn.expireOldAgents()
tn.bgRoutines.Add(2)
go func() {
defer tn.bgRoutines.Done()
tn.watchAgentUpdates()
}()
go func() {
defer tn.bgRoutines.Done()
tn.expireOldAgents()
}()
return tn, nil
}
@@ -204,6 +213,7 @@ func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) {
}
func (s *ServerTailnet) expireOldAgents() {
defer s.logger.Debug(s.ctx, "stopped expiring old agents")
const (
tick = 5 * time.Minute
cutoff = 30 * time.Minute
@@ -255,6 +265,7 @@ func (s *ServerTailnet) doExpireOldAgents(cutoff time.Duration) {
}
func (s *ServerTailnet) watchAgentUpdates() {
defer s.logger.Debug(s.ctx, "stopped watching agent updates")
for {
conn := s.getAgentConn()
resp, ok := conn.NextUpdate(s.ctx)
@@ -317,9 +328,9 @@ func (s *ServerTailnet) reinitCoordinator() {
}
type ServerTailnet struct {
ctx context.Context
cancel func()
derpMapUpdaterClosed chan struct{}
ctx context.Context
cancel func()
bgRoutines *sync.WaitGroup
logger slog.Logger
tracer trace.Tracer
@@ -532,10 +543,12 @@ func (c *netConnCloser) Close() error {
}
// Close shuts down the server tailnet: it invokes the cancel func, closes the
// underlying conn, closes idle connections on the transport, and then blocks
// until the background goroutines have exited. It always returns nil.
//
// NOTE(review): this listing looks like a diff rendered without +/- markers;
// the receive on derpMapUpdaterClosed is likely the pre-change line that
// bgRoutines.Wait() replaces — confirm against the actual file.
func (s *ServerTailnet) Close() error {
s.logger.Info(s.ctx, "closing server tailnet")
// Deferred so the completion message is logged only after all waits below return.
defer s.logger.Debug(s.ctx, "server tailnet close complete")
// Presumably cancels s.ctx, which the background goroutines use — verify in NewServerTailnet.
s.cancel()
// The error returned by conn.Close is ignored.
_ = s.conn.Close()
s.transport.CloseIdleConnections()
// Blocks until the DERP map polling goroutine closes this channel on exit.
<-s.derpMapUpdaterClosed
// Blocks until every goroutine registered on bgRoutines has called Done.
s.bgRoutines.Wait()
return nil
}
+2
View File
@@ -419,6 +419,8 @@ func (s *Server) RegisterNow() error {
}
func (s *Server) Close() error {
s.Logger.Info(s.ctx, "closing workspace proxy server")
defer s.Logger.Debug(s.ctx, "finished closing workspace proxy server")
s.cancel()
var err error