fix(tailnet): preserve DNS hosts across control plane reconnections (#24253)

When the control plane connection drops and reconnects, a new
`tunnelUpdater` is created with empty workspace state. This causes the
in-memory DNS resolver to lose all host records, breaking `.coder` name
resolution until the server sends a fresh workspace snapshot.

If the API is unreachable (e.g., the route goes through a VPN that is
also reconnecting), the snapshot never arrives and DNS stays broken
indefinitely — requiring a full Coder Desktop restart.

Fix: carry workspace state from the previous `tunnelUpdater` to the new
one on reconnect, and immediately re-apply DNS hosts so the resolver
stays populated during the reconnection window.

Fixes https://linear.app/codercom/issue/PLAT-110

<details><summary>Investigation & decision log</summary>

### Root cause analysis

Customer diagnostic data from Roblox (March 31) showed:
- NRPT rule present (`.coder` → `fd60:627a:a42b::53`) — routing is
correct
- DNS resolver returns NXDOMAIN for everything including the sentinel
`is.coder--connect--enabled--right--now.coder` — resolver is running but
has zero host records
- Coder Connect UI shows "connected" — the WireGuard data plane is up

The resolver is empty because
`TunnelAllWorkspaceUpdatesController.New()` creates a fresh
`tunnelUpdater` with `workspaces: make(map[uuid.UUID]*Workspace)`
(empty). The previous updater's workspace data is discarded. If the
server's workspace snapshot is delayed or the API is unreachable, the
resolver has no records to serve.

This is compounded by GlobalProtect VPN reconnects: the Coder API is
behind the VPN, so when GP reconnects, the API route is temporarily lost
and the snapshot can't arrive.

### What this PR changes

- `TunnelAllWorkspaceUpdatesController.New()` now clones workspace state
from the previous updater before creating the new one
- Immediately re-applies DNS hosts with the inherited state (log:
`re-applying DNS hosts from previous session`)
- When the server's snapshot arrives, it replaces the inherited data
normally
- If `SetDNSHosts` fails during re-apply, it's logged as a warning and
not fatal — the recvLoop will program DNS when the snapshot arrives

### What this PR does NOT fix (future work)

- **Tunnel binary restart**: when the tunnel process itself is killed
and relaunched, all in-memory state is lost. A DNS host cache on disk
would be needed for this case.
- **NRPT rule cleanup on startup**: the Tailscale fork's
`nrptRuleDatabase` constructor unconditionally deletes all NRPT rules on
engine creation. Deferring cleanup to the first successful `SetDNS` call
would reduce the DNS gap.
- **Hosts file retry**: the `setHosts()` retry in the Tailscale fork
(5×10ms) is too short for environments where endpoint security locks the
file.

These are tracked as follow-up items in the `coder/tailscale` fork.

</details>

> 🤖 Generated by Coder Agents
This commit is contained in:
Garrett Delfosse
2026-04-29 12:29:44 -04:00
committed by GitHub
parent a24dc19d49
commit 54d650ea79
2 changed files with 261 additions and 1 deletions
+51 -1
View File
@@ -986,6 +986,30 @@ func (a *Agent) Clone() Agent {
func (t *TunnelAllWorkspaceUpdatesController) New(client WorkspaceUpdatesClient) CloserWaiter {
t.mu.Lock()
defer t.mu.Unlock()
// Preserve workspace state from the previous updater so that DNS
// hosts remain programmed while we wait for the new server
// snapshot. Without this, a control-plane reconnection would
// leave the internal DNS resolver empty until the first
// workspace update arrives, breaking .coder resolution.
var previousWorkspaces map[uuid.UUID]*Workspace
if t.updater != nil {
t.updater.Lock()
if len(t.updater.workspaces) > 0 {
previousWorkspaces = make(map[uuid.UUID]*Workspace, len(t.updater.workspaces))
for id, w := range t.updater.workspaces {
clone := w.Clone()
previousWorkspaces[id] = &clone
}
}
t.updater.Unlock()
}
workspaces := make(map[uuid.UUID]*Workspace)
if previousWorkspaces != nil {
workspaces = previousWorkspaces
}
updater := &tunnelUpdater{
client: client,
errChan: make(chan error, 1),
@@ -996,8 +1020,26 @@ func (t *TunnelAllWorkspaceUpdatesController) New(client WorkspaceUpdatesClient)
updateHandler: t.updateHandler,
ownerUsername: t.ownerUsername,
recvLoopDone: make(chan struct{}),
workspaces: make(map[uuid.UUID]*Workspace),
workspaces: workspaces,
}
// If we inherited workspace state, immediately re-program DNS
// hosts so the resolver stays populated during the reconnection
// window.
if len(previousWorkspaces) > 0 {
dnsNames := updater.updateDNSNamesLocked()
if updater.dnsHostsSetter != nil {
t.logger.Debug(context.Background(), "re-applying DNS hosts from previous session",
slog.F("num_hosts", len(dnsNames)),
)
if err := updater.dnsHostsSetter.SetDNSHosts(dnsNames); err != nil {
t.logger.Warn(context.Background(), "failed to re-apply DNS hosts from previous session",
slog.Error(err),
)
}
}
}
t.updater = updater
go t.updater.recvLoop()
return t.updater
@@ -1157,6 +1199,14 @@ func (t *tunnelUpdater) handleUpdate(update *proto.WorkspaceUpdate, updateKind U
t.Lock()
defer t.Unlock()
// Snapshots represent the complete state, not a diff. Clear any
// inherited or previously accumulated workspace data so that
// workspaces absent from the snapshot (e.g. stopped while we
// were disconnected) do not linger.
if updateKind == Snapshot {
t.workspaces = make(map[uuid.UUID]*Workspace)
}
currentUpdate := WorkspaceUpdate{
UpsertedWorkspaces: []*Workspace{},
UpsertedAgents: []*Agent{},
+210
View File
@@ -2023,6 +2023,216 @@ func TestTunnelAllWorkspaceUpdatesController_HandleErrors(t *testing.T) {
}
}
func TestTunnelAllWorkspaceUpdatesController_ReconnectPreservesDNS(t *testing.T) {
t.Parallel()
ctx := testutil.Context(t, testutil.WaitShort)
logger := testutil.Logger(t)
fConn := &fakeCoordinatee{}
tsc := tailnet.NewTunnelSrcCoordController(logger, fConn)
fDNS := newFakeDNSSetter(ctx, t)
uut := tailnet.NewTunnelAllWorkspaceUpdatesController(logger, tsc,
tailnet.WithDNS(fDNS, "testy", tailnet.DNSNameOptions{Suffix: "mctest"}),
)
// Connect coordinator
coordC := newFakeCoordinatorClient(ctx, t)
coordCW := tsc.New(coordC)
// Connect first workspace update client and send a snapshot
w1ID := testUUID(1)
w1a1ID := testUUID(1, 1)
updateC1 := newFakeWorkspaceUpdateClient(ctx, t)
updateCW1 := uut.New(updateC1)
initUp := &proto.WorkspaceUpdate{
UpsertedWorkspaces: []*proto.Workspace{
{Id: w1ID[:], Name: "w1"},
},
UpsertedAgents: []*proto.Agent{
{Id: w1a1ID[:], Name: "w1a1", WorkspaceId: w1ID[:]},
},
}
upRecvCall := testutil.TryReceive(ctx, t, updateC1.recv)
testutil.RequireSend(ctx, t, upRecvCall.resp, initUp)
// Consume the AddTunnel coordination call
coordCall := testutil.TryReceive(ctx, t, coordC.reqs)
testutil.RequireSend(ctx, t, coordCall.err, nil)
// Consume the initial DNS set call
dnsCall := testutil.TryReceive(ctx, t, fDNS.calls)
require.NotEmpty(t, dnsCall.hosts)
initialHosts := dnsCall.hosts
testutil.RequireSend(ctx, t, dnsCall.err, nil)
// Simulate disconnect: hang up the first workspace update client
upRecvCall = testutil.TryReceive(ctx, t, updateC1.recv)
testutil.RequireSend(ctx, t, upRecvCall.err, io.EOF)
err := testutil.TryReceive(ctx, t, updateCW1.Wait())
require.ErrorIs(t, err, io.EOF)
// Reconnect with a new workspace update client. The controller
// should carry over workspace state and immediately re-apply DNS.
updateC2 := newFakeWorkspaceUpdateClient(ctx, t)
// New() will call SetDNSHosts synchronously with the inherited
// hosts. We need to consume that call concurrently because the
// fake DNS setter blocks until the test reads from the channel.
reconnDNSHosts := make(chan map[dnsname.FQDN][]netip.Addr, 1)
go func() {
call := testutil.TryReceive(ctx, t, fDNS.calls)
reconnDNSHosts <- call.hosts
testutil.RequireSend(ctx, t, call.err, nil)
}()
updateCW2 := uut.New(updateC2)
// Verify DNS was re-applied with the previously known hosts
// before any server snapshot arrives on the new client.
gotHosts := testutil.TryReceive(ctx, t, reconnDNSHosts)
require.Equal(t, initialHosts, gotHosts)
// Cleanup: tear down the second workspace update client
upRecvCall = testutil.TryReceive(ctx, t, updateC2.recv)
testutil.RequireSend(ctx, t, upRecvCall.err, io.EOF)
err = testutil.TryReceive(ctx, t, updateCW2.Wait())
require.ErrorIs(t, err, io.EOF)
// Cleanup: tear down coordinator
coordRecv := testutil.TryReceive(ctx, t, coordC.resps)
testutil.RequireSend(ctx, t, coordRecv.err, io.EOF)
cCall := testutil.TryReceive(ctx, t, coordC.close)
testutil.RequireSend(ctx, t, cCall, nil)
err = testutil.TryReceive(ctx, t, coordCW.Wait())
require.ErrorIs(t, err, io.EOF)
}
func TestTunnelAllWorkspaceUpdatesController_SnapshotClearsInheritedState(t *testing.T) {
t.Parallel()
ctx := testutil.Context(t, testutil.WaitShort)
logger := testutil.Logger(t)
fConn := &fakeCoordinatee{}
tsc := tailnet.NewTunnelSrcCoordController(logger, fConn)
fDNS := newFakeDNSSetter(ctx, t)
uut := tailnet.NewTunnelAllWorkspaceUpdatesController(logger, tsc,
tailnet.WithDNS(fDNS, "testy", tailnet.DNSNameOptions{Suffix: "mctest"}),
)
coordC := newFakeCoordinatorClient(ctx, t)
coordCW := tsc.New(coordC)
// First connection: send snapshot with two workspaces.
w1ID := testUUID(1)
w1a1ID := testUUID(1, 1)
w2ID := testUUID(2)
w2a1ID := testUUID(2, 1)
updateC1 := newFakeWorkspaceUpdateClient(ctx, t)
updateCW1 := uut.New(updateC1)
initUp := &proto.WorkspaceUpdate{
UpsertedWorkspaces: []*proto.Workspace{
{Id: w1ID[:], Name: "w1"},
{Id: w2ID[:], Name: "w2"},
},
UpsertedAgents: []*proto.Agent{
{Id: w1a1ID[:], Name: "w1a1", WorkspaceId: w1ID[:]},
{Id: w2a1ID[:], Name: "w2a1", WorkspaceId: w2ID[:]},
},
}
upRecvCall := testutil.TryReceive(ctx, t, updateC1.recv)
testutil.RequireSend(ctx, t, upRecvCall.resp, initUp)
// Consume AddTunnel coordination calls for both agents.
for range 2 {
coordCall := testutil.TryReceive(ctx, t, coordC.reqs)
testutil.RequireSend(ctx, t, coordCall.err, nil)
}
// Consume the initial DNS set call.
dnsCall := testutil.TryReceive(ctx, t, fDNS.calls)
require.NotEmpty(t, dnsCall.hosts)
initialHosts := dnsCall.hosts
testutil.RequireSend(ctx, t, dnsCall.err, nil)
// Verify both workspaces are in the DNS hosts.
var initialHostCount int
for range initialHosts {
initialHostCount++
}
require.Greater(t, initialHostCount, 0)
// Simulate disconnect.
upRecvCall = testutil.TryReceive(ctx, t, updateC1.recv)
testutil.RequireSend(ctx, t, upRecvCall.err, io.EOF)
err := testutil.TryReceive(ctx, t, updateCW1.Wait())
require.ErrorIs(t, err, io.EOF)
// Reconnect. New() re-applies inherited DNS synchronously.
updateC2 := newFakeWorkspaceUpdateClient(ctx, t)
reconnDNSHosts := make(chan map[dnsname.FQDN][]netip.Addr, 1)
go func() {
call := testutil.TryReceive(ctx, t, fDNS.calls)
reconnDNSHosts <- call.hosts
testutil.RequireSend(ctx, t, call.err, nil)
}()
updateCW2 := uut.New(updateC2)
// The inherited DNS should have both workspaces.
gotHosts := testutil.TryReceive(ctx, t, reconnDNSHosts)
require.Equal(t, initialHosts, gotHosts)
// Server sends a snapshot that only contains w2 (w1 stopped
// while we were disconnected). Because it is a snapshot, w1
// must be removed from the updater state.
snapshotUp := &proto.WorkspaceUpdate{
UpsertedWorkspaces: []*proto.Workspace{
{Id: w2ID[:], Name: "w2"},
},
UpsertedAgents: []*proto.Agent{
{Id: w2a1ID[:], Name: "w2a1", WorkspaceId: w2ID[:]},
},
}
upRecvCall = testutil.TryReceive(ctx, t, updateC2.recv)
testutil.RequireSend(ctx, t, upRecvCall.resp, snapshotUp)
// The snapshot should trigger a RemoveTunnel for w1's agent and
// an AddTunnel for w2's agent (since state was cleared and rebuilt).
// Consume all coordination calls.
coordCall := testutil.TryReceive(ctx, t, coordC.reqs)
testutil.RequireSend(ctx, t, coordCall.err, nil)
// Consume the DNS update after the snapshot.
dnsCall = testutil.TryReceive(ctx, t, fDNS.calls)
require.NotEmpty(t, dnsCall.hosts)
testutil.RequireSend(ctx, t, dnsCall.err, nil)
// After the snapshot, DNS should only contain w2 entries. The
// stale w1 entries must be gone.
for fqdn := range dnsCall.hosts {
require.NotContains(t, string(fqdn), "w1",
"stale workspace w1 should not be in DNS after snapshot")
}
// Cleanup.
upRecvCall = testutil.TryReceive(ctx, t, updateC2.recv)
testutil.RequireSend(ctx, t, upRecvCall.err, io.EOF)
err = testutil.TryReceive(ctx, t, updateCW2.Wait())
require.ErrorIs(t, err, io.EOF)
coordRecv := testutil.TryReceive(ctx, t, coordC.resps)
testutil.RequireSend(ctx, t, coordRecv.err, io.EOF)
cCall := testutil.TryReceive(ctx, t, coordC.close)
testutil.RequireSend(ctx, t, cCall, nil)
err = testutil.TryReceive(ctx, t, coordCW.Wait())
require.ErrorIs(t, err, io.EOF)
}
func TestBasicDERPController_RewriteDERPMap(t *testing.T) {
t.Parallel()
ctx := testutil.Context(t, testutil.WaitShort)