mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: modify coordinators to send errors and peers to log them (#17467)
Adds support to our coordinator implementations to send Error updates before disconnecting clients. I was recently debugging a connection issue where the client was getting repeatedly disconnected from the Coordinator, but since we never send any error information it was really hard without server logs. This PR aims to correct that, by sending a CoordinateResponse with `Error` set in cases where we disconnect a client without them asking us to. It also logs the error whenever we get one in the client controller.
This commit is contained in:
@@ -284,6 +284,17 @@ func (c *BasicCoordination) respLoop() {
|
||||
return
|
||||
}
|
||||
|
||||
if resp.Error != "" {
|
||||
// ReadyForHandshake error can occur during race conditions, where we send a ReadyForHandshake message,
|
||||
// but the source has already disconnected from the tunnel by the time we do. So, just log at warning.
|
||||
if strings.HasPrefix(resp.Error, ReadyForHandshakeError) {
|
||||
c.logger.Warn(context.Background(), "coordination warning", slog.F("msg", resp.Error))
|
||||
} else {
|
||||
c.logger.Error(context.Background(),
|
||||
"coordination protocol error", slog.F("error", resp.Error))
|
||||
}
|
||||
}
|
||||
|
||||
err = c.coordinatee.UpdatePeers(resp.GetPeerUpdates())
|
||||
if err != nil {
|
||||
c.logger.Debug(context.Background(), "failed to update peers", slog.Error(err))
|
||||
|
||||
+49
-15
@@ -24,7 +24,10 @@ const (
|
||||
// dropping updates
|
||||
ResponseBufferSize = 512
|
||||
// RequestBufferSize is the max number of requests to buffer per connection
|
||||
RequestBufferSize = 32
|
||||
RequestBufferSize = 32
|
||||
CloseErrOverwritten = "peer ID overwritten by new connection"
|
||||
CloseErrCoordinatorClose = "coordinator closed"
|
||||
ReadyForHandshakeError = "ready for handshake error"
|
||||
)
|
||||
|
||||
// Coordinator exchanges nodes with agents to establish connections.
|
||||
@@ -97,6 +100,18 @@ var (
|
||||
ErrAlreadyRemoved = xerrors.New("already removed")
|
||||
)
|
||||
|
||||
type AuthorizationError struct {
|
||||
Wrapped error
|
||||
}
|
||||
|
||||
func (e AuthorizationError) Error() string {
|
||||
return fmt.Sprintf("authorization: %s", e.Wrapped.Error())
|
||||
}
|
||||
|
||||
func (e AuthorizationError) Unwrap() error {
|
||||
return e.Wrapped
|
||||
}
|
||||
|
||||
// NewCoordinator constructs a new in-memory connection coordinator. This
|
||||
// coordinator is incompatible with multiple Coder replicas as all node data is
|
||||
// in-memory.
|
||||
@@ -161,8 +176,12 @@ func (c *coordinator) Coordinate(
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
p.reqLoop(ctx, logger, c.core.handleRequest)
|
||||
err := c.core.lostPeer(p)
|
||||
loopErr := p.reqLoop(ctx, logger, c.core.handleRequest)
|
||||
closeErrStr := ""
|
||||
if loopErr != nil {
|
||||
closeErrStr = loopErr.Error()
|
||||
}
|
||||
err := c.core.lostPeer(p, closeErrStr)
|
||||
if xerrors.Is(err, ErrClosed) || xerrors.Is(err, ErrAlreadyRemoved) {
|
||||
return
|
||||
}
|
||||
@@ -227,7 +246,7 @@ func (c *core) handleRequest(ctx context.Context, p *peer, req *proto.Coordinate
|
||||
}
|
||||
|
||||
if err := pr.auth.Authorize(ctx, req); err != nil {
|
||||
return xerrors.Errorf("authorize request: %w", err)
|
||||
return AuthorizationError{Wrapped: err}
|
||||
}
|
||||
|
||||
if req.UpdateSelf != nil {
|
||||
@@ -270,7 +289,7 @@ func (c *core) handleRequest(ctx context.Context, p *peer, req *proto.Coordinate
|
||||
}
|
||||
}
|
||||
if req.Disconnect != nil {
|
||||
c.removePeerLocked(p.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "graceful disconnect")
|
||||
c.removePeerLocked(p.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "graceful disconnect", "")
|
||||
}
|
||||
if rfhs := req.ReadyForHandshake; rfhs != nil {
|
||||
err := c.handleReadyForHandshakeLocked(pr, rfhs)
|
||||
@@ -298,7 +317,7 @@ func (c *core) handleReadyForHandshakeLocked(src *peer, rfhs []*proto.Coordinate
|
||||
// don't want to kill its connection.
|
||||
select {
|
||||
case src.resps <- &proto.CoordinateResponse{
|
||||
Error: fmt.Sprintf("you do not share a tunnel with %q", dstID.String()),
|
||||
Error: fmt.Sprintf("%s: you do not share a tunnel with %q", ReadyForHandshakeError, dstID.String()),
|
||||
}:
|
||||
default:
|
||||
return ErrWouldBlock
|
||||
@@ -344,7 +363,7 @@ func (c *core) updateTunnelPeersLocked(id uuid.UUID, n *proto.Node, k proto.Coor
|
||||
err := other.updateMappingLocked(id, n, k, reason)
|
||||
if err != nil {
|
||||
other.logger.Error(context.Background(), "failed to update mapping", slog.Error(err))
|
||||
c.removePeerLocked(other.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update")
|
||||
c.removePeerLocked(other.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update", "failed to update tunnel peer mapping")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -360,7 +379,8 @@ func (c *core) addTunnelLocked(src *peer, dstID uuid.UUID) error {
|
||||
err := src.updateMappingLocked(dstID, dst.node, proto.CoordinateResponse_PeerUpdate_NODE, "add tunnel")
|
||||
if err != nil {
|
||||
src.logger.Error(context.Background(), "failed update of tunnel src", slog.Error(err))
|
||||
c.removePeerLocked(src.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update")
|
||||
c.removePeerLocked(src.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update",
|
||||
"failed to update tunnel dest mapping")
|
||||
// if the source fails, then the tunnel is also removed and there is no reason to continue
|
||||
// processing.
|
||||
return err
|
||||
@@ -370,7 +390,8 @@ func (c *core) addTunnelLocked(src *peer, dstID uuid.UUID) error {
|
||||
err := dst.updateMappingLocked(src.id, src.node, proto.CoordinateResponse_PeerUpdate_NODE, "add tunnel")
|
||||
if err != nil {
|
||||
dst.logger.Error(context.Background(), "failed update of tunnel dst", slog.Error(err))
|
||||
c.removePeerLocked(dst.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update")
|
||||
c.removePeerLocked(dst.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update",
|
||||
"failed to update tunnel src mapping")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -381,7 +402,7 @@ func (c *core) removeTunnelLocked(src *peer, dstID uuid.UUID) error {
|
||||
err := src.updateMappingLocked(dstID, nil, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "remove tunnel")
|
||||
if err != nil {
|
||||
src.logger.Error(context.Background(), "failed to update", slog.Error(err))
|
||||
c.removePeerLocked(src.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update")
|
||||
c.removePeerLocked(src.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update", "failed to remove tunnel dest mapping")
|
||||
// removing the peer also removes all other tunnels and notifies destinations, so it's safe to
|
||||
// return here.
|
||||
return err
|
||||
@@ -391,7 +412,7 @@ func (c *core) removeTunnelLocked(src *peer, dstID uuid.UUID) error {
|
||||
err = dst.updateMappingLocked(src.id, nil, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "remove tunnel")
|
||||
if err != nil {
|
||||
dst.logger.Error(context.Background(), "failed to update", slog.Error(err))
|
||||
c.removePeerLocked(dst.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update")
|
||||
c.removePeerLocked(dst.id, proto.CoordinateResponse_PeerUpdate_DISCONNECTED, "failed update", "failed to remove tunnel src mapping")
|
||||
// don't return here because we still want to remove the tunnel, and an error at the
|
||||
// destination doesn't count as an error removing the tunnel at the source.
|
||||
}
|
||||
@@ -413,6 +434,11 @@ func (c *core) initPeer(p *peer) error {
|
||||
if old, ok := c.peers[p.id]; ok {
|
||||
// rare and interesting enough to log at Info, but it isn't an error per se
|
||||
old.logger.Info(context.Background(), "overwritten by new connection")
|
||||
select {
|
||||
case old.resps <- &proto.CoordinateResponse{Error: CloseErrOverwritten}:
|
||||
default:
|
||||
// pass
|
||||
}
|
||||
close(old.resps)
|
||||
p.overwrites = old.overwrites + 1
|
||||
}
|
||||
@@ -433,7 +459,7 @@ func (c *core) initPeer(p *peer) error {
|
||||
|
||||
// removePeer removes and cleans up a lost peer. It updates all peers it shares a tunnel with, deletes
|
||||
// all tunnels from which the removed peer is the source.
|
||||
func (c *core) lostPeer(p *peer) error {
|
||||
func (c *core) lostPeer(p *peer, closeErr string) error {
|
||||
c.mutex.Lock()
|
||||
defer c.mutex.Unlock()
|
||||
c.logger.Debug(context.Background(), "lostPeer", slog.F("peer_id", p.id))
|
||||
@@ -443,11 +469,11 @@ func (c *core) lostPeer(p *peer) error {
|
||||
if existing, ok := c.peers[p.id]; !ok || existing != p {
|
||||
return ErrAlreadyRemoved
|
||||
}
|
||||
c.removePeerLocked(p.id, proto.CoordinateResponse_PeerUpdate_LOST, "lost")
|
||||
c.removePeerLocked(p.id, proto.CoordinateResponse_PeerUpdate_LOST, "lost", closeErr)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *core) removePeerLocked(id uuid.UUID, kind proto.CoordinateResponse_PeerUpdate_Kind, reason string) {
|
||||
func (c *core) removePeerLocked(id uuid.UUID, kind proto.CoordinateResponse_PeerUpdate_Kind, reason, closeErr string) {
|
||||
p, ok := c.peers[id]
|
||||
if !ok {
|
||||
c.logger.Critical(context.Background(), "removed non-existent peer %s", id)
|
||||
@@ -455,6 +481,13 @@ func (c *core) removePeerLocked(id uuid.UUID, kind proto.CoordinateResponse_Peer
|
||||
}
|
||||
c.updateTunnelPeersLocked(id, nil, kind, reason)
|
||||
c.tunnels.removeAll(id)
|
||||
if closeErr != "" {
|
||||
select {
|
||||
case p.resps <- &proto.CoordinateResponse{Error: closeErr}:
|
||||
default:
|
||||
// blocked, pass.
|
||||
}
|
||||
}
|
||||
close(p.resps)
|
||||
delete(c.peers, id)
|
||||
}
|
||||
@@ -487,7 +520,8 @@ func (c *core) close() error {
|
||||
for id := range c.peers {
|
||||
// when closing, mark them as LOST so that we don't disrupt in-progress
|
||||
// connections.
|
||||
c.removePeerLocked(id, proto.CoordinateResponse_PeerUpdate_LOST, "coordinator close")
|
||||
c.removePeerLocked(id, proto.CoordinateResponse_PeerUpdate_LOST, "coordinator close",
|
||||
CloseErrCoordinatorClose)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -58,7 +58,8 @@ func TestCoordinator(t *testing.T) {
|
||||
},
|
||||
PreferredDerp: 10,
|
||||
})
|
||||
client.AssertEventuallyResponsesClosed()
|
||||
client.AssertEventuallyResponsesClosed(
|
||||
tailnet.AuthorizationError{Wrapped: tailnet.InvalidAddressBitsError{Bits: 64}}.Error())
|
||||
})
|
||||
|
||||
t.Run("AgentWithoutClients", func(t *testing.T) {
|
||||
@@ -95,13 +96,13 @@ func TestCoordinator(t *testing.T) {
|
||||
}()
|
||||
agent := test.NewAgent(ctx, t, coordinator, "agent")
|
||||
defer agent.Close(ctx)
|
||||
prefix := tailnet.TailscaleServicePrefix.RandomPrefix()
|
||||
agent.UpdateNode(&proto.Node{
|
||||
Addresses: []string{
|
||||
tailnet.TailscaleServicePrefix.RandomPrefix().String(),
|
||||
},
|
||||
Addresses: []string{prefix.String()},
|
||||
PreferredDerp: 10,
|
||||
})
|
||||
agent.AssertEventuallyResponsesClosed()
|
||||
agent.AssertEventuallyResponsesClosed(
|
||||
tailnet.AuthorizationError{Wrapped: tailnet.InvalidNodeAddressError{Addr: prefix.Addr().String()}}.Error())
|
||||
})
|
||||
|
||||
t.Run("AgentWithoutClients_InvalidBits", func(t *testing.T) {
|
||||
@@ -122,7 +123,8 @@ func TestCoordinator(t *testing.T) {
|
||||
},
|
||||
PreferredDerp: 10,
|
||||
})
|
||||
agent.AssertEventuallyResponsesClosed()
|
||||
agent.AssertEventuallyResponsesClosed(
|
||||
tailnet.AuthorizationError{Wrapped: tailnet.InvalidAddressBitsError{Bits: 64}}.Error())
|
||||
})
|
||||
|
||||
t.Run("AgentWithClient", func(t *testing.T) {
|
||||
@@ -198,7 +200,7 @@ func TestCoordinator(t *testing.T) {
|
||||
agent2.AssertEventuallyHasDERP(client.ID, 2)
|
||||
|
||||
// This original agent channels should've been closed forcefully.
|
||||
agent1.AssertEventuallyResponsesClosed()
|
||||
agent1.AssertEventuallyResponsesClosed(tailnet.CloseErrOverwritten)
|
||||
})
|
||||
|
||||
t.Run("AgentAck", func(t *testing.T) {
|
||||
|
||||
+5
-5
@@ -121,24 +121,24 @@ func (p *peer) storeMappingLocked(
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p *peer) reqLoop(ctx context.Context, logger slog.Logger, handler func(context.Context, *peer, *proto.CoordinateRequest) error) {
|
||||
func (p *peer) reqLoop(ctx context.Context, logger slog.Logger, handler func(context.Context, *peer, *proto.CoordinateRequest) error) error {
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Debug(ctx, "peerReadLoop context done")
|
||||
return
|
||||
return ctx.Err()
|
||||
case req, ok := <-p.reqs:
|
||||
if !ok {
|
||||
logger.Debug(ctx, "peerReadLoop channel closed")
|
||||
return
|
||||
return nil
|
||||
}
|
||||
logger.Debug(ctx, "peerReadLoop got request")
|
||||
if err := handler(ctx, p, req); err != nil {
|
||||
if xerrors.Is(err, ErrAlreadyRemoved) || xerrors.Is(err, ErrClosed) {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
logger.Error(ctx, "peerReadLoop error handling request", slog.Error(err), slog.F("request", req))
|
||||
return
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ func GracefulDisconnectTest(ctx context.Context, t *testing.T, coordinator tailn
|
||||
|
||||
p2.Disconnect()
|
||||
p1.AssertEventuallyDisconnected(p2.ID)
|
||||
p2.AssertEventuallyResponsesClosed()
|
||||
p2.AssertEventuallyResponsesClosed("")
|
||||
}
|
||||
|
||||
func LostTest(ctx context.Context, t *testing.T, coordinator tailnet.CoordinatorV2) {
|
||||
|
||||
@@ -230,13 +230,21 @@ func (p *Peer) AssertEventuallyLost(other uuid.UUID) {
|
||||
}
|
||||
}
|
||||
|
||||
func (p *Peer) AssertEventuallyResponsesClosed() {
|
||||
func (p *Peer) AssertEventuallyResponsesClosed(expectedError string) {
|
||||
gotErr := false
|
||||
p.t.Helper()
|
||||
for {
|
||||
err := p.readOneResp()
|
||||
if xerrors.Is(err, errResponsesClosed) {
|
||||
if !gotErr && expectedError != "" {
|
||||
p.t.Errorf("responses closed without error '%s'", expectedError)
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil && expectedError != "" && err.Error() == expectedError {
|
||||
gotErr = true
|
||||
continue
|
||||
}
|
||||
if !assert.NoError(p.t, err) {
|
||||
return
|
||||
}
|
||||
|
||||
+20
-3
@@ -2,6 +2,7 @@ package tailnet
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/netip"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@@ -12,6 +13,22 @@ import (
|
||||
|
||||
var legacyWorkspaceAgentIP = netip.MustParseAddr("fd7a:115c:a1e0:49d6:b259:b7ac:b1b2:48f4")
|
||||
|
||||
type InvalidAddressBitsError struct {
|
||||
Bits int
|
||||
}
|
||||
|
||||
func (e InvalidAddressBitsError) Error() string {
|
||||
return fmt.Sprintf("invalid address bits, expected 128, got %d", e.Bits)
|
||||
}
|
||||
|
||||
type InvalidNodeAddressError struct {
|
||||
Addr string
|
||||
}
|
||||
|
||||
func (e InvalidNodeAddressError) Error() string {
|
||||
return fmt.Sprintf("invalid node address, got %s", e.Addr)
|
||||
}
|
||||
|
||||
type CoordinateeAuth interface {
|
||||
Authorize(ctx context.Context, req *proto.CoordinateRequest) error
|
||||
}
|
||||
@@ -61,13 +78,13 @@ func (a AgentCoordinateeAuth) Authorize(_ context.Context, req *proto.Coordinate
|
||||
}
|
||||
|
||||
if pre.Bits() != 128 {
|
||||
return xerrors.Errorf("invalid address bits, expected 128, got %d", pre.Bits())
|
||||
return InvalidAddressBitsError{pre.Bits()}
|
||||
}
|
||||
|
||||
if TailscaleServicePrefix.AddrFromUUID(a.ID).Compare(pre.Addr()) != 0 &&
|
||||
CoderServicePrefix.AddrFromUUID(a.ID).Compare(pre.Addr()) != 0 &&
|
||||
legacyWorkspaceAgentIP.Compare(pre.Addr()) != 0 {
|
||||
return xerrors.Errorf("invalid node address, got %s", pre.Addr().String())
|
||||
return InvalidNodeAddressError{pre.Addr().String()}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -104,7 +121,7 @@ func handleClientNodeRequests(req *proto.CoordinateRequest) error {
|
||||
}
|
||||
|
||||
if pre.Bits() != 128 {
|
||||
return xerrors.Errorf("invalid address bits, expected 128, got %d", pre.Bits())
|
||||
return InvalidAddressBitsError{pre.Bits()}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user