From 1afc6d4fd02eceee3c2a5e9740d2a4e2e67daba7 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Tue, 19 May 2026 09:47:03 -0500 Subject: [PATCH] feat: structured disconnect attribution for agent logs (#25191) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements [PLAT-60](https://linear.app/codercom/issue/PLAT-60/enhance-disconnect-logs-with-structured-reason-attribution): adds structured disconnect attribution to disconnect logs throughout the agent and tailnet packages. Every disconnect log site now carries structured slog fields. All existing logs remain; existing messages are preserved with the fields added alongside. New fields on disconnect log lines: - `connect_type` — which layer disconnected: `server_to_agent`, `agent_to_client`, or `client_to_server` - `disconnect_reason` — categorical reason: `graceful`, `network_error`, `server_shutdown`, etc. - `disconnect_expected` — whether the disconnect is normal operation (`true`) or should be investigated (`false`) - `disconnect_initiator` — who started it: `client`, `agent`, `server`, or `network` (control-plane sites only) - `disconnect_detail` — free-form supplemental info (where useful) ## What's covered **Control plane (`server_to_agent`):** coordination RPC, DERP map subscriber, agent runLoop, agent Close, `BasicCoordination.Close`, `Controller.run`. **Data plane (`agent_to_client`):** SSH sessions, reconnecting PTY, JetBrains port-forwarding.
Control-plane sites | Site | Reason | Initiator | |---|---|---| | `agent/agent.go` `runLoop` EOF | `network_error` | `network` | | `agent/agent.go` `runCoordinator` deferred exit | `server_shutdown` / `graceful` / `network_error` | `agent` / `server` / `network` | | `agent/agent.go` `runDERPMapSubscriber` deferred exit | same (shared `classifyCoordinatorRPCExit`) | same | | `agent/agent.go` `Close` shutdown timeout | `server_shutdown` + detail | `agent` | | `agent/agent.go` `Close` clean coord disconnect | `server_shutdown` | `agent` | | `tailnet/controllers.go` `BasicCoordination.Close` | `graceful` or `network_error` | `c.initiator` | | `tailnet/controllers.go` `Controller.run` `net.ErrClosed` | `network_error` | `network` |
Data-plane sites | Site | Reason | Notes | |---|---|---| | `agent/agentssh/agentssh.go` SSH session closed | free-form (`graceful`, `process exited with error status: N`, etc.) | Also sets `closeCause("normal exit")` for clean exits so coderd's `connection_log.DisconnectReason` is no longer empty | | `agent/reconnectingpty/server.go` PTY closed | `server_shutdown`, error string, or `graceful` | | | `agent/agentssh/jetbrainstrack.go` channel closed | `normal close` or error string | Previously passed empty reason |
Bug fix The deferred `disconnected from coordination RPC` log no longer fires when the initial `Coordinate()` RPC call fails before any connection is established.
Refs PLAT-60. --- _This PR was prepared by Coder Agents on behalf of @Emyrk._ **Manually QA'd a lot of common disconnects** --------- Co-authored-by: Coder Agents --- agent/agent.go | 65 ++++++++- agent/agent_internal_test.go | 56 ++++++++ agent/agentssh/agentssh.go | 15 +- agent/agentssh/jetbrainstrack.go | 9 +- agent/reconnectingpty/server.go | 35 +++-- coderd/tailnet.go | 2 + codersdk/disconnect.go | 228 +++++++++++++++++++++++++++++++ codersdk/disconnect_test.go | 94 +++++++++++++ site/src/api/typesGenerated.ts | 55 ++++++++ tailnet/controllers.go | 62 ++++++++- 10 files changed, 595 insertions(+), 26 deletions(-) create mode 100644 codersdk/disconnect.go create mode 100644 codersdk/disconnect_test.go diff --git a/agent/agent.go b/agent/agent.go index 53873b1f65..33ad83c804 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -514,7 +514,12 @@ func (a *agent) runLoop() { return } if errors.Is(err, io.EOF) { - a.logger.Info(ctx, "disconnected from coderd") + a.logger.Info(ctx, "disconnected from coderd", + codersdk.ConnectionDirectionServerToAgent.SlogField(), + codersdk.DisconnectReasonNetworkError.SlogField(), + codersdk.DisconnectReasonNetworkError.SlogExpectedField(), + codersdk.DisconnectInitiatorNetwork.SlogField(), + ) continue } a.logger.Warn(ctx, "run exited with error", slog.Error(err)) @@ -1878,16 +1883,43 @@ func (a *agent) createTailnet( return network, nil } +// classifyCoordinatorRPCExit determines the DisconnectReason and +// DisconnectInitiator for a coordinator-style RPC (the coordination RPC +// and the DERP map subscriber RPC) that has just returned. A canceled +// local context means the agent itself is shutting down. A non-nil +// return error without context cancellation means the stream broke +// unexpectedly. +func classifyCoordinatorRPCExit(ctx context.Context, retErr error) (codersdk.DisconnectReason, codersdk.DisconnectInitiator) { + localShutdown := ctx.Err() != nil + switch { + case localShutdown: + return codersdk.DisconnectReasonServerShutdown, codersdk.DisconnectInitiatorAgent + case retErr == nil: + return codersdk.DisconnectReasonGraceful, codersdk.DisconnectInitiatorServer + default: + return codersdk.DisconnectReasonNetworkError, codersdk.DisconnectInitiatorNetwork + } +} + // runCoordinator runs a coordinator and returns whether a reconnect // should occur. -func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) error { - defer a.logger.Debug(ctx, "disconnected from coordination RPC") +func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) (retErr error) { // we run the RPC on the hardCtx so that we have a chance to send the disconnect message if we // gracefully shut down. coordinate, err := tClient.Coordinate(a.hardCtx) if err != nil { return xerrors.Errorf("failed to connect to the coordinate endpoint: %w", err) } + defer func() { + reason, initiator := classifyCoordinatorRPCExit(ctx, retErr) + a.logger.Debug(ctx, "disconnected from coordination RPC", + codersdk.ConnectionDirectionServerToAgent.SlogField(), + reason.SlogField(), + reason.SlogExpectedField(), + initiator.SlogField(), + slog.Error(retErr), + ) + }() defer func() { cErr := coordinate.Close() if cErr != nil { @@ -1935,8 +1967,7 @@ func (a *agent) setCoordDisconnected() chan struct{} { } // runDERPMapSubscriber runs a coordinator and returns if a reconnect should occur. -func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) error { - defer a.logger.Debug(ctx, "disconnected from derp map RPC") +func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) (retErr error) { ctx, cancel := context.WithCancel(ctx) defer cancel() stream, err := tClient.StreamDERPMaps(ctx, &tailnetproto.StreamDERPMapsRequest{}) @@ -1948,6 +1979,15 @@ func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.D if cErr != nil { a.logger.Debug(ctx, "error closing DERPMap stream", slog.Error(err)) } + + reason, initiator := classifyCoordinatorRPCExit(ctx, retErr) + a.logger.Debug(ctx, "disconnected from derp map RPC", + codersdk.ConnectionDirectionServerToAgent.SlogField(), + reason.SlogField(), + reason.SlogExpectedField(), + initiator.SlogField(), + slog.Error(retErr), + ) }() a.logger.Info(ctx, "connected to derp map RPC") for { @@ -2260,9 +2300,20 @@ lifecycleWaitLoop: // Wait for graceful disconnect from the Coordinator RPC select { case <-a.hardCtx.Done(): - a.logger.Warn(context.Background(), "timed out waiting for Coordinator RPC disconnect") + a.logger.Warn(context.Background(), "timed out waiting for Coordinator RPC disconnect", + codersdk.ConnectionDirectionServerToAgent.SlogField(), + codersdk.DisconnectReasonServerShutdown.SlogField(), + codersdk.DisconnectReasonServerShutdown.SlogExpectedField(), + codersdk.DisconnectInitiatorAgent.SlogField(), + codersdk.SlogDisconnectDetail("timed out waiting for coordinator RPC to disconnect"), + ) case <-coordDisconnected: - a.logger.Debug(context.Background(), "coordinator RPC disconnected") + a.logger.Debug(context.Background(), "coordinator RPC disconnected", + codersdk.ConnectionDirectionServerToAgent.SlogField(), + codersdk.DisconnectReasonServerShutdown.SlogField(), + codersdk.DisconnectReasonServerShutdown.SlogExpectedField(), + codersdk.DisconnectInitiatorAgent.SlogField(), + ) } // Wait for logs to be sent diff --git a/agent/agent_internal_test.go b/agent/agent_internal_test.go index e49238e1b6..9f131eb6a1 100644 --- a/agent/agent_internal_test.go +++ b/agent/agent_internal_test.go @@ -1,17 +1,20 @@ package agent import ( + "context" "path/filepath" "runtime" "testing" "github.com/google/uuid" "github.com/stretchr/testify/require" + "golang.org/x/xerrors" "cdr.dev/slog/v3" "cdr.dev/slog/v3/sloggers/slogtest" "github.com/coder/coder/v2/agent/agentcontextconfig" "github.com/coder/coder/v2/agent/proto" + "github.com/coder/coder/v2/codersdk" agentsdk "github.com/coder/coder/v2/codersdk/agentsdk" "github.com/coder/coder/v2/testutil" ) @@ -86,3 +89,56 @@ func TestContextConfigAPI_InitOnce(t *testing.T) { require.NotEmpty(t, mcpFiles2) require.Contains(t, mcpFiles2[0], dir2) } + +func TestClassifyCoordinatorRPCExit(t *testing.T) { + t.Parallel() + + canceled, cancel := context.WithCancel(context.Background()) + cancel() + + cases := []struct { + name string + ctx context.Context + retErr error + reason codersdk.DisconnectReason + initiator codersdk.DisconnectInitiator + }{ + { + name: "local shutdown, no error", + ctx: canceled, + retErr: nil, + reason: codersdk.DisconnectReasonServerShutdown, + initiator: codersdk.DisconnectInitiatorAgent, + }, + { + name: "local shutdown, with cleanup error", + ctx: canceled, + retErr: xerrors.New("close timed out"), + reason: codersdk.DisconnectReasonServerShutdown, + initiator: codersdk.DisconnectInitiatorAgent, + }, + { + name: "remote graceful, no error", + ctx: context.Background(), + retErr: nil, + reason: codersdk.DisconnectReasonGraceful, + initiator: codersdk.DisconnectInitiatorServer, + }, + { + name: "stream broke unexpectedly", + ctx: context.Background(), + retErr: xerrors.New("read: connection reset"), + reason: codersdk.DisconnectReasonNetworkError, + initiator: codersdk.DisconnectInitiatorNetwork, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + reason, initiator := classifyCoordinatorRPCExit(tc.ctx, tc.retErr) + require.Equal(t, tc.reason, reason) + require.Equal(t, tc.initiator, initiator) + }) + } +} diff --git a/agent/agentssh/agentssh.go b/agent/agentssh/agentssh.go index 1828185690..9a7e2ead31 100644 --- a/agent/agentssh/agentssh.go +++ b/agent/agentssh/agentssh.go @@ -462,17 +462,23 @@ func (s *Server) sessionHandler(session ssh.Session) { logger.Warn(ctx, "invalid magic ssh session type specified", slog.F("raw_type", magicTypeRaw)) } - closeCause := func(string) {} + closeCause := func(_ string) {} if reportSession { - var reason string - closeCause = func(r string) { reason = r } + var reason codersdk.DisconnectReason + closeCause = func(r string) { reason = codersdk.DisconnectReason(r) } scr := &sessionCloseTracker{Session: session} session = scr disconnected := s.config.ReportConnection(id, magicType, remoteAddrString) defer func() { - disconnected(scr.exitCode(), reason) + logger.Info(ctx, "ssh session closed", + codersdk.ConnectionDirectionAgentToClient.SlogField(), + reason.SlogField(), + reason.SlogExpectedField(), + slog.F("exit_code", scr.exitCode()), + ) + disconnected(scr.exitCode(), string(reason)) }() } @@ -567,6 +573,7 @@ func (s *Server) sessionHandler(session ssh.Session) { _ = session.Exit(MagicSessionErrorCode) return } + closeCause(string(codersdk.DisconnectReasonGraceful)) logger.Info(ctx, "normal ssh session exit") _ = session.Exit(0) } diff --git a/agent/agentssh/jetbrainstrack.go b/agent/agentssh/jetbrainstrack.go index e4a63a091d..2ea54b5430 100644 --- a/agent/agentssh/jetbrainstrack.go +++ b/agent/agentssh/jetbrainstrack.go @@ -11,6 +11,7 @@ import ( gossh "golang.org/x/crypto/ssh" "cdr.dev/slog/v3" + "github.com/coder/coder/v2/codersdk" ) // localForwardChannelData is copied from the ssh package. @@ -85,9 +86,13 @@ func (w *JetbrainsChannelWatcher) Accept() (gossh.Channel, <-chan *gossh.Request Channel: c, done: func() { w.jetbrainsCounter.Add(-1) - disconnected(0, "") + disconnected(0, "normal close") // nolint: gocritic // JetBrains is a proper noun and should be capitalized - w.logger.Debug(context.Background(), "JetBrains watcher channel closed") + w.logger.Debug(context.Background(), "JetBrains channel closed", + codersdk.ConnectionDirectionAgentToClient.SlogField(), + codersdk.DisconnectReasonGraceful.SlogField(), + codersdk.DisconnectReasonGraceful.SlogExpectedField(), + ) }, }, r, err } diff --git a/agent/reconnectingpty/server.go b/agent/reconnectingpty/server.go index cedd86bbd4..d915aded34 100644 --- a/agent/reconnectingpty/server.go +++ b/agent/reconnectingpty/server.go @@ -17,6 +17,7 @@ import ( "github.com/coder/coder/v2/agent/agentcontainers" "github.com/coder/coder/v2/agent/agentssh" "github.com/coder/coder/v2/agent/usershell" + "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/workspacesdk" ) @@ -95,6 +96,11 @@ func (s *Server) Serve(ctx, hardCtx context.Context, l net.Listener) (retErr err select { case <-closed: case <-hardCtx.Done(): + clog.Info(hardCtx, "reconnecting pty closed", + codersdk.ConnectionDirectionAgentToClient.SlogField(), + codersdk.DisconnectReasonServerShutdown.SlogField(), + codersdk.DisconnectReasonServerShutdown.SlogExpectedField(), + ) disconnected(1, "server shut down") _ = conn.Close() } @@ -104,15 +110,28 @@ func (s *Server) Serve(ctx, hardCtx context.Context, l net.Listener) (retErr err defer close(closed) defer wg.Done() err := s.handleConn(ctx, clog, conn) - if err != nil { - if ctx.Err() != nil { - disconnected(1, "server shutting down") - } else { - disconnected(1, err.Error()) - } - } else { - disconnected(0, "") + var reason codersdk.DisconnectReason + var code int + var detail string + switch { + case err != nil && ctx.Err() != nil: + reason = codersdk.DisconnectReasonServerShutdown + code = 1 + case err != nil: + reason = codersdk.DisconnectReasonNetworkError + detail = err.Error() + code = 1 + default: + reason = codersdk.DisconnectReasonGraceful } + clog.Info(ctx, "reconnecting pty closed", + codersdk.ConnectionDirectionAgentToClient.SlogField(), + reason.SlogField(), + reason.SlogExpectedField(), + codersdk.SlogDisconnectDetail(detail), + slog.F("exit_code", code), + ) + disconnected(code, string(reason)) }() } wg.Wait() diff --git a/coderd/tailnet.go b/coderd/tailnet.go index b69c687d3a..4d73c89fd1 100644 --- a/coderd/tailnet.go +++ b/coderd/tailnet.go @@ -529,6 +529,8 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra Logger: logger, Coordinatee: coordinatee, SendAcks: false, // we are a client, connecting to multiple agents + Initiator: codersdk.DisconnectInitiatorServer, + Direction: codersdk.ConnectionDirectionServerToAgent, }, logger: logger, tracer: tracer, diff --git a/codersdk/disconnect.go b/codersdk/disconnect.go new file mode 100644 index 0000000000..56345bf586 --- /dev/null +++ b/codersdk/disconnect.go @@ -0,0 +1,228 @@ +package codersdk + +import "cdr.dev/slog/v3" + +// SlogDisconnectDetail is the slog field for the free-form, human-readable +// detail string that supplements the structured reason. Use it for +// "exited with code 137" style information that does not fit a category. +func SlogDisconnectDetail(detail string) slog.Field { + return slog.F("disconnect_detail", detail) +} + +// DisconnectReason categorizes why a workspace connection ended. It is +// emitted as a slog field at every disconnect log site so operators can +// filter and aggregate disconnects without parsing free-form reason +// strings. +// +// The set of values intentionally stays small. Use DisconnectReasonUnknown +// when no other value applies; do not invent ad-hoc strings. Add a new +// constant here (and update its godoc) when a new disconnect class is +// genuinely distinct from the existing ones. +type DisconnectReason string + +func (r DisconnectReason) SlogField() slog.Field { + if r == "" { + return slog.F("disconnect_reason", "unknown") + } + return slog.F("disconnect_reason", r) +} + +func (r DisconnectReason) SlogExpectedField() slog.Field { + return slog.F("disconnect_expected", r.Expected()) +} + +const ( + // DisconnectReasonUnknown is the zero value. Use it when the disconnect + // path cannot determine a more specific reason. Treat any disconnect + // logged with this value as a bug to investigate, not as a normal + // outcome. + DisconnectReasonUnknown DisconnectReason = "" + + // DisconnectReasonGraceful indicates the connection ended cleanly: the + // remote side acknowledged a Disconnect message, an SSH session exited + // with status 0, or a PTY closed without error. This is the expected + // "happy path" outcome. + DisconnectReasonGraceful DisconnectReason = "graceful" + + // DisconnectReasonClientClosed indicates the client side closed the + // connection without an error (for example, the user closed their + // terminal or quit their IDE). The session ran to a natural end from + // the client's perspective. + DisconnectReasonClientClosed DisconnectReason = "client_closed" + + // DisconnectReasonServerShutdown indicates the workspace agent or + // coderd is shutting down and is closing connections as part of that + // shutdown. The connection itself was healthy; the process is going + // away. + DisconnectReasonServerShutdown DisconnectReason = "server_shutdown" + + // DisconnectReasonNetworkError indicates the transport failed: an EOF, + // a read or write error, a context cancellation caused by a timeout, + // or a similar I/O failure. The connection did not end cleanly. + DisconnectReasonNetworkError DisconnectReason = "network_error" + + // DisconnectReasonProtocolError indicates a dRPC, SSH, or tailnet + // protocol violation by the peer. Distinct from network errors because + // the bytes flowed but the contents were unparsable or unexpected. + DisconnectReasonProtocolError DisconnectReason = "protocol_error" + + // DisconnectReasonWorkspaceStopped indicates the workspace itself was + // stopped or deleted while the connection was open, so coderd closed + // outstanding sessions on its behalf. + DisconnectReasonWorkspaceStopped DisconnectReason = "workspace_stopped" + + // DisconnectReasonControlPlaneLost indicates the agent or client lost + // its coordination RPC to coderd. The data plane (peer-to-peer or + // DERP) may still be functional; this records the control plane + // outcome specifically. + DisconnectReasonControlPlaneLost DisconnectReason = "control_plane_lost" +) + +// Valid reports whether r is a known DisconnectReason. The zero value +// (DisconnectReasonUnknown) is considered valid since it is the explicit +// "no information" reason. +func (r DisconnectReason) Valid() bool { + switch r { + case DisconnectReasonUnknown, + DisconnectReasonGraceful, + DisconnectReasonClientClosed, + DisconnectReasonServerShutdown, + DisconnectReasonNetworkError, + DisconnectReasonProtocolError, + DisconnectReasonWorkspaceStopped, + DisconnectReasonControlPlaneLost: + return true + default: + return false + } +} + +// Expected reports whether a disconnect with this reason is part of +// normal operation. Operators can use this to split dashboards or alerts +// into "expected" and "investigate" buckets without enumerating every +// reason. +func (r DisconnectReason) Expected() bool { + switch r { + case DisconnectReasonGraceful, + DisconnectReasonClientClosed, + DisconnectReasonServerShutdown, + DisconnectReasonWorkspaceStopped: + return true + case DisconnectReasonUnknown, + DisconnectReasonNetworkError, + DisconnectReasonProtocolError, + DisconnectReasonControlPlaneLost: + return false + default: + // Unknown reason values are treated as not expected so that + // new emit sites that forget to classify themselves surface + // in the "investigate" bucket by default. + return false + } +} + +// DisconnectInitiator identifies which side caused the disconnect. It +// pairs with DisconnectReason: the reason describes what happened, the +// initiator describes who started it. +type DisconnectInitiator string + +const ( + // DisconnectInitiatorUnknown means the disconnect site cannot + // attribute the close to a specific side. Avoid this where possible. + DisconnectInitiatorUnknown DisconnectInitiator = "" + + // DisconnectInitiatorClient means the user-facing side (CLI, VS Code + // extension, JetBrains plugin, Coder Desktop) closed the connection. + DisconnectInitiatorClient DisconnectInitiator = "client" + + // DisconnectInitiatorAgent means the workspace agent closed the + // connection. + DisconnectInitiatorAgent DisconnectInitiator = "agent" + + // DisconnectInitiatorServer means coderd (or a workspace proxy) + // closed the connection. + DisconnectInitiatorServer DisconnectInitiator = "server" + + // DisconnectInitiatorNetwork means an underlying network or transport + // fault caused the close. Neither end deliberately initiated it. + DisconnectInitiatorNetwork DisconnectInitiator = "network" +) + +func (i DisconnectInitiator) SlogField() slog.Field { + return slog.F("disconnect_initiator", i) +} + +// Valid reports whether i is a known DisconnectInitiator. +func (i DisconnectInitiator) Valid() bool { + switch i { + case DisconnectInitiatorUnknown, + DisconnectInitiatorClient, + DisconnectInitiatorAgent, + DisconnectInitiatorServer, + DisconnectInitiatorNetwork: + return true + default: + return false + } +} + +// ConnectionDirection identifies which layer a disconnect log belongs to. +// It tells operators at a glance whether a log is about the control plane +// (server to agent) or the data plane (agent to client). +type ConnectionDirection string + +func (d ConnectionDirection) SlogField() slog.Field { + return slog.F("connect_type", d) +} + +const ( + // ConnectionDirectionServerToAgent is the control-plane connection + // between coderd and the workspace agent (coordination RPC, DERP map + // subscriber, agent runLoop). + ConnectionDirectionServerToAgent ConnectionDirection = "server_to_agent" + + // ConnectionDirectionAgentToClient is a data-plane session between + // the workspace agent and a user's client (SSH, reconnecting PTY, + // JetBrains port-forwarding). + ConnectionDirectionAgentToClient ConnectionDirection = "agent_to_client" + + // ConnectionDirectionClientToServer is a connection from a user's + // client to coderd (e.g. the CLI's WebSocket to the coordinator). + // Not yet instrumented. + ConnectionDirectionClientToServer ConnectionDirection = "client_to_server" +) + +// ConnectionMethod describes the network path a workspace connection +// took at the moment a disconnect log was emitted. It is intended for +// observability only; do not switch behavior on it. +type ConnectionMethod string + +func (m ConnectionMethod) SlogField() slog.Field { + return slog.F("connection_method", m) +} + +const ( + // ConnectionMethodUnknown means the disconnect site does not have + // the information to determine the connection path. + ConnectionMethodUnknown ConnectionMethod = "" + + // ConnectionMethodDirect means the peers were communicating over a + // direct, peer-to-peer connection (NAT-traversed via STUN). + ConnectionMethodDirect ConnectionMethod = "direct" + + // ConnectionMethodDERP means the peers were communicating through a + // DERP relay rather than directly. + ConnectionMethodDERP ConnectionMethod = "derp" +) + +// Valid reports whether m is a known ConnectionMethod. +func (m ConnectionMethod) Valid() bool { + switch m { + case ConnectionMethodUnknown, + ConnectionMethodDirect, + ConnectionMethodDERP: + return true + default: + return false + } +} diff --git a/codersdk/disconnect_test.go b/codersdk/disconnect_test.go new file mode 100644 index 0000000000..6210374806 --- /dev/null +++ b/codersdk/disconnect_test.go @@ -0,0 +1,94 @@ +package codersdk_test + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/codersdk" +) + +func TestDisconnectReason_Valid(t *testing.T) { + t.Parallel() + + cases := []struct { + reason codersdk.DisconnectReason + valid bool + }{ + {codersdk.DisconnectReasonUnknown, true}, + {codersdk.DisconnectReasonGraceful, true}, + {codersdk.DisconnectReasonClientClosed, true}, + {codersdk.DisconnectReasonServerShutdown, true}, + {codersdk.DisconnectReasonNetworkError, true}, + {codersdk.DisconnectReasonProtocolError, true}, + {codersdk.DisconnectReasonWorkspaceStopped, true}, + {codersdk.DisconnectReasonControlPlaneLost, true}, + {codersdk.DisconnectReason("not_a_real_reason"), false}, + } + + for _, c := range cases { + require.Equal(t, c.valid, c.reason.Valid(), "reason=%q", c.reason) + } +} + +func TestDisconnectReason_Expected(t *testing.T) { + t.Parallel() + + expected := map[codersdk.DisconnectReason]bool{ + codersdk.DisconnectReasonGraceful: true, + codersdk.DisconnectReasonClientClosed: true, + codersdk.DisconnectReasonServerShutdown: true, + codersdk.DisconnectReasonWorkspaceStopped: true, + + codersdk.DisconnectReasonUnknown: false, + codersdk.DisconnectReasonNetworkError: false, + codersdk.DisconnectReasonProtocolError: false, + codersdk.DisconnectReasonControlPlaneLost: false, + } + + for reason, want := range expected { + require.Equal(t, want, reason.Expected(), "reason=%q", reason) + } + + // Unknown values default to not-expected so that uncategorized + // emit sites surface in the "investigate" bucket. + require.False(t, codersdk.DisconnectReason("not_a_real_reason").Expected()) +} + +func TestDisconnectInitiator_Valid(t *testing.T) { + t.Parallel() + + cases := []struct { + initiator codersdk.DisconnectInitiator + valid bool + }{ + {codersdk.DisconnectInitiatorUnknown, true}, + {codersdk.DisconnectInitiatorClient, true}, + {codersdk.DisconnectInitiatorAgent, true}, + {codersdk.DisconnectInitiatorServer, true}, + {codersdk.DisconnectInitiatorNetwork, true}, + {codersdk.DisconnectInitiator("nobody"), false}, + } + + for _, c := range cases { + require.Equal(t, c.valid, c.initiator.Valid(), "initiator=%q", c.initiator) + } +} + +func TestConnectionMethod_Valid(t *testing.T) { + t.Parallel() + + cases := []struct { + method codersdk.ConnectionMethod + valid bool + }{ + {codersdk.ConnectionMethodUnknown, true}, + {codersdk.ConnectionMethodDirect, true}, + {codersdk.ConnectionMethodDERP, true}, + {codersdk.ConnectionMethod("magic"), false}, + } + + for _, c := range cases { + require.Equal(t, c.valid, c.method.Valid(), "method=%q", c.method) + } +} diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index d374a6fd2f..75e4bb2b10 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -2998,6 +2998,18 @@ export interface ChatWorkspaceTTLResponse { */ export const CoderDesktopTelemetryHeader = "Coder-Desktop-Telemetry"; +// From codersdk/disconnect.go +export type ConnectionDirection = + | "agent_to_client" + | "client_to_server" + | "server_to_agent"; + +export const ConnectionDirections: ConnectionDirection[] = [ + "agent_to_client", + "client_to_server", + "server_to_agent", +]; + // From codersdk/insights.go /** * ConnectionLatency shows the latency for a connection. @@ -3089,6 +3101,11 @@ export interface ConnectionLogsRequest extends Pagination { readonly q?: string; } +// From codersdk/disconnect.go +export type ConnectionMethod = "derp" | "direct" | ""; + +export const ConnectionMethods: ConnectionMethod[] = ["derp", "direct", ""]; + // From codersdk/connectionlog.go export type ConnectionType = | "jetbrains" @@ -4032,6 +4049,44 @@ export const DiagnosticSeverityStrings: DiagnosticSeverityString[] = [ "warning", ]; +// From codersdk/disconnect.go +export type DisconnectInitiator = + | "agent" + | "client" + | "network" + | "server" + | ""; + +export const DisconnectInitiators: DisconnectInitiator[] = [ + "agent", + "client", + "network", + "server", + "", +]; + +// From codersdk/disconnect.go +export type DisconnectReason = + | "client_closed" + | "control_plane_lost" + | "graceful" + | "network_error" + | "protocol_error" + | "server_shutdown" + | "" + | "workspace_stopped"; + +export const DisconnectReasons: DisconnectReason[] = [ + "client_closed", + "control_plane_lost", + "graceful", + "network_error", + "protocol_error", + "server_shutdown", + "", + "workspace_stopped", +]; + // From codersdk/workspaceagents.go export type DisplayApp = | "port_forwarding_helper" diff --git a/tailnet/controllers.go b/tailnet/controllers.go index 35ef075878..6d5b55af60 100644 --- a/tailnet/controllers.go +++ b/tailnet/controllers.go @@ -154,6 +154,17 @@ type BasicCoordinationController struct { Logger slog.Logger Coordinatee Coordinatee SendAcks bool + // Initiator labels which side of the coordination this controller + // represents (e.g. "agent", "client", "server"). It is attached as + // a slog field to disconnect-related logs emitted by this + // coordination so operators can attribute connection-close events + // without inspecting call sites. Leave unset on synthetic + // controllers (tests, in-memory fakes) and the field will be + // omitted from log output. + Initiator codersdk.DisconnectInitiator + // Direction labels the connection layer (server_to_agent, + // agent_to_client, client_to_server) for disconnect logs. + Direction codersdk.ConnectionDirection } // New satisfies the method on the CoordinationController interface @@ -170,6 +181,8 @@ func (c *BasicCoordinationController) NewCoordination(client CoordinatorClient) client: client, respLoopDone: make(chan struct{}), sendAcks: c.SendAcks, + initiator: c.Initiator, + direction: c.Direction, } c.Coordinatee.SetNodeCallback(func(node *Node) { @@ -211,6 +224,8 @@ type BasicCoordination struct { client CoordinatorClient respLoopDone chan struct{} sendAcks bool + initiator codersdk.DisconnectInitiator + direction codersdk.ConnectionDirection } // CloseClient forcibly closes the underlying coordinator client connection @@ -243,11 +258,24 @@ func (c *BasicCoordination) Close(ctx context.Context) (retErr error) { err := c.client.Send(&proto.CoordinateRequest{Disconnect: &proto.CoordinateRequest_Disconnect{}}) c.Unlock() if err != nil && !xerrors.Is(err, io.EOF) { + reason := codersdk.DisconnectReasonNetworkError // Log but don't return early; we must still clean up below. - c.logger.Warn(context.Background(), "failed to send disconnect", slog.Error(err)) + c.logger.Warn(context.Background(), "failed to send disconnect", + c.direction.SlogField(), + reason.SlogExpectedField(), + reason.SlogField(), + c.initiator.SlogField(), + slog.Error(err), + ) retErr = xerrors.Errorf("send disconnect: %w", err) } else { - c.logger.Debug(context.Background(), "sent disconnect") + reason := codersdk.DisconnectReasonGraceful + c.logger.Debug(context.Background(), "sent disconnect", + c.direction.SlogField(), + reason.SlogExpectedField(), + reason.SlogField(), + c.initiator.SlogField(), + ) } // We shouldn't just close the protocol right away, because the way dRPC streams work is @@ -256,10 +284,23 @@ func (c *BasicCoordination) Close(ctx context.Context) (retErr error) { // Disconnect message, so we should wait around for that until the context expires. select { case <-c.respLoopDone: - c.logger.Debug(ctx, "responses closed after disconnect") + reason := codersdk.DisconnectReasonGraceful + c.logger.Debug(ctx, "responses closed after disconnect", + c.direction.SlogField(), + reason.SlogExpectedField(), + reason.SlogField(), + c.initiator.SlogField(), + ) return retErr case <-ctx.Done(): - c.logger.Warn(ctx, "context expired while waiting for coordinate responses to close") + reason := codersdk.DisconnectReasonNetworkError + c.logger.Warn(ctx, "context expired while waiting for coordinate responses to close", + c.direction.SlogField(), + reason.SlogExpectedField(), + reason.SlogField(), + c.initiator.SlogField(), + codersdk.SlogDisconnectDetail("context expired before coordinator hung up"), + ) } // forcefully close the stream protoErr := c.client.Close() @@ -369,6 +410,8 @@ func NewTunnelSrcCoordController( Logger: logger, Coordinatee: coordinatee, SendAcks: false, + Initiator: codersdk.DisconnectInitiatorClient, + Direction: codersdk.ConnectionDirectionClientToServer, }, dests: make(map[uuid.UUID]struct{}), } @@ -508,6 +551,8 @@ func NewAgentCoordinationController( Logger: logger, Coordinatee: coordinatee, SendAcks: true, + Initiator: codersdk.DisconnectInitiatorAgent, + Direction: codersdk.ConnectionDirectionServerToAgent, } } @@ -1505,7 +1550,14 @@ func (c *Controller) Run(ctx context.Context) { } if errors.Is(err, net.ErrClosed) { - c.logger.Warn(c.ctx, "control plane connection closed, retrying", slog.Error(err)) + reason := codersdk.DisconnectReasonNetworkError + c.logger.Warn(c.ctx, "control plane connection closed, retrying", + codersdk.ConnectionDirectionServerToAgent.SlogField(), + reason.SlogField(), + reason.SlogExpectedField(), + codersdk.DisconnectInitiatorNetwork.SlogField(), + slog.Error(err), + ) continue }