feat: structured disconnect attribution for agent logs (#25191)

Implements
[PLAT-60](https://linear.app/codercom/issue/PLAT-60/enhance-disconnect-logs-with-structured-reason-attribution):
adds structured disconnect attribution to disconnect logs throughout the
agent and tailnet packages.

Every disconnect log site now carries structured slog fields. All
existing logs remain; existing messages are preserved with the fields
added alongside.

New fields on disconnect log lines:

- `connect_type` — which layer disconnected: `server_to_agent`,
`agent_to_client`, or `client_to_server`
- `disconnect_reason` — categorical reason: `graceful`, `network_error`,
`server_shutdown`, etc.
- `disconnect_expected` — whether the disconnect is normal operation
(`true`) or should be investigated (`false`)
- `disconnect_initiator` — who started it: `client`, `agent`, `server`,
or `network` (control-plane sites only)
- `disconnect_detail` — free-form supplemental info (where useful)

## What's covered

**Control plane (`server_to_agent`):** coordination RPC, DERP map
subscriber, agent runLoop, agent Close, `BasicCoordination.Close`,
`Controller.run`.

**Data plane (`agent_to_client`):** SSH sessions, reconnecting PTY,
JetBrains port-forwarding.

<details>
<summary>Control-plane sites</summary>

| Site | Reason | Initiator |
|---|---|---|
| `agent/agent.go` `runLoop` EOF | `network_error` | `network` |
| `agent/agent.go` `runCoordinator` deferred exit | `server_shutdown` /
`graceful` / `network_error` | `agent` / `server` / `network` |
| `agent/agent.go` `runDERPMapSubscriber` deferred exit | same (shared
`classifyCoordinatorRPCExit`) | same |
| `agent/agent.go` `Close` shutdown timeout | `server_shutdown` + detail
| `agent` |
| `agent/agent.go` `Close` clean coord disconnect | `server_shutdown` |
`agent` |
| `tailnet/controllers.go` `BasicCoordination.Close` | `graceful` or
`network_error` | `c.initiator` |
| `tailnet/controllers.go` `Controller.run` `net.ErrClosed` |
`network_error` | `network` |

</details>

<details>
<summary>Data-plane sites</summary>

| Site | Reason | Notes |
|---|---|---|
| `agent/agentssh/agentssh.go` SSH session closed | free-form
(`graceful`, `process exited with error status: N`, etc.) | Also sets
`closeCause("normal exit")` for clean exits so coderd's
`connection_log.DisconnectReason` is no longer empty |
| `agent/reconnectingpty/server.go` PTY closed | `server_shutdown`,
error string, or `graceful` | |
| `agent/agentssh/jetbrainstrack.go` channel closed | `normal close` or
error string | Previously passed empty reason |

</details>

<details>
<summary>Bug fix</summary>

The deferred `disconnected from coordination RPC` log no longer fires
when the initial `Coordinate()` RPC call fails before any connection is
established.

</details>

Refs PLAT-60.

---

_This PR was prepared by Coder Agents on behalf of @Emyrk._
**Manually QA'd a lot of common disconnects**

---------

Co-authored-by: Coder Agents <noreply@coder.com>
This commit is contained in:
Steven Masley
2026-05-19 09:47:03 -05:00
committed by GitHub
parent c9c933a300
commit 1afc6d4fd0
10 changed files with 595 additions and 26 deletions
+58 -7
View File
@@ -514,7 +514,12 @@ func (a *agent) runLoop() {
return
}
if errors.Is(err, io.EOF) {
a.logger.Info(ctx, "disconnected from coderd")
a.logger.Info(ctx, "disconnected from coderd",
codersdk.ConnectionDirectionServerToAgent.SlogField(),
codersdk.DisconnectReasonNetworkError.SlogField(),
codersdk.DisconnectReasonNetworkError.SlogExpectedField(),
codersdk.DisconnectInitiatorNetwork.SlogField(),
)
continue
}
a.logger.Warn(ctx, "run exited with error", slog.Error(err))
@@ -1878,16 +1883,43 @@ func (a *agent) createTailnet(
return network, nil
}
// classifyCoordinatorRPCExit determines the DisconnectReason and
// DisconnectInitiator for a coordinator-style RPC (the coordination RPC
// and the DERP map subscriber RPC) that has just returned. A canceled
// local context means the agent itself is shutting down. A non-nil
// return error without context cancellation means the stream broke
// unexpectedly.
func classifyCoordinatorRPCExit(ctx context.Context, retErr error) (codersdk.DisconnectReason, codersdk.DisconnectInitiator) {
localShutdown := ctx.Err() != nil
switch {
case localShutdown:
return codersdk.DisconnectReasonServerShutdown, codersdk.DisconnectInitiatorAgent
case retErr == nil:
return codersdk.DisconnectReasonGraceful, codersdk.DisconnectInitiatorServer
default:
return codersdk.DisconnectReasonNetworkError, codersdk.DisconnectInitiatorNetwork
}
}
// runCoordinator runs a coordinator and returns whether a reconnect
// should occur.
func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) error {
defer a.logger.Debug(ctx, "disconnected from coordination RPC")
func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) (retErr error) {
// we run the RPC on the hardCtx so that we have a chance to send the disconnect message if we
// gracefully shut down.
coordinate, err := tClient.Coordinate(a.hardCtx)
if err != nil {
return xerrors.Errorf("failed to connect to the coordinate endpoint: %w", err)
}
defer func() {
reason, initiator := classifyCoordinatorRPCExit(ctx, retErr)
a.logger.Debug(ctx, "disconnected from coordination RPC",
codersdk.ConnectionDirectionServerToAgent.SlogField(),
reason.SlogField(),
reason.SlogExpectedField(),
initiator.SlogField(),
slog.Error(retErr),
)
}()
defer func() {
cErr := coordinate.Close()
if cErr != nil {
@@ -1935,8 +1967,7 @@ func (a *agent) setCoordDisconnected() chan struct{} {
}
// runDERPMapSubscriber runs a coordinator and returns if a reconnect should occur.
func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) error {
defer a.logger.Debug(ctx, "disconnected from derp map RPC")
func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) (retErr error) {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
stream, err := tClient.StreamDERPMaps(ctx, &tailnetproto.StreamDERPMapsRequest{})
@@ -1948,6 +1979,15 @@ func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.D
if cErr != nil {
a.logger.Debug(ctx, "error closing DERPMap stream", slog.Error(err))
}
reason, initiator := classifyCoordinatorRPCExit(ctx, retErr)
a.logger.Debug(ctx, "disconnected from derp map RPC",
codersdk.ConnectionDirectionServerToAgent.SlogField(),
reason.SlogField(),
reason.SlogExpectedField(),
initiator.SlogField(),
slog.Error(retErr),
)
}()
a.logger.Info(ctx, "connected to derp map RPC")
for {
@@ -2260,9 +2300,20 @@ lifecycleWaitLoop:
// Wait for graceful disconnect from the Coordinator RPC
select {
case <-a.hardCtx.Done():
a.logger.Warn(context.Background(), "timed out waiting for Coordinator RPC disconnect")
a.logger.Warn(context.Background(), "timed out waiting for Coordinator RPC disconnect",
codersdk.ConnectionDirectionServerToAgent.SlogField(),
codersdk.DisconnectReasonServerShutdown.SlogField(),
codersdk.DisconnectReasonServerShutdown.SlogExpectedField(),
codersdk.DisconnectInitiatorAgent.SlogField(),
codersdk.SlogDisconnectDetail("timed out waiting for coordinator RPC to disconnect"),
)
case <-coordDisconnected:
a.logger.Debug(context.Background(), "coordinator RPC disconnected")
a.logger.Debug(context.Background(), "coordinator RPC disconnected",
codersdk.ConnectionDirectionServerToAgent.SlogField(),
codersdk.DisconnectReasonServerShutdown.SlogField(),
codersdk.DisconnectReasonServerShutdown.SlogExpectedField(),
codersdk.DisconnectInitiatorAgent.SlogField(),
)
}
// Wait for logs to be sent
+56
View File
@@ -1,17 +1,20 @@
package agent
import (
"context"
"path/filepath"
"runtime"
"testing"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"cdr.dev/slog/v3/sloggers/slogtest"
"github.com/coder/coder/v2/agent/agentcontextconfig"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/codersdk"
agentsdk "github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/testutil"
)
@@ -86,3 +89,56 @@ func TestContextConfigAPI_InitOnce(t *testing.T) {
require.NotEmpty(t, mcpFiles2)
require.Contains(t, mcpFiles2[0], dir2)
}
func TestClassifyCoordinatorRPCExit(t *testing.T) {
t.Parallel()
canceled, cancel := context.WithCancel(context.Background())
cancel()
cases := []struct {
name string
ctx context.Context
retErr error
reason codersdk.DisconnectReason
initiator codersdk.DisconnectInitiator
}{
{
name: "local shutdown, no error",
ctx: canceled,
retErr: nil,
reason: codersdk.DisconnectReasonServerShutdown,
initiator: codersdk.DisconnectInitiatorAgent,
},
{
name: "local shutdown, with cleanup error",
ctx: canceled,
retErr: xerrors.New("close timed out"),
reason: codersdk.DisconnectReasonServerShutdown,
initiator: codersdk.DisconnectInitiatorAgent,
},
{
name: "remote graceful, no error",
ctx: context.Background(),
retErr: nil,
reason: codersdk.DisconnectReasonGraceful,
initiator: codersdk.DisconnectInitiatorServer,
},
{
name: "stream broke unexpectedly",
ctx: context.Background(),
retErr: xerrors.New("read: connection reset"),
reason: codersdk.DisconnectReasonNetworkError,
initiator: codersdk.DisconnectInitiatorNetwork,
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
reason, initiator := classifyCoordinatorRPCExit(tc.ctx, tc.retErr)
require.Equal(t, tc.reason, reason)
require.Equal(t, tc.initiator, initiator)
})
}
}
+11 -4
View File
@@ -462,17 +462,23 @@ func (s *Server) sessionHandler(session ssh.Session) {
logger.Warn(ctx, "invalid magic ssh session type specified", slog.F("raw_type", magicTypeRaw))
}
closeCause := func(string) {}
closeCause := func(_ string) {}
if reportSession {
var reason string
closeCause = func(r string) { reason = r }
var reason codersdk.DisconnectReason
closeCause = func(r string) { reason = codersdk.DisconnectReason(r) }
scr := &sessionCloseTracker{Session: session}
session = scr
disconnected := s.config.ReportConnection(id, magicType, remoteAddrString)
defer func() {
disconnected(scr.exitCode(), reason)
logger.Info(ctx, "ssh session closed",
codersdk.ConnectionDirectionAgentToClient.SlogField(),
reason.SlogField(),
reason.SlogExpectedField(),
slog.F("exit_code", scr.exitCode()),
)
disconnected(scr.exitCode(), string(reason))
}()
}
@@ -567,6 +573,7 @@ func (s *Server) sessionHandler(session ssh.Session) {
_ = session.Exit(MagicSessionErrorCode)
return
}
closeCause(string(codersdk.DisconnectReasonGraceful))
logger.Info(ctx, "normal ssh session exit")
_ = session.Exit(0)
}
+7 -2
View File
@@ -11,6 +11,7 @@ import (
gossh "golang.org/x/crypto/ssh"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/codersdk"
)
// localForwardChannelData is copied from the ssh package.
@@ -85,9 +86,13 @@ func (w *JetbrainsChannelWatcher) Accept() (gossh.Channel, <-chan *gossh.Request
Channel: c,
done: func() {
w.jetbrainsCounter.Add(-1)
disconnected(0, "")
disconnected(0, "normal close")
// nolint: gocritic // JetBrains is a proper noun and should be capitalized
w.logger.Debug(context.Background(), "JetBrains watcher channel closed")
w.logger.Debug(context.Background(), "JetBrains channel closed",
codersdk.ConnectionDirectionAgentToClient.SlogField(),
codersdk.DisconnectReasonGraceful.SlogField(),
codersdk.DisconnectReasonGraceful.SlogExpectedField(),
)
},
}, r, err
}
+27 -8
View File
@@ -17,6 +17,7 @@ import (
"github.com/coder/coder/v2/agent/agentcontainers"
"github.com/coder/coder/v2/agent/agentssh"
"github.com/coder/coder/v2/agent/usershell"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
)
@@ -95,6 +96,11 @@ func (s *Server) Serve(ctx, hardCtx context.Context, l net.Listener) (retErr err
select {
case <-closed:
case <-hardCtx.Done():
clog.Info(hardCtx, "reconnecting pty closed",
codersdk.ConnectionDirectionAgentToClient.SlogField(),
codersdk.DisconnectReasonServerShutdown.SlogField(),
codersdk.DisconnectReasonServerShutdown.SlogExpectedField(),
)
disconnected(1, "server shut down")
_ = conn.Close()
}
@@ -104,15 +110,28 @@ func (s *Server) Serve(ctx, hardCtx context.Context, l net.Listener) (retErr err
defer close(closed)
defer wg.Done()
err := s.handleConn(ctx, clog, conn)
if err != nil {
if ctx.Err() != nil {
disconnected(1, "server shutting down")
} else {
disconnected(1, err.Error())
}
} else {
disconnected(0, "")
var reason codersdk.DisconnectReason
var code int
var detail string
switch {
case err != nil && ctx.Err() != nil:
reason = codersdk.DisconnectReasonServerShutdown
code = 1
case err != nil:
reason = codersdk.DisconnectReasonNetworkError
detail = err.Error()
code = 1
default:
reason = codersdk.DisconnectReasonGraceful
}
clog.Info(ctx, "reconnecting pty closed",
codersdk.ConnectionDirectionAgentToClient.SlogField(),
reason.SlogField(),
reason.SlogExpectedField(),
codersdk.SlogDisconnectDetail(detail),
slog.F("exit_code", code),
)
disconnected(code, string(reason))
}()
}
wg.Wait()
+2
View File
@@ -529,6 +529,8 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
Logger: logger,
Coordinatee: coordinatee,
SendAcks: false, // we are a client, connecting to multiple agents
Initiator: codersdk.DisconnectInitiatorServer,
Direction: codersdk.ConnectionDirectionServerToAgent,
},
logger: logger,
tracer: tracer,
+228
View File
@@ -0,0 +1,228 @@
package codersdk
import "cdr.dev/slog/v3"
// SlogDisconnectDetail is the slog field for the free-form, human-readable
// detail string that supplements the structured reason. Use it for
// "exited with code 137" style information that does not fit a category.
func SlogDisconnectDetail(detail string) slog.Field {
return slog.F("disconnect_detail", detail)
}
// DisconnectReason categorizes why a workspace connection ended. It is
// emitted as a slog field at every disconnect log site so operators can
// filter and aggregate disconnects without parsing free-form reason
// strings.
//
// The set of values intentionally stays small. Use DisconnectReasonUnknown
// when no other value applies; do not invent ad-hoc strings. Add a new
// constant here (and update its godoc) when a new disconnect class is
// genuinely distinct from the existing ones.
type DisconnectReason string
func (r DisconnectReason) SlogField() slog.Field {
if r == "" {
return slog.F("disconnect_reason", "unknown")
}
return slog.F("disconnect_reason", r)
}
func (r DisconnectReason) SlogExpectedField() slog.Field {
return slog.F("disconnect_expected", r.Expected())
}
const (
// DisconnectReasonUnknown is the zero value. Use it when the disconnect
// path cannot determine a more specific reason. Treat any disconnect
// logged with this value as a bug to investigate, not as a normal
// outcome.
DisconnectReasonUnknown DisconnectReason = ""
// DisconnectReasonGraceful indicates the connection ended cleanly: the
// remote side acknowledged a Disconnect message, an SSH session exited
// with status 0, or a PTY closed without error. This is the expected
// "happy path" outcome.
DisconnectReasonGraceful DisconnectReason = "graceful"
// DisconnectReasonClientClosed indicates the client side closed the
// connection without an error (for example, the user closed their
// terminal or quit their IDE). The session ran to a natural end from
// the client's perspective.
DisconnectReasonClientClosed DisconnectReason = "client_closed"
// DisconnectReasonServerShutdown indicates the workspace agent or
// coderd is shutting down and is closing connections as part of that
// shutdown. The connection itself was healthy; the process is going
// away.
DisconnectReasonServerShutdown DisconnectReason = "server_shutdown"
// DisconnectReasonNetworkError indicates the transport failed: an EOF,
// a read or write error, a context cancellation caused by a timeout,
// or a similar I/O failure. The connection did not end cleanly.
DisconnectReasonNetworkError DisconnectReason = "network_error"
// DisconnectReasonProtocolError indicates a dRPC, SSH, or tailnet
// protocol violation by the peer. Distinct from network errors because
// the bytes flowed but the contents were unparsable or unexpected.
DisconnectReasonProtocolError DisconnectReason = "protocol_error"
// DisconnectReasonWorkspaceStopped indicates the workspace itself was
// stopped or deleted while the connection was open, so coderd closed
// outstanding sessions on its behalf.
DisconnectReasonWorkspaceStopped DisconnectReason = "workspace_stopped"
// DisconnectReasonControlPlaneLost indicates the agent or client lost
// its coordination RPC to coderd. The data plane (peer-to-peer or
// DERP) may still be functional; this records the control plane
// outcome specifically.
DisconnectReasonControlPlaneLost DisconnectReason = "control_plane_lost"
)
// Valid reports whether r is a known DisconnectReason. The zero value
// (DisconnectReasonUnknown) is considered valid since it is the explicit
// "no information" reason.
func (r DisconnectReason) Valid() bool {
switch r {
case DisconnectReasonUnknown,
DisconnectReasonGraceful,
DisconnectReasonClientClosed,
DisconnectReasonServerShutdown,
DisconnectReasonNetworkError,
DisconnectReasonProtocolError,
DisconnectReasonWorkspaceStopped,
DisconnectReasonControlPlaneLost:
return true
default:
return false
}
}
// Expected reports whether a disconnect with this reason is part of
// normal operation. Operators can use this to split dashboards or alerts
// into "expected" and "investigate" buckets without enumerating every
// reason.
func (r DisconnectReason) Expected() bool {
switch r {
case DisconnectReasonGraceful,
DisconnectReasonClientClosed,
DisconnectReasonServerShutdown,
DisconnectReasonWorkspaceStopped:
return true
case DisconnectReasonUnknown,
DisconnectReasonNetworkError,
DisconnectReasonProtocolError,
DisconnectReasonControlPlaneLost:
return false
default:
// Unknown reason values are treated as not expected so that
// new emit sites that forget to classify themselves surface
// in the "investigate" bucket by default.
return false
}
}
// DisconnectInitiator identifies which side caused the disconnect. It
// pairs with DisconnectReason: the reason describes what happened, the
// initiator describes who started it.
type DisconnectInitiator string
const (
// DisconnectInitiatorUnknown means the disconnect site cannot
// attribute the close to a specific side. Avoid this where possible.
DisconnectInitiatorUnknown DisconnectInitiator = ""
// DisconnectInitiatorClient means the user-facing side (CLI, VS Code
// extension, JetBrains plugin, Coder Desktop) closed the connection.
DisconnectInitiatorClient DisconnectInitiator = "client"
// DisconnectInitiatorAgent means the workspace agent closed the
// connection.
DisconnectInitiatorAgent DisconnectInitiator = "agent"
// DisconnectInitiatorServer means coderd (or a workspace proxy)
// closed the connection.
DisconnectInitiatorServer DisconnectInitiator = "server"
// DisconnectInitiatorNetwork means an underlying network or transport
// fault caused the close. Neither end deliberately initiated it.
DisconnectInitiatorNetwork DisconnectInitiator = "network"
)
func (i DisconnectInitiator) SlogField() slog.Field {
return slog.F("disconnect_initiator", i)
}
// Valid reports whether i is a known DisconnectInitiator.
func (i DisconnectInitiator) Valid() bool {
switch i {
case DisconnectInitiatorUnknown,
DisconnectInitiatorClient,
DisconnectInitiatorAgent,
DisconnectInitiatorServer,
DisconnectInitiatorNetwork:
return true
default:
return false
}
}
// ConnectionDirection identifies which layer a disconnect log belongs to.
// It tells operators at a glance whether a log is about the control plane
// (server to agent) or the data plane (agent to client).
type ConnectionDirection string
func (d ConnectionDirection) SlogField() slog.Field {
return slog.F("connect_type", d)
}
const (
// ConnectionDirectionServerToAgent is the control-plane connection
// between coderd and the workspace agent (coordination RPC, DERP map
// subscriber, agent runLoop).
ConnectionDirectionServerToAgent ConnectionDirection = "server_to_agent"
// ConnectionDirectionAgentToClient is a data-plane session between
// the workspace agent and a user's client (SSH, reconnecting PTY,
// JetBrains port-forwarding).
ConnectionDirectionAgentToClient ConnectionDirection = "agent_to_client"
// ConnectionDirectionClientToServer is a connection from a user's
// client to coderd (e.g. the CLI's WebSocket to the coordinator).
// Not yet instrumented.
ConnectionDirectionClientToServer ConnectionDirection = "client_to_server"
)
// ConnectionMethod describes the network path a workspace connection
// took at the moment a disconnect log was emitted. It is intended for
// observability only; do not switch behavior on it.
type ConnectionMethod string
func (m ConnectionMethod) SlogField() slog.Field {
return slog.F("connection_method", m)
}
const (
// ConnectionMethodUnknown means the disconnect site does not have
// the information to determine the connection path.
ConnectionMethodUnknown ConnectionMethod = ""
// ConnectionMethodDirect means the peers were communicating over a
// direct, peer-to-peer connection (NAT-traversed via STUN).
ConnectionMethodDirect ConnectionMethod = "direct"
// ConnectionMethodDERP means the peers were communicating through a
// DERP relay rather than directly.
ConnectionMethodDERP ConnectionMethod = "derp"
)
// Valid reports whether m is a known ConnectionMethod.
func (m ConnectionMethod) Valid() bool {
switch m {
case ConnectionMethodUnknown,
ConnectionMethodDirect,
ConnectionMethodDERP:
return true
default:
return false
}
}
+94
View File
@@ -0,0 +1,94 @@
package codersdk_test
import (
"testing"
"github.com/stretchr/testify/require"
"github.com/coder/coder/v2/codersdk"
)
func TestDisconnectReason_Valid(t *testing.T) {
t.Parallel()
cases := []struct {
reason codersdk.DisconnectReason
valid bool
}{
{codersdk.DisconnectReasonUnknown, true},
{codersdk.DisconnectReasonGraceful, true},
{codersdk.DisconnectReasonClientClosed, true},
{codersdk.DisconnectReasonServerShutdown, true},
{codersdk.DisconnectReasonNetworkError, true},
{codersdk.DisconnectReasonProtocolError, true},
{codersdk.DisconnectReasonWorkspaceStopped, true},
{codersdk.DisconnectReasonControlPlaneLost, true},
{codersdk.DisconnectReason("not_a_real_reason"), false},
}
for _, c := range cases {
require.Equal(t, c.valid, c.reason.Valid(), "reason=%q", c.reason)
}
}
func TestDisconnectReason_Expected(t *testing.T) {
t.Parallel()
expected := map[codersdk.DisconnectReason]bool{
codersdk.DisconnectReasonGraceful: true,
codersdk.DisconnectReasonClientClosed: true,
codersdk.DisconnectReasonServerShutdown: true,
codersdk.DisconnectReasonWorkspaceStopped: true,
codersdk.DisconnectReasonUnknown: false,
codersdk.DisconnectReasonNetworkError: false,
codersdk.DisconnectReasonProtocolError: false,
codersdk.DisconnectReasonControlPlaneLost: false,
}
for reason, want := range expected {
require.Equal(t, want, reason.Expected(), "reason=%q", reason)
}
// Unknown values default to not-expected so that uncategorized
// emit sites surface in the "investigate" bucket.
require.False(t, codersdk.DisconnectReason("not_a_real_reason").Expected())
}
func TestDisconnectInitiator_Valid(t *testing.T) {
t.Parallel()
cases := []struct {
initiator codersdk.DisconnectInitiator
valid bool
}{
{codersdk.DisconnectInitiatorUnknown, true},
{codersdk.DisconnectInitiatorClient, true},
{codersdk.DisconnectInitiatorAgent, true},
{codersdk.DisconnectInitiatorServer, true},
{codersdk.DisconnectInitiatorNetwork, true},
{codersdk.DisconnectInitiator("nobody"), false},
}
for _, c := range cases {
require.Equal(t, c.valid, c.initiator.Valid(), "initiator=%q", c.initiator)
}
}
func TestConnectionMethod_Valid(t *testing.T) {
t.Parallel()
cases := []struct {
method codersdk.ConnectionMethod
valid bool
}{
{codersdk.ConnectionMethodUnknown, true},
{codersdk.ConnectionMethodDirect, true},
{codersdk.ConnectionMethodDERP, true},
{codersdk.ConnectionMethod("magic"), false},
}
for _, c := range cases {
require.Equal(t, c.valid, c.method.Valid(), "method=%q", c.method)
}
}
+55
View File
@@ -2998,6 +2998,18 @@ export interface ChatWorkspaceTTLResponse {
*/
export const CoderDesktopTelemetryHeader = "Coder-Desktop-Telemetry";
// From codersdk/disconnect.go
export type ConnectionDirection =
| "agent_to_client"
| "client_to_server"
| "server_to_agent";
export const ConnectionDirections: ConnectionDirection[] = [
"agent_to_client",
"client_to_server",
"server_to_agent",
];
// From codersdk/insights.go
/**
* ConnectionLatency shows the latency for a connection.
@@ -3089,6 +3101,11 @@ export interface ConnectionLogsRequest extends Pagination {
readonly q?: string;
}
// From codersdk/disconnect.go
export type ConnectionMethod = "derp" | "direct" | "";
export const ConnectionMethods: ConnectionMethod[] = ["derp", "direct", ""];
// From codersdk/connectionlog.go
export type ConnectionType =
| "jetbrains"
@@ -4032,6 +4049,44 @@ export const DiagnosticSeverityStrings: DiagnosticSeverityString[] = [
"warning",
];
// From codersdk/disconnect.go
export type DisconnectInitiator =
| "agent"
| "client"
| "network"
| "server"
| "";
export const DisconnectInitiators: DisconnectInitiator[] = [
"agent",
"client",
"network",
"server",
"",
];
// From codersdk/disconnect.go
export type DisconnectReason =
| "client_closed"
| "control_plane_lost"
| "graceful"
| "network_error"
| "protocol_error"
| "server_shutdown"
| ""
| "workspace_stopped";
export const DisconnectReasons: DisconnectReason[] = [
"client_closed",
"control_plane_lost",
"graceful",
"network_error",
"protocol_error",
"server_shutdown",
"",
"workspace_stopped",
];
// From codersdk/workspaceagents.go
export type DisplayApp =
| "port_forwarding_helper"
+57 -5
View File
@@ -154,6 +154,17 @@ type BasicCoordinationController struct {
Logger slog.Logger
Coordinatee Coordinatee
SendAcks bool
// Initiator labels which side of the coordination this controller
// represents (e.g. "agent", "client", "server"). It is attached as
// a slog field to disconnect-related logs emitted by this
// coordination so operators can attribute connection-close events
// without inspecting call sites. Leave unset on synthetic
// controllers (tests, in-memory fakes) and the field will be
// omitted from log output.
Initiator codersdk.DisconnectInitiator
// Direction labels the connection layer (server_to_agent,
// agent_to_client, client_to_server) for disconnect logs.
Direction codersdk.ConnectionDirection
}
// New satisfies the method on the CoordinationController interface
@@ -170,6 +181,8 @@ func (c *BasicCoordinationController) NewCoordination(client CoordinatorClient)
client: client,
respLoopDone: make(chan struct{}),
sendAcks: c.SendAcks,
initiator: c.Initiator,
direction: c.Direction,
}
c.Coordinatee.SetNodeCallback(func(node *Node) {
@@ -211,6 +224,8 @@ type BasicCoordination struct {
client CoordinatorClient
respLoopDone chan struct{}
sendAcks bool
initiator codersdk.DisconnectInitiator
direction codersdk.ConnectionDirection
}
// CloseClient forcibly closes the underlying coordinator client connection
@@ -243,11 +258,24 @@ func (c *BasicCoordination) Close(ctx context.Context) (retErr error) {
err := c.client.Send(&proto.CoordinateRequest{Disconnect: &proto.CoordinateRequest_Disconnect{}})
c.Unlock()
if err != nil && !xerrors.Is(err, io.EOF) {
reason := codersdk.DisconnectReasonNetworkError
// Log but don't return early; we must still clean up below.
c.logger.Warn(context.Background(), "failed to send disconnect", slog.Error(err))
c.logger.Warn(context.Background(), "failed to send disconnect",
c.direction.SlogField(),
reason.SlogExpectedField(),
reason.SlogField(),
c.initiator.SlogField(),
slog.Error(err),
)
retErr = xerrors.Errorf("send disconnect: %w", err)
} else {
c.logger.Debug(context.Background(), "sent disconnect")
reason := codersdk.DisconnectReasonGraceful
c.logger.Debug(context.Background(), "sent disconnect",
c.direction.SlogField(),
reason.SlogExpectedField(),
reason.SlogField(),
c.initiator.SlogField(),
)
}
// We shouldn't just close the protocol right away, because the way dRPC streams work is
@@ -256,10 +284,23 @@ func (c *BasicCoordination) Close(ctx context.Context) (retErr error) {
// Disconnect message, so we should wait around for that until the context expires.
select {
case <-c.respLoopDone:
c.logger.Debug(ctx, "responses closed after disconnect")
reason := codersdk.DisconnectReasonGraceful
c.logger.Debug(ctx, "responses closed after disconnect",
c.direction.SlogField(),
reason.SlogExpectedField(),
reason.SlogField(),
c.initiator.SlogField(),
)
return retErr
case <-ctx.Done():
c.logger.Warn(ctx, "context expired while waiting for coordinate responses to close")
reason := codersdk.DisconnectReasonNetworkError
c.logger.Warn(ctx, "context expired while waiting for coordinate responses to close",
c.direction.SlogField(),
reason.SlogExpectedField(),
reason.SlogField(),
c.initiator.SlogField(),
codersdk.SlogDisconnectDetail("context expired before coordinator hung up"),
)
}
// forcefully close the stream
protoErr := c.client.Close()
@@ -369,6 +410,8 @@ func NewTunnelSrcCoordController(
Logger: logger,
Coordinatee: coordinatee,
SendAcks: false,
Initiator: codersdk.DisconnectInitiatorClient,
Direction: codersdk.ConnectionDirectionClientToServer,
},
dests: make(map[uuid.UUID]struct{}),
}
@@ -508,6 +551,8 @@ func NewAgentCoordinationController(
Logger: logger,
Coordinatee: coordinatee,
SendAcks: true,
Initiator: codersdk.DisconnectInitiatorAgent,
Direction: codersdk.ConnectionDirectionServerToAgent,
}
}
@@ -1505,7 +1550,14 @@ func (c *Controller) Run(ctx context.Context) {
}
if errors.Is(err, net.ErrClosed) {
c.logger.Warn(c.ctx, "control plane connection closed, retrying", slog.Error(err))
reason := codersdk.DisconnectReasonNetworkError
c.logger.Warn(c.ctx, "control plane connection closed, retrying",
codersdk.ConnectionDirectionServerToAgent.SlogField(),
reason.SlogField(),
reason.SlogExpectedField(),
codersdk.DisconnectInitiatorNetwork.SlogField(),
slog.Error(err),
)
continue
}