mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: structured disconnect attribution for agent logs (#25191)
Implements [PLAT-60](https://linear.app/codercom/issue/PLAT-60/enhance-disconnect-logs-with-structured-reason-attribution): adds structured disconnect attribution to disconnect logs throughout the agent and tailnet packages. Every disconnect log site now carries structured slog fields. All existing logs remain; existing messages are preserved with the fields added alongside. New fields on disconnect log lines: - `connect_type` — which layer disconnected: `server_to_agent`, `agent_to_client`, or `client_to_server` - `disconnect_reason` — categorical reason: `graceful`, `network_error`, `server_shutdown`, etc. - `disconnect_expected` — whether the disconnect is normal operation (`true`) or should be investigated (`false`) - `disconnect_initiator` — who started it: `client`, `agent`, `server`, or `network` (control-plane sites only) - `disconnect_detail` — free-form supplemental info (where useful) ## What's covered **Control plane (`server_to_agent`):** coordination RPC, DERP map subscriber, agent runLoop, agent Close, `BasicCoordination.Close`, `Controller.run`. **Data plane (`agent_to_client`):** SSH sessions, reconnecting PTY, JetBrains port-forwarding. <details> <summary>Control-plane sites</summary> | Site | Reason | Initiator | |---|---|---| | `agent/agent.go` `runLoop` EOF | `network_error` | `network` | | `agent/agent.go` `runCoordinator` deferred exit | `server_shutdown` / `graceful` / `network_error` | `agent` / `server` / `network` | | `agent/agent.go` `runDERPMapSubscriber` deferred exit | same (shared `classifyCoordinatorRPCExit`) | same | | `agent/agent.go` `Close` shutdown timeout | `server_shutdown` + detail | `agent` | | `agent/agent.go` `Close` clean coord disconnect | `server_shutdown` | `agent` | | `tailnet/controllers.go` `BasicCoordination.Close` | `graceful` or `network_error` | `c.initiator` | | `tailnet/controllers.go` `Controller.run` `net.ErrClosed` | `network_error` | `network` | </details> <details> <summary>Data-plane sites</summary> | Site | Reason | Notes | |---|---|---| | `agent/agentssh/agentssh.go` SSH session closed | free-form (`graceful`, `process exited with error status: N`, etc.) | Also sets `closeCause("normal exit")` for clean exits so coderd's `connection_log.DisconnectReason` is no longer empty | | `agent/reconnectingpty/server.go` PTY closed | `server_shutdown`, error string, or `graceful` | | | `agent/agentssh/jetbrainstrack.go` channel closed | `normal close` or error string | Previously passed empty reason | </details> <details> <summary>Bug fix</summary> The deferred `disconnected from coordination RPC` log no longer fires when the initial `Coordinate()` RPC call fails before any connection is established. </details> Refs PLAT-60. --- _This PR was prepared by Coder Agents on behalf of @Emyrk._ **Manually QA'd a lot of common disconnects** --------- Co-authored-by: Coder Agents <noreply@coder.com>
This commit is contained in:
+58
-7
@@ -514,7 +514,12 @@ func (a *agent) runLoop() {
|
||||
return
|
||||
}
|
||||
if errors.Is(err, io.EOF) {
|
||||
a.logger.Info(ctx, "disconnected from coderd")
|
||||
a.logger.Info(ctx, "disconnected from coderd",
|
||||
codersdk.ConnectionDirectionServerToAgent.SlogField(),
|
||||
codersdk.DisconnectReasonNetworkError.SlogField(),
|
||||
codersdk.DisconnectReasonNetworkError.SlogExpectedField(),
|
||||
codersdk.DisconnectInitiatorNetwork.SlogField(),
|
||||
)
|
||||
continue
|
||||
}
|
||||
a.logger.Warn(ctx, "run exited with error", slog.Error(err))
|
||||
@@ -1878,16 +1883,43 @@ func (a *agent) createTailnet(
|
||||
return network, nil
|
||||
}
|
||||
|
||||
// classifyCoordinatorRPCExit determines the DisconnectReason and
|
||||
// DisconnectInitiator for a coordinator-style RPC (the coordination RPC
|
||||
// and the DERP map subscriber RPC) that has just returned. A canceled
|
||||
// local context means the agent itself is shutting down. A non-nil
|
||||
// return error without context cancellation means the stream broke
|
||||
// unexpectedly.
|
||||
func classifyCoordinatorRPCExit(ctx context.Context, retErr error) (codersdk.DisconnectReason, codersdk.DisconnectInitiator) {
|
||||
localShutdown := ctx.Err() != nil
|
||||
switch {
|
||||
case localShutdown:
|
||||
return codersdk.DisconnectReasonServerShutdown, codersdk.DisconnectInitiatorAgent
|
||||
case retErr == nil:
|
||||
return codersdk.DisconnectReasonGraceful, codersdk.DisconnectInitiatorServer
|
||||
default:
|
||||
return codersdk.DisconnectReasonNetworkError, codersdk.DisconnectInitiatorNetwork
|
||||
}
|
||||
}
|
||||
|
||||
// runCoordinator runs a coordinator and returns whether a reconnect
|
||||
// should occur.
|
||||
func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) error {
|
||||
defer a.logger.Debug(ctx, "disconnected from coordination RPC")
|
||||
func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) (retErr error) {
|
||||
// we run the RPC on the hardCtx so that we have a chance to send the disconnect message if we
|
||||
// gracefully shut down.
|
||||
coordinate, err := tClient.Coordinate(a.hardCtx)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to connect to the coordinate endpoint: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
reason, initiator := classifyCoordinatorRPCExit(ctx, retErr)
|
||||
a.logger.Debug(ctx, "disconnected from coordination RPC",
|
||||
codersdk.ConnectionDirectionServerToAgent.SlogField(),
|
||||
reason.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
initiator.SlogField(),
|
||||
slog.Error(retErr),
|
||||
)
|
||||
}()
|
||||
defer func() {
|
||||
cErr := coordinate.Close()
|
||||
if cErr != nil {
|
||||
@@ -1935,8 +1967,7 @@ func (a *agent) setCoordDisconnected() chan struct{} {
|
||||
}
|
||||
|
||||
// runDERPMapSubscriber runs a coordinator and returns if a reconnect should occur.
|
||||
func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) error {
|
||||
defer a.logger.Debug(ctx, "disconnected from derp map RPC")
|
||||
func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) (retErr error) {
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
stream, err := tClient.StreamDERPMaps(ctx, &tailnetproto.StreamDERPMapsRequest{})
|
||||
@@ -1948,6 +1979,15 @@ func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.D
|
||||
if cErr != nil {
|
||||
a.logger.Debug(ctx, "error closing DERPMap stream", slog.Error(err))
|
||||
}
|
||||
|
||||
reason, initiator := classifyCoordinatorRPCExit(ctx, retErr)
|
||||
a.logger.Debug(ctx, "disconnected from derp map RPC",
|
||||
codersdk.ConnectionDirectionServerToAgent.SlogField(),
|
||||
reason.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
initiator.SlogField(),
|
||||
slog.Error(retErr),
|
||||
)
|
||||
}()
|
||||
a.logger.Info(ctx, "connected to derp map RPC")
|
||||
for {
|
||||
@@ -2260,9 +2300,20 @@ lifecycleWaitLoop:
|
||||
// Wait for graceful disconnect from the Coordinator RPC
|
||||
select {
|
||||
case <-a.hardCtx.Done():
|
||||
a.logger.Warn(context.Background(), "timed out waiting for Coordinator RPC disconnect")
|
||||
a.logger.Warn(context.Background(), "timed out waiting for Coordinator RPC disconnect",
|
||||
codersdk.ConnectionDirectionServerToAgent.SlogField(),
|
||||
codersdk.DisconnectReasonServerShutdown.SlogField(),
|
||||
codersdk.DisconnectReasonServerShutdown.SlogExpectedField(),
|
||||
codersdk.DisconnectInitiatorAgent.SlogField(),
|
||||
codersdk.SlogDisconnectDetail("timed out waiting for coordinator RPC to disconnect"),
|
||||
)
|
||||
case <-coordDisconnected:
|
||||
a.logger.Debug(context.Background(), "coordinator RPC disconnected")
|
||||
a.logger.Debug(context.Background(), "coordinator RPC disconnected",
|
||||
codersdk.ConnectionDirectionServerToAgent.SlogField(),
|
||||
codersdk.DisconnectReasonServerShutdown.SlogField(),
|
||||
codersdk.DisconnectReasonServerShutdown.SlogExpectedField(),
|
||||
codersdk.DisconnectInitiatorAgent.SlogField(),
|
||||
)
|
||||
}
|
||||
|
||||
// Wait for logs to be sent
|
||||
|
||||
@@ -1,17 +1,20 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"testing"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"cdr.dev/slog/v3"
|
||||
"cdr.dev/slog/v3/sloggers/slogtest"
|
||||
"github.com/coder/coder/v2/agent/agentcontextconfig"
|
||||
"github.com/coder/coder/v2/agent/proto"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
agentsdk "github.com/coder/coder/v2/codersdk/agentsdk"
|
||||
"github.com/coder/coder/v2/testutil"
|
||||
)
|
||||
@@ -86,3 +89,56 @@ func TestContextConfigAPI_InitOnce(t *testing.T) {
|
||||
require.NotEmpty(t, mcpFiles2)
|
||||
require.Contains(t, mcpFiles2[0], dir2)
|
||||
}
|
||||
|
||||
func TestClassifyCoordinatorRPCExit(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
canceled, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
ctx context.Context
|
||||
retErr error
|
||||
reason codersdk.DisconnectReason
|
||||
initiator codersdk.DisconnectInitiator
|
||||
}{
|
||||
{
|
||||
name: "local shutdown, no error",
|
||||
ctx: canceled,
|
||||
retErr: nil,
|
||||
reason: codersdk.DisconnectReasonServerShutdown,
|
||||
initiator: codersdk.DisconnectInitiatorAgent,
|
||||
},
|
||||
{
|
||||
name: "local shutdown, with cleanup error",
|
||||
ctx: canceled,
|
||||
retErr: xerrors.New("close timed out"),
|
||||
reason: codersdk.DisconnectReasonServerShutdown,
|
||||
initiator: codersdk.DisconnectInitiatorAgent,
|
||||
},
|
||||
{
|
||||
name: "remote graceful, no error",
|
||||
ctx: context.Background(),
|
||||
retErr: nil,
|
||||
reason: codersdk.DisconnectReasonGraceful,
|
||||
initiator: codersdk.DisconnectInitiatorServer,
|
||||
},
|
||||
{
|
||||
name: "stream broke unexpectedly",
|
||||
ctx: context.Background(),
|
||||
retErr: xerrors.New("read: connection reset"),
|
||||
reason: codersdk.DisconnectReasonNetworkError,
|
||||
initiator: codersdk.DisconnectInitiatorNetwork,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
reason, initiator := classifyCoordinatorRPCExit(tc.ctx, tc.retErr)
|
||||
require.Equal(t, tc.reason, reason)
|
||||
require.Equal(t, tc.initiator, initiator)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -462,17 +462,23 @@ func (s *Server) sessionHandler(session ssh.Session) {
|
||||
logger.Warn(ctx, "invalid magic ssh session type specified", slog.F("raw_type", magicTypeRaw))
|
||||
}
|
||||
|
||||
closeCause := func(string) {}
|
||||
closeCause := func(_ string) {}
|
||||
if reportSession {
|
||||
var reason string
|
||||
closeCause = func(r string) { reason = r }
|
||||
var reason codersdk.DisconnectReason
|
||||
closeCause = func(r string) { reason = codersdk.DisconnectReason(r) }
|
||||
|
||||
scr := &sessionCloseTracker{Session: session}
|
||||
session = scr
|
||||
|
||||
disconnected := s.config.ReportConnection(id, magicType, remoteAddrString)
|
||||
defer func() {
|
||||
disconnected(scr.exitCode(), reason)
|
||||
logger.Info(ctx, "ssh session closed",
|
||||
codersdk.ConnectionDirectionAgentToClient.SlogField(),
|
||||
reason.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
slog.F("exit_code", scr.exitCode()),
|
||||
)
|
||||
disconnected(scr.exitCode(), string(reason))
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -567,6 +573,7 @@ func (s *Server) sessionHandler(session ssh.Session) {
|
||||
_ = session.Exit(MagicSessionErrorCode)
|
||||
return
|
||||
}
|
||||
closeCause(string(codersdk.DisconnectReasonGraceful))
|
||||
logger.Info(ctx, "normal ssh session exit")
|
||||
_ = session.Exit(0)
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
gossh "golang.org/x/crypto/ssh"
|
||||
|
||||
"cdr.dev/slog/v3"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
)
|
||||
|
||||
// localForwardChannelData is copied from the ssh package.
|
||||
@@ -85,9 +86,13 @@ func (w *JetbrainsChannelWatcher) Accept() (gossh.Channel, <-chan *gossh.Request
|
||||
Channel: c,
|
||||
done: func() {
|
||||
w.jetbrainsCounter.Add(-1)
|
||||
disconnected(0, "")
|
||||
disconnected(0, "normal close")
|
||||
// nolint: gocritic // JetBrains is a proper noun and should be capitalized
|
||||
w.logger.Debug(context.Background(), "JetBrains watcher channel closed")
|
||||
w.logger.Debug(context.Background(), "JetBrains channel closed",
|
||||
codersdk.ConnectionDirectionAgentToClient.SlogField(),
|
||||
codersdk.DisconnectReasonGraceful.SlogField(),
|
||||
codersdk.DisconnectReasonGraceful.SlogExpectedField(),
|
||||
)
|
||||
},
|
||||
}, r, err
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/coder/coder/v2/agent/agentcontainers"
|
||||
"github.com/coder/coder/v2/agent/agentssh"
|
||||
"github.com/coder/coder/v2/agent/usershell"
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
"github.com/coder/coder/v2/codersdk/workspacesdk"
|
||||
)
|
||||
|
||||
@@ -95,6 +96,11 @@ func (s *Server) Serve(ctx, hardCtx context.Context, l net.Listener) (retErr err
|
||||
select {
|
||||
case <-closed:
|
||||
case <-hardCtx.Done():
|
||||
clog.Info(hardCtx, "reconnecting pty closed",
|
||||
codersdk.ConnectionDirectionAgentToClient.SlogField(),
|
||||
codersdk.DisconnectReasonServerShutdown.SlogField(),
|
||||
codersdk.DisconnectReasonServerShutdown.SlogExpectedField(),
|
||||
)
|
||||
disconnected(1, "server shut down")
|
||||
_ = conn.Close()
|
||||
}
|
||||
@@ -104,15 +110,28 @@ func (s *Server) Serve(ctx, hardCtx context.Context, l net.Listener) (retErr err
|
||||
defer close(closed)
|
||||
defer wg.Done()
|
||||
err := s.handleConn(ctx, clog, conn)
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
disconnected(1, "server shutting down")
|
||||
} else {
|
||||
disconnected(1, err.Error())
|
||||
}
|
||||
} else {
|
||||
disconnected(0, "")
|
||||
var reason codersdk.DisconnectReason
|
||||
var code int
|
||||
var detail string
|
||||
switch {
|
||||
case err != nil && ctx.Err() != nil:
|
||||
reason = codersdk.DisconnectReasonServerShutdown
|
||||
code = 1
|
||||
case err != nil:
|
||||
reason = codersdk.DisconnectReasonNetworkError
|
||||
detail = err.Error()
|
||||
code = 1
|
||||
default:
|
||||
reason = codersdk.DisconnectReasonGraceful
|
||||
}
|
||||
clog.Info(ctx, "reconnecting pty closed",
|
||||
codersdk.ConnectionDirectionAgentToClient.SlogField(),
|
||||
reason.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
codersdk.SlogDisconnectDetail(detail),
|
||||
slog.F("exit_code", code),
|
||||
)
|
||||
disconnected(code, string(reason))
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
@@ -529,6 +529,8 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
|
||||
Logger: logger,
|
||||
Coordinatee: coordinatee,
|
||||
SendAcks: false, // we are a client, connecting to multiple agents
|
||||
Initiator: codersdk.DisconnectInitiatorServer,
|
||||
Direction: codersdk.ConnectionDirectionServerToAgent,
|
||||
},
|
||||
logger: logger,
|
||||
tracer: tracer,
|
||||
|
||||
@@ -0,0 +1,228 @@
|
||||
package codersdk
|
||||
|
||||
import "cdr.dev/slog/v3"
|
||||
|
||||
// SlogDisconnectDetail is the slog field for the free-form, human-readable
|
||||
// detail string that supplements the structured reason. Use it for
|
||||
// "exited with code 137" style information that does not fit a category.
|
||||
func SlogDisconnectDetail(detail string) slog.Field {
|
||||
return slog.F("disconnect_detail", detail)
|
||||
}
|
||||
|
||||
// DisconnectReason categorizes why a workspace connection ended. It is
|
||||
// emitted as a slog field at every disconnect log site so operators can
|
||||
// filter and aggregate disconnects without parsing free-form reason
|
||||
// strings.
|
||||
//
|
||||
// The set of values intentionally stays small. Use DisconnectReasonUnknown
|
||||
// when no other value applies; do not invent ad-hoc strings. Add a new
|
||||
// constant here (and update its godoc) when a new disconnect class is
|
||||
// genuinely distinct from the existing ones.
|
||||
type DisconnectReason string
|
||||
|
||||
func (r DisconnectReason) SlogField() slog.Field {
|
||||
if r == "" {
|
||||
return slog.F("disconnect_reason", "unknown")
|
||||
}
|
||||
return slog.F("disconnect_reason", r)
|
||||
}
|
||||
|
||||
func (r DisconnectReason) SlogExpectedField() slog.Field {
|
||||
return slog.F("disconnect_expected", r.Expected())
|
||||
}
|
||||
|
||||
const (
|
||||
// DisconnectReasonUnknown is the zero value. Use it when the disconnect
|
||||
// path cannot determine a more specific reason. Treat any disconnect
|
||||
// logged with this value as a bug to investigate, not as a normal
|
||||
// outcome.
|
||||
DisconnectReasonUnknown DisconnectReason = ""
|
||||
|
||||
// DisconnectReasonGraceful indicates the connection ended cleanly: the
|
||||
// remote side acknowledged a Disconnect message, an SSH session exited
|
||||
// with status 0, or a PTY closed without error. This is the expected
|
||||
// "happy path" outcome.
|
||||
DisconnectReasonGraceful DisconnectReason = "graceful"
|
||||
|
||||
// DisconnectReasonClientClosed indicates the client side closed the
|
||||
// connection without an error (for example, the user closed their
|
||||
// terminal or quit their IDE). The session ran to a natural end from
|
||||
// the client's perspective.
|
||||
DisconnectReasonClientClosed DisconnectReason = "client_closed"
|
||||
|
||||
// DisconnectReasonServerShutdown indicates the workspace agent or
|
||||
// coderd is shutting down and is closing connections as part of that
|
||||
// shutdown. The connection itself was healthy; the process is going
|
||||
// away.
|
||||
DisconnectReasonServerShutdown DisconnectReason = "server_shutdown"
|
||||
|
||||
// DisconnectReasonNetworkError indicates the transport failed: an EOF,
|
||||
// a read or write error, a context cancellation caused by a timeout,
|
||||
// or a similar I/O failure. The connection did not end cleanly.
|
||||
DisconnectReasonNetworkError DisconnectReason = "network_error"
|
||||
|
||||
// DisconnectReasonProtocolError indicates a dRPC, SSH, or tailnet
|
||||
// protocol violation by the peer. Distinct from network errors because
|
||||
// the bytes flowed but the contents were unparsable or unexpected.
|
||||
DisconnectReasonProtocolError DisconnectReason = "protocol_error"
|
||||
|
||||
// DisconnectReasonWorkspaceStopped indicates the workspace itself was
|
||||
// stopped or deleted while the connection was open, so coderd closed
|
||||
// outstanding sessions on its behalf.
|
||||
DisconnectReasonWorkspaceStopped DisconnectReason = "workspace_stopped"
|
||||
|
||||
// DisconnectReasonControlPlaneLost indicates the agent or client lost
|
||||
// its coordination RPC to coderd. The data plane (peer-to-peer or
|
||||
// DERP) may still be functional; this records the control plane
|
||||
// outcome specifically.
|
||||
DisconnectReasonControlPlaneLost DisconnectReason = "control_plane_lost"
|
||||
)
|
||||
|
||||
// Valid reports whether r is a known DisconnectReason. The zero value
|
||||
// (DisconnectReasonUnknown) is considered valid since it is the explicit
|
||||
// "no information" reason.
|
||||
func (r DisconnectReason) Valid() bool {
|
||||
switch r {
|
||||
case DisconnectReasonUnknown,
|
||||
DisconnectReasonGraceful,
|
||||
DisconnectReasonClientClosed,
|
||||
DisconnectReasonServerShutdown,
|
||||
DisconnectReasonNetworkError,
|
||||
DisconnectReasonProtocolError,
|
||||
DisconnectReasonWorkspaceStopped,
|
||||
DisconnectReasonControlPlaneLost:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Expected reports whether a disconnect with this reason is part of
|
||||
// normal operation. Operators can use this to split dashboards or alerts
|
||||
// into "expected" and "investigate" buckets without enumerating every
|
||||
// reason.
|
||||
func (r DisconnectReason) Expected() bool {
|
||||
switch r {
|
||||
case DisconnectReasonGraceful,
|
||||
DisconnectReasonClientClosed,
|
||||
DisconnectReasonServerShutdown,
|
||||
DisconnectReasonWorkspaceStopped:
|
||||
return true
|
||||
case DisconnectReasonUnknown,
|
||||
DisconnectReasonNetworkError,
|
||||
DisconnectReasonProtocolError,
|
||||
DisconnectReasonControlPlaneLost:
|
||||
return false
|
||||
default:
|
||||
// Unknown reason values are treated as not expected so that
|
||||
// new emit sites that forget to classify themselves surface
|
||||
// in the "investigate" bucket by default.
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// DisconnectInitiator identifies which side caused the disconnect. It
|
||||
// pairs with DisconnectReason: the reason describes what happened, the
|
||||
// initiator describes who started it.
|
||||
type DisconnectInitiator string
|
||||
|
||||
const (
|
||||
// DisconnectInitiatorUnknown means the disconnect site cannot
|
||||
// attribute the close to a specific side. Avoid this where possible.
|
||||
DisconnectInitiatorUnknown DisconnectInitiator = ""
|
||||
|
||||
// DisconnectInitiatorClient means the user-facing side (CLI, VS Code
|
||||
// extension, JetBrains plugin, Coder Desktop) closed the connection.
|
||||
DisconnectInitiatorClient DisconnectInitiator = "client"
|
||||
|
||||
// DisconnectInitiatorAgent means the workspace agent closed the
|
||||
// connection.
|
||||
DisconnectInitiatorAgent DisconnectInitiator = "agent"
|
||||
|
||||
// DisconnectInitiatorServer means coderd (or a workspace proxy)
|
||||
// closed the connection.
|
||||
DisconnectInitiatorServer DisconnectInitiator = "server"
|
||||
|
||||
// DisconnectInitiatorNetwork means an underlying network or transport
|
||||
// fault caused the close. Neither end deliberately initiated it.
|
||||
DisconnectInitiatorNetwork DisconnectInitiator = "network"
|
||||
)
|
||||
|
||||
func (i DisconnectInitiator) SlogField() slog.Field {
|
||||
return slog.F("disconnect_initiator", i)
|
||||
}
|
||||
|
||||
// Valid reports whether i is a known DisconnectInitiator.
|
||||
func (i DisconnectInitiator) Valid() bool {
|
||||
switch i {
|
||||
case DisconnectInitiatorUnknown,
|
||||
DisconnectInitiatorClient,
|
||||
DisconnectInitiatorAgent,
|
||||
DisconnectInitiatorServer,
|
||||
DisconnectInitiatorNetwork:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// ConnectionDirection identifies which layer a disconnect log belongs to.
|
||||
// It tells operators at a glance whether a log is about the control plane
|
||||
// (server to agent) or the data plane (agent to client).
|
||||
type ConnectionDirection string
|
||||
|
||||
func (d ConnectionDirection) SlogField() slog.Field {
|
||||
return slog.F("connect_type", d)
|
||||
}
|
||||
|
||||
const (
|
||||
// ConnectionDirectionServerToAgent is the control-plane connection
|
||||
// between coderd and the workspace agent (coordination RPC, DERP map
|
||||
// subscriber, agent runLoop).
|
||||
ConnectionDirectionServerToAgent ConnectionDirection = "server_to_agent"
|
||||
|
||||
// ConnectionDirectionAgentToClient is a data-plane session between
|
||||
// the workspace agent and a user's client (SSH, reconnecting PTY,
|
||||
// JetBrains port-forwarding).
|
||||
ConnectionDirectionAgentToClient ConnectionDirection = "agent_to_client"
|
||||
|
||||
// ConnectionDirectionClientToServer is a connection from a user's
|
||||
// client to coderd (e.g. the CLI's WebSocket to the coordinator).
|
||||
// Not yet instrumented.
|
||||
ConnectionDirectionClientToServer ConnectionDirection = "client_to_server"
|
||||
)
|
||||
|
||||
// ConnectionMethod describes the network path a workspace connection
|
||||
// took at the moment a disconnect log was emitted. It is intended for
|
||||
// observability only; do not switch behavior on it.
|
||||
type ConnectionMethod string
|
||||
|
||||
func (m ConnectionMethod) SlogField() slog.Field {
|
||||
return slog.F("connection_method", m)
|
||||
}
|
||||
|
||||
const (
|
||||
// ConnectionMethodUnknown means the disconnect site does not have
|
||||
// the information to determine the connection path.
|
||||
ConnectionMethodUnknown ConnectionMethod = ""
|
||||
|
||||
// ConnectionMethodDirect means the peers were communicating over a
|
||||
// direct, peer-to-peer connection (NAT-traversed via STUN).
|
||||
ConnectionMethodDirect ConnectionMethod = "direct"
|
||||
|
||||
// ConnectionMethodDERP means the peers were communicating through a
|
||||
// DERP relay rather than directly.
|
||||
ConnectionMethodDERP ConnectionMethod = "derp"
|
||||
)
|
||||
|
||||
// Valid reports whether m is a known ConnectionMethod.
|
||||
func (m ConnectionMethod) Valid() bool {
|
||||
switch m {
|
||||
case ConnectionMethodUnknown,
|
||||
ConnectionMethodDirect,
|
||||
ConnectionMethodDERP:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
package codersdk_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
)
|
||||
|
||||
func TestDisconnectReason_Valid(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
reason codersdk.DisconnectReason
|
||||
valid bool
|
||||
}{
|
||||
{codersdk.DisconnectReasonUnknown, true},
|
||||
{codersdk.DisconnectReasonGraceful, true},
|
||||
{codersdk.DisconnectReasonClientClosed, true},
|
||||
{codersdk.DisconnectReasonServerShutdown, true},
|
||||
{codersdk.DisconnectReasonNetworkError, true},
|
||||
{codersdk.DisconnectReasonProtocolError, true},
|
||||
{codersdk.DisconnectReasonWorkspaceStopped, true},
|
||||
{codersdk.DisconnectReasonControlPlaneLost, true},
|
||||
{codersdk.DisconnectReason("not_a_real_reason"), false},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
require.Equal(t, c.valid, c.reason.Valid(), "reason=%q", c.reason)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDisconnectReason_Expected(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
expected := map[codersdk.DisconnectReason]bool{
|
||||
codersdk.DisconnectReasonGraceful: true,
|
||||
codersdk.DisconnectReasonClientClosed: true,
|
||||
codersdk.DisconnectReasonServerShutdown: true,
|
||||
codersdk.DisconnectReasonWorkspaceStopped: true,
|
||||
|
||||
codersdk.DisconnectReasonUnknown: false,
|
||||
codersdk.DisconnectReasonNetworkError: false,
|
||||
codersdk.DisconnectReasonProtocolError: false,
|
||||
codersdk.DisconnectReasonControlPlaneLost: false,
|
||||
}
|
||||
|
||||
for reason, want := range expected {
|
||||
require.Equal(t, want, reason.Expected(), "reason=%q", reason)
|
||||
}
|
||||
|
||||
// Unknown values default to not-expected so that uncategorized
|
||||
// emit sites surface in the "investigate" bucket.
|
||||
require.False(t, codersdk.DisconnectReason("not_a_real_reason").Expected())
|
||||
}
|
||||
|
||||
func TestDisconnectInitiator_Valid(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
initiator codersdk.DisconnectInitiator
|
||||
valid bool
|
||||
}{
|
||||
{codersdk.DisconnectInitiatorUnknown, true},
|
||||
{codersdk.DisconnectInitiatorClient, true},
|
||||
{codersdk.DisconnectInitiatorAgent, true},
|
||||
{codersdk.DisconnectInitiatorServer, true},
|
||||
{codersdk.DisconnectInitiatorNetwork, true},
|
||||
{codersdk.DisconnectInitiator("nobody"), false},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
require.Equal(t, c.valid, c.initiator.Valid(), "initiator=%q", c.initiator)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectionMethod_Valid(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
method codersdk.ConnectionMethod
|
||||
valid bool
|
||||
}{
|
||||
{codersdk.ConnectionMethodUnknown, true},
|
||||
{codersdk.ConnectionMethodDirect, true},
|
||||
{codersdk.ConnectionMethodDERP, true},
|
||||
{codersdk.ConnectionMethod("magic"), false},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
require.Equal(t, c.valid, c.method.Valid(), "method=%q", c.method)
|
||||
}
|
||||
}
|
||||
Generated
+55
@@ -2998,6 +2998,18 @@ export interface ChatWorkspaceTTLResponse {
|
||||
*/
|
||||
export const CoderDesktopTelemetryHeader = "Coder-Desktop-Telemetry";
|
||||
|
||||
// From codersdk/disconnect.go
|
||||
export type ConnectionDirection =
|
||||
| "agent_to_client"
|
||||
| "client_to_server"
|
||||
| "server_to_agent";
|
||||
|
||||
export const ConnectionDirections: ConnectionDirection[] = [
|
||||
"agent_to_client",
|
||||
"client_to_server",
|
||||
"server_to_agent",
|
||||
];
|
||||
|
||||
// From codersdk/insights.go
|
||||
/**
|
||||
* ConnectionLatency shows the latency for a connection.
|
||||
@@ -3089,6 +3101,11 @@ export interface ConnectionLogsRequest extends Pagination {
|
||||
readonly q?: string;
|
||||
}
|
||||
|
||||
// From codersdk/disconnect.go
|
||||
export type ConnectionMethod = "derp" | "direct" | "";
|
||||
|
||||
export const ConnectionMethods: ConnectionMethod[] = ["derp", "direct", ""];
|
||||
|
||||
// From codersdk/connectionlog.go
|
||||
export type ConnectionType =
|
||||
| "jetbrains"
|
||||
@@ -4032,6 +4049,44 @@ export const DiagnosticSeverityStrings: DiagnosticSeverityString[] = [
|
||||
"warning",
|
||||
];
|
||||
|
||||
// From codersdk/disconnect.go
|
||||
export type DisconnectInitiator =
|
||||
| "agent"
|
||||
| "client"
|
||||
| "network"
|
||||
| "server"
|
||||
| "";
|
||||
|
||||
export const DisconnectInitiators: DisconnectInitiator[] = [
|
||||
"agent",
|
||||
"client",
|
||||
"network",
|
||||
"server",
|
||||
"",
|
||||
];
|
||||
|
||||
// From codersdk/disconnect.go
|
||||
export type DisconnectReason =
|
||||
| "client_closed"
|
||||
| "control_plane_lost"
|
||||
| "graceful"
|
||||
| "network_error"
|
||||
| "protocol_error"
|
||||
| "server_shutdown"
|
||||
| ""
|
||||
| "workspace_stopped";
|
||||
|
||||
export const DisconnectReasons: DisconnectReason[] = [
|
||||
"client_closed",
|
||||
"control_plane_lost",
|
||||
"graceful",
|
||||
"network_error",
|
||||
"protocol_error",
|
||||
"server_shutdown",
|
||||
"",
|
||||
"workspace_stopped",
|
||||
];
|
||||
|
||||
// From codersdk/workspaceagents.go
|
||||
export type DisplayApp =
|
||||
| "port_forwarding_helper"
|
||||
|
||||
+57
-5
@@ -154,6 +154,17 @@ type BasicCoordinationController struct {
|
||||
Logger slog.Logger
|
||||
Coordinatee Coordinatee
|
||||
SendAcks bool
|
||||
// Initiator labels which side of the coordination this controller
|
||||
// represents (e.g. "agent", "client", "server"). It is attached as
|
||||
// a slog field to disconnect-related logs emitted by this
|
||||
// coordination so operators can attribute connection-close events
|
||||
// without inspecting call sites. Leave unset on synthetic
|
||||
// controllers (tests, in-memory fakes) and the field will be
|
||||
// omitted from log output.
|
||||
Initiator codersdk.DisconnectInitiator
|
||||
// Direction labels the connection layer (server_to_agent,
|
||||
// agent_to_client, client_to_server) for disconnect logs.
|
||||
Direction codersdk.ConnectionDirection
|
||||
}
|
||||
|
||||
// New satisfies the method on the CoordinationController interface
|
||||
@@ -170,6 +181,8 @@ func (c *BasicCoordinationController) NewCoordination(client CoordinatorClient)
|
||||
client: client,
|
||||
respLoopDone: make(chan struct{}),
|
||||
sendAcks: c.SendAcks,
|
||||
initiator: c.Initiator,
|
||||
direction: c.Direction,
|
||||
}
|
||||
|
||||
c.Coordinatee.SetNodeCallback(func(node *Node) {
|
||||
@@ -211,6 +224,8 @@ type BasicCoordination struct {
|
||||
client CoordinatorClient
|
||||
respLoopDone chan struct{}
|
||||
sendAcks bool
|
||||
initiator codersdk.DisconnectInitiator
|
||||
direction codersdk.ConnectionDirection
|
||||
}
|
||||
|
||||
// CloseClient forcibly closes the underlying coordinator client connection
|
||||
@@ -243,11 +258,24 @@ func (c *BasicCoordination) Close(ctx context.Context) (retErr error) {
|
||||
err := c.client.Send(&proto.CoordinateRequest{Disconnect: &proto.CoordinateRequest_Disconnect{}})
|
||||
c.Unlock()
|
||||
if err != nil && !xerrors.Is(err, io.EOF) {
|
||||
reason := codersdk.DisconnectReasonNetworkError
|
||||
// Log but don't return early; we must still clean up below.
|
||||
c.logger.Warn(context.Background(), "failed to send disconnect", slog.Error(err))
|
||||
c.logger.Warn(context.Background(), "failed to send disconnect",
|
||||
c.direction.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
reason.SlogField(),
|
||||
c.initiator.SlogField(),
|
||||
slog.Error(err),
|
||||
)
|
||||
retErr = xerrors.Errorf("send disconnect: %w", err)
|
||||
} else {
|
||||
c.logger.Debug(context.Background(), "sent disconnect")
|
||||
reason := codersdk.DisconnectReasonGraceful
|
||||
c.logger.Debug(context.Background(), "sent disconnect",
|
||||
c.direction.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
reason.SlogField(),
|
||||
c.initiator.SlogField(),
|
||||
)
|
||||
}
|
||||
|
||||
// We shouldn't just close the protocol right away, because the way dRPC streams work is
|
||||
@@ -256,10 +284,23 @@ func (c *BasicCoordination) Close(ctx context.Context) (retErr error) {
|
||||
// Disconnect message, so we should wait around for that until the context expires.
|
||||
select {
|
||||
case <-c.respLoopDone:
|
||||
c.logger.Debug(ctx, "responses closed after disconnect")
|
||||
reason := codersdk.DisconnectReasonGraceful
|
||||
c.logger.Debug(ctx, "responses closed after disconnect",
|
||||
c.direction.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
reason.SlogField(),
|
||||
c.initiator.SlogField(),
|
||||
)
|
||||
return retErr
|
||||
case <-ctx.Done():
|
||||
c.logger.Warn(ctx, "context expired while waiting for coordinate responses to close")
|
||||
reason := codersdk.DisconnectReasonNetworkError
|
||||
c.logger.Warn(ctx, "context expired while waiting for coordinate responses to close",
|
||||
c.direction.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
reason.SlogField(),
|
||||
c.initiator.SlogField(),
|
||||
codersdk.SlogDisconnectDetail("context expired before coordinator hung up"),
|
||||
)
|
||||
}
|
||||
// forcefully close the stream
|
||||
protoErr := c.client.Close()
|
||||
@@ -369,6 +410,8 @@ func NewTunnelSrcCoordController(
|
||||
Logger: logger,
|
||||
Coordinatee: coordinatee,
|
||||
SendAcks: false,
|
||||
Initiator: codersdk.DisconnectInitiatorClient,
|
||||
Direction: codersdk.ConnectionDirectionClientToServer,
|
||||
},
|
||||
dests: make(map[uuid.UUID]struct{}),
|
||||
}
|
||||
@@ -508,6 +551,8 @@ func NewAgentCoordinationController(
|
||||
Logger: logger,
|
||||
Coordinatee: coordinatee,
|
||||
SendAcks: true,
|
||||
Initiator: codersdk.DisconnectInitiatorAgent,
|
||||
Direction: codersdk.ConnectionDirectionServerToAgent,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1505,7 +1550,14 @@ func (c *Controller) Run(ctx context.Context) {
|
||||
}
|
||||
|
||||
if errors.Is(err, net.ErrClosed) {
|
||||
c.logger.Warn(c.ctx, "control plane connection closed, retrying", slog.Error(err))
|
||||
reason := codersdk.DisconnectReasonNetworkError
|
||||
c.logger.Warn(c.ctx, "control plane connection closed, retrying",
|
||||
codersdk.ConnectionDirectionServerToAgent.SlogField(),
|
||||
reason.SlogField(),
|
||||
reason.SlogExpectedField(),
|
||||
codersdk.DisconnectInitiatorNetwork.SlogField(),
|
||||
slog.Error(err),
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user