Files
coder/agent/reconnectingpty/server.go
T
Steven Masley 1afc6d4fd0 feat: structured disconnect attribution for agent logs (#25191)
Implements
[PLAT-60](https://linear.app/codercom/issue/PLAT-60/enhance-disconnect-logs-with-structured-reason-attribution):
adds structured disconnect attribution to disconnect logs throughout the
agent and tailnet packages.

Every disconnect log site now carries structured slog fields. All
existing logs remain; existing messages are preserved with the fields
added alongside.

New fields on disconnect log lines:

- `connect_type` — which layer disconnected: `server_to_agent`,
`agent_to_client`, or `client_to_server`
- `disconnect_reason` — categorical reason: `graceful`, `network_error`,
`server_shutdown`, etc.
- `disconnect_expected` — whether the disconnect is normal operation
(`true`) or should be investigated (`false`)
- `disconnect_initiator` — who started it: `client`, `agent`, `server`,
or `network` (control-plane sites only)
- `disconnect_detail` — free-form supplemental info (where useful)

## What's covered

**Control plane (`server_to_agent`):** coordination RPC, DERP map
subscriber, agent runLoop, agent Close, `BasicCoordination.Close`,
`Controller.run`.

**Data plane (`agent_to_client`):** SSH sessions, reconnecting PTY,
JetBrains port-forwarding.

<details>
<summary>Control-plane sites</summary>

| Site | Reason | Initiator |
|---|---|---|
| `agent/agent.go` `runLoop` EOF | `network_error` | `network` |
| `agent/agent.go` `runCoordinator` deferred exit | `server_shutdown` /
`graceful` / `network_error` | `agent` / `server` / `network` |
| `agent/agent.go` `runDERPMapSubscriber` deferred exit | same (shared
`classifyCoordinatorRPCExit`) | same |
| `agent/agent.go` `Close` shutdown timeout | `server_shutdown` + detail
| `agent` |
| `agent/agent.go` `Close` clean coord disconnect | `server_shutdown` |
`agent` |
| `tailnet/controllers.go` `BasicCoordination.Close` | `graceful` or
`network_error` | `c.initiator` |
| `tailnet/controllers.go` `Controller.run` `net.ErrClosed` |
`network_error` | `network` |

</details>

<details>
<summary>Data-plane sites</summary>

| Site | Reason | Notes |
|---|---|---|
| `agent/agentssh/agentssh.go` SSH session closed | free-form
(`graceful`, `process exited with error status: N`, etc.) | Also sets
`closeCause("normal exit")` for clean exits so coderd's
`connection_log.DisconnectReason` is no longer empty |
| `agent/reconnectingpty/server.go` PTY closed | `server_shutdown`,
error string, or `graceful` | |
| `agent/agentssh/jetbrainstrack.go` channel closed | `normal close` or
error string | Previously passed empty reason |

</details>

<details>
<summary>Bug fix</summary>

The deferred `disconnected from coordination RPC` log no longer fires
when the initial `Coordinate()` RPC call fails before any connection is
established.

</details>

Refs PLAT-60.

---

_This PR was prepared by Coder Agents on behalf of @Emyrk._
**Manually QA'd a lot of common disconnects**

---------

Co-authored-by: Coder Agents <noreply@coder.com>
2026-05-19 09:47:03 -05:00

266 lines
7.6 KiB
Go

package reconnectingpty
import (
"context"
"encoding/binary"
"encoding/json"
"net"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/agent/agentcontainers"
"github.com/coder/coder/v2/agent/agentssh"
"github.com/coder/coder/v2/agent/usershell"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
)
type reportConnectionFunc func(id uuid.UUID, ip string) (disconnected func(code int, reason string))
type Server struct {
logger slog.Logger
connectionsTotal prometheus.Counter
errorsTotal *prometheus.CounterVec
commandCreator *agentssh.Server
reportConnection reportConnectionFunc
connCount atomic.Int64
reconnectingPTYs sync.Map
timeout time.Duration
// Experimental: allow connecting to running containers via Docker exec.
// Note that this is different from the devcontainers feature, which uses
// subagents.
ExperimentalContainers bool
}
// NewServer returns a new ReconnectingPTY server
func NewServer(logger slog.Logger, commandCreator *agentssh.Server, reportConnection reportConnectionFunc,
connectionsTotal prometheus.Counter, errorsTotal *prometheus.CounterVec,
timeout time.Duration, opts ...func(*Server),
) *Server {
if reportConnection == nil {
reportConnection = func(uuid.UUID, string) func(int, string) {
return func(int, string) {}
}
}
s := &Server{
logger: logger,
commandCreator: commandCreator,
reportConnection: reportConnection,
connectionsTotal: connectionsTotal,
errorsTotal: errorsTotal,
timeout: timeout,
}
for _, o := range opts {
o(s)
}
return s
}
func (s *Server) Serve(ctx, hardCtx context.Context, l net.Listener) (retErr error) {
var wg sync.WaitGroup
for {
if ctx.Err() != nil {
break
}
conn, err := l.Accept()
if err != nil {
s.logger.Debug(ctx, "accept pty failed", slog.Error(err))
retErr = err
break
}
clog := s.logger.With(
slog.F("remote", conn.RemoteAddr()),
slog.F("local", conn.LocalAddr()))
clog.Info(ctx, "accepted conn")
// It's not safe to assume RemoteAddr() returns a non-nil value. slog.F usage is fine because it correctly
// handles nil.
// c.f. https://github.com/coder/internal/issues/1143
remoteAddr := conn.RemoteAddr()
remoteAddrString := ""
if remoteAddr != nil {
remoteAddrString = remoteAddr.String()
}
wg.Add(1)
disconnected := s.reportConnection(uuid.New(), remoteAddrString)
closed := make(chan struct{})
go func() {
defer wg.Done()
select {
case <-closed:
case <-hardCtx.Done():
clog.Info(hardCtx, "reconnecting pty closed",
codersdk.ConnectionDirectionAgentToClient.SlogField(),
codersdk.DisconnectReasonServerShutdown.SlogField(),
codersdk.DisconnectReasonServerShutdown.SlogExpectedField(),
)
disconnected(1, "server shut down")
_ = conn.Close()
}
}()
wg.Add(1)
go func() {
defer close(closed)
defer wg.Done()
err := s.handleConn(ctx, clog, conn)
var reason codersdk.DisconnectReason
var code int
var detail string
switch {
case err != nil && ctx.Err() != nil:
reason = codersdk.DisconnectReasonServerShutdown
code = 1
case err != nil:
reason = codersdk.DisconnectReasonNetworkError
detail = err.Error()
code = 1
default:
reason = codersdk.DisconnectReasonGraceful
}
clog.Info(ctx, "reconnecting pty closed",
codersdk.ConnectionDirectionAgentToClient.SlogField(),
reason.SlogField(),
reason.SlogExpectedField(),
codersdk.SlogDisconnectDetail(detail),
slog.F("exit_code", code),
)
disconnected(code, string(reason))
}()
}
wg.Wait()
return retErr
}
func (s *Server) ConnCount() int64 {
return s.connCount.Load()
}
func (s *Server) handleConn(ctx context.Context, logger slog.Logger, conn net.Conn) (retErr error) {
defer conn.Close()
s.connectionsTotal.Add(1)
s.connCount.Add(1)
defer s.connCount.Add(-1)
// This cannot use a JSON decoder, since that can
// buffer additional data that is required for the PTY.
rawLen := make([]byte, 2)
_, err := conn.Read(rawLen)
if err != nil {
// logging at info since a single incident isn't too worrying (the client could just have
// hung up), but if we get a lot of these we'd want to investigate.
logger.Info(ctx, "failed to read AgentReconnectingPTYInit length", slog.Error(err))
return nil
}
length := binary.LittleEndian.Uint16(rawLen)
data := make([]byte, length)
_, err = conn.Read(data)
if err != nil {
// logging at info since a single incident isn't too worrying (the client could just have
// hung up), but if we get a lot of these we'd want to investigate.
logger.Info(ctx, "failed to read AgentReconnectingPTYInit", slog.Error(err))
return nil
}
var msg workspacesdk.AgentReconnectingPTYInit
err = json.Unmarshal(data, &msg)
if err != nil {
logger.Warn(ctx, "failed to unmarshal init", slog.F("raw", data))
return nil
}
connectionID := uuid.NewString()
connLogger := logger.With(slog.F("message_id", msg.ID), slog.F("connection_id", connectionID), slog.F("container", msg.Container), slog.F("container_user", msg.ContainerUser))
connLogger.Debug(ctx, "starting handler")
defer func() {
if err := retErr; err != nil {
// If the context is done, we don't want to log this as an error since it's expected.
if ctx.Err() != nil {
connLogger.Info(ctx, "reconnecting pty failed with attach error (agent closed)", slog.Error(err))
} else {
connLogger.Error(ctx, "reconnecting pty failed with attach error", slog.Error(err))
}
}
connLogger.Info(ctx, "reconnecting pty connection closed")
}()
var rpty ReconnectingPTY
sendConnected := make(chan ReconnectingPTY, 1)
// On store, reserve this ID to prevent multiple concurrent new connections.
waitReady, ok := s.reconnectingPTYs.LoadOrStore(msg.ID, sendConnected)
if ok {
close(sendConnected) // Unused.
connLogger.Debug(ctx, "connecting to existing reconnecting pty")
c, ok := waitReady.(chan ReconnectingPTY)
if !ok {
return xerrors.Errorf("found invalid type in reconnecting pty map: %T", waitReady)
}
rpty, ok = <-c
if !ok || rpty == nil {
return xerrors.Errorf("reconnecting pty closed before connection")
}
c <- rpty // Put it back for the next reconnect.
} else {
connLogger.Debug(ctx, "creating new reconnecting pty")
connected := false
defer func() {
if !connected && retErr != nil {
s.reconnectingPTYs.Delete(msg.ID)
close(sendConnected)
}
}()
var ei usershell.EnvInfoer
if s.ExperimentalContainers && msg.Container != "" {
dei, err := agentcontainers.EnvInfo(ctx, s.commandCreator.Execer, msg.Container, msg.ContainerUser)
if err != nil {
return xerrors.Errorf("get container env info: %w", err)
}
ei = dei
s.logger.Info(ctx, "got container env info", slog.F("container", msg.Container))
}
// Empty command will default to the users shell!
cmd, err := s.commandCreator.CreateCommand(ctx, msg.Command, nil, ei)
if err != nil {
s.errorsTotal.WithLabelValues("create_command").Add(1)
return xerrors.Errorf("create command: %w", err)
}
rpty = New(ctx,
logger.With(slog.F("message_id", msg.ID)),
s.commandCreator.Execer,
cmd,
&Options{
Timeout: s.timeout,
Metrics: s.errorsTotal,
BackendType: msg.BackendType,
},
)
done := make(chan struct{})
go func() {
select {
case <-done:
case <-ctx.Done():
rpty.Close(ctx.Err())
}
}()
go func() {
rpty.Wait()
s.reconnectingPTYs.Delete(msg.ID)
}()
connected = true
sendConnected <- rpty
}
return rpty.Attach(ctx, connectionID, conn, msg.Height, msg.Width, connLogger)
}