mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
1afc6d4fd0
Implements [PLAT-60](https://linear.app/codercom/issue/PLAT-60/enhance-disconnect-logs-with-structured-reason-attribution): adds structured disconnect attribution to disconnect logs throughout the agent and tailnet packages. Every disconnect log site now carries structured slog fields. All existing logs remain; existing messages are preserved with the fields added alongside. New fields on disconnect log lines: - `connect_type` — which layer disconnected: `server_to_agent`, `agent_to_client`, or `client_to_server` - `disconnect_reason` — categorical reason: `graceful`, `network_error`, `server_shutdown`, etc. - `disconnect_expected` — whether the disconnect is normal operation (`true`) or should be investigated (`false`) - `disconnect_initiator` — who started it: `client`, `agent`, `server`, or `network` (control-plane sites only) - `disconnect_detail` — free-form supplemental info (where useful) ## What's covered **Control plane (`server_to_agent`):** coordination RPC, DERP map subscriber, agent runLoop, agent Close, `BasicCoordination.Close`, `Controller.run`. **Data plane (`agent_to_client`):** SSH sessions, reconnecting PTY, JetBrains port-forwarding. <details> <summary>Control-plane sites</summary> | Site | Reason | Initiator | |---|---|---| | `agent/agent.go` `runLoop` EOF | `network_error` | `network` | | `agent/agent.go` `runCoordinator` deferred exit | `server_shutdown` / `graceful` / `network_error` | `agent` / `server` / `network` | | `agent/agent.go` `runDERPMapSubscriber` deferred exit | same (shared `classifyCoordinatorRPCExit`) | same | | `agent/agent.go` `Close` shutdown timeout | `server_shutdown` + detail | `agent` | | `agent/agent.go` `Close` clean coord disconnect | `server_shutdown` | `agent` | | `tailnet/controllers.go` `BasicCoordination.Close` | `graceful` or `network_error` | `c.initiator` | | `tailnet/controllers.go` `Controller.run` `net.ErrClosed` | `network_error` | `network` | </details> <details> <summary>Data-plane sites</summary> | Site | Reason | Notes | |---|---|---| | `agent/agentssh/agentssh.go` SSH session closed | free-form (`graceful`, `process exited with error status: N`, etc.) | Also sets `closeCause("normal exit")` for clean exits so coderd's `connection_log.DisconnectReason` is no longer empty | | `agent/reconnectingpty/server.go` PTY closed | `server_shutdown`, error string, or `graceful` | | | `agent/agentssh/jetbrainstrack.go` channel closed | `normal close` or error string | Previously passed empty reason | </details> <details> <summary>Bug fix</summary> The deferred `disconnected from coordination RPC` log no longer fires when the initial `Coordinate()` RPC call fails before any connection is established. </details> Refs PLAT-60. --- _This PR was prepared by Coder Agents on behalf of @Emyrk._ **Manually QA'd a lot of common disconnects** --------- Co-authored-by: Coder Agents <noreply@coder.com>
266 lines
7.6 KiB
Go
266 lines
7.6 KiB
Go
package reconnectingpty
|
|
|
|
import (
|
|
"context"
|
|
"encoding/binary"
|
|
"encoding/json"
|
|
"net"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/agent/agentcontainers"
|
|
"github.com/coder/coder/v2/agent/agentssh"
|
|
"github.com/coder/coder/v2/agent/usershell"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
"github.com/coder/coder/v2/codersdk/workspacesdk"
|
|
)
|
|
|
|
type reportConnectionFunc func(id uuid.UUID, ip string) (disconnected func(code int, reason string))
|
|
|
|
type Server struct {
|
|
logger slog.Logger
|
|
connectionsTotal prometheus.Counter
|
|
errorsTotal *prometheus.CounterVec
|
|
commandCreator *agentssh.Server
|
|
reportConnection reportConnectionFunc
|
|
connCount atomic.Int64
|
|
reconnectingPTYs sync.Map
|
|
timeout time.Duration
|
|
// Experimental: allow connecting to running containers via Docker exec.
|
|
// Note that this is different from the devcontainers feature, which uses
|
|
// subagents.
|
|
ExperimentalContainers bool
|
|
}
|
|
|
|
// NewServer returns a new ReconnectingPTY server
|
|
func NewServer(logger slog.Logger, commandCreator *agentssh.Server, reportConnection reportConnectionFunc,
|
|
connectionsTotal prometheus.Counter, errorsTotal *prometheus.CounterVec,
|
|
timeout time.Duration, opts ...func(*Server),
|
|
) *Server {
|
|
if reportConnection == nil {
|
|
reportConnection = func(uuid.UUID, string) func(int, string) {
|
|
return func(int, string) {}
|
|
}
|
|
}
|
|
s := &Server{
|
|
logger: logger,
|
|
commandCreator: commandCreator,
|
|
reportConnection: reportConnection,
|
|
connectionsTotal: connectionsTotal,
|
|
errorsTotal: errorsTotal,
|
|
timeout: timeout,
|
|
}
|
|
for _, o := range opts {
|
|
o(s)
|
|
}
|
|
return s
|
|
}
|
|
|
|
func (s *Server) Serve(ctx, hardCtx context.Context, l net.Listener) (retErr error) {
|
|
var wg sync.WaitGroup
|
|
for {
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
conn, err := l.Accept()
|
|
if err != nil {
|
|
s.logger.Debug(ctx, "accept pty failed", slog.Error(err))
|
|
retErr = err
|
|
break
|
|
}
|
|
clog := s.logger.With(
|
|
slog.F("remote", conn.RemoteAddr()),
|
|
slog.F("local", conn.LocalAddr()))
|
|
clog.Info(ctx, "accepted conn")
|
|
|
|
// It's not safe to assume RemoteAddr() returns a non-nil value. slog.F usage is fine because it correctly
|
|
// handles nil.
|
|
// c.f. https://github.com/coder/internal/issues/1143
|
|
remoteAddr := conn.RemoteAddr()
|
|
remoteAddrString := ""
|
|
if remoteAddr != nil {
|
|
remoteAddrString = remoteAddr.String()
|
|
}
|
|
|
|
wg.Add(1)
|
|
disconnected := s.reportConnection(uuid.New(), remoteAddrString)
|
|
closed := make(chan struct{})
|
|
go func() {
|
|
defer wg.Done()
|
|
select {
|
|
case <-closed:
|
|
case <-hardCtx.Done():
|
|
clog.Info(hardCtx, "reconnecting pty closed",
|
|
codersdk.ConnectionDirectionAgentToClient.SlogField(),
|
|
codersdk.DisconnectReasonServerShutdown.SlogField(),
|
|
codersdk.DisconnectReasonServerShutdown.SlogExpectedField(),
|
|
)
|
|
disconnected(1, "server shut down")
|
|
_ = conn.Close()
|
|
}
|
|
}()
|
|
wg.Add(1)
|
|
go func() {
|
|
defer close(closed)
|
|
defer wg.Done()
|
|
err := s.handleConn(ctx, clog, conn)
|
|
var reason codersdk.DisconnectReason
|
|
var code int
|
|
var detail string
|
|
switch {
|
|
case err != nil && ctx.Err() != nil:
|
|
reason = codersdk.DisconnectReasonServerShutdown
|
|
code = 1
|
|
case err != nil:
|
|
reason = codersdk.DisconnectReasonNetworkError
|
|
detail = err.Error()
|
|
code = 1
|
|
default:
|
|
reason = codersdk.DisconnectReasonGraceful
|
|
}
|
|
clog.Info(ctx, "reconnecting pty closed",
|
|
codersdk.ConnectionDirectionAgentToClient.SlogField(),
|
|
reason.SlogField(),
|
|
reason.SlogExpectedField(),
|
|
codersdk.SlogDisconnectDetail(detail),
|
|
slog.F("exit_code", code),
|
|
)
|
|
disconnected(code, string(reason))
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
return retErr
|
|
}
|
|
|
|
func (s *Server) ConnCount() int64 {
|
|
return s.connCount.Load()
|
|
}
|
|
|
|
func (s *Server) handleConn(ctx context.Context, logger slog.Logger, conn net.Conn) (retErr error) {
|
|
defer conn.Close()
|
|
s.connectionsTotal.Add(1)
|
|
s.connCount.Add(1)
|
|
defer s.connCount.Add(-1)
|
|
|
|
// This cannot use a JSON decoder, since that can
|
|
// buffer additional data that is required for the PTY.
|
|
rawLen := make([]byte, 2)
|
|
_, err := conn.Read(rawLen)
|
|
if err != nil {
|
|
// logging at info since a single incident isn't too worrying (the client could just have
|
|
// hung up), but if we get a lot of these we'd want to investigate.
|
|
logger.Info(ctx, "failed to read AgentReconnectingPTYInit length", slog.Error(err))
|
|
return nil
|
|
}
|
|
length := binary.LittleEndian.Uint16(rawLen)
|
|
data := make([]byte, length)
|
|
_, err = conn.Read(data)
|
|
if err != nil {
|
|
// logging at info since a single incident isn't too worrying (the client could just have
|
|
// hung up), but if we get a lot of these we'd want to investigate.
|
|
logger.Info(ctx, "failed to read AgentReconnectingPTYInit", slog.Error(err))
|
|
return nil
|
|
}
|
|
var msg workspacesdk.AgentReconnectingPTYInit
|
|
err = json.Unmarshal(data, &msg)
|
|
if err != nil {
|
|
logger.Warn(ctx, "failed to unmarshal init", slog.F("raw", data))
|
|
return nil
|
|
}
|
|
|
|
connectionID := uuid.NewString()
|
|
connLogger := logger.With(slog.F("message_id", msg.ID), slog.F("connection_id", connectionID), slog.F("container", msg.Container), slog.F("container_user", msg.ContainerUser))
|
|
connLogger.Debug(ctx, "starting handler")
|
|
|
|
defer func() {
|
|
if err := retErr; err != nil {
|
|
// If the context is done, we don't want to log this as an error since it's expected.
|
|
if ctx.Err() != nil {
|
|
connLogger.Info(ctx, "reconnecting pty failed with attach error (agent closed)", slog.Error(err))
|
|
} else {
|
|
connLogger.Error(ctx, "reconnecting pty failed with attach error", slog.Error(err))
|
|
}
|
|
}
|
|
connLogger.Info(ctx, "reconnecting pty connection closed")
|
|
}()
|
|
|
|
var rpty ReconnectingPTY
|
|
sendConnected := make(chan ReconnectingPTY, 1)
|
|
// On store, reserve this ID to prevent multiple concurrent new connections.
|
|
waitReady, ok := s.reconnectingPTYs.LoadOrStore(msg.ID, sendConnected)
|
|
if ok {
|
|
close(sendConnected) // Unused.
|
|
connLogger.Debug(ctx, "connecting to existing reconnecting pty")
|
|
c, ok := waitReady.(chan ReconnectingPTY)
|
|
if !ok {
|
|
return xerrors.Errorf("found invalid type in reconnecting pty map: %T", waitReady)
|
|
}
|
|
rpty, ok = <-c
|
|
if !ok || rpty == nil {
|
|
return xerrors.Errorf("reconnecting pty closed before connection")
|
|
}
|
|
c <- rpty // Put it back for the next reconnect.
|
|
} else {
|
|
connLogger.Debug(ctx, "creating new reconnecting pty")
|
|
|
|
connected := false
|
|
defer func() {
|
|
if !connected && retErr != nil {
|
|
s.reconnectingPTYs.Delete(msg.ID)
|
|
close(sendConnected)
|
|
}
|
|
}()
|
|
|
|
var ei usershell.EnvInfoer
|
|
if s.ExperimentalContainers && msg.Container != "" {
|
|
dei, err := agentcontainers.EnvInfo(ctx, s.commandCreator.Execer, msg.Container, msg.ContainerUser)
|
|
if err != nil {
|
|
return xerrors.Errorf("get container env info: %w", err)
|
|
}
|
|
ei = dei
|
|
s.logger.Info(ctx, "got container env info", slog.F("container", msg.Container))
|
|
}
|
|
// Empty command will default to the users shell!
|
|
cmd, err := s.commandCreator.CreateCommand(ctx, msg.Command, nil, ei)
|
|
if err != nil {
|
|
s.errorsTotal.WithLabelValues("create_command").Add(1)
|
|
return xerrors.Errorf("create command: %w", err)
|
|
}
|
|
|
|
rpty = New(ctx,
|
|
logger.With(slog.F("message_id", msg.ID)),
|
|
s.commandCreator.Execer,
|
|
cmd,
|
|
&Options{
|
|
Timeout: s.timeout,
|
|
Metrics: s.errorsTotal,
|
|
BackendType: msg.BackendType,
|
|
},
|
|
)
|
|
|
|
done := make(chan struct{})
|
|
go func() {
|
|
select {
|
|
case <-done:
|
|
case <-ctx.Done():
|
|
rpty.Close(ctx.Err())
|
|
}
|
|
}()
|
|
|
|
go func() {
|
|
rpty.Wait()
|
|
s.reconnectingPTYs.Delete(msg.ID)
|
|
}()
|
|
|
|
connected = true
|
|
sendConnected <- rpty
|
|
}
|
|
return rpty.Attach(ctx, connectionID, conn, msg.Height, msg.Width, connLogger)
|
|
}
|