mirror of
https://github.com/coder/coder.git
synced 2026-06-03 04:58:23 +00:00
1afc6d4fd0
Implements [PLAT-60](https://linear.app/codercom/issue/PLAT-60/enhance-disconnect-logs-with-structured-reason-attribution): adds structured disconnect attribution to disconnect logs throughout the agent and tailnet packages.

Every disconnect log site now carries structured slog fields. All existing logs remain; existing messages are preserved with the fields added alongside.

New fields on disconnect log lines:

- `connect_type` — which layer disconnected: `server_to_agent`, `agent_to_client`, or `client_to_server`
- `disconnect_reason` — categorical reason: `graceful`, `network_error`, `server_shutdown`, etc.
- `disconnect_expected` — whether the disconnect is normal operation (`true`) or should be investigated (`false`)
- `disconnect_initiator` — who started it: `client`, `agent`, `server`, or `network` (control-plane sites only)
- `disconnect_detail` — free-form supplemental info (where useful)

## What's covered

**Control plane (`server_to_agent`):** coordination RPC, DERP map subscriber, agent runLoop, agent Close, `BasicCoordination.Close`, `Controller.run`.

**Data plane (`agent_to_client`):** SSH sessions, reconnecting PTY, JetBrains port-forwarding.
<details>
<summary>Control-plane sites</summary>

| Site | Reason | Initiator |
|---|---|---|
| `agent/agent.go` `runLoop` EOF | `network_error` | `network` |
| `agent/agent.go` `runCoordinator` deferred exit | `server_shutdown` / `graceful` / `network_error` | `agent` / `server` / `network` |
| `agent/agent.go` `runDERPMapSubscriber` deferred exit | same (shared `classifyCoordinatorRPCExit`) | same |
| `agent/agent.go` `Close` shutdown timeout | `server_shutdown` + detail | `agent` |
| `agent/agent.go` `Close` clean coord disconnect | `server_shutdown` | `agent` |
| `tailnet/controllers.go` `BasicCoordination.Close` | `graceful` or `network_error` | `c.initiator` |
| `tailnet/controllers.go` `Controller.run` `net.ErrClosed` | `network_error` | `network` |

</details>

<details>
<summary>Data-plane sites</summary>

| Site | Reason | Notes |
|---|---|---|
| `agent/agentssh/agentssh.go` SSH session closed | free-form (`graceful`, `process exited with error status: N`, etc.) | Also sets `closeCause("normal exit")` for clean exits so coderd's `connection_log.DisconnectReason` is no longer empty |
| `agent/reconnectingpty/server.go` PTY closed | `server_shutdown`, error string, or `graceful` | |
| `agent/agentssh/jetbrainstrack.go` channel closed | `normal close` or error string | Previously passed empty reason |

</details>

<details>
<summary>Bug fix</summary>

The deferred `disconnected from coordination RPC` log no longer fires when the initial `Coordinate()` RPC call fails before any connection is established.

</details>

Refs PLAT-60.

---

_This PR was prepared by Coder Agents on behalf of @Emyrk._

**Manually QA'd a lot of common disconnects**

---------

Co-authored-by: Coder Agents <noreply@coder.com>
127 lines
3.5 KiB
Go
127 lines
3.5 KiB
Go
package agentssh
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
"sync"
|
|
|
|
"github.com/gliderlabs/ssh"
|
|
"github.com/google/uuid"
|
|
"go.uber.org/atomic"
|
|
gossh "golang.org/x/crypto/ssh"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
)
|
|
|
|
// localForwardChannelData is copied from the ssh package. It is the target
// that the extra data of a port-forward channel request is unmarshaled into,
// so the order and types of the fields must not change.
type localForwardChannelData struct {
	// Destination of the forward.
	DestAddr string
	DestPort uint32

	// Origin of the forward.
	OriginAddr string
	OriginPort uint32
}
|
|
|
|
// JetbrainsChannelWatcher is used to track JetBrains port forwarded (Gateway)
|
|
// channels. If the port forward is something other than JetBrains, this struct
|
|
// is a noop.
|
|
type JetbrainsChannelWatcher struct {
|
|
gossh.NewChannel
|
|
jetbrainsCounter *atomic.Int64
|
|
logger slog.Logger
|
|
originAddr string
|
|
reportConnection reportConnectionFunc
|
|
}
|
|
|
|
func NewJetbrainsChannelWatcher(ctx ssh.Context, logger slog.Logger, reportConnection reportConnectionFunc, newChannel gossh.NewChannel, counter *atomic.Int64) gossh.NewChannel {
|
|
d := localForwardChannelData{}
|
|
if err := gossh.Unmarshal(newChannel.ExtraData(), &d); err != nil {
|
|
// If the data fails to unmarshal, do nothing.
|
|
logger.Warn(ctx, "failed to unmarshal port forward data", slog.Error(err))
|
|
return newChannel
|
|
}
|
|
|
|
// If we do get a port, we should be able to get the matching PID and from
|
|
// there look up the invocation.
|
|
cmdline, err := getListeningPortProcessCmdline(d.DestPort)
|
|
if err != nil {
|
|
logger.Warn(ctx, "failed to inspect port",
|
|
slog.F("destination_port", d.DestPort),
|
|
slog.Error(err))
|
|
return newChannel
|
|
}
|
|
|
|
// If this is not JetBrains, then we do not need to do anything special. We
|
|
// attempt to match on something that appears unique to JetBrains software.
|
|
if !isJetbrainsProcess(cmdline) {
|
|
return newChannel
|
|
}
|
|
|
|
logger.Debug(ctx, "discovered forwarded JetBrains process",
|
|
slog.F("destination_port", d.DestPort))
|
|
|
|
return &JetbrainsChannelWatcher{
|
|
NewChannel: newChannel,
|
|
jetbrainsCounter: counter,
|
|
logger: logger.With(slog.F("destination_port", d.DestPort)),
|
|
originAddr: d.OriginAddr,
|
|
reportConnection: reportConnection,
|
|
}
|
|
}
|
|
|
|
func (w *JetbrainsChannelWatcher) Accept() (gossh.Channel, <-chan *gossh.Request, error) {
|
|
disconnected := w.reportConnection(uuid.New(), MagicSessionTypeJetBrains, w.originAddr)
|
|
|
|
c, r, err := w.NewChannel.Accept()
|
|
if err != nil {
|
|
disconnected(1, err.Error())
|
|
return c, r, err
|
|
}
|
|
w.jetbrainsCounter.Add(1)
|
|
// nolint: gocritic // JetBrains is a proper noun and should be capitalized
|
|
w.logger.Debug(context.Background(), "JetBrains watcher accepted channel")
|
|
|
|
return &ChannelOnClose{
|
|
Channel: c,
|
|
done: func() {
|
|
w.jetbrainsCounter.Add(-1)
|
|
disconnected(0, "normal close")
|
|
// nolint: gocritic // JetBrains is a proper noun and should be capitalized
|
|
w.logger.Debug(context.Background(), "JetBrains channel closed",
|
|
codersdk.ConnectionDirectionAgentToClient.SlogField(),
|
|
codersdk.DisconnectReasonGraceful.SlogField(),
|
|
codersdk.DisconnectReasonGraceful.SlogExpectedField(),
|
|
)
|
|
},
|
|
}, r, err
|
|
}
|
|
|
|
type ChannelOnClose struct {
|
|
gossh.Channel
|
|
// once ensures close only decrements the counter once.
|
|
// Because close can be called multiple times.
|
|
once sync.Once
|
|
done func()
|
|
}
|
|
|
|
func (c *ChannelOnClose) Close() error {
|
|
c.once.Do(c.done)
|
|
return c.Channel.Close()
|
|
}
|
|
|
|
func isJetbrainsProcess(cmdline string) bool {
|
|
opts := []string{
|
|
MagicProcessCmdlineJetBrains,
|
|
MagicProcessCmdlineToolbox,
|
|
MagicProcessCmdlineGateway,
|
|
}
|
|
|
|
for _, opt := range opts {
|
|
if strings.Contains(strings.ToLower(cmdline), strings.ToLower(opt)) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|