mirror of
https://github.com/coder/coder.git
synced 2026-06-03 04:58:23 +00:00
1afc6d4fd0
Implements [PLAT-60](https://linear.app/codercom/issue/PLAT-60/enhance-disconnect-logs-with-structured-reason-attribution): adds structured disconnect attribution to disconnect logs throughout the agent and tailnet packages. Every disconnect log site now carries structured slog fields. All existing logs remain; existing messages are preserved with the fields added alongside. New fields on disconnect log lines: - `connect_type` — which layer disconnected: `server_to_agent`, `agent_to_client`, or `client_to_server` - `disconnect_reason` — categorical reason: `graceful`, `network_error`, `server_shutdown`, etc. - `disconnect_expected` — whether the disconnect is normal operation (`true`) or should be investigated (`false`) - `disconnect_initiator` — who started it: `client`, `agent`, `server`, or `network` (control-plane sites only) - `disconnect_detail` — free-form supplemental info (where useful) ## What's covered **Control plane (`server_to_agent`):** coordination RPC, DERP map subscriber, agent runLoop, agent Close, `BasicCoordination.Close`, `Controller.run`. **Data plane (`agent_to_client`):** SSH sessions, reconnecting PTY, JetBrains port-forwarding. 
<details> <summary>Control-plane sites</summary> | Site | Reason | Initiator | |---|---|---| | `agent/agent.go` `runLoop` EOF | `network_error` | `network` | | `agent/agent.go` `runCoordinator` deferred exit | `server_shutdown` / `graceful` / `network_error` | `agent` / `server` / `network` | | `agent/agent.go` `runDERPMapSubscriber` deferred exit | same (shared `classifyCoordinatorRPCExit`) | same | | `agent/agent.go` `Close` shutdown timeout | `server_shutdown` + detail | `agent` | | `agent/agent.go` `Close` clean coord disconnect | `server_shutdown` | `agent` | | `tailnet/controllers.go` `BasicCoordination.Close` | `graceful` or `network_error` | `c.initiator` | | `tailnet/controllers.go` `Controller.run` `net.ErrClosed` | `network_error` | `network` | </details> <details> <summary>Data-plane sites</summary> | Site | Reason | Notes | |---|---|---| | `agent/agentssh/agentssh.go` SSH session closed | free-form (`graceful`, `process exited with error status: N`, etc.) | Also sets `closeCause("normal exit")` for clean exits so coderd's `connection_log.DisconnectReason` is no longer empty | | `agent/reconnectingpty/server.go` PTY closed | `server_shutdown`, error string, or `graceful` | | | `agent/agentssh/jetbrainstrack.go` channel closed | `normal close` or error string | Previously passed empty reason | </details> <details> <summary>Bug fix</summary> The deferred `disconnected from coordination RPC` log no longer fires when the initial `Coordinate()` RPC call fails before any connection is established. </details> Refs PLAT-60. --- _This PR was prepared by Coder Agents on behalf of @Emyrk._ **Manually QA'd a lot of common disconnects** --------- Co-authored-by: Coder Agents <noreply@coder.com>
145 lines
4.4 KiB
Go
145 lines
4.4 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"path/filepath"
|
|
"runtime"
|
|
"testing"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/stretchr/testify/require"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"cdr.dev/slog/v3/sloggers/slogtest"
|
|
"github.com/coder/coder/v2/agent/agentcontextconfig"
|
|
"github.com/coder/coder/v2/agent/proto"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
agentsdk "github.com/coder/coder/v2/codersdk/agentsdk"
|
|
"github.com/coder/coder/v2/testutil"
|
|
)
|
|
|
|
// platformAbsPath joins parts into one path and prepends a root so the
// result is absolute on the platform running the test. Windows needs a
// drive letter for a path to count as absolute, so `C:\` is used there;
// every other platform gets a leading "/".
func platformAbsPath(parts ...string) string {
	root := "/"
	if runtime.GOOS == "windows" {
		root = `C:\`
	}
	return root + filepath.Join(parts...)
}
|
|
|
|
// TestReportConnectionEmpty tests that reportConnection() doesn't choke if given an empty IP string, which is what we
|
|
// send if we cannot get the remote address.
|
|
func TestReportConnectionEmpty(t *testing.T) {
|
|
t.Parallel()
|
|
connID := uuid.UUID{1}
|
|
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug)
|
|
ctx := testutil.Context(t, testutil.WaitShort)
|
|
|
|
uut := &agent{
|
|
hardCtx: ctx,
|
|
logger: logger,
|
|
}
|
|
disconnected := uut.reportConnection(connID, proto.Connection_TYPE_UNSPECIFIED, "")
|
|
|
|
require.Len(t, uut.reportConnections, 1)
|
|
req0 := uut.reportConnections[0]
|
|
require.Equal(t, proto.Connection_TYPE_UNSPECIFIED, req0.GetConnection().GetType())
|
|
require.Equal(t, "", req0.GetConnection().Ip)
|
|
require.Equal(t, connID[:], req0.GetConnection().GetId())
|
|
require.Equal(t, proto.Connection_CONNECT, req0.GetConnection().GetAction())
|
|
|
|
disconnected(0, "because")
|
|
require.Len(t, uut.reportConnections, 2)
|
|
req1 := uut.reportConnections[1]
|
|
require.Equal(t, proto.Connection_TYPE_UNSPECIFIED, req1.GetConnection().GetType())
|
|
require.Equal(t, "", req1.GetConnection().Ip)
|
|
require.Equal(t, connID[:], req1.GetConnection().GetId())
|
|
require.Equal(t, proto.Connection_DISCONNECT, req1.GetConnection().GetAction())
|
|
require.Equal(t, "because", req1.GetConnection().GetReason())
|
|
}
|
|
|
|
func TestContextConfigAPI_InitOnce(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// After the fix, contextConfigAPI is set once in init() and
|
|
// never reassigned. Resolve() evaluates lazily via the
|
|
// manifest, so there is no concurrent write to race with.
|
|
dir1 := platformAbsPath("dir1")
|
|
dir2 := platformAbsPath("dir2")
|
|
|
|
a := &agent{}
|
|
a.manifest.Store(&agentsdk.Manifest{Directory: dir1})
|
|
a.contextConfigAPI = agentcontextconfig.NewAPI(func() string {
|
|
if m := a.manifest.Load(); m != nil {
|
|
return m.Directory
|
|
}
|
|
return ""
|
|
}, agentcontextconfig.Config{})
|
|
|
|
mcpFiles1 := a.contextConfigAPI.MCPConfigFiles()
|
|
require.NotEmpty(t, mcpFiles1)
|
|
require.Contains(t, mcpFiles1[0], dir1)
|
|
|
|
// Simulate manifest update on reconnection -- no field
|
|
// reassignment needed, the lazy closure picks it up.
|
|
a.manifest.Store(&agentsdk.Manifest{Directory: dir2})
|
|
mcpFiles2 := a.contextConfigAPI.MCPConfigFiles()
|
|
require.NotEmpty(t, mcpFiles2)
|
|
require.Contains(t, mcpFiles2[0], dir2)
|
|
}
|
|
|
|
func TestClassifyCoordinatorRPCExit(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
canceled, cancel := context.WithCancel(context.Background())
|
|
cancel()
|
|
|
|
cases := []struct {
|
|
name string
|
|
ctx context.Context
|
|
retErr error
|
|
reason codersdk.DisconnectReason
|
|
initiator codersdk.DisconnectInitiator
|
|
}{
|
|
{
|
|
name: "local shutdown, no error",
|
|
ctx: canceled,
|
|
retErr: nil,
|
|
reason: codersdk.DisconnectReasonServerShutdown,
|
|
initiator: codersdk.DisconnectInitiatorAgent,
|
|
},
|
|
{
|
|
name: "local shutdown, with cleanup error",
|
|
ctx: canceled,
|
|
retErr: xerrors.New("close timed out"),
|
|
reason: codersdk.DisconnectReasonServerShutdown,
|
|
initiator: codersdk.DisconnectInitiatorAgent,
|
|
},
|
|
{
|
|
name: "remote graceful, no error",
|
|
ctx: context.Background(),
|
|
retErr: nil,
|
|
reason: codersdk.DisconnectReasonGraceful,
|
|
initiator: codersdk.DisconnectInitiatorServer,
|
|
},
|
|
{
|
|
name: "stream broke unexpectedly",
|
|
ctx: context.Background(),
|
|
retErr: xerrors.New("read: connection reset"),
|
|
reason: codersdk.DisconnectReasonNetworkError,
|
|
initiator: codersdk.DisconnectInitiatorNetwork,
|
|
},
|
|
}
|
|
|
|
for _, tc := range cases {
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
t.Parallel()
|
|
reason, initiator := classifyCoordinatorRPCExit(tc.ctx, tc.retErr)
|
|
require.Equal(t, tc.reason, reason)
|
|
require.Equal(t, tc.initiator, initiator)
|
|
})
|
|
}
|
|
}
|