fix(cli): retry transient connection failures during SSH setup (#24010)

When `coder ssh` connects to a workspace after laptop wake, DNS or the
control plane may be briefly unavailable. Previously this caused an
immediate failure, which VS Code Remote SSH classified as permanent
("Reload Window").

Wrap each network step (workspace resolution, template version fetch,
agent connection info, Coder Connect dial, tailnet dial) with
`retryWithInterval` so transient errors (DNS, connection refused, 5xx)
are retried individually. Non-retryable errors (auth, 404) and context
cancellation stop immediately. Data transfer is never retried.
This commit is contained in:
Ehab Younes
2026-04-07 21:59:10 +00:00
committed by GitHub
parent 48bc215f20
commit d00f148b76
2 changed files with 221 additions and 17 deletions
+92 -16
View File
@@ -52,6 +52,10 @@ import (
const (
disableUsageApp = "disable"
// Retry transient errors during SSH connection establishment.
sshRetryInterval = 2 * time.Second
sshMaxAttempts = 10 // initial + retries per step
)
var (
@@ -62,6 +66,51 @@ var (
workspaceNameRe = regexp.MustCompile(`[/.]+|--`)
)
// isRetryableError checks for transient connection errors worth
// retrying: DNS failures, connection refused, and server 5xx.
func isRetryableError(err error) bool {
if err == nil {
return false
}
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
return false
}
if codersdk.IsConnectionError(err) {
return true
}
var sdkErr *codersdk.Error
if xerrors.As(err, &sdkErr) {
return sdkErr.StatusCode() >= 500
}
return false
}
// retryWithInterval calls fn up to maxAttempts times, waiting
// interval between attempts. Stops on success, non-retryable
// error, or context cancellation.
func retryWithInterval(ctx context.Context, logger slog.Logger, interval time.Duration, maxAttempts int, fn func() error) error {
var lastErr error
attempt := 0
for r := retry.New(interval, interval); r.Wait(ctx); {
lastErr = fn()
if lastErr == nil || !isRetryableError(lastErr) {
return lastErr
}
attempt++
if attempt >= maxAttempts {
break
}
logger.Warn(ctx, "transient error, retrying",
slog.Error(lastErr),
slog.F("attempt", attempt),
)
}
if lastErr != nil {
return lastErr
}
return ctx.Err()
}
func (r *RootCmd) ssh() *serpent.Command {
var (
stdio bool
@@ -277,10 +326,17 @@ func (r *RootCmd) ssh() *serpent.Command {
HostnameSuffix: hostnameSuffix,
}
workspace, workspaceAgent, err := findWorkspaceAndAgentByHostname(
ctx, inv, client,
inv.Args[0], cliConfig, disableAutostart)
if err != nil {
// Populated by the closure below.
var workspace codersdk.Workspace
var workspaceAgent codersdk.WorkspaceAgent
resolveWorkspace := func() error {
var err error
workspace, workspaceAgent, err = findWorkspaceAndAgentByHostname(
ctx, inv, client,
inv.Args[0], cliConfig, disableAutostart)
return err
}
if err := retryWithInterval(ctx, logger, sshRetryInterval, sshMaxAttempts, resolveWorkspace); err != nil {
return err
}
@@ -306,8 +362,13 @@ func (r *RootCmd) ssh() *serpent.Command {
wait = false
}
templateVersion, err := client.TemplateVersion(ctx, workspace.LatestBuild.TemplateVersionID)
if err != nil {
var templateVersion codersdk.TemplateVersion
fetchVersion := func() error {
var err error
templateVersion, err = client.TemplateVersion(ctx, workspace.LatestBuild.TemplateVersionID)
return err
}
if err := retryWithInterval(ctx, logger, sshRetryInterval, sshMaxAttempts, fetchVersion); err != nil {
return err
}
@@ -347,8 +408,12 @@ func (r *RootCmd) ssh() *serpent.Command {
// If we're in stdio mode, check to see if we can use Coder Connect.
// We don't support Coder Connect over non-stdio coder ssh yet.
if stdio && !forceNewTunnel {
connInfo, err := wsClient.AgentConnectionInfoGeneric(ctx)
if err != nil {
var connInfo workspacesdk.AgentConnectionInfo
if err := retryWithInterval(ctx, logger, sshRetryInterval, sshMaxAttempts, func() error {
var err error
connInfo, err = wsClient.AgentConnectionInfoGeneric(ctx)
return err
}); err != nil {
return xerrors.Errorf("get agent connection info: %w", err)
}
coderConnectHost := fmt.Sprintf("%s.%s.%s.%s",
@@ -384,23 +449,27 @@ func (r *RootCmd) ssh() *serpent.Command {
})
defer closeUsage()
}
return runCoderConnectStdio(ctx, fmt.Sprintf("%s:22", coderConnectHost), stdioReader, stdioWriter, stack)
return runCoderConnectStdio(ctx, fmt.Sprintf("%s:22", coderConnectHost), stdioReader, stdioWriter, stack, logger)
}
}
if r.disableDirect {
_, _ = fmt.Fprintln(inv.Stderr, "Direct connections disabled.")
}
conn, err := wsClient.
DialAgent(ctx, workspaceAgent.ID, &workspacesdk.DialAgentOptions{
var conn workspacesdk.AgentConn
if err := retryWithInterval(ctx, logger, sshRetryInterval, sshMaxAttempts, func() error {
var err error
conn, err = wsClient.DialAgent(ctx, workspaceAgent.ID, &workspacesdk.DialAgentOptions{
Logger: logger,
BlockEndpoints: r.disableDirect,
EnableTelemetry: !r.disableNetworkTelemetry,
})
if err != nil {
return err
}); err != nil {
return xerrors.Errorf("dial agent: %w", err)
}
if err = stack.push("agent conn", conn); err != nil {
_ = conn.Close()
return err
}
conn.AwaitReachable(ctx)
@@ -1583,11 +1652,18 @@ func testOrDefaultDialer(ctx context.Context) coderConnectDialer {
return dialer
}
func runCoderConnectStdio(ctx context.Context, addr string, stdin io.Reader, stdout io.Writer, stack *closerStack) error {
func runCoderConnectStdio(ctx context.Context, addr string, stdin io.Reader, stdout io.Writer, stack *closerStack, logger slog.Logger) error {
dialer := testOrDefaultDialer(ctx)
conn, err := dialer.DialContext(ctx, "tcp", addr)
if err != nil {
return xerrors.Errorf("dial coder connect host: %w", err)
var conn net.Conn
if err := retryWithInterval(ctx, logger, sshRetryInterval, sshMaxAttempts, func() error {
var err error
conn, err = dialer.DialContext(ctx, "tcp", addr)
if err != nil {
return xerrors.Errorf("dial coder connect host %q over tcp: %w", addr, err)
}
return nil
}); err != nil {
return err
}
if err := stack.push("tcp conn", conn); err != nil {
return err