From 1d0653cdab69e5a0abd9d6b69a803fbd580ef13c Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Fri, 10 Apr 2026 00:55:16 +0300 Subject: [PATCH] fix(cli): retry dial timeouts in SSH connection setup (#24199) Reorder error checks in isRetryableError so IsConnectionError is evaluated before context.DeadlineExceeded. Dial timeouts (*net.OpError wrapping DeadlineExceeded) were incorrectly treated as non-retryable, causing Coder Connect to fail immediately on broken tunnels with valid DNS despite existing retry logic. Fixes #24201 --- cli/ssh.go | 10 ++++++---- cli/ssh_internal_test.go | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cli/ssh.go b/cli/ssh.go index 4bcc40d4c0..af046eda36 100644 --- a/cli/ssh.go +++ b/cli/ssh.go @@ -69,15 +69,17 @@ var ( // isRetryableError checks for transient connection errors worth // retrying: DNS failures, connection refused, and server 5xx. func isRetryableError(err error) bool { - if err == nil { - return false - } - if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) { + if err == nil || xerrors.Is(err, context.Canceled) { return false } + // Check connection errors before context.DeadlineExceeded because + // net.Dialer.Timeout produces *net.OpError that matches both. if codersdk.IsConnectionError(err) { return true } + if xerrors.Is(err, context.DeadlineExceeded) { + return false + } var sdkErr *codersdk.Error if xerrors.As(err, &sdkErr) { return sdkErr.StatusCode() >= 500 diff --git a/cli/ssh_internal_test.go b/cli/ssh_internal_test.go index 62977c50a9..9a9449eac0 100644 --- a/cli/ssh_internal_test.go +++ b/cli/ssh_internal_test.go @@ -516,6 +516,23 @@ func TestIsRetryableError(t *testing.T) { assert.Equal(t, tt.retryable, isRetryableError(tt.err)) }) } + + // net.Dialer.Timeout produces *net.OpError that matches both + // IsConnectionError and context.DeadlineExceeded. Verify it is retryable. + t.Run("DialTimeout", func(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithDeadline(context.Background(), time.Now()) + defer cancel() + <-ctx.Done() // ensure deadline has fired + _, err := (&net.Dialer{}).DialContext(ctx, "tcp", "127.0.0.1:1") + require.Error(t, err) + // Proves the ambiguity: this error matches BOTH checks. + require.ErrorIs(t, err, context.DeadlineExceeded) + require.ErrorAs(t, err, new(*net.OpError)) + assert.True(t, isRetryableError(err)) + // Also when wrapped, as runCoderConnectStdio does. + assert.True(t, isRetryableError(xerrors.Errorf("dial coder connect: %w", err))) + }) } func TestRetryWithInterval(t *testing.T) {