fix(coderd/externalauth): retry transient refresh failures with backoff (#25686)

## Summary Wraps external auth token refresh in an exponential-backoff retry so a brief upstream hiccup (5xx, network timeout, rate-limited 429) no longer surfaces as an `InvalidTokenError` and forces users to re-authenticate. GitHub in particular has been flaky enough lately that this is hitting real users. ## Behavior - `(*Config).RefreshToken` now calls a helper that retries the `TokenSource.Token()` exchange with exponential backoff (250ms → 2s), bounded by a 10s total budget. - Errors classified as permanent by `isFailedRefresh` (e.g. `bad_refresh_token`, `invalid_grant`, `unauthorized_client`, ...) skip the retry loop. Retrying a permanent failure wastes the refresh quota and, on providers with single-use refresh tokens, can mask a legitimate concurrent winner with repeated `bad_refresh_token` responses. - Refreshes with an empty refresh token still short-circuit without making an API call. - The existing concurrent-refresh-race detection and optimistic-lock paths are unchanged. ## Tunables Three new `time.Duration` fields on `externalauth.Config` (`RefreshRetryInitialBackoff`, `RefreshRetryMaxBackoff`, `RefreshRetryTimeout`) let callers override the defaults. They default to zero, which falls back to the package defaults, so existing call sites are unaffected. The fields exist primarily so tests can dial the timing way down without touching package globals (and therefore without serializing parallel tests). ## Tests - `TestRefreshToken/RefreshRetries` now disables internal retries via `RefreshRetryTimeout = time.Nanosecond` so its existing "1 IDP call per `RefreshToken` invocation" assertion still holds. Otherwise its assertions are unchanged. - New `TestRefreshToken/RefreshTokenWithBackoff` simulates 3 transient 5xx failures followed by success and verifies the refresh ultimately succeeds with 4 total IDP attempts. - New `TestRefreshToken/RefreshTokenBackoffPermanentError` returns `bad_refresh_token` and verifies the refresh is **not** retried even with a generous 1s budget. <details> <summary>Why the explicit <code>retryCtx.Err()</code> guard?</summary> `retry.Retrier.Wait` `select`s between `time.After(delay)` and `ctx.Done()`. The first call has `delay == 0`, so `time.After(0)` and an already-cancelled context both fire immediately and Go picks the case nondeterministically. Without the guard, a near-zero retry budget would still trigger an unwanted extra refresh attempt roughly half the time, which would have made the `RefreshRetries` test flaky. </details> This PR was opened by a Coder agent on behalf of @kylecarbs.
2026-06-02 20:48:20 +00:00 · 2026-05-26 15:35:22 -04:00
parent 8b1705eb65
commit 58f6b9c4d0
2 changed files with 195 additions and 2 deletions
@@ -38,6 +38,19 @@ const (

 	// tokenRevocationTimeout timeout for requests to external oauth provider.
 	tokenRevocationTimeout = 10 * time.Second
+
+	// defaultRefreshRetryInitialBackoff is the starting wait between transient
+	// refresh retry attempts when the IDP returns a temporary failure (5xx,
+	// 429, network error, ...).
+	defaultRefreshRetryInitialBackoff = 250 * time.Millisecond
+
+	// defaultRefreshRetryMaxBackoff caps the exponential backoff between
+	// transient refresh retry attempts.
+	defaultRefreshRetryMaxBackoff = 2 * time.Second
+
+	// defaultRefreshRetryTimeout bounds the total time spent retrying a
+	// transient refresh failure across all attempts.
+	defaultRefreshRetryTimeout = 10 * time.Second
 )

 // Config is used for authentication for Git operations.
@@ -115,6 +128,19 @@ type Config struct {
 	// This field can be nil if unspecified in the config.
 	MCPToolDenyRegex              *regexp.Regexp
 	CodeChallengeMethodsSupported []promoauth.Oauth2PKCEChallengeMethod
+
+	// RefreshRetryInitialBackoff overrides the initial wait between transient
+	// refresh retry attempts. A zero value applies
+	// defaultRefreshRetryInitialBackoff.
+	RefreshRetryInitialBackoff time.Duration
+	// RefreshRetryMaxBackoff overrides the maximum wait between transient
+	// refresh retry attempts. A zero value applies
+	// defaultRefreshRetryMaxBackoff.
+	RefreshRetryMaxBackoff time.Duration
+	// RefreshRetryTimeout overrides the total budget for retrying a transient
+	// refresh failure across all attempts. A zero value applies
+	// defaultRefreshRetryTimeout.
+	RefreshRetryTimeout time.Duration
 }

 // Git returns a Provider for this config if the provider type is a
@@ -192,7 +218,15 @@ func (c *Config) RefreshToken(ctx context.Context, db database.Store, externalAu
 	// Note: The TokenSource(...) method will make no remote HTTP requests if the
 	// token is expired and no refresh token is set. This is important to prevent
 	// spamming the API, consuming rate limits, when the token is known to fail.
-	token, err := c.TokenSource(ctx, existingToken).Token()
+	//
+	// External providers (GitHub in particular) intermittently fail token
+	// refreshes with transient errors such as 5xx responses, network timeouts,
+	// and rate-limited 429s. Retry with exponential backoff before surfacing
+	// the failure so a brief upstream blip does not force users to
+	// re-authenticate. Errors classified as permanent by isFailedRefresh
+	// (e.g. revoked or rotated refresh tokens) are not retried since those
+	// will never succeed and retrying wastes the refresh quota.
+	token, err := c.refreshTokenWithRetry(ctx, existingToken)
 	if err != nil {
 		// TokenSource can fail for numerous reasons. If it fails because of
 		// a bad refresh token, then the refresh token is invalid, and we should
@@ -353,6 +387,59 @@ validate:
 	return externalAuthLink, nil
 }

+// refreshTokenWithRetry exchanges the refresh token for a new access token,
+// retrying with exponential backoff on transient failures. Permanent
+// failures (as classified by isFailedRefresh) and the no-op case where no
+// refresh token is set bypass the retry loop so a doomed refresh is not
+// repeatedly attempted.
+func (c *Config) refreshTokenWithRetry(ctx context.Context, existingToken *oauth2.Token) (*oauth2.Token, error) {
+	// Without a refresh token the oauth2 library short-circuits with
+	// "token expired and refresh token is not set". No retry can recover
+	// from that, so make a single attempt and return.
+	if existingToken.RefreshToken == "" {
+		return c.TokenSource(ctx, existingToken).Token()
+	}
+
+	initial := c.RefreshRetryInitialBackoff
+	if initial <= 0 {
+		initial = defaultRefreshRetryInitialBackoff
+	}
+	maximum := c.RefreshRetryMaxBackoff
+	if maximum <= 0 {
+		maximum = defaultRefreshRetryMaxBackoff
+	}
+	total := c.RefreshRetryTimeout
+	if total <= 0 {
+		total = defaultRefreshRetryTimeout
+	}
+
+	retryCtx, retryCancel := context.WithTimeout(ctx, total)
+	defer retryCancel()
+	backoff := retry.New(initial, maximum)
+
+	var (
+		token *oauth2.Token
+		err   error
+	)
+	for {
+		token, err = c.TokenSource(ctx, existingToken).Token()
+		if err == nil || isFailedRefresh(existingToken, err) {
+			return token, err
+		}
+		// Bail out before waiting if the retry budget is already gone.
+		// retry.Wait selects between time.After(delay) and ctx.Done(); when
+		// delay is zero and the context is already canceled the two cases
+		// race nondeterministically, which would cause an unwanted extra
+		// refresh attempt with a near-zero budget (notably in tests).
+		if retryCtx.Err() != nil {
+			return token, err
+		}
+		if !backoff.Wait(retryCtx) {
+			return token, err
+		}
+	}
+}
+
 // ValidateToken checks if the Git token provided is valid.
 // The user is optionally returned if the provider supports it.
 // Returns valid=true when: the provider confirmed the token,
@@ -155,6 +155,10 @@ func TestRefreshToken(t *testing.T) {
 	// If a refresh token fails because the token itself is invalid, no more
 	// refresh attempts should ever happen. An invalid refresh token does
 	// not magically become valid at some point in the future.
+	//
+	// Internal retries are disabled in this subtest via RefreshRetryTimeout
+	// so each RefreshToken call results in exactly one IDP refresh attempt.
+	// The RefreshTokenWithBackoff subtest covers the retry-with-backoff path.
 	t.Run("RefreshRetries", func(t *testing.T) {
 		t.Parallel()

@@ -177,7 +181,11 @@ func TestRefreshToken(t *testing.T) {
 					return nil, xerrors.New("should not be called")
 				}),
 			},
-			ExternalAuthOpt: func(cfg *externalauth.Config) {},
+			ExternalAuthOpt: func(cfg *externalauth.Config) {
+				// Disable transient-error retries so the assertion below
+				// (1 IDP call per RefreshToken) holds.
+				cfg.RefreshRetryTimeout = time.Nanosecond
+			},
 		})

 		ctx := oidc.ClientContext(context.Background(), fake.HTTPClient(nil))
@@ -227,6 +235,104 @@ func TestRefreshToken(t *testing.T) {
 		require.Equal(t, refreshCount, totalRefreshes)
 	})

+	// RefreshTokenWithBackoff tests that refreshes which fail with transient
+	// errors (HTTP 5xx, 429, network errors) are retried with exponential
+	// backoff so a temporary upstream glitch does not force users to
+	// re-authenticate. After enough successful retries, RefreshToken should
+	// return a valid token without surfacing the transient error.
+	t.Run("RefreshTokenWithBackoff", func(t *testing.T) {
+		t.Parallel()
+
+		db, _ := dbtestutil.NewDB(t)
+
+		const failuresBeforeSuccess = 3
+		var refreshCalls atomic.Int64
+		fake, config, link := setupOauth2Test(t, testConfig{
+			FakeIDPOpts: []oidctest.FakeIDPOpt{
+				oidctest.WithRefresh(func(_ string) error {
+					// Fail the first N attempts with a transient 5xx, then succeed.
+					if refreshCalls.Add(1) <= failuresBeforeSuccess {
+						return &oauth2.RetrieveError{
+							Response:  &http.Response{StatusCode: http.StatusInternalServerError},
+							ErrorCode: "server_error",
+						}
+					}
+					return nil
+				}),
+			},
+			ExternalAuthOpt: func(cfg *externalauth.Config) {
+				cfg.Type = codersdk.EnhancedExternalAuthProviderGitHub.String()
+				// Tight backoffs keep the test fast.
+				cfg.RefreshRetryInitialBackoff = time.Millisecond
+				cfg.RefreshRetryMaxBackoff = 5 * time.Millisecond
+				cfg.RefreshRetryTimeout = 5 * time.Second
+			},
+			DB: db,
+		})
+
+		ctx := oidc.ClientContext(context.Background(), fake.HTTPClient(nil))
+		oldAccessToken := link.OAuthAccessToken
+		link.OAuthExpiry = expired
+
+		updated, err := config.RefreshToken(ctx, db, link)
+		require.NoError(t, err, "transient errors should be retried until success")
+		require.Equal(t, int64(failuresBeforeSuccess+1), refreshCalls.Load(),
+			"refresh should have been retried until the IDP returned success")
+		require.NotEqual(t, oldAccessToken, updated.OAuthAccessToken,
+			"a new access token should have been issued")
+	})
+
+	// RefreshTokenBackoffPermanentError verifies that errors classified as
+	// permanent by isFailedRefresh (e.g. "bad_refresh_token") are not
+	// retried. Retrying a permanent failure wastes the refresh quota and,
+	// on providers with single-use refresh tokens, can mask a legitimate
+	// concurrent winner with repeated "bad_refresh_token" responses.
+	t.Run("RefreshTokenBackoffPermanentError", func(t *testing.T) {
+		t.Parallel()
+
+		ctrl := gomock.NewController(t)
+		mDB := dbmock.NewMockStore(ctrl)
+
+		var refreshCalls atomic.Int64
+		fake, config, link := setupOauth2Test(t, testConfig{
+			FakeIDPOpts: []oidctest.FakeIDPOpt{
+				oidctest.WithRefresh(func(_ string) error {
+					refreshCalls.Add(1)
+					return &oauth2.RetrieveError{
+						Response:  &http.Response{StatusCode: http.StatusOK},
+						ErrorCode: "bad_refresh_token",
+					}
+				}),
+			},
+			ExternalAuthOpt: func(cfg *externalauth.Config) {
+				cfg.Type = codersdk.EnhancedExternalAuthProviderGitHub.String()
+				// Generous backoff: a regression that incorrectly retried
+				// would re-run the failing refresh many times and the test
+				// would fail on the call-count assertion below.
+				cfg.RefreshRetryInitialBackoff = time.Millisecond
+				cfg.RefreshRetryMaxBackoff = 5 * time.Millisecond
+				cfg.RefreshRetryTimeout = time.Second
+			},
+		})
+
+		// The race-detection re-read returns the same refresh token so it
+		// does not look like a concurrent winner. The cached-failure write
+		// then proceeds. Each runs exactly once for a single refresh attempt.
+		mDB.EXPECT().GetExternalAuthLink(gomock.Any(), gomock.Any()).
+			Return(link, nil).Times(1)
+		mDB.EXPECT().UpdateExternalAuthLinkRefreshToken(gomock.Any(), gomock.Any()).
+			Return(nil).Times(1)
+
+		ctx := oidc.ClientContext(context.Background(), fake.HTTPClient(nil))
+		link.OAuthExpiry = expired
+
+		_, err := config.RefreshToken(ctx, mDB, link)
+		require.Error(t, err)
+		require.True(t, externalauth.IsInvalidTokenError(err))
+		require.Equal(t, int64(1), refreshCalls.Load(),
+			"permanent failures should not be retried")
+	})
+
 	// ConcurrentRefreshRace tests that when multiple concurrent requests
 	// race to refresh the same token, the loser does not poison the
 	// database with a cached "bad_refresh_token" failure. This