Files
coder/coderd/x/chatd/chaterror/classify_test.go
T
Ethan 4d74603045 fix(coderd/x/chatd): respect provider Retry-After headers in chat retry loop (#23351)
> **PR Stack**
> 1. **#23351** ← `#23282` *(you are here)*
> 2. #23282 ← `#23275`
> 3. #23275 ← `#23349`
> 4. #23349 ← `main`

---

## Summary

`chatretry.Retry()` used pure exponential backoff (1 s, 2 s, 4 s, …) and
never consulted provider `Retry-After` headers. Fantasy's
`ProviderError` carries `ResponseHeaders` including `Retry-After`, but
`chaterror.Classify()` only parsed error text and silently dropped the
structured transport metadata.

This makes `Retry-After` a first-class signal in the classification →
retry pipeline.

<img width="853" height="346" alt="image"
src="https://github.com/user-attachments/assets/65f012b6-8173-43d2-957e-ab9faddea525"
/>


## Changes

### `coderd/x/chatd/chaterror/classify.go`

- Added `RetryAfter time.Duration` field to `ClassifiedError` — a
normalized minimum retry delay derived from provider response metadata.
- `Classify()` now calls `extractProviderErrorDetails()` before falling
back to text heuristics. Structured `ProviderError.StatusCode` takes
priority over regex extraction.
- `normalizeClassification()` preserves and clamps `RetryAfter`.

### `coderd/x/chatd/chaterror/provider_error.go` (new)

Provider-specific extraction, isolated from the text-based
classification logic:

- `extractProviderErrorDetails()` unwraps `*fantasy.ProviderError` from
the error chain via `errors.As`.
- `retryAfterFromHeaders()` parses headers in priority order:
  1. `retry-after-ms` (OpenAI-specific, millisecond precision)
  2. `retry-after` (standard HTTP — integer seconds or HTTP-date)
- Case-insensitive header key lookup.

### `coderd/x/chatd/chatretry/chatretry.go`

- `effectiveDelay(attempt, classified)` computes `max(Delay(attempt),
classified.RetryAfter)` — the provider hint acts as a floor without
weakening the local exponential backoff.
- `Retry()` now uses `effectiveDelay` and passes the effective delay to
both `onRetry(...)` and the sleep timer, so downstream payloads, logs,
and the frontend countdown stay aligned automatically.

### Tests

- `classify_test.go`: Structured provider status + `Retry-After`
extraction, `retry-after-ms` priority, HTTP-date parsing, invalid header
fallback, `WithProvider` preservation.
- `chatretry_test.go`: Retry-after-as-floor semantics — longer hint
wins, shorter hint keeps base delay.

## Design notes

- **No SDK/API/frontend changes needed.** `codersdk.ChatStreamRetry`
already carries `DelayMs` and `RetryingAt`, and the frontend already
consumes them. The fix is purely in the server-side delay computation.
- **Existing retryability rules unchanged.** This fixes *when* we sleep,
not *whether* an error is retryable.
- **Provider hint is a floor:** `max(baseDelay, RetryAfter)` ensures we
never retry earlier than the provider asks, and never weaken our own
backoff curve.
2026-03-27 01:20:46 +11:00

435 lines
15 KiB
Go

package chaterror_test
import (
"context"
"net/http"
"testing"
"time"
"charm.land/fantasy"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/x/chatd/chaterror"
)
// TestClassify checks the full ClassifiedError that Classify returns for a
// set of representative errors. Each case pins every field of the result, so
// the table also documents precedence between competing signals (for
// example, a configuration phrase winning over a 429 or 503 status code).
func TestClassify(t *testing.T) {
	t.Parallel()

	// User-facing messages that several cases below expect verbatim.
	const (
		authMessage   = "Authentication with the AI provider failed. Check the API key, permissions, and billing settings."
		configMessage = "The AI provider rejected the model configuration. Check the selected model and provider settings."
	)

	type classifyCase struct {
		name string
		err  error
		want chaterror.ClassifiedError
	}

	// Provider and StatusCode are omitted wherever the expected value is the
	// zero value ("" and 0 respectively).
	cases := []classifyCase{
		{
			name: "AmbiguousOverloadKeepsProviderUnknown",
			err:  xerrors.New("status 529 from upstream"),
			want: chaterror.ClassifiedError{
				Message:    "The AI provider is temporarily overloaded (HTTP 529).",
				Kind:       chaterror.KindOverloaded,
				Retryable:  true,
				StatusCode: 529,
			},
		},
		{
			name: "ExplicitAnthropicOverload",
			err:  xerrors.New("anthropic overloaded_error"),
			want: chaterror.ClassifiedError{
				Message:   "Anthropic is temporarily overloaded.",
				Kind:      chaterror.KindOverloaded,
				Provider:  "anthropic",
				Retryable: true,
			},
		},
		{
			name: "AuthBeatsConfig",
			err:  xerrors.New("authentication failed: invalid model"),
			want: chaterror.ClassifiedError{
				Message:   authMessage,
				Kind:      chaterror.KindAuth,
				Retryable: false,
			},
		},
		{
			name: "PureConfig",
			err:  xerrors.New("invalid model"),
			want: chaterror.ClassifiedError{
				Message:   configMessage,
				Kind:      chaterror.KindConfig,
				Retryable: false,
			},
		},
		{
			name: "BareForbiddenClassifiesAsAuth",
			err:  xerrors.New("forbidden"),
			want: chaterror.ClassifiedError{
				Message:   authMessage,
				Kind:      chaterror.KindAuth,
				Retryable: false,
			},
		},
		{
			name: "ExplicitStatus401ClassifiesAsAuth",
			err:  xerrors.New("status 401 from upstream"),
			want: chaterror.ClassifiedError{
				Message:    authMessage,
				Kind:       chaterror.KindAuth,
				Retryable:  false,
				StatusCode: 401,
			},
		},
		{
			name: "ExplicitStatus403ClassifiesAsAuth",
			err:  xerrors.New("status 403 from upstream"),
			want: chaterror.ClassifiedError{
				Message:    authMessage,
				Kind:       chaterror.KindAuth,
				Retryable:  false,
				StatusCode: 403,
			},
		},
		{
			name: "ForbiddenContextLengthClassifiesAsConfig",
			err:  xerrors.New("forbidden: context length exceeded"),
			want: chaterror.ClassifiedError{
				Message:   configMessage,
				Kind:      chaterror.KindConfig,
				Retryable: false,
			},
		},
		{
			name: "ExplicitStatus429ClassifiesAsRateLimit",
			err:  xerrors.New("status 429 from upstream"),
			want: chaterror.ClassifiedError{
				Message:    "The AI provider is rate limiting requests (HTTP 429).",
				Kind:       chaterror.KindRateLimit,
				Retryable:  true,
				StatusCode: 429,
			},
		},
		{
			name: "RateLimitDoesNotBeatConfig",
			err:  xerrors.New("status 429: invalid model"),
			want: chaterror.ClassifiedError{
				Message:    configMessage,
				Kind:       chaterror.KindConfig,
				Retryable:  false,
				StatusCode: 429,
			},
		},
		{
			name: "ServiceUnavailableClassifiesAsRetryableTimeout",
			err:  xerrors.New("service unavailable"),
			want: chaterror.ClassifiedError{
				Message:   "The AI provider is temporarily unavailable.",
				Kind:      chaterror.KindTimeout,
				Retryable: true,
			},
		},
		{
			name: "TimeoutDoesNotBeatConfigViaStatusCode",
			err:  xerrors.New("status 503: invalid model"),
			want: chaterror.ClassifiedError{
				Message:    configMessage,
				Kind:       chaterror.KindConfig,
				Retryable:  false,
				StatusCode: 503,
			},
		},
		{
			name: "TimeoutDoesNotBeatConfigViaMessage",
			err:  xerrors.New("service unavailable: model not found"),
			want: chaterror.ClassifiedError{
				Message:   configMessage,
				Kind:      chaterror.KindConfig,
				Retryable: false,
			},
		},
		{
			name: "ConnectionRefusedUnsupportedModelClassifiesAsConfig",
			err:  xerrors.New("connection refused: unsupported model"),
			want: chaterror.ClassifiedError{
				Message:   configMessage,
				Kind:      chaterror.KindConfig,
				Retryable: false,
			},
		},
		{
			name: "DeadlineExceededStaysNonRetryableTimeout",
			err:  context.DeadlineExceeded,
			want: chaterror.ClassifiedError{
				Message:   "The request timed out before it completed.",
				Kind:      chaterror.KindTimeout,
				Retryable: false,
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			got := chaterror.Classify(tc.err)
			require.Equal(t, tc.want, got)
		})
	}
}
// TestClassify_PatternCoverage feeds Classify one bare error string per
// recognized text pattern (or status code) and asserts only the resulting
// Kind and Retryable flag, not the message or provider.
func TestClassify_PatternCoverage(t *testing.T) {
	t.Parallel()

	type patternCase struct {
		name      string
		input     string // raw error text handed to xerrors.New
		kind      string // expected ClassifiedError.Kind
		retryable bool   // expected ClassifiedError.Retryable
	}

	cases := []patternCase{
		{name: "OverloadedLiteral", input: "overloaded", kind: chaterror.KindOverloaded, retryable: true},
		{name: "RateLimitLiteral", input: "rate limit", kind: chaterror.KindRateLimit, retryable: true},
		{name: "RateLimitUnderscoreLiteral", input: "rate_limit", kind: chaterror.KindRateLimit, retryable: true},
		{name: "RateLimitedLiteral", input: "rate limited", kind: chaterror.KindRateLimit, retryable: true},
		{name: "RateLimitedHyphenLiteral", input: "rate-limited", kind: chaterror.KindRateLimit, retryable: true},
		{name: "TooManyRequestsLiteral", input: "too many requests", kind: chaterror.KindRateLimit, retryable: true},
		{name: "TimeoutLiteral", input: "timeout", kind: chaterror.KindTimeout, retryable: true},
		{name: "TimedOutLiteral", input: "timed out", kind: chaterror.KindTimeout, retryable: true},
		{name: "ServiceUnavailableLiteral", input: "service unavailable", kind: chaterror.KindTimeout, retryable: true},
		{name: "UnavailableLiteral", input: "unavailable", kind: chaterror.KindTimeout, retryable: true},
		{name: "ConnectionResetLiteral", input: "connection reset", kind: chaterror.KindTimeout, retryable: true},
		{name: "ConnectionRefusedLiteral", input: "connection refused", kind: chaterror.KindTimeout, retryable: true},
		{name: "EOFLiteral", input: "eof", kind: chaterror.KindTimeout, retryable: true},
		{name: "BrokenPipeLiteral", input: "broken pipe", kind: chaterror.KindTimeout, retryable: true},
		{name: "BadGatewayLiteral", input: "bad gateway", kind: chaterror.KindTimeout, retryable: true},
		{name: "GatewayTimeoutLiteral", input: "gateway timeout", kind: chaterror.KindTimeout, retryable: true},
		{name: "AuthenticationLiteral", input: "authentication", kind: chaterror.KindAuth, retryable: false},
		{name: "UnauthorizedLiteral", input: "unauthorized", kind: chaterror.KindAuth, retryable: false},
		{name: "InvalidAPIKeyLiteral", input: "invalid api key", kind: chaterror.KindAuth, retryable: false},
		{name: "InvalidAPIKeyUnderscoreLiteral", input: "invalid_api_key", kind: chaterror.KindAuth, retryable: false},
		{name: "QuotaLiteral", input: "quota", kind: chaterror.KindAuth, retryable: false},
		{name: "BillingLiteral", input: "billing", kind: chaterror.KindAuth, retryable: false},
		{name: "InsufficientQuotaLiteral", input: "insufficient_quota", kind: chaterror.KindAuth, retryable: false},
		{name: "PaymentRequiredLiteral", input: "payment required", kind: chaterror.KindAuth, retryable: false},
		{name: "ForbiddenLiteral", input: "forbidden", kind: chaterror.KindAuth, retryable: false},
		{name: "InvalidModelLiteral", input: "invalid model", kind: chaterror.KindConfig, retryable: false},
		{name: "ModelNotFoundLiteral", input: "model not found", kind: chaterror.KindConfig, retryable: false},
		{name: "ModelNotFoundUnderscoreLiteral", input: "model_not_found", kind: chaterror.KindConfig, retryable: false},
		{name: "UnsupportedModelLiteral", input: "unsupported model", kind: chaterror.KindConfig, retryable: false},
		{name: "ContextLengthExceededLiteral", input: "context length exceeded", kind: chaterror.KindConfig, retryable: false},
		{name: "ContextExceededLiteral", input: "context_exceeded", kind: chaterror.KindConfig, retryable: false},
		{name: "MaximumContextLengthLiteral", input: "maximum context length", kind: chaterror.KindConfig, retryable: false},
		{name: "MalformedConfigLiteral", input: "malformed config", kind: chaterror.KindConfig, retryable: false},
		{name: "MalformedConfigurationLiteral", input: "malformed configuration", kind: chaterror.KindConfig, retryable: false},
		{name: "ServerErrorLiteral", input: "server error", kind: chaterror.KindGeneric, retryable: true},
		{name: "InternalServerErrorLiteral", input: "internal server error", kind: chaterror.KindGeneric, retryable: true},
		{name: "ChatInterruptedLiteral", input: "chat interrupted", kind: chaterror.KindGeneric, retryable: false},
		{name: "RequestInterruptedLiteral", input: "request interrupted", kind: chaterror.KindGeneric, retryable: false},
		{name: "OperationInterruptedLiteral", input: "operation interrupted", kind: chaterror.KindGeneric, retryable: false},
		{name: "Status408", input: "status 408", kind: chaterror.KindTimeout, retryable: true},
		{name: "Status500", input: "status 500", kind: chaterror.KindGeneric, retryable: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			got := chaterror.Classify(xerrors.New(tc.input))
			require.Equal(t, tc.kind, got.Kind)
			require.Equal(t, tc.retryable, got.Retryable)
		})
	}
}
// TestClassify_TransportFailuresUseBroaderRetryMessage asserts that
// transport-level failure strings are classified as retryable timeouts and
// all share the same "temporarily unavailable" user-facing message.
func TestClassify_TransportFailuresUseBroaderRetryMessage(t *testing.T) {
	t.Parallel()

	// Every input below is expected to produce exactly this message.
	const wantMessage = "The AI provider is temporarily unavailable."

	cases := []struct {
		name  string
		input string
	}{
		{name: "TimeoutLiteral", input: "timeout"},
		{name: "EOFLiteral", input: "eof"},
		{name: "BrokenPipeLiteral", input: "broken pipe"},
		{name: "ConnectionResetLiteral", input: "connection reset"},
		{name: "ConnectionRefusedLiteral", input: "connection refused"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			got := chaterror.Classify(xerrors.New(tc.input))
			require.Equal(t, chaterror.KindTimeout, got.Kind)
			require.True(t, got.Retryable)
			require.Equal(t, wantMessage, got.Message)
		})
	}
}
// TestClassify_StartupTimeoutWrappedClassificationWins asserts that a
// classification attached with WithClassification is what Classify reports,
// even though the underlying error text is "context canceled". The expected
// value also pins the message generated for the startup-timeout kind with
// the "openai" provider.
func TestClassify_StartupTimeoutWrappedClassificationWins(t *testing.T) {
	t.Parallel()

	attached := chaterror.ClassifiedError{
		Kind:      chaterror.KindStartupTimeout,
		Provider:  "openai",
		Retryable: true,
	}
	err := chaterror.WithClassification(xerrors.New("context canceled"), attached)

	want := chaterror.ClassifiedError{
		Message:    "OpenAI did not start responding in time.",
		Kind:       chaterror.KindStartupTimeout,
		Provider:   "openai",
		Retryable:  true,
		StatusCode: 0,
	}
	got := chaterror.Classify(err)
	require.Equal(t, want, got)
}
// TestWithProviderUsesExplicitHint asserts that WithProvider replaces a
// provider detected from the error text ("openai") with the explicit hint.
// The expected value records the hint "azure openai" as Provider "azure"
// with an "Azure OpenAI" message.
func TestWithProviderUsesExplicitHint(t *testing.T) {
	t.Parallel()

	base := chaterror.Classify(xerrors.New("openai received status 429 from upstream"))
	require.Equal(t, "openai", base.Provider)

	want := chaterror.ClassifiedError{
		Message:    "Azure OpenAI is rate limiting requests (HTTP 429).",
		Kind:       chaterror.KindRateLimit,
		Provider:   "azure",
		Retryable:  true,
		StatusCode: 429,
	}
	got := base.WithProvider("azure openai")
	require.Equal(t, want, got)
}
// TestWithProviderAddsProviderWhenUnknown asserts that WithProvider fills in
// the provider, and a provider-specific message, when Classify could not
// determine one from the error text.
func TestWithProviderAddsProviderWhenUnknown(t *testing.T) {
	t.Parallel()

	base := chaterror.Classify(xerrors.New("received status 429 from upstream"))
	require.Empty(t, base.Provider)

	want := chaterror.ClassifiedError{
		Message:    "OpenAI is rate limiting requests (HTTP 429).",
		Kind:       chaterror.KindRateLimit,
		Provider:   "openai",
		Retryable:  true,
		StatusCode: 429,
	}
	got := base.WithProvider("openai")
	require.Equal(t, want, got)
}
// TestClassify_UsesStructuredProviderStatusAndRetryAfter asserts that a
// fantasy.ProviderError with an empty message is classified from its
// structured fields alone: the 429 status code yields a rate-limit result and
// the "Retry-After: 30" header becomes a 30-second RetryAfter.
func TestClassify_UsesStructuredProviderStatusAndRetryAfter(t *testing.T) {
	t.Parallel()

	headers := map[string]string{"Retry-After": "30"}
	got := chaterror.Classify(testProviderError("", 429, headers))

	want := chaterror.ClassifiedError{
		Message:    "The AI provider is rate limiting requests (HTTP 429).",
		Kind:       chaterror.KindRateLimit,
		Provider:   "",
		Retryable:  true,
		StatusCode: 429,
		RetryAfter: 30 * time.Second,
	}
	require.Equal(t, want, got)
}
// TestClassify_PrefersRetryAfterMsOverRetryAfter asserts that when both
// headers are present, the millisecond-precision retry-after-ms value (1500)
// is used instead of Retry-After (30). The mixed-case header key also
// exercises case-insensitive header lookup.
func TestClassify_PrefersRetryAfterMsOverRetryAfter(t *testing.T) {
	t.Parallel()

	headers := map[string]string{
		"Retry-After":    "30",
		"ReTrY-AfTeR-Ms": "1500",
	}
	got := chaterror.Classify(testProviderError("upstream failed", 429, headers))

	require.Equal(t, 429, got.StatusCode)
	require.Equal(t, 1500*time.Millisecond, got.RetryAfter)
}
// TestClassify_ParsesRetryAfterHTTPDate asserts that a Retry-After header
// carrying an HTTP-date (rather than delta-seconds) is turned into a
// RetryAfter duration measured from the current time.
func TestClassify_ParsesRetryAfterHTTPDate(t *testing.T) {
	t.Parallel()

	// http.TimeFormat has one-second resolution, so formatting drops the
	// sub-second part of the deadline. Aiming 4s ahead puts the encoded
	// deadline in (3s, 4s] from now, which leaves a full second of slack
	// against the 2s lower bound. Aiming 3s ahead could land just above 2s
	// and fail after a millisecond of scheduling delay.
	retryAt := time.Now().Add(4 * time.Second).UTC().Format(http.TimeFormat)
	classified := chaterror.Classify(testProviderError(
		"upstream failed",
		429,
		map[string]string{"Retry-After": retryAt},
	))

	require.Equal(t, 429, classified.StatusCode)
	require.GreaterOrEqual(t, classified.RetryAfter, 2*time.Second)
	require.LessOrEqual(t, classified.RetryAfter, 4*time.Second)
}
// TestClassify_IgnoresInvalidRetryAfter asserts that a Retry-After value
// that is neither a number nor an HTTP-date leaves RetryAfter at zero.
func TestClassify_IgnoresInvalidRetryAfter(t *testing.T) {
	t.Parallel()

	headers := map[string]string{"Retry-After": "definitely not a delay"}
	got := chaterror.Classify(testProviderError("upstream failed", 429, headers))

	require.Zero(t, got.RetryAfter)
}
// TestWithProviderPreservesRetryAfter asserts that enriching a classified
// error with a provider keeps the RetryAfter parsed from the response
// headers while updating the provider and message.
func TestWithProviderPreservesRetryAfter(t *testing.T) {
	t.Parallel()

	headers := map[string]string{"Retry-After": "30"}
	base := chaterror.Classify(testProviderError("upstream failed", 429, headers))

	got := base.WithProvider("openai")
	require.Equal(t, 30*time.Second, got.RetryAfter)

	want := chaterror.ClassifiedError{
		Message:    "OpenAI is rate limiting requests (HTTP 429).",
		Kind:       chaterror.KindRateLimit,
		Provider:   "openai",
		Retryable:  true,
		StatusCode: 429,
		RetryAfter: 30 * time.Second,
	}
	require.Equal(t, want, got)
}
// testProviderError builds a *fantasy.ProviderError carrying the given
// message, HTTP status code, and response headers, returned as an error so it
// can be passed straight to chaterror.Classify.
func testProviderError(message string, statusCode int, headers map[string]string) error {
	providerErr := new(fantasy.ProviderError)
	providerErr.Message = message
	providerErr.StatusCode = statusCode
	providerErr.ResponseHeaders = headers
	return providerErr
}