mirror of
https://github.com/coder/coder.git
synced 2026-06-04 21:48:22 +00:00
4d74603045
> **PR Stack** > 1. **#23351** ← `#23282` *(you are here)* > 2. #23282 ← `#23275` > 3. #23275 ← `#23349` > 4. #23349 ← `main` --- ## Summary `chatretry.Retry()` used pure exponential backoff (1 s, 2 s, 4 s, …) and never consulted provider `Retry-After` headers. Fantasy's `ProviderError` carries `ResponseHeaders` including `Retry-After`, but `chaterror.Classify()` only parsed error text and silently dropped the structured transport metadata. This makes `Retry-After` a first-class signal in the classification → retry pipeline. <img width="853" height="346" alt="image" src="https://github.com/user-attachments/assets/65f012b6-8173-43d2-957e-ab9faddea525" /> ## Changes ### `coderd/chatd/chaterror/classify.go` - Added `RetryAfter time.Duration` field to `ClassifiedError` — a normalized minimum retry delay derived from provider response metadata. - `Classify()` now calls `extractProviderErrorDetails()` before falling back to text heuristics. Structured `ProviderError.StatusCode` takes priority over regex extraction. - `normalizeClassification()` preserves and clamps `RetryAfter`. ### `coderd/chatd/chaterror/provider_error.go` (new) Provider-specific extraction, isolated from the text-based classification logic: - `extractProviderErrorDetails()` unwraps `*fantasy.ProviderError` from the error chain via `errors.As`. - `retryAfterFromHeaders()` parses headers in priority order: 1. `retry-after-ms` (OpenAI-specific, millisecond precision) 2. `retry-after` (standard HTTP — integer seconds or HTTP-date) - Case-insensitive header key lookup. ### `coderd/chatd/chatretry/chatretry.go` - `effectiveDelay(attempt, classified)` computes `max(Delay(attempt), classified.RetryAfter)` — the provider hint acts as a floor without weakening the local exponential backoff. - `Retry()` now uses `effectiveDelay` and passes the effective delay to both `onRetry(...)` and the sleep timer, so downstream payloads, logs, and the frontend countdown stay aligned automatically. ### Tests - `classify_test.go`: Structured provider status + `Retry-After` extraction, `retry-after-ms` priority, HTTP-date parsing, invalid header fallback, `WithProvider` preservation. - `chatretry_test.go`: Retry-after-as-floor semantics — longer hint wins, shorter hint keeps base delay. ## Design notes - **No SDK/API/frontend changes needed.** `codersdk.ChatStreamRetry` already carries `DelayMs` and `RetryingAt`, and the frontend already consumes them. The fix is purely in the server-side delay computation. - **Existing retryability rules unchanged.** This fixes *when* we sleep, not *whether* an error is retryable. - **Provider hint is a floor:** `max(baseDelay, RetryAfter)` ensures we never retry earlier than the provider asks, and never weaken our own backoff curve.
435 lines
15 KiB
Go
435 lines
15 KiB
Go
package chaterror_test
|
|
|
|
import (
|
|
"context"
|
|
"net/http"
|
|
"testing"
|
|
"time"
|
|
|
|
"charm.land/fantasy"
|
|
"github.com/stretchr/testify/require"
|
|
"golang.org/x/xerrors"
|
|
|
|
"github.com/coder/coder/v2/coderd/x/chatd/chaterror"
|
|
)
|
|
|
|
func TestClassify(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
err error
|
|
want chaterror.ClassifiedError
|
|
}{
|
|
{
|
|
name: "AmbiguousOverloadKeepsProviderUnknown",
|
|
err: xerrors.New("status 529 from upstream"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider is temporarily overloaded (HTTP 529).",
|
|
Kind: chaterror.KindOverloaded,
|
|
Provider: "",
|
|
Retryable: true,
|
|
StatusCode: 529,
|
|
},
|
|
},
|
|
{
|
|
name: "ExplicitAnthropicOverload",
|
|
err: xerrors.New("anthropic overloaded_error"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "Anthropic is temporarily overloaded.",
|
|
Kind: chaterror.KindOverloaded,
|
|
Provider: "anthropic",
|
|
Retryable: true,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
{
|
|
name: "AuthBeatsConfig",
|
|
err: xerrors.New("authentication failed: invalid model"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "Authentication with the AI provider failed. Check the API key, permissions, and billing settings.",
|
|
Kind: chaterror.KindAuth,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
{
|
|
name: "PureConfig",
|
|
err: xerrors.New("invalid model"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider rejected the model configuration. Check the selected model and provider settings.",
|
|
Kind: chaterror.KindConfig,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
{
|
|
name: "BareForbiddenClassifiesAsAuth",
|
|
err: xerrors.New("forbidden"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "Authentication with the AI provider failed. Check the API key, permissions, and billing settings.",
|
|
Kind: chaterror.KindAuth,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
{
|
|
name: "ExplicitStatus401ClassifiesAsAuth",
|
|
err: xerrors.New("status 401 from upstream"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "Authentication with the AI provider failed. Check the API key, permissions, and billing settings.",
|
|
Kind: chaterror.KindAuth,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 401,
|
|
},
|
|
},
|
|
{
|
|
name: "ExplicitStatus403ClassifiesAsAuth",
|
|
err: xerrors.New("status 403 from upstream"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "Authentication with the AI provider failed. Check the API key, permissions, and billing settings.",
|
|
Kind: chaterror.KindAuth,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 403,
|
|
},
|
|
},
|
|
{
|
|
name: "ForbiddenContextLengthClassifiesAsConfig",
|
|
err: xerrors.New("forbidden: context length exceeded"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider rejected the model configuration. Check the selected model and provider settings.",
|
|
Kind: chaterror.KindConfig,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
{
|
|
name: "ExplicitStatus429ClassifiesAsRateLimit",
|
|
err: xerrors.New("status 429 from upstream"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider is rate limiting requests (HTTP 429).",
|
|
Kind: chaterror.KindRateLimit,
|
|
Provider: "",
|
|
Retryable: true,
|
|
StatusCode: 429,
|
|
},
|
|
},
|
|
{
|
|
name: "RateLimitDoesNotBeatConfig",
|
|
err: xerrors.New("status 429: invalid model"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider rejected the model configuration. Check the selected model and provider settings.",
|
|
Kind: chaterror.KindConfig,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 429,
|
|
},
|
|
},
|
|
{
|
|
name: "ServiceUnavailableClassifiesAsRetryableTimeout",
|
|
err: xerrors.New("service unavailable"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider is temporarily unavailable.",
|
|
Kind: chaterror.KindTimeout,
|
|
Provider: "",
|
|
Retryable: true,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
{
|
|
name: "TimeoutDoesNotBeatConfigViaStatusCode",
|
|
err: xerrors.New("status 503: invalid model"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider rejected the model configuration. Check the selected model and provider settings.",
|
|
Kind: chaterror.KindConfig,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 503,
|
|
},
|
|
},
|
|
{
|
|
name: "TimeoutDoesNotBeatConfigViaMessage",
|
|
err: xerrors.New("service unavailable: model not found"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider rejected the model configuration. Check the selected model and provider settings.",
|
|
Kind: chaterror.KindConfig,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
{
|
|
name: "ConnectionRefusedUnsupportedModelClassifiesAsConfig",
|
|
err: xerrors.New("connection refused: unsupported model"),
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The AI provider rejected the model configuration. Check the selected model and provider settings.",
|
|
Kind: chaterror.KindConfig,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
{
|
|
name: "DeadlineExceededStaysNonRetryableTimeout",
|
|
err: context.DeadlineExceeded,
|
|
want: chaterror.ClassifiedError{
|
|
Message: "The request timed out before it completed.",
|
|
Kind: chaterror.KindTimeout,
|
|
Provider: "",
|
|
Retryable: false,
|
|
StatusCode: 0,
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
t.Parallel()
|
|
require.Equal(t, tt.want, chaterror.Classify(tt.err))
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestClassify_PatternCoverage(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
err string
|
|
wantKind string
|
|
wantRetry bool
|
|
}{
|
|
{name: "OverloadedLiteral", err: "overloaded", wantKind: chaterror.KindOverloaded, wantRetry: true},
|
|
{name: "RateLimitLiteral", err: "rate limit", wantKind: chaterror.KindRateLimit, wantRetry: true},
|
|
{name: "RateLimitUnderscoreLiteral", err: "rate_limit", wantKind: chaterror.KindRateLimit, wantRetry: true},
|
|
{name: "RateLimitedLiteral", err: "rate limited", wantKind: chaterror.KindRateLimit, wantRetry: true},
|
|
{name: "RateLimitedHyphenLiteral", err: "rate-limited", wantKind: chaterror.KindRateLimit, wantRetry: true},
|
|
{name: "TooManyRequestsLiteral", err: "too many requests", wantKind: chaterror.KindRateLimit, wantRetry: true},
|
|
{name: "TimeoutLiteral", err: "timeout", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "TimedOutLiteral", err: "timed out", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "ServiceUnavailableLiteral", err: "service unavailable", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "UnavailableLiteral", err: "unavailable", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "ConnectionResetLiteral", err: "connection reset", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "ConnectionRefusedLiteral", err: "connection refused", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "EOFLiteral", err: "eof", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "BrokenPipeLiteral", err: "broken pipe", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "BadGatewayLiteral", err: "bad gateway", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "GatewayTimeoutLiteral", err: "gateway timeout", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "AuthenticationLiteral", err: "authentication", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "UnauthorizedLiteral", err: "unauthorized", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "InvalidAPIKeyLiteral", err: "invalid api key", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "InvalidAPIKeyUnderscoreLiteral", err: "invalid_api_key", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "QuotaLiteral", err: "quota", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "BillingLiteral", err: "billing", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "InsufficientQuotaLiteral", err: "insufficient_quota", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "PaymentRequiredLiteral", err: "payment required", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "ForbiddenLiteral", err: "forbidden", wantKind: chaterror.KindAuth, wantRetry: false},
|
|
{name: "InvalidModelLiteral", err: "invalid model", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "ModelNotFoundLiteral", err: "model not found", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "ModelNotFoundUnderscoreLiteral", err: "model_not_found", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "UnsupportedModelLiteral", err: "unsupported model", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "ContextLengthExceededLiteral", err: "context length exceeded", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "ContextExceededLiteral", err: "context_exceeded", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "MaximumContextLengthLiteral", err: "maximum context length", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "MalformedConfigLiteral", err: "malformed config", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "MalformedConfigurationLiteral", err: "malformed configuration", wantKind: chaterror.KindConfig, wantRetry: false},
|
|
{name: "ServerErrorLiteral", err: "server error", wantKind: chaterror.KindGeneric, wantRetry: true},
|
|
{name: "InternalServerErrorLiteral", err: "internal server error", wantKind: chaterror.KindGeneric, wantRetry: true},
|
|
{name: "ChatInterruptedLiteral", err: "chat interrupted", wantKind: chaterror.KindGeneric, wantRetry: false},
|
|
{name: "RequestInterruptedLiteral", err: "request interrupted", wantKind: chaterror.KindGeneric, wantRetry: false},
|
|
{name: "OperationInterruptedLiteral", err: "operation interrupted", wantKind: chaterror.KindGeneric, wantRetry: false},
|
|
{name: "Status408", err: "status 408", wantKind: chaterror.KindTimeout, wantRetry: true},
|
|
{name: "Status500", err: "status 500", wantKind: chaterror.KindGeneric, wantRetry: true},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
classified := chaterror.Classify(xerrors.New(tt.err))
|
|
require.Equal(t, tt.wantKind, classified.Kind)
|
|
require.Equal(t, tt.wantRetry, classified.Retryable)
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestClassify_TransportFailuresUseBroaderRetryMessage(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
err string
|
|
}{
|
|
{name: "TimeoutLiteral", err: "timeout"},
|
|
{name: "EOFLiteral", err: "eof"},
|
|
{name: "BrokenPipeLiteral", err: "broken pipe"},
|
|
{name: "ConnectionResetLiteral", err: "connection reset"},
|
|
{name: "ConnectionRefusedLiteral", err: "connection refused"},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
classified := chaterror.Classify(xerrors.New(tt.err))
|
|
require.Equal(t, chaterror.KindTimeout, classified.Kind)
|
|
require.True(t, classified.Retryable)
|
|
require.Equal(
|
|
t,
|
|
"The AI provider is temporarily unavailable.",
|
|
classified.Message,
|
|
)
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestClassify_StartupTimeoutWrappedClassificationWins(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
wrapped := chaterror.WithClassification(
|
|
xerrors.New("context canceled"),
|
|
chaterror.ClassifiedError{
|
|
Kind: chaterror.KindStartupTimeout,
|
|
Provider: "openai",
|
|
Retryable: true,
|
|
},
|
|
)
|
|
|
|
require.Equal(t, chaterror.ClassifiedError{
|
|
Message: "OpenAI did not start responding in time.",
|
|
Kind: chaterror.KindStartupTimeout,
|
|
Provider: "openai",
|
|
Retryable: true,
|
|
StatusCode: 0,
|
|
}, chaterror.Classify(wrapped))
|
|
}
|
|
|
|
func TestWithProviderUsesExplicitHint(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
classified := chaterror.Classify(xerrors.New("openai received status 429 from upstream"))
|
|
require.Equal(t, "openai", classified.Provider)
|
|
|
|
enriched := classified.WithProvider("azure openai")
|
|
require.Equal(t, chaterror.ClassifiedError{
|
|
Message: "Azure OpenAI is rate limiting requests (HTTP 429).",
|
|
Kind: chaterror.KindRateLimit,
|
|
Provider: "azure",
|
|
Retryable: true,
|
|
StatusCode: 429,
|
|
}, enriched)
|
|
}
|
|
|
|
func TestWithProviderAddsProviderWhenUnknown(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
classified := chaterror.Classify(xerrors.New("received status 429 from upstream"))
|
|
require.Empty(t, classified.Provider)
|
|
|
|
enriched := classified.WithProvider("openai")
|
|
require.Equal(t, chaterror.ClassifiedError{
|
|
Message: "OpenAI is rate limiting requests (HTTP 429).",
|
|
Kind: chaterror.KindRateLimit,
|
|
Provider: "openai",
|
|
Retryable: true,
|
|
StatusCode: 429,
|
|
}, enriched)
|
|
}
|
|
|
|
func TestClassify_UsesStructuredProviderStatusAndRetryAfter(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
classified := chaterror.Classify(testProviderError(
|
|
"",
|
|
429,
|
|
map[string]string{"Retry-After": "30"},
|
|
))
|
|
|
|
require.Equal(t, chaterror.ClassifiedError{
|
|
Message: "The AI provider is rate limiting requests (HTTP 429).",
|
|
Kind: chaterror.KindRateLimit,
|
|
Provider: "",
|
|
Retryable: true,
|
|
StatusCode: 429,
|
|
RetryAfter: 30 * time.Second,
|
|
}, classified)
|
|
}
|
|
|
|
func TestClassify_PrefersRetryAfterMsOverRetryAfter(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
classified := chaterror.Classify(testProviderError(
|
|
"upstream failed",
|
|
429,
|
|
map[string]string{
|
|
"Retry-After": "30",
|
|
"ReTrY-AfTeR-Ms": "1500",
|
|
},
|
|
))
|
|
|
|
require.Equal(t, 429, classified.StatusCode)
|
|
require.Equal(t, 1500*time.Millisecond, classified.RetryAfter)
|
|
}
|
|
|
|
func TestClassify_ParsesRetryAfterHTTPDate(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
retryAt := time.Now().Add(3 * time.Second).UTC().Format(http.TimeFormat)
|
|
classified := chaterror.Classify(testProviderError(
|
|
"upstream failed",
|
|
429,
|
|
map[string]string{"Retry-After": retryAt},
|
|
))
|
|
|
|
require.Equal(t, 429, classified.StatusCode)
|
|
require.GreaterOrEqual(t, classified.RetryAfter, 2*time.Second)
|
|
require.LessOrEqual(t, classified.RetryAfter, 4*time.Second)
|
|
}
|
|
|
|
func TestClassify_IgnoresInvalidRetryAfter(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
classified := chaterror.Classify(testProviderError(
|
|
"upstream failed",
|
|
429,
|
|
map[string]string{"Retry-After": "definitely not a delay"},
|
|
))
|
|
|
|
require.Zero(t, classified.RetryAfter)
|
|
}
|
|
|
|
func TestWithProviderPreservesRetryAfter(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
classified := chaterror.Classify(testProviderError(
|
|
"upstream failed",
|
|
429,
|
|
map[string]string{"Retry-After": "30"},
|
|
))
|
|
|
|
enriched := classified.WithProvider("openai")
|
|
require.Equal(t, 30*time.Second, enriched.RetryAfter)
|
|
require.Equal(t, chaterror.ClassifiedError{
|
|
Message: "OpenAI is rate limiting requests (HTTP 429).",
|
|
Kind: chaterror.KindRateLimit,
|
|
Provider: "openai",
|
|
Retryable: true,
|
|
StatusCode: 429,
|
|
RetryAfter: 30 * time.Second,
|
|
}, enriched)
|
|
}
|
|
|
|
func testProviderError(message string, statusCode int, headers map[string]string) error {
|
|
return &fantasy.ProviderError{
|
|
Message: message,
|
|
StatusCode: statusCode,
|
|
ResponseHeaders: headers,
|
|
}
|
|
}
|