Files
coder/coderd/x/chatd/chaterror/classify.go
T
Ethan 4d74603045 fix(coderd/x/chatd): respect provider Retry-After headers in chat retry loop (#23351)
> **PR Stack**
> 1. **#23351** ← `#23282` *(you are here)*
> 2. #23282 ← `#23275`
> 3. #23275 ← `#23349`
> 4. #23349 ← `main`

---

## Summary

`chatretry.Retry()` used pure exponential backoff (1 s, 2 s, 4 s, …) and
never consulted provider `Retry-After` headers. Fantasy's
`ProviderError` carries `ResponseHeaders` including `Retry-After`, but
`chaterror.Classify()` only parsed error text and silently dropped the
structured transport metadata.

This makes `Retry-After` a first-class signal in the classification →
retry pipeline.

<img width="853" height="346" alt="image"
src="https://github.com/user-attachments/assets/65f012b6-8173-43d2-957e-ab9faddea525"
/>


## Changes

### `coderd/chatd/chaterror/classify.go`

- Added `RetryAfter time.Duration` field to `ClassifiedError` — a
normalized minimum retry delay derived from provider response metadata.
- `Classify()` now calls `extractProviderErrorDetails()` before falling
back to text heuristics. Structured `ProviderError.StatusCode` takes
priority over regex extraction.
- `normalizeClassification()` preserves and clamps `RetryAfter`.

### `coderd/chatd/chaterror/provider_error.go` (new)

Provider-specific extraction, isolated from the text-based
classification logic:

- `extractProviderErrorDetails()` unwraps `*fantasy.ProviderError` from
the error chain via `errors.As`.
- `retryAfterFromHeaders()` parses headers in priority order:
  1. `retry-after-ms` (OpenAI-specific, millisecond precision)
  2. `retry-after` (standard HTTP — integer seconds or HTTP-date)
- Case-insensitive header key lookup.

### `coderd/chatd/chatretry/chatretry.go`

- `effectiveDelay(attempt, classified)` computes `max(Delay(attempt),
classified.RetryAfter)` — the provider hint acts as a floor without
weakening the local exponential backoff.
- `Retry()` now uses `effectiveDelay` and passes the effective delay to
both `onRetry(...)` and the sleep timer, so downstream payloads, logs,
and the frontend countdown stay aligned automatically.

### Tests

- `classify_test.go`: Structured provider status + `Retry-After`
extraction, `retry-after-ms` priority, HTTP-date parsing, invalid header
fallback, `WithProvider` preservation.
- `chatretry_test.go`: Retry-after-as-floor semantics — longer hint
wins, shorter hint keeps base delay.

## Design notes

- **No SDK/API/frontend changes needed.** `codersdk.ChatStreamRetry`
already carries `DelayMs` and `RetryingAt`, and the frontend already
consumes them. The fix is purely in the server-side delay computation.
- **Existing retryability rules unchanged.** This fixes *when* we sleep,
not *whether* an error is retryable.
- **Provider hint is a floor:** `max(baseDelay, RetryAfter)` ensures we
never retry earlier than the provider asks, and never weaken our own
backoff curve.
2026-03-27 01:20:46 +11:00

203 lines
5.5 KiB
Go

package chaterror
import (
"context"
"errors"
"strings"
"time"
)
// ClassifiedError is the normalized, user-facing description of a
// provider or runtime failure. Values are produced by Classify and are
// passed through normalizeClassification before being returned.
type ClassifiedError struct {
	// Message is the user-facing text; normalization fills it in from
	// terminalMessage when it is left empty.
	// Kind is the error category (one of the Kind* constants);
	// normalization defaults it to KindGeneric.
	// Provider is the provider name after normalizeProvider.
	Message, Kind, Provider string

	// Retryable is set by Classify for the error kinds it treats as
	// worth retrying.
	Retryable bool

	// StatusCode is the HTTP-style status taken from structured provider
	// details when present, otherwise extracted from the error text.
	// Zero means no status was found.
	StatusCode int

	// RetryAfter is a normalized minimum retry delay derived from
	// provider response metadata when available. Normalization clamps
	// negative values to zero.
	RetryAfter time.Duration
}
// WithProvider returns a normalized copy of the classification that
// carries an explicit provider hint. An explicit hint is trusted over a
// provider name that was heuristically parsed from the error text.
//
// The receiver is returned unchanged (apart from normalization) when the
// hint normalizes to the empty string, or when it already matches
// c.Provider and a non-blank message is present. Otherwise the provider
// is replaced and the message is cleared so that normalization
// regenerates it for the new provider.
func (c ClassifiedError) WithProvider(provider string) ClassifiedError {
	hint := normalizeProvider(provider)

	alreadyApplied := c.Provider == hint && strings.TrimSpace(c.Message) != ""
	if hint != "" && !alreadyApplied {
		// c is a value receiver, so this only mutates the local copy.
		c.Provider = hint
		c.Message = ""
	}
	return normalizeClassification(c)
}
// WithClassification attaches classified to err so that later calls to
// Classify return the stored classification instead of re-deriving one
// from err.Error(). The classification is normalized before it is
// stored. A nil err yields a nil error.
func WithClassification(err error, classified ClassifiedError) error {
	if err != nil {
		pinned := &classifiedError{
			cause:      err,
			classified: normalizeClassification(classified),
		}
		return pinned
	}
	// Return a literal nil so callers never see a non-nil interface
	// holding a nil pointer.
	return nil
}
// classifiedError pairs an underlying error with a classification that
// was decided up front. WithClassification constructs it, and Classify
// looks for it in the error chain via errors.As.
type classifiedError struct {
// cause is the original error; Error and Unwrap both delegate to it.
cause error
// classified is the stored classification, normalized at wrap time.
classified ClassifiedError
}
// Error returns the wrapped cause's message unchanged, so attaching a
// classification does not alter the error text.
func (e *classifiedError) Error() string {
return e.cause.Error()
}
// Unwrap exposes the wrapped cause so errors.Is and errors.As can reach
// the underlying error through this wrapper.
func (e *classifiedError) Unwrap() error {
return e.cause
}
// Classify turns err into the stable, user-facing ClassifiedError used
// for retry handling, streamed terminal errors, and persisted
// last_error values.
//
// Resolution order:
//  1. A nil err yields the zero ClassifiedError.
//  2. An error wrapped by WithClassification yields its stored
//     classification.
//  3. Otherwise structured provider details are extracted first and the
//     lowercased error text is used for the remaining heuristics. A
//     structured status code takes priority over one parsed from text.
//  4. Cancellation and interruption short-circuit to a non-retryable
//     generic error with a fixed message.
//  5. The first matching category below decides Kind and Retryable;
//     with no match the result is a non-retryable generic error.
//
// Every non-zero result is passed through normalizeClassification.
func Classify(err error) ClassifiedError {
	if err == nil {
		return ClassifiedError{}
	}

	var pinned *classifiedError
	if errors.As(err, &pinned) {
		return normalizeClassification(pinned.classified)
	}

	structured := extractProviderErrorDetails(err)
	message := strings.TrimSpace(err.Error())
	if message == "" && structured.statusCode == 0 && structured.retryAfter <= 0 {
		// Nothing to classify: no text and no structured metadata.
		return ClassifiedError{}
	}
	lower := strings.ToLower(message)

	// Prefer the structured status; fall back to parsing the text.
	statusCode := structured.statusCode
	if statusCode == 0 {
		statusCode = extractStatusCode(lower)
	}

	// Fields shared by every outcome. Kind starts as the generic,
	// non-retryable fallback and is refined below.
	result := ClassifiedError{
		Kind:       KindGeneric,
		Provider:   detectProvider(lower),
		StatusCode: statusCode,
		RetryAfter: structured.retryAfter,
	}

	canceled := errors.Is(err, context.Canceled) || strings.Contains(lower, "context canceled")
	interrupted := containsAny(lower, interruptedPatterns...)
	if canceled || interrupted {
		result.Message = "The request was canceled before it completed."
		return normalizeClassification(result)
	}

	deadline := errors.Is(err, context.DeadlineExceeded) || strings.Contains(lower, "context deadline exceeded")

	timeoutStatus := false
	switch statusCode {
	case 408, 502, 503, 504:
		timeoutStatus = true
	}

	overloaded := statusCode == 529 || containsAny(lower, overloadedPatterns...)
	strongAuth := statusCode == 401 || containsAny(lower, authStrongPatterns...)
	config := containsAny(lower, configPatterns...)
	weakAuth := statusCode == 403 || containsAny(lower, authWeakPatterns...)
	rateLimited := statusCode == 429 || containsAny(lower, rateLimitPatterns...)
	timedOut := deadline || timeoutStatus || containsAny(lower, timeoutPatterns...)
	genericRetryable := statusCode == 500 || containsAny(lower, genericRetryablePatterns...)

	// Case order is the precedence order.
	//
	// Config signals should beat ambiguous wrapper signals so
	// transient-looking errors like "503 invalid model" fail fast.
	// Overloaded stays ahead because 529/overloaded is a dedicated
	// provider saturation signal, not a common transport wrapper.
	// Strong auth still stays above config because bad credentials are
	// the root cause when both signals appear.
	switch {
	case overloaded:
		result.Kind, result.Retryable = KindOverloaded, true
	case strongAuth, weakAuth && !config:
		result.Kind = KindAuth
	case rateLimited && !config:
		result.Kind, result.Retryable = KindRateLimit, true
	case timedOut && !config:
		// A timeout is retried unless it came from a context deadline.
		result.Kind, result.Retryable = KindTimeout, !deadline
	case config:
		result.Kind = KindConfig
	case genericRetryable:
		result.Retryable = true
	}
	return normalizeClassification(result)
}
// normalizeClassification returns classified in canonical form:
// Message and Kind are whitespace-trimmed, Provider is passed through
// normalizeProvider, and a negative RetryAfter is clamped to zero.
//
// When no kind, message, status code, or retry delay remains, the zero
// ClassifiedError is returned (Provider and Retryable are discarded in
// that case). Otherwise an empty Kind defaults to KindGeneric and an
// empty Message is filled in from terminalMessage.
func normalizeClassification(classified ClassifiedError) ClassifiedError {
	classified.Message = strings.TrimSpace(classified.Message)
	classified.Kind = strings.TrimSpace(classified.Kind)
	classified.Provider = normalizeProvider(classified.Provider)
	if classified.RetryAfter < 0 {
		classified.RetryAfter = 0
	}

	if classified.Kind == "" {
		// RetryAfter is already non-negative here, so == 0 means "unset".
		blank := classified.Message == "" &&
			classified.StatusCode == 0 &&
			classified.RetryAfter == 0
		if blank {
			return ClassifiedError{}
		}
		classified.Kind = KindGeneric
	}

	if classified.Message == "" {
		classified.Message = terminalMessage(classified)
	}
	return classified
}