mirror of
https://github.com/coder/coder.git
synced 2026-06-04 21:48:22 +00:00
4d74603045
> **PR Stack**
> 1. **#23351** ← `#23282` *(you are here)*
> 2. #23282 ← `#23275`
> 3. #23275 ← `#23349`
> 4. #23349 ← `main`

---

## Summary

`chatretry.Retry()` used pure exponential backoff (1 s, 2 s, 4 s, …) and never consulted provider `Retry-After` headers. Fantasy's `ProviderError` carries `ResponseHeaders` including `Retry-After`, but `chaterror.Classify()` only parsed error text and silently dropped the structured transport metadata.

This makes `Retry-After` a first-class signal in the classification → retry pipeline.

<img width="853" height="346" alt="image" src="https://github.com/user-attachments/assets/65f012b6-8173-43d2-957e-ab9faddea525" />

## Changes

### `coderd/chatd/chaterror/classify.go`

- Added `RetryAfter time.Duration` field to `ClassifiedError` — a normalized minimum retry delay derived from provider response metadata.
- `Classify()` now calls `extractProviderErrorDetails()` before falling back to text heuristics. Structured `ProviderError.StatusCode` takes priority over regex extraction.
- `normalizeClassification()` preserves and clamps `RetryAfter`.

### `coderd/chatd/chaterror/provider_error.go` (new)

Provider-specific extraction, isolated from the text-based classification logic:

- `extractProviderErrorDetails()` unwraps `*fantasy.ProviderError` from the error chain via `errors.As`.
- `retryAfterFromHeaders()` parses headers in priority order:
  1. `retry-after-ms` (OpenAI-specific, millisecond precision)
  2. `retry-after` (standard HTTP — integer seconds or HTTP-date)
- Case-insensitive header key lookup.

### `coderd/chatd/chatretry/chatretry.go`

- `effectiveDelay(attempt, classified)` computes `max(Delay(attempt), classified.RetryAfter)` — the provider hint acts as a floor without weakening the local exponential backoff.
- `Retry()` now uses `effectiveDelay` and passes the effective delay to both `onRetry(...)` and the sleep timer, so downstream payloads, logs, and the frontend countdown stay aligned automatically.

### Tests

- `classify_test.go`: Structured provider status + `Retry-After` extraction, `retry-after-ms` priority, HTTP-date parsing, invalid header fallback, `WithProvider` preservation.
- `chatretry_test.go`: Retry-after-as-floor semantics — longer hint wins, shorter hint keeps base delay.

## Design notes

- **No SDK/API/frontend changes needed.** `codersdk.ChatStreamRetry` already carries `DelayMs` and `RetryingAt`, and the frontend already consumes them. The fix is purely in the server-side delay computation.
- **Existing retryability rules unchanged.** This fixes *when* we sleep, not *whether* an error is retryable.
- **Provider hint is a floor:** `max(baseDelay, RetryAfter)` ensures we never retry earlier than the provider asks, and never weaken our own backoff curve.
203 lines
5.5 KiB
Go
203 lines
5.5 KiB
Go
package chaterror
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// ClassifiedError is the normalized, user-facing description of an
// underlying provider or runtime error. It is the value produced by
// Classify and carried by errors wrapped with WithClassification.
type ClassifiedError struct {
	// Message is the user-facing text. When left empty, normalization
	// fills it in from the rest of the classification.
	Message string
	// Kind is the error category (for example KindGeneric or
	// KindRateLimit).
	Kind string
	// Provider is the normalized name of the provider the error is
	// attributed to, if any.
	Provider string
	// Retryable reports whether the failed operation may be retried.
	Retryable bool
	// StatusCode is the HTTP-style status code associated with the
	// error, or 0 when none is known.
	StatusCode int

	// RetryAfter is a normalized minimum retry delay derived from
	// provider response metadata when available. It is never negative
	// after normalization.
	RetryAfter time.Duration
}
|
|
|
|
// WithProvider returns a copy of the classification using an explicit
|
|
// provider hint. Explicit provider hints are trusted over provider names
|
|
// heuristically parsed from the error text.
|
|
func (c ClassifiedError) WithProvider(provider string) ClassifiedError {
|
|
hint := normalizeProvider(provider)
|
|
if hint == "" {
|
|
return normalizeClassification(c)
|
|
}
|
|
if c.Provider == hint && strings.TrimSpace(c.Message) != "" {
|
|
return normalizeClassification(c)
|
|
}
|
|
updated := c
|
|
updated.Provider = hint
|
|
updated.Message = ""
|
|
return normalizeClassification(updated)
|
|
}
|
|
|
|
// WithClassification wraps err so future calls to Classify return
|
|
// classified instead of re-deriving it from err.Error().
|
|
func WithClassification(err error, classified ClassifiedError) error {
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
return &classifiedError{
|
|
cause: err,
|
|
classified: normalizeClassification(classified),
|
|
}
|
|
}
|
|
|
|
type classifiedError struct {
|
|
cause error
|
|
classified ClassifiedError
|
|
}
|
|
|
|
func (e *classifiedError) Error() string {
|
|
return e.cause.Error()
|
|
}
|
|
|
|
func (e *classifiedError) Unwrap() error {
|
|
return e.cause
|
|
}
|
|
|
|
// Classify normalizes err into a stable, user-facing payload used for
|
|
// retry handling, streamed terminal errors, and persisted last_error
|
|
// values.
|
|
func Classify(err error) ClassifiedError {
|
|
if err == nil {
|
|
return ClassifiedError{}
|
|
}
|
|
|
|
var wrapped *classifiedError
|
|
if errors.As(err, &wrapped) {
|
|
return normalizeClassification(wrapped.classified)
|
|
}
|
|
|
|
structured := extractProviderErrorDetails(err)
|
|
message := strings.TrimSpace(err.Error())
|
|
if message == "" && structured.statusCode == 0 && structured.retryAfter <= 0 {
|
|
return ClassifiedError{}
|
|
}
|
|
|
|
lower := strings.ToLower(message)
|
|
statusCode := structured.statusCode
|
|
if statusCode == 0 {
|
|
statusCode = extractStatusCode(lower)
|
|
}
|
|
provider := detectProvider(lower)
|
|
canceled := errors.Is(err, context.Canceled) || strings.Contains(lower, "context canceled")
|
|
interrupted := containsAny(lower, interruptedPatterns...)
|
|
if canceled || interrupted {
|
|
return normalizeClassification(ClassifiedError{
|
|
Message: "The request was canceled before it completed.",
|
|
Kind: KindGeneric,
|
|
Provider: provider,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
})
|
|
}
|
|
|
|
deadline := errors.Is(err, context.DeadlineExceeded) || strings.Contains(lower, "context deadline exceeded")
|
|
overloadedMatch := statusCode == 529 || containsAny(lower, overloadedPatterns...)
|
|
authStrong := statusCode == 401 || containsAny(lower, authStrongPatterns...)
|
|
configMatch := containsAny(lower, configPatterns...)
|
|
authWeak := statusCode == 403 || containsAny(lower, authWeakPatterns...)
|
|
rateLimitMatch := statusCode == 429 || containsAny(lower, rateLimitPatterns...)
|
|
timeoutMatch := deadline || statusCode == 408 || statusCode == 502 ||
|
|
statusCode == 503 || statusCode == 504 ||
|
|
containsAny(lower, timeoutPatterns...)
|
|
genericRetryableMatch := statusCode == 500 || containsAny(lower, genericRetryablePatterns...)
|
|
|
|
// Config signals should beat ambiguous wrapper signals so
|
|
// transient-looking errors like "503 invalid model" fail fast.
|
|
// Overloaded stays ahead because 529/overloaded is a dedicated
|
|
// provider saturation signal, not a common transport wrapper.
|
|
// Strong auth still stays above config because bad credentials are
|
|
// the root cause when both signals appear.
|
|
rules := []struct {
|
|
match bool
|
|
kind string
|
|
retryable bool
|
|
}{
|
|
{
|
|
match: overloadedMatch,
|
|
kind: KindOverloaded,
|
|
retryable: true,
|
|
},
|
|
{
|
|
match: authStrong,
|
|
kind: KindAuth,
|
|
retryable: false,
|
|
},
|
|
{
|
|
match: authWeak && !configMatch,
|
|
kind: KindAuth,
|
|
retryable: false,
|
|
},
|
|
{
|
|
match: rateLimitMatch && !configMatch,
|
|
kind: KindRateLimit,
|
|
retryable: true,
|
|
},
|
|
{
|
|
match: timeoutMatch && !configMatch,
|
|
kind: KindTimeout,
|
|
retryable: !deadline,
|
|
},
|
|
{
|
|
match: configMatch,
|
|
kind: KindConfig,
|
|
retryable: false,
|
|
},
|
|
{
|
|
match: genericRetryableMatch,
|
|
kind: KindGeneric,
|
|
retryable: true,
|
|
},
|
|
}
|
|
for _, rule := range rules {
|
|
if !rule.match {
|
|
continue
|
|
}
|
|
return normalizeClassification(ClassifiedError{
|
|
Kind: rule.kind,
|
|
Provider: provider,
|
|
Retryable: rule.retryable,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
})
|
|
}
|
|
|
|
return normalizeClassification(ClassifiedError{
|
|
Kind: KindGeneric,
|
|
Provider: provider,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
})
|
|
}
|
|
|
|
func normalizeClassification(classified ClassifiedError) ClassifiedError {
|
|
classified.Message = strings.TrimSpace(classified.Message)
|
|
classified.Kind = strings.TrimSpace(classified.Kind)
|
|
classified.Provider = normalizeProvider(classified.Provider)
|
|
if classified.RetryAfter < 0 {
|
|
classified.RetryAfter = 0
|
|
}
|
|
if classified.Kind == "" && classified.Message == "" {
|
|
if classified.StatusCode == 0 && classified.RetryAfter <= 0 {
|
|
return ClassifiedError{}
|
|
}
|
|
classified.Kind = KindGeneric
|
|
}
|
|
if classified.Kind == "" {
|
|
classified.Kind = KindGeneric
|
|
}
|
|
if classified.Message == "" {
|
|
classified.Message = terminalMessage(classified)
|
|
}
|
|
return classified
|
|
}
|