mirror of
https://github.com/coder/coder.git
synced 2026-06-07 23:18:20 +00:00
4d74603045
> **PR Stack**
> 1. **#23351** ← `#23282` *(you are here)*
> 2. #23282 ← `#23275`
> 3. #23275 ← `#23349`
> 4. #23349 ← `main`

---

## Summary

`chatretry.Retry()` used pure exponential backoff (1 s, 2 s, 4 s, …) and never consulted provider `Retry-After` headers. Fantasy's `ProviderError` carries `ResponseHeaders` including `Retry-After`, but `chaterror.Classify()` only parsed error text and silently dropped the structured transport metadata.

This makes `Retry-After` a first-class signal in the classification → retry pipeline.

<img width="853" height="346" alt="image" src="https://github.com/user-attachments/assets/65f012b6-8173-43d2-957e-ab9faddea525" />

## Changes

### `coderd/chatd/chaterror/classify.go`

- Added `RetryAfter time.Duration` field to `ClassifiedError` — a normalized minimum retry delay derived from provider response metadata.
- `Classify()` now calls `extractProviderErrorDetails()` before falling back to text heuristics. Structured `ProviderError.StatusCode` takes priority over regex extraction.
- `normalizeClassification()` preserves and clamps `RetryAfter`.

### `coderd/chatd/chaterror/provider_error.go` (new)

Provider-specific extraction, isolated from the text-based classification logic:

- `extractProviderErrorDetails()` unwraps `*fantasy.ProviderError` from the error chain via `errors.As`.
- `retryAfterFromHeaders()` parses headers in priority order:
  1. `retry-after-ms` (OpenAI-specific, millisecond precision)
  2. `retry-after` (standard HTTP — integer seconds or HTTP-date)
- Case-insensitive header key lookup.

### `coderd/chatd/chatretry/chatretry.go`

- `effectiveDelay(attempt, classified)` computes `max(Delay(attempt), classified.RetryAfter)` — the provider hint acts as a floor without weakening the local exponential backoff.
- `Retry()` now uses `effectiveDelay` and passes the effective delay to both `onRetry(...)` and the sleep timer, so downstream payloads, logs, and the frontend countdown stay aligned automatically.
### Tests

- `classify_test.go`: Structured provider status + `Retry-After` extraction, `retry-after-ms` priority, HTTP-date parsing, invalid header fallback, `WithProvider` preservation.
- `chatretry_test.go`: Retry-after-as-floor semantics — longer hint wins, shorter hint keeps base delay.

## Design notes

- **No SDK/API/frontend changes needed.** `codersdk.ChatStreamRetry` already carries `DelayMs` and `RetryingAt`, and the frontend already consumes them. The fix is purely in the server-side delay computation.
- **Existing retryability rules unchanged.** This fixes *when* we sleep, not *whether* an error is retryable.
- **Provider hint is a floor:** `max(baseDelay, RetryAfter)` ensures we never retry earlier than the provider asks, and never weaken our own backoff curve.
123 lines
3.4 KiB
Go
123 lines
3.4 KiB
Go
// Package chatretry provides retry logic for transient LLM provider
|
|
// errors. It classifies errors as retryable or permanent and uses
|
|
// exponential backoff with provider retry hints when available.
|
|
package chatretry
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"golang.org/x/xerrors"
|
|
|
|
"github.com/coder/coder/v2/coderd/x/chatd/chaterror"
|
|
)
|
|
|
|
const (
	// InitialDelay is the backoff duration used for the first retry
	// attempt; Delay doubles it for each subsequent attempt.
	InitialDelay = time.Second

	// MaxDelay caps the exponential backoff duration. Matches the
	// cap used in coder/mux.
	MaxDelay = time.Minute

	// MaxAttempts bounds the number of attempts before giving up.
	// With a 60s max backoff this allows roughly 25 minutes of
	// retries, which is reasonable for transient LLM provider
	// issues.
	MaxAttempts = 25
)
|
|
|
|
// ClassifiedError is an alias for chaterror.ClassifiedError, the
// classification value this package obtains from chaterror.Classify
// and hands to OnRetryFn callbacks.
type ClassifiedError = chaterror.ClassifiedError
|
|
|
|
// IsRetryable determines whether an error from an LLM provider is
|
|
// transient and worth retrying.
|
|
func IsRetryable(err error) bool {
|
|
return chaterror.Classify(err).Retryable
|
|
}
|
|
|
|
// Delay returns the backoff duration for the given 0-indexed attempt.
|
|
// Uses exponential backoff: min(InitialDelay * 2^attempt, MaxDelay).
|
|
// Matches the backoff curve used in coder/mux.
|
|
func Delay(attempt int) time.Duration {
|
|
d := InitialDelay
|
|
for range attempt {
|
|
d *= 2
|
|
if d >= MaxDelay {
|
|
return MaxDelay
|
|
}
|
|
}
|
|
return d
|
|
}
|
|
|
|
// effectiveDelay returns the delay for the given 0-indexed attempt
|
|
// while honoring any provider-supplied minimum retry delay.
|
|
func effectiveDelay(attempt int, classified ClassifiedError) time.Duration {
|
|
delay := Delay(attempt)
|
|
if classified.RetryAfter > delay {
|
|
return classified.RetryAfter
|
|
}
|
|
return delay
|
|
}
|
|
|
|
// RetryFn is the function to retry. It receives a context and returns
|
|
// an error. The context may be a child of the original with adjusted
|
|
// deadlines for individual attempts.
|
|
// Retry treats a nil return as success and stops calling the
// function; any non-nil error is classified to decide whether to
// retry.
type RetryFn func(ctx context.Context) error
|
|
|
|
// OnRetryFn is called before each retry attempt with the attempt
|
|
// number (1-indexed), the raw error that triggered the retry, the
|
|
// normalized error payload, and the delay before the next attempt.
|
|
// Retry skips a nil OnRetryFn and invokes a non-nil one
// synchronously, before it starts waiting out delay.
type OnRetryFn func(attempt int, err error, classified ClassifiedError, delay time.Duration)
|
|
|
|
// Retry calls fn repeatedly until it succeeds, returns a
|
|
// non-retryable error, ctx is canceled, or MaxAttempts is reached.
|
|
// Retries use exponential backoff capped at MaxDelay, unless the
|
|
// normalized error includes a longer provider Retry-After hint.
|
|
//
|
|
// The onRetry callback (if non-nil) is called before each retry
|
|
// attempt, giving the caller a chance to reset state, log, or
|
|
// publish status events.
|
|
func Retry(ctx context.Context, fn RetryFn, onRetry OnRetryFn) error {
|
|
var attempt int
|
|
for {
|
|
err := fn(ctx)
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
|
|
classified := chaterror.Classify(err)
|
|
if !classified.Retryable {
|
|
return chaterror.WithClassification(err, classified)
|
|
}
|
|
|
|
// If the caller's context is already done, return the
|
|
// context error so cancellation propagates cleanly.
|
|
if ctx.Err() != nil {
|
|
return ctx.Err()
|
|
}
|
|
|
|
attempt++
|
|
if attempt >= MaxAttempts {
|
|
return chaterror.WithClassification(
|
|
xerrors.Errorf("max retry attempts (%d) exceeded: %w", MaxAttempts, err),
|
|
classified,
|
|
)
|
|
}
|
|
|
|
delay := effectiveDelay(attempt-1, classified)
|
|
|
|
if onRetry != nil {
|
|
onRetry(attempt, err, classified, delay)
|
|
}
|
|
|
|
timer := time.NewTimer(delay)
|
|
select {
|
|
case <-ctx.Done():
|
|
timer.Stop()
|
|
return ctx.Err()
|
|
case <-timer.C:
|
|
}
|
|
}
|
|
}
|