mirror of
https://github.com/coder/coder.git
synced 2026-06-07 15:08:20 +00:00
becc858fa8
Closes CODAGT-541. ## Problem An Agents chat stream could die with a terminal `context cancelled` error and surface to the user as a permanent chat failure, even when no context in our process had actually been canceled. The cancellation was a provider-returned error value (HTTP/2 RST_STREAM mid-body surfacing as `context.Canceled` from Go's net/http2), not a real caller cancel. The chain that produced the bug: - fantasy passed the provider's `context.Canceled` through unchanged. - `chaterror.Classify` short-circuited any `errors.Is(err, context.Canceled)` (or `"context canceled"` text) as terminal generic, before checking HTTP status codes or other retry signals. - `chatretry.Retry` did not retry. - The frontend rendered `type:"error"` and the chat was dead. The same short-circuit also masked retryable 5xx responses whose underlying transport error happened to wrap `context.Canceled`. ## Approach `context.Canceled` has no inherent intent. The same error value can mean a user pressing Stop, a server shutdown, the silence guard firing, or a provider-side stream reset. The only layer that can disambiguate is the one holding both the returned error and the caller context. That is `chatretry`. This PR centralizes the policy there and keeps `chaterror` context-free. ## Changes `coderd/x/chatd/chaterror/classify.go` - Add `ErrProviderTransportReset` sentinel to explicitly mark provider-side stream cancellations. - Remove the broad `context.Canceled` / `"context canceled"` short-circuit so status codes and other retry signals can win. - Classify `ErrProviderTransportReset` (with no status code) as a retryable timeout. - Keep a fallback that classifies bare `context.Canceled` as terminal-generic when no other signal is present, so legitimate caller cancels still terminate cleanly. `coderd/x/chatd/chatretry/chatretry.go` - Add `contextError(ctx)` that returns `context.Cause(ctx)` when set, falling back to `ctx.Err()`, so caller-owned cancel causes (`ErrInterrupted`, `errStreamSilenceTimeout`, server shutdown sentinels) propagate cleanly out of the retry loop. - Add `classifyProviderAttemptError(err)` that wraps a bare `context.Canceled` in `ErrProviderTransportReset` and reclassifies. Errors that already classify as retryable or carry a status code are left alone. - Restructure `Retry` so the policy is explicit and readable: check caller cancellation before attempting, run the attempt, check caller cancellation again before normalizing the provider error, then classify and retry. ## End-to-end behavior - Provider returns `context.Canceled` while caller context is healthy: classified as a retryable timeout, retried, the user sees a brief `type:"retry"` event and the chat continues. - User presses Stop: `contextError(ctx)` returns `ErrInterrupted`. Retry stops. `chatloop` flushes partial content and persists. - Stream-silence guard fires: `attemptCtx` is canceled with `errStreamSilenceTimeout`, `guardedStream` produces a classified retryable error, retry proceeds normally on the still-alive parent. - Server shutdown: parent context's cause propagates out, retry stops.
466 lines
14 KiB
Go
466 lines
14 KiB
Go
package chaterror
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"strings"
|
|
"time"
|
|
|
|
"golang.org/x/net/http2"
|
|
"golang.org/x/xerrors"
|
|
|
|
"github.com/coder/coder/v2/codersdk"
|
|
)
|
|
|
|
// ErrProviderTransportReset identifies provider stream cancellations that
|
|
// occur while the caller-owned chat context is still alive.
|
|
var ErrProviderTransportReset = xerrors.New("provider transport reset")
|
|
|
|
// ClassifiedError is the normalized, user-facing view of an
|
|
// underlying provider or runtime error.
|
|
type ClassifiedError struct {
|
|
Message string
|
|
Detail string
|
|
Kind codersdk.ChatErrorKind
|
|
Provider string
|
|
Retryable bool
|
|
StatusCode int
|
|
|
|
// RetryAfter is a normalized minimum retry delay derived from
|
|
// provider response metadata when available.
|
|
RetryAfter time.Duration
|
|
|
|
// ChainBroken is true when the provider reported that the
|
|
// previous_response_id (or analogous chain anchor) is no longer
|
|
// retrievable. The chatloop retry path uses this signal to exit
|
|
// chain mode and replay full history before the next attempt.
|
|
// This is an internal signal; it is not surfaced as a separate
|
|
// codersdk.ChatErrorKind so the user-visible kind set stays
|
|
// stable.
|
|
ChainBroken bool
|
|
}
|
|
|
|
// http2PeerResetCause mirrors golang.org/x/net/http2's unexported
|
|
// errFromPeer message.
|
|
const http2PeerResetCause = "received from peer"
|
|
|
|
const responsesAPIDiagnosticMessage = "The chat continuation failed due to an " +
|
|
"internal state mismatch. This is not a configuration or billing issue."
|
|
|
|
type responsesAPIDiagnosticMatch struct {
|
|
pattern string
|
|
detail string
|
|
}
|
|
|
|
type streamIncompleteMatch struct {
|
|
pattern string
|
|
provider string
|
|
}
|
|
|
|
// responsesAPIDiagnosticMatches maps provider error fragments to safe
|
|
// diagnostics. Details must not include provider item IDs because they are
|
|
// returned to clients and used by operators for grepping.
|
|
var responsesAPIDiagnosticMatches = []responsesAPIDiagnosticMatch{
|
|
{
|
|
pattern: "no tool output found for function call",
|
|
detail: "OpenAI Responses API request continuity diagnostic: match=function_call_output_missing.",
|
|
},
|
|
{
|
|
pattern: "was provided without its required 'reasoning' item",
|
|
detail: "OpenAI Responses API request continuity diagnostic: match=web_search_reasoning_missing.",
|
|
},
|
|
}
|
|
|
|
// streamIncompleteMatches maps provider stream-truncation errors from
|
|
// fantasy to clearer user-facing messages before broad EOF handling
|
|
// classifies them as generic transport timeouts.
|
|
var streamIncompleteMatches = []streamIncompleteMatch{
|
|
{
|
|
pattern: "anthropic stream closed before message_stop",
|
|
provider: "anthropic",
|
|
},
|
|
{
|
|
pattern: "openai responses stream closed before terminal event",
|
|
provider: "openai",
|
|
},
|
|
}
|
|
|
|
// WithProvider returns a copy of the classification using an explicit
|
|
// provider hint. Explicit provider hints are trusted over provider names
|
|
// heuristically parsed from the error text.
|
|
func (c ClassifiedError) WithProvider(provider string) ClassifiedError {
|
|
hint := normalizeProvider(provider)
|
|
if hint == "" {
|
|
return normalizeClassification(c)
|
|
}
|
|
if c.Provider == hint && strings.TrimSpace(c.Message) != "" {
|
|
return normalizeClassification(c)
|
|
}
|
|
updated := c
|
|
updated.Provider = hint
|
|
updated.Message = ""
|
|
return normalizeClassification(updated)
|
|
}
|
|
|
|
// WithClassification wraps err so future calls to Classify return
|
|
// classified instead of re-deriving it from err.Error().
|
|
func WithClassification(err error, classified ClassifiedError) error {
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
return &classifiedError{
|
|
cause: err,
|
|
classified: normalizeClassification(classified),
|
|
}
|
|
}
|
|
|
|
type classifiedError struct {
|
|
cause error
|
|
classified ClassifiedError
|
|
}
|
|
|
|
func (e *classifiedError) Error() string {
|
|
return e.cause.Error()
|
|
}
|
|
|
|
func (e *classifiedError) Unwrap() error {
|
|
return e.cause
|
|
}
|
|
|
|
// Classify normalizes err into a stable, user-facing payload used for
|
|
// retry handling, streamed terminal errors, and persisted last_error
|
|
// values.
|
|
func Classify(err error) ClassifiedError {
|
|
if err == nil {
|
|
return ClassifiedError{}
|
|
}
|
|
|
|
var wrapped *classifiedError
|
|
if errors.As(err, &wrapped) {
|
|
return normalizeClassification(wrapped.classified)
|
|
}
|
|
|
|
structured := extractProviderErrorDetails(err)
|
|
message := strings.TrimSpace(err.Error())
|
|
if message == "" && structured.detail == "" && structured.statusCode == 0 && structured.retryAfter <= 0 {
|
|
return ClassifiedError{}
|
|
}
|
|
|
|
lower := strings.ToLower(message)
|
|
statusCode := structured.statusCode
|
|
if statusCode == 0 {
|
|
statusCode = extractStatusCode(lower)
|
|
}
|
|
provider := detectProvider(lower)
|
|
canceled := errors.Is(err, context.Canceled)
|
|
providerTransportReset := errors.Is(err, ErrProviderTransportReset)
|
|
interrupted := containsAny(lower, interruptedPatterns...)
|
|
if interrupted {
|
|
return normalizeClassification(ClassifiedError{
|
|
Message: "The request was canceled before it completed.",
|
|
Detail: structured.detail,
|
|
Kind: codersdk.ChatErrorKindGeneric,
|
|
Provider: provider,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
})
|
|
}
|
|
|
|
if detail, ok := responsesAPIDiagnostic(lower, structured.detail); ok {
|
|
return normalizeClassification(ClassifiedError{
|
|
Message: responsesAPIDiagnosticMessage,
|
|
Detail: detail,
|
|
Kind: codersdk.ChatErrorKindGeneric,
|
|
Provider: provider,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
})
|
|
}
|
|
|
|
if classified, ok := streamIncompleteClassification(
|
|
lower,
|
|
provider,
|
|
statusCode,
|
|
structured,
|
|
); ok {
|
|
return classified
|
|
}
|
|
|
|
// Chain-broken detection runs before the generic rule table so a
|
|
// 404 carrying a chain anchor failure is not classified as a
|
|
// generic non-retryable error. The chatloop retry callback uses
|
|
// the ChainBroken flag to exit chain mode and replay full
|
|
// history.
|
|
if classified, ok := chainBrokenClassification(
|
|
lower,
|
|
provider,
|
|
statusCode,
|
|
structured,
|
|
); ok {
|
|
return classified
|
|
}
|
|
|
|
retryableHTTP2StreamReset, hasHTTP2StreamReset := classifyHTTP2StreamReset(err)
|
|
providerDisabledMatch := containsAny(lower, providerDisabledPatterns...)
|
|
deadline := errors.Is(err, context.DeadlineExceeded) || strings.Contains(lower, "context deadline exceeded")
|
|
overloadedMatch := statusCode == 529 || containsAny(lower, overloadedPatterns...)
|
|
usageLimitMatch := containsAny(lower, usageLimitPatterns...)
|
|
authStrong := statusCode == 401 || containsAny(lower, authStrongPatterns...)
|
|
configMatch := containsAny(lower, configPatterns...)
|
|
authWeak := statusCode == 403 || containsAny(lower, authWeakPatterns...)
|
|
rateLimitMatch := statusCode == 429 || containsAny(lower, rateLimitPatterns...)
|
|
timeoutPatternMatch := containsAny(lower, timeoutPatterns...)
|
|
if hasHTTP2StreamReset && !retryableHTTP2StreamReset {
|
|
// A typed HTTP/2 stream error gives us the reset code. Trust it
|
|
// over broader string fallbacks so protocol bugs do not retry.
|
|
timeoutPatternMatch = false
|
|
}
|
|
providerTransportResetMatch := providerTransportReset && statusCode == 0
|
|
timeoutMatch := providerTransportResetMatch || deadline ||
|
|
statusCode == 408 || statusCode == 502 || statusCode == 503 ||
|
|
statusCode == 504 || retryableHTTP2StreamReset ||
|
|
timeoutPatternMatch
|
|
genericRetryableMatch := statusCode == 500 || containsAny(lower, genericRetryablePatterns...)
|
|
|
|
// Config signals should beat ambiguous wrapper signals so
|
|
// transient-looking errors like "503 invalid model" fail fast.
|
|
// Overloaded stays ahead because 529/overloaded is a dedicated
|
|
// provider saturation signal, not a common transport wrapper.
|
|
// Usage-limit fires before auth so that quota/billing text wins
|
|
// over whatever HTTP status code the provider happened to use.
|
|
// Strong auth still stays above config because bad credentials are
|
|
// the root cause when both signals appear.
|
|
// Provider-disabled must precede timeout because disabled providers
|
|
// return 503, which matches the timeout rule.
|
|
rules := []struct {
|
|
match bool
|
|
kind codersdk.ChatErrorKind
|
|
retryable bool
|
|
}{
|
|
{
|
|
match: overloadedMatch,
|
|
kind: codersdk.ChatErrorKindOverloaded,
|
|
retryable: true,
|
|
},
|
|
{
|
|
match: usageLimitMatch,
|
|
kind: codersdk.ChatErrorKindUsageLimit,
|
|
retryable: false,
|
|
},
|
|
{
|
|
match: authStrong,
|
|
kind: codersdk.ChatErrorKindAuth,
|
|
retryable: false,
|
|
},
|
|
{
|
|
match: authWeak && !configMatch,
|
|
kind: codersdk.ChatErrorKindAuth,
|
|
retryable: false,
|
|
},
|
|
{
|
|
match: rateLimitMatch && !configMatch,
|
|
kind: codersdk.ChatErrorKindRateLimit,
|
|
retryable: true,
|
|
},
|
|
{
|
|
match: providerDisabledMatch,
|
|
kind: codersdk.ChatErrorKindProviderDisabled,
|
|
retryable: false,
|
|
},
|
|
{
|
|
match: timeoutMatch && !configMatch,
|
|
kind: codersdk.ChatErrorKindTimeout,
|
|
retryable: !deadline,
|
|
},
|
|
{
|
|
match: configMatch,
|
|
kind: codersdk.ChatErrorKindConfig,
|
|
retryable: false,
|
|
},
|
|
{
|
|
match: genericRetryableMatch,
|
|
kind: codersdk.ChatErrorKindGeneric,
|
|
retryable: true,
|
|
},
|
|
}
|
|
for _, rule := range rules {
|
|
if !rule.match {
|
|
continue
|
|
}
|
|
return normalizeClassification(ClassifiedError{
|
|
Detail: structured.detail,
|
|
Kind: rule.kind,
|
|
Provider: provider,
|
|
Retryable: rule.retryable,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
})
|
|
}
|
|
|
|
if canceled {
|
|
return normalizeClassification(ClassifiedError{
|
|
Message: "The request was canceled before it completed.",
|
|
Detail: structured.detail,
|
|
Kind: codersdk.ChatErrorKindGeneric,
|
|
Provider: provider,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
})
|
|
}
|
|
|
|
return normalizeClassification(ClassifiedError{
|
|
Detail: structured.detail,
|
|
Kind: codersdk.ChatErrorKindGeneric,
|
|
Provider: provider,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
})
|
|
}
|
|
|
|
func classifyHTTP2StreamReset(err error) (retryable bool, found bool) {
|
|
streamErr, ok := findHTTP2StreamError(err)
|
|
if !ok {
|
|
return false, false
|
|
}
|
|
if !isPeerHTTP2StreamError(streamErr) {
|
|
return false, true
|
|
}
|
|
return isRetryableHTTP2StreamCode(streamErr.Code), true
|
|
}
|
|
|
|
func findHTTP2StreamError(err error) (http2.StreamError, bool) {
|
|
var streamErr http2.StreamError
|
|
if errors.As(err, &streamErr) {
|
|
return streamErr, true
|
|
}
|
|
var streamErrPtr *http2.StreamError
|
|
if errors.As(err, &streamErrPtr) && streamErrPtr != nil {
|
|
return *streamErrPtr, true
|
|
}
|
|
return http2.StreamError{}, false
|
|
}
|
|
|
|
func isPeerHTTP2StreamError(streamErr http2.StreamError) bool {
|
|
return streamErr.Cause != nil && streamErr.Cause.Error() == http2PeerResetCause
|
|
}
|
|
|
|
func isRetryableHTTP2StreamCode(code http2.ErrCode) bool {
|
|
switch code {
|
|
case http2.ErrCodeNo,
|
|
http2.ErrCodeInternal,
|
|
http2.ErrCodeRefusedStream,
|
|
http2.ErrCodeCancel,
|
|
http2.ErrCodeEnhanceYourCalm:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func streamIncompleteClassification(
|
|
lowerMessage string,
|
|
provider string,
|
|
statusCode int,
|
|
structured providerErrorDetails,
|
|
) (ClassifiedError, bool) {
|
|
for _, match := range streamIncompleteMatches {
|
|
if !strings.Contains(lowerMessage, match.pattern) {
|
|
continue
|
|
}
|
|
if provider == "" {
|
|
provider = match.provider
|
|
}
|
|
return normalizeClassification(ClassifiedError{
|
|
Message: streamIncompleteMessage(provider),
|
|
Detail: structured.detail,
|
|
Kind: codersdk.ChatErrorKindTimeout,
|
|
Provider: provider,
|
|
Retryable: true,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
}), true
|
|
}
|
|
return ClassifiedError{}, false
|
|
}
|
|
|
|
func streamIncompleteMessage(provider string) string {
|
|
return providerSubject(provider) + " stream closed unexpectedly before the response completed."
|
|
}
|
|
|
|
// chainBrokenClassification recognizes the OpenAI error
|
|
// "Previous response with id ... not found" returned when a
|
|
// chained turn references a previous_response_id the provider no
|
|
// longer recognizes.
|
|
func chainBrokenClassification(
|
|
lowerMessage string,
|
|
provider string,
|
|
statusCode int,
|
|
structured providerErrorDetails,
|
|
) (ClassifiedError, bool) {
|
|
if !(strings.Contains(lowerMessage, "previous response with id") &&
|
|
strings.Contains(lowerMessage, "not found")) {
|
|
return ClassifiedError{}, false
|
|
}
|
|
// This class of error has so far only been observed with OpenAI.
|
|
if provider == "" {
|
|
provider = "openai"
|
|
}
|
|
return normalizeClassification(ClassifiedError{
|
|
Detail: structured.detail,
|
|
Kind: codersdk.ChatErrorKindGeneric,
|
|
Provider: provider,
|
|
Retryable: true,
|
|
StatusCode: statusCode,
|
|
RetryAfter: structured.retryAfter,
|
|
ChainBroken: true,
|
|
}), true
|
|
}
|
|
|
|
func responsesAPIDiagnostic(lowerMessage, detail string) (string, bool) {
|
|
lowerDetail := strings.ToLower(detail)
|
|
for _, match := range responsesAPIDiagnosticMatches {
|
|
if strings.Contains(lowerMessage, match.pattern) || strings.Contains(lowerDetail, match.pattern) {
|
|
return match.detail, true
|
|
}
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
func normalizeClassification(classified ClassifiedError) ClassifiedError {
|
|
classified.Message = strings.TrimSpace(classified.Message)
|
|
classified.Detail = normalizeClassificationDetail(classified.Detail)
|
|
classified.Kind = codersdk.ChatErrorKind(strings.TrimSpace(string(classified.Kind)))
|
|
classified.Provider = normalizeProvider(classified.Provider)
|
|
if classified.RetryAfter < 0 {
|
|
classified.RetryAfter = 0
|
|
}
|
|
if classified.Kind == "" && classified.Message == "" {
|
|
if classified.Detail == "" && classified.StatusCode == 0 &&
|
|
classified.RetryAfter <= 0 {
|
|
return ClassifiedError{}
|
|
}
|
|
classified.Kind = codersdk.ChatErrorKindGeneric
|
|
}
|
|
if classified.Kind == "" {
|
|
classified.Kind = codersdk.ChatErrorKindGeneric
|
|
}
|
|
if classified.Message == "" {
|
|
classified.Message = terminalMessage(classified)
|
|
}
|
|
return classified
|
|
}
|
|
|
|
const maxClassificationDetailRunes = 500
|
|
|
|
func normalizeClassificationDetail(detail string) string {
|
|
detail = strings.TrimSpace(detail)
|
|
if detail == "" {
|
|
return ""
|
|
}
|
|
runes := []rune(detail)
|
|
if len(runes) <= maxClassificationDetailRunes {
|
|
return detail
|
|
}
|
|
return string(runes[:maxClassificationDetailRunes-1]) + "…"
|
|
}
|