feat: classify provider_disabled 503 as non-retryable (#25800)

Builds on top of https://github.com/coder/coder/pull/25794

Adds a new `provider_disabled` error classification in `chatd`, along with
the plumbing needed to treat it as non-retryable. Also adds a Storybook
story showing how this error kind is displayed in the UI.
This commit is contained in:
Cian Johnston
2026-05-29 13:14:04 +01:00
committed by GitHub
parent 4144eb3c4f
commit d0a51da0a9
13 changed files with 200 additions and 64 deletions
+4 -2
View File
@@ -16498,7 +16498,8 @@ const docTemplate = `{
"auth", "auth",
"config", "config",
"usage_limit", "usage_limit",
"missing_key" "missing_key",
"provider_disabled"
], ],
"x-enum-varnames": [ "x-enum-varnames": [
"ChatErrorKindGeneric", "ChatErrorKindGeneric",
@@ -16509,7 +16510,8 @@ const docTemplate = `{
"ChatErrorKindAuth", "ChatErrorKindAuth",
"ChatErrorKindConfig", "ChatErrorKindConfig",
"ChatErrorKindUsageLimit", "ChatErrorKindUsageLimit",
"ChatErrorKindMissingKey" "ChatErrorKindMissingKey",
"ChatErrorKindProviderDisabled"
] ]
}, },
"codersdk.ChatFileMetadata": { "codersdk.ChatFileMetadata": {
+4 -2
View File
@@ -14848,7 +14848,8 @@
"auth", "auth",
"config", "config",
"usage_limit", "usage_limit",
"missing_key" "missing_key",
"provider_disabled"
], ],
"x-enum-varnames": [ "x-enum-varnames": [
"ChatErrorKindGeneric", "ChatErrorKindGeneric",
@@ -14859,7 +14860,8 @@
"ChatErrorKindAuth", "ChatErrorKindAuth",
"ChatErrorKindConfig", "ChatErrorKindConfig",
"ChatErrorKindUsageLimit", "ChatErrorKindUsageLimit",
"ChatErrorKindMissingKey" "ChatErrorKindMissingKey",
"ChatErrorKindProviderDisabled"
] ]
}, },
"codersdk.ChatFileMetadata": { "codersdk.ChatFileMetadata": {
+8
View File
@@ -195,6 +195,7 @@ func Classify(err error) ClassifiedError {
} }
retryableHTTP2StreamReset, hasHTTP2StreamReset := classifyHTTP2StreamReset(err) retryableHTTP2StreamReset, hasHTTP2StreamReset := classifyHTTP2StreamReset(err)
providerDisabledMatch := containsAny(lower, providerDisabledPatterns...)
deadline := errors.Is(err, context.DeadlineExceeded) || strings.Contains(lower, "context deadline exceeded") deadline := errors.Is(err, context.DeadlineExceeded) || strings.Contains(lower, "context deadline exceeded")
overloadedMatch := statusCode == 529 || containsAny(lower, overloadedPatterns...) overloadedMatch := statusCode == 529 || containsAny(lower, overloadedPatterns...)
usageLimitMatch := containsAny(lower, usageLimitPatterns...) usageLimitMatch := containsAny(lower, usageLimitPatterns...)
@@ -221,6 +222,8 @@ func Classify(err error) ClassifiedError {
// over whatever HTTP status code the provider happened to use. // over whatever HTTP status code the provider happened to use.
// Strong auth still stays above config because bad credentials are // Strong auth still stays above config because bad credentials are
// the root cause when both signals appear. // the root cause when both signals appear.
// Provider-disabled must precede timeout because disabled providers
// return 503, which would otherwise match the timeout rule and be
// classified as a retryable timeout.
rules := []struct { rules := []struct {
match bool match bool
kind codersdk.ChatErrorKind kind codersdk.ChatErrorKind
@@ -251,6 +254,11 @@ func Classify(err error) ClassifiedError {
kind: codersdk.ChatErrorKindRateLimit, kind: codersdk.ChatErrorKindRateLimit,
retryable: true, retryable: true,
}, },
{
match: providerDisabledMatch,
kind: codersdk.ChatErrorKindProviderDisabled,
retryable: false,
},
{ {
match: timeoutMatch && !configMatch, match: timeoutMatch && !configMatch,
kind: codersdk.ChatErrorKindTimeout, kind: codersdk.ChatErrorKindTimeout,
+81
View File
@@ -2,6 +2,7 @@ package chaterror_test
import ( import (
"context" "context"
"fmt"
"io" "io"
"net/http" "net/http"
"strings" "strings"
@@ -218,6 +219,85 @@ func TestClassify(t *testing.T) {
StatusCode: 0, StatusCode: 0,
}, },
}, },
// The next cases model the error that fantasy produces
// when aibridge's disabledProviderHandler returns a 503
// plain-text sentinel. Fantasy sets Title from the HTTP
// status text and Message from the response body (including
// the trailing newline written by http.Error).
{
name: "ProviderDisabled503ClassifiesAsProviderDisabled",
err: &fantasy.ProviderError{
Title: fantasy.ErrorTitleForStatusCode(http.StatusServiceUnavailable),
Message: fmt.Sprintf("%s: AI provider %q is disabled\n", codersdk.ChatErrorKindProviderDisabled, "openai"),
StatusCode: http.StatusServiceUnavailable,
},
want: chaterror.ClassifiedError{
Message: "The OpenAI provider has been disabled. Contact your Coder administrator.",
Detail: fmt.Sprintf("%s: AI provider %q is disabled", codersdk.ChatErrorKindProviderDisabled, "openai"),
Kind: codersdk.ChatErrorKindProviderDisabled,
Provider: "openai",
Retryable: false,
StatusCode: 503,
},
},
{
name: "ProviderDisabled503UnknownProvider",
err: &fantasy.ProviderError{
Title: fantasy.ErrorTitleForStatusCode(http.StatusServiceUnavailable),
Message: fmt.Sprintf("%s: AI provider %q is disabled\n", codersdk.ChatErrorKindProviderDisabled, "mycustomprovider"),
StatusCode: http.StatusServiceUnavailable,
},
want: chaterror.ClassifiedError{
Message: "The AI provider has been disabled. Contact your Coder administrator.",
Detail: fmt.Sprintf("%s: AI provider %q is disabled", codersdk.ChatErrorKindProviderDisabled, "mycustomprovider"),
Kind: codersdk.ChatErrorKindProviderDisabled,
Provider: "",
Retryable: false,
StatusCode: 503,
},
},
{
name: "ProviderDisabledPlainErrorString",
err: xerrors.New(fmt.Sprintf("%s: AI provider %q is disabled", codersdk.ChatErrorKindProviderDisabled, "anthropic")),
want: chaterror.ClassifiedError{
Message: "The Anthropic provider has been disabled. Contact your Coder administrator.",
Kind: codersdk.ChatErrorKindProviderDisabled,
Provider: "anthropic",
Retryable: false,
StatusCode: 0,
},
},
{
name: "ProviderDisabledBeatsTimeout503",
err: &fantasy.ProviderError{
Title: fantasy.ErrorTitleForStatusCode(http.StatusServiceUnavailable),
Message: fmt.Sprintf("%s: AI provider %q is disabled\n", codersdk.ChatErrorKindProviderDisabled, "google"),
StatusCode: http.StatusServiceUnavailable,
},
want: chaterror.ClassifiedError{
Message: "The Google provider has been disabled. Contact your Coder administrator.",
Detail: fmt.Sprintf("%s: AI provider %q is disabled", codersdk.ChatErrorKindProviderDisabled, "google"),
Kind: codersdk.ChatErrorKindProviderDisabled,
Provider: "google",
Retryable: false,
StatusCode: 503,
},
},
{
name: "Generic503StillClassifiesAsTimeout",
err: &fantasy.ProviderError{
Message: "service unavailable",
StatusCode: 503,
},
want: chaterror.ClassifiedError{
Message: "The AI provider is temporarily unavailable.",
Detail: "service unavailable",
Kind: codersdk.ChatErrorKindTimeout,
Provider: "",
Retryable: true,
StatusCode: 503,
},
},
} }
for _, tt := range tests { for _, tt := range tests {
@@ -363,6 +443,7 @@ func TestClassify_PatternCoverage(t *testing.T) {
{name: "OperationInterruptedLiteral", err: "operation interrupted", wantKind: codersdk.ChatErrorKindGeneric, wantRetry: false}, {name: "OperationInterruptedLiteral", err: "operation interrupted", wantKind: codersdk.ChatErrorKindGeneric, wantRetry: false},
{name: "Status408", err: "status 408", wantKind: codersdk.ChatErrorKindTimeout, wantRetry: true}, {name: "Status408", err: "status 408", wantKind: codersdk.ChatErrorKindTimeout, wantRetry: true},
{name: "Status500", err: "status 500", wantKind: codersdk.ChatErrorKindGeneric, wantRetry: true}, {name: "Status500", err: "status 500", wantKind: codersdk.ChatErrorKindGeneric, wantRetry: true},
{name: "ProviderDisabledLiteral", err: "provider_disabled", wantKind: codersdk.ChatErrorKindProviderDisabled, wantRetry: false},
} }
for _, tt := range tests { for _, tt := range tests {
+39 -38
View File
@@ -4,6 +4,7 @@ import (
"fmt" "fmt"
"strings" "strings"
stringutil "github.com/coder/coder/v2/coderd/util/strings"
"github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk"
) )
@@ -16,60 +17,58 @@ func terminalMessage(classified ClassifiedError) string {
subject := providerSubject(classified.Provider) subject := providerSubject(classified.Provider)
switch classified.Kind { switch classified.Kind {
case codersdk.ChatErrorKindOverloaded: case codersdk.ChatErrorKindOverloaded:
return fmt.Sprintf("%s is temporarily overloaded.", subject) return stringutil.Capitalize(fmt.Sprintf("%s is temporarily overloaded.", subject))
case codersdk.ChatErrorKindRateLimit: case codersdk.ChatErrorKindRateLimit:
return fmt.Sprintf("%s is rate limiting requests.", subject) return stringutil.Capitalize(fmt.Sprintf("%s is rate limiting requests.", subject))
case codersdk.ChatErrorKindTimeout: case codersdk.ChatErrorKindTimeout:
if !classified.Retryable && classified.StatusCode == 0 { if !classified.Retryable && classified.StatusCode == 0 {
return "The request timed out before it completed." return "The request timed out before it completed."
} }
return fmt.Sprintf("%s is temporarily unavailable.", subject) return stringutil.Capitalize(fmt.Sprintf("%s is temporarily unavailable.", subject))
case codersdk.ChatErrorKindStartupTimeout: case codersdk.ChatErrorKindStartupTimeout:
return fmt.Sprintf( return stringutil.Capitalize(fmt.Sprintf(
"%s did not start responding in time.", subject, "%s did not start responding in time.", subject,
) ))
case codersdk.ChatErrorKindUsageLimit: case codersdk.ChatErrorKindUsageLimit:
displayName := providerDisplayName(classified.Provider) return stringutil.Capitalize(fmt.Sprintf(
if displayName == "" {
displayName = "the AI provider"
}
return fmt.Sprintf(
"The usage quota for %s has been exceeded."+ "The usage quota for %s has been exceeded."+
" Check the billing and quota settings for the provider account.", " Check the billing and quota settings for the provider account.",
displayName, subject,
) ))
case codersdk.ChatErrorKindAuth: case codersdk.ChatErrorKindAuth:
displayName := providerDisplayName(classified.Provider)
if displayName == "" {
displayName = "the AI provider"
}
return fmt.Sprintf( return fmt.Sprintf(
"Authentication with %s failed."+ "Authentication with %s failed."+
" Check the API key and permissions.", " Check the API key and permissions.",
displayName, subject,
) )
case codersdk.ChatErrorKindConfig: case codersdk.ChatErrorKindConfig:
return fmt.Sprintf( return stringutil.Capitalize(fmt.Sprintf(
"%s rejected the model configuration."+ "%s rejected the model configuration."+
" Check the selected model and provider settings.", " Check the selected model and provider settings.",
subject, subject,
) ))
case codersdk.ChatErrorKindMissingKey: case codersdk.ChatErrorKindMissingKey:
return "This conversation was started with an API key that is no longer available." + return "This conversation was started with an API key that is no longer available." +
" Send your message again to continue." " Send your message again to continue."
case codersdk.ChatErrorKindProviderDisabled:
displayName := providerDisplayName(classified.Provider)
return fmt.Sprintf(
"The %s provider has been disabled."+
" Contact your Coder administrator.",
displayName,
)
default: default:
if !classified.Retryable && classified.StatusCode == 0 { if !classified.Retryable && classified.StatusCode == 0 {
return "The chat request failed unexpectedly." return "The chat request failed unexpectedly."
} }
return fmt.Sprintf("%s returned an unexpected error.", subject) return stringutil.Capitalize(fmt.Sprintf("%s returned an unexpected error.", subject))
} }
} }
@@ -85,41 +84,43 @@ func retryMessage(classified ClassifiedError) string {
subject := providerSubject(classified.Provider) subject := providerSubject(classified.Provider)
switch classified.Kind { switch classified.Kind {
case codersdk.ChatErrorKindOverloaded: case codersdk.ChatErrorKindOverloaded:
return fmt.Sprintf("%s is temporarily overloaded.", subject) return stringutil.Capitalize(fmt.Sprintf("%s is temporarily overloaded.", subject))
case codersdk.ChatErrorKindRateLimit: case codersdk.ChatErrorKindRateLimit:
return fmt.Sprintf("%s is rate limiting requests.", subject) return stringutil.Capitalize(fmt.Sprintf("%s is rate limiting requests.", subject))
case codersdk.ChatErrorKindTimeout: case codersdk.ChatErrorKindTimeout:
return fmt.Sprintf("%s is temporarily unavailable.", subject) return stringutil.Capitalize(fmt.Sprintf("%s is temporarily unavailable.", subject))
case codersdk.ChatErrorKindStartupTimeout: case codersdk.ChatErrorKindStartupTimeout:
return fmt.Sprintf( return stringutil.Capitalize(fmt.Sprintf(
"%s did not start responding in time.", subject, "%s did not start responding in time.", subject,
) ))
case codersdk.ChatErrorKindAuth: case codersdk.ChatErrorKindAuth:
displayName := providerDisplayName(classified.Provider)
if displayName == "" {
displayName = "the AI provider"
}
return fmt.Sprintf( return fmt.Sprintf(
"Authentication with %s failed.", displayName, "Authentication with %s failed.", subject,
) )
case codersdk.ChatErrorKindConfig: case codersdk.ChatErrorKindConfig:
return fmt.Sprintf( return stringutil.Capitalize(fmt.Sprintf(
"%s rejected the model configuration.", subject, "%s rejected the model configuration.", subject,
) ))
case codersdk.ChatErrorKindMissingKey: case codersdk.ChatErrorKindMissingKey:
return "The API key for this conversation is no longer available." return "The API key for this conversation is no longer available."
default: case codersdk.ChatErrorKindProviderDisabled:
displayName := providerDisplayName(classified.Provider)
return fmt.Sprintf( return fmt.Sprintf(
"%s returned an unexpected error.", subject, "The %s provider has been disabled by an administrator.",
displayName,
) )
default:
return stringutil.Capitalize(fmt.Sprintf(
"%s returned an unexpected error.", subject,
))
} }
} }
func providerSubject(provider string) string { func providerSubject(provider string) string {
if displayName := providerDisplayName(provider); displayName != "" { if displayName := providerDisplayName(provider); displayName != "AI" && displayName != "" {
return displayName return displayName
} }
return "The AI provider" return "the AI provider"
} }
func providerDisplayName(provider string) string { func providerDisplayName(provider string) string {
@@ -141,7 +142,7 @@ func providerDisplayName(provider string) string {
case "vercel": case "vercel":
return "Vercel AI Gateway" return "Vercel AI Gateway"
default: default:
return "" return "AI"
} }
} }
+3
View File
@@ -4,6 +4,8 @@ import (
"regexp" "regexp"
"strconv" "strconv"
"strings" "strings"
"github.com/coder/coder/v2/aibridge"
) )
type providerHint struct { type providerHint struct {
@@ -83,6 +85,7 @@ var (
} }
genericRetryablePatterns = []string{"server error", "internal server error"} genericRetryablePatterns = []string{"server error", "internal server error"}
interruptedPatterns = []string{"chat interrupted", "request interrupted", "operation interrupted"} interruptedPatterns = []string{"chat interrupted", "request interrupted", "operation interrupted"}
providerDisabledPatterns = []string{aibridge.ErrorCodeProviderDisabled}
) )
func extractStatusCode(lower string) int { func extractStatusCode(lower string) int {
+2
View File
@@ -1534,6 +1534,7 @@ const (
ChatErrorKindConfig ChatErrorKind = "config" ChatErrorKindConfig ChatErrorKind = "config"
ChatErrorKindUsageLimit ChatErrorKind = "usage_limit" ChatErrorKindUsageLimit ChatErrorKind = "usage_limit"
ChatErrorKindMissingKey ChatErrorKind = "missing_key" ChatErrorKindMissingKey ChatErrorKind = "missing_key"
ChatErrorKindProviderDisabled ChatErrorKind = "provider_disabled"
) )
// AllChatErrorKinds contains every ChatErrorKind value. // AllChatErrorKinds contains every ChatErrorKind value.
@@ -1548,6 +1549,7 @@ var AllChatErrorKinds = []ChatErrorKind{
ChatErrorKindConfig, ChatErrorKindConfig,
ChatErrorKindUsageLimit, ChatErrorKindUsageLimit,
ChatErrorKindMissingKey, ChatErrorKindMissingKey,
ChatErrorKindProviderDisabled,
} }
// ChatError represents a terminal chat error in persisted chat state or the // ChatError represents a terminal chat error in persisted chat state or the
+2 -2
View File
@@ -293,9 +293,9 @@ Status Code **200**
#### Enumerated Values #### Enumerated Values
| Property | Value(s) | | Property | Value(s) |
|---------------|---------------------------------------------------------------------------------------------------------------------| |---------------|------------------------------------------------------------------------------------------------------------------------------------------|
| `client_type` | `api`, `ui` | | `client_type` | `api`, `ui` |
| `kind` | `auth`, `config`, `generic`, `missing_key`, `overloaded`, `rate_limit`, `startup_timeout`, `timeout`, `usage_limit` | | `kind` | `auth`, `config`, `generic`, `missing_key`, `overloaded`, `provider_disabled`, `rate_limit`, `startup_timeout`, `timeout`, `usage_limit` |
| `type` | `context-file`, `file`, `file-reference`, `reasoning`, `skill`, `source`, `text`, `tool-call`, `tool-result` | | `type` | `context-file`, `file`, `file-reference`, `reasoning`, `skill`, `source`, `text`, `tool-call`, `tool-result` |
| `plan_mode` | `plan` | | `plan_mode` | `plan` |
| `status` | `completed`, `error`, `paused`, `pending`, `requires_action`, `running`, `waiting` | | `status` | `completed`, `error`, `paused`, `pending`, `requires_action`, `running`, `waiting` |
+2 -2
View File
@@ -2682,8 +2682,8 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
#### Enumerated Values #### Enumerated Values
| Value(s) | | Value(s) |
|---------------------------------------------------------------------------------------------------------------------| |------------------------------------------------------------------------------------------------------------------------------------------|
| `auth`, `config`, `generic`, `missing_key`, `overloaded`, `rate_limit`, `startup_timeout`, `timeout`, `usage_limit` | | `auth`, `config`, `generic`, `missing_key`, `overloaded`, `provider_disabled`, `rate_limit`, `startup_timeout`, `timeout`, `usage_limit` |
## codersdk.ChatFileMetadata ## codersdk.ChatFileMetadata
+2
View File
@@ -1969,6 +1969,7 @@ export type ChatErrorKind =
| "generic" | "generic"
| "missing_key" | "missing_key"
| "overloaded" | "overloaded"
| "provider_disabled"
| "rate_limit" | "rate_limit"
| "startup_timeout" | "startup_timeout"
| "timeout" | "timeout"
@@ -1980,6 +1981,7 @@ export const ChatErrorKinds: ChatErrorKind[] = [
"generic", "generic",
"missing_key", "missing_key",
"overloaded", "overloaded",
"provider_disabled",
"rate_limit", "rate_limit",
"startup_timeout", "startup_timeout",
"timeout", "timeout",
@@ -288,6 +288,40 @@ export const TerminalStartupTimeoutError: Story = {
}, },
}; };
/**
 * A `provider_disabled` stream error is terminal: the callout shows the
 * "Provider disabled" heading, the administrator-facing message, and the
 * HTTP status, but offers neither a retry indicator nor a status link.
 */
export const TerminalProviderDisabledError: Story = {
  args: {
    ...defaultArgs,
    liveStatus: buildLiveStatus({
      streamError: {
        kind: "provider_disabled",
        message:
          "The OpenAI provider has been disabled. Contact your Coder administrator.",
        provider: "openai",
        retryable: false,
        statusCode: 503,
      },
    }),
  },
  play: async ({ canvasElement }) => {
    const view = within(canvasElement);

    // Title comes from the error kind, not from the message text.
    const heading = view.getByRole("heading", { name: /provider disabled/i });
    expect(heading).toBeVisible();

    // The full message, including the call to action, is rendered.
    const body = view.getByText(
      /the openai provider has been disabled.*contact your coder administrator/i,
    );
    expect(body).toBeVisible();

    // The upstream status code is surfaced verbatim.
    const statusLabel = view.getByText(/^HTTP 503$/);
    expect(statusLabel).toBeVisible();

    // Administrative disablement is not retried and has no status page link.
    const retryText = view.queryByText(/retrying/i);
    expect(retryText).not.toBeInTheDocument();

    const statusLink = view.queryByRole("link", { name: /status/i });
    expect(statusLink).not.toBeInTheDocument();
  },
};
/** Generic failures do not show usage or provider CTAs. */ /** Generic failures do not show usage or provider CTAs. */
export const GenericErrorDoesNotShowUsageAction: Story = { export const GenericErrorDoesNotShowUsageAction: Story = {
args: { args: {
@@ -44,6 +44,8 @@ export const getErrorTitle = (
return "Usage limit reached"; return "Usage limit reached";
case "missing_key": case "missing_key":
return "Chat interrupted"; return "Chat interrupted";
case "provider_disabled":
return "Provider disabled";
default: default:
return mode === "retry" ? "Retrying request" : "Request failed"; return mode === "retry" ? "Retrying request" : "Request failed";
} }
@@ -11,9 +11,8 @@ type UsageLimitData = Partial<
/** /**
* Typed classification for errors surfaced in the agent detail view. * Typed classification for errors surfaced in the agent detail view.
* - "usage_limit": the user hit a spending cap (409 + valid usage data). * - "usage_limit": the user hit a spending cap (409 + valid usage data).
* - other kinds come from normalized stream/provider failures such as * - other kinds come from normalized stream/provider failures.
* "generic", "overloaded", "rate_limit", "timeout", * See ChatErrorKind for the full set.
* "startup_timeout", "auth", and "config".
*/ */
export type ChatDetailError = { export type ChatDetailError = {
message: string; message: string;