mirror of
https://github.com/coder/coder.git
synced 2026-06-06 14:38:23 +00:00
f1155ac4d7
## Description

Adds automatic key failover for the centralized Anthropic provider. When a key pool is configured, each upstream call walks the pool and tries keys in order until one succeeds or the pool is exhausted. Keys are marked **temporary** on 429 (with cooldown from `Retry-After`) and **permanent** on 401/403. Errors that aren't key-specific don't trigger failover. Each agentic-loop iteration gets its own fresh walker, so a tool-call continuation can fail over independently of the initial request. BYOK is unchanged: BYOK requests run as a single attempt with no failover.

## Changes

- `config.Anthropic` carries a `KeyPool`. `Key` remains for the BYOK `X-Api-Key` set per interception.
- Blocking interceptor: walks the pool, marks keys on key-specific failures, returns on first success or non-failover error.
- Streaming interceptor: per-iteration walker. Pre-stream failures fail over to the next key; mid-stream errors are relayed as SSE events.
- New `keypool` error types: `TransientExhaustionError` (carries soonest cooldown) and `ErrPermanentExhaustion`. These replace the prior `ErrAllKeysExhausted`.
- Error responses now consistently include the outer `"type": "error"` field.

## Related Issues

Related to: https://github.com/coder/internal/issues/1446
Related to: https://linear.app/codercom/issue/AIGOV-197/aibridge-automatic-key-failover-for-bridged-and-passthrough-routes

## Follow-up PRs

- Bedrock multi-key support.
- Refactor provider vs interceptor config separation.
- Record the actually-used key in the interception credential hint after failover.

> [!NOTE]
> Initially generated by Claude Opus 4.7, modified and reviewed by @ssncferreira
55 lines
1.4 KiB
Go
55 lines
1.4 KiB
Go
package keypool
|
|
|
|
import (
|
|
"context"
|
|
"net/http"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/aibridge/utils"
|
|
)
|
|
|
|
// MarkKeyOnStatus marks key based on a key-specific HTTP
|
|
// status code from resp (429 for temporary, 401 or 403 for
|
|
// permanent). Returns true if the status was a key-specific
|
|
// failover trigger so callers can retry with the next key.
|
|
func MarkKeyOnStatus(
|
|
ctx context.Context,
|
|
key *Key,
|
|
resp *http.Response,
|
|
logger slog.Logger,
|
|
providerName string,
|
|
) bool {
|
|
if resp == nil {
|
|
return false
|
|
}
|
|
statusCode := resp.StatusCode
|
|
switch statusCode {
|
|
case http.StatusTooManyRequests:
|
|
cooldown := ParseRetryAfter(resp)
|
|
if cooldown <= 0 {
|
|
cooldown = defaultCooldown
|
|
}
|
|
if key.MarkTemporary(cooldown) {
|
|
logger.Info(ctx, "key marked temporary",
|
|
slog.F("provider", providerName),
|
|
slog.F("api_key_hint", utils.MaskSecret(key.Value())),
|
|
slog.F("status", statusCode),
|
|
slog.F("cooldown", cooldown))
|
|
}
|
|
return true
|
|
case http.StatusUnauthorized, http.StatusForbidden:
|
|
if key.MarkPermanent() {
|
|
logger.Warn(ctx, "key marked permanent",
|
|
slog.F("provider", providerName),
|
|
slog.F("api_key_hint", utils.MaskSecret(key.Value())),
|
|
slog.F("status", statusCode))
|
|
}
|
|
return true
|
|
default:
|
|
logger.Debug(ctx, "status is not a key failover trigger",
|
|
slog.F("provider", providerName),
|
|
slog.F("status", statusCode))
|
|
return false
|
|
}
|
|
}
|