feat(codersdk): add circuit breaker configuration support for aibridge (#21546)

## Summary

Add circuit breaker support for AI Bridge to protect against cascading
failures when upstream AI providers are rate-limiting or overloaded (HTTP
429, 503, and Anthropic's 529 "overloaded" responses).

## Changes

- Add 5 new CLI options for circuit breaker configuration (only
  `--aibridge-circuit-breaker-enabled` is shown in `--help`; the four tuning
  options are registered as hidden):
  - `--aibridge-circuit-breaker-enabled` (default: false)
  - `--aibridge-circuit-breaker-failure-threshold` (default: 5, hidden)
  - `--aibridge-circuit-breaker-interval` (default: 10s, hidden)
  - `--aibridge-circuit-breaker-timeout` (default: 30s, hidden)
  - `--aibridge-circuit-breaker-max-requests` (default: 3, hidden)
- Update aibridge dependency to include circuit breaker support
- Add tests for pool creation with circuit breaker providers

## Notes

- Circuit breaker is **disabled by default** for backward compatibility
- When enabled, applies to both OpenAI and Anthropic providers
- Uses sony/gobreaker internally via the aibridge library

## Testing

```
make test RUN=TestIntegrationCircuitBreaker
```
This commit is contained in:
Kacper Sawicki
2026-01-20 14:59:29 +01:00
committed by GitHub
parent bfae5b03dc
commit ed679bb3da
12 changed files with 341 additions and 16 deletions
+4
View File
@@ -121,6 +121,10 @@ AI BRIDGE OPTIONS:
See
https://docs.claude.com/en/docs/claude-code/settings#environment-variables.
--aibridge-circuit-breaker-enabled bool, $CODER_AIBRIDGE_CIRCUIT_BREAKER_ENABLED (default: false)
Enable the circuit breaker to protect against cascading failures from
upstream AI provider rate limits (429, 503, 529 overloaded).
--aibridge-retention duration, $CODER_AIBRIDGE_RETENTION (default: 60d)
Length of time to retain data such as interceptions and all related
records (token, prompt, tool use).
+17
View File
@@ -777,6 +777,23 @@ aibridge:
# these records to external SIEM or observability systems.
# (default: false, type: bool)
structuredLogging: false
# Enable the circuit breaker to protect against cascading failures from upstream
# AI provider rate limits (429, 503, 529 overloaded).
# (default: false, type: bool)
circuitBreakerEnabled: false
# Number of consecutive failures that triggers the circuit breaker to open.
# (default: 5, type: int)
circuitBreakerFailureThreshold: 5
# Cyclic period of the closed state for clearing internal failure counts.
# (default: 10s, type: duration)
circuitBreakerInterval: 10s
# How long the circuit breaker stays open before transitioning to half-open state.
# (default: 30s, type: duration)
circuitBreakerTimeout: 30s
# Maximum number of requests allowed in half-open state before deciding to close
# or re-open the circuit.
# (default: 3, type: int)
circuitBreakerMaxRequests: 3
aibridgeproxy:
# Enable the AI Bridge MITM Proxy for intercepting and decrypting AI provider
# requests.
+16
View File
@@ -12038,6 +12038,22 @@ const docTemplate = `{
"bedrock": {
"$ref": "#/definitions/codersdk.AIBridgeBedrockConfig"
},
"circuit_breaker_enabled": {
"description": "Circuit breaker protects against cascading failures from upstream AI\nprovider rate limits (429, 503, 529 overloaded).",
"type": "boolean"
},
"circuit_breaker_failure_threshold": {
"type": "integer"
},
"circuit_breaker_interval": {
"type": "integer"
},
"circuit_breaker_max_requests": {
"type": "integer"
},
"circuit_breaker_timeout": {
"type": "integer"
},
"enabled": {
"type": "boolean"
},
+16
View File
@@ -10690,6 +10690,22 @@
"bedrock": {
"$ref": "#/definitions/codersdk.AIBridgeBedrockConfig"
},
"circuit_breaker_enabled": {
"description": "Circuit breaker protects against cascading failures from upstream AI\nprovider rate limits (429, 503, 529 overloaded).",
"type": "boolean"
},
"circuit_breaker_failure_threshold": {
"type": "integer"
},
"circuit_breaker_interval": {
"type": "integer"
},
"circuit_breaker_max_requests": {
"type": "integer"
},
"circuit_breaker_timeout": {
"type": "integer"
},
"enabled": {
"type": "boolean"
},
+73
View File
@@ -3494,6 +3494,72 @@ Write out the current server config as YAML to stdout.`,
Group: &deploymentGroupAIBridge,
YAML: "structuredLogging",
},
{
Name: "AI Bridge Circuit Breaker Enabled",
Description: "Enable the circuit breaker to protect against cascading failures from upstream AI provider rate limits (429, 503, 529 overloaded).",
Flag: "aibridge-circuit-breaker-enabled",
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_ENABLED",
Value: &c.AI.BridgeConfig.CircuitBreakerEnabled,
Default: "false",
Group: &deploymentGroupAIBridge,
YAML: "circuitBreakerEnabled",
},
{
Name: "AI Bridge Circuit Breaker Failure Threshold",
Description: "Number of consecutive failures that triggers the circuit breaker to open.",
Flag: "aibridge-circuit-breaker-failure-threshold",
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_FAILURE_THRESHOLD",
Value: serpent.Validate(&c.AI.BridgeConfig.CircuitBreakerFailureThreshold, func(value *serpent.Int64) error {
if value.Value() <= 0 || value.Value() > 100 {
return xerrors.New("must be between 1 and 100")
}
return nil
}),
Default: "5",
Hidden: true,
Group: &deploymentGroupAIBridge,
YAML: "circuitBreakerFailureThreshold",
},
{
Name: "AI Bridge Circuit Breaker Interval",
Description: "Cyclic period of the closed state for clearing internal failure counts.",
Flag: "aibridge-circuit-breaker-interval",
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_INTERVAL",
Value: &c.AI.BridgeConfig.CircuitBreakerInterval,
Default: "10s",
Hidden: true,
Group: &deploymentGroupAIBridge,
YAML: "circuitBreakerInterval",
Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"),
},
{
Name: "AI Bridge Circuit Breaker Timeout",
Description: "How long the circuit breaker stays open before transitioning to half-open state.",
Flag: "aibridge-circuit-breaker-timeout",
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_TIMEOUT",
Value: &c.AI.BridgeConfig.CircuitBreakerTimeout,
Default: "30s",
Hidden: true,
Group: &deploymentGroupAIBridge,
YAML: "circuitBreakerTimeout",
Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"),
},
{
Name: "AI Bridge Circuit Breaker Max Requests",
Description: "Maximum number of requests allowed in half-open state before deciding to close or re-open the circuit.",
Flag: "aibridge-circuit-breaker-max-requests",
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_MAX_REQUESTS",
Value: serpent.Validate(&c.AI.BridgeConfig.CircuitBreakerMaxRequests, func(value *serpent.Int64) error {
if value.Value() <= 0 || value.Value() > 100 {
return xerrors.New("must be between 1 and 100")
}
return nil
}),
Default: "3",
Hidden: true,
Group: &deploymentGroupAIBridge,
YAML: "circuitBreakerMaxRequests",
},
// AI Bridge Proxy Options
{
@@ -3641,6 +3707,13 @@ type AIBridgeConfig struct {
MaxConcurrency serpent.Int64 `json:"max_concurrency" typescript:",notnull"`
RateLimit serpent.Int64 `json:"rate_limit" typescript:",notnull"`
StructuredLogging serpent.Bool `json:"structured_logging" typescript:",notnull"`
// Circuit breaker protects against cascading failures from upstream AI
// provider rate limits (429, 503, 529 overloaded).
CircuitBreakerEnabled serpent.Bool `json:"circuit_breaker_enabled" typescript:",notnull"`
CircuitBreakerFailureThreshold serpent.Int64 `json:"circuit_breaker_failure_threshold" typescript:",notnull"`
CircuitBreakerInterval serpent.Duration `json:"circuit_breaker_interval" typescript:",notnull"`
CircuitBreakerTimeout serpent.Duration `json:"circuit_breaker_timeout" typescript:",notnull"`
CircuitBreakerMaxRequests serpent.Int64 `json:"circuit_breaker_max_requests" typescript:",notnull"`
}
type AIBridgeOpenAIConfig struct {
+5
View File
@@ -185,6 +185,11 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \
"region": "string",
"small_fast_model": "string"
},
"circuit_breaker_enabled": true,
"circuit_breaker_failure_threshold": 0,
"circuit_breaker_interval": 0,
"circuit_breaker_max_requests": 0,
"circuit_breaker_timeout": 0,
"enabled": true,
"inject_coder_mcp_tools": true,
"max_concurrency": 0,
+36 -11
View File
@@ -388,6 +388,11 @@
"region": "string",
"small_fast_model": "string"
},
"circuit_breaker_enabled": true,
"circuit_breaker_failure_threshold": 0,
"circuit_breaker_interval": 0,
"circuit_breaker_max_requests": 0,
"circuit_breaker_timeout": 0,
"enabled": true,
"inject_coder_mcp_tools": true,
"max_concurrency": 0,
@@ -403,17 +408,22 @@
### Properties
| Name | Type | Required | Restrictions | Description |
|--------------------------|----------------------------------------------------------------------|----------|--------------|-------------|
| `anthropic` | [codersdk.AIBridgeAnthropicConfig](#codersdkaibridgeanthropicconfig) | false | | |
| `bedrock` | [codersdk.AIBridgeBedrockConfig](#codersdkaibridgebedrockconfig) | false | | |
| `enabled` | boolean | false | | |
| `inject_coder_mcp_tools` | boolean | false | | |
| `max_concurrency` | integer | false | | |
| `openai` | [codersdk.AIBridgeOpenAIConfig](#codersdkaibridgeopenaiconfig) | false | | |
| `rate_limit` | integer | false | | |
| `retention` | integer | false | | |
| `structured_logging` | boolean | false | | |
| Name | Type | Required | Restrictions | Description |
|-------------------------------------|----------------------------------------------------------------------|----------|--------------|-----------------------------------------------------------------------------------------------------------------------|
| `anthropic` | [codersdk.AIBridgeAnthropicConfig](#codersdkaibridgeanthropicconfig) | false | | |
| `bedrock` | [codersdk.AIBridgeBedrockConfig](#codersdkaibridgebedrockconfig) | false | | |
| `circuit_breaker_enabled` | boolean | false | | Circuit breaker protects against cascading failures from upstream AI provider rate limits (429, 503, 529 overloaded). |
| `circuit_breaker_failure_threshold` | integer | false | | |
| `circuit_breaker_interval` | integer | false | | |
| `circuit_breaker_max_requests` | integer | false | | |
| `circuit_breaker_timeout` | integer | false | | |
| `enabled` | boolean | false | | |
| `inject_coder_mcp_tools` | boolean | false | | |
| `max_concurrency` | integer | false | | |
| `openai` | [codersdk.AIBridgeOpenAIConfig](#codersdkaibridgeopenaiconfig) | false | | |
| `rate_limit` | integer | false | | |
| `retention` | integer | false | | |
| `structured_logging` | boolean | false | | |
## codersdk.AIBridgeInterception
@@ -743,6 +753,11 @@
"region": "string",
"small_fast_model": "string"
},
"circuit_breaker_enabled": true,
"circuit_breaker_failure_threshold": 0,
"circuit_breaker_interval": 0,
"circuit_breaker_max_requests": 0,
"circuit_breaker_timeout": 0,
"enabled": true,
"inject_coder_mcp_tools": true,
"max_concurrency": 0,
@@ -2661,6 +2676,11 @@ CreateWorkspaceRequest provides options for creating a new workspace. Only one o
"region": "string",
"small_fast_model": "string"
},
"circuit_breaker_enabled": true,
"circuit_breaker_failure_threshold": 0,
"circuit_breaker_interval": 0,
"circuit_breaker_max_requests": 0,
"circuit_breaker_timeout": 0,
"enabled": true,
"inject_coder_mcp_tools": true,
"max_concurrency": 0,
@@ -3208,6 +3228,11 @@ CreateWorkspaceRequest provides options for creating a new workspace. Only one o
"region": "string",
"small_fast_model": "string"
},
"circuit_breaker_enabled": true,
"circuit_breaker_failure_threshold": 0,
"circuit_breaker_interval": 0,
"circuit_breaker_max_requests": 0,
"circuit_breaker_timeout": 0,
"enabled": true,
"inject_coder_mcp_tools": true,
"max_concurrency": 0,
+11
View File
@@ -1847,6 +1847,17 @@ Maximum number of AI Bridge requests per second per replica. Set to 0 to disable
Emit structured logs for AI Bridge interception records. Use this for exporting these records to external SIEM or observability systems.
### --aibridge-circuit-breaker-enabled
| | |
|-------------|------------------------------------------------------|
| Type | <code>bool</code> |
| Environment | <code>$CODER_AIBRIDGE_CIRCUIT_BREAKER_ENABLED</code> |
| YAML | <code>aibridge.circuitBreakerEnabled</code> |
| Default | <code>false</code> |
Enable the circuit breaker to protect against cascading failures from upstream AI provider rate limits (429, 503, 529 overloaded).
### --aibridge-proxy-enabled
| | |
@@ -20,6 +20,7 @@ import (
"go.opentelemetry.io/otel/sdk/trace/tracetest"
"github.com/coder/aibridge"
"github.com/coder/aibridge/config"
aibtracing "github.com/coder/aibridge/tracing"
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database"
@@ -415,3 +416,133 @@ func TestIntegrationWithMetrics(t *testing.T) {
return count == 1
}, testutil.WaitShort, testutil.IntervalFast, "interceptions_total metric should be 1")
}
// TestIntegrationCircuitBreaker exercises the circuit breaker end to end: it
// points the OpenAI and Anthropic providers at mock upstreams that always fail
// (429 and 529 respectively), sends three requests per provider through
// aibridged with a failure threshold of two, and then asserts that the trip
// counter and state gauge for each provider/endpoint/model both read 1.
func TestIntegrationCircuitBreaker(t *testing.T) {
	t.Parallel()

	ctx := testutil.Context(t, testutil.WaitLong)

	// Metrics are registered on a private registry so the assertions below only
	// see values produced by this test.
	reg := prometheus.NewRegistry()
	metrics := aibridge.NewMetrics(reg)

	// Fake OpenAI upstream: every request is answered with 429 Too Many Requests.
	openAIUpstream := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, _ *http.Request) {
		hdr := rw.Header()
		hdr.Set("Content-Type", "application/json")
		// Disable SDK retries.
		hdr.Set("x-should-retry", "false")
		rw.WriteHeader(http.StatusTooManyRequests)
		_, _ = rw.Write([]byte(`{"error":{"type":"rate_limit_error","message":"rate limited","code":"rate_limit_exceeded"}}`))
	}))
	t.Cleanup(openAIUpstream.Close)

	// Fake Anthropic upstream: every request is answered with 529, the status
	// Anthropic uses for overloaded errors.
	anthropicUpstream := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, _ *http.Request) {
		rw.Header().Set("Content-Type", "application/json")
		rw.WriteHeader(529)
		_, _ = rw.Write([]byte(`{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}`))
	}))
	t.Cleanup(anthropicUpstream.Close)

	// Bring up coderd backed by a test database and pubsub.
	store, pubsub := dbtestutil.NewDB(t)
	ownerClient, _, api, owner := coderdenttest.NewWithAPI(t, &coderdenttest.Options{
		Options: &coderdtest.Options{
			Database: store,
			Pubsub:   pubsub,
		},
	})
	memberClient, _ := coderdtest.CreateAnotherUser(t, ownerClient, owner.OrganizationID)

	// Mint an API token for the member; it authenticates the bridged requests.
	token, err := memberClient.CreateToken(ctx, "me", codersdk.CreateTokenRequest{
		TokenName: fmt.Sprintf("test-key-%d", time.Now().UnixNano()),
		Lifetime:  time.Hour,
		Scope:     codersdk.APIKeyScopeCoderAll,
	})
	require.NoError(t, err)

	// In-memory AI Bridge server that aibridged will dial.
	bridgeClient, err := api.CreateInMemoryAIBridgeServer(ctx)
	require.NoError(t, err)

	log := testutil.Logger(t)

	// Both providers share one breaker configuration that opens after two
	// failures; the minute-long interval and timeout are far longer than the
	// handful of requests sent below.
	breaker := &config.CircuitBreaker{
		FailureThreshold: 2,
		Interval:         time.Minute,
		Timeout:          time.Minute,
		MaxRequests:      1,
	}
	openAIProvider := aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{
		BaseURL:        openAIUpstream.URL,
		CircuitBreaker: breaker,
	})
	anthropicProvider := aibridge.NewAnthropicProvider(aibridge.AnthropicConfig{
		BaseURL:        anthropicUpstream.URL,
		Key:            "test-key",
		CircuitBreaker: breaker,
	}, nil)

	// Pool wired to the metrics under test.
	bridgePool, err := aibridged.NewCachedBridgePool(
		aibridged.DefaultPoolOptions,
		[]aibridge.Provider{openAIProvider, anthropicProvider},
		log,
		metrics,
		testTracer,
	)
	require.NoError(t, err)

	// Given: aibridged is started.
	dial := func(context.Context) (aibridged.DRPCClient, error) {
		return bridgeClient, nil
	}
	srv, err := aibridged.New(ctx, bridgePool, dial, log, testTracer)
	require.NoError(t, err, "create new aibridged")
	t.Cleanup(func() {
		_ = srv.Shutdown(ctx)
	})

	// post sends one authenticated JSON POST through aibridged and returns the
	// status code recorded for the response.
	post := func(path, body string) int {
		req, err := http.NewRequestWithContext(ctx, http.MethodPost, path, bytes.NewBufferString(body))
		require.NoError(t, err)
		req.Header.Add("Authorization", "Bearer "+token.Key)
		req.Header.Add("Accept", "application/json")

		rec := httptest.NewRecorder()
		srv.ServeHTTP(rec, req)
		return rec.Code
	}

	// One more request than the failure threshold is sent to each provider.
	const attempts = 3

	// When: the OpenAI endpoint is hit repeatedly.
	const openAIBody = `{"messages":[{"role":"user","content":"test"}],"model":"gpt-4"}`
	for n := 1; n <= attempts; n++ {
		t.Logf("OpenAI request %d: status=%d", n, post("/openai/v1/chat/completions", openAIBody))
	}

	// When: the Anthropic endpoint is hit repeatedly.
	const anthropicBody = `{"messages":[{"role":"user","content":"test"}],"model":"claude-3-5-sonnet-20241022","max_tokens":100}`
	for n := 1; n <= attempts; n++ {
		t.Logf("Anthropic request %d: status=%d", n, post("/anthropic/v1/messages", anthropicBody))
	}

	// Then: each breaker reports exactly one trip and a state gauge of 1 (open).
	require.Equal(t, 1.0,
		promtest.ToFloat64(metrics.CircuitBreakerTrips.WithLabelValues("openai", "/v1/chat/completions", "gpt-4")),
		"OpenAI CircuitBreakerTrips should be 1")
	require.Equal(t, 1.0,
		promtest.ToFloat64(metrics.CircuitBreakerState.WithLabelValues("openai", "/v1/chat/completions", "gpt-4")),
		"OpenAI CircuitBreakerState should be 1 (open)")

	require.Equal(t, 1.0,
		promtest.ToFloat64(metrics.CircuitBreakerTrips.WithLabelValues("anthropic", "/v1/messages", "claude-3-5-sonnet-20241022")),
		"Anthropic CircuitBreakerTrips should be 1")
	require.Equal(t, 1.0,
		promtest.ToFloat64(metrics.CircuitBreakerState.WithLabelValues("anthropic", "/v1/messages", "claude-3-5-sonnet-20241022")),
		"Anthropic CircuitBreakerState should be 1 (open)")
}
+19 -5
View File
@@ -9,6 +9,7 @@ import (
"golang.org/x/xerrors"
"github.com/coder/aibridge"
"github.com/coder/aibridge/config"
"github.com/coder/coder/v2/coderd/tracing"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/enterprise/aibridged"
@@ -21,15 +22,28 @@ func newAIBridgeDaemon(coderAPI *coderd.API) (*aibridged.Server, error) {
logger := coderAPI.Logger.Named("aibridged")
// Setup supported providers.
// Build circuit breaker config if enabled.
var cbConfig *config.CircuitBreaker
if coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerEnabled.Value() {
cbConfig = &config.CircuitBreaker{
FailureThreshold: uint32(coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerFailureThreshold.Value()), //nolint:gosec // Validated by serpent.Validate in deployment options.
Interval: coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerInterval.Value(),
Timeout: coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerTimeout.Value(),
MaxRequests: uint32(coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerMaxRequests.Value()), //nolint:gosec // Validated by serpent.Validate in deployment options.
}
}
// Setup supported providers with circuit breaker config.
providers := []aibridge.Provider{
aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{
BaseURL: coderAPI.DeploymentValues.AI.BridgeConfig.OpenAI.BaseURL.String(),
Key: coderAPI.DeploymentValues.AI.BridgeConfig.OpenAI.Key.String(),
BaseURL: coderAPI.DeploymentValues.AI.BridgeConfig.OpenAI.BaseURL.String(),
Key: coderAPI.DeploymentValues.AI.BridgeConfig.OpenAI.Key.String(),
CircuitBreaker: cbConfig,
}),
aibridge.NewAnthropicProvider(aibridge.AnthropicConfig{
BaseURL: coderAPI.DeploymentValues.AI.BridgeConfig.Anthropic.BaseURL.String(),
Key: coderAPI.DeploymentValues.AI.BridgeConfig.Anthropic.Key.String(),
BaseURL: coderAPI.DeploymentValues.AI.BridgeConfig.Anthropic.BaseURL.String(),
Key: coderAPI.DeploymentValues.AI.BridgeConfig.Anthropic.Key.String(),
CircuitBreaker: cbConfig,
}, getBedrockConfig(coderAPI.DeploymentValues.AI.BridgeConfig.Bedrock)),
}
+4
View File
@@ -122,6 +122,10 @@ AI BRIDGE OPTIONS:
See
https://docs.claude.com/en/docs/claude-code/settings#environment-variables.
--aibridge-circuit-breaker-enabled bool, $CODER_AIBRIDGE_CIRCUIT_BREAKER_ENABLED (default: false)
Enable the circuit breaker to protect against cascading failures from
upstream AI provider rate limits (429, 503, 529 overloaded).
--aibridge-retention duration, $CODER_AIBRIDGE_RETENTION (default: 60d)
Length of time to retain data such as interceptions and all related
records (token, prompt, tool use).
+9
View File
@@ -36,6 +36,15 @@ export interface AIBridgeConfig {
readonly max_concurrency: number;
readonly rate_limit: number;
readonly structured_logging: boolean;
/**
* Circuit breaker protects against cascading failures from upstream AI
* provider rate limits (429, 503, 529 overloaded).
*/
readonly circuit_breaker_enabled: boolean;
readonly circuit_breaker_failure_threshold: number;
readonly circuit_breaker_interval: number;
readonly circuit_breaker_timeout: number;
readonly circuit_breaker_max_requests: number;
}
// From codersdk/aibridge.go