mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat(codersdk): add circuit breaker configuration support for aibridge (#21546)
## Summary

Add circuit breaker support for AI Bridge to protect against cascading failures from upstream AI provider rate limits (HTTP 429, 503, and Anthropic's 529 overloaded responses).

## Changes

- Add 5 new CLI options for circuit breaker configuration:
  - `--aibridge-circuit-breaker-enabled` (default: false)
  - `--aibridge-circuit-breaker-failure-threshold` (default: 5)
  - `--aibridge-circuit-breaker-interval` (default: 10s)
  - `--aibridge-circuit-breaker-timeout` (default: 30s)
  - `--aibridge-circuit-breaker-max-requests` (default: 3)
- Update aibridge dependency to include circuit breaker support
- Add tests for pool creation with circuit breaker providers

## Notes

- Circuit breaker is **disabled by default** for backward compatibility
- When enabled, applies to both OpenAI and Anthropic providers
- Uses sony/gobreaker internally via the aibridge library

## Testing

```
make test RUN=TestPoolWithCircuitBreakerProviders
```
This commit is contained in:
+4
@@ -121,6 +121,10 @@ AI BRIDGE OPTIONS:
|
|||||||
See
|
See
|
||||||
https://docs.claude.com/en/docs/claude-code/settings#environment-variables.
|
https://docs.claude.com/en/docs/claude-code/settings#environment-variables.
|
||||||
|
|
||||||
|
--aibridge-circuit-breaker-enabled bool, $CODER_AIBRIDGE_CIRCUIT_BREAKER_ENABLED (default: false)
|
||||||
|
Enable the circuit breaker to protect against cascading failures from
|
||||||
|
upstream AI provider rate limits (429, 503, 529 overloaded).
|
||||||
|
|
||||||
--aibridge-retention duration, $CODER_AIBRIDGE_RETENTION (default: 60d)
|
--aibridge-retention duration, $CODER_AIBRIDGE_RETENTION (default: 60d)
|
||||||
Length of time to retain data such as interceptions and all related
|
Length of time to retain data such as interceptions and all related
|
||||||
records (token, prompt, tool use).
|
records (token, prompt, tool use).
|
||||||
|
|||||||
+17
@@ -777,6 +777,23 @@ aibridge:
|
|||||||
# these records to external SIEM or observability systems.
|
# these records to external SIEM or observability systems.
|
||||||
# (default: false, type: bool)
|
# (default: false, type: bool)
|
||||||
structuredLogging: false
|
structuredLogging: false
|
||||||
|
# Enable the circuit breaker to protect against cascading failures from upstream
|
||||||
|
# AI provider rate limits (429, 503, 529 overloaded).
|
||||||
|
# (default: false, type: bool)
|
||||||
|
circuitBreakerEnabled: false
|
||||||
|
# Number of consecutive failures that triggers the circuit breaker to open.
|
||||||
|
# (default: 5, type: int)
|
||||||
|
circuitBreakerFailureThreshold: 5
|
||||||
|
# Cyclic period of the closed state for clearing internal failure counts.
|
||||||
|
# (default: 10s, type: duration)
|
||||||
|
circuitBreakerInterval: 10s
|
||||||
|
# How long the circuit breaker stays open before transitioning to half-open state.
|
||||||
|
# (default: 30s, type: duration)
|
||||||
|
circuitBreakerTimeout: 30s
|
||||||
|
# Maximum number of requests allowed in half-open state before deciding to close
|
||||||
|
# or re-open the circuit.
|
||||||
|
# (default: 3, type: int)
|
||||||
|
circuitBreakerMaxRequests: 3
|
||||||
aibridgeproxy:
|
aibridgeproxy:
|
||||||
# Enable the AI Bridge MITM Proxy for intercepting and decrypting AI provider
|
# Enable the AI Bridge MITM Proxy for intercepting and decrypting AI provider
|
||||||
# requests.
|
# requests.
|
||||||
|
|||||||
Generated
+16
@@ -12038,6 +12038,22 @@ const docTemplate = `{
|
|||||||
"bedrock": {
|
"bedrock": {
|
||||||
"$ref": "#/definitions/codersdk.AIBridgeBedrockConfig"
|
"$ref": "#/definitions/codersdk.AIBridgeBedrockConfig"
|
||||||
},
|
},
|
||||||
|
"circuit_breaker_enabled": {
|
||||||
|
"description": "Circuit breaker protects against cascading failures from upstream AI\nprovider rate limits (429, 503, 529 overloaded).",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"circuit_breaker_failure_threshold": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"circuit_breaker_interval": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"circuit_breaker_max_requests": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"circuit_breaker_timeout": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
"enabled": {
|
"enabled": {
|
||||||
"type": "boolean"
|
"type": "boolean"
|
||||||
},
|
},
|
||||||
|
|||||||
Generated
+16
@@ -10690,6 +10690,22 @@
|
|||||||
"bedrock": {
|
"bedrock": {
|
||||||
"$ref": "#/definitions/codersdk.AIBridgeBedrockConfig"
|
"$ref": "#/definitions/codersdk.AIBridgeBedrockConfig"
|
||||||
},
|
},
|
||||||
|
"circuit_breaker_enabled": {
|
||||||
|
"description": "Circuit breaker protects against cascading failures from upstream AI\nprovider rate limits (429, 503, 529 overloaded).",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"circuit_breaker_failure_threshold": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"circuit_breaker_interval": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"circuit_breaker_max_requests": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"circuit_breaker_timeout": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
"enabled": {
|
"enabled": {
|
||||||
"type": "boolean"
|
"type": "boolean"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -3494,6 +3494,72 @@ Write out the current server config as YAML to stdout.`,
|
|||||||
Group: &deploymentGroupAIBridge,
|
Group: &deploymentGroupAIBridge,
|
||||||
YAML: "structuredLogging",
|
YAML: "structuredLogging",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
Name: "AI Bridge Circuit Breaker Enabled",
|
||||||
|
Description: "Enable the circuit breaker to protect against cascading failures from upstream AI provider rate limits (429, 503, 529 overloaded).",
|
||||||
|
Flag: "aibridge-circuit-breaker-enabled",
|
||||||
|
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_ENABLED",
|
||||||
|
Value: &c.AI.BridgeConfig.CircuitBreakerEnabled,
|
||||||
|
Default: "false",
|
||||||
|
Group: &deploymentGroupAIBridge,
|
||||||
|
YAML: "circuitBreakerEnabled",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "AI Bridge Circuit Breaker Failure Threshold",
|
||||||
|
Description: "Number of consecutive failures that triggers the circuit breaker to open.",
|
||||||
|
Flag: "aibridge-circuit-breaker-failure-threshold",
|
||||||
|
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_FAILURE_THRESHOLD",
|
||||||
|
Value: serpent.Validate(&c.AI.BridgeConfig.CircuitBreakerFailureThreshold, func(value *serpent.Int64) error {
|
||||||
|
if value.Value() <= 0 || value.Value() > 100 {
|
||||||
|
return xerrors.New("must be between 1 and 100")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}),
|
||||||
|
Default: "5",
|
||||||
|
Hidden: true,
|
||||||
|
Group: &deploymentGroupAIBridge,
|
||||||
|
YAML: "circuitBreakerFailureThreshold",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "AI Bridge Circuit Breaker Interval",
|
||||||
|
Description: "Cyclic period of the closed state for clearing internal failure counts.",
|
||||||
|
Flag: "aibridge-circuit-breaker-interval",
|
||||||
|
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_INTERVAL",
|
||||||
|
Value: &c.AI.BridgeConfig.CircuitBreakerInterval,
|
||||||
|
Default: "10s",
|
||||||
|
Hidden: true,
|
||||||
|
Group: &deploymentGroupAIBridge,
|
||||||
|
YAML: "circuitBreakerInterval",
|
||||||
|
Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "AI Bridge Circuit Breaker Timeout",
|
||||||
|
Description: "How long the circuit breaker stays open before transitioning to half-open state.",
|
||||||
|
Flag: "aibridge-circuit-breaker-timeout",
|
||||||
|
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_TIMEOUT",
|
||||||
|
Value: &c.AI.BridgeConfig.CircuitBreakerTimeout,
|
||||||
|
Default: "30s",
|
||||||
|
Hidden: true,
|
||||||
|
Group: &deploymentGroupAIBridge,
|
||||||
|
YAML: "circuitBreakerTimeout",
|
||||||
|
Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "AI Bridge Circuit Breaker Max Requests",
|
||||||
|
Description: "Maximum number of requests allowed in half-open state before deciding to close or re-open the circuit.",
|
||||||
|
Flag: "aibridge-circuit-breaker-max-requests",
|
||||||
|
Env: "CODER_AIBRIDGE_CIRCUIT_BREAKER_MAX_REQUESTS",
|
||||||
|
Value: serpent.Validate(&c.AI.BridgeConfig.CircuitBreakerMaxRequests, func(value *serpent.Int64) error {
|
||||||
|
if value.Value() <= 0 || value.Value() > 100 {
|
||||||
|
return xerrors.New("must be between 1 and 100")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}),
|
||||||
|
Default: "3",
|
||||||
|
Hidden: true,
|
||||||
|
Group: &deploymentGroupAIBridge,
|
||||||
|
YAML: "circuitBreakerMaxRequests",
|
||||||
|
},
|
||||||
|
|
||||||
// AI Bridge Proxy Options
|
// AI Bridge Proxy Options
|
||||||
{
|
{
|
||||||
@@ -3641,6 +3707,13 @@ type AIBridgeConfig struct {
|
|||||||
MaxConcurrency serpent.Int64 `json:"max_concurrency" typescript:",notnull"`
|
MaxConcurrency serpent.Int64 `json:"max_concurrency" typescript:",notnull"`
|
||||||
RateLimit serpent.Int64 `json:"rate_limit" typescript:",notnull"`
|
RateLimit serpent.Int64 `json:"rate_limit" typescript:",notnull"`
|
||||||
StructuredLogging serpent.Bool `json:"structured_logging" typescript:",notnull"`
|
StructuredLogging serpent.Bool `json:"structured_logging" typescript:",notnull"`
|
||||||
|
// Circuit breaker protects against cascading failures from upstream AI
|
||||||
|
// provider rate limits (429, 503, 529 overloaded).
|
||||||
|
CircuitBreakerEnabled serpent.Bool `json:"circuit_breaker_enabled" typescript:",notnull"`
|
||||||
|
CircuitBreakerFailureThreshold serpent.Int64 `json:"circuit_breaker_failure_threshold" typescript:",notnull"`
|
||||||
|
CircuitBreakerInterval serpent.Duration `json:"circuit_breaker_interval" typescript:",notnull"`
|
||||||
|
CircuitBreakerTimeout serpent.Duration `json:"circuit_breaker_timeout" typescript:",notnull"`
|
||||||
|
CircuitBreakerMaxRequests serpent.Int64 `json:"circuit_breaker_max_requests" typescript:",notnull"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type AIBridgeOpenAIConfig struct {
|
type AIBridgeOpenAIConfig struct {
|
||||||
|
|||||||
Generated
+5
@@ -185,6 +185,11 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \
|
|||||||
"region": "string",
|
"region": "string",
|
||||||
"small_fast_model": "string"
|
"small_fast_model": "string"
|
||||||
},
|
},
|
||||||
|
"circuit_breaker_enabled": true,
|
||||||
|
"circuit_breaker_failure_threshold": 0,
|
||||||
|
"circuit_breaker_interval": 0,
|
||||||
|
"circuit_breaker_max_requests": 0,
|
||||||
|
"circuit_breaker_timeout": 0,
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"inject_coder_mcp_tools": true,
|
"inject_coder_mcp_tools": true,
|
||||||
"max_concurrency": 0,
|
"max_concurrency": 0,
|
||||||
|
|||||||
Generated
+36
-11
@@ -388,6 +388,11 @@
|
|||||||
"region": "string",
|
"region": "string",
|
||||||
"small_fast_model": "string"
|
"small_fast_model": "string"
|
||||||
},
|
},
|
||||||
|
"circuit_breaker_enabled": true,
|
||||||
|
"circuit_breaker_failure_threshold": 0,
|
||||||
|
"circuit_breaker_interval": 0,
|
||||||
|
"circuit_breaker_max_requests": 0,
|
||||||
|
"circuit_breaker_timeout": 0,
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"inject_coder_mcp_tools": true,
|
"inject_coder_mcp_tools": true,
|
||||||
"max_concurrency": 0,
|
"max_concurrency": 0,
|
||||||
@@ -403,17 +408,22 @@
|
|||||||
|
|
||||||
### Properties
|
### Properties
|
||||||
|
|
||||||
| Name | Type | Required | Restrictions | Description |
|
| Name | Type | Required | Restrictions | Description |
|
||||||
|--------------------------|----------------------------------------------------------------------|----------|--------------|-------------|
|
|-------------------------------------|----------------------------------------------------------------------|----------|--------------|-----------------------------------------------------------------------------------------------------------------------|
|
||||||
| `anthropic` | [codersdk.AIBridgeAnthropicConfig](#codersdkaibridgeanthropicconfig) | false | | |
|
| `anthropic` | [codersdk.AIBridgeAnthropicConfig](#codersdkaibridgeanthropicconfig) | false | | |
|
||||||
| `bedrock` | [codersdk.AIBridgeBedrockConfig](#codersdkaibridgebedrockconfig) | false | | |
|
| `bedrock` | [codersdk.AIBridgeBedrockConfig](#codersdkaibridgebedrockconfig) | false | | |
|
||||||
| `enabled` | boolean | false | | |
|
| `circuit_breaker_enabled` | boolean | false | | Circuit breaker protects against cascading failures from upstream AI provider rate limits (429, 503, 529 overloaded). |
|
||||||
| `inject_coder_mcp_tools` | boolean | false | | |
|
| `circuit_breaker_failure_threshold` | integer | false | | |
|
||||||
| `max_concurrency` | integer | false | | |
|
| `circuit_breaker_interval` | integer | false | | |
|
||||||
| `openai` | [codersdk.AIBridgeOpenAIConfig](#codersdkaibridgeopenaiconfig) | false | | |
|
| `circuit_breaker_max_requests` | integer | false | | |
|
||||||
| `rate_limit` | integer | false | | |
|
| `circuit_breaker_timeout` | integer | false | | |
|
||||||
| `retention` | integer | false | | |
|
| `enabled` | boolean | false | | |
|
||||||
| `structured_logging` | boolean | false | | |
|
| `inject_coder_mcp_tools` | boolean | false | | |
|
||||||
|
| `max_concurrency` | integer | false | | |
|
||||||
|
| `openai` | [codersdk.AIBridgeOpenAIConfig](#codersdkaibridgeopenaiconfig) | false | | |
|
||||||
|
| `rate_limit` | integer | false | | |
|
||||||
|
| `retention` | integer | false | | |
|
||||||
|
| `structured_logging` | boolean | false | | |
|
||||||
|
|
||||||
## codersdk.AIBridgeInterception
|
## codersdk.AIBridgeInterception
|
||||||
|
|
||||||
@@ -743,6 +753,11 @@
|
|||||||
"region": "string",
|
"region": "string",
|
||||||
"small_fast_model": "string"
|
"small_fast_model": "string"
|
||||||
},
|
},
|
||||||
|
"circuit_breaker_enabled": true,
|
||||||
|
"circuit_breaker_failure_threshold": 0,
|
||||||
|
"circuit_breaker_interval": 0,
|
||||||
|
"circuit_breaker_max_requests": 0,
|
||||||
|
"circuit_breaker_timeout": 0,
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"inject_coder_mcp_tools": true,
|
"inject_coder_mcp_tools": true,
|
||||||
"max_concurrency": 0,
|
"max_concurrency": 0,
|
||||||
@@ -2661,6 +2676,11 @@ CreateWorkspaceRequest provides options for creating a new workspace. Only one o
|
|||||||
"region": "string",
|
"region": "string",
|
||||||
"small_fast_model": "string"
|
"small_fast_model": "string"
|
||||||
},
|
},
|
||||||
|
"circuit_breaker_enabled": true,
|
||||||
|
"circuit_breaker_failure_threshold": 0,
|
||||||
|
"circuit_breaker_interval": 0,
|
||||||
|
"circuit_breaker_max_requests": 0,
|
||||||
|
"circuit_breaker_timeout": 0,
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"inject_coder_mcp_tools": true,
|
"inject_coder_mcp_tools": true,
|
||||||
"max_concurrency": 0,
|
"max_concurrency": 0,
|
||||||
@@ -3208,6 +3228,11 @@ CreateWorkspaceRequest provides options for creating a new workspace. Only one o
|
|||||||
"region": "string",
|
"region": "string",
|
||||||
"small_fast_model": "string"
|
"small_fast_model": "string"
|
||||||
},
|
},
|
||||||
|
"circuit_breaker_enabled": true,
|
||||||
|
"circuit_breaker_failure_threshold": 0,
|
||||||
|
"circuit_breaker_interval": 0,
|
||||||
|
"circuit_breaker_max_requests": 0,
|
||||||
|
"circuit_breaker_timeout": 0,
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"inject_coder_mcp_tools": true,
|
"inject_coder_mcp_tools": true,
|
||||||
"max_concurrency": 0,
|
"max_concurrency": 0,
|
||||||
|
|||||||
Generated
+11
@@ -1847,6 +1847,17 @@ Maximum number of AI Bridge requests per second per replica. Set to 0 to disable
|
|||||||
|
|
||||||
Emit structured logs for AI Bridge interception records. Use this for exporting these records to external SIEM or observability systems.
|
Emit structured logs for AI Bridge interception records. Use this for exporting these records to external SIEM or observability systems.
|
||||||
|
|
||||||
|
### --aibridge-circuit-breaker-enabled
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|-------------|------------------------------------------------------|
|
||||||
|
| Type | <code>bool</code> |
|
||||||
|
| Environment | <code>$CODER_AIBRIDGE_CIRCUIT_BREAKER_ENABLED</code> |
|
||||||
|
| YAML | <code>aibridge.circuitBreakerEnabled</code> |
|
||||||
|
| Default | <code>false</code> |
|
||||||
|
|
||||||
|
Enable the circuit breaker to protect against cascading failures from upstream AI provider rate limits (429, 503, 529 overloaded).
|
||||||
|
|
||||||
### --aibridge-proxy-enabled
|
### --aibridge-proxy-enabled
|
||||||
|
|
||||||
| | |
|
| | |
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ import (
|
|||||||
"go.opentelemetry.io/otel/sdk/trace/tracetest"
|
"go.opentelemetry.io/otel/sdk/trace/tracetest"
|
||||||
|
|
||||||
"github.com/coder/aibridge"
|
"github.com/coder/aibridge"
|
||||||
|
"github.com/coder/aibridge/config"
|
||||||
aibtracing "github.com/coder/aibridge/tracing"
|
aibtracing "github.com/coder/aibridge/tracing"
|
||||||
"github.com/coder/coder/v2/coderd/coderdtest"
|
"github.com/coder/coder/v2/coderd/coderdtest"
|
||||||
"github.com/coder/coder/v2/coderd/database"
|
"github.com/coder/coder/v2/coderd/database"
|
||||||
@@ -415,3 +416,133 @@ func TestIntegrationWithMetrics(t *testing.T) {
|
|||||||
return count == 1
|
return count == 1
|
||||||
}, testutil.WaitShort, testutil.IntervalFast, "interceptions_total metric should be 1")
|
}, testutil.WaitShort, testutil.IntervalFast, "interceptions_total metric should be 1")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestIntegrationCircuitBreaker validates that the circuit breaker opens after
|
||||||
|
// consecutive failures and that the corresponding metrics are exposed.
|
||||||
|
func TestIntegrationCircuitBreaker(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
ctx := testutil.Context(t, testutil.WaitLong)
|
||||||
|
|
||||||
|
// Create prometheus registry and metrics.
|
||||||
|
registry := prometheus.NewRegistry()
|
||||||
|
metrics := aibridge.NewMetrics(registry)
|
||||||
|
|
||||||
|
// Set up mock OpenAI server that always returns 429 Too Many Requests.
|
||||||
|
mockOpenAI := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
// Disable SDK retries.
|
||||||
|
w.Header().Set("x-should-retry", "false")
|
||||||
|
w.WriteHeader(http.StatusTooManyRequests)
|
||||||
|
_, _ = w.Write([]byte(`{"error":{"type":"rate_limit_error","message":"rate limited","code":"rate_limit_exceeded"}}`))
|
||||||
|
}))
|
||||||
|
t.Cleanup(mockOpenAI.Close)
|
||||||
|
|
||||||
|
// Set up mock Anthropic server that always returns 529 Overloaded.
|
||||||
|
mockAnthropic := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
// Anthropic uses 529 for overloaded errors.
|
||||||
|
w.WriteHeader(529)
|
||||||
|
_, _ = w.Write([]byte(`{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}`))
|
||||||
|
}))
|
||||||
|
t.Cleanup(mockAnthropic.Close)
|
||||||
|
|
||||||
|
// Database and coderd setup.
|
||||||
|
db, ps := dbtestutil.NewDB(t)
|
||||||
|
client, _, api, firstUser := coderdenttest.NewWithAPI(t, &coderdenttest.Options{
|
||||||
|
Options: &coderdtest.Options{
|
||||||
|
Database: db,
|
||||||
|
Pubsub: ps,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
userClient, _ := coderdtest.CreateAnotherUser(t, client, firstUser.OrganizationID)
|
||||||
|
|
||||||
|
// Create an API token for the user.
|
||||||
|
apiKey, err := userClient.CreateToken(ctx, "me", codersdk.CreateTokenRequest{
|
||||||
|
TokenName: fmt.Sprintf("test-key-%d", time.Now().UnixNano()),
|
||||||
|
Lifetime: time.Hour,
|
||||||
|
Scope: codersdk.APIKeyScopeCoderAll,
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
// Create aibridge client.
|
||||||
|
aiBridgeClient, err := api.CreateInMemoryAIBridgeServer(ctx)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
logger := testutil.Logger(t)
|
||||||
|
|
||||||
|
// Create providers with circuit breaker configured to open after 2 failures.
|
||||||
|
cbConfig := &config.CircuitBreaker{
|
||||||
|
FailureThreshold: 2,
|
||||||
|
Interval: time.Minute,
|
||||||
|
Timeout: time.Minute,
|
||||||
|
MaxRequests: 1,
|
||||||
|
}
|
||||||
|
providers := []aibridge.Provider{
|
||||||
|
aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{
|
||||||
|
BaseURL: mockOpenAI.URL,
|
||||||
|
CircuitBreaker: cbConfig,
|
||||||
|
}),
|
||||||
|
aibridge.NewAnthropicProvider(aibridge.AnthropicConfig{
|
||||||
|
BaseURL: mockAnthropic.URL,
|
||||||
|
Key: "test-key",
|
||||||
|
CircuitBreaker: cbConfig,
|
||||||
|
}, nil),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create pool with metrics.
|
||||||
|
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, logger, metrics, testTracer)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
// Given: aibridged is started.
|
||||||
|
srv, err := aibridged.New(ctx, pool, func(ctx context.Context) (aibridged.DRPCClient, error) {
|
||||||
|
return aiBridgeClient, nil
|
||||||
|
}, logger, testTracer)
|
||||||
|
require.NoError(t, err, "create new aibridged")
|
||||||
|
t.Cleanup(func() {
|
||||||
|
_ = srv.Shutdown(ctx)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Test OpenAI circuit breaker.
|
||||||
|
openaiRequestBody := `{"messages":[{"role":"user","content":"test"}],"model":"gpt-4"}`
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, "/openai/v1/chat/completions", bytes.NewBufferString(openaiRequestBody))
|
||||||
|
require.NoError(t, err)
|
||||||
|
req.Header.Add("Authorization", "Bearer "+apiKey.Key)
|
||||||
|
req.Header.Add("Accept", "application/json")
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
srv.ServeHTTP(rec, req)
|
||||||
|
t.Logf("OpenAI request %d: status=%d", i+1, rec.Code)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test Anthropic circuit breaker.
|
||||||
|
anthropicRequestBody := `{"messages":[{"role":"user","content":"test"}],"model":"claude-3-5-sonnet-20241022","max_tokens":100}`
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, "/anthropic/v1/messages", bytes.NewBufferString(anthropicRequestBody))
|
||||||
|
require.NoError(t, err)
|
||||||
|
req.Header.Add("Authorization", "Bearer "+apiKey.Key)
|
||||||
|
req.Header.Add("Accept", "application/json")
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
srv.ServeHTTP(rec, req)
|
||||||
|
t.Logf("Anthropic request %d: status=%d", i+1, rec.Code)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then: the circuit breaker metrics should reflect that both circuits opened.
|
||||||
|
|
||||||
|
// OpenAI circuit breaker should have tripped (state=1 means open).
|
||||||
|
openaiTrips := promtest.ToFloat64(metrics.CircuitBreakerTrips.WithLabelValues("openai", "/v1/chat/completions", "gpt-4"))
|
||||||
|
require.Equal(t, 1.0, openaiTrips, "OpenAI CircuitBreakerTrips should be 1")
|
||||||
|
|
||||||
|
openaiState := promtest.ToFloat64(metrics.CircuitBreakerState.WithLabelValues("openai", "/v1/chat/completions", "gpt-4"))
|
||||||
|
require.Equal(t, 1.0, openaiState, "OpenAI CircuitBreakerState should be 1 (open)")
|
||||||
|
|
||||||
|
// Anthropic circuit breaker should have tripped.
|
||||||
|
anthropicTrips := promtest.ToFloat64(metrics.CircuitBreakerTrips.WithLabelValues("anthropic", "/v1/messages", "claude-3-5-sonnet-20241022"))
|
||||||
|
require.Equal(t, 1.0, anthropicTrips, "Anthropic CircuitBreakerTrips should be 1")
|
||||||
|
|
||||||
|
anthropicState := promtest.ToFloat64(metrics.CircuitBreakerState.WithLabelValues("anthropic", "/v1/messages", "claude-3-5-sonnet-20241022"))
|
||||||
|
require.Equal(t, 1.0, anthropicState, "Anthropic CircuitBreakerState should be 1 (open)")
|
||||||
|
}
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
"golang.org/x/xerrors"
|
"golang.org/x/xerrors"
|
||||||
|
|
||||||
"github.com/coder/aibridge"
|
"github.com/coder/aibridge"
|
||||||
|
"github.com/coder/aibridge/config"
|
||||||
"github.com/coder/coder/v2/coderd/tracing"
|
"github.com/coder/coder/v2/coderd/tracing"
|
||||||
"github.com/coder/coder/v2/codersdk"
|
"github.com/coder/coder/v2/codersdk"
|
||||||
"github.com/coder/coder/v2/enterprise/aibridged"
|
"github.com/coder/coder/v2/enterprise/aibridged"
|
||||||
@@ -21,15 +22,28 @@ func newAIBridgeDaemon(coderAPI *coderd.API) (*aibridged.Server, error) {
|
|||||||
|
|
||||||
logger := coderAPI.Logger.Named("aibridged")
|
logger := coderAPI.Logger.Named("aibridged")
|
||||||
|
|
||||||
// Setup supported providers.
|
// Build circuit breaker config if enabled.
|
||||||
|
var cbConfig *config.CircuitBreaker
|
||||||
|
if coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerEnabled.Value() {
|
||||||
|
cbConfig = &config.CircuitBreaker{
|
||||||
|
FailureThreshold: uint32(coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerFailureThreshold.Value()), //nolint:gosec // Validated by serpent.Validate in deployment options.
|
||||||
|
Interval: coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerInterval.Value(),
|
||||||
|
Timeout: coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerTimeout.Value(),
|
||||||
|
MaxRequests: uint32(coderAPI.DeploymentValues.AI.BridgeConfig.CircuitBreakerMaxRequests.Value()), //nolint:gosec // Validated by serpent.Validate in deployment options.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup supported providers with circuit breaker config.
|
||||||
providers := []aibridge.Provider{
|
providers := []aibridge.Provider{
|
||||||
aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{
|
aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{
|
||||||
BaseURL: coderAPI.DeploymentValues.AI.BridgeConfig.OpenAI.BaseURL.String(),
|
BaseURL: coderAPI.DeploymentValues.AI.BridgeConfig.OpenAI.BaseURL.String(),
|
||||||
Key: coderAPI.DeploymentValues.AI.BridgeConfig.OpenAI.Key.String(),
|
Key: coderAPI.DeploymentValues.AI.BridgeConfig.OpenAI.Key.String(),
|
||||||
|
CircuitBreaker: cbConfig,
|
||||||
}),
|
}),
|
||||||
aibridge.NewAnthropicProvider(aibridge.AnthropicConfig{
|
aibridge.NewAnthropicProvider(aibridge.AnthropicConfig{
|
||||||
BaseURL: coderAPI.DeploymentValues.AI.BridgeConfig.Anthropic.BaseURL.String(),
|
BaseURL: coderAPI.DeploymentValues.AI.BridgeConfig.Anthropic.BaseURL.String(),
|
||||||
Key: coderAPI.DeploymentValues.AI.BridgeConfig.Anthropic.Key.String(),
|
Key: coderAPI.DeploymentValues.AI.BridgeConfig.Anthropic.Key.String(),
|
||||||
|
CircuitBreaker: cbConfig,
|
||||||
}, getBedrockConfig(coderAPI.DeploymentValues.AI.BridgeConfig.Bedrock)),
|
}, getBedrockConfig(coderAPI.DeploymentValues.AI.BridgeConfig.Bedrock)),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -122,6 +122,10 @@ AI BRIDGE OPTIONS:
|
|||||||
See
|
See
|
||||||
https://docs.claude.com/en/docs/claude-code/settings#environment-variables.
|
https://docs.claude.com/en/docs/claude-code/settings#environment-variables.
|
||||||
|
|
||||||
|
--aibridge-circuit-breaker-enabled bool, $CODER_AIBRIDGE_CIRCUIT_BREAKER_ENABLED (default: false)
|
||||||
|
Enable the circuit breaker to protect against cascading failures from
|
||||||
|
upstream AI provider rate limits (429, 503, 529 overloaded).
|
||||||
|
|
||||||
--aibridge-retention duration, $CODER_AIBRIDGE_RETENTION (default: 60d)
|
--aibridge-retention duration, $CODER_AIBRIDGE_RETENTION (default: 60d)
|
||||||
Length of time to retain data such as interceptions and all related
|
Length of time to retain data such as interceptions and all related
|
||||||
records (token, prompt, tool use).
|
records (token, prompt, tool use).
|
||||||
|
|||||||
Generated
+9
@@ -36,6 +36,15 @@ export interface AIBridgeConfig {
|
|||||||
readonly max_concurrency: number;
|
readonly max_concurrency: number;
|
||||||
readonly rate_limit: number;
|
readonly rate_limit: number;
|
||||||
readonly structured_logging: boolean;
|
readonly structured_logging: boolean;
|
||||||
|
/**
|
||||||
|
* Circuit breaker protects against cascading failures from upstream AI
|
||||||
|
* provider rate limits (429, 503, 529 overloaded).
|
||||||
|
*/
|
||||||
|
readonly circuit_breaker_enabled: boolean;
|
||||||
|
readonly circuit_breaker_failure_threshold: number;
|
||||||
|
readonly circuit_breaker_interval: number;
|
||||||
|
readonly circuit_breaker_timeout: number;
|
||||||
|
readonly circuit_breaker_max_requests: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
// From codersdk/aibridge.go
|
// From codersdk/aibridge.go
|
||||||
|
|||||||
Reference in New Issue
Block a user