feat(coderd): add overload protection with rate limiting and concurrency control (#21161)

## Summary This adds configurable overload protection to the AI Bridge daemon to prevent the server from being overwhelmed during periods of high load. Partially addresses coder/internal#1153 (rate limits and concurrency control; circuit breakers are deferred to a follow-up). ## New Configuration Options | Option | Environment Variable | Description | Default | |--------|---------------------|-------------|---------| | `--aibridge-max-concurrency` | `CODER_AIBRIDGE_MAX_CONCURRENCY` | Maximum number of concurrent AI Bridge requests. Set to 0 to disable (unlimited). | `0` | | `--aibridge-rate-limit` | `CODER_AIBRIDGE_RATE_LIMIT` | Maximum number of AI Bridge requests per second. Set to 0 to disable rate limiting. | `0` | ## Behavior When limits are exceeded: - **Concurrency limit**: Returns HTTP `503 Service Unavailable` with message "AI Bridge is currently at capacity. Please try again later." - **Rate limit**: Returns HTTP `429 Too Many Requests` with `Retry-After` header. Both protections are optional and disabled by default (0 values). ## Implementation The overload protection is implemented as reusable middleware in `coderd/httpmw/ratelimit.go`: 1. **`RateLimitByAuthToken`**: Per-user rate limiting that uses `APITokenFromRequest` to extract the authentication token, with fallback to `X-Api-Key` header for AI provider compatibility (e.g., Anthropic). Falls back to IP-based rate limiting if no token is present. Includes `Retry-After` header for backpressure signaling. 2. **`ConcurrencyLimit`**: Uses an atomic counter to track in-flight requests and reject when at capacity. The middleware is applied in `enterprise/coderd/aibridge.go` via `r.Group` in the following order: 1. Concurrency check (faster rejection for load shedding) 2. Rate limit check **Note**: Rate limiting currently applies to all AI Bridge requests, including pass-through requests. Ideally only actual interceptions should count, but this would require changes in the aibridge library. ## Testing Added comprehensive tests for: - Rate limiting by auth token (Bearer token, X-Api-Key, no token fallback to IP) - Different tokens not rate limited against each other - Disabled when limit is zero - Retry-After header is set on 429 responses - Concurrency limiting (allows within limit, rejects over limit, disabled when zero)
2026-06-02 20:48:20 +00:00 · 2025-12-11 16:38:54 +01:00
parent 8ead6f795d
commit 6f86f67754
17 changed files with 568 additions and 37 deletions
@@ -125,12 +125,20 @@ AI BRIDGE OPTIONS:
          requests (requires the "oauth2" and "mcp-server-http" experiments to
          be enabled).

+      --aibridge-max-concurrency int, $CODER_AIBRIDGE_MAX_CONCURRENCY (default: 0)
+          Maximum number of concurrent AI Bridge requests per replica. Set to 0
+          to disable (unlimited).
+
      --aibridge-openai-base-url string, $CODER_AIBRIDGE_OPENAI_BASE_URL (default: https://api.openai.com/v1/)
          The base URL of the OpenAI API.

      --aibridge-openai-key string, $CODER_AIBRIDGE_OPENAI_KEY
          The key to authenticate against the OpenAI API.

+      --aibridge-rate-limit int, $CODER_AIBRIDGE_RATE_LIMIT (default: 0)
+          Maximum number of AI Bridge requests per second per replica. Set to 0
+          to disable (unlimited).
+
 CLIENT OPTIONS: 
 These options change the behavior of how clients interact with the Coder.
 Clients include the Coder CLI, Coder Desktop, IDE extensions, and the web UI.
@@ -748,6 +748,14 @@ aibridge:
  # (token, prompt, tool use).
  # (default: 60d, type: duration)
  retention: 1440h0m0s
+  # Maximum number of concurrent AI Bridge requests per replica. Set to 0 to disable
+  # (unlimited).
+  # (default: 0, type: int)
+  maxConcurrency: 0
+  # Maximum number of AI Bridge requests per second per replica. Set to 0 to disable
+  # (unlimited).
+  # (default: 0, type: int)
+  rateLimit: 0
 # Configure data retention policies for various database tables. Retention
 # policies automatically purge old data to reduce database size and improve
 # performance. Setting a retention duration to 0 disables automatic purging for
@@ -0,0 +1,24 @@
+// Package aibridge provides utilities for the AI Bridge feature.
+package aibridge
+
+import (
+	"net/http"
+	"strings"
+)
+
+// ExtractAuthToken extracts an authorization token from HTTP headers.
+// It checks the Authorization header (Bearer token) and X-Api-Key header,
+// which represent the different ways clients authenticate against AI providers.
+// If neither are present, an empty string is returned.
+func ExtractAuthToken(header http.Header) string {
+	if auth := strings.TrimSpace(header.Get("Authorization")); auth != "" {
+		fields := strings.Fields(auth)
+		if len(fields) == 2 && strings.EqualFold(fields[0], "Bearer") {
+			return fields[1]
+		}
+	}
+	if apiKey := strings.TrimSpace(header.Get("X-Api-Key")); apiKey != "" {
+		return apiKey
+	}
+	return ""
+}
@@ -11883,9 +11883,15 @@ const docTemplate = `{
                "inject_coder_mcp_tools": {
                    "type": "boolean"
                },
+                "max_concurrency": {
+                    "type": "integer"
+                },
                "openai": {
                    "$ref": "#/definitions/codersdk.AIBridgeOpenAIConfig"
                },
+                "rate_limit": {
+                    "type": "integer"
+                },
                "retention": {
                    "type": "integer"
                }
@@ -10549,9 +10549,15 @@
 				"inject_coder_mcp_tools": {
 					"type": "boolean"
 				},
+				"max_concurrency": {
+					"type": "integer"
+				},
 				"openai": {
 					"$ref": "#/definitions/codersdk.AIBridgeOpenAIConfig"
 				},
+				"rate_limit": {
+					"type": "integer"
+				},
 				"retention": {
 					"type": "integer"
 				}
@@ -4,11 +4,13 @@ import (
 	"fmt"
 	"net/http"
 	"strconv"
+	"sync/atomic"
 	"time"

 	"github.com/go-chi/httprate"
 	"golang.org/x/xerrors"

+	"github.com/coder/coder/v2/coderd/aibridge"
 	"github.com/coder/coder/v2/coderd/database"
 	"github.com/coder/coder/v2/coderd/httpapi"
 	"github.com/coder/coder/v2/coderd/rbac"
@@ -70,3 +72,72 @@ func RateLimit(count int, window time.Duration) func(http.Handler) http.Handler
 		}),
 	)
 }
+
+// RateLimitByAuthToken returns a handler that limits requests based on the
+// authentication token in the request.
+//
+// This differs from [RateLimit] in several ways:
+//   - It extracts the token directly from request headers (Authorization Bearer
+//     or X-Api-Key) rather than from the request context, making it suitable for
+//     endpoints that handle authentication internally (like AI Bridge) rather than
+//     via [ExtractAPIKeyMW] middleware.
+//   - It does not support the bypass header for Owners.
+//   - It does not key by endpoint, so the limit applies across all endpoints using
+//     this middleware.
+//   - It includes a Retry-After header in 429 responses for backpressure signaling.
+//
+// If no token is found in the headers, it falls back to rate limiting by IP address.
+func RateLimitByAuthToken(count int, window time.Duration) func(http.Handler) http.Handler {
+	if count <= 0 {
+		return func(handler http.Handler) http.Handler {
+			return handler
+		}
+	}
+
+	return httprate.Limit(
+		count,
+		window,
+		httprate.WithKeyFuncs(func(r *http.Request) (string, error) {
+			// Try to extract auth token for per-user rate limiting using
+			// AI provider authentication headers (Authorization Bearer or X-Api-Key).
+			if token := aibridge.ExtractAuthToken(r.Header); token != "" {
+				return token, nil
+			}
+			// Fall back to IP-based rate limiting if no token present.
+			return httprate.KeyByIP(r)
+		}),
+		httprate.WithLimitHandler(func(w http.ResponseWriter, r *http.Request) {
+			// Add Retry-After header for backpressure signaling.
+			w.Header().Set("Retry-After", fmt.Sprintf("%d", int(window.Seconds())))
+			httpapi.Write(r.Context(), w, http.StatusTooManyRequests, codersdk.Response{
+				Message: "You've been rate limited. Please try again later.",
+			})
+		}),
+	)
+}
+
+// ConcurrencyLimit returns a handler that limits the number of concurrent
+// requests. When the limit is exceeded, it returns HTTP 503 Service Unavailable.
+func ConcurrencyLimit(maxConcurrent int64, resourceName string) func(http.Handler) http.Handler {
+	if maxConcurrent <= 0 {
+		return func(handler http.Handler) http.Handler {
+			return handler
+		}
+	}
+
+	var current atomic.Int64
+	return func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			c := current.Add(1)
+			defer current.Add(-1)
+
+			if c > maxConcurrent {
+				httpapi.Write(r.Context(), w, http.StatusServiceUnavailable, codersdk.Response{
+					Message: fmt.Sprintf("%s is currently at capacity. Please try again later.", resourceName),
+				})
+				return
+			}
+			next.ServeHTTP(w, r)
+		})
+	}
+}
@@ -6,6 +6,7 @@ import (
 	"net"
 	"net/http"
 	"net/http/httptest"
+	"sync"
 	"testing"
 	"time"

@@ -17,6 +18,7 @@ import (
 	"github.com/coder/coder/v2/coderd/database/dbtestutil"
 	"github.com/coder/coder/v2/coderd/httpmw"
 	"github.com/coder/coder/v2/codersdk"
+	"github.com/coder/coder/v2/testutil"
 )

 func randRemoteAddr() string {
@@ -145,3 +147,211 @@ func TestRateLimit(t *testing.T) {
 		}
 	})
 }
+
+func TestRateLimitByAuthToken(t *testing.T) {
+	t.Parallel()
+
+	t.Run("LimitsByAuthHeader", func(t *testing.T) {
+		t.Parallel()
+
+		tests := []struct {
+			name       string
+			headerName string
+			headerVal  string
+		}{
+			{
+				name:       "BearerToken",
+				headerName: "Authorization",
+				headerVal:  "Bearer test-token-123",
+			},
+			{
+				name:       "XApiKey",
+				headerName: "X-Api-Key",
+				headerVal:  "test-api-key-456",
+			},
+			{
+				name:       "NoToken",
+				headerName: "",
+				headerVal:  "",
+			},
+		}
+
+		for _, tt := range tests {
+			t.Run(tt.name, func(t *testing.T) {
+				t.Parallel()
+				rtr := chi.NewRouter()
+				rtr.Use(httpmw.RateLimitByAuthToken(2, time.Hour))
+				rtr.Get("/", func(rw http.ResponseWriter, r *http.Request) {
+					rw.WriteHeader(http.StatusOK)
+				})
+
+				// Same token (or IP if no token) should be rate limited after 2 requests.
+				for i := 0; i < 5; i++ {
+					req := httptest.NewRequest("GET", "/", nil)
+					if tt.headerName != "" {
+						req.Header.Set(tt.headerName, tt.headerVal)
+					}
+					rec := httptest.NewRecorder()
+					rtr.ServeHTTP(rec, req)
+					resp := rec.Result()
+					_ = resp.Body.Close()
+					if i < 2 {
+						require.Equal(t, http.StatusOK, resp.StatusCode, "request %d should succeed", i)
+					} else {
+						require.Equal(t, http.StatusTooManyRequests, resp.StatusCode, "request %d should be rate limited", i)
+						// Verify Retry-After header is set.
+						require.NotEmpty(t, resp.Header.Get("Retry-After"), "Retry-After header should be set")
+					}
+				}
+			})
+		}
+	})
+
+	t.Run("DifferentTokensNotLimited", func(t *testing.T) {
+		t.Parallel()
+		rtr := chi.NewRouter()
+		rtr.Use(httpmw.RateLimitByAuthToken(1, time.Hour))
+		rtr.Get("/", func(rw http.ResponseWriter, r *http.Request) {
+			rw.WriteHeader(http.StatusOK)
+		})
+
+		// Different tokens should not be rate limited against each other.
+		for i := 0; i < 5; i++ {
+			req := httptest.NewRequest("GET", "/", nil)
+			req.Header.Set("Authorization", fmt.Sprintf("Bearer token-%d", i))
+			rec := httptest.NewRecorder()
+			rtr.ServeHTTP(rec, req)
+			resp := rec.Result()
+			_ = resp.Body.Close()
+			require.Equal(t, http.StatusOK, resp.StatusCode, "request %d should succeed", i)
+		}
+	})
+
+	t.Run("DisabledWhenZero", func(t *testing.T) {
+		t.Parallel()
+		rtr := chi.NewRouter()
+		rtr.Use(httpmw.RateLimitByAuthToken(0, time.Hour))
+		rtr.Get("/", func(rw http.ResponseWriter, r *http.Request) {
+			rw.WriteHeader(http.StatusOK)
+		})
+
+		// Should not be rate limited when limit is 0.
+		for i := 0; i < 10; i++ {
+			req := httptest.NewRequest("GET", "/", nil)
+			req.Header.Set("Authorization", "Bearer same-token")
+			rec := httptest.NewRecorder()
+			rtr.ServeHTTP(rec, req)
+			resp := rec.Result()
+			_ = resp.Body.Close()
+			require.Equal(t, http.StatusOK, resp.StatusCode)
+		}
+	})
+}
+
+func TestConcurrencyLimit(t *testing.T) {
+	t.Parallel()
+
+	t.Run("LimitsConcurrentRequests", func(t *testing.T) {
+		t.Parallel()
+
+		const maxConcurrency = 2
+		rtr := chi.NewRouter()
+		rtr.Use(httpmw.ConcurrencyLimit(maxConcurrency, "Test"))
+
+		// Use a WaitGroup as a barrier to ensure all requests are in the handler
+		// before any of them proceed.
+		var handlersReady sync.WaitGroup
+		handlersReady.Add(maxConcurrency)
+		releaseHandler := make(chan struct{})
+
+		rtr.Get("/", func(rw http.ResponseWriter, r *http.Request) {
+			handlersReady.Done()
+			// Wait until released.
+			<-releaseHandler
+			rw.WriteHeader(http.StatusOK)
+		})
+
+		server := httptest.NewServer(rtr)
+		defer server.Close()
+
+		ctx := testutil.Context(t, testutil.WaitShort)
+
+		// Start maxConcurrency requests that will block.
+		// We use channels to collect errors instead of require in goroutines.
+		type result struct {
+			statusCode int
+			err        error
+		}
+		results := make(chan result, maxConcurrency)
+
+		var wg sync.WaitGroup
+		for i := 0; i < maxConcurrency; i++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				req, err := http.NewRequestWithContext(ctx, http.MethodGet, server.URL+"/", nil)
+				if err != nil {
+					results <- result{err: err}
+					return
+				}
+				resp, err := http.DefaultClient.Do(req)
+				if err != nil {
+					results <- result{err: err}
+					return
+				}
+				defer resp.Body.Close()
+				results <- result{statusCode: resp.StatusCode}
+			}()
+		}
+
+		// Wait for all requests to enter the handler with a timeout.
+		handlersReadyCh := make(chan struct{})
+		go func() {
+			handlersReady.Wait()
+			close(handlersReadyCh)
+		}()
+		select {
+		case <-handlersReadyCh:
+		case <-ctx.Done():
+			t.Fatal("timed out waiting for handlers to be ready")
+		}
+
+		// Next request should be rejected since we're at capacity.
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, server.URL+"/", nil)
+		require.NoError(t, err)
+		resp, err := http.DefaultClient.Do(req)
+		require.NoError(t, err)
+		defer resp.Body.Close()
+		require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode)
+
+		// Release all blocked requests.
+		close(releaseHandler)
+		wg.Wait()
+		close(results)
+
+		// Check all goroutine results.
+		for res := range results {
+			require.NoError(t, res.err)
+			require.Equal(t, http.StatusOK, res.statusCode)
+		}
+	})
+
+	t.Run("DisabledWhenZero", func(t *testing.T) {
+		t.Parallel()
+		rtr := chi.NewRouter()
+		rtr.Use(httpmw.ConcurrencyLimit(0, "Test"))
+		rtr.Get("/", func(rw http.ResponseWriter, r *http.Request) {
+			rw.WriteHeader(http.StatusOK)
+		})
+
+		// Should not be limited when maxConcurrency is 0.
+		for i := 0; i < 10; i++ {
+			req := httptest.NewRequest("GET", "/", nil)
+			rec := httptest.NewRecorder()
+			rtr.ServeHTTP(rec, req)
+			resp := rec.Result()
+			_ = resp.Body.Close()
+			require.Equal(t, http.StatusOK, resp.StatusCode)
+		}
+	})
+}
@@ -3401,6 +3401,26 @@ Write out the current server config as YAML to stdout.`,
 			YAML:        "retention",
 			Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"),
 		},
+		{
+			Name:        "AI Bridge Max Concurrency",
+			Description: "Maximum number of concurrent AI Bridge requests per replica. Set to 0 to disable (unlimited).",
+			Flag:        "aibridge-max-concurrency",
+			Env:         "CODER_AIBRIDGE_MAX_CONCURRENCY",
+			Value:       &c.AI.BridgeConfig.MaxConcurrency,
+			Default:     "0",
+			Group:       &deploymentGroupAIBridge,
+			YAML:        "maxConcurrency",
+		},
+		{
+			Name:        "AI Bridge Rate Limit",
+			Description: "Maximum number of AI Bridge requests per second per replica. Set to 0 to disable (unlimited).",
+			Flag:        "aibridge-rate-limit",
+			Env:         "CODER_AIBRIDGE_RATE_LIMIT",
+			Value:       &c.AI.BridgeConfig.RateLimit,
+			Default:     "0",
+			Group:       &deploymentGroupAIBridge,
+			YAML:        "rateLimit",
+		},
 		// Retention settings
 		{
 			Name:        "Audit Logs Retention",
@@ -3471,6 +3491,8 @@ type AIBridgeConfig struct {
 	Bedrock             AIBridgeBedrockConfig   `json:"bedrock" typescript:",notnull"`
 	InjectCoderMCPTools serpent.Bool            `json:"inject_coder_mcp_tools" typescript:",notnull"`
 	Retention           serpent.Duration        `json:"retention" typescript:",notnull"`
+	MaxConcurrency      serpent.Int64           `json:"max_concurrency" typescript:",notnull"`
+	RateLimit           serpent.Int64           `json:"rate_limit" typescript:",notnull"`
 }

 type AIBridgeOpenAIConfig struct {
@@ -176,10 +176,12 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \
        },
        "enabled": true,
        "inject_coder_mcp_tools": true,
+        "max_concurrency": 0,
        "openai": {
          "base_url": "string",
          "key": "string"
        },
+        "rate_limit": 0,
        "retention": 0
      }
    },
@@ -390,10 +390,12 @@
  },
  "enabled": true,
  "inject_coder_mcp_tools": true,
+  "max_concurrency": 0,
  "openai": {
    "base_url": "string",
    "key": "string"
  },
+  "rate_limit": 0,
  "retention": 0
 }
 ```
@@ -406,7 +408,9 @@
 | `bedrock`                | [codersdk.AIBridgeBedrockConfig](#codersdkaibridgebedrockconfig)     | false    |              |             |
 | `enabled`                | boolean                                                              | false    |              |             |
 | `inject_coder_mcp_tools` | boolean                                                              | false    |              |             |
+| `max_concurrency`        | integer                                                              | false    |              |             |
 | `openai`                 | [codersdk.AIBridgeOpenAIConfig](#codersdkaibridgeopenaiconfig)       | false    |              |             |
+| `rate_limit`             | integer                                                              | false    |              |             |
 | `retention`              | integer                                                              | false    |              |             |

 ## codersdk.AIBridgeInterception
@@ -700,10 +704,12 @@
    },
    "enabled": true,
    "inject_coder_mcp_tools": true,
+    "max_concurrency": 0,
    "openai": {
      "base_url": "string",
      "key": "string"
    },
+    "rate_limit": 0,
    "retention": 0
  }
 }
@@ -2860,10 +2866,12 @@ CreateWorkspaceRequest provides options for creating a new workspace. Only one o
        },
        "enabled": true,
        "inject_coder_mcp_tools": true,
+        "max_concurrency": 0,
        "openai": {
          "base_url": "string",
          "key": "string"
        },
+        "rate_limit": 0,
        "retention": 0
      }
    },
@@ -3383,10 +3391,12 @@ CreateWorkspaceRequest provides options for creating a new workspace. Only one o
      },
      "enabled": true,
      "inject_coder_mcp_tools": true,
+      "max_concurrency": 0,
      "openai": {
        "base_url": "string",
        "key": "string"
      },
+      "rate_limit": 0,
      "retention": 0
    }
  },
@@ -1781,6 +1781,28 @@ Whether to inject Coder's MCP tools into intercepted AI Bridge requests (require

 Length of time to retain data such as interceptions and all related records (token, prompt, tool use).

+### --aibridge-max-concurrency
+
+|             |                                              |
+|-------------|----------------------------------------------|
+| Type        | <code>int</code>                             |
+| Environment | <code>$CODER_AIBRIDGE_MAX_CONCURRENCY</code> |
+| YAML        | <code>aibridge.maxConcurrency</code>         |
+| Default     | <code>0</code>                               |
+
+Maximum number of concurrent AI Bridge requests per replica. Set to 0 to disable (unlimited).
+
+### --aibridge-rate-limit
+
+|             |                                         |
+|-------------|-----------------------------------------|
+| Type        | <code>int</code>                        |
+| Environment | <code>$CODER_AIBRIDGE_RATE_LIMIT</code> |
+| YAML        | <code>aibridge.rateLimit</code>         |
+| Default     | <code>0</code>                          |
+
+Maximum number of AI Bridge requests per second per replica. Set to 0 to disable (unlimited).
+
 ### --audit-logs-retention

 |             |                                          |
@@ -17,6 +17,7 @@ import (

 	"cdr.dev/slog/sloggers/slogtest"
 	"github.com/coder/aibridge"
+	agplaibridge "github.com/coder/coder/v2/coderd/aibridge"
 	"github.com/coder/coder/v2/codersdk"
 	"github.com/coder/coder/v2/enterprise/aibridged"
 	mock "github.com/coder/coder/v2/enterprise/aibridged/aibridgedmock"
@@ -220,7 +221,7 @@ func TestExtractAuthToken(t *testing.T) {
 			for k, v := range tc.headers {
 				headers.Add(k, v)
 			}
-			key := aibridged.ExtractAuthToken(headers)
+			key := agplaibridge.ExtractAuthToken(headers)
 			require.Equal(t, tc.expectedKey, key)
 		})
 	}
@@ -9,6 +9,7 @@ import (

 	"cdr.dev/slog"
 	"github.com/coder/aibridge"
+	agplaibridge "github.com/coder/coder/v2/coderd/aibridge"
 	"github.com/coder/coder/v2/enterprise/aibridged/proto"
 )

@@ -35,7 +36,7 @@ func (s *Server) ServeHTTP(rw http.ResponseWriter, r *http.Request) {

 	logger := s.logger.With(slog.F("path", r.URL.Path))

-	key := strings.TrimSpace(ExtractAuthToken(r.Header))
+	key := strings.TrimSpace(agplaibridge.ExtractAuthToken(r.Header))
 	if key == "" {
 		logger.Warn(ctx, "no auth key provided")
 		http.Error(rw, ErrNoAuthKey.Error(), http.StatusBadRequest)
@@ -79,20 +80,3 @@ func (s *Server) ServeHTTP(rw http.ResponseWriter, r *http.Request) {

 	handler.ServeHTTP(rw, r)
 }
-
-// ExtractAuthToken extracts authorization token from HTTP request using multiple sources.
-// These sources represent the different ways clients authenticate against AI providers.
-// It checks the Authorization header (Bearer token) and X-Api-Key header.
-// If neither are present, an empty string is returned.
-func ExtractAuthToken(header http.Header) string {
-	if auth := strings.TrimSpace(header.Get("Authorization")); auth != "" {
-		fields := strings.Fields(auth)
-		if len(fields) == 2 && strings.EqualFold(fields[0], "Bearer") {
-			return fields[1]
-		}
-	}
-	if apiKey := strings.TrimSpace(header.Get("X-Api-Key")); apiKey != "" {
-		return apiKey
-	}
-	return ""
-}
@@ -126,12 +126,20 @@ AI BRIDGE OPTIONS:
          requests (requires the "oauth2" and "mcp-server-http" experiments to
          be enabled).

+      --aibridge-max-concurrency int, $CODER_AIBRIDGE_MAX_CONCURRENCY (default: 0)
+          Maximum number of concurrent AI Bridge requests per replica. Set to 0
+          to disable (unlimited).
+
      --aibridge-openai-base-url string, $CODER_AIBRIDGE_OPENAI_BASE_URL (default: https://api.openai.com/v1/)
          The base URL of the OpenAI API.

      --aibridge-openai-key string, $CODER_AIBRIDGE_OPENAI_KEY
          The key to authenticate against the OpenAI API.

+      --aibridge-rate-limit int, $CODER_AIBRIDGE_RATE_LIMIT (default: 0)
+          Maximum number of AI Bridge requests per second per replica. Set to 0
+          to disable (unlimited).
+
 CLIENT OPTIONS: 
 These options change the behavior of how clients interact with the Coder.
 Clients include the Coder CLI, Coder Desktop, IDE extensions, and the web UI.
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"net/http"
 	"strings"
+	"time"

 	"github.com/go-chi/chi/v5"
 	"github.com/google/uuid"
@@ -23,10 +24,19 @@ import (
 const (
 	maxListInterceptionsLimit     = 1000
 	defaultListInterceptionsLimit = 100
+	// aiBridgeRateLimitWindow is the fixed duration for rate limiting AI Bridge
+	// requests. This is hardcoded to keep configuration simple.
+	aiBridgeRateLimitWindow = time.Second
 )

 // aibridgeHandler handles all aibridged-related endpoints.
 func aibridgeHandler(api *API, middlewares ...func(http.Handler) http.Handler) func(r chi.Router) {
+	// Build the overload protection middleware chain for the aibridged handler.
+	// These limits are applied per-replica.
+	bridgeCfg := api.DeploymentValues.AI.BridgeConfig
+	concurrencyLimiter := httpmw.ConcurrencyLimit(bridgeCfg.MaxConcurrency.Value(), "AI Bridge")
+	rateLimiter := httpmw.RateLimitByAuthToken(int(bridgeCfg.RateLimit.Value()), aiBridgeRateLimitWindow)
+
 	return func(r chi.Router) {
 		r.Use(api.RequireFeatureMW(codersdk.FeatureAIBridge))
 		r.Group(func(r chi.Router) {
@@ -34,25 +44,30 @@ func aibridgeHandler(api *API, middlewares ...func(http.Handler) http.Handler) f
 			r.Get("/interceptions", api.aiBridgeListInterceptions)
 		})

-		// This is a bit funky but since aibridge only exposes a HTTP
-		// handler, this is how it has to be.
-		r.HandleFunc("/*", func(rw http.ResponseWriter, r *http.Request) {
-			if api.aibridgedHandler == nil {
-				httpapi.Write(r.Context(), rw, http.StatusNotFound, codersdk.Response{
-					Message: "aibridged handler not mounted",
-				})
-				return
-			}
-
-			// Strip either the experimental or stable prefix.
-			// TODO: experimental route is deprecated and must be removed with Beta.
-			prefixes := []string{"/api/experimental/aibridge", "/api/v2/aibridge"}
-			for _, prefix := range prefixes {
-				if strings.Contains(r.URL.String(), prefix) {
-					http.StripPrefix(prefix, api.aibridgedHandler).ServeHTTP(rw, r)
-					break
+		// Apply overload protection middleware to the aibridged handler.
+		// Concurrency limit is checked first for faster rejection under load.
+		r.Group(func(r chi.Router) {
+			r.Use(concurrencyLimiter, rateLimiter)
+			// This is a bit funky but since aibridge only exposes a HTTP
+			// handler, this is how it has to be.
+			r.HandleFunc("/*", func(rw http.ResponseWriter, r *http.Request) {
+				if api.aibridgedHandler == nil {
+					httpapi.Write(r.Context(), rw, http.StatusNotFound, codersdk.Response{
+						Message: "aibridged handler not mounted",
+					})
+					return
 				}
-			}
+
+				// Strip either the experimental or stable prefix.
+				// TODO: experimental route is deprecated and must be removed with Beta.
+				prefixes := []string{"/api/experimental/aibridge", "/api/v2/aibridge"}
+				for _, prefix := range prefixes {
+					if strings.Contains(r.URL.String(), prefix) {
+						http.StripPrefix(prefix, api.aibridgedHandler).ServeHTTP(rw, r)
+						break
+					}
+				}
+			})
 		})
 	}
 }
@@ -703,3 +703,135 @@ func TestAIBridgeRouting(t *testing.T) {
 		})
 	}
 }
+
+func TestAIBridgeRateLimiting(t *testing.T) {
+	t.Parallel()
+
+	dv := coderdtest.DeploymentValues(t)
+	// Set a low rate limit for testing.
+	dv.AI.BridgeConfig.RateLimit = 2
+
+	client, closer, api, _ := coderdenttest.NewWithAPI(t, &coderdenttest.Options{
+		Options: &coderdtest.Options{
+			DeploymentValues: dv,
+		},
+		LicenseOptions: &coderdenttest.LicenseOptions{
+			Features: license.Features{
+				codersdk.FeatureAIBridge: 1,
+			},
+		},
+	})
+	t.Cleanup(func() {
+		_ = closer.Close()
+	})
+
+	// Register a simple test handler.
+	testHandler := http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
+		rw.WriteHeader(http.StatusOK)
+	})
+	api.RegisterInMemoryAIBridgedHTTPHandler(testHandler)
+
+	ctx := testutil.Context(t, testutil.WaitLong)
+	httpClient := &http.Client{}
+	url := client.URL.String() + "/api/v2/aibridge/test"
+
+	// Make requests up to the limit - should succeed.
+	for range 2 {
+		req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil)
+		require.NoError(t, err)
+		req.Header.Set(codersdk.SessionTokenHeader, client.SessionToken())
+
+		resp, err := httpClient.Do(req)
+		require.NoError(t, err)
+		_ = resp.Body.Close()
+		require.Equal(t, http.StatusOK, resp.StatusCode)
+	}
+
+	// Next request should be rate limited.
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil)
+	require.NoError(t, err)
+	req.Header.Set(codersdk.SessionTokenHeader, client.SessionToken())
+
+	resp, err := httpClient.Do(req)
+	require.NoError(t, err)
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusTooManyRequests, resp.StatusCode)
+	require.NotEmpty(t, resp.Header.Get("Retry-After"))
+}
+
+func TestAIBridgeConcurrencyLimiting(t *testing.T) {
+	t.Parallel()
+
+	dv := coderdtest.DeploymentValues(t)
+	// Set a low concurrency limit for testing.
+	dv.AI.BridgeConfig.MaxConcurrency = 1
+
+	client, closer, api, _ := coderdenttest.NewWithAPI(t, &coderdenttest.Options{
+		Options: &coderdtest.Options{
+			DeploymentValues: dv,
+		},
+		LicenseOptions: &coderdenttest.LicenseOptions{
+			Features: license.Features{
+				codersdk.FeatureAIBridge: 1,
+			},
+		},
+	})
+	t.Cleanup(func() {
+		_ = closer.Close()
+	})
+
+	// Register a handler that blocks until signaled.
+	started := make(chan struct{})
+	unblock := make(chan struct{})
+	testHandler := http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
+		started <- struct{}{}
+		<-unblock
+		rw.WriteHeader(http.StatusOK)
+	})
+	api.RegisterInMemoryAIBridgedHTTPHandler(testHandler)
+
+	ctx := testutil.Context(t, testutil.WaitLong)
+	httpClient := &http.Client{}
+	url := client.URL.String() + "/api/v2/aibridge/test"
+
+	// Start a request that will block.
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil)
+		if err != nil {
+			return
+		}
+		req.Header.Set(codersdk.SessionTokenHeader, client.SessionToken())
+
+		resp, err := httpClient.Do(req)
+		if err == nil {
+			_ = resp.Body.Close()
+		}
+	}()
+
+	// Wait for the first request to start processing.
+	select {
+	case <-started:
+	case <-ctx.Done():
+		t.Fatal("timed out waiting for first request to start")
+	}
+
+	// Second request should be rejected with 503.
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil)
+	require.NoError(t, err)
+	req.Header.Set(codersdk.SessionTokenHeader, client.SessionToken())
+
+	resp, err := httpClient.Do(req)
+	require.NoError(t, err)
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode)
+
+	// Unblock the first request and wait for it to complete.
+	close(unblock)
+	select {
+	case <-done:
+	case <-ctx.Done():
+		t.Fatal("timed out waiting for first request to complete")
+	}
+}
@@ -33,6 +33,8 @@ export interface AIBridgeConfig {
 	readonly bedrock: AIBridgeBedrockConfig;
 	readonly inject_coder_mcp_tools: boolean;
 	readonly retention: number;
+	readonly max_concurrency: number;
+	readonly rate_limit: number;
 }

 // From codersdk/aibridge.go