Files
Michael Suchacz 8b1705eb65 feat: route chatd provider traffic through aibridge (#25629)
## Summary

Routes chatd model calls backed by concrete AI Provider rows through the
in-process aibridge transport by default, with deployment options to use
direct provider routing when AI Gateway is disabled or chat AI Gateway
routing is disabled.

- Splits model routing into common, direct provider, and AI Gateway
paths behind a single deployment-mode entry point.
- Builds chatd models through explicit request, route, and options data.
Active API key attribution is passed explicitly instead of being hidden
inside generic model construction.
- For AI Gateway BYOK routes, resolves the user's provider key in chatd,
forwards it through provider-specific auth headers, and sets
`X-Coder-AI-Governance-Token` to the `delegated` marker so aibridge
preserves those headers while still stripping Coder-specific metadata.
- Keeps central provider credentials and deployment fallback credentials
out of forwarded provider auth headers, so AI Gateway central policy
remains authoritative.
- Redacts delegated provider auth from default string formatting to
avoid accidental plaintext logging of user BYOK credentials.
- Covers selected chat models, advisor overrides, title and quickgen
paths, subagent overrides, computer use model selection, and an
integration-style chat turn through the aibridge transport path.
- Persists initiating API key IDs on chat and queued user messages,
including subagent child messages, and fails closed for AI
Gateway-routed model builds without an active key.
- Removes unused `api_key_id` indexes while keeping the persistence
columns and foreign keys.
- Keeps the deployment option available through config and env parsing,
but hides it from CLI help and generated docs.
- Stabilizes the subagent poll fallback test so background CreateChat
processing cannot win the state transition under slower CI environments.

## Tests

- `go test ./coderd/x/chatd -run
'TestAIGatewayProviderAuthForUser|TestAIGatewayProviderAuthRedactsFormatting|TestResolveModelRouteForConfigAIGatewayProviderAuth|TestAIGatewayModelForwardsProviderAuth|TestProcessChat_AIGatewayRoutingUsesDelegatedAPIKey|TestAwaitSubagentCompletion'
-count=1`
- `go test ./coderd/aibridged -run
'TestServeHTTP_DelegatedAPIKey|TestServeHTTP_StripCoderToken' -count=1`
- `git diff --check HEAD~1..HEAD`
- `make lint`

> Mux working on behalf of Mike.
2026-05-26 19:31:52 +00:00

145 lines
3.9 KiB
Go

package chatd
import (
"context"
"time"
"charm.land/fantasy"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/x/chatd/chatdebug"
"github.com/coder/coder/v2/coderd/x/chatd/chatprovider"
)
const (
// debugCleanupRetryDelay is how long scheduleDebugCleanup waits
// before each retry that follows a failed cleanup pass.
debugCleanupRetryDelay = 500 * time.Millisecond
// debugCleanupAttempts is the total number of cleanup passes
// scheduleDebugCleanup makes before it stops retrying.
debugCleanupAttempts = 3
// debugCleanupTimeout bounds each individual cleanup pass; every
// pass runs under its own context with this timeout.
debugCleanupTimeout = 5 * time.Second
// debugCreateRunTimeout caps how long a CreateRun insert can
// block the caller's critical path. Debug persistence is
// best-effort, so the turn proceeds without debug rows if the
// DB is slow or locked. Matches the manual-title budget.
debugCreateRunTimeout = 5 * time.Second
// debugCleanupClockSkew gives cleanup cutoffs tolerance for cross-
// replica clock drift. The cutoff is sampled from the DB
// (updated_at returned by the status transition), and
// chat_debug_runs.started_at is stamped by whatever replica
// processes the replacement turn. If that replica's clock lags
// the DB, its started_at can land behind a commit-time cutoff
// even though the insert physically happened after commit.
// Subtracting this buffer ensures the fast retry path cannot
// delete replacement rows when clocks drift by up to this
// amount; rows within the buffer survive the fast cleanup but
// are still finalized (and eligible for stale-sweep cleanup) by
// the existing FinalizeStale background loop.
debugCleanupClockSkew = 30 * time.Second
)
// debugService returns the server's chatdebug service, constructing it
// lazily when a factory is configured.
//
// A nil receiver yields nil. When no factory is set, the debugSvc field
// is returned as-is. Otherwise the factory is invoked through
// debugSvcInit.Do (presumably a sync.Once — confirm on the Server
// definition), its result is stored in debugSvc, and debugSvcReady
// records whether that result was non-nil.
func (p *Server) debugService() *chatdebug.Service {
	switch {
	case p == nil:
		return nil
	case p.debugSvcFactory == nil:
		return p.debugSvc
	}

	p.debugSvcInit.Do(func() {
		svc := p.debugSvcFactory()
		p.debugSvc = svc
		p.debugSvcReady.Store(svc != nil)
	})
	return p.debugSvc
}
// existingDebugService returns the debug service only if it is already
// available; unlike debugService it never invokes the factory.
//
// It returns nil for a nil receiver, and nil when a factory is
// configured but debugSvcReady has not been set to true. In every other
// case it returns the current debugSvc field.
func (p *Server) existingDebugService() *chatdebug.Service {
	if p == nil {
		return nil
	}
	// With a factory configured, debugSvc is only meaningful once the
	// ready flag has been stored by debugService.
	if p.debugSvcFactory != nil && !p.debugSvcReady.Load() {
		return nil
	}
	return p.debugSvc
}
// scheduleDebugCleanup runs cleanup against the debug service in a
// background goroutine, retrying on failure.
//
// It is a no-op when debugService returns nil. Otherwise the goroutine
// is registered on p.inflight and makes up to debugCleanupAttempts
// passes. Each pass runs under its own debugCleanupTimeout context
// derived from ctx with cancellation removed, so cancelling ctx does
// not abort the cleanup. A wait of debugCleanupRetryDelay precedes
// every pass after the first. The goroutine exits on the first pass
// that returns nil; each failed pass is logged at warn level using
// logMessage, the attempt counters, the caller's fields, and the error.
func (p *Server) scheduleDebugCleanup(
	ctx context.Context,
	logMessage string,
	fields []slog.Field,
	cleanup func(context.Context, *chatdebug.Service) error,
) {
	svc := p.debugService()
	if svc == nil {
		return
	}

	// The positive Add happens under inflightMu so that Close() cannot
	// run drainInflight concurrently while the counter sits at zero.
	// See drainInflight for the WaitGroup contract this preserves.
	p.inflightMu.Lock()
	p.inflight.Add(1)
	p.inflightMu.Unlock()

	go func() {
		defer p.inflight.Done()

		detached := context.WithoutCancel(ctx)

		// runPass executes one cleanup pass under its own timeout.
		runPass := func() error {
			passCtx, cancel := context.WithTimeout(detached, debugCleanupTimeout)
			defer cancel()
			return cleanup(passCtx, svc)
		}

		for pass := 1; pass <= debugCleanupAttempts; pass++ {
			if pass > 1 {
				// Back off before retrying a failed pass.
				retry := p.clock.NewTimer(debugCleanupRetryDelay, "chatd", "debug_cleanup")
				<-retry.C
			}

			err := runPass()
			if err == nil {
				return
			}

			logFields := make([]slog.Field, 0, len(fields)+3)
			logFields = append(logFields,
				slog.F("attempt", pass),
				slog.F("max_attempts", debugCleanupAttempts),
			)
			logFields = append(logFields, fields...)
			logFields = append(logFields, slog.Error(err))
			p.logger.Warn(detached, logMessage, logFields...)
		}
	}()
}
// newDebugAwareModel builds a language model for req and, when debug
// recording is enabled for the chat, wraps it with the chatdebug
// recorder.
//
// The provider hint from route is used to resolve the provider and
// model name; the resolved values replace req.ModelName and the route's
// provider hint before the model is built. opts.RecordHTTP is set to
// whether debugging is enabled for the chat and its owner.
//
// The boolean result reports whether debugging was enabled. It is false
// when resolution fails, and carries the enabled state when newModel
// itself fails. On success with debugging disabled the unwrapped model
// is returned.
func (p *Server) newDebugAwareModel(
	ctx context.Context,
	req modelClientRequest,
	route resolvedModelRoute,
	opts modelBuildOptions,
) (fantasy.LanguageModel, bool, error) {
	hint, err := route.providerHint()
	if err != nil {
		return nil, false, err
	}

	provider, modelName, err := chatprovider.ResolveModelWithProviderHint(req.ModelName, hint)
	if err != nil {
		return nil, false, err
	}
	route = route.withProviderHint(provider)
	req.ModelName = modelName

	// Debugging needs both a service and a per-chat opt-in.
	svc := p.debugService()
	recording := false
	if svc != nil {
		recording = svc.IsEnabled(ctx, req.Chat.ID, req.Chat.OwnerID)
	}
	opts.RecordHTTP = recording

	base, err := p.newModel(ctx, req, route, opts)
	switch {
	case err != nil:
		return nil, recording, err
	case !recording:
		return base, false, nil
	}

	recorderOpts := chatdebug.RecorderOptions{
		ChatID:   req.Chat.ID,
		OwnerID:  req.Chat.OwnerID,
		Provider: provider,
		Model:    modelName,
	}
	return chatdebug.WrapModel(base, svc, recorderOpts), true, nil
}