Files
coder/coderd/x/chatd/dialvalidation.go
T
Ethan 61e31ec5cc perf(coderd/x/chatd): persist workspace agent binding across chat turns (#23274)
## Summary

This change removes the steady-state "resolve the latest workspace
agent" query from chat execution.

Instead of asking the database for the latest build's agent on every
turn, a chat now persists the workspace/build/agent binding it actually
uses and reuses that binding across subsequent turns. The common path
becomes "load the bound agent by ID and dial it", with fallback paths to
repair the binding when it is missing, stale, or intentionally changed.

## What changes

- add `workspace_id`, `build_id`, and `agent_id` binding fields to
`chats`
- expose those fields through the chat API / SDK so the execution
context is explicit
- load the persisted binding first in chatd, instead of always resolving
the latest build's agent
- persist a refreshed binding when chatd has to re-resolve the workspace
agent
- keep child / subagent chats on the same bound workspace context by
inheriting the parent binding
- leave `build_id` / `agent_id` unset for flows like `create_workspace`,
then bind them lazily on the next agent-backed turn

## Runtime behavior

The binding is treated as an optimistic cache of the agent a chat should
use:

- if the bound agent still exists and dials successfully, we use it
without a latest-build lookup
- if the bound agent is missing or no longer reachable, chatd
re-resolves against the latest build and persists the new binding
- if a workspace mutation changes the chat's target workspace, the
binding is updated as part of that mutation

To avoid reintroducing a hot-path query, dialing uses lazy validation:

- start dialing the cached agent immediately
- only validate against the latest build if the dial is still pending
after a short delay
- if validation finds a different agent, cancel the stale dial, switch
to the current agent, and persist the repaired binding

## Result

The hot path stops issuing
`GetWorkspaceAgentsInLatestBuildByWorkspaceID` for every user message,
which is the source of the DB pressure this PR is addressing. At the
same time, chats still converge to the correct workspace agent when the
binding becomes stale due to rebuilds or explicit workspace changes.
2026-03-26 17:22:38 +11:00

171 lines
4.8 KiB
Go

package chatd
import (
"context"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/codersdk/workspacesdk"
)
// DialResult contains the outcome of dialWithLazyValidation.
//
// It is only meaningful when dialWithLazyValidation returns a nil error; on
// failure the zero value is returned.
type DialResult struct {
	// Conn is the connection returned by the DialFunc for AgentID.
	Conn workspacesdk.AgentConn
	// Release is the release callback returned by the DialFunc alongside Conn.
	// NOTE(review): presumably the caller must invoke it once it is done with
	// Conn — confirm against the DialFunc implementation.
	Release func()
	// The agent that was actually dialed.
	AgentID uuid.UUID
	// True if validation discovered a different agent.
	WasSwitched bool
}
// DialFunc dials an agent by ID and returns a connection.
//
// The second return value is a release callback for the connection.
// dialWithLazyValidation invokes it itself when it has to discard a
// successful dial whose result arrived too late to be used, so on success
// (nil error) it should be safe to call exactly once. The context passed in
// may be canceled while the dial is still in flight.
type DialFunc func(ctx context.Context, id uuid.UUID) (workspacesdk.AgentConn, func(), error)
// ValidateFunc returns the current agent ID for a workspace.
//
// dialWithLazyValidation compares the returned ID with the agent it was asked
// to dial: an equal ID means the binding is still valid, a different ID means
// the binding is stale and the returned agent should be dialed instead.
type ValidateFunc func(ctx context.Context, workspaceID uuid.UUID) (uuid.UUID, error)
// dialOut bundles the three return values of a DialFunc call so that they can
// be sent over a channel from the background dial goroutine.
type dialOut struct {
	// conn is the connection returned by the dial.
	conn workspacesdk.AgentConn
	// release is the release callback returned by the dial.
	release func()
	// err is the dial error, or nil on success.
	err error
}
// dialWithLazyValidation optimistically dials agentID in the background and
// falls back to validateFn only when that dial is slow (still pending after
// delay) or fails before delay elapses. The common case therefore performs no
// validation lookup at all, while stale bindings are still repaired.
//
// Possible outcomes:
//   - The original dial succeeds before delay: its connection is returned and
//     validateFn is never called.
//   - The original dial fails before delay: validateFn runs right away and
//     the agent it reports is dialed once. That is either a different agent
//     (WasSwitched is true) or a single retry of the same agent.
//   - delay elapses first and validateFn reports the same agent, or returns
//     an error: the function keeps waiting for the original dial.
//   - delay elapses first and validateFn reports a different agent: the
//     original dial is canceled and the reported agent is dialed instead
//     (WasSwitched is true).
//   - ctx ends first: an already-available successful original dial is still
//     returned; otherwise ctx.Err() is returned as-is.
//
// Dial and validation errors are wrapped with a "dial with lazy validation"
// prefix. On error the returned DialResult is the zero value.
func dialWithLazyValidation(
	ctx context.Context,
	agentID uuid.UUID,
	workspaceID uuid.UUID,
	dialFn DialFunc,
	validateFn ValidateFunc,
	delay time.Duration,
) (DialResult, error) {
	wrapErr := func(err error) error {
		return xerrors.Errorf("dial with lazy validation: %w", err)
	}

	// Start the optimistic dial under its own cancelable context so it can be
	// abandoned independently of ctx. The channel has capacity one, so the
	// goroutine's single send never blocks even if nobody is receiving.
	origCtx, cancelOrig := context.WithCancel(ctx)
	origCh := make(chan dialOut, 1)
	go func() {
		c, rel, dialErr := dialFn(origCtx, agentID)
		origCh <- dialOut{conn: c, release: rel, err: dialErr}
	}()

	// consumed is set as soon as this call has received the original dial's
	// outcome from origCh.
	consumed := false
	defer func() {
		cancelOrig()
		if !consumed {
			// The outcome is still outstanding. Collect it on a separate
			// goroutine so the caller is not held up while dialFn reacts to
			// the cancellation, and release the connection if the dial turns
			// out to have succeeded after all.
			go func() {
				if late := <-origCh; late.err == nil && late.release != nil {
					late.release()
				}
			}()
		}
	}()

	// originalResult converts a successful outcome of the original dial into
	// the value handed back to the caller.
	originalResult := func(out dialOut) DialResult {
		return DialResult{
			Conn:        out.conn,
			Release:     out.release,
			AgentID:     agentID,
			WasSwitched: false,
		}
	}

	// redial performs a fresh, blocking dial of target using ctx directly.
	redial := func(target uuid.UUID, switched bool) (DialResult, error) {
		c, rel, dialErr := dialFn(ctx, target)
		if dialErr != nil {
			return DialResult{}, wrapErr(dialErr)
		}
		return DialResult{
			Conn:        c,
			Release:     rel,
			AgentID:     target,
			WasSwitched: switched,
		}, nil
	}

	// pollOriginal checks, without blocking, whether the original dial has
	// finished. It reports true only for a successful dial; a finished but
	// failed dial is consumed and reported as not ready.
	pollOriginal := func() (DialResult, bool) {
		select {
		case out := <-origCh:
			consumed = true
			if out.err == nil {
				return originalResult(out), true
			}
		default:
		}
		return DialResult{}, false
	}

	// onContextDone handles ctx ending: a successful dial that is already
	// available wins over the context error.
	onContextDone := func() (DialResult, error) {
		if ready, ok := pollOriginal(); ok {
			return ready, nil
		}
		return DialResult{}, ctx.Err()
	}

	// awaitOriginal blocks until the original dial finishes or ctx ends.
	awaitOriginal := func() (DialResult, error) {
		select {
		case out := <-origCh:
			consumed = true
			if out.err != nil {
				return DialResult{}, wrapErr(out.err)
			}
			return originalResult(out), nil
		case <-ctx.Done():
			return onContextDone()
		}
	}

	// recoverFromFailure runs after the original dial failed before delay
	// elapsed: look up the current agent and dial it once, flagging a switch
	// only when it differs from the agent originally requested.
	recoverFromFailure := func() (DialResult, error) {
		current, err := validateFn(ctx, workspaceID)
		if err != nil {
			return DialResult{}, wrapErr(err)
		}
		return redial(current, current != agentID)
	}

	timer := time.NewTimer(delay)
	defer timer.Stop()

	select {
	case out := <-origCh:
		consumed = true
		if out.err != nil {
			return recoverFromFailure()
		}
		return originalResult(out), nil

	case <-timer.C:
		current, err := validateFn(ctx, workspaceID)
		if err == nil && current != agentID {
			// The binding is stale. Cancel the original dial right away; the
			// deferred cleanup releases whatever it may still produce while
			// the validated agent is dialed.
			cancelOrig()
			return redial(current, true)
		}
		// Validation failed or confirmed the same agent, so nothing proves
		// the binding stale: keep waiting for the original dial.
		return awaitOriginal()

	case <-ctx.Done():
		return onContextDone()
	}
}