mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
61e31ec5cc
## Summary

This change removes the steady-state "resolve the latest workspace agent" query from chat execution. Instead of asking the database for the latest build's agent on every turn, a chat now persists the workspace/build/agent binding it actually uses and reuses that binding across subsequent turns. The common path becomes "load the bound agent by ID and dial it", with fallback paths to repair the binding when it is missing, stale, or intentionally changed.

## What changes

- add `workspace_id`, `build_id`, and `agent_id` binding fields to `chats`
- expose those fields through the chat API / SDK so the execution context is explicit
- load the persisted binding first in chatd, instead of always resolving the latest build's agent
- persist a refreshed binding when chatd has to re-resolve the workspace agent
- keep child / subagent chats on the same bound workspace context by inheriting the parent binding
- leave `build_id` / `agent_id` unset for flows like `create_workspace`, then bind them lazily on the next agent-backed turn

## Runtime behavior

The binding is treated as an optimistic cache of the agent a chat should use:

- if the bound agent still exists and dials successfully, we use it without a latest-build lookup
- if the bound agent is missing or no longer reachable, chatd re-resolves against the latest build and persists the new binding
- if a workspace mutation changes the chat's target workspace, the binding is updated as part of that mutation

To avoid reintroducing a hot-path query, dialing uses lazy validation:

- start dialing the cached agent immediately
- only validate against the latest build if the dial is still pending after a short delay
- if validation finds a different agent, cancel the stale dial, switch to the current agent, and persist the repaired binding

## Result

The hot path stops issuing `GetWorkspaceAgentsInLatestBuildByWorkspaceID` for every user message, which is the source of the DB pressure this PR is addressing.
At the same time, chats still converge to the correct workspace agent when the binding becomes stale due to rebuilds or explicit workspace changes.
171 lines
4.8 KiB
Go
171 lines
4.8 KiB
Go
package chatd
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"golang.org/x/xerrors"
|
|
|
|
"github.com/coder/coder/v2/codersdk/workspacesdk"
|
|
)
|
|
|
|
// DialResult contains the outcome of dialWithLazyValidation.
|
|
type DialResult struct {
|
|
Conn workspacesdk.AgentConn
|
|
Release func()
|
|
AgentID uuid.UUID // The agent that was actually dialed.
|
|
WasSwitched bool // True if validation discovered a different agent.
|
|
}
|
|
|
|
// DialFunc dials an agent by ID and returns a connection.
|
|
type DialFunc func(ctx context.Context, id uuid.UUID) (workspacesdk.AgentConn, func(), error)
|
|
|
|
// ValidateFunc returns the current agent ID for a workspace.
|
|
type ValidateFunc func(ctx context.Context, workspaceID uuid.UUID) (uuid.UUID, error)
|
|
|
|
type dialOut struct {
|
|
conn workspacesdk.AgentConn
|
|
release func()
|
|
err error
|
|
}
|
|
|
|
// dialWithLazyValidation dials an agent and only consults the database if the
|
|
// original dial is slow or fails quickly. This keeps the common path free of
|
|
// latest-build lookups while still repairing stale bindings.
|
|
//
|
|
// Outcomes:
|
|
// - The dial succeeds before delay, so validation is skipped.
|
|
// - The timer fires and validation confirms the same agent, so the original
|
|
// dial continues.
|
|
// - The timer fires and validation finds a different agent, so the stale
|
|
// dial is canceled and the new agent is dialed instead.
|
|
// - The dial fails before delay, so validation runs immediately and either
|
|
// switches to a different agent or retries the current one once.
|
|
func dialWithLazyValidation(
|
|
ctx context.Context,
|
|
agentID uuid.UUID,
|
|
workspaceID uuid.UUID,
|
|
dialFn DialFunc,
|
|
validateFn ValidateFunc,
|
|
delay time.Duration,
|
|
) (DialResult, error) {
|
|
wrapErr := func(err error) error {
|
|
return xerrors.Errorf("dial with lazy validation: %w", err)
|
|
}
|
|
|
|
dialCtx, dialCancel := context.WithCancel(ctx)
|
|
results := make(chan dialOut, 1)
|
|
go func() {
|
|
conn, release, err := dialFn(dialCtx, agentID)
|
|
results <- dialOut{conn: conn, release: release, err: err}
|
|
}()
|
|
|
|
drained := false
|
|
defer func() {
|
|
dialCancel()
|
|
if drained {
|
|
return
|
|
}
|
|
// Drain without blocking the caller. dialFn may take time to honor
|
|
// cancellation, but any late-arriving successful connection still needs to
|
|
// be released.
|
|
go func() {
|
|
result := <-results
|
|
if result.err == nil && result.release != nil {
|
|
result.release()
|
|
}
|
|
}()
|
|
}()
|
|
|
|
resultForAgent := func(dialedAgentID uuid.UUID, result dialOut, switched bool) DialResult {
|
|
return DialResult{
|
|
Conn: result.conn,
|
|
Release: result.release,
|
|
AgentID: dialedAgentID,
|
|
WasSwitched: switched,
|
|
}
|
|
}
|
|
dialAgent := func(targetAgentID uuid.UUID, switched bool) (DialResult, error) {
|
|
conn, release, err := dialFn(ctx, targetAgentID)
|
|
if err != nil {
|
|
return DialResult{}, wrapErr(err)
|
|
}
|
|
return resultForAgent(targetAgentID, dialOut{conn: conn, release: release}, switched), nil
|
|
}
|
|
preferReadyOriginalDial := func() (DialResult, bool) {
|
|
select {
|
|
case result := <-results:
|
|
drained = true
|
|
if result.err != nil {
|
|
return DialResult{}, false
|
|
}
|
|
return resultForAgent(agentID, result, false), true
|
|
default:
|
|
return DialResult{}, false
|
|
}
|
|
}
|
|
waitForOriginalDial := func(waitCtx context.Context) (DialResult, error) {
|
|
select {
|
|
case result := <-results:
|
|
drained = true
|
|
if result.err != nil {
|
|
return DialResult{}, wrapErr(result.err)
|
|
}
|
|
return resultForAgent(agentID, result, false), nil
|
|
case <-waitCtx.Done():
|
|
if ready, ok := preferReadyOriginalDial(); ok {
|
|
return ready, nil
|
|
}
|
|
return DialResult{}, waitCtx.Err()
|
|
}
|
|
}
|
|
validateBinding := func() (uuid.UUID, error) {
|
|
validatedAgentID, err := validateFn(ctx, workspaceID)
|
|
if err != nil {
|
|
return uuid.Nil, wrapErr(err)
|
|
}
|
|
return validatedAgentID, nil
|
|
}
|
|
resolveFastFailure := func() (DialResult, error) {
|
|
validatedAgentID, err := validateBinding()
|
|
if err != nil {
|
|
return DialResult{}, err
|
|
}
|
|
if validatedAgentID == agentID {
|
|
return dialAgent(agentID, false)
|
|
}
|
|
return dialAgent(validatedAgentID, true)
|
|
}
|
|
|
|
timer := time.NewTimer(delay)
|
|
defer timer.Stop()
|
|
|
|
select {
|
|
case result := <-results:
|
|
drained = true
|
|
if result.err == nil {
|
|
return resultForAgent(agentID, result, false), nil
|
|
}
|
|
return resolveFastFailure()
|
|
|
|
case <-timer.C:
|
|
validatedAgentID, validationErr := validateFn(ctx, workspaceID)
|
|
if validationErr != nil || validatedAgentID == agentID {
|
|
// Validation could not prove the binding was stale, so keep waiting on
|
|
// the original dial.
|
|
return waitForOriginalDial(ctx)
|
|
}
|
|
// The original dial is stale. Cancel it first, then let the deferred drain
|
|
// release any late result while we dial the validated agent immediately.
|
|
dialCancel()
|
|
return dialAgent(validatedAgentID, true)
|
|
|
|
case <-ctx.Done():
|
|
if ready, ok := preferReadyOriginalDial(); ok {
|
|
return ready, nil
|
|
}
|
|
return DialResult{}, ctx.Err()
|
|
}
|
|
}
|