Files
coder/coderd/x/chatd/chattool/createworkspace.go
T
Ethan 61e31ec5cc perf(coderd/x/chatd): persist workspace agent binding across chat turns (#23274)
## Summary

This change removes the steady-state "resolve the latest workspace
agent" query from chat execution.

Instead of asking the database for the latest build's agent on every
turn, a chat now persists the workspace/build/agent binding it actually
uses and reuses that binding across subsequent turns. The common path
becomes "load the bound agent by ID and dial it", with fallback paths to
repair the binding when it is missing, stale, or intentionally changed.

## What changes

- add `workspace_id`, `build_id`, and `agent_id` binding fields to
`chats`
- expose those fields through the chat API / SDK so the execution
context is explicit
- load the persisted binding first in chatd, instead of always resolving
the latest build's agent
- persist a refreshed binding when chatd has to re-resolve the workspace
agent
- keep child / subagent chats on the same bound workspace context by
inheriting the parent binding
- leave `build_id` / `agent_id` unset for flows like `create_workspace`,
then bind them lazily on the next agent-backed turn

## Runtime behavior

The binding is treated as an optimistic cache of the agent a chat should
use:

- if the bound agent still exists and dials successfully, we use it
without a latest-build lookup
- if the bound agent is missing or no longer reachable, chatd
re-resolves against the latest build and persists the new binding
- if a workspace mutation changes the chat's target workspace, the
binding is updated as part of that mutation

To avoid reintroducing a hot-path query, dialing uses lazy validation:

- start dialing the cached agent immediately
- only validate against the latest build if the dial is still pending
after a short delay
- if validation finds a different agent, cancel the stale dial, switch
to the current agent, and persist the repaired binding

## Result

The hot path stops issuing
`GetWorkspaceAgentsInLatestBuildByWorkspaceID` for every user message,
which is the source of the DB pressure this PR is addressing. At the
same time, chats still converge to the correct workspace agent when the
binding becomes stale due to rebuilds or explicit workspace changes.
2026-03-26 17:22:38 +11:00

543 lines
16 KiB
Go

package chattool
import (
"context"
"errors"
"fmt"
"strings"
"sync"
"time"
"charm.land/fantasy"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/util/namesgenerator"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
)
const (
	// buildTimeout caps the total time spent waiting for a workspace
	// build to reach a terminal state.
	buildTimeout = 10 * time.Minute

	// buildPollInterval is the delay between successive checks of the
	// workspace build's provisioner job status.
	buildPollInterval = 2 * time.Second

	// agentConnectTimeout caps the total time spent trying to reach the
	// workspace agent once the build has succeeded.
	agentConnectTimeout = 2 * time.Minute

	// agentAttemptTimeout bounds each individual connection attempt made
	// inside the agent retry loop.
	agentAttemptTimeout = 5 * time.Second

	// agentRetryInterval is the delay between successive agent
	// connection attempts.
	agentRetryInterval = 2 * time.Second

	// startupScriptTimeout caps the total time spent waiting for the
	// agent's startup scripts to finish once the agent is reachable.
	startupScriptTimeout = 10 * time.Minute

	// startupScriptPollInterval is the delay between successive reads of
	// the agent's lifecycle state while startup scripts are running.
	startupScriptPollInterval = 2 * time.Second
)
// CreateWorkspaceFn is the callback the create_workspace tool invokes to
// actually create a workspace. It receives the owner the workspace is
// created for and the fully populated create request, and returns the
// resulting workspace or an error.
type CreateWorkspaceFn func(ctx context.Context, ownerID uuid.UUID, req codersdk.CreateWorkspaceRequest) (codersdk.Workspace, error)
// AgentConnFunc opens a connection to the workspace agent identified by
// agentID. On success it returns the connection together with a release
// function; callers in this file invoke the release function once they
// are done with the connection.
type AgentConnFunc func(ctx context.Context, agentID uuid.UUID) (workspacesdk.AgentConn, func(), error)
// CreateWorkspaceOptions configures the create_workspace tool.
type CreateWorkspaceOptions struct {
// DB is the store used for chat, workspace, build, and agent lookups.
// When nil, the tool skips every database-backed step (existing
// workspace check, TTL lookup, build wait, binding persistence).
DB database.Store
// OwnerID is the user the workspace is created for; it is passed to
// CreateFn and used to set up the owner context for DB lookups.
OwnerID uuid.UUID
// ChatID identifies the chat whose workspace binding is checked and
// updated. When uuid.Nil, no binding is read or persisted.
ChatID uuid.UUID
// CreateFn performs the actual workspace creation. The tool returns an
// error response if it is nil.
CreateFn CreateWorkspaceFn
// AgentConnFn dials the workspace agent while waiting for it to become
// reachable. When nil, the connection phase of the wait is skipped.
AgentConnFn AgentConnFunc
// AgentInactiveDisconnectTimeout is passed to the agent's Status
// computation when deciding whether an existing workspace is usable.
AgentInactiveDisconnectTimeout time.Duration
// WorkspaceMu, when non-nil, is held for the duration of a tool call to
// serialize workspace creation.
WorkspaceMu *sync.Mutex
// OnChatUpdated, when non-nil, is called with the updated chat after
// the workspace binding has been persisted successfully.
OnChatUpdated func(database.Chat)
// Logger receives diagnostics for non-fatal failures.
Logger slog.Logger
// AllowedTemplateIDs supplies the set of template IDs handed to
// isTemplateAllowed to decide whether a requested template may be used.
AllowedTemplateIDs func() map[uuid.UUID]bool
}
// createWorkspaceArgs is the JSON argument payload accepted by the
// create_workspace tool.
type createWorkspaceArgs struct {
// TemplateID is the required template UUID in string form; it is
// trimmed and parsed before use.
TemplateID string `json:"template_id"`
// Name is the optional workspace name. When empty or invalid, a name
// is generated instead.
Name string `json:"name,omitempty"`
// Parameters maps template parameter names to values; each entry is
// forwarded as a rich parameter value on the create request.
Parameters map[string]string `json:"parameters,omitempty"`
}
// CreateWorkspace returns the "create_workspace" agent tool, which
// provisions a new workspace from a template for the configured owner.
//
// The tool is idempotent: if the chat already has a workspace that is
// building or running, the existing workspace is reported instead of a
// new one being created. When options.WorkspaceMu is set it is held for
// the whole call so parallel tool calls cannot create duplicates.
//
// All failures are reported to the model as text error responses; the
// returned Go error is always nil.
func CreateWorkspace(options CreateWorkspaceOptions) fantasy.AgentTool {
	return fantasy.NewAgentTool(
		"create_workspace",
		"Create a new workspace from a template. Requires a "+
			"template_id (from list_templates). Optionally provide "+
			"a name and parameter values (from read_template). "+
			"If no name is given, one will be generated. "+
			"This tool is idempotent — if the chat already has a "+
			"workspace that is building or running, the existing "+
			"workspace is returned.",
		func(ctx context.Context, args createWorkspaceArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
			if options.CreateFn == nil {
				return fantasy.NewTextErrorResponse("workspace creator is not configured"), nil
			}

			// Validate the requested template before taking any locks.
			templateIDStr := strings.TrimSpace(args.TemplateID)
			if templateIDStr == "" {
				return fantasy.NewTextErrorResponse("template_id is required; use list_templates to find one"), nil
			}
			templateID, err := uuid.Parse(templateIDStr)
			if err != nil {
				return fantasy.NewTextErrorResponse(
					xerrors.Errorf("invalid template_id: %w", err).Error(),
				), nil
			}
			if !isTemplateAllowed(options.AllowedTemplateIDs, templateID) {
				return fantasy.NewTextErrorResponse("template not available for chat workspaces; use list_templates to find allowed templates"), nil
			}

			// Serialize workspace creation to prevent parallel
			// tool calls from creating duplicate workspaces.
			if options.WorkspaceMu != nil {
				options.WorkspaceMu.Lock()
				defer options.WorkspaceMu.Unlock()
			}

			// Reuse the chat's existing workspace when it is still
			// building, running, or merely stopped.
			existing, done, existErr := options.checkExistingWorkspace(ctx)
			if existErr != nil {
				return fantasy.NewTextErrorResponse(existErr.Error()), nil
			}
			if done {
				return toolResponse(existing), nil
			}

			ownerID := options.OwnerID

			// Set up dbauthz context for DB lookups.
			if options.DB != nil {
				ownerCtx, ownerErr := asOwner(ctx, options.DB, ownerID)
				if ownerErr != nil {
					return fantasy.NewTextErrorResponse(ownerErr.Error()), nil
				}
				ctx = ownerCtx
			}

			// Resolve the optional chat workspace TTL. Any failure to
			// read or parse it is non-fatal: the TTL is left nil so the
			// template default applies.
			var ttlMs *int64
			if options.DB != nil {
				raw, ttlErr := options.DB.GetChatWorkspaceTTL(ctx)
				if ttlErr != nil {
					options.Logger.Error(ctx, "failed to read chat workspace TTL setting, using template default",
						slog.Error(ttlErr),
					)
				} else {
					d, parseErr := codersdk.ParseChatWorkspaceTTL(raw)
					if parseErr != nil {
						options.Logger.Warn(ctx, "invalid chat workspace TTL setting, using template default",
							slog.F("raw", raw),
							slog.Error(parseErr),
						)
					} else if d > 0 {
						ms := d.Milliseconds()
						ttlMs = &ms
					}
				}
			}

			createReq := codersdk.CreateWorkspaceRequest{
				TemplateID: templateID,
				TTLMillis:  ttlMs,
			}

			// Resolve workspace name. An empty name is generated from
			// the template name (best effort); an invalid name is used
			// as the seed for a generated, valid one.
			name := strings.TrimSpace(args.Name)
			if name == "" {
				seed := "workspace"
				if options.DB != nil {
					if t, lookupErr := options.DB.GetTemplateByID(ctx, templateID); lookupErr == nil {
						seed = t.Name
					}
				}
				name = generatedWorkspaceName(seed)
			} else if err := codersdk.NameValid(name); err != nil {
				name = generatedWorkspaceName(name)
			}
			createReq.Name = name

			// Map parameters.
			for k, v := range args.Parameters {
				createReq.RichParameterValues = append(
					createReq.RichParameterValues,
					codersdk.WorkspaceBuildParameter{Name: k, Value: v},
				)
			}

			workspace, err := options.CreateFn(ctx, ownerID, createReq)
			if err != nil {
				return fantasy.NewTextErrorResponse(err.Error()), nil
			}

			// Wait for the build to complete so subsequent tools can
			// use the workspace immediately.
			//
			// NOTE(review): when this wait fails (build error, timeout,
			// or context cancellation) we return before the chat's
			// workspace binding below is persisted, so the chat keeps no
			// record of the workspace that was just created. Confirm
			// this is intended.
			if options.DB != nil {
				if err := waitForBuild(ctx, options.DB, workspace.ID); err != nil {
					return fantasy.NewTextErrorResponse(
						xerrors.Errorf("workspace build failed: %w", err).Error(),
					), nil
				}
			}

			// Look up the first agent of the latest build so we can
			// wait for it to become ready below. A lookup failure is
			// non-fatal (the workspace was created), but it is logged
			// so a skipped readiness wait is not silent.
			workspaceAgentID := uuid.Nil
			if options.DB != nil {
				agents, agentErr := options.DB.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
				switch {
				case agentErr != nil:
					options.Logger.Warn(ctx, "failed to look up workspace agents, skipping agent readiness wait",
						slog.F("workspace_id", workspace.ID),
						slog.Error(agentErr),
					)
				case len(agents) > 0:
					workspaceAgentID = agents[0].ID
				}
			}

			// Persist the workspace binding on the chat.
			if options.DB != nil && options.ChatID != uuid.Nil {
				updatedChat, err := options.DB.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{
					ID: options.ChatID,
					WorkspaceID: uuid.NullUUID{
						UUID:  workspace.ID,
						Valid: true,
					},
					// BuildID and AgentID are intentionally left null
					// here. The chatd runtime (loadWorkspaceAgentLocked)
					// will bind them on the next turn. Authoritative
					// tool-path binding is deferred to a follow-up PR.
					BuildID: uuid.NullUUID{},
					AgentID: uuid.NullUUID{},
				})
				if err != nil {
					options.Logger.Error(ctx, "failed to persist chat workspace association",
						slog.F("chat_id", options.ChatID),
						slog.F("workspace_id", workspace.ID),
						slog.Error(err),
					)
				} else if options.OnChatUpdated != nil {
					options.OnChatUpdated(updatedChat)
				}
			}

			result := map[string]any{
				"created":        true,
				"workspace_name": workspace.FullName(),
			}

			// Wait for the agent to come online and startup scripts to
			// finish, merging the readiness fields into the response.
			if workspaceAgentID != uuid.Nil {
				for k, v := range waitForAgentReady(ctx, options.DB, workspaceAgentID, options.AgentConnFn) {
					result[k] = v
				}
			}
			return toolResponse(result), nil
		})
}
// checkExistingWorkspace decides whether the chat's currently bound
// workspace should be reused instead of creating a new one.
//
// It returns (result, true, nil) when the caller should stop and report
// result: the workspace was building and has now finished, is stopped,
// or has an agent that is connected or connecting. It returns
// (nil, false, nil) when creation should proceed: no DB or chat is
// configured, the chat has no workspace, the workspace is deleted, its
// build or job cannot be loaded, the job did not succeed, or no agent
// is connected or connecting. A non-nil error is returned when the chat
// or workspace cannot be loaded, or when waiting on an in-progress
// build fails.
func (o CreateWorkspaceOptions) checkExistingWorkspace(
	ctx context.Context,
) (map[string]any, bool, error) {
	if o.DB == nil || o.ChatID == uuid.Nil {
		return nil, false, nil
	}
	store := o.DB

	boundChat, err := store.GetChatByID(ctx, o.ChatID)
	if err != nil {
		return nil, false, xerrors.Errorf("load chat: %w", err)
	}
	if !boundChat.WorkspaceID.Valid {
		return nil, false, nil
	}

	ws, err := store.GetWorkspaceByID(ctx, boundChat.WorkspaceID.UUID)
	if err != nil {
		return nil, false, xerrors.Errorf("load workspace: %w", err)
	}
	if ws.Deleted {
		// Soft-deleted workspace: a replacement may be created.
		return nil, false, nil
	}

	// If the latest build or its job cannot be read we cannot tell what
	// state the workspace is in, so creation is allowed.
	latestBuild, err := store.GetLatestWorkspaceBuildByWorkspaceID(ctx, ws.ID)
	if err != nil {
		return nil, false, nil
	}
	provJob, err := store.GetProvisionerJobByID(ctx, latestBuild.JobID)
	if err != nil {
		return nil, false, nil
	}

	// awaitAgent waits for the agent and folds the readiness fields
	// into dst.
	awaitAgent := func(dst map[string]any, agentID uuid.UUID, connFn AgentConnFunc) {
		for key, val := range waitForAgentReady(ctx, store, agentID, connFn) {
			dst[key] = val
		}
	}

	jobStatus := provJob.JobStatus

	// A build is still in flight: wait for it rather than starting a
	// second workspace.
	if jobStatus == database.ProvisionerJobStatusPending ||
		jobStatus == database.ProvisionerJobStatusRunning {
		if waitErr := waitForBuild(ctx, store, ws.ID); waitErr != nil {
			return nil, false, xerrors.Errorf(
				"existing workspace build failed: %w", waitErr,
			)
		}
		reply := map[string]any{
			"created":        false,
			"workspace_name": ws.Name,
			"status":         "already_exists",
			"message":        "workspace build completed",
		}
		builtAgents, lookupErr := store.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID)
		if lookupErr == nil && len(builtAgents) > 0 {
			awaitAgent(reply, builtAgents[0].ID, o.AgentConnFn)
		}
		return reply, true, nil
	}

	// Anything other than a successful job (failed, canceled, ...)
	// means the workspace is not usable, so creation is allowed.
	if jobStatus != database.ProvisionerJobStatusSucceeded {
		return nil, false, nil
	}

	// A stopped workspace should be started, not replaced.
	if latestBuild.Transition == database.WorkspaceTransitionStop {
		return map[string]any{
			"created":        false,
			"workspace_name": ws.Name,
			"status":         "stopped",
			"message":        "workspace is stopped; use start_workspace to start it",
		}, true, nil
	}

	// The build succeeded; the first agent's DB-backed connection
	// status decides whether the workspace is still usable.
	agents, agentsErr := store.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID)
	if agentsErr != nil || len(agents) == 0 {
		return nil, false, nil
	}

	reply := map[string]any{
		"created":        false,
		"workspace_name": ws.Name,
		"status":         "already_exists",
	}
	switch agents[0].Status(o.AgentInactiveDisconnectTimeout).Status {
	case database.WorkspaceAgentStatusConnected:
		reply["message"] = "workspace is already running and recently connected"
		// Already connected: skip the dial phase and only wait for
		// startup scripts.
		awaitAgent(reply, agents[0].ID, nil)
	case database.WorkspaceAgentStatusConnecting:
		reply["message"] = "workspace exists and the agent is still connecting"
		awaitAgent(reply, agents[0].ID, o.AgentConnFn)
	default:
		// Disconnected or timed out: the agent is offline or never
		// became ready, so creation is allowed.
		return nil, false, nil
	}
	return reply, true, nil
}
// waitForBuild blocks until the workspace's latest build reaches a
// terminal provisioner job status, polling every buildPollInterval for
// at most buildTimeout.
//
// It returns nil when the job succeeded. It returns an error when the
// job failed (using the job's error text when present), was canceled,
// reported an unexpected status, could not be loaded, or when the
// context finished before the build completed.
func waitForBuild(
	ctx context.Context,
	db database.Store,
	workspaceID uuid.UUID,
) error {
	deadlineCtx, stop := context.WithTimeout(ctx, buildTimeout)
	defer stop()

	poll := time.NewTicker(buildPollInterval)
	defer poll.Stop()

	for {
		latest, err := db.GetLatestWorkspaceBuildByWorkspaceID(deadlineCtx, workspaceID)
		if err != nil {
			return xerrors.Errorf("get latest build: %w", err)
		}
		provJob, err := db.GetProvisionerJobByID(deadlineCtx, latest.JobID)
		if err != nil {
			return xerrors.Errorf("get provisioner job: %w", err)
		}

		status := provJob.JobStatus
		if status == database.ProvisionerJobStatusSucceeded {
			return nil
		}
		if status == database.ProvisionerJobStatusFailed {
			if provJob.Error.Valid {
				return xerrors.New(provJob.Error.String)
			}
			return xerrors.New("build failed")
		}
		if status == database.ProvisionerJobStatusCanceled {
			return xerrors.New("build was canceled")
		}

		// Pending, running, and canceling jobs are still in flight;
		// any other status at this point is unexpected.
		inFlight := status == database.ProvisionerJobStatusPending ||
			status == database.ProvisionerJobStatusRunning ||
			status == database.ProvisionerJobStatusCanceling
		if !inFlight {
			return xerrors.Errorf("unexpected job status: %s", status)
		}

		select {
		case <-poll.C:
			// Next polling round.
		case <-deadlineCtx.Done():
			return xerrors.Errorf(
				"timed out waiting for workspace build: %w",
				deadlineCtx.Err(),
			)
		}
	}
}
// waitForAgentReady waits, in two phases, for a workspace agent to be
// usable and returns status fields meant to be merged into a tool
// response. An empty map means nothing noteworthy was observed.
//
// Phase 1 (skipped when agentConnFn is nil) retries connecting to the
// agent until one attempt succeeds or agentConnectTimeout elapses; on
// give-up the map carries "agent_status" and "agent_error".
//
// Phase 2 (skipped when db is nil) polls the agent's lifecycle state
// until it is ready, reaches another non-starting state, or
// startupScriptTimeout elapses; problems are reported under
// "startup_scripts" (and "lifecycle_state" for a non-ready state).
func waitForAgentReady(
	ctx context.Context,
	db database.Store,
	agentID uuid.UUID,
	agentConnFn AgentConnFunc,
) map[string]any {
	status := map[string]any{}

	// Phase 1: retry connecting to the agent.
	if agentConnFn != nil {
		dialCtx, stopDial := context.WithTimeout(ctx, agentConnectTimeout)
		defer stopDial()
		retry := time.NewTicker(agentRetryInterval)
		defer retry.Stop()

		for reachable := false; !reachable; {
			tryCtx, stopTry := context.WithTimeout(dialCtx, agentAttemptTimeout)
			_, release, dialErr := agentConnFn(tryCtx, agentID)
			stopTry()
			if dialErr == nil {
				// The connection is only a reachability probe, so it
				// is released straight away.
				release()
				reachable = true
				continue
			}

			select {
			case <-dialCtx.Done():
				status["agent_status"] = "not_ready"
				status["agent_error"] = dialErr.Error()
				return status
			case <-retry.C:
			}
		}
	}

	// Phase 2: poll lifecycle until startup scripts finish.
	if db == nil {
		return status
	}
	scriptsCtx, stopScripts := context.WithTimeout(ctx, startupScriptTimeout)
	defer stopScripts()
	poll := time.NewTicker(startupScriptPollInterval)
	defer poll.Stop()

	for {
		// A failed read is simply retried on the next tick.
		row, err := db.GetWorkspaceAgentLifecycleStateByID(scriptsCtx, agentID)
		if err == nil {
			state := row.LifecycleState
			stillStarting := state == database.WorkspaceAgentLifecycleStateCreated ||
				state == database.WorkspaceAgentLifecycleStateStarting
			switch {
			case state == database.WorkspaceAgentLifecycleStateReady:
				return status
			case !stillStarting:
				// Terminal non-ready state.
				status["startup_scripts"] = "startup_scripts_failed"
				status["lifecycle_state"] = string(state)
				return status
			}
		}

		select {
		case <-poll.C:
		case <-scriptsCtx.Done():
			outcome := "startup_scripts_unknown"
			if errors.Is(scriptsCtx.Err(), context.DeadlineExceeded) {
				outcome = "startup_scripts_timeout"
			}
			status["startup_scripts"] = outcome
			return status
		}
	}
}
// generatedWorkspaceName derives a workspace name from seed by
// normalizing it through codersdk.UsernameFrom, truncating the result to
// 27 bytes, and appending "-" plus four characters taken from a fresh
// UUID. "workspace" is used as the base whenever normalization or
// truncation leaves nothing. If the assembled name does not pass
// codersdk.NameValid, a name from namesgenerator is returned instead.
func generatedWorkspaceName(seed string) string {
	const (
		fallbackBase = "workspace"
		maxBaseLen   = 27
		suffixLen    = 4
	)

	normalized := strings.TrimSpace(strings.ToLower(seed))
	base := codersdk.UsernameFrom(normalized)
	if strings.TrimSpace(base) == "" {
		base = fallbackBase
	}
	if len(base) > maxBaseLen {
		// Truncation can leave a dangling separator; strip it.
		base = strings.Trim(base[:maxBaseLen], "-")
	}
	if base == "" {
		base = fallbackBase
	}

	compactUUID := strings.ReplaceAll(uuid.NewString(), "-", "")
	candidate := fmt.Sprintf("%s-%s", base, compactUUID[:suffixLen])
	if codersdk.NameValid(candidate) != nil {
		return namesgenerator.NameDigitWith("-")
	}
	return candidate
}