package chattool import ( "context" "database/sql" "errors" "fmt" "strings" "sync" "time" "charm.land/fantasy" "github.com/google/uuid" "golang.org/x/xerrors" "cdr.dev/slog/v3" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/httpapi/httperror" "github.com/coder/coder/v2/coderd/util/namesgenerator" "github.com/coder/coder/v2/coderd/x/chatd/internal/agentselect" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/workspacesdk" ) const ( // buildPollInterval is how often we check if the workspace // build has completed. buildPollInterval = 2 * time.Second // buildTimeout is the maximum time to wait for a workspace // build to complete before giving up. buildTimeout = 10 * time.Minute // agentConnectTimeout is the maximum time to wait for the // workspace agent to become reachable after a successful build. agentConnectTimeout = 2 * time.Minute // agentRetryInterval is how often we retry connecting to the // workspace agent. agentRetryInterval = 2 * time.Second // agentAttemptTimeout is the timeout for a single connection // attempt to the workspace agent during the retry loop. agentAttemptTimeout = 5 * time.Second // startupScriptTimeout is the maximum time to wait for the // workspace agent's startup scripts to finish after the agent // is reachable. startupScriptTimeout = 10 * time.Minute // startupScriptPollInterval is how often we check the agent's // lifecycle state while waiting for startup scripts. startupScriptPollInterval = 2 * time.Second ) // CreateWorkspaceFn creates a workspace for the given owner. type CreateWorkspaceFn func( ctx context.Context, ownerID uuid.UUID, req codersdk.CreateWorkspaceRequest, ) (codersdk.Workspace, error) // AgentConnFunc provides access to workspace agent connections. type AgentConnFunc func( ctx context.Context, agentID uuid.UUID, ) (workspacesdk.AgentConn, func(), error) // CreateWorkspaceOptions configures the create_workspace tool. type CreateWorkspaceOptions struct { OwnerID uuid.UUID CreateFn CreateWorkspaceFn AgentConnFn AgentConnFunc AgentInactiveDisconnectTimeout time.Duration WorkspaceMu *sync.Mutex OnChatUpdated func(database.Chat) Logger slog.Logger AllowedTemplateIDs func() map[uuid.UUID]bool } type createWorkspaceArgs struct { TemplateID string `json:"template_id" description:"The UUIDv4 of the template to create the workspace from. Obtain this from list_templates."` Name string `json:"name,omitempty" description:"The name of the workspace to create. If not provided, a random name will be generated."` Parameters map[string]string `json:"parameters,omitempty" description:"Key-value pairs of template parameters to use when creating the workspace. Obtain available parameters from read_template."` PresetID string `json:"preset_id,omitempty" description:"The UUIDv4 of a template version preset to use. Obtain available presets from read_template. When provided, the preset's parameters are applied automatically and the workspace may claim a prebuilt instance for faster startup."` } // CreateWorkspace returns a tool that creates a new workspace from a // template. The tool is idempotent: if the chat already has a // workspace that is building or running, it returns the existing // workspace instead of creating a new one. A mutex prevents parallel // calls from creating duplicate workspaces. // db must not be nil and chatID must not be uuid.Nil. func CreateWorkspace(db database.Store, organizationID, chatID uuid.UUID, options CreateWorkspaceOptions) fantasy.AgentTool { return fantasy.NewAgentTool( "create_workspace", "Create a new workspace from a template only when workspace-backed "+ "file inspection, command execution, or file editing is required, "+ "or when the user explicitly asks for one. Do not use this as a "+ "default first step for requests answerable from conversation "+ "context, provider tools, or external MCP tools. Requires a "+ "template_id (from list_templates). Optionally provide "+ "a name and parameter values (from read_template). "+ "If no name is given, one will be generated. "+ "Provide a preset_id (from read_template) to apply "+ "preset parameters and potentially claim a prebuilt "+ "workspace for faster startup. "+ "This tool is idempotent. If the chat already has a "+ "workspace that is building or running, the existing "+ "workspace is returned.", func(ctx context.Context, args createWorkspaceArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) { if options.CreateFn == nil { return fantasy.NewTextErrorResponse("workspace creator is not configured"), nil } templateIDStr := strings.TrimSpace(args.TemplateID) if templateIDStr == "" { return fantasy.NewTextErrorResponse("template_id is required; use list_templates to find one"), nil } templateID, err := uuid.Parse(templateIDStr) if err != nil { return fantasy.NewTextErrorResponse( xerrors.Errorf("invalid template_id: %w", err).Error(), ), nil } if !isTemplateAllowed(options.AllowedTemplateIDs, templateID) { return fantasy.NewTextErrorResponse("template not available for chat workspaces; use list_templates to find allowed templates"), nil } // Serialize workspace creation to prevent parallel // tool calls from creating duplicate workspaces. if options.WorkspaceMu != nil { options.WorkspaceMu.Lock() defer options.WorkspaceMu.Unlock() } ownerID := options.OwnerID // Check for an existing workspace on the chat. check := options.checkExistingWorkspace(ctx, db, chatID) if check.BuildErr != nil { return buildFailureToolResponse( ctx, options.Logger, db, ownerID, organizationID, check.BuildAction, check.BuildID, check.BuildErr, ), nil } if check.Err != nil { return fantasy.NewTextErrorResponse(check.Err.Error()), nil } if check.Done { return toolResponse(check.Result), nil } // Set up dbauthz context for DB lookups. ownerCtx, ownerErr := asOwner(ctx, db, ownerID) if ownerErr != nil { return fantasy.NewTextErrorResponse(ownerErr.Error()), nil } ctx = ownerCtx // Verify the template belongs to the same org as the // chat. Without this check the tool could silently // bind a cross-org workspace to the chat. tmpl, tmplErr := db.GetTemplateByID(ctx, templateID) if tmplErr != nil { return fantasy.NewTextErrorResponse( xerrors.Errorf("look up template: %w", tmplErr).Error(), ), nil } if tmpl.OrganizationID != organizationID { return fantasy.NewTextErrorResponse( "template belongs to a different organization than this chat; " + "use list_templates to find templates in the correct organization", ), nil } hasExternalAgent, externalAgentErr := templateHasExternalAgent(ctx, db, tmpl) if externalAgentErr != nil { return fantasy.NewTextErrorResponse( xerrors.Errorf("look up template version: %w", externalAgentErr).Error(), ), nil } if hasExternalAgent { return fantasy.NewTextErrorResponse(createWorkspaceExternalAgentMessage), nil } var ttlMs *int64 raw, err := db.GetChatWorkspaceTTL(ctx) if err != nil { options.Logger.Error(ctx, "failed to read chat workspace TTL setting, using template default", slog.Error(err), ) } else { d, parseErr := codersdk.ParseChatWorkspaceTTL(raw) if parseErr != nil { options.Logger.Warn(ctx, "invalid chat workspace TTL setting, using template default", slog.F("raw", raw), slog.Error(parseErr), ) } else if d > 0 { ms := d.Milliseconds() ttlMs = &ms } } createReq := codersdk.CreateWorkspaceRequest{ TemplateID: templateID, TTLMillis: ttlMs, } // Apply preset if provided. presetIDStr := strings.TrimSpace(args.PresetID) if presetIDStr != "" { presetID, err := uuid.Parse(presetIDStr) if err != nil { return fantasy.NewTextErrorResponse( xerrors.Errorf("invalid preset_id: %w", err).Error(), ), nil } createReq.TemplateVersionPresetID = presetID } name := strings.TrimSpace(args.Name) if name == "" { name = generatedWorkspaceName(tmpl.Name) } else if err := codersdk.NameValid(name); err != nil { name = generatedWorkspaceName(name) } createReq.Name = name // Map parameters. for k, v := range args.Parameters { createReq.RichParameterValues = append( createReq.RichParameterValues, codersdk.WorkspaceBuildParameter{Name: k, Value: v}, ) } workspace, err := options.CreateFn(ctx, ownerID, createReq) if err != nil { if responseErr, ok := httperror.IsResponder(err); ok { _, resp := responseErr.Response() return toolResponse(responseErrorResult(resp)), nil } return fantasy.NewTextErrorResponse(err.Error()), nil } // Persist the workspace binding on the chat // immediately so the frontend can start streaming // build logs while the build is still running. // Note: this binding is intentional even if the build // later fails. The checkExistingWorkspace recovery // path handles failed workspaces by allowing // re-creation. updatedChat, err := db.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{ ID: chatID, WorkspaceID: uuid.NullUUID{ UUID: workspace.ID, Valid: true, }, BuildID: uuid.NullUUID{ UUID: workspace.LatestBuild.ID, Valid: workspace.LatestBuild.ID != uuid.Nil, }, // AgentID is left null because the build hasn't // completed yet. The chatd runtime binds it once // the agent comes online. AgentID: uuid.NullUUID{}, }) if err != nil { options.Logger.Error(ctx, "failed to persist chat workspace association", slog.F("chat_id", chatID), slog.F("workspace_id", workspace.ID), slog.Error(err), ) } else if options.OnChatUpdated != nil { options.OnChatUpdated(updatedChat) } // Wait for the build to complete and the agent to // come online so subsequent tools can use the // workspace immediately. buildID := workspace.LatestBuild.ID if buildID != uuid.Nil { if err := waitForBuild(ctx, db, buildID); err != nil { return buildFailureToolResponse( ctx, options.Logger, db, ownerID, organizationID, buildFailureActionCreate, buildID, xerrors.Errorf("workspace build failed: %w", err), ), nil } } result := map[string]any{ "created": true, "workspace_name": workspace.FullName(), } setBuildID(result, buildID) // Select the chat agent so follow-up tools wait on the // intended workspace agent. selectedAgent := database.WorkspaceAgent{} agents, agentErr := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if agentErr == nil { if len(agents) == 0 { result["agent_status"] = "no_agent" } else { selected, selectErr := agentselect.FindChatAgent(agents) if selectErr != nil { result["agent_status"] = "selection_error" result["agent_error"] = selectErr.Error() } else { selectedAgent = selected } } } // Wait for the agent to come online and startup scripts to finish. if selectedAgent.ID != uuid.Nil { agentStatus := waitForAgentReady(ctx, db, selectedAgent, options.AgentConnFn) for k, v := range agentStatus { result[k] = v } } // Re-fire after the agent is fully ready so callers // can load instruction files (AGENTS.md) from the // running agent. This must happen after // waitForAgentReady — firing earlier (e.g. right // after waitForBuild) races with the agent startup // and the connection usually times out before the // agent is reachable. if options.OnChatUpdated != nil { if latest, err := db.GetChatByID(ctx, chatID); err == nil { options.OnChatUpdated(latest) } } return toolResponse(result), nil }) } // existingWorkspaceResult holds the outcome of checking for an // existing workspace on the chat. type existingWorkspaceResult struct { // Result is the tool response map when Done is true. Result map[string]any // Done indicates the caller should return early. Done bool // BuildAction, BuildID, and BuildErr are set together when // waitForBuild failed, so the caller can render the build // failure through the shared response path. BuildAction buildFailureAction BuildID uuid.UUID BuildErr error // Err is non-nil when the check itself failed. Err error } // checkExistingWorkspace checks whether the given chat // already has a usable workspace. Returns an // existingWorkspaceResult with Done set when the caller should // return early (workspace exists and is alive or building). // Returns Done unset if the caller should proceed with creation // (workspace is dead or missing). func (o CreateWorkspaceOptions) checkExistingWorkspace( ctx context.Context, db database.Store, chatID uuid.UUID, ) existingWorkspaceResult { agentConnFn := o.AgentConnFn agentInactiveDisconnectTimeout := o.AgentInactiveDisconnectTimeout chat, err := db.GetChatByID(ctx, chatID) if err != nil { return existingWorkspaceResult{Err: xerrors.Errorf("load chat: %w", err)} } if !chat.WorkspaceID.Valid { return existingWorkspaceResult{} } ws, err := db.GetWorkspaceByID(ctx, chat.WorkspaceID.UUID) if err != nil { return existingWorkspaceResult{Err: xerrors.Errorf("load workspace: %w", err)} } // Workspace was soft-deleted — allow creation. if ws.Deleted { return existingWorkspaceResult{} } // Check the latest build status. build, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, ws.ID) if err != nil { // Can't determine status — allow creation. return existingWorkspaceResult{} } job, err := db.GetProvisionerJobByID(ctx, build.JobID) if err != nil { return existingWorkspaceResult{} } switch job.JobStatus { case database.ProvisionerJobStatusPending, database.ProvisionerJobStatusRunning: // Build is in progress. Publish the build ID so the // frontend can start streaming logs, then wait. updatedChat, bindErr := db.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{ ID: chatID, WorkspaceID: uuid.NullUUID{UUID: ws.ID, Valid: true}, BuildID: uuid.NullUUID{ UUID: build.ID, Valid: build.ID != uuid.Nil, }, AgentID: uuid.NullUUID{}, }) if bindErr != nil { o.Logger.Error(ctx, "failed to persist build ID on chat binding", slog.F("chat_id", chatID), slog.F("build_id", build.ID), slog.Error(bindErr), ) } else if o.OnChatUpdated != nil { o.OnChatUpdated(updatedChat) } if err := waitForBuild(ctx, db, build.ID); err != nil { action := buildFailureActionCreate if build.Transition == database.WorkspaceTransitionStart { action = buildFailureActionStart } return existingWorkspaceResult{ BuildAction: action, BuildID: build.ID, BuildErr: xerrors.Errorf("existing workspace build failed: %w", err), } } result := map[string]any{ "created": false, "workspace_name": ws.Name, "status": "already_exists", "message": "workspace build completed", } setBuildID(result, build.ID) agents, agentsErr := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID) if agentsErr == nil && len(agents) > 0 { selected, selectErr := agentselect.FindChatAgent(agents) if selectErr != nil { o.Logger.Debug(ctx, "agent selection failed, falling back to first agent for readiness check", slog.F("workspace_id", ws.ID), slog.Error(selectErr), ) selected = agents[0] } for k, v := range waitForAgentReady(ctx, db, selected, agentConnFn) { result[k] = v } } return existingWorkspaceResult{Result: result, Done: true} case database.ProvisionerJobStatusSucceeded: // If the workspace was stopped, tell the model to use // start_workspace instead of creating a new one. if build.Transition == database.WorkspaceTransitionStop { return existingWorkspaceResult{Result: map[string]any{ "created": false, "workspace_name": ws.Name, "status": "stopped", "message": "workspace is stopped; use start_workspace to start it", }, Done: true} } // Build succeeded — use the agent's recent DB-backed // connection status to decide whether the workspace is // still usable. agents, agentsErr := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID) if agentsErr == nil && len(agents) > 0 { selected, selectErr := agentselect.FindChatAgent(agents) if selectErr != nil { o.Logger.Debug(ctx, "agent selection failed, falling back to first agent for status check", slog.F("workspace_id", ws.ID), slog.Error(selectErr), ) selected = agents[0] } status := selected.Status(dbtime.Now(), agentInactiveDisconnectTimeout) result := map[string]any{ "created": false, "workspace_name": ws.Name, "status": "already_exists", } switch status.Status { case database.WorkspaceAgentStatusConnected: result["message"] = "workspace is already running and recently connected" for k, v := range waitForAgentReady(ctx, db, selected, nil) { result[k] = v } return existingWorkspaceResult{Result: result, Done: true} case database.WorkspaceAgentStatusConnecting: result["message"] = "workspace exists and the agent is still connecting" for k, v := range waitForAgentReady(ctx, db, selected, agentConnFn) { result[k] = v } return existingWorkspaceResult{Result: result, Done: true} case database.WorkspaceAgentStatusDisconnected, database.WorkspaceAgentStatusTimeout: // Agent is offline or never became ready - allow // creation. } } // No agent ID or no agent status — allow creation. return existingWorkspaceResult{} default: // Failed, canceled, etc — allow creation. return existingWorkspaceResult{} } } // waitForBuild polls the specified build until its provisioner job // completes or the context expires. func waitForBuild( ctx context.Context, db database.Store, buildID uuid.UUID, ) error { buildCtx, cancel := context.WithTimeout(ctx, buildTimeout) defer cancel() ticker := time.NewTicker(buildPollInterval) defer ticker.Stop() for { build, err := db.GetWorkspaceBuildByID(buildCtx, buildID) if err != nil { return xerrors.Errorf("get build: %w", err) } job, err := db.GetProvisionerJobByID(buildCtx, build.JobID) if err != nil { return xerrors.Errorf("get provisioner job: %w", err) } switch job.JobStatus { case database.ProvisionerJobStatusSucceeded: return nil case database.ProvisionerJobStatusFailed: errMsg := "build failed" if job.Error.Valid { errMsg = job.Error.String } var code codersdk.JobErrorCode if job.ErrorCode.Valid { code = codersdk.JobErrorCode(job.ErrorCode.String) } return &workspaceBuildError{message: errMsg, code: code} case database.ProvisionerJobStatusCanceled: return xerrors.New("build was canceled") case database.ProvisionerJobStatusPending, database.ProvisionerJobStatusRunning, database.ProvisionerJobStatusCanceling: // Still in progress — keep waiting. default: return xerrors.Errorf("unexpected job status: %s", job.JobStatus) } select { case <-buildCtx.Done(): return xerrors.Errorf( "timed out waiting for workspace build: %w", buildCtx.Err(), ) case <-ticker.C: } } } func templateHasExternalAgent( ctx context.Context, db database.Store, tmpl database.Template, ) (bool, error) { version, err := db.GetTemplateVersionByID(ctx, tmpl.ActiveVersionID) if err != nil { if errors.Is(err, sql.ErrNoRows) { return false, nil } return false, err } return version.HasExternalAgent.Valid && version.HasExternalAgent.Bool, nil } // externalAgentReadyError returns the external-agent-specific error // message when agent belongs to an external resource, or the empty // string otherwise. Errors looking up the resource are treated as // non-external so the caller falls back to the dial error. func externalAgentReadyError( ctx context.Context, db database.Store, agent database.WorkspaceAgent, ) string { isExternal, err := IsExternalWorkspaceAgent(ctx, db, agent) if err != nil || !isExternal { return "" } return ExternalAgentUnavailableMessage(agent) } // waitForAgentReady waits for the workspace agent to become // reachable and for its startup scripts to finish. It returns // status fields suitable for merging into a tool response. func waitForAgentReady( ctx context.Context, db database.Store, agent database.WorkspaceAgent, agentConnFn AgentConnFunc, ) map[string]any { result := map[string]any{} agentID := agent.ID // Phase 1: retry connecting to the agent. if agentConnFn != nil { agentCtx, agentCancel := context.WithTimeout(ctx, agentConnectTimeout) defer agentCancel() ticker := time.NewTicker(agentRetryInterval) defer ticker.Stop() var lastErr error for { attemptCtx, attemptCancel := context.WithTimeout(agentCtx, agentAttemptTimeout) conn, release, err := agentConnFn(attemptCtx, agentID) attemptCancel() if err == nil { release() _ = conn break } lastErr = err select { case <-agentCtx.Done(): result["agent_status"] = "not_ready" // External agents may need user action on a different // host. Surface that guidance instead of the raw dial // error after the retry window has elapsed. The retry // loop itself is unchanged, so a Connecting external // agent still gets the full window to come online. if msg := externalAgentReadyError(ctx, db, agent); msg != "" { result["agent_error"] = msg } else { result["agent_error"] = lastErr.Error() } return result case <-ticker.C: } } } // Phase 2: poll lifecycle until startup scripts finish. scriptCtx, scriptCancel := context.WithTimeout(ctx, startupScriptTimeout) defer scriptCancel() ticker := time.NewTicker(startupScriptPollInterval) defer ticker.Stop() var lastState database.WorkspaceAgentLifecycleState for { row, err := db.GetWorkspaceAgentLifecycleStateByID(scriptCtx, agentID) if err == nil { lastState = row.LifecycleState switch lastState { case database.WorkspaceAgentLifecycleStateCreated, database.WorkspaceAgentLifecycleStateStarting: // Still in progress, keep polling. case database.WorkspaceAgentLifecycleStateReady: return result default: // Terminal non-ready state. result["startup_scripts"] = "startup_scripts_failed" result["lifecycle_state"] = string(lastState) return result } } select { case <-scriptCtx.Done(): if errors.Is(scriptCtx.Err(), context.DeadlineExceeded) { result["startup_scripts"] = "startup_scripts_timeout" } else { result["startup_scripts"] = "startup_scripts_unknown" } return result case <-ticker.C: } } } func generatedWorkspaceName(seed string) string { base := codersdk.UsernameFrom(strings.TrimSpace(strings.ToLower(seed))) if strings.TrimSpace(base) == "" { base = "workspace" } suffix := strings.ReplaceAll(uuid.NewString(), "-", "")[:4] if len(base) > 27 { base = strings.Trim(base[:27], "-") } if base == "" { base = "workspace" } name := fmt.Sprintf("%s-%s", base, suffix) if err := codersdk.NameValid(name); err == nil { return name } return namesgenerator.NameDigitWith("-") }