Files
coder/coderd/x/chatd/chattool/createworkspace.go
T
Ethan d0fa9ff986 fix(coderd/x/chatd/chattool): retry workspace name conflicts (#25668)
Retry Coder Agents workspace creation once with a generated random
suffix when the requested workspace name already exists. This preserves
structured errors for other conflicts and avoids surfacing avoidable
name collisions.

Closes CODAGT-386
2026-06-01 13:31:25 +00:00

762 lines
24 KiB
Go

package chattool
import (
"context"
"database/sql"
"errors"
"fmt"
"net/http"
"strings"
"sync"
"time"
"charm.land/fantasy"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/httpapi/httperror"
"github.com/coder/coder/v2/coderd/util/namesgenerator"
"github.com/coder/coder/v2/coderd/x/chatd/internal/agentselect"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
)
// Timing knobs for the create_workspace tool, grouped by the phase of
// workspace bring-up they govern.
const (
	// Build phase.

	// buildTimeout caps how long we wait for a workspace build to
	// finish before giving up.
	buildTimeout = 10 * time.Minute
	// buildPollInterval is the delay between checks of the build's
	// provisioner job status.
	buildPollInterval = 2 * time.Second

	// Agent connection phase.

	// agentConnectTimeout caps the whole retry window for reaching the
	// workspace agent once the build has succeeded.
	agentConnectTimeout = 2 * time.Minute
	// agentAttemptTimeout bounds one individual connection attempt
	// inside that retry window.
	agentAttemptTimeout = 5 * time.Second
	// agentRetryInterval is the delay between connection attempts.
	agentRetryInterval = 2 * time.Second

	// Startup script phase.

	// startupScriptTimeout caps how long we wait for the agent's
	// startup scripts to finish once the agent is reachable.
	startupScriptTimeout = 10 * time.Minute
	// startupScriptPollInterval is the delay between checks of the
	// agent's lifecycle state while startup scripts run.
	startupScriptPollInterval = 2 * time.Second
)
// CreateWorkspaceFn creates a workspace for the given owner from the
// supplied request and returns the created workspace.
//
// The create_workspace tool may call it twice for a single tool call:
// when the first attempt is rejected as a workspace name conflict, it
// is retried once with a generated name.
type CreateWorkspaceFn func(
ctx context.Context,
ownerID uuid.UUID,
req codersdk.CreateWorkspaceRequest,
) (codersdk.Workspace, error)
// AgentConnFunc provides access to workspace agent connections.
//
// It returns the connection for agentID together with a release
// function. Within this file the connection is only used as a
// reachability probe: on success the release function is called right
// away and the connection itself is discarded.
type AgentConnFunc func(
ctx context.Context,
agentID uuid.UUID,
) (workspacesdk.AgentConn, func(), error)
// CreateWorkspaceOptions configures the create_workspace tool.
type CreateWorkspaceOptions struct {
// OwnerID is the user the workspace is created for. It is passed to
// CreateFn and used to build the owner context for DB lookups.
OwnerID uuid.UUID
// CreateFn performs the actual workspace creation. When nil, the
// tool responds with a "not configured" error.
CreateFn CreateWorkspaceFn
// AgentConnFn dials the workspace agent while waiting for it to
// become reachable. When nil, the dial phase is skipped and only the
// agent's lifecycle state is polled.
AgentConnFn AgentConnFunc
// AgentInactiveDisconnectTimeout is passed to the agent status
// computation when judging whether the agent of an existing
// workspace is still connected.
AgentInactiveDisconnectTimeout time.Duration
// WorkspaceMu, when non-nil, is locked before the existing-workspace
// check and held until the tool call returns, so parallel tool calls
// cannot create duplicate workspaces.
WorkspaceMu *sync.Mutex
// OnChatUpdated, when non-nil, is invoked with the updated chat after
// a workspace binding has been persisted, and again with the freshly
// loaded chat once the agent wait has finished.
OnChatUpdated func(database.Chat)
// Logger receives diagnostics for failures that do not abort the
// tool call (TTL lookup, binding persistence, agent selection).
Logger slog.Logger
// AllowedTemplateIDs is handed to isTemplateAllowed to decide whether
// the requested template may be used for chat workspaces.
AllowedTemplateIDs func() map[uuid.UUID]bool
}
// createWorkspaceArgs is the argument payload of the create_workspace
// tool. TemplateID is required; Name, Parameters, and PresetID are
// optional. The description tags carry the per-field guidance text
// (presumably surfaced to the model by the tool framework; confirm
// against the fantasy package).
type createWorkspaceArgs struct {
TemplateID string `json:"template_id" description:"The UUIDv4 of the template to create the workspace from. Obtain this from list_templates."`
Name string `json:"name,omitempty" description:"The name of the workspace to create. If not provided, a random name will be generated."`
Parameters map[string]string `json:"parameters,omitempty" description:"Key-value pairs of template parameters to use when creating the workspace. Obtain available parameters from read_template."`
PresetID string `json:"preset_id,omitempty" description:"The UUIDv4 of a template version preset to use. Obtain available presets from read_template. When provided, the preset's parameters are applied automatically and the workspace may claim a prebuilt instance for faster startup."`
}
// CreateWorkspace returns a tool that creates a new workspace from a
// template. The tool is idempotent: if the chat already has a
// workspace that is building or running, it returns the existing
// workspace instead of creating a new one. A mutex prevents parallel
// calls from creating duplicate workspaces.
//
// A tool call proceeds in this order: validate template_id and the
// template allow-list, take the workspace mutex, check for an existing
// workspace, verify the template's organization and reject templates
// with an external agent, assemble the create request (TTL, preset,
// name, parameters), create the workspace, persist the chat binding,
// wait for the build, then wait for the selected agent.
//
// Every failure is reported as an error tool response; the handler
// itself always returns a nil Go error.
//
// db must not be nil and chatID must not be uuid.Nil.
func CreateWorkspace(db database.Store, organizationID, chatID uuid.UUID, options CreateWorkspaceOptions) fantasy.AgentTool {
return fantasy.NewAgentTool(
"create_workspace",
"Create a new workspace from a template only when workspace-backed "+
"file inspection, command execution, or file editing is required, "+
"or when the user explicitly asks for one. Do not use this as a "+
"default first step for requests answerable from conversation "+
"context, provider tools, or external MCP tools. Requires a "+
"template_id (from list_templates). Optionally provide "+
"a name and parameter values (from read_template). "+
"If no name is given, one will be generated. "+
"Provide a preset_id (from read_template) to apply "+
"preset parameters and potentially claim a prebuilt "+
"workspace for faster startup. "+
"This tool is idempotent. If the chat already has a "+
"workspace that is building or running, the existing "+
"workspace is returned.",
func(ctx context.Context, args createWorkspaceArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
if options.CreateFn == nil {
return fantasy.NewTextErrorResponse("workspace creator is not configured"), nil
}
// Validate the template ID before taking the mutex so
// malformed calls never block behind a running creation.
templateIDStr := strings.TrimSpace(args.TemplateID)
if templateIDStr == "" {
return fantasy.NewTextErrorResponse("template_id is required; use list_templates to find one"), nil
}
templateID, err := uuid.Parse(templateIDStr)
if err != nil {
return fantasy.NewTextErrorResponse(
xerrors.Errorf("invalid template_id: %w", err).Error(),
), nil
}
if !isTemplateAllowed(options.AllowedTemplateIDs, templateID) {
return fantasy.NewTextErrorResponse("template not available for chat workspaces; use list_templates to find allowed templates"), nil
}
// Serialize workspace creation to prevent parallel
// tool calls from creating duplicate workspaces. The
// lock is held until this handler returns, including
// the build and agent waits below.
if options.WorkspaceMu != nil {
options.WorkspaceMu.Lock()
defer options.WorkspaceMu.Unlock()
}
ownerID := options.OwnerID
// Check for an existing workspace on the chat. A build
// failure on the existing workspace takes precedence
// over a generic check error, which in turn takes
// precedence over an early "already exists" result.
check := options.checkExistingWorkspace(ctx, db, chatID)
if check.BuildErr != nil {
return buildFailureToolResponse(
ctx,
options.Logger,
db,
ownerID,
organizationID,
check.BuildAction,
check.BuildID,
check.BuildErr,
), nil
}
if check.Err != nil {
return fantasy.NewTextErrorResponse(check.Err.Error()), nil
}
if check.Done {
return toolResponse(check.Result), nil
}
// Set up dbauthz context for DB lookups. From here on
// ctx is the owner context.
ownerCtx, ownerErr := asOwner(ctx, db, ownerID)
if ownerErr != nil {
return fantasy.NewTextErrorResponse(ownerErr.Error()), nil
}
ctx = ownerCtx
// Verify the template belongs to the same org as the
// chat. Without this check the tool could silently
// bind a cross-org workspace to the chat.
tmpl, tmplErr := db.GetTemplateByID(ctx, templateID)
if tmplErr != nil {
return fantasy.NewTextErrorResponse(
xerrors.Errorf("look up template: %w", tmplErr).Error(),
), nil
}
if tmpl.OrganizationID != organizationID {
return fantasy.NewTextErrorResponse(
"template belongs to a different organization than this chat; " +
"use list_templates to find templates in the correct organization",
), nil
}
// Templates whose active version has an external agent
// are rejected outright.
hasExternalAgent, externalAgentErr := templateHasExternalAgent(ctx, db, tmpl)
if externalAgentErr != nil {
return fantasy.NewTextErrorResponse(
xerrors.Errorf("look up template version: %w", externalAgentErr).Error(),
), nil
}
if hasExternalAgent {
return fantasy.NewTextErrorResponse(createWorkspaceExternalAgentMessage), nil
}
// Resolve the optional chat workspace TTL. A read or
// parse failure is logged and leaves ttlMs nil, as does
// a non-positive duration.
var ttlMs *int64
raw, err := db.GetChatWorkspaceTTL(ctx)
if err != nil {
options.Logger.Error(ctx, "failed to read chat workspace TTL setting, using template default",
slog.Error(err),
)
} else {
d, parseErr := codersdk.ParseChatWorkspaceTTL(raw)
if parseErr != nil {
options.Logger.Warn(ctx, "invalid chat workspace TTL setting, using template default",
slog.F("raw", raw),
slog.Error(parseErr),
)
} else if d > 0 {
ms := d.Milliseconds()
ttlMs = &ms
}
}
createReq := codersdk.CreateWorkspaceRequest{
TemplateID: templateID,
TTLMillis: ttlMs,
}
// Apply preset if provided.
presetIDStr := strings.TrimSpace(args.PresetID)
if presetIDStr != "" {
presetID, err := uuid.Parse(presetIDStr)
if err != nil {
return fantasy.NewTextErrorResponse(
xerrors.Errorf("invalid preset_id: %w", err).Error(),
), nil
}
createReq.TemplateVersionPresetID = presetID
}
// Pick the workspace name: an empty name is generated
// from the template name, and a name that fails
// validation is used as the seed for a generated one
// instead of being rejected.
name := strings.TrimSpace(args.Name)
if name == "" {
name = generatedWorkspaceName(tmpl.Name)
} else if err := codersdk.NameValid(name); err != nil {
name = generatedWorkspaceName(name)
}
createReq.Name = name
// Map parameters. Map iteration order is unspecified,
// so the order of RichParameterValues varies per call.
for k, v := range args.Parameters {
createReq.RichParameterValues = append(
createReq.RichParameterValues,
codersdk.WorkspaceBuildParameter{Name: k, Value: v},
)
}
workspace, err := createWorkspaceWithNameRetry(ctx, ownerID, createReq, options.CreateFn)
if err != nil {
// Errors that carry an HTTP response are rendered
// as a structured result; anything else falls back
// to the plain error text.
if responseErr, ok := httperror.IsResponder(err); ok {
_, resp := responseErr.Response()
return toolResponse(responseErrorResult(resp)), nil
}
return fantasy.NewTextErrorResponse(err.Error()), nil
}
// Persist the workspace binding on the chat
// immediately so the frontend can start streaming
// build logs while the build is still running.
// Note: this binding is intentional even if the build
// later fails. The checkExistingWorkspace recovery
// path handles failed workspaces by allowing
// re-creation.
updatedChat, err := db.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{
ID: chatID,
WorkspaceID: uuid.NullUUID{
UUID: workspace.ID,
Valid: true,
},
BuildID: uuid.NullUUID{
UUID: workspace.LatestBuild.ID,
Valid: workspace.LatestBuild.ID != uuid.Nil,
},
// AgentID is left null because the build hasn't
// completed yet. The chatd runtime binds it once
// the agent comes online.
AgentID: uuid.NullUUID{},
})
if err != nil {
// A failed binding is logged but does not abort the
// tool call; the workspace has already been created.
options.Logger.Error(ctx, "failed to persist chat workspace association",
slog.F("chat_id", chatID),
slog.F("workspace_id", workspace.ID),
slog.Error(err),
)
} else if options.OnChatUpdated != nil {
options.OnChatUpdated(updatedChat)
}
// Wait for the build to complete and the agent to
// come online so subsequent tools can use the
// workspace immediately. The build wait is skipped
// when the created workspace reports no build ID.
buildID := workspace.LatestBuild.ID
if buildID != uuid.Nil {
if err := waitForBuild(ctx, db, buildID); err != nil {
return buildFailureToolResponse(
ctx,
options.Logger,
db,
ownerID,
organizationID,
buildFailureActionCreate,
buildID,
xerrors.Errorf("workspace build failed: %w", err),
), nil
}
}
result := map[string]any{
"created": true,
"workspace_name": workspace.FullName(),
}
setBuildID(result, buildID)
// Select the chat agent so follow-up tools wait on the
// intended workspace agent. A failed agent lookup is
// ignored: the result then carries no agent fields and
// the agent wait below is skipped.
selectedAgent := database.WorkspaceAgent{}
agents, agentErr := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
if agentErr == nil {
if len(agents) == 0 {
result["agent_status"] = "no_agent"
} else {
selected, selectErr := agentselect.FindChatAgent(agents)
if selectErr != nil {
result["agent_status"] = "selection_error"
result["agent_error"] = selectErr.Error()
} else {
selectedAgent = selected
}
}
}
// Wait for the agent to come online and startup scripts to finish.
if selectedAgent.ID != uuid.Nil {
agentStatus := waitForAgentReady(ctx, db, selectedAgent, options.AgentConnFn)
for k, v := range agentStatus {
result[k] = v
}
}
// Re-fire after the agent is fully ready so callers
// can load instruction files (AGENTS.md) from the
// running agent. This must happen after
// waitForAgentReady — firing earlier (e.g. right
// after waitForBuild) races with the agent startup
// and the connection usually times out before the
// agent is reachable. A failed chat reload silently
// skips the callback.
if options.OnChatUpdated != nil {
if latest, err := db.GetChatByID(ctx, chatID); err == nil {
options.OnChatUpdated(latest)
}
}
return toolResponse(result), nil
})
}
// existingWorkspaceResult holds the outcome of checking for an
// existing workspace on the chat. The zero value (no field set) tells
// the caller to go ahead and create a workspace.
type existingWorkspaceResult struct {
// Result is the tool response map when Done is true.
Result map[string]any
// Done indicates the caller should return early with Result.
Done bool
// BuildAction, BuildID, and BuildErr are set together when
// waitForBuild failed on the existing workspace's in-progress
// build, so the caller can render the build failure through the
// shared response path.
BuildAction buildFailureAction
BuildID uuid.UUID
BuildErr error
// Err is non-nil when the check itself failed, i.e. the chat or
// its workspace could not be loaded.
Err error
}
// checkExistingWorkspace checks whether the given chat
// already has a usable workspace. Returns an
// existingWorkspaceResult with Done set when the caller should
// return early (workspace exists and is alive or building).
// Returns Done unset if the caller should proceed with creation
// (workspace is dead or missing).
//
// Outcome by state of the chat's bound workspace:
//   - no binding, or workspace soft-deleted: proceed with creation.
//   - chat or workspace cannot be loaded: Err is set.
//   - latest build or its job cannot be loaded: proceed with creation.
//   - job pending/running: re-publish the binding with the build ID,
//     wait for the build, then wait for the agent; a failed wait sets
//     BuildErr, BuildID, and BuildAction.
//   - job succeeded with a stop transition: Done, telling the model to
//     use start_workspace.
//   - job succeeded otherwise: Done when the agent is connected or
//     connecting; proceed with creation when it is disconnected, timed
//     out, or there is no agent.
//   - any other job status: proceed with creation.
func (o CreateWorkspaceOptions) checkExistingWorkspace(
ctx context.Context,
db database.Store,
chatID uuid.UUID,
) existingWorkspaceResult {
agentConnFn := o.AgentConnFn
agentInactiveDisconnectTimeout := o.AgentInactiveDisconnectTimeout
chat, err := db.GetChatByID(ctx, chatID)
if err != nil {
return existingWorkspaceResult{Err: xerrors.Errorf("load chat: %w", err)}
}
// No workspace bound to the chat yet — allow creation.
if !chat.WorkspaceID.Valid {
return existingWorkspaceResult{}
}
ws, err := db.GetWorkspaceByID(ctx, chat.WorkspaceID.UUID)
if err != nil {
return existingWorkspaceResult{Err: xerrors.Errorf("load workspace: %w", err)}
}
// Workspace was soft-deleted — allow creation.
if ws.Deleted {
return existingWorkspaceResult{}
}
// Check the latest build status.
build, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, ws.ID)
if err != nil {
// Can't determine status — allow creation.
return existingWorkspaceResult{}
}
job, err := db.GetProvisionerJobByID(ctx, build.JobID)
if err != nil {
// Same as above: without the job there is no status.
return existingWorkspaceResult{}
}
switch job.JobStatus {
case database.ProvisionerJobStatusPending,
database.ProvisionerJobStatusRunning:
// Build is in progress. Publish the build ID so the
// frontend can start streaming logs, then wait.
updatedChat, bindErr := db.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{
ID: chatID,
WorkspaceID: uuid.NullUUID{UUID: ws.ID, Valid: true},
BuildID: uuid.NullUUID{
UUID: build.ID,
Valid: build.ID != uuid.Nil,
},
AgentID: uuid.NullUUID{},
})
if bindErr != nil {
// Logged only; the wait below still runs.
o.Logger.Error(ctx, "failed to persist build ID on chat binding",
slog.F("chat_id", chatID),
slog.F("build_id", build.ID),
slog.Error(bindErr),
)
} else if o.OnChatUpdated != nil {
o.OnChatUpdated(updatedChat)
}
if err := waitForBuild(ctx, db, build.ID); err != nil {
// Report the failure as a start failure when the
// in-progress build was a start transition, and as
// a create failure otherwise.
action := buildFailureActionCreate
if build.Transition == database.WorkspaceTransitionStart {
action = buildFailureActionStart
}
return existingWorkspaceResult{
BuildAction: action,
BuildID: build.ID,
BuildErr: xerrors.Errorf("existing workspace build failed: %w", err),
}
}
result := map[string]any{
"created": false,
"workspace_name": ws.Name,
"status": "already_exists",
"message": "workspace build completed",
}
setBuildID(result, build.ID)
// Agent readiness is best effort here: a lookup error or
// an empty agent list still yields a Done result, just
// without agent fields.
agents, agentsErr := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID)
if agentsErr == nil && len(agents) > 0 {
selected, selectErr := agentselect.FindChatAgent(agents)
if selectErr != nil {
o.Logger.Debug(ctx, "agent selection failed, falling back to first agent for readiness check",
slog.F("workspace_id", ws.ID),
slog.Error(selectErr),
)
selected = agents[0]
}
for k, v := range waitForAgentReady(ctx, db, selected, agentConnFn) {
result[k] = v
}
}
return existingWorkspaceResult{Result: result, Done: true}
case database.ProvisionerJobStatusSucceeded:
// If the workspace was stopped, tell the model to use
// start_workspace instead of creating a new one.
if build.Transition == database.WorkspaceTransitionStop {
return existingWorkspaceResult{Result: map[string]any{
"created": false,
"workspace_name": ws.Name,
"status": "stopped",
"message": "workspace is stopped; use start_workspace to start it",
}, Done: true}
}
// Build succeeded — use the agent's recent DB-backed
// connection status to decide whether the workspace is
// still usable.
agents, agentsErr := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID)
if agentsErr == nil && len(agents) > 0 {
selected, selectErr := agentselect.FindChatAgent(agents)
if selectErr != nil {
o.Logger.Debug(ctx, "agent selection failed, falling back to first agent for status check",
slog.F("workspace_id", ws.ID),
slog.Error(selectErr),
)
selected = agents[0]
}
status := selected.Status(dbtime.Now(), agentInactiveDisconnectTimeout)
result := map[string]any{
"created": false,
"workspace_name": ws.Name,
"status": "already_exists",
}
switch status.Status {
case database.WorkspaceAgentStatusConnected:
result["message"] = "workspace is already running and recently connected"
// The agent is already connected, so pass a nil
// connection func to skip the dial phase and only
// wait on the lifecycle state.
for k, v := range waitForAgentReady(ctx, db, selected, nil) {
result[k] = v
}
return existingWorkspaceResult{Result: result, Done: true}
case database.WorkspaceAgentStatusConnecting:
result["message"] = "workspace exists and the agent is still connecting"
for k, v := range waitForAgentReady(ctx, db, selected, agentConnFn) {
result[k] = v
}
return existingWorkspaceResult{Result: result, Done: true}
case database.WorkspaceAgentStatusDisconnected,
database.WorkspaceAgentStatusTimeout:
// Agent is offline or never became ready - allow
// creation.
}
}
// No agent ID or no agent status — allow creation.
return existingWorkspaceResult{}
default:
// Failed, canceled, etc — allow creation.
return existingWorkspaceResult{}
}
}
// waitForBuild blocks until the provisioner job behind buildID reaches
// a terminal state, checking every buildPollInterval for at most
// buildTimeout (or until ctx ends, whichever comes first).
//
// It returns nil when the job succeeded. A failed job yields a
// *workspaceBuildError carrying the job's error message and error code
// when present; a canceled job, an unknown status, a lookup failure,
// and an expired wait each yield a descriptive error.
func waitForBuild(
	ctx context.Context,
	db database.Store,
	buildID uuid.UUID,
) error {
	pollCtx, stop := context.WithTimeout(ctx, buildTimeout)
	defer stop()

	tick := time.NewTicker(buildPollInterval)
	defer tick.Stop()

	for {
		wsBuild, err := db.GetWorkspaceBuildByID(pollCtx, buildID)
		if err != nil {
			return xerrors.Errorf("get build: %w", err)
		}
		provJob, err := db.GetProvisionerJobByID(pollCtx, wsBuild.JobID)
		if err != nil {
			return xerrors.Errorf("get provisioner job: %w", err)
		}

		switch status := provJob.JobStatus; status {
		case database.ProvisionerJobStatusPending,
			database.ProvisionerJobStatusRunning,
			database.ProvisionerJobStatusCanceling:
			// Not terminal yet; fall through to the wait below.
		case database.ProvisionerJobStatusSucceeded:
			return nil
		case database.ProvisionerJobStatusCanceled:
			return xerrors.New("build was canceled")
		case database.ProvisionerJobStatusFailed:
			// Start from the generic message and overlay whatever
			// detail the job recorded.
			failure := &workspaceBuildError{message: "build failed"}
			if provJob.Error.Valid {
				failure.message = provJob.Error.String
			}
			if provJob.ErrorCode.Valid {
				failure.code = codersdk.JobErrorCode(provJob.ErrorCode.String)
			}
			return failure
		default:
			return xerrors.Errorf("unexpected job status: %s", status)
		}

		select {
		case <-tick.C:
			// Next poll.
		case <-pollCtx.Done():
			return xerrors.Errorf(
				"timed out waiting for workspace build: %w",
				pollCtx.Err(),
			)
		}
	}
}
// templateHasExternalAgent reports whether the template's active
// version is flagged as having an external agent.
//
// A missing version row (sql.ErrNoRows) is reported as false with no
// error; any other lookup failure is returned to the caller. A null
// flag counts as false.
func templateHasExternalAgent(
	ctx context.Context,
	db database.Store,
	tmpl database.Template,
) (bool, error) {
	activeVersion, err := db.GetTemplateVersionByID(ctx, tmpl.ActiveVersionID)
	switch {
	case err == nil:
		flag := activeVersion.HasExternalAgent
		return flag.Valid && flag.Bool, nil
	case errors.Is(err, sql.ErrNoRows):
		return false, nil
	default:
		return false, err
	}
}
// externalAgentReadyError produces the external-agent guidance message
// for agent when IsExternalWorkspaceAgent reports it as external.
// It returns "" for every other agent, and also when that lookup
// fails, so callers can fall back to the underlying dial error.
func externalAgentReadyError(
	ctx context.Context,
	db database.Store,
	agent database.WorkspaceAgent,
) string {
	external, lookupErr := IsExternalWorkspaceAgent(ctx, db, agent)
	if lookupErr == nil && external {
		return ExternalAgentUnavailableMessage(agent)
	}
	return ""
}
// waitForAgentReady waits for the workspace agent to become
// reachable and for its startup scripts to finish. It returns
// status fields suitable for merging into a tool response.
//
// Phase 1 (skipped when agentConnFn is nil) dials the agent until it
// answers or agentConnectTimeout elapses. On failure the result holds
// "agent_status": "not_ready" and an "agent_error", and phase 2 does
// not run.
//
// Phase 2 polls the agent's lifecycle state until it is ready, reaches
// another terminal state, or startupScriptTimeout elapses. A ready
// agent adds nothing to the result; otherwise "startup_scripts" (and,
// for a terminal non-ready state, "lifecycle_state") is set.
func waitForAgentReady(
	ctx context.Context,
	db database.Store,
	agent database.WorkspaceAgent,
	agentConnFn AgentConnFunc,
) map[string]any {
	result := map[string]any{}
	agentID := agent.ID

	// Phase 1: retry connecting to the agent. The retry loop lives in a
	// helper so its timeout context and ticker are released as soon as
	// dialing ends, rather than staying live through phase 2.
	if agentConnFn != nil {
		if dialErr := dialAgentUntilReachable(ctx, agentID, agentConnFn); dialErr != nil {
			result["agent_status"] = "not_ready"
			// External agents may need user action on a different
			// host. Surface that guidance instead of the raw dial
			// error after the retry window has elapsed. The retry
			// loop itself is unchanged, so a Connecting external
			// agent still gets the full window to come online.
			if msg := externalAgentReadyError(ctx, db, agent); msg != "" {
				result["agent_error"] = msg
			} else {
				result["agent_error"] = dialErr.Error()
			}
			return result
		}
	}

	// Phase 2: poll lifecycle until startup scripts finish.
	scriptCtx, scriptCancel := context.WithTimeout(ctx, startupScriptTimeout)
	defer scriptCancel()
	ticker := time.NewTicker(startupScriptPollInterval)
	defer ticker.Stop()
	for {
		// A failed lookup is treated as transient: skip this round and
		// try again on the next tick.
		row, err := db.GetWorkspaceAgentLifecycleStateByID(scriptCtx, agentID)
		if err == nil {
			switch state := row.LifecycleState; state {
			case database.WorkspaceAgentLifecycleStateCreated,
				database.WorkspaceAgentLifecycleStateStarting:
				// Still in progress, keep polling.
			case database.WorkspaceAgentLifecycleStateReady:
				return result
			default:
				// Terminal non-ready state.
				result["startup_scripts"] = "startup_scripts_failed"
				result["lifecycle_state"] = string(state)
				return result
			}
		}
		select {
		case <-scriptCtx.Done():
			// Distinguish our own timeout from the parent context
			// ending for another reason.
			if errors.Is(scriptCtx.Err(), context.DeadlineExceeded) {
				result["startup_scripts"] = "startup_scripts_timeout"
			} else {
				result["startup_scripts"] = "startup_scripts_unknown"
			}
			return result
		case <-ticker.C:
		}
	}
}

// dialAgentUntilReachable calls agentConnFn every agentRetryInterval,
// giving each attempt agentAttemptTimeout, until one succeeds or
// agentConnectTimeout elapses (or ctx ends). It returns nil on the
// first successful dial and the most recent dial error otherwise.
func dialAgentUntilReachable(
	ctx context.Context,
	agentID uuid.UUID,
	agentConnFn AgentConnFunc,
) error {
	agentCtx, agentCancel := context.WithTimeout(ctx, agentConnectTimeout)
	defer agentCancel()
	ticker := time.NewTicker(agentRetryInterval)
	defer ticker.Stop()
	for {
		attemptCtx, attemptCancel := context.WithTimeout(agentCtx, agentAttemptTimeout)
		_, release, err := agentConnFn(attemptCtx, agentID)
		attemptCancel()
		if err == nil {
			// The connection is only a reachability probe, so hand
			// it straight back.
			release()
			return nil
		}
		select {
		case <-agentCtx.Done():
			return err
		case <-ticker.C:
		}
	}
}
// createWorkspaceWithNameRetry creates a workspace via createFn and,
// if that attempt is rejected as a workspace name conflict, tries
// exactly once more with a name generated from the requested one.
//
// Any other first-attempt error is returned with a zero Workspace. The
// outcome of the retry, success or failure, is returned as is.
func createWorkspaceWithNameRetry(
	ctx context.Context,
	ownerID uuid.UUID,
	req codersdk.CreateWorkspaceRequest,
	createFn CreateWorkspaceFn,
) (codersdk.Workspace, error) {
	created, firstErr := createFn(ctx, ownerID, req)
	switch {
	case firstErr == nil:
		return created, nil
	case isWorkspaceNameConflict(firstErr):
		retryReq := req
		retryReq.Name = generatedWorkspaceName(req.Name)
		return createFn(ctx, ownerID, retryReq)
	default:
		return codersdk.Workspace{}, firstErr
	}
}
// isWorkspaceNameConflict reports whether err is a responder error
// whose response is HTTP 409 Conflict and lists a validation failure
// on the "name" field. Conflicts on other fields, other statuses, and
// non-responder errors all return false.
func isWorkspaceNameConflict(err error) bool {
	responder, isResponder := httperror.IsResponder(err)
	if !isResponder {
		return false
	}
	code, body := responder.Response()
	if code == http.StatusConflict {
		for i := range body.Validations {
			if body.Validations[i].Field == "name" {
				return true
			}
		}
	}
	return false
}
// generatedWorkspaceName derives a workspace name of the form
// "<base>-<4 hex chars>" from seed. The base is the lowercased, trimmed
// seed passed through codersdk.UsernameFrom, cut to at most 27 bytes
// with surrounding hyphens removed, so the whole name stays within 32
// bytes (presumably the workspace name limit; confirm against
// codersdk.NameValid). An empty base becomes "workspace".
//
// If the assembled name still fails codersdk.NameValid, a name from
// namesgenerator is returned instead.
func generatedWorkspaceName(seed string) string {
	const (
		fallbackBase = "workspace"
		maxBaseLen   = 27
		suffixLen    = 4
	)

	normalized := strings.ToLower(seed)
	normalized = strings.TrimSpace(normalized)
	stem := codersdk.UsernameFrom(normalized)
	if strings.TrimSpace(stem) == "" {
		stem = fallbackBase
	}

	// The suffix is the first hex digits of a fresh UUID.
	hexDigits := strings.ReplaceAll(uuid.NewString(), "-", "")
	tail := hexDigits[:suffixLen]

	if len(stem) > maxBaseLen {
		// Truncation can leave a dangling hyphen; strip it.
		stem = strings.Trim(stem[:maxBaseLen], "-")
	}
	if stem == "" {
		stem = fallbackBase
	}

	candidate := fmt.Sprintf("%s-%s", stem, tail)
	if codersdk.NameValid(candidate) != nil {
		return namesgenerator.NameDigitWith("-")
	}
	return candidate
}