Files
coder/coderd/chatd/chattool/createworkspace.go
T
Mathias Fredriksson 72689c2552 fix(coderd): improve error handling in chattest, chattool, and chats (#23047)
- Use t.Errorf in chattest non-streaming helpers so encoding
  failures fail the test
- Thread testing.TB into writeResponsesAPIStreaming and log
  SSE write errors instead of silently dropping them
- Bump createworkspace DB error log from Warn to Error
- Use errors.Join for timeout + output error in execute.go
2026-03-13 21:41:24 +02:00

502 lines
15 KiB
Go

package chattool
import (
"context"
"errors"
"fmt"
"strings"
"sync"
"time"
"charm.land/fantasy"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/util/namesgenerator"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
)
// Timing parameters for the create_workspace tool: how long and how
// often it polls for the workspace build, agent connectivity, and
// startup-script completion.
const (
// buildPollInterval is how often we check if the workspace
// build has completed. Used as the ticker period in waitForBuild.
buildPollInterval = 2 * time.Second
// buildTimeout is the maximum time to wait for a workspace
// build to complete before giving up. Applied as a context
// timeout in waitForBuild.
buildTimeout = 10 * time.Minute
// agentConnectTimeout is the maximum time to wait for the
// workspace agent to become reachable after a successful build.
// It bounds the whole connection retry loop in waitForAgentReady.
agentConnectTimeout = 2 * time.Minute
// agentRetryInterval is how often we retry connecting to the
// workspace agent.
agentRetryInterval = 2 * time.Second
// agentAttemptTimeout is the timeout for a single connection
// attempt to the workspace agent during the retry loop.
agentAttemptTimeout = 5 * time.Second
// agentPingTimeout is the timeout for a single agent ping
// when checking whether an existing workspace is alive
// (see checkExistingWorkspace).
agentPingTimeout = 5 * time.Second
// startupScriptTimeout is the maximum time to wait for the
// workspace agent's startup scripts to finish after the agent
// is reachable.
startupScriptTimeout = 10 * time.Minute
// startupScriptPollInterval is how often we check the agent's
// lifecycle state while waiting for startup scripts.
startupScriptPollInterval = 2 * time.Second
)
// CreateWorkspaceFn creates a workspace for the given owner.
//
// The create_workspace tool calls it with the owner ID taken from
// CreateWorkspaceOptions.OwnerID and a request carrying the parsed
// template ID, the resolved workspace name, and any rich parameter
// values supplied by the model. A non-nil error is reported back to
// the model as a text error response.
type CreateWorkspaceFn func(
ctx context.Context,
ownerID uuid.UUID,
req codersdk.CreateWorkspaceRequest,
) (codersdk.Workspace, error)
// AgentConnFunc provides access to workspace agent connections.
//
// It returns the connection, a release function, and an error.
// Callers in this file use it purely as a reachability probe: on a
// nil error they immediately invoke the release function and discard
// the connection.
type AgentConnFunc func(
ctx context.Context,
agentID uuid.UUID,
) (workspacesdk.AgentConn, func(), error)
// CreateWorkspaceOptions configures the create_workspace tool.
type CreateWorkspaceOptions struct {
// DB is used for chat, workspace, build, and agent lookups and for
// persisting the chat's workspace. When nil, the existing-workspace
// check, the build wait, the agent lookup, and the chat update are
// all skipped.
DB database.Store
// OwnerID is the user the workspace is created for; it is passed
// to asOwner and to CreateFn.
OwnerID uuid.UUID
// ChatID identifies the chat whose workspace association is
// checked and updated. uuid.Nil disables both steps.
ChatID uuid.UUID
// CreateFn performs the actual workspace creation. The tool
// returns an error response when it is nil.
CreateFn CreateWorkspaceFn
// AgentConnFn is used to probe whether the workspace agent is
// reachable. When nil, connectivity probing is skipped.
AgentConnFn AgentConnFunc
// WorkspaceMu, when non-nil, is locked for the duration of each
// tool call so parallel calls cannot create duplicate workspaces.
WorkspaceMu *sync.Mutex
// Logger receives an error entry when persisting the chat's
// workspace association fails.
Logger slog.Logger
}
// createWorkspaceArgs is the argument payload the model supplies when
// calling the create_workspace tool.
type createWorkspaceArgs struct {
// TemplateID is required. It is whitespace-trimmed and must parse
// as a UUID.
TemplateID string `json:"template_id"`
// Name is the optional workspace name. When empty, or when it
// fails codersdk.NameValid, a generated name is used instead.
Name string `json:"name,omitempty"`
// Parameters maps parameter names to values; each entry becomes
// one rich parameter value on the create request.
Parameters map[string]string `json:"parameters,omitempty"`
}
// CreateWorkspace returns a tool that creates a new workspace from a
// template. The tool is idempotent: if the chat already has a
// workspace that is building or running, it returns the existing
// workspace instead of creating a new one. When options.WorkspaceMu
// is set it is held for the whole call so parallel tool calls cannot
// create duplicate workspaces.
//
// Every failure is reported to the model as a text error response;
// the Go error returned by the tool callback is always nil.
//
// After a successful create the tool (when options.DB is set) waits
// for the build to finish, records the workspace on the chat, and
// waits for the first agent of the latest build to become ready,
// merging the agent status fields into the response.
func CreateWorkspace(options CreateWorkspaceOptions) fantasy.AgentTool {
	return fantasy.NewAgentTool(
		"create_workspace",
		"Create a new workspace from a template. Requires a "+
			"template_id (from list_templates). Optionally provide "+
			"a name and parameter values (from read_template). "+
			"If no name is given, one will be generated. "+
			"This tool is idempotent — if the chat already has a "+
			"workspace that is building or running, the existing "+
			"workspace is returned.",
		func(ctx context.Context, args createWorkspaceArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
			if options.CreateFn == nil {
				return fantasy.NewTextErrorResponse("workspace creator is not configured"), nil
			}

			templateIDStr := strings.TrimSpace(args.TemplateID)
			if templateIDStr == "" {
				return fantasy.NewTextErrorResponse("template_id is required; use list_templates to find one"), nil
			}
			templateID, err := uuid.Parse(templateIDStr)
			if err != nil {
				return fantasy.NewTextErrorResponse(
					xerrors.Errorf("invalid template_id: %w", err).Error(),
				), nil
			}

			// Serialize workspace creation to prevent parallel
			// tool calls from creating duplicate workspaces.
			if options.WorkspaceMu != nil {
				options.WorkspaceMu.Lock()
				defer options.WorkspaceMu.Unlock()
			}

			// Check for an existing workspace on the chat.
			if options.DB != nil && options.ChatID != uuid.Nil {
				existing, done, existErr := checkExistingWorkspace(
					ctx, options.DB, options.ChatID,
					options.AgentConnFn,
				)
				if existErr != nil {
					return fantasy.NewTextErrorResponse(existErr.Error()), nil
				}
				if done {
					return toolResponse(existing), nil
				}
			}

			ownerID := options.OwnerID

			// Set up dbauthz context for DB lookups.
			if options.DB != nil {
				ownerCtx, ownerErr := asOwner(ctx, options.DB, ownerID)
				if ownerErr != nil {
					return fantasy.NewTextErrorResponse(ownerErr.Error()), nil
				}
				ctx = ownerCtx
			}

			createReq := codersdk.CreateWorkspaceRequest{
				TemplateID: templateID,
			}

			// Resolve workspace name: generate one from the template
			// name when none was given, or from the supplied name
			// when it is not a valid workspace name.
			name := strings.TrimSpace(args.Name)
			if name == "" {
				seed := "workspace"
				if options.DB != nil {
					// Best effort: a failed lookup just keeps the
					// generic seed.
					if t, lookupErr := options.DB.GetTemplateByID(ctx, templateID); lookupErr == nil {
						seed = t.Name
					}
				}
				name = generatedWorkspaceName(seed)
			} else if nameErr := codersdk.NameValid(name); nameErr != nil {
				name = generatedWorkspaceName(name)
			}
			createReq.Name = name

			// Map parameters.
			for k, v := range args.Parameters {
				createReq.RichParameterValues = append(
					createReq.RichParameterValues,
					codersdk.WorkspaceBuildParameter{Name: k, Value: v},
				)
			}

			workspace, err := options.CreateFn(ctx, ownerID, createReq)
			if err != nil {
				return fantasy.NewTextErrorResponse(err.Error()), nil
			}

			// Wait for the build to complete so subsequent tools
			// can use the workspace immediately.
			if options.DB != nil {
				if buildErr := waitForBuild(ctx, options.DB, workspace.ID); buildErr != nil {
					return fantasy.NewTextErrorResponse(
						xerrors.Errorf("workspace build failed: %w", buildErr).Error(),
					), nil
				}
			}

			// Look up the first agent of the latest build; it is the
			// agent we wait on below. A lookup failure is not fatal
			// (the workspace was created), but it is logged so the
			// missing agent status in the response can be explained.
			workspaceAgentID := uuid.Nil
			if options.DB != nil {
				agents, agentErr := options.DB.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
				switch {
				case agentErr != nil:
					options.Logger.Warn(ctx, "failed to look up workspace agents after build",
						slog.F("workspace_id", workspace.ID),
						slog.Error(agentErr),
					)
				case len(agents) > 0:
					workspaceAgentID = agents[0].ID
				}
			}

			// Persist the workspace association on the chat. A
			// failure here is logged but does not fail the tool call.
			if options.DB != nil && options.ChatID != uuid.Nil {
				if _, updateErr := options.DB.UpdateChatWorkspace(ctx, database.UpdateChatWorkspaceParams{
					ID: options.ChatID,
					WorkspaceID: uuid.NullUUID{
						UUID:  workspace.ID,
						Valid: true,
					},
				}); updateErr != nil {
					options.Logger.Error(ctx, "failed to persist chat workspace association",
						slog.F("chat_id", options.ChatID),
						slog.F("workspace_id", workspace.ID),
						slog.Error(updateErr),
					)
				}
			}

			result := map[string]any{
				"created":        true,
				"workspace_name": workspace.FullName(),
			}

			// Wait for the agent to come online and startup scripts
			// to finish, then merge its status into the response.
			if workspaceAgentID != uuid.Nil {
				for k, v := range waitForAgentReady(ctx, options.DB, workspaceAgentID, options.AgentConnFn) {
					result[k] = v
				}
			}
			return toolResponse(result), nil
		})
}
// checkExistingWorkspace reports whether the chat is already bound to
// a usable workspace.
//
// It returns (result, true, nil) when the caller should return early
// with result: the workspace's build was in progress and has now
// finished, the workspace is stopped, or it is running with a
// reachable agent. It returns (nil, false, nil) when the caller
// should go ahead and create a workspace: the chat has none, it was
// deleted, its status cannot be determined, its last build did not
// succeed, or its agent cannot be reached. A non-nil error is
// returned when the chat or workspace cannot be loaded, or when an
// in-progress build ends in failure.
func checkExistingWorkspace(
	ctx context.Context,
	db database.Store,
	chatID uuid.UUID,
	agentConnFn AgentConnFunc,
) (map[string]any, bool, error) {
	chatRow, chatErr := db.GetChatByID(ctx, chatID)
	if chatErr != nil {
		return nil, false, xerrors.Errorf("load chat: %w", chatErr)
	}
	if !chatRow.WorkspaceID.Valid {
		// The chat has no workspace yet.
		return nil, false, nil
	}

	workspace, wsErr := db.GetWorkspaceByID(ctx, chatRow.WorkspaceID.UUID)
	if wsErr != nil {
		return nil, false, xerrors.Errorf("load workspace: %w", wsErr)
	}
	if workspace.Deleted {
		// Soft-deleted workspaces do not block creation.
		return nil, false, nil
	}

	// If the latest build or its job cannot be loaded the status is
	// unknown, so creation is allowed.
	latest, buildErr := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, workspace.ID)
	if buildErr != nil {
		return nil, false, nil
	}
	job, jobErr := db.GetProvisionerJobByID(ctx, latest.JobID)
	if jobErr != nil {
		return nil, false, nil
	}

	status := job.JobStatus

	// A build is in flight: wait for it rather than creating a
	// second workspace.
	if status == database.ProvisionerJobStatusPending ||
		status == database.ProvisionerJobStatusRunning {
		if waitErr := waitForBuild(ctx, db, workspace.ID); waitErr != nil {
			return nil, false, xerrors.Errorf(
				"existing workspace build failed: %w", waitErr,
			)
		}
		out := map[string]any{
			"created":        false,
			"workspace_name": workspace.Name,
			"status":         "already_exists",
			"message":        "workspace build completed",
		}
		agents, agentsErr := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
		if agentsErr != nil || len(agents) == 0 {
			return out, true, nil
		}
		for key, val := range waitForAgentReady(ctx, db, agents[0].ID, agentConnFn) {
			out[key] = val
		}
		return out, true, nil
	}

	// Anything other than a successful build (failed, canceled,
	// etc.) leaves the workspace unusable, so creation is allowed.
	if status != database.ProvisionerJobStatusSucceeded {
		return nil, false, nil
	}

	// A stopped workspace should be started by the model via
	// start_workspace, not replaced.
	if latest.Transition == database.WorkspaceTransitionStop {
		return map[string]any{
			"created":        false,
			"workspace_name": workspace.Name,
			"status":         "stopped",
			"message":        "workspace is stopped; use start_workspace to start it",
		}, true, nil
	}

	// The build succeeded; the workspace only counts as alive if
	// its first agent answers a single bounded connection attempt.
	agents, agentsErr := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
	if agentsErr != nil || len(agents) == 0 || agentConnFn == nil {
		// No agent or no way to probe it: allow creation.
		return nil, false, nil
	}
	pingCtx, cancelPing := context.WithTimeout(ctx, agentPingTimeout)
	_, release, connErr := agentConnFn(pingCtx, agents[0].ID)
	cancelPing()
	if connErr != nil {
		// Agent unreachable: treat the workspace as dead.
		return nil, false, nil
	}
	release()

	out := map[string]any{
		"created":        false,
		"workspace_name": workspace.Name,
		"status":         "already_exists",
		"message":        "workspace is already running and reachable",
	}
	// Connectivity was just confirmed, so a nil conn func is passed
	// to skip the connection phase and only wait on startup scripts.
	for key, val := range waitForAgentReady(ctx, db, agents[0].ID, nil) {
		out[key] = val
	}
	return out, true, nil
}
// waitForBuild blocks until the latest build of the given workspace
// reaches a terminal provisioner job status, polling every
// buildPollInterval for at most buildTimeout.
//
// It returns nil when the job succeeded. It returns an error when
// the job failed (using the job's own error text when present), was
// canceled, reports an unrecognized status, when a database lookup
// fails, or when the context ends before the build does.
func waitForBuild(
	ctx context.Context,
	db database.Store,
	workspaceID uuid.UUID,
) error {
	buildCtx, cancel := context.WithTimeout(ctx, buildTimeout)
	defer cancel()

	ticker := time.NewTicker(buildPollInterval)
	defer ticker.Stop()

	// pollOnce inspects the latest build's job a single time. It
	// reports finished=true on success, a non-nil error on any
	// terminal failure, and (false, nil) while still in progress.
	pollOnce := func() (finished bool, err error) {
		latest, err := db.GetLatestWorkspaceBuildByWorkspaceID(
			buildCtx, workspaceID,
		)
		if err != nil {
			return false, xerrors.Errorf("get latest build: %w", err)
		}
		job, err := db.GetProvisionerJobByID(buildCtx, latest.JobID)
		if err != nil {
			return false, xerrors.Errorf("get provisioner job: %w", err)
		}

		switch status := job.JobStatus; status {
		case database.ProvisionerJobStatusSucceeded:
			return true, nil
		case database.ProvisionerJobStatusPending,
			database.ProvisionerJobStatusRunning,
			database.ProvisionerJobStatusCanceling:
			// Not terminal yet.
			return false, nil
		case database.ProvisionerJobStatusCanceled:
			return false, xerrors.New("build was canceled")
		case database.ProvisionerJobStatusFailed:
			if job.Error.Valid {
				return false, xerrors.New(job.Error.String)
			}
			return false, xerrors.New("build failed")
		default:
			return false, xerrors.Errorf("unexpected job status: %s", status)
		}
	}

	for {
		finished, err := pollOnce()
		if err != nil {
			return err
		}
		if finished {
			return nil
		}

		// Sleep until the next tick or until the deadline or
		// cancellation ends the wait.
		select {
		case <-ticker.C:
		case <-buildCtx.Done():
			return xerrors.Errorf(
				"timed out waiting for workspace build: %w",
				buildCtx.Err(),
			)
		}
	}
}
// waitForAgentReady waits for the workspace agent to become
// reachable and then for its startup scripts to finish. It never
// fails; instead it returns status fields meant to be merged into a
// tool response:
//
//   - empty map: the agent reached the ready lifecycle state (or
//     there was nothing to wait on).
//   - "agent_status"/"agent_error": the agent could not be reached
//     before the connect window ended.
//   - "startup_scripts" (+ "lifecycle_state"): startup scripts
//     failed, timed out, or their outcome is unknown.
//
// A nil agentConnFn skips the connection phase; a nil db skips the
// startup-script phase.
func waitForAgentReady(
	ctx context.Context,
	db database.Store,
	agentID uuid.UUID,
	agentConnFn AgentConnFunc,
) map[string]any {
	status := map[string]any{}

	// Phase 1: keep dialing the agent until one attempt succeeds or
	// the overall connect window closes.
	if agentConnFn != nil {
		connectCtx, stopConnect := context.WithTimeout(ctx, agentConnectTimeout)
		defer stopConnect()
		retry := time.NewTicker(agentRetryInterval)
		defer retry.Stop()

		for connected := false; !connected; {
			// Each attempt gets its own, shorter deadline.
			tryCtx, stopTry := context.WithTimeout(connectCtx, agentAttemptTimeout)
			_, release, connErr := agentConnFn(tryCtx, agentID)
			stopTry()
			if connErr == nil {
				// Only reachability matters; hand the
				// connection straight back.
				release()
				connected = true
				continue
			}

			select {
			case <-retry.C:
			case <-connectCtx.Done():
				status["agent_status"] = "not_ready"
				status["agent_error"] = connErr.Error()
				return status
			}
		}
	}

	// Phase 2: poll the agent's lifecycle state until the startup
	// scripts reach a terminal state.
	if db == nil {
		return status
	}
	scriptCtx, stopScripts := context.WithTimeout(ctx, startupScriptTimeout)
	defer stopScripts()
	poll := time.NewTicker(startupScriptPollInterval)
	defer poll.Stop()

	for {
		// A failed lookup is ignored; we simply try again on the
		// next tick until the deadline.
		if row, err := db.GetWorkspaceAgentLifecycleStateByID(scriptCtx, agentID); err == nil {
			switch state := row.LifecycleState; state {
			case database.WorkspaceAgentLifecycleStateReady:
				return status
			case database.WorkspaceAgentLifecycleStateCreated,
				database.WorkspaceAgentLifecycleStateStarting:
				// Scripts are still running.
			default:
				// Any other state is terminal and not ready.
				status["startup_scripts"] = "startup_scripts_failed"
				status["lifecycle_state"] = string(state)
				return status
			}
		}

		select {
		case <-poll.C:
		case <-scriptCtx.Done():
			// Distinguish our own deadline from an outside
			// cancellation.
			outcome := "startup_scripts_unknown"
			if errors.Is(scriptCtx.Err(), context.DeadlineExceeded) {
				outcome = "startup_scripts_timeout"
			}
			status["startup_scripts"] = outcome
			return status
		}
	}
}
// generatedWorkspaceName derives a workspace name from seed.
//
// The seed is lower-cased, trimmed, and normalized through
// codersdk.UsernameFrom; an empty result falls back to "workspace".
// The base is cut to at most 27 bytes (with leading/trailing dashes
// removed after the cut) and joined by a dash to a 4-character
// suffix taken from a fresh UUID. If the combined name does not pass
// codersdk.NameValid, a name from namesgenerator.NameDigitWith is
// returned instead.
func generatedWorkspaceName(seed string) string {
	const (
		fallbackBase = "workspace"
		// maxBaseLen leaves room for "-" plus the 4-character
		// suffix; presumably sized to the workspace name length
		// limit (TODO confirm against codersdk.NameValid).
		maxBaseLen = 27
		suffixLen  = 4
	)

	normalized := strings.TrimSpace(strings.ToLower(seed))
	base := codersdk.UsernameFrom(normalized)
	if strings.TrimSpace(base) == "" {
		base = fallbackBase
	}

	// Short random suffix to keep generated names distinct.
	hex := strings.ReplaceAll(uuid.NewString(), "-", "")
	suffix := hex[:suffixLen]

	if len(base) > maxBaseLen {
		base = strings.Trim(base[:maxBaseLen], "-")
	}
	if base == "" {
		// Truncation plus trimming left nothing usable.
		base = fallbackBase
	}

	candidate := fmt.Sprintf("%s-%s", base, suffix)
	if codersdk.NameValid(candidate) != nil {
		return namesgenerator.NameDigitWith("-")
	}
	return candidate
}