Files
coder/coderd/x/chatd/chattool/startworkspace.go
T
Ethan bd6cc1aaf2 feat(coderd): add stop_workspace chatd tool and recovery classification (#24997)
## Summary

Adds a `stop_workspace` tool to chatd so the model can recover from the
"workspace running but agent dead" failure mode (e.g. an OOM that leaves
the workspace running but the agent unreachable) by stopping and then
starting the workspace.

<img width="924" height="742" alt="image"
src="https://github.com/user-attachments/assets/279dedb6-6e29-4fe1-8754-3a1f01e538bf"
/>



## What changed

**New `stop_workspace` chatd tool**
(`coderd/x/chatd/chattool/stopworkspace.go`). Mirrors `start_workspace`:
shares `WorkspaceMu` to serialize with create/start, waits for any
in-progress build before issuing a stop, and is idempotent only after a
successful Stop transition. Failed stop builds re-attempt rather than
reporting success.

**New `chatStopWorkspace` coderd hook** (`coderd/exp_chats.go`). Mirrors
`chatStartWorkspace` minus the `RequireActiveVersion` gate. Stop should
not be blocked by template version policy.

**Differentiated recovery sentinels** (`coderd/x/chatd/chatd.go`).
`errChatAgentDisconnected` instructs the model to call `stop_workspace`
then `start_workspace`. `errChatDialTimeout` instructs a single retry,
then user escalation if it repeats. The previous single message
conflated transient and persistent failures.

**Two-signal recovery gate.** Recovery is only surfaced when a tool call
times out *and* a fresh DB read of the latest workspace agent says
`Disconnected`. The previous draft escalated on the DB read alone, which
would fire on a 30-second heartbeat blip (e.g. agent respawn) and prompt
a destructive stop/start unnecessarily.

**Cache-hit disconnected handling** now clears the cache and retries a
fresh dial before escalating, rather than returning the recovery
sentinel immediately. Latest-agent classification uses
`GetWorkspaceAgentsInLatestBuildByWorkspaceID` instead of the chat's
bound `AgentID`, so stale bindings after a rebuild don't misclassify.

**Shared chattool helpers** in `coderd/x/chatd/chattool/chattool.go`:
`latestWorkspaceBuildAndJob`, `publishBuildBinding`,
`provisionerJobTerminal`. Applied to both `start_workspace` and
`stop_workspace`.

## Notes

- Reverts an earlier draft that widened `ask_user_question` to root
standard turns. Plan-mode-only behavior is restored.
- The `stop_workspace` tool currently renders via the generic chat
tool-call UI. A follow-up frontend PR will prettify the `stop_workspace`
tool and style it like the `start_workspace` tool.
- Never-connected (`Timeout` status) agents are intentionally excluded
from recovery. They indicate template or startup failure, not the
running-but-dead case this PR targets.

Closes CODAGT-315
2026-05-11 16:23:07 +10:00

272 lines
9.0 KiB
Go

package chattool
import (
"context"
"sync"
"charm.land/fantasy"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/httpapi/httperror"
"github.com/coder/coder/v2/coderd/x/chatd/internal/agentselect"
"github.com/coder/coder/v2/codersdk"
)
// StartWorkspaceFn starts a workspace by creating a new build with
// the "start" transition.
type StartWorkspaceFn func(
ctx context.Context,
ownerID uuid.UUID,
workspaceID uuid.UUID,
req codersdk.CreateWorkspaceBuildRequest,
) (codersdk.WorkspaceBuild, error)
// StartWorkspaceOptions configures the start_workspace tool.
type StartWorkspaceOptions struct {
OwnerID uuid.UUID
StartFn StartWorkspaceFn
AgentConnFn AgentConnFunc
WorkspaceMu *sync.Mutex
OnChatUpdated func(database.Chat)
Logger slog.Logger
}
type startWorkspaceArgs struct {
Parameters map[string]string `json:"parameters,omitempty"`
}
// StartWorkspace returns a tool that starts a stopped workspace
// associated with the current chat. The tool is idempotent: if the
// workspace is already running or building, it returns immediately.
// db must not be nil and chatID must not be uuid.Nil.
func StartWorkspace(db database.Store, chatID uuid.UUID, options StartWorkspaceOptions) fantasy.AgentTool {
return fantasy.NewAgentTool(
"start_workspace",
"Start the chat's workspace if it is currently stopped. "+
"This tool is idempotent — if the workspace is already "+
"running, it returns immediately. Use create_workspace "+
"first if no workspace exists yet. Provide parameter "+
"values (from read_template) only if necessary or "+
"explicitly requested by the user.",
func(ctx context.Context, args startWorkspaceArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
if options.StartFn == nil {
return fantasy.NewTextErrorResponse("workspace starter is not configured"), nil
}
// Serialize with create_workspace and stop_workspace to prevent races.
if options.WorkspaceMu != nil {
options.WorkspaceMu.Lock()
defer options.WorkspaceMu.Unlock()
}
chat, err := db.GetChatByID(ctx, chatID)
if err != nil {
return fantasy.NewTextErrorResponse(
xerrors.Errorf("load chat: %w", err).Error(),
), nil
}
if !chat.WorkspaceID.Valid {
return fantasy.NewTextErrorResponse(
"chat has no workspace; use create_workspace first",
), nil
}
ws, err := db.GetWorkspaceByID(ctx, chat.WorkspaceID.UUID)
if err != nil {
return fantasy.NewTextErrorResponse(
xerrors.Errorf("load workspace: %w", err).Error(),
), nil
}
if ws.Deleted {
return fantasy.NewTextErrorResponse(
"workspace was deleted; use create_workspace to make a new one",
), nil
}
build, job, err := latestWorkspaceBuildAndJob(ctx, db, ws.ID)
if err != nil {
return fantasy.NewTextErrorResponse(err.Error()), nil
}
// If a build is already in progress, wait for it.
switch job.JobStatus {
case database.ProvisionerJobStatusPending,
database.ProvisionerJobStatusRunning:
// Publish the build ID to the frontend so it
// can start streaming logs immediately.
publishBuildBinding(ctx, db, options.Logger, chatID, ws.ID, build.ID, options.OnChatUpdated)
if err := waitForBuild(ctx, db, build.ID); err != nil {
// newBuildError returns via toolResponse (IsError: false)
// rather than NewTextErrorResponse (IsError: true) so the
// JSON result preserves build_id for the frontend's log
// viewer. The fantasy/chatprompt pipeline discards structured
// fields from IsError content.
// The frontend detects errors via the "error" key instead.
return buildFailureToolResponse(
ctx,
options.Logger,
db,
options.OwnerID,
ws.OrganizationID,
buildFailureActionStart,
build.ID,
xerrors.Errorf("waiting for in-progress build: %w", err),
), nil
}
result := waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, build.ID)
// Re-fire after the agent is fully ready so
// callers can load instruction files (AGENTS.md).
// This must happen after waitForAgentAndRespond —
// firing earlier races with agent startup.
if options.OnChatUpdated != nil {
if latest, err := db.GetChatByID(ctx, chatID); err == nil {
options.OnChatUpdated(latest)
}
}
return toolResponse(result), nil
case database.ProvisionerJobStatusSucceeded:
// If the latest successful build is a start
// transition, the workspace should be running.
if build.Transition == database.WorkspaceTransitionStart {
return toolResponse(waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, uuid.Nil)), nil
}
// Otherwise it is stopped (or deleted) — proceed
// to start it below.
default:
// Failed, canceled, etc — try starting anyway.
}
// Set up dbauthz context for the start call.
ownerCtx, ownerErr := asOwner(ctx, db, options.OwnerID)
if ownerErr != nil {
return fantasy.NewTextErrorResponse(ownerErr.Error()), nil
}
startReq := codersdk.CreateWorkspaceBuildRequest{
Transition: codersdk.WorkspaceTransitionStart,
}
for k, v := range args.Parameters {
startReq.RichParameterValues = append(
startReq.RichParameterValues,
codersdk.WorkspaceBuildParameter{Name: k, Value: v},
)
}
startBuild, err := options.StartFn(ownerCtx, options.OwnerID, ws.ID, startReq)
if err != nil {
if responseErr, ok := httperror.IsResponder(err); ok {
_, resp := responseErr.Response()
result := responseErrorResult(resp)
if len(resp.Validations) > 0 && ws.TemplateID != uuid.Nil {
result["template_id"] = ws.TemplateID.String()
}
return toolResponse(result), nil
}
return fantasy.NewTextErrorResponse(
xerrors.Errorf("start workspace: %w", err).Error(),
), nil
}
// Persist the build ID on the chat binding so the
// frontend can stream logs without polling.
publishBuildBinding(ctx, db, options.Logger, chatID, ws.ID, startBuild.ID, options.OnChatUpdated)
if err := waitForBuild(ctx, db, startBuild.ID); err != nil {
return buildFailureToolResponse(
ctx,
options.Logger,
db,
options.OwnerID,
ws.OrganizationID,
buildFailureActionStart,
startBuild.ID,
xerrors.Errorf("workspace start build failed: %w", err),
), nil
}
result := waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, startBuild.ID)
// If the template version changed, annotate the
// response so the model knows an auto-update
// occurred.
if startBuild.TemplateVersionID != uuid.Nil &&
build.TemplateVersionID != uuid.Nil &&
startBuild.TemplateVersionID != build.TemplateVersionID {
result["updated_to_active_version"] = true
result["update_reason"] = "template requires active versions"
result["message"] = "Workspace started and was updated to the active template version because the template requires active versions."
}
// Re-fire after the agent is fully ready so
// callers can load instruction files (AGENTS.md).
// This must happen after waitForAgentAndRespond —
// firing earlier races with agent startup.
if options.OnChatUpdated != nil {
if latest, err := db.GetChatByID(ctx, chatID); err == nil {
options.OnChatUpdated(latest)
}
}
return toolResponse(result), nil
})
}
// waitForAgentAndRespond selects the chat agent from the workspace's
// latest build, waits for it to become reachable, and returns a
// result map. When buildID is non-zero, it is included in the
// result so the frontend can fetch historical build logs. Pass
// uuid.Nil when no build was triggered (e.g. workspace already
// running); the result will include no_build: true so the
// frontend can suppress the build-log section.
//
// The caller is responsible for converting the returned map to a
// fantasy.ToolResponse via toolResponse(), and may add extra
// fields before doing so.
func waitForAgentAndRespond(
ctx context.Context,
db database.Store,
agentConnFn AgentConnFunc,
ws database.Workspace,
buildID uuid.UUID,
) map[string]any {
agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID)
if err != nil || len(agents) == 0 {
// Workspace started but no agent found - still report
// success so the model knows the workspace is up.
result := map[string]any{
"started": true,
"workspace_name": ws.Name,
"agent_status": "no_agent",
}
setBuildID(result, buildID)
setNoBuild(result, buildID)
return result
}
selected, err := agentselect.FindChatAgent(agents)
if err != nil {
result := map[string]any{
"started": true,
"workspace_name": ws.Name,
"agent_status": "selection_error",
"agent_error": err.Error(),
}
setBuildID(result, buildID)
setNoBuild(result, buildID)
return result
}
result := map[string]any{
"started": true,
"workspace_name": ws.Name,
}
setBuildID(result, buildID)
setNoBuild(result, buildID)
for k, v := range waitForAgentReady(ctx, db, selected, agentConnFn) {
result[k] = v
}
return result
}