Files
coder/coderd/x/chatd/chattool/chattool.go
T
Ethan bd6cc1aaf2 feat(coderd): add stop_workspace chatd tool and recovery classification (#24997)
## Summary

Adds a `stop_workspace` tool to chatd so the model can recover from the
"workspace running but agent dead" failure mode (e.g. an OOM that leaves
the workspace running but the agent unreachable) by stopping and then
starting the workspace.

<img width="924" height="742" alt="image"
src="https://github.com/user-attachments/assets/279dedb6-6e29-4fe1-8754-3a1f01e538bf"
/>



## What changed

**New `stop_workspace` chatd tool**
(`coderd/x/chatd/chattool/stopworkspace.go`). Mirrors `start_workspace`:
shares `WorkspaceMu` to serialize with create/start, waits for any
in-progress build before issuing a stop, and is idempotent only after a
successful Stop transition. Failed stop builds re-attempt rather than
reporting success.

**New `chatStopWorkspace` coderd hook** (`coderd/exp_chats.go`). Mirrors
`chatStartWorkspace` minus the `RequireActiveVersion` gate. Stop should
not be blocked by template version policy.

**Differentiated recovery sentinels** (`coderd/x/chatd/chatd.go`).
`errChatAgentDisconnected` instructs the model to call `stop_workspace`
then `start_workspace`. `errChatDialTimeout` instructs a single retry,
then user escalation if it repeats. The previous single message
conflated transient and persistent failures.

**Two-signal recovery gate.** Recovery is only surfaced when a tool call
times out *and* a fresh DB read of the latest workspace agent says
`Disconnected`. The previous draft escalated on the DB read alone, which
would fire on a 30-second heartbeat blip (e.g. agent respawn) and prompt
a destructive stop/start unnecessarily.

**Cache-hit disconnected handling** now clears the cache and retries a
fresh dial before escalating, rather than returning the recovery
sentinel immediately. Latest-agent classification uses
`GetWorkspaceAgentsInLatestBuildByWorkspaceID` instead of the chat's
bound `AgentID`, so stale bindings after a rebuild don't misclassify.

**Shared chattool helpers** in `coderd/x/chatd/chattool/chattool.go`:
`latestWorkspaceBuildAndJob`, `publishBuildBinding`,
`provisionerJobTerminal`. Applied to both `start_workspace` and
`stop_workspace`.

## Notes

- Reverts an earlier draft that widened `ask_user_question` to root
standard turns. Plan-mode-only behavior is restored.
- The `stop_workspace` tool currently renders via the generic chat
tool-call UI. A follow-up frontend PR will prettify the `stop_workspace`
tool and style it like the `start_workspace` tool.
- Never-connected (`Timeout` status) agents are intentionally excluded
from recovery. They indicate template or startup failure, not the
running-but-dead case this PR targets.

Closes CODAGT-315
2026-05-11 16:23:07 +10:00

182 lines
4.9 KiB
Go

package chattool
import (
"context"
"encoding/json"
"unicode/utf8"
"charm.land/fantasy"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/codersdk"
)
func marshalToolResponse(result any) fantasy.ToolResponse {
data, err := json.Marshal(result)
if err != nil {
return fantasy.NewTextResponse("{}")
}
return fantasy.NewTextResponse(string(data))
}
// toolResponse builds a fantasy.ToolResponse from a JSON-serializable
// result map. The map constraint ensures all tool results serialize
// to JSON objects so the frontend can safely parse them.
func toolResponse(result map[string]any) fantasy.ToolResponse {
return marshalToolResponse(result)
}
// buildToolResponse marshals a buildErrorResult into a tool response.
// Separate from toolResponse to keep the map[string]any constraint
// on the general helper while allowing typed error structs.
func buildToolResponse(r buildErrorResult) fantasy.ToolResponse {
return marshalToolResponse(r)
}
// responseErrorResult converts a codersdk.Response into a structured
// tool result. We return these via toolResponse rather than
// NewTextErrorResponse because the fantasy/chatprompt pipeline flattens
// IsError content into a single string and drops validation details.
func responseErrorResult(resp codersdk.Response) map[string]any {
message := resp.Message
if message == "" {
message = "request failed"
}
result := map[string]any{
"error": message,
}
if resp.Detail != "" {
result["detail"] = resp.Detail
}
if len(resp.Validations) > 0 {
result["validations"] = resp.Validations
}
return result
}
func latestWorkspaceBuildAndJob(
ctx context.Context,
db database.Store,
workspaceID uuid.UUID,
) (database.WorkspaceBuild, database.ProvisionerJob, error) {
build, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, workspaceID)
if err != nil {
return database.WorkspaceBuild{}, database.ProvisionerJob{}, xerrors.Errorf("get latest build: %w", err)
}
job, err := db.GetProvisionerJobByID(ctx, build.JobID)
if err != nil {
return database.WorkspaceBuild{}, database.ProvisionerJob{}, xerrors.Errorf("get provisioner job: %w", err)
}
return build, job, nil
}
func publishBuildBinding(
ctx context.Context,
db database.Store,
logger slog.Logger,
chatID uuid.UUID,
workspaceID uuid.UUID,
buildID uuid.UUID,
onChatUpdated func(database.Chat),
) {
updatedChat, bindErr := db.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{
ID: chatID,
WorkspaceID: uuid.NullUUID{UUID: workspaceID, Valid: true},
BuildID: uuid.NullUUID{
UUID: buildID,
Valid: buildID != uuid.Nil,
},
AgentID: uuid.NullUUID{},
})
if bindErr != nil {
logger.Error(ctx, "failed to persist build ID on chat binding",
slog.F("chat_id", chatID),
slog.F("build_id", buildID),
slog.Error(bindErr),
)
return
}
if onChatUpdated != nil {
onChatUpdated(updatedChat)
}
}
func provisionerJobTerminal(status database.ProvisionerJobStatus) bool {
switch status {
case database.ProvisionerJobStatusSucceeded,
database.ProvisionerJobStatusFailed,
database.ProvisionerJobStatusCanceled:
return true
default:
return false
}
}
func truncateRunes(value string, maxLen int) string {
if maxLen <= 0 || value == "" {
return ""
}
if utf8.RuneCountInString(value) <= maxLen {
return value
}
runes := []rune(value)
if maxLen > len(runes) {
maxLen = len(runes)
}
return string(runes[:maxLen])
}
// buildErrorResult is a structured error response that preserves
// the build ID alongside the error message. This lets the frontend
// keep showing build logs when a build fails instead of losing
// them on the error transition.
type buildErrorResult struct {
Error string `json:"error"`
BuildID string `json:"build_id,omitempty"`
}
func newBuildError(msg string, buildID uuid.UUID) buildErrorResult {
r := buildErrorResult{Error: msg}
if buildID != uuid.Nil {
r.BuildID = buildID.String()
}
return r
}
// setBuildID adds the build_id field to a tool response map when
// the build ID is known (non-zero).
func setBuildID(result map[string]any, buildID uuid.UUID) {
if buildID != uuid.Nil {
result["build_id"] = buildID.String()
}
}
// setNoBuild marks the response with no_build: true when no build
// was triggered. The frontend uses this flag to suppress the
// build-log section for already-running workspaces.
func setNoBuild(result map[string]any, buildID uuid.UUID) {
if buildID == uuid.Nil {
result["no_build"] = true
}
}
// isTemplateAllowed checks whether a template ID is permitted by the
// configured allowlist. A nil function or an empty allowlist means
// all templates are allowed.
func isTemplateAllowed(getAllowlist func() map[uuid.UUID]bool, id uuid.UUID) bool {
if getAllowlist == nil {
return true
}
allowlist := getAllowlist()
if len(allowlist) == 0 {
return true
}
return allowlist[id]
}