Files
coder/coderd/x/chatd/chattool/startworkspace.go
T
Ethan ef0151601e feat: report insufficient quota build failures in chat tools (#24956)
## Summary

When a workspace build fails because the user is over their group quota,
the chat tools currently surface the failure as a bare `"workspace build
failed: insufficient quota"` string with no machine-readable error code
and no visibility into the user's current usage. Agents and the UI
cannot distinguish quota failures from any other Terraform error, so
users see an opaque message and have no clear path to recovery.

This PR tags quota failures with a typed error code at the source and
propagates it through the chat tool layer so callers can react to it
explicitly.

Relates to CODAGT-20

## Changes

**Provisioner runner**

- Add `InsufficientQuotaErrorCode = "INSUFFICIENT_QUOTA"` and set it
explicitly at the `commitQuota` failure site via a new
`failedWorkspaceBuildfCode` helper, so `provisioner_jobs.error_code` is
populated only on the genuine quota path. The substring matcher used for
externally produced sentinels (e.g. `"missing parameter"`, `"required
template variables"`) is intentionally not extended; provider errors
that happen to mention "insufficient quota" stay classified as generic
build failures.

**SDK and API contract**

- Add `JobErrorCodeInsufficientQuota` and a
`JobIsInsufficientQuotaErrorCode` helper to `codersdk`.
- Extend the swagger `enums` tag on `ProvisionerJob.ErrorCode` to
include `INSUFFICIENT_QUOTA`.
- Regenerate `coderd/apidoc`, `docs/reference/api/*`, and
`site/src/api/typesGenerated.ts`.

**chattool create_workspace / start_workspace**

- `waitForBuild` now returns a typed `*workspaceBuildError` carrying
both the message and the `JobErrorCode`, instead of a bare error string.
- New `quotaerror.go` introduces a structured `quotaErrorResult` (with
`error_code`, `title`, `message`, `build_id`, and optional `quota`) and
a best-effort `workspaceQuotaDetails` lookup that wraps owner
authorization internally and fetches `credits_consumed` and `budget`
from the database. Quota lookup failures (including authorization
failures) never block the failure payload.
- On quota-coded build failures, both `create_workspace` and
`start_workspace` now return the structured response (with the recovery
guidance inlined into `message`) instead of the bare `"insufficient
quota"` string. This applies to all three failure paths: post-creation,
an in-progress existing build, and a freshly triggered start build.
Non-quota build failures continue to use the existing
`buildToolResponse` / `newBuildError` path.
- Owner authorization is wrapped only on the call sites that need it
(the `CreateFn` and `StartFn` invocations and the quota-detail lookup),
so idempotent fast paths (already running, already in progress,
existing-workspace early returns) do not pay for an extra RBAC
round-trip or fail when role lookup is transient.

## Out of scope

- No changes to quota math, allowances, or bypass behavior.
- No automatic retries.
- No new quota-inspection tools and no changes to MCP
`coder_create_workspace` (which returns immediately and never observed
the build outcome here).
- No frontend UI changes; those will land in a follow-up PR that
consumes the new `INSUFFICIENT_QUOTA` code.
2026-05-07 15:01:58 +10:00

315 lines
10 KiB
Go

package chattool
import (
"context"
"sync"
"charm.land/fantasy"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/httpapi/httperror"
"github.com/coder/coder/v2/coderd/x/chatd/internal/agentselect"
"github.com/coder/coder/v2/codersdk"
)
// StartWorkspaceFn starts a workspace by creating a new build with
// the "start" transition.
//
// The start_workspace tool invokes it with the context returned by
// asOwner, the configured owner ID, the ID of the workspace bound to
// the chat, and a build request whose Transition is
// codersdk.WorkspaceTransitionStart (plus any rich parameter values the
// tool caller supplied). It returns the newly created build, or an
// error when the build could not be created.
type StartWorkspaceFn func(
	ctx context.Context,
	ownerID uuid.UUID,
	workspaceID uuid.UUID,
	req codersdk.CreateWorkspaceBuildRequest,
) (codersdk.WorkspaceBuild, error)
// StartWorkspaceOptions configures the start_workspace tool.
type StartWorkspaceOptions struct {
	// OwnerID identifies the user on whose behalf the workspace is
	// started. It is passed to asOwner, to StartFn, and to the build
	// failure response helper.
	OwnerID uuid.UUID
	// StartFn triggers the start build. When nil, the tool responds
	// with a "workspace starter is not configured" error.
	StartFn StartWorkspaceFn
	// AgentConnFn is forwarded to the agent readiness wait that runs
	// after a build completes.
	AgentConnFn AgentConnFunc
	// WorkspaceMu, when non-nil, is held for the duration of a tool
	// call to serialize it with create_workspace.
	WorkspaceMu *sync.Mutex
	// OnChatUpdated, when non-nil, is called with the updated chat
	// after the build ID is persisted on the chat binding and again
	// after the agent wait finishes.
	OnChatUpdated func(database.Chat)
	// Logger records failures to persist the build ID on the chat
	// binding and is handed to the build failure response helper.
	Logger slog.Logger
}
// startWorkspaceArgs is the JSON input accepted by the start_workspace
// tool.
type startWorkspaceArgs struct {
	// Parameters maps parameter names to values. Each entry is
	// forwarded as a codersdk.WorkspaceBuildParameter on the start
	// build request. It may be omitted.
	Parameters map[string]string `json:"parameters,omitempty"`
}
// StartWorkspace builds the "start_workspace" agent tool for the chat
// identified by chatID. db must not be nil and chatID must not be
// uuid.Nil.
//
// Each tool call proceeds as follows:
//
//  1. Load the chat, its bound workspace, the workspace's latest build,
//     and that build's provisioner job. Any lookup failure, a chat with
//     no workspace, or a deleted workspace yields a text error response.
//  2. If the latest job is pending or running, publish its build ID on
//     the chat binding, wait for that build, then wait for the agent.
//  3. If the latest job succeeded with a start transition, skip
//     building and only wait for the agent.
//  4. Otherwise trigger a new start build through options.StartFn,
//     publish its build ID, wait for the build, and wait for the agent.
//
// The handler itself always returns a nil error; failures are encoded
// in the returned tool response.
func StartWorkspace(db database.Store, chatID uuid.UUID, options StartWorkspaceOptions) fantasy.AgentTool {
	const description = "Start the chat's workspace if it is currently stopped. " +
		"This tool is idempotent — if the workspace is already " +
		"running, it returns immediately. Use create_workspace " +
		"first if no workspace exists yet. Provide parameter " +
		"values (from read_template) only if necessary or " +
		"explicitly requested by the user."

	// fail converts an error into a text error tool response.
	fail := func(err error) (fantasy.ToolResponse, error) {
		return fantasy.NewTextErrorResponse(err.Error()), nil
	}

	// publishBuild stores buildID on the chat's workspace binding so
	// the frontend can stream build logs, and notifies OnChatUpdated
	// on success. A persistence failure is logged, not returned.
	publishBuild := func(ctx context.Context, workspaceID, buildID uuid.UUID) {
		updated, bindErr := db.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{
			ID:          chatID,
			WorkspaceID: uuid.NullUUID{UUID: workspaceID, Valid: true},
			BuildID: uuid.NullUUID{
				UUID:  buildID,
				Valid: buildID != uuid.Nil,
			},
			AgentID: uuid.NullUUID{},
		})
		if bindErr != nil {
			options.Logger.Error(ctx, "failed to persist build ID on chat binding",
				slog.F("chat_id", chatID),
				slog.F("build_id", buildID),
				slog.Error(bindErr),
			)
			return
		}
		if options.OnChatUpdated != nil {
			options.OnChatUpdated(updated)
		}
	}

	// notifyReady re-fires OnChatUpdated with the freshly loaded chat.
	// It is called only after the agent wait has finished so callers
	// can load instruction files (AGENTS.md); firing earlier races
	// with agent startup. A failed reload is silently skipped.
	notifyReady := func(ctx context.Context) {
		if options.OnChatUpdated == nil {
			return
		}
		latest, err := db.GetChatByID(ctx, chatID)
		if err != nil {
			return
		}
		options.OnChatUpdated(latest)
	}

	handler := func(ctx context.Context, args startWorkspaceArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
		if options.StartFn == nil {
			return fantasy.NewTextErrorResponse("workspace starter is not configured"), nil
		}

		// Hold the shared mutex for the whole call so this tool
		// cannot race with create_workspace.
		if mu := options.WorkspaceMu; mu != nil {
			mu.Lock()
			defer mu.Unlock()
		}

		chat, err := db.GetChatByID(ctx, chatID)
		if err != nil {
			return fail(xerrors.Errorf("load chat: %w", err))
		}
		if !chat.WorkspaceID.Valid {
			return fantasy.NewTextErrorResponse(
				"chat has no workspace; use create_workspace first",
			), nil
		}

		ws, err := db.GetWorkspaceByID(ctx, chat.WorkspaceID.UUID)
		if err != nil {
			return fail(xerrors.Errorf("load workspace: %w", err))
		}
		if ws.Deleted {
			return fantasy.NewTextErrorResponse(
				"workspace was deleted; use create_workspace to make a new one",
			), nil
		}

		latestBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, ws.ID)
		if err != nil {
			return fail(xerrors.Errorf("get latest build: %w", err))
		}
		job, err := db.GetProvisionerJobByID(ctx, latestBuild.JobID)
		if err != nil {
			return fail(xerrors.Errorf("get provisioner job: %w", err))
		}

		status := job.JobStatus
		buildInFlight := status == database.ProvisionerJobStatusPending ||
			status == database.ProvisionerJobStatusRunning
		alreadyStarted := status == database.ProvisionerJobStatusSucceeded &&
			latestBuild.Transition == database.WorkspaceTransitionStart

		// A build is already underway: attach to it instead of
		// triggering another one.
		if buildInFlight {
			publishBuild(ctx, ws.ID, latestBuild.ID)

			if waitErr := waitForBuild(ctx, db, latestBuild.ID); waitErr != nil {
				// Build failures are reported as a regular tool
				// result rather than through NewTextErrorResponse
				// (IsError: true) so the JSON result preserves
				// build_id for the frontend's log viewer; the
				// fantasy/chatprompt pipeline discards structured
				// fields from IsError content. The frontend detects
				// errors via the "error" key instead.
				return buildFailureToolResponse(
					ctx,
					options.Logger,
					db,
					options.OwnerID,
					ws.OrganizationID,
					buildFailureActionStart,
					latestBuild.ID,
					xerrors.Errorf("waiting for in-progress build: %w", waitErr),
				), nil
			}

			result := waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, latestBuild.ID)
			notifyReady(ctx)
			return toolResponse(result), nil
		}

		// The latest successful build was a start transition, so the
		// workspace should be running; no build is triggered.
		if alreadyStarted {
			return toolResponse(waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, uuid.Nil)), nil
		}

		// Every remaining state (stopped, failed, canceled, ...) falls
		// through to a fresh start attempt.

		// Set up dbauthz context for the start call.
		ownerCtx, ownerErr := asOwner(ctx, db, options.OwnerID)
		if ownerErr != nil {
			return fantasy.NewTextErrorResponse(ownerErr.Error()), nil
		}

		startReq := codersdk.CreateWorkspaceBuildRequest{
			Transition: codersdk.WorkspaceTransitionStart,
		}
		for name, value := range args.Parameters {
			startReq.RichParameterValues = append(
				startReq.RichParameterValues,
				codersdk.WorkspaceBuildParameter{Name: name, Value: value},
			)
		}

		startBuild, err := options.StartFn(ownerCtx, options.OwnerID, ws.ID, startReq)
		if err != nil {
			responder, ok := httperror.IsResponder(err)
			if !ok {
				return fail(xerrors.Errorf("start workspace: %w", err))
			}
			// Structured API errors are returned as a result map;
			// validation failures also carry the template ID.
			_, resp := responder.Response()
			result := responseErrorResult(resp)
			if len(resp.Validations) > 0 && ws.TemplateID != uuid.Nil {
				result["template_id"] = ws.TemplateID.String()
			}
			return toolResponse(result), nil
		}

		// Persist the build ID on the chat binding so the frontend
		// can stream logs without polling.
		publishBuild(ctx, ws.ID, startBuild.ID)

		if waitErr := waitForBuild(ctx, db, startBuild.ID); waitErr != nil {
			return buildFailureToolResponse(
				ctx,
				options.Logger,
				db,
				options.OwnerID,
				ws.OrganizationID,
				buildFailureActionStart,
				startBuild.ID,
				xerrors.Errorf("workspace start build failed: %w", waitErr),
			), nil
		}

		result := waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, startBuild.ID)

		// When the new build uses a different template version than
		// the previous one, tell the model an auto-update occurred.
		previousVersion := latestBuild.TemplateVersionID
		startedVersion := startBuild.TemplateVersionID
		if startedVersion != uuid.Nil && previousVersion != uuid.Nil && startedVersion != previousVersion {
			result["updated_to_active_version"] = true
			result["update_reason"] = "template requires active versions"
			result["message"] = "Workspace started and was updated to the active template version because the template requires active versions."
		}

		notifyReady(ctx)
		return toolResponse(result), nil
	}

	return fantasy.NewAgentTool("start_workspace", description, handler)
}
// waitForAgentAndRespond looks up the agents of the workspace's latest
// build, picks the chat agent, waits for it to become ready, and
// reports the outcome as a result map.
//
// Every result contains "started": true and "workspace_name". buildID
// is handed to setBuildID and setNoBuild for each result: pass the ID
// of the build that was triggered or awaited so the frontend can fetch
// its logs, or uuid.Nil when no build was triggered (e.g. the workspace
// was already running) so the frontend can suppress the build-log
// section.
//
// Outcomes:
//   - agent lookup failed or returned nothing: "agent_status" is
//     "no_agent"; the workspace is still reported as started.
//   - agent selection failed: "agent_status" is "selection_error" and
//     "agent_error" holds the error text.
//   - otherwise: the fields returned by waitForAgentReady are merged
//     into the result, overriding any keys already present.
//
// The caller converts the returned map to a fantasy.ToolResponse via
// toolResponse() and may add extra fields before doing so.
func waitForAgentAndRespond(
	ctx context.Context,
	db database.Store,
	agentConnFn AgentConnFunc,
	ws database.Workspace,
	buildID uuid.UUID,
) map[string]any {
	// finish adds the fields shared by every outcome and then applies
	// the build ID markers.
	finish := func(fields map[string]any) map[string]any {
		fields["started"] = true
		fields["workspace_name"] = ws.Name
		setBuildID(fields, buildID)
		setNoBuild(fields, buildID)
		return fields
	}

	agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID)
	if err != nil || len(agents) == 0 {
		// Workspace started but no agent found - still report
		// success so the model knows the workspace is up.
		return finish(map[string]any{
			"agent_status": "no_agent",
		})
	}

	chosen, err := agentselect.FindChatAgent(agents)
	if err != nil {
		return finish(map[string]any{
			"agent_status": "selection_error",
			"agent_error":  err.Error(),
		})
	}

	out := finish(map[string]any{})
	for key, value := range waitForAgentReady(ctx, db, chosen.ID, agentConnFn) {
		out[key] = value
	}
	return out
}