mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
ef0151601e
## Summary When a workspace build fails because the user is over their group quota, the chat tools currently surface the failure as a bare `"workspace build failed: insufficient quota"` string with no machine-readable error code and no visibility into the user's current usage. Agents and the UI cannot distinguish quota failures from any other Terraform error, so users see an opaque message and have no clear path to recovery. This PR tags quota failures with a typed error code at the source and propagates it through the chat tool layer so callers can react to it explicitly. Relates to CODAGT-20 ## Changes **Provisioner runner** - Add `InsufficientQuotaErrorCode = "INSUFFICIENT_QUOTA"` and set it explicitly at the `commitQuota` failure site via a new `failedWorkspaceBuildfCode` helper, so `provisioner_jobs.error_code` is populated only on the genuine quota path. The substring matcher used for externally produced sentinels (e.g. `"missing parameter"`, `"required template variables"`) is intentionally not extended; provider errors that happen to mention "insufficient quota" stay classified as generic build failures. **SDK and API contract** - Add `JobErrorCodeInsufficientQuota` and a `JobIsInsufficientQuotaErrorCode` helper to `codersdk`. - Extend the swagger `enums` tag on `ProvisionerJob.ErrorCode` to include `INSUFFICIENT_QUOTA`. - Regenerate `coderd/apidoc`, `docs/reference/api/*`, and `site/src/api/typesGenerated.ts`. **chattool create_workspace / start_workspace** - `waitForBuild` now returns a typed `*workspaceBuildError` carrying both the message and the `JobErrorCode`, instead of a bare error string. - New `quotaerror.go` introduces a structured `quotaErrorResult` (with `error_code`, `title`, `message`, `build_id`, and optional `quota`) and a best-effort `workspaceQuotaDetails` lookup that wraps owner authorization internally and fetches `credits_consumed` and `budget` from the database. Quota lookup failures (including authorization failures) never block the failure payload. - On quota-coded build failures, both `create_workspace` and `start_workspace` now return the structured response (with the recovery guidance inlined into `message`) instead of the bare `"insufficient quota"` string. This applies to all three failure paths: post-creation, an in-progress existing build, and a freshly triggered start build. Non-quota build failures continue to use the existing `buildToolResponse` / `newBuildError` path. - Owner authorization is wrapped only on the call sites that need it (the `CreateFn` and `StartFn` invocations and the quota-detail lookup), so idempotent fast paths (already running, already in progress, existing-workspace early returns) do not pay for an extra RBAC round-trip or fail when role lookup is transient. ## Out of scope - No changes to quota math, allowances, or bypass behavior. - No automatic retries. - No new quota-inspection tools and no changes to MCP `coder_create_workspace` (which returns immediately and never observed the build outcome here). - No frontend UI changes; those will land in a follow-up PR that consumes the new `INSUFFICIENT_QUOTA` code.
315 lines
10 KiB
Go
315 lines
10 KiB
Go
package chattool
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
|
|
"charm.land/fantasy"
|
|
"github.com/google/uuid"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/httpapi/httperror"
|
|
"github.com/coder/coder/v2/coderd/x/chatd/internal/agentselect"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
)
|
|
|
|
// StartWorkspaceFn starts a workspace by creating a new build with
|
|
// the "start" transition.
|
|
type StartWorkspaceFn func(
|
|
ctx context.Context,
|
|
ownerID uuid.UUID,
|
|
workspaceID uuid.UUID,
|
|
req codersdk.CreateWorkspaceBuildRequest,
|
|
) (codersdk.WorkspaceBuild, error)
|
|
|
|
// StartWorkspaceOptions configures the start_workspace tool.
|
|
type StartWorkspaceOptions struct {
|
|
OwnerID uuid.UUID
|
|
StartFn StartWorkspaceFn
|
|
AgentConnFn AgentConnFunc
|
|
WorkspaceMu *sync.Mutex
|
|
OnChatUpdated func(database.Chat)
|
|
Logger slog.Logger
|
|
}
|
|
|
|
type startWorkspaceArgs struct {
|
|
Parameters map[string]string `json:"parameters,omitempty"`
|
|
}
|
|
|
|
// StartWorkspace returns a tool that starts a stopped workspace
|
|
// associated with the current chat. The tool is idempotent: if the
|
|
// workspace is already running or building, it returns immediately.
|
|
// db must not be nil and chatID must not be uuid.Nil.
|
|
func StartWorkspace(db database.Store, chatID uuid.UUID, options StartWorkspaceOptions) fantasy.AgentTool {
|
|
return fantasy.NewAgentTool(
|
|
"start_workspace",
|
|
"Start the chat's workspace if it is currently stopped. "+
|
|
"This tool is idempotent — if the workspace is already "+
|
|
"running, it returns immediately. Use create_workspace "+
|
|
"first if no workspace exists yet. Provide parameter "+
|
|
"values (from read_template) only if necessary or "+
|
|
"explicitly requested by the user.",
|
|
func(ctx context.Context, args startWorkspaceArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
|
|
if options.StartFn == nil {
|
|
return fantasy.NewTextErrorResponse("workspace starter is not configured"), nil
|
|
}
|
|
|
|
// Serialize with create_workspace to prevent races.
|
|
if options.WorkspaceMu != nil {
|
|
options.WorkspaceMu.Lock()
|
|
defer options.WorkspaceMu.Unlock()
|
|
}
|
|
|
|
chat, err := db.GetChatByID(ctx, chatID)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
xerrors.Errorf("load chat: %w", err).Error(),
|
|
), nil
|
|
}
|
|
if !chat.WorkspaceID.Valid {
|
|
return fantasy.NewTextErrorResponse(
|
|
"chat has no workspace; use create_workspace first",
|
|
), nil
|
|
}
|
|
|
|
ws, err := db.GetWorkspaceByID(ctx, chat.WorkspaceID.UUID)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
xerrors.Errorf("load workspace: %w", err).Error(),
|
|
), nil
|
|
}
|
|
if ws.Deleted {
|
|
return fantasy.NewTextErrorResponse(
|
|
"workspace was deleted; use create_workspace to make a new one",
|
|
), nil
|
|
}
|
|
|
|
build, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, ws.ID)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
xerrors.Errorf("get latest build: %w", err).Error(),
|
|
), nil
|
|
}
|
|
|
|
job, err := db.GetProvisionerJobByID(ctx, build.JobID)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
xerrors.Errorf("get provisioner job: %w", err).Error(),
|
|
), nil
|
|
}
|
|
|
|
// If a build is already in progress, wait for it.
|
|
switch job.JobStatus {
|
|
case database.ProvisionerJobStatusPending,
|
|
database.ProvisionerJobStatusRunning:
|
|
// Publish the build ID to the frontend so it
|
|
// can start streaming logs immediately.
|
|
updatedChat, bindErr := db.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{
|
|
ID: chatID,
|
|
WorkspaceID: uuid.NullUUID{UUID: ws.ID, Valid: true},
|
|
BuildID: uuid.NullUUID{
|
|
UUID: build.ID,
|
|
Valid: build.ID != uuid.Nil,
|
|
},
|
|
AgentID: uuid.NullUUID{},
|
|
})
|
|
if bindErr != nil {
|
|
options.Logger.Error(ctx, "failed to persist build ID on chat binding",
|
|
slog.F("chat_id", chatID),
|
|
slog.F("build_id", build.ID),
|
|
slog.Error(bindErr),
|
|
)
|
|
} else if options.OnChatUpdated != nil {
|
|
options.OnChatUpdated(updatedChat)
|
|
}
|
|
if err := waitForBuild(ctx, db, build.ID); err != nil {
|
|
// newBuildError returns via toolResponse (IsError: false)
|
|
// rather than NewTextErrorResponse (IsError: true) so the
|
|
// JSON result preserves build_id for the frontend's log
|
|
// viewer. The fantasy/chatprompt pipeline discards structured
|
|
// fields from IsError content.
|
|
// The frontend detects errors via the "error" key instead.
|
|
return buildFailureToolResponse(
|
|
ctx,
|
|
options.Logger,
|
|
db,
|
|
options.OwnerID,
|
|
ws.OrganizationID,
|
|
buildFailureActionStart,
|
|
build.ID,
|
|
xerrors.Errorf("waiting for in-progress build: %w", err),
|
|
), nil
|
|
}
|
|
result := waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, build.ID)
|
|
// Re-fire after the agent is fully ready so
|
|
// callers can load instruction files (AGENTS.md).
|
|
// This must happen after waitForAgentAndRespond —
|
|
// firing earlier races with agent startup.
|
|
if options.OnChatUpdated != nil {
|
|
if latest, err := db.GetChatByID(ctx, chatID); err == nil {
|
|
options.OnChatUpdated(latest)
|
|
}
|
|
}
|
|
return toolResponse(result), nil
|
|
case database.ProvisionerJobStatusSucceeded:
|
|
// If the latest successful build is a start
|
|
// transition, the workspace should be running.
|
|
if build.Transition == database.WorkspaceTransitionStart {
|
|
return toolResponse(waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, uuid.Nil)), nil
|
|
}
|
|
// Otherwise it is stopped (or deleted) — proceed
|
|
// to start it below.
|
|
|
|
default:
|
|
// Failed, canceled, etc — try starting anyway.
|
|
}
|
|
|
|
// Set up dbauthz context for the start call.
|
|
ownerCtx, ownerErr := asOwner(ctx, db, options.OwnerID)
|
|
if ownerErr != nil {
|
|
return fantasy.NewTextErrorResponse(ownerErr.Error()), nil
|
|
}
|
|
|
|
startReq := codersdk.CreateWorkspaceBuildRequest{
|
|
Transition: codersdk.WorkspaceTransitionStart,
|
|
}
|
|
for k, v := range args.Parameters {
|
|
startReq.RichParameterValues = append(
|
|
startReq.RichParameterValues,
|
|
codersdk.WorkspaceBuildParameter{Name: k, Value: v},
|
|
)
|
|
}
|
|
|
|
startBuild, err := options.StartFn(ownerCtx, options.OwnerID, ws.ID, startReq)
|
|
if err != nil {
|
|
if responseErr, ok := httperror.IsResponder(err); ok {
|
|
_, resp := responseErr.Response()
|
|
result := responseErrorResult(resp)
|
|
if len(resp.Validations) > 0 && ws.TemplateID != uuid.Nil {
|
|
result["template_id"] = ws.TemplateID.String()
|
|
}
|
|
return toolResponse(result), nil
|
|
}
|
|
return fantasy.NewTextErrorResponse(
|
|
xerrors.Errorf("start workspace: %w", err).Error(),
|
|
), nil
|
|
}
|
|
|
|
// Persist the build ID on the chat binding so the
|
|
// frontend can stream logs without polling.
|
|
updatedChat, bindErr := db.UpdateChatWorkspaceBinding(ctx, database.UpdateChatWorkspaceBindingParams{
|
|
ID: chatID,
|
|
WorkspaceID: uuid.NullUUID{UUID: ws.ID, Valid: true},
|
|
BuildID: uuid.NullUUID{
|
|
UUID: startBuild.ID,
|
|
Valid: startBuild.ID != uuid.Nil,
|
|
},
|
|
AgentID: uuid.NullUUID{},
|
|
})
|
|
if bindErr != nil {
|
|
options.Logger.Error(ctx, "failed to persist build ID on chat binding",
|
|
slog.F("chat_id", chatID),
|
|
slog.F("build_id", startBuild.ID),
|
|
slog.Error(bindErr),
|
|
)
|
|
} else if options.OnChatUpdated != nil {
|
|
options.OnChatUpdated(updatedChat)
|
|
}
|
|
if err := waitForBuild(ctx, db, startBuild.ID); err != nil {
|
|
return buildFailureToolResponse(
|
|
ctx,
|
|
options.Logger,
|
|
db,
|
|
options.OwnerID,
|
|
ws.OrganizationID,
|
|
buildFailureActionStart,
|
|
startBuild.ID,
|
|
xerrors.Errorf("workspace start build failed: %w", err),
|
|
), nil
|
|
}
|
|
|
|
result := waitForAgentAndRespond(ctx, db, options.AgentConnFn, ws, startBuild.ID)
|
|
|
|
// If the template version changed, annotate the
|
|
// response so the model knows an auto-update
|
|
// occurred.
|
|
if startBuild.TemplateVersionID != uuid.Nil &&
|
|
build.TemplateVersionID != uuid.Nil &&
|
|
startBuild.TemplateVersionID != build.TemplateVersionID {
|
|
result["updated_to_active_version"] = true
|
|
result["update_reason"] = "template requires active versions"
|
|
result["message"] = "Workspace started and was updated to the active template version because the template requires active versions."
|
|
}
|
|
|
|
// Re-fire after the agent is fully ready so
|
|
// callers can load instruction files (AGENTS.md).
|
|
// This must happen after waitForAgentAndRespond —
|
|
// firing earlier races with agent startup.
|
|
if options.OnChatUpdated != nil {
|
|
if latest, err := db.GetChatByID(ctx, chatID); err == nil {
|
|
options.OnChatUpdated(latest)
|
|
}
|
|
}
|
|
return toolResponse(result), nil
|
|
})
|
|
}
|
|
|
|
// waitForAgentAndRespond selects the chat agent from the workspace's
|
|
// latest build, waits for it to become reachable, and returns a
|
|
// result map. When buildID is non-zero, it is included in the
|
|
// result so the frontend can fetch historical build logs. Pass
|
|
// uuid.Nil when no build was triggered (e.g. workspace already
|
|
// running); the result will include no_build: true so the
|
|
// frontend can suppress the build-log section.
|
|
//
|
|
// The caller is responsible for converting the returned map to a
|
|
// fantasy.ToolResponse via toolResponse(), and may add extra
|
|
// fields before doing so.
|
|
func waitForAgentAndRespond(
|
|
ctx context.Context,
|
|
db database.Store,
|
|
agentConnFn AgentConnFunc,
|
|
ws database.Workspace,
|
|
buildID uuid.UUID,
|
|
) map[string]any {
|
|
agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, ws.ID)
|
|
if err != nil || len(agents) == 0 {
|
|
// Workspace started but no agent found - still report
|
|
// success so the model knows the workspace is up.
|
|
result := map[string]any{
|
|
"started": true,
|
|
"workspace_name": ws.Name,
|
|
"agent_status": "no_agent",
|
|
}
|
|
setBuildID(result, buildID)
|
|
setNoBuild(result, buildID)
|
|
return result
|
|
}
|
|
|
|
selected, err := agentselect.FindChatAgent(agents)
|
|
if err != nil {
|
|
result := map[string]any{
|
|
"started": true,
|
|
"workspace_name": ws.Name,
|
|
"agent_status": "selection_error",
|
|
"agent_error": err.Error(),
|
|
}
|
|
setBuildID(result, buildID)
|
|
setNoBuild(result, buildID)
|
|
return result
|
|
}
|
|
|
|
result := map[string]any{
|
|
"started": true,
|
|
"workspace_name": ws.Name,
|
|
}
|
|
setBuildID(result, buildID)
|
|
setNoBuild(result, buildID)
|
|
for k, v := range waitForAgentReady(ctx, db, selected.ID, agentConnFn) {
|
|
result[k] = v
|
|
}
|
|
return result
|
|
}
|