Files
coder/coderd/x/chatd/chattool/computeruse.go
T
Ethan ef6969dd70 feat(coderd/x/chatd): agent-created file attachments in chat (#24280)
Agents can already see workspace files and take screenshots, but users could not download those artifacts from chat. This PR adds durable chat attachments to chatd. `attach_file`, explicit `computer` screenshot actions (not the automatic post-action screenshots), and `propose_plan` now fetch bytes over the agent connection, store them in `chat_files`, link them to the chat, and carry attachment metadata in tool responses so `buildAssistantPartsForPersist` can materialize ordinary `type:"file"` assistant parts that the chat file APIs serve.

The same storage helpers are reused for other artifact-producing paths. `wait_agent` recordings and thumbnails are stored as chat files and linked back to the parent chat, with best-effort relinking so parent chats retain those artifacts without leaving orphaned rows when chat-file caps reject links. `storeChatAttachment` wraps insert + link in one transaction, files are capped at 10 MB each and 20 per chat, and serving defaults to `Content-Disposition: attachment` with an explicit inline-safe allowlist.

This PR also consolidates chat-file media policy in `coderd/chatfiles`. Uploads and tool-generated attachments share byte-based MIME detection, SVG blocking, inline-safety rules, and compatible `text/plain` refinement for JSON, CSV, and Markdown. Prompt construction still only inlines synthetic pasted text for model consumption; assistant-created attachments are persisted for the user and intentionally not replayed into later LLM turns.

UI follow-up lives in #24281.

Relates to CODAGT-91
2026-04-20 18:04:35 +10:00

243 lines
7.1 KiB
Go

package chattool
import (
"context"
"fmt"
"time"
"charm.land/fantasy"
fantasyanthropic "charm.land/fantasy/providers/anthropic"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/quartz"
)
const (
// ComputerUseModelProvider names the model provider used for computer
// use.
ComputerUseModelProvider = "anthropic"
// ComputerUseModelName is the identifier of the model that computer use
// subagents run on.
ComputerUseModelName = "claude-opus-4-6"
)
// computerUseTool implements fantasy.AgentTool and
// chatloop.ToolDefiner for Anthropic computer use.
type computerUseTool struct {
// declaredWidth and declaredHeight are the model-facing desktop
// dimensions. When either is non-positive, declaredActionDimensions
// substitutes the default desktop geometry.
declaredWidth int
declaredHeight int
// getWorkspaceConn is called on every Run to obtain the agent connection
// that desktop actions are sent over.
getWorkspaceConn func(ctx context.Context) (workspacesdk.AgentConn, error)
// storeFile persists explicit screenshots as chat attachments. It may be
// nil, in which case screenshots are returned without an attachment.
storeFile StoreFileFunc
// providerOptions is read and written through ProviderOptions and
// SetProviderOptions; the constructor leaves it at its zero value.
providerOptions fantasy.ProviderOptions
// clock drives the wait-action timer and the timestamp embedded in
// screenshot attachment names.
clock quartz.Clock
// logger receives warnings when a screenshot attachment cannot be stored.
logger slog.Logger
}
// NewComputerUseTool builds the "computer" AgentTool, which forwards desktop
// actions to the workspace agent returned by getWorkspaceConn.
//
// declaredWidth and declaredHeight are the model-facing desktop dimensions
// advertised to Anthropic and requested for screenshots. storeFile is used to
// persist explicit screenshots as chat attachments and may be nil, in which
// case screenshots are returned without being attached. clock drives wait
// timers and attachment timestamps; logger receives attachment warnings.
func NewComputerUseTool(
	declaredWidth, declaredHeight int,
	getWorkspaceConn func(ctx context.Context) (workspacesdk.AgentConn, error),
	storeFile StoreFileFunc,
	clock quartz.Clock,
	logger slog.Logger,
) fantasy.AgentTool {
	tool := new(computerUseTool)

	// Model-facing geometry.
	tool.declaredWidth = declaredWidth
	tool.declaredHeight = declaredHeight

	// Collaborators.
	tool.getWorkspaceConn = getWorkspaceConn
	tool.storeFile = storeFile
	tool.clock = clock
	tool.logger = logger

	return tool
}
// Info returns the static tool metadata for the "computer" tool. Parameters
// and Required are deliberately empty, non-nil collections.
func (*computerUseTool) Info() fantasy.ToolInfo {
	const description = "Control the desktop: take screenshots, move the mouse, click, type, and scroll. " +
		"Use an explicit screenshot action when you want to share a screenshot with the user; " +
		"those screenshots are also attached to the chat."

	var info fantasy.ToolInfo
	info.Name = "computer"
	info.Description = description
	info.Parameters = map[string]any{}
	info.Required = []string{}
	return info
}
// ComputerUseProviderTool returns the provider-defined Anthropic computer-use
// tool definition for a desktop of the given declared (model-facing) size.
func ComputerUseProviderTool(declaredWidth, declaredHeight int) fantasy.Tool {
	opts := fantasyanthropic.ComputerUseToolOptions{
		ToolVersion:     fantasyanthropic.ComputerUse20251124,
		DisplayWidthPx:  int64(declaredWidth),
		DisplayHeightPx: int64(declaredHeight),
	}

	// No run callback is supplied: execution is handled separately by the
	// AgentTool runner in the chatloop, so only the provider-defined tool
	// definition is extracted here.
	providerTool := fantasyanthropic.NewComputerUseTool(opts, nil)
	return providerTool.Definition()
}
// ProviderOptions returns the provider options currently held by the tool.
// This is the zero value until SetProviderOptions has been called.
func (t *computerUseTool) ProviderOptions() fantasy.ProviderOptions {
return t.providerOptions
}
// SetProviderOptions replaces the provider options held by the tool with
// opts. No synchronization is performed here.
func (t *computerUseTool) SetProviderOptions(opts fantasy.ProviderOptions) {
t.providerOptions = opts
}
// Run executes one computer-use tool call against the workspace desktop.
//
// call.Input is parsed as an Anthropic computer-use action and handled as
// follows:
//   - wait: sleep for the requested duration, then return a screenshot. If
//     ctx is cancelled while sleeping, an error response is returned instead
//     of attempting a screenshot on the dead context.
//   - screenshot: capture a screenshot and, when attachment storage is
//     configured, attach it to the chat.
//   - any other action: forward it to the agent, then return a follow-up
//     screenshot.
//
// Every failure is reported to the model as a text error response; the
// returned error is always nil.
func (t *computerUseTool) Run(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolResponse, error) {
	input, err := fantasyanthropic.ParseComputerUseInput(call.Input)
	if err != nil {
		return fantasy.NewTextErrorResponse(
			fmt.Sprintf("invalid computer use input: %v", err),
		), nil
	}

	conn, err := t.getWorkspaceConn(ctx)
	if err != nil {
		return fantasy.NewTextErrorResponse(
			fmt.Sprintf("failed to connect to workspace: %v", err),
		), nil
	}

	declaredWidth, declaredHeight := t.declaredActionDimensions()

	switch input.Action {
	case fantasyanthropic.ActionWait:
		// NOTE(review): Duration is interpreted as milliseconds here, with a
		// one-second default when unset or non-positive — confirm this
		// matches the unit the provider emits for wait actions.
		d := input.Duration
		if d <= 0 {
			d = 1000
		}
		timer := t.clock.NewTimer(time.Duration(d)*time.Millisecond, "computeruse", "wait")
		defer timer.Stop()

		select {
		case <-ctx.Done():
			// The screenshot request would only fail on the cancelled
			// context, so report the interruption directly.
			return fantasy.NewTextErrorResponse(
				fmt.Sprintf("wait interrupted: %v", ctx.Err()),
			), nil
		case <-timer.C:
		}
		return t.captureScreenshot(ctx, conn, declaredWidth, declaredHeight)

	case fantasyanthropic.ActionScreenshot:
		// Explicit screenshots are the only ones shared with the user as
		// chat attachments.
		return t.captureSharedScreenshot(ctx, conn, declaredWidth, declaredHeight)
	}

	// Build the action request. Optional fields are only populated when the
	// parsed input carries a non-zero value for them.
	action := workspacesdk.DesktopAction{
		Action:       string(input.Action),
		ScaledWidth:  &declaredWidth,
		ScaledHeight: &declaredHeight,
	}
	// A zero-valued coordinate is indistinguishable from an absent one, so an
	// explicit (0,0) is not forwarded.
	if input.Coordinate != ([2]int64{}) {
		coord := [2]int{int(input.Coordinate[0]), int(input.Coordinate[1])}
		action.Coordinate = &coord
	}
	if input.StartCoordinate != ([2]int64{}) {
		coord := [2]int{int(input.StartCoordinate[0]), int(input.StartCoordinate[1])}
		action.StartCoordinate = &coord
	}
	if input.Text != "" {
		action.Text = &input.Text
	}
	if input.Duration > 0 {
		d := int(input.Duration)
		action.Duration = &d
	}
	if input.ScrollAmount > 0 {
		s := int(input.ScrollAmount)
		action.ScrollAmount = &s
	}
	if input.ScrollDirection != "" {
		action.ScrollDirection = &input.ScrollDirection
	}

	// Execute the action; its direct response is not needed because the
	// follow-up screenshot is what gets returned to the model.
	if _, err := conn.ExecuteDesktopAction(ctx, action); err != nil {
		return fantasy.NewTextErrorResponse(
			fmt.Sprintf("action %q failed: %v", input.Action, err),
		), nil
	}

	// Take a screenshot after every action (Anthropic pattern).
	return t.captureScreenshot(ctx, conn, declaredWidth, declaredHeight)
}
// captureScreenshot requests a screenshot at the declared dimensions and
// returns it as an "image/png" image response. The screenshot is not stored
// as a chat attachment. A capture failure is reported as a text error
// response; the returned error is always nil.
func (*computerUseTool) captureScreenshot(
	ctx context.Context,
	conn workspacesdk.AgentConn,
	declaredWidth, declaredHeight int,
) (fantasy.ToolResponse, error) {
	shot, err := executeScreenshotAction(ctx, conn, declaredWidth, declaredHeight)
	if err != nil {
		msg := fmt.Sprintf("screenshot failed: %v", err)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	payload := []byte(shot.ScreenshotData)
	return fantasy.NewImageResponse(payload, "image/png"), nil
}
// captureSharedScreenshot captures a screenshot for an explicit screenshot
// action and tries to persist it as a chat attachment named
// "screenshot-<UTC timestamp>.png".
//
// Persistence is best-effort: when no storeFile is configured, or storing
// fails, a warning is logged and the plain image response is returned. Only
// a capture failure produces an error response. The returned error is always
// nil.
func (t *computerUseTool) captureSharedScreenshot(
	ctx context.Context,
	conn workspacesdk.AgentConn,
	declaredWidth, declaredHeight int,
) (fantasy.ToolResponse, error) {
	shot, err := executeScreenshotAction(ctx, conn, declaredWidth, declaredHeight)
	if err != nil {
		msg := fmt.Sprintf("screenshot failed: %v", err)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	// The clock is read before the storage check so the order of clock
	// interactions does not depend on whether storage is configured.
	stamp := t.clock.Now().UTC().Format("2006-01-02T15-04-05Z")
	name := "screenshot-" + stamp + ".png"

	// The model receives the same image whether or not persistence works.
	image := fantasy.NewImageResponse([]byte(shot.ScreenshotData), "image/png")

	if t.storeFile == nil {
		t.logger.Warn(ctx, "screenshot attachment storage is not configured")
		return image, nil
	}

	stored, storeErr := storeScreenshotAttachment(ctx, t.storeFile, name, shot.ScreenshotData)
	if storeErr != nil {
		t.logger.Warn(ctx, "failed to persist screenshot attachment",
			slog.F("attachment_name", name),
			slog.Error(storeErr),
		)
		return image, nil
	}

	return WithAttachments(image, stored), nil
}
// executeScreenshotAction sends a "screenshot" desktop action over conn,
// requesting the declared width and height as the scaled size, and returns
// the agent's response unchanged.
func executeScreenshotAction(
	ctx context.Context,
	conn workspacesdk.AgentConn,
	declaredWidth, declaredHeight int,
) (workspacesdk.DesktopActionResponse, error) {
	var req workspacesdk.DesktopAction
	req.Action = "screenshot"
	req.ScaledWidth = &declaredWidth
	req.ScaledHeight = &declaredHeight

	return conn.ExecuteDesktopAction(ctx, req)
}
// declaredActionDimensions returns the width and height to use for desktop
// actions. The configured values are used only when both are positive;
// otherwise the default desktop geometry's declared size is returned.
func (t *computerUseTool) declaredActionDimensions() (declaredWidth, declaredHeight int) {
	if t.declaredWidth > 0 && t.declaredHeight > 0 {
		return t.declaredWidth, t.declaredHeight
	}

	fallback := workspacesdk.DefaultDesktopGeometry()
	return fallback.DeclaredWidth, fallback.DeclaredHeight
}