mirror of
https://github.com/coder/coder.git
synced 2026-06-03 13:08:25 +00:00
a02339c66a
- **computeruse.go**: Decode base64 screenshot data before storing in
`ToolResponse.Data` (was casting base64 string to bytes without
decoding)
- **chatloop.go**: Re-encode `ToolResponse.Data` to base64 via
`base64.StdEncoding.EncodeToString` instead of `string()` cast
- **mcpclient.go**: UTF-8 validate all text from MCP responses in
`convertCallResult()` using `strings.ToValidUTF8`
- **chatprompt.go (persist)**: Defense-in-depth UTF-8 sanitization of
text and media Text fields before database storage
- **chatprompt.go (replay)**: Antivenom layer that validates base64 and
UTF-8 at read time, auto-healing already-poisoned chats without
requiring a migration
- Adds `TestToolResultAntivenom`: 4 subtests covering poisoned text, poisoned
media, valid media round-trip, and media with invalid UTF-8 text
- Adds `TestConvertCallResult_UTF8Sanitization`: 4 subtests covering invalid
UTF-8 in TextContent, EmbeddedResource, valid passthrough, and
multi-part
- Adds `TestComputerUseTool_Run_ScreenshotDataIsDecodedBinary`: Verifies no
double-encode in the computer-use path
- Updated existing computer-use tests for the new decoded-binary
contract
> 🤖
264 lines
7.7 KiB
Go
264 lines
7.7 KiB
Go
package chattool
|
|
|
|
import (
|
|
"context"
|
|
"encoding/base64"
|
|
"fmt"
|
|
"time"
|
|
|
|
"charm.land/fantasy"
|
|
fantasyanthropic "charm.land/fantasy/providers/anthropic"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/codersdk/workspacesdk"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
// Model selection for computer-use subagents.
const (
	// ComputerUseModelProvider names the provider that serves the
	// computer-use model.
	ComputerUseModelProvider = "anthropic"

	// ComputerUseModelName names the model that computer-use subagents
	// run on.
	ComputerUseModelName = "claude-opus-4-6"
)
|
|
|
|
// computerUseTool implements fantasy.AgentTool and
|
|
// chatloop.ToolDefiner for Anthropic computer use.
|
|
type computerUseTool struct {
|
|
declaredWidth int
|
|
declaredHeight int
|
|
getWorkspaceConn func(ctx context.Context) (workspacesdk.AgentConn, error)
|
|
storeFile StoreFileFunc
|
|
providerOptions fantasy.ProviderOptions
|
|
clock quartz.Clock
|
|
logger slog.Logger
|
|
}
|
|
|
|
// NewComputerUseTool creates a computer use AgentTool that delegates to the
|
|
// agent's desktop endpoints. declaredWidth and declaredHeight are the
|
|
// model-facing desktop dimensions advertised to Anthropic and requested for
|
|
// screenshots.
|
|
func NewComputerUseTool(
|
|
declaredWidth, declaredHeight int,
|
|
getWorkspaceConn func(ctx context.Context) (workspacesdk.AgentConn, error),
|
|
storeFile StoreFileFunc,
|
|
clock quartz.Clock,
|
|
logger slog.Logger,
|
|
) fantasy.AgentTool {
|
|
return &computerUseTool{
|
|
declaredWidth: declaredWidth,
|
|
declaredHeight: declaredHeight,
|
|
getWorkspaceConn: getWorkspaceConn,
|
|
storeFile: storeFile,
|
|
clock: clock,
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
func (*computerUseTool) Info() fantasy.ToolInfo {
|
|
return fantasy.ToolInfo{
|
|
Name: "computer",
|
|
Description: "Control the desktop: take screenshots, move the mouse, click, type, and scroll. " +
|
|
"Use an explicit screenshot action when you want to share a screenshot with the user; " +
|
|
"those screenshots are also attached to the chat.",
|
|
Parameters: map[string]any{},
|
|
Required: []string{},
|
|
}
|
|
}
|
|
|
|
// ComputerUseProviderTool creates the provider-defined Anthropic computer-use
|
|
// tool definition using the declared model-facing desktop geometry.
|
|
func ComputerUseProviderTool(declaredWidth, declaredHeight int) fantasy.Tool {
|
|
// The run callback is nil because execution is handled separately
|
|
// by the AgentTool runner in the chatloop. We extract just the
|
|
// provider-defined tool definition.
|
|
return fantasyanthropic.NewComputerUseTool(
|
|
fantasyanthropic.ComputerUseToolOptions{
|
|
DisplayWidthPx: int64(declaredWidth),
|
|
DisplayHeightPx: int64(declaredHeight),
|
|
ToolVersion: fantasyanthropic.ComputerUse20251124,
|
|
},
|
|
nil,
|
|
).Definition()
|
|
}
|
|
|
|
func (t *computerUseTool) ProviderOptions() fantasy.ProviderOptions {
|
|
return t.providerOptions
|
|
}
|
|
|
|
func (t *computerUseTool) SetProviderOptions(opts fantasy.ProviderOptions) {
|
|
t.providerOptions = opts
|
|
}
|
|
|
|
func (t *computerUseTool) Run(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolResponse, error) {
|
|
input, err := fantasyanthropic.ParseComputerUseInput(call.Input)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
fmt.Sprintf("invalid computer use input: %v", err),
|
|
), nil
|
|
}
|
|
|
|
conn, err := t.getWorkspaceConn(ctx)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
fmt.Sprintf("failed to connect to workspace: %v", err),
|
|
), nil
|
|
}
|
|
|
|
declaredWidth, declaredHeight := t.declaredActionDimensions()
|
|
|
|
// For wait actions, sleep then return a screenshot.
|
|
if input.Action == fantasyanthropic.ActionWait {
|
|
d := input.Duration
|
|
if d <= 0 {
|
|
d = 1000
|
|
}
|
|
timer := t.clock.NewTimer(time.Duration(d)*time.Millisecond, "computeruse", "wait")
|
|
defer timer.Stop()
|
|
select {
|
|
case <-ctx.Done():
|
|
case <-timer.C:
|
|
}
|
|
return t.captureScreenshot(ctx, conn, declaredWidth, declaredHeight)
|
|
}
|
|
|
|
// For screenshot action, use ExecuteDesktopAction.
|
|
if input.Action == fantasyanthropic.ActionScreenshot {
|
|
return t.captureSharedScreenshot(ctx, conn, declaredWidth, declaredHeight)
|
|
}
|
|
|
|
// Build the action request.
|
|
action := workspacesdk.DesktopAction{
|
|
Action: string(input.Action),
|
|
ScaledWidth: &declaredWidth,
|
|
ScaledHeight: &declaredHeight,
|
|
}
|
|
if input.Coordinate != ([2]int64{}) {
|
|
coord := [2]int{int(input.Coordinate[0]), int(input.Coordinate[1])}
|
|
action.Coordinate = &coord
|
|
}
|
|
if input.StartCoordinate != ([2]int64{}) {
|
|
coord := [2]int{int(input.StartCoordinate[0]), int(input.StartCoordinate[1])}
|
|
action.StartCoordinate = &coord
|
|
}
|
|
if input.Text != "" {
|
|
action.Text = &input.Text
|
|
}
|
|
if input.Duration > 0 {
|
|
d := int(input.Duration)
|
|
action.Duration = &d
|
|
}
|
|
if input.ScrollAmount > 0 {
|
|
s := int(input.ScrollAmount)
|
|
action.ScrollAmount = &s
|
|
}
|
|
if input.ScrollDirection != "" {
|
|
action.ScrollDirection = &input.ScrollDirection
|
|
}
|
|
|
|
// Execute the action.
|
|
_, err = conn.ExecuteDesktopAction(ctx, action)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
fmt.Sprintf("action %q failed: %v", input.Action, err),
|
|
), nil
|
|
}
|
|
|
|
// Take a screenshot after every action (Anthropic pattern).
|
|
return t.captureScreenshot(ctx, conn, declaredWidth, declaredHeight)
|
|
}
|
|
|
|
func (t *computerUseTool) captureScreenshot(
|
|
ctx context.Context,
|
|
conn workspacesdk.AgentConn,
|
|
declaredWidth, declaredHeight int,
|
|
) (fantasy.ToolResponse, error) {
|
|
screenResp, err := executeScreenshotAction(ctx, conn, declaredWidth, declaredHeight)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
fmt.Sprintf("screenshot failed: %v", err),
|
|
), nil
|
|
}
|
|
screenData, err := base64.StdEncoding.DecodeString(screenResp.ScreenshotData)
|
|
if err != nil {
|
|
t.logger.Error(ctx, "failed to decode screenshot base64 in captureScreenshot",
|
|
slog.Error(err),
|
|
)
|
|
return fantasy.NewTextErrorResponse(
|
|
fmt.Sprintf("failed to decode screenshot data: %v", err),
|
|
), nil
|
|
}
|
|
return fantasy.NewImageResponse(screenData, "image/png"), nil
|
|
}
|
|
|
|
func (t *computerUseTool) captureSharedScreenshot(
|
|
ctx context.Context,
|
|
conn workspacesdk.AgentConn,
|
|
declaredWidth, declaredHeight int,
|
|
) (fantasy.ToolResponse, error) {
|
|
screenResp, err := executeScreenshotAction(ctx, conn, declaredWidth, declaredHeight)
|
|
if err != nil {
|
|
return fantasy.NewTextErrorResponse(
|
|
fmt.Sprintf("screenshot failed: %v", err),
|
|
), nil
|
|
}
|
|
|
|
screenData, err := base64.StdEncoding.DecodeString(screenResp.ScreenshotData)
|
|
if err != nil {
|
|
t.logger.Error(ctx, "failed to decode screenshot base64 in captureSharedScreenshot",
|
|
slog.Error(err),
|
|
)
|
|
return fantasy.NewTextErrorResponse(
|
|
fmt.Sprintf("failed to decode screenshot data: %v", err),
|
|
), nil
|
|
}
|
|
|
|
attachmentName := fmt.Sprintf(
|
|
"screenshot-%s.png",
|
|
t.clock.Now().UTC().Format("2006-01-02T15-04-05Z"),
|
|
)
|
|
if t.storeFile == nil {
|
|
t.logger.Warn(ctx, "screenshot attachment storage is not configured")
|
|
return fantasy.NewImageResponse(screenData, "image/png"), nil
|
|
}
|
|
|
|
response := fantasy.NewImageResponse(screenData, "image/png")
|
|
|
|
attachment, err := storeScreenshotAttachment(
|
|
ctx,
|
|
t.storeFile,
|
|
attachmentName,
|
|
screenResp.ScreenshotData,
|
|
)
|
|
if err != nil {
|
|
t.logger.Warn(ctx, "failed to persist screenshot attachment",
|
|
slog.F("attachment_name", attachmentName),
|
|
slog.Error(err),
|
|
)
|
|
return response, nil
|
|
}
|
|
return WithAttachments(response, attachment), nil
|
|
}
|
|
|
|
func executeScreenshotAction(
|
|
ctx context.Context,
|
|
conn workspacesdk.AgentConn,
|
|
declaredWidth, declaredHeight int,
|
|
) (workspacesdk.DesktopActionResponse, error) {
|
|
screenshotAction := workspacesdk.DesktopAction{
|
|
Action: "screenshot",
|
|
ScaledWidth: &declaredWidth,
|
|
ScaledHeight: &declaredHeight,
|
|
}
|
|
return conn.ExecuteDesktopAction(ctx, screenshotAction)
|
|
}
|
|
|
|
func (t *computerUseTool) declaredActionDimensions() (declaredWidth, declaredHeight int) {
|
|
if t.declaredWidth <= 0 || t.declaredHeight <= 0 {
|
|
geometry := workspacesdk.DefaultDesktopGeometry()
|
|
return geometry.DeclaredWidth, geometry.DeclaredHeight
|
|
}
|
|
return t.declaredWidth, t.declaredHeight
|
|
}
|