Files
Michael Suchacz 0bb09935bc feat: add computer-use provider selection for AI agents (#24772)
Adds a deployment-wide setting to select the computer-use provider
(Anthropic or OpenAI) for AI agents, plus the OpenAI computer-use runner
needed to honor that selection.

The setting is stored in `site_configs` under
`agents_computer_use_provider`, defaults to Anthropic when unset, and is
exposed via experimental GET/PUT endpoints under
`/api/experimental/chats/config/computer-use-provider`. The chatd
computer-use tool now dispatches to either `runAnthropicComputerUse` or
`runOpenAIComputerUse` based on the resolved provider, with
provider-specific result metadata for OpenAI screenshots.

Frontend adds a provider dropdown to the Agents Experiments settings
page nested under the virtual desktop toggle, with disabled state
handling while virtual desktop is off and skeleton loaders while config
queries are in flight.

Hugo and Codex review follow-up:
- Uses shared provider validation and clearer computer-use constant
names.
- Removes stale OpenAI pending-safety-checks commentary.
- Documents why provider result metadata is needed for OpenAI
screenshots.
- Keeps the computer-use subagent visible when provider credentials are
missing, then returns a clear spawn-time configuration error.
- Uses OpenAI's recommended 1600x900 screenshot geometry to preserve the
native 16:9 aspect ratio.
- Moves OpenAI-specific computer-use helpers into
`coderd/x/chatd/chatopenai/computeruse` after rebasing onto the provider
package refactor in `main`.
- Converts OpenAI pixel scroll deltas to Coder desktop wheel-click
amounts.
- Preserves OpenAI pointer modifiers with key down/up desktop actions
and rejects unsupported non-left double-click buttons explicitly.
- Maps OpenAI back/forward side-button clicks to browser navigation key
actions.
- Defaults omitted OpenAI click buttons to left-click.
- Retries mouse release cleanup if the final OpenAI drag release fails.
- Keeps computer-use subagent availability messages stable when provider
config cannot be loaded, while logging the backend error.
- Releases remaining OpenAI modifier keys if a synthetic key-up cleanup
action fails.
- Updates Storybook interaction stories so provider snapshots show the
selected final provider.

> Mux updated this PR description on behalf of Mike.
2026-05-04 20:30:50 +02:00

444 lines
14 KiB
Go

package chattool
import (
"context"
"encoding/base64"
"fmt"
"slices"
"strings"
"time"
"charm.land/fantasy"
fantasyanthropic "charm.land/fantasy/providers/anthropic"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
openaicomputeruse "github.com/coder/coder/v2/coderd/x/chatd/chatopenai/computeruse"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/quartz"
)
// Provider identifiers accepted by the computer-use tooling.
const (
	// ComputerUseProviderAnthropic is the identifier for Anthropic computer
	// use.
	ComputerUseProviderAnthropic = "anthropic"
	// ComputerUseProviderOpenAI is the identifier for OpenAI computer use.
	ComputerUseProviderOpenAI = "openai"
	// ComputerUseModelProviderDefault is the model provider name used by
	// default for computer use. It aliases ComputerUseProviderAnthropic.
	ComputerUseModelProviderDefault = ComputerUseProviderAnthropic
)

// Default model names, one per computer-use provider.
const (
	// ComputerUseAnthropicModelName is the Anthropic model that computer
	// use subagents run on by default.
	ComputerUseAnthropicModelName = "claude-opus-4-6"
	// ComputerUseOpenAIModelName is the OpenAI model that computer use runs
	// on by default.
	ComputerUseOpenAIModelName = "gpt-5.5"
)
// SupportedComputerUseProviders lists every provider that computer use can
// run against. A new slice is built on each call, so callers may modify the
// result freely.
func SupportedComputerUseProviders() []string {
	providers := make([]string, 0, 2)
	providers = append(providers, ComputerUseProviderAnthropic, ComputerUseProviderOpenAI)
	return providers
}
// IsSupportedComputerUseProvider reports whether provider is one of the values
// returned by SupportedComputerUseProviders. The comparison is exact, so the
// empty string is not considered supported.
func IsSupportedComputerUseProvider(provider string) bool {
	supported := SupportedComputerUseProviders()
	return slices.Contains(supported, provider)
}
// DefaultComputerUseProvider resolves the effective computer use provider:
// a non-empty provider is returned unchanged (it is not validated here), and
// an empty one falls back to ComputerUseProviderAnthropic.
func DefaultComputerUseProvider(provider string) string {
	if provider != "" {
		return provider
	}
	return ComputerUseProviderAnthropic
}
// DefaultComputerUseModel looks up the default model for a computer use
// provider. An empty provider is resolved through DefaultComputerUseProvider
// first. ok is false, and both names are empty, when the provider is not
// recognized.
func DefaultComputerUseModel(provider string) (modelProvider, modelName string, ok bool) {
	switch DefaultComputerUseProvider(provider) {
	case ComputerUseProviderOpenAI:
		// Keep OpenAI isolated here because computer-use models may advance.
		modelProvider, modelName, ok = ComputerUseProviderOpenAI, ComputerUseOpenAIModelName, true
	case ComputerUseProviderAnthropic:
		modelProvider, modelName, ok = ComputerUseModelProviderDefault, ComputerUseAnthropicModelName, true
	}
	return modelProvider, modelName, ok
}
// DefaultComputerUseDesktopGeometry picks the model-facing desktop geometry
// for a computer use provider. OpenAI gets its dedicated geometry; every other
// value, including unrecognized providers, gets the general desktop default.
func DefaultComputerUseDesktopGeometry(provider string) workspacesdk.DesktopGeometry {
	if DefaultComputerUseProvider(provider) == ComputerUseProviderOpenAI {
		return workspacesdk.DefaultOpenAIComputerUseDesktopGeometry()
	}
	return workspacesdk.DefaultDesktopGeometry()
}
// computerUseTool is the provider-aware desktop control tool. It implements
// fantasy.AgentTool and chatloop.ToolDefiner.
type computerUseTool struct {
	// provider selects which provider-specific runner Run dispatches to.
	provider string
	// declaredWidth and declaredHeight are the model-facing desktop
	// dimensions. When either is non-positive, declaredActionDimensions
	// substitutes the provider's default geometry.
	declaredWidth, declaredHeight int
	// getWorkspaceConn supplies the agent connection that desktop actions
	// are executed on.
	getWorkspaceConn func(ctx context.Context) (workspacesdk.AgentConn, error)
	// storeFile persists shared screenshots as attachments. When nil,
	// screenshots are returned without an attachment.
	storeFile StoreFileFunc
	// providerOptions is read and written through ProviderOptions and
	// SetProviderOptions.
	providerOptions fantasy.ProviderOptions
	// clock drives wait timers and screenshot attachment timestamps.
	clock quartz.Clock
	// logger receives warnings and errors raised while running actions.
	logger slog.Logger
}
// NewComputerUseTool builds a computer use AgentTool that forwards actions to
// the agent's desktop endpoints using the runner matching provider. An empty
// provider is resolved through DefaultComputerUseProvider. declaredWidth and
// declaredHeight are the model-facing desktop dimensions advertised to
// providers and requested for screenshots.
func NewComputerUseTool(
	provider string,
	declaredWidth, declaredHeight int,
	getWorkspaceConn func(ctx context.Context) (workspacesdk.AgentConn, error),
	storeFile StoreFileFunc,
	clock quartz.Clock,
	logger slog.Logger,
) fantasy.AgentTool {
	tool := new(computerUseTool)
	tool.provider = DefaultComputerUseProvider(provider)
	tool.declaredWidth, tool.declaredHeight = declaredWidth, declaredHeight
	tool.getWorkspaceConn = getWorkspaceConn
	tool.storeFile = storeFile
	tool.clock = clock
	tool.logger = logger
	return tool
}
// Info describes the tool under the name "computer". Parameters and Required
// are left empty, non-nil collections.
func (*computerUseTool) Info() fantasy.ToolInfo {
	const description = "Control the desktop: take screenshots, move the mouse, click, type, and scroll. " +
		"Use an explicit screenshot action when you want to share a screenshot with the user; " +
		"those screenshots are also attached to the chat."

	info := fantasy.ToolInfo{
		Name:        "computer",
		Description: description,
	}
	info.Parameters = map[string]any{}
	info.Required = []string{}
	return info
}
// ComputerUseProviderTool returns the provider-defined computer-use tool
// definition for the declared model-facing desktop geometry. An empty provider
// is resolved through DefaultComputerUseProvider; any other unrecognized value
// yields an error naming the supported providers.
func ComputerUseProviderTool(provider string, declaredWidth, declaredHeight int) (fantasy.Tool, error) {
	switch DefaultComputerUseProvider(provider) {
	case ComputerUseProviderOpenAI:
		// OpenAI's GA computer tool schema does not accept display
		// dimensions. The declared geometry takes effect through
		// screenshot sizing and desktop action coordinate scaling instead.
		return openaicomputeruse.Tool(), nil
	case ComputerUseProviderAnthropic:
		opts := fantasyanthropic.ComputerUseToolOptions{
			DisplayWidthPx:  int64(declaredWidth),
			DisplayHeightPx: int64(declaredHeight),
			ToolVersion:     fantasyanthropic.ComputerUse20251124,
		}
		// Only the tool definition is needed here. The run callback stays
		// nil because execution is handled separately by the AgentTool
		// runner in the chatloop.
		tool := fantasyanthropic.NewComputerUseTool(opts, nil)
		return tool.Definition(), nil
	default:
		supported := strings.Join(SupportedComputerUseProviders(), ", ")
		return nil, xerrors.Errorf("unsupported computer use provider %q, supported providers: %s", provider,
			supported)
	}
}
// ProviderOptions returns the provider options currently stored on the tool.
func (t *computerUseTool) ProviderOptions() fantasy.ProviderOptions {
return t.providerOptions
}
// SetProviderOptions replaces the provider options stored on the tool with
// opts. The write is a plain field assignment with no locking.
func (t *computerUseTool) SetProviderOptions(opts fantasy.ProviderOptions) {
t.providerOptions = opts
}
// Run executes one computer-use tool call by handing it to the runner for the
// tool's provider. An unsupported provider is reported to the model as a text
// error response; the returned error is nil in that case.
func (t *computerUseTool) Run(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolResponse, error) {
	resolved := DefaultComputerUseProvider(t.provider)
	if resolved == ComputerUseProviderAnthropic {
		return t.runAnthropicComputerUse(ctx, call)
	}
	if resolved == ComputerUseProviderOpenAI {
		return t.runOpenAIComputerUse(ctx, call)
	}

	supported := strings.Join(SupportedComputerUseProviders(), ", ")
	msg := fmt.Sprintf(
		"unsupported computer use provider %q, supported providers: %s",
		t.provider,
		supported,
	)
	return fantasy.NewTextErrorResponse(msg), nil
}
// runAnthropicComputerUse handles a tool call in Anthropic's computer-use
// format. Parse, connection, and action failures are returned as text error
// responses with a nil error. A wait action pauses and then returns a plain
// screenshot, a screenshot action returns a shared screenshot, and any other
// action is executed on the desktop and followed by a plain screenshot.
func (t *computerUseTool) runAnthropicComputerUse(
	ctx context.Context,
	call fantasy.ToolCall,
) (fantasy.ToolResponse, error) {
	input, parseErr := fantasyanthropic.ParseComputerUseInput(call.Input)
	if parseErr != nil {
		msg := fmt.Sprintf("invalid computer use input: %v", parseErr)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	conn, connErr := t.getWorkspaceConn(ctx)
	if connErr != nil {
		msg := fmt.Sprintf("failed to connect to workspace: %v", connErr)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	width, height := t.declaredActionDimensions()

	// Wait and screenshot never reach the generic action path below.
	switch input.Action {
	case fantasyanthropic.ActionWait:
		// Pause first, then report what the screen looks like afterwards.
		t.wait(ctx, input.Duration)
		return t.captureScreenshot(ctx, conn, width, height)
	case fantasyanthropic.ActionScreenshot:
		// Explicit screenshots go through the shared path so they can be
		// attached to the chat.
		return t.captureSharedScreenshot(ctx, conn, width, height)
	}

	// Copy over only the fields the model actually populated.
	req := t.desktopAction(string(input.Action), width, height)

	// An all-zero pair is treated as "not provided", so a coordinate of
	// exactly (0, 0) is not forwarded.
	var unset [2]int64
	if input.Coordinate != unset {
		point := coordinateFromInt64(input.Coordinate[0], input.Coordinate[1])
		req.Coordinate = &point
	}
	if input.StartCoordinate != unset {
		start := coordinateFromInt64(input.StartCoordinate[0], input.StartCoordinate[1])
		req.StartCoordinate = &start
	}
	if input.Text != "" {
		req.Text = &input.Text
	}
	if input.Duration > 0 {
		duration := int(input.Duration)
		req.Duration = &duration
	}
	if input.ScrollAmount > 0 {
		amount := int(input.ScrollAmount)
		req.ScrollAmount = &amount
	}
	if input.ScrollDirection != "" {
		req.ScrollDirection = &input.ScrollDirection
	}

	if failure, failed := t.executeDesktopAction(ctx, conn, req); failed {
		return failure, nil
	}

	// Anthropic pattern: every action is followed by a fresh screenshot.
	return t.captureScreenshot(ctx, conn, width, height)
}
// runOpenAIComputerUse handles a tool call in OpenAI's computer-use format.
// The parsed input is expanded into an ordered list of desktop steps that run
// one at a time. Steps with a positive WaitDurationMillis pause instead of
// touching the desktop. The first failing step stops the sequence: the mouse
// button and modifier keys flagged on that step are released on a best-effort
// basis, and the failure is returned as a text error response. When every
// step succeeds, a shared screenshot is returned.
func (t *computerUseTool) runOpenAIComputerUse(
	ctx context.Context,
	call fantasy.ToolCall,
) (fantasy.ToolResponse, error) {
	input, parseErr := openaicomputeruse.ParseInput(call.Input)
	if parseErr != nil {
		msg := fmt.Sprintf("invalid computer use input: %v", parseErr)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	conn, connErr := t.getWorkspaceConn(ctx)
	if connErr != nil {
		msg := fmt.Sprintf("failed to connect to workspace: %v", connErr)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	width, height := t.declaredActionDimensions()
	steps, buildErr := openaicomputeruse.DesktopActions(input, width, height)
	if buildErr != nil {
		return fantasy.NewTextErrorResponse(buildErr.Error()), nil
	}

	for _, step := range steps {
		if step.WaitDurationMillis > 0 {
			t.wait(ctx, step.WaitDurationMillis)
			continue
		}

		failure, failed := t.executeDesktopAction(ctx, conn, step.Action)
		if !failed {
			continue
		}

		// Best-effort cleanup so a half-finished drag or chord does not
		// leave input held down. Cleanup errors are only logged; the
		// original failure is what gets reported.
		if step.ReleaseMouseOnFailure {
			release := t.desktopAction("left_mouse_up", width, height)
			if _, releaseErr := conn.ExecuteDesktopAction(ctx, release); releaseErr != nil {
				t.logger.Warn(ctx, "failed to release mouse after OpenAI drag error",
					slog.Error(releaseErr),
				)
			}
		}
		t.releaseOpenAIModifierKeys(ctx, conn, step.ReleaseKeysOnFailure)
		return failure, nil
	}

	return t.captureSharedScreenshot(ctx, conn, width, height)
}
// releaseOpenAIModifierKeys sends a "key_up" desktop action for each entry in
// keys, starting with the last entry and working back to the first. A failed
// release is logged as a warning and does not stop the remaining releases.
func (t *computerUseTool) releaseOpenAIModifierKeys(
	ctx context.Context,
	conn workspacesdk.AgentConn,
	keys []string,
) {
	for remaining := len(keys); remaining > 0; remaining-- {
		// Copy the entry so the action's Text pointer refers to a local,
		// not to the caller's slice element.
		key := keys[remaining-1]
		release := t.desktopAction("key_up", 0, 0)
		release.Text = &key

		_, err := conn.ExecuteDesktopAction(ctx, release)
		if err == nil {
			continue
		}
		t.logger.Warn(ctx, "failed to release OpenAI modifier key",
			slog.F("key", key),
			slog.Error(err),
		)
	}
}
// executeDesktopAction runs action on conn and discards the action's response
// payload. On failure it returns a text error response naming the action,
// together with true. On success it returns an empty response and false, which
// tells the caller to carry on.
func (*computerUseTool) executeDesktopAction(
	ctx context.Context,
	conn workspacesdk.AgentConn,
	action workspacesdk.DesktopAction,
) (fantasy.ToolResponse, bool) {
	if _, err := conn.ExecuteDesktopAction(ctx, action); err != nil {
		msg := fmt.Sprintf("action %q failed: %v", action.Action, err)
		return fantasy.NewTextErrorResponse(msg), true
	}
	return fantasy.ToolResponse{}, false
}
// desktopAction builds a DesktopAction named action whose ScaledWidth and
// ScaledHeight point at the given declared dimensions. All other fields are
// left at their zero values for the caller to fill in.
func (*computerUseTool) desktopAction(
	action string,
	declaredWidth, declaredHeight int,
) workspacesdk.DesktopAction {
	req := workspacesdk.DesktopAction{Action: action}
	req.ScaledWidth = &declaredWidth
	req.ScaledHeight = &declaredHeight
	return req
}
// wait blocks for durationMillis milliseconds on the tool's clock, or until
// ctx is done, whichever happens first. A non-positive duration is replaced
// with 1000 milliseconds. Cancellation is not reported to the caller.
func (t *computerUseTool) wait(ctx context.Context, durationMillis int64) {
	const fallbackMillis = 1000

	millis := durationMillis
	if millis <= 0 {
		millis = fallbackMillis
	}

	pause := time.Duration(millis) * time.Millisecond
	timer := t.clock.NewTimer(pause, "computeruse", "wait")
	defer timer.Stop()

	select {
	case <-timer.C:
	case <-ctx.Done():
	}
}
// coordinateFromInt64 converts an (x, y) pair of int64 values into a
// two-element int array, with x at index 0 and y at index 1.
func coordinateFromInt64(x, y int64) [2]int {
	var point [2]int
	point[0], point[1] = int(x), int(y)
	return point
}
// captureScreenshot takes a screenshot at the declared dimensions and returns
// it as an "image/png" image response. Unlike captureSharedScreenshot, it does
// not store the image as an attachment. Screenshot and base64 decoding
// failures are returned as text error responses with a nil error.
func (t *computerUseTool) captureScreenshot(
	ctx context.Context,
	conn workspacesdk.AgentConn,
	declaredWidth, declaredHeight int,
) (fantasy.ToolResponse, error) {
	shot, err := executeScreenshotAction(ctx, conn, declaredWidth, declaredHeight)
	if err != nil {
		msg := fmt.Sprintf("screenshot failed: %v", err)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	image, err := base64.StdEncoding.DecodeString(shot.ScreenshotData)
	if err != nil {
		t.logger.Error(ctx, "failed to decode screenshot base64 in captureScreenshot",
			slog.Error(err),
		)
		msg := fmt.Sprintf("failed to decode screenshot data: %v", err)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	return fantasy.NewImageResponse(image, "image/png"), nil
}
// captureSharedScreenshot takes a screenshot at the declared dimensions,
// returns it as an "image/png" image response, and also tries to persist it as
// an attachment named "screenshot-<UTC timestamp>.png". Persistence is
// best-effort: if no storeFile is configured, or storing fails, a warning is
// logged and the image is returned without an attachment. Screenshot and
// base64 decoding failures are returned as text error responses with a nil
// error.
func (t *computerUseTool) captureSharedScreenshot(
	ctx context.Context,
	conn workspacesdk.AgentConn,
	declaredWidth, declaredHeight int,
) (fantasy.ToolResponse, error) {
	shot, err := executeScreenshotAction(ctx, conn, declaredWidth, declaredHeight)
	if err != nil {
		msg := fmt.Sprintf("screenshot failed: %v", err)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	image, err := base64.StdEncoding.DecodeString(shot.ScreenshotData)
	if err != nil {
		t.logger.Error(ctx, "failed to decode screenshot base64 in captureSharedScreenshot",
			slog.Error(err),
		)
		msg := fmt.Sprintf("failed to decode screenshot data: %v", err)
		return fantasy.NewTextErrorResponse(msg), nil
	}

	// The timestamp layout uses dashes in place of colons.
	stamp := t.clock.Now().UTC().Format("2006-01-02T15-04-05Z")
	attachmentName := fmt.Sprintf("screenshot-%s.png", stamp)

	if t.storeFile == nil {
		t.logger.Warn(ctx, "screenshot attachment storage is not configured")
		return fantasy.NewImageResponse(image, "image/png"), nil
	}

	response := fantasy.NewImageResponse(image, "image/png")
	// The store helper receives the still-encoded screenshot payload.
	attachment, storeErr := storeScreenshotAttachment(
		ctx,
		t.storeFile,
		attachmentName,
		shot.ScreenshotData,
	)
	if storeErr == nil {
		return WithAttachments(response, attachment), nil
	}

	t.logger.Warn(ctx, "failed to persist screenshot attachment",
		slog.F("attachment_name", attachmentName),
		slog.Error(storeErr),
	)
	return response, nil
}
// executeScreenshotAction sends a "screenshot" desktop action over conn, with
// ScaledWidth and ScaledHeight set to the declared dimensions, and returns the
// agent's response and error unchanged.
func executeScreenshotAction(
	ctx context.Context,
	conn workspacesdk.AgentConn,
	declaredWidth, declaredHeight int,
) (workspacesdk.DesktopActionResponse, error) {
	req := workspacesdk.DesktopAction{Action: "screenshot"}
	req.ScaledWidth, req.ScaledHeight = &declaredWidth, &declaredHeight
	return conn.ExecuteDesktopAction(ctx, req)
}
// declaredActionDimensions returns the model-facing desktop dimensions to use
// for actions and screenshots. The tool's configured width and height are used
// only when both are positive; otherwise the provider's default geometry from
// DefaultComputerUseDesktopGeometry supplies both values.
func (t *computerUseTool) declaredActionDimensions() (declaredWidth, declaredHeight int) {
	if t.declaredWidth > 0 && t.declaredHeight > 0 {
		return t.declaredWidth, t.declaredHeight
	}
	fallback := DefaultComputerUseDesktopGeometry(t.provider)
	return fallback.DeclaredWidth, fallback.DeclaredHeight
}