mirror of
https://github.com/coder/coder.git
synced 2026-06-04 05:28:20 +00:00
0bb09935bc
Adds a deployment-wide setting to select the computer-use provider (Anthropic or OpenAI) for AI agents, plus the OpenAI computer-use runner needed to honor that selection. The setting is stored in `site_configs` under `agents_computer_use_provider`, defaults to Anthropic when unset, and is exposed via experimental GET/PUT endpoints under `/api/experimental/chats/config/computer-use-provider`. The chatd computer-use tool now dispatches to either `runAnthropicComputerUse` or `runOpenAIComputerUse` based on the resolved provider, with provider-specific result metadata for OpenAI screenshots. Frontend adds a provider dropdown to the Agents Experiments settings page nested under the virtual desktop toggle, with disabled state handling while virtual desktop is off and skeleton loaders while config queries are in flight. Hugo and Codex review follow-up: - Uses shared provider validation and clearer computer-use constant names. - Removes stale OpenAI pending-safety-checks commentary. - Documents why provider result metadata is needed for OpenAI screenshots. - Keeps the computer-use subagent visible when provider credentials are missing, then returns a clear spawn-time configuration error. - Uses OpenAI's recommended 1600x900 screenshot geometry to preserve the native 16:9 aspect ratio. - Moves OpenAI-specific computer-use helpers into `coderd/x/chatd/chatopenai/computeruse` after rebasing onto the provider package refactor in `main`. - Converts OpenAI pixel scroll deltas to Coder desktop wheel-click amounts. - Preserves OpenAI pointer modifiers with key down/up desktop actions and rejects unsupported non-left double-click buttons explicitly. - Maps OpenAI back/forward side-button clicks to browser navigation key actions. - Defaults omitted OpenAI click buttons to left-click. - Retries mouse release cleanup if the final OpenAI drag release fails. 
- Keeps computer-use subagent availability messages stable when provider config cannot be loaded, while logging the backend error. - Releases remaining OpenAI modifier keys if a synthetic key-up cleanup action fails. - Updates Storybook interaction stories so provider snapshots show the selected final provider. > Mux updated this PR description on behalf of Mike.
495 lines
12 KiB
Go
495 lines
12 KiB
Go
package computeruse
|
|
|
|
import (
|
|
"slices"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"charm.land/fantasy"
|
|
fantasyopenai "charm.land/fantasy/providers/openai"
|
|
"golang.org/x/xerrors"
|
|
|
|
"github.com/coder/coder/v2/codersdk/workspacesdk"
|
|
)
|
|
|
|
// ComputerUseTool returns the OpenAI provider-defined computer-use tool.
|
|
func Tool() fantasy.Tool {
|
|
return fantasyopenai.NewComputerUseTool(nil).Definition()
|
|
}
|
|
|
|
// IsComputerUseTool reports whether tool is the OpenAI provider-defined
|
|
// computer-use tool.
|
|
func IsTool(tool fantasy.Tool) bool {
|
|
return fantasyopenai.IsComputerUseTool(tool)
|
|
}
|
|
|
|
// ParseInput parses an OpenAI computer-use tool call input.
|
|
func ParseInput(input string) (*fantasyopenai.ComputerUseInput, error) {
|
|
return fantasyopenai.ParseComputerUseInput(input)
|
|
}
|
|
|
|
// ComputerUseResultProviderMetadata returns metadata that should accompany an
|
|
// OpenAI computer-use screenshot result.
|
|
func ResultProviderMetadata(response fantasy.ToolResponse) fantasy.ProviderMetadata {
|
|
if response.IsError || response.Type != "image" || len(response.Data) == 0 ||
|
|
!strings.HasPrefix(response.MediaType, "image/") {
|
|
return nil
|
|
}
|
|
|
|
return fantasy.ProviderMetadata{
|
|
fantasyopenai.Name: &fantasyopenai.ComputerCallOutputOptions{
|
|
Detail: "original",
|
|
},
|
|
}
|
|
}
|
|
|
|
// computerUseScrollPixelsPerWheelClick is the number of scroll pixels that
// correspond to one wheel click. OpenAI expresses scroll deltas in pixels,
// while Coder desktop scroll amounts are counted in wheel clicks.
const computerUseScrollPixelsPerWheelClick = int64(100)
|
|
|
|
// ComputerUseDesktopAction is a Coder desktop operation requested by an
|
|
// OpenAI computer-use tool call.
|
|
type DesktopAction struct {
|
|
Action workspacesdk.DesktopAction
|
|
WaitDurationMillis int64
|
|
ReleaseMouseOnFailure bool
|
|
ReleaseKeysOnFailure []string
|
|
}
|
|
|
|
// ComputerUseDesktopActions converts an OpenAI computer-use tool call into
|
|
// Coder desktop actions. A caller should execute the returned actions in order,
|
|
// wait for WaitDurationMillis entries, and then return a final screenshot.
|
|
func DesktopActions(
|
|
parsed *fantasyopenai.ComputerUseInput,
|
|
declaredWidth, declaredHeight int,
|
|
) ([]DesktopAction, error) {
|
|
if parsed == nil {
|
|
return nil, xerrors.New("OpenAI computer use input is nil")
|
|
}
|
|
var err error
|
|
actions := make([]DesktopAction, 0, len(parsed.Actions))
|
|
for _, action := range parsed.Actions {
|
|
switch action.Type {
|
|
case "screenshot":
|
|
// OpenAI returns one screenshot per response; individual screenshot
|
|
// actions in the batch are fulfilled by the batch-final capture.
|
|
continue
|
|
case "move":
|
|
actions = append(actions, DesktopAction{
|
|
Action: desktopActionWithCoordinate(
|
|
"mouse_move",
|
|
declaredWidth,
|
|
declaredHeight,
|
|
action.X,
|
|
action.Y,
|
|
),
|
|
})
|
|
case "click":
|
|
actionSet, err := clickActions(
|
|
action.Button,
|
|
declaredWidth,
|
|
declaredHeight,
|
|
action.X,
|
|
action.Y,
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
actions, err = appendWithModifiers(actions, action.Keys, actionSet)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case "double_click":
|
|
actionName, ok := DoubleClickAction(action.Button)
|
|
if !ok {
|
|
return nil, xerrors.Errorf(
|
|
"unsupported OpenAI double-click button %q",
|
|
action.Button,
|
|
)
|
|
}
|
|
actionSet := []DesktopAction{{
|
|
Action: desktopActionWithCoordinate(
|
|
actionName,
|
|
declaredWidth,
|
|
declaredHeight,
|
|
action.X,
|
|
action.Y,
|
|
),
|
|
}}
|
|
actions, err = appendWithModifiers(actions, action.Keys, actionSet)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case "drag":
|
|
if len(action.Path) < 2 {
|
|
return nil, xerrors.New("OpenAI drag action requires at least two path points")
|
|
}
|
|
actionSet := []DesktopAction{
|
|
{
|
|
Action: desktopActionWithCoordinate(
|
|
"mouse_move",
|
|
declaredWidth,
|
|
declaredHeight,
|
|
action.Path[0].X,
|
|
action.Path[0].Y,
|
|
),
|
|
},
|
|
{
|
|
Action: desktopAction(
|
|
"left_mouse_down",
|
|
declaredWidth,
|
|
declaredHeight,
|
|
),
|
|
ReleaseMouseOnFailure: true,
|
|
},
|
|
}
|
|
for _, point := range action.Path[1:] {
|
|
actionSet = append(actionSet, DesktopAction{
|
|
Action: desktopActionWithCoordinate(
|
|
"mouse_move",
|
|
declaredWidth,
|
|
declaredHeight,
|
|
point.X,
|
|
point.Y,
|
|
),
|
|
ReleaseMouseOnFailure: true,
|
|
})
|
|
}
|
|
actionSet = append(actionSet, DesktopAction{
|
|
Action: desktopAction(
|
|
"left_mouse_up",
|
|
declaredWidth,
|
|
declaredHeight,
|
|
),
|
|
ReleaseMouseOnFailure: true,
|
|
})
|
|
actions, err = appendWithModifiers(actions, action.Keys, actionSet)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case "keypress":
|
|
text, err := NormalizeKeys(action.Keys)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
desktopAction := desktopAction("key", declaredWidth, declaredHeight)
|
|
desktopAction.Text = &text
|
|
actions = append(actions, DesktopAction{Action: desktopAction})
|
|
case "type":
|
|
desktopAction := desktopAction("type", declaredWidth, declaredHeight)
|
|
desktopAction.Text = &action.Text
|
|
actions = append(actions, DesktopAction{Action: desktopAction})
|
|
case "scroll":
|
|
actionSet := computerUseScrollActions(
|
|
declaredWidth,
|
|
declaredHeight,
|
|
action.X,
|
|
action.Y,
|
|
action.ScrollX,
|
|
action.ScrollY,
|
|
)
|
|
actions, err = appendWithModifiers(actions, action.Keys, actionSet)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case "wait":
|
|
actions = append(actions, DesktopAction{WaitDurationMillis: 1000})
|
|
default:
|
|
return nil, xerrors.Errorf(
|
|
"unsupported OpenAI computer action type %q",
|
|
action.Type,
|
|
)
|
|
}
|
|
}
|
|
return actions, nil
|
|
}
|
|
|
|
func appendWithModifiers(
|
|
actions []DesktopAction,
|
|
keys []string,
|
|
actionSet []DesktopAction,
|
|
) ([]DesktopAction, error) {
|
|
if len(keys) == 0 {
|
|
return append(actions, actionSet...), nil
|
|
}
|
|
|
|
modifiers := make([]string, 0, len(keys))
|
|
for _, key := range keys {
|
|
modifier, err := normalizeComputerUseKey(key)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
modifiers = append(modifiers, modifier)
|
|
}
|
|
|
|
heldKeys := make([]string, 0, len(modifiers))
|
|
for _, modifier := range modifiers {
|
|
nextHeldKeys := append(slices.Clone(heldKeys), modifier)
|
|
desktopAction := desktopAction("key_down", 0, 0)
|
|
desktopAction.Text = &modifier
|
|
actions = append(actions, DesktopAction{
|
|
Action: desktopAction,
|
|
ReleaseKeysOnFailure: nextHeldKeys,
|
|
})
|
|
heldKeys = nextHeldKeys
|
|
}
|
|
|
|
for _, action := range actionSet {
|
|
action.ReleaseKeysOnFailure = slices.Clone(heldKeys)
|
|
actions = append(actions, action)
|
|
}
|
|
|
|
for i := len(heldKeys) - 1; i >= 0; i-- {
|
|
key := heldKeys[i]
|
|
desktopAction := desktopAction("key_up", 0, 0)
|
|
desktopAction.Text = &key
|
|
actions = append(actions, DesktopAction{
|
|
Action: desktopAction,
|
|
ReleaseKeysOnFailure: slices.Clone(heldKeys[:i+1]),
|
|
})
|
|
}
|
|
return actions, nil
|
|
}
|
|
|
|
func computerUseScrollActions(
|
|
declaredWidth, declaredHeight int,
|
|
x, y, scrollX, scrollY int64,
|
|
) []DesktopAction {
|
|
coord := coordinateFromInt64(x, y)
|
|
moveAction := desktopAction("mouse_move", declaredWidth, declaredHeight)
|
|
moveAction.Coordinate = &coord
|
|
actions := []DesktopAction{{Action: moveAction}}
|
|
|
|
if scrollY != 0 {
|
|
direction := "down"
|
|
if scrollY < 0 {
|
|
direction = "up"
|
|
}
|
|
scrollAction := desktopAction("scroll", declaredWidth, declaredHeight)
|
|
scrollAction.Coordinate = &coord
|
|
scrollAction.ScrollDirection = &direction
|
|
amount := scrollPixelsToWheelClicks(scrollY)
|
|
scrollAction.ScrollAmount = &amount
|
|
actions = append(actions, DesktopAction{Action: scrollAction})
|
|
}
|
|
|
|
if scrollX != 0 {
|
|
direction := "right"
|
|
if scrollX < 0 {
|
|
direction = "left"
|
|
}
|
|
scrollAction := desktopAction("scroll", declaredWidth, declaredHeight)
|
|
scrollAction.Coordinate = &coord
|
|
scrollAction.ScrollDirection = &direction
|
|
amount := scrollPixelsToWheelClicks(scrollX)
|
|
scrollAction.ScrollAmount = &amount
|
|
actions = append(actions, DesktopAction{Action: scrollAction})
|
|
}
|
|
return actions
|
|
}
|
|
|
|
func desktopActionWithCoordinate(
|
|
action string,
|
|
declaredWidth, declaredHeight int,
|
|
x, y int64,
|
|
) workspacesdk.DesktopAction {
|
|
desktopAction := desktopAction(action, declaredWidth, declaredHeight)
|
|
coord := coordinateFromInt64(x, y)
|
|
desktopAction.Coordinate = &coord
|
|
return desktopAction
|
|
}
|
|
|
|
func desktopAction(
|
|
action string,
|
|
declaredWidth, declaredHeight int,
|
|
) workspacesdk.DesktopAction {
|
|
return workspacesdk.DesktopAction{
|
|
Action: action,
|
|
ScaledWidth: &declaredWidth,
|
|
ScaledHeight: &declaredHeight,
|
|
}
|
|
}
|
|
|
|
// coordinateFromInt64 packs x and y into an [x, y] pair of ints.
func coordinateFromInt64(x, y int64) [2]int {
	var point [2]int
	point[0] = int(x)
	point[1] = int(y)
	return point
}
|
|
|
|
func scrollPixelsToWheelClicks(pixels int64) int {
|
|
if pixels < 0 {
|
|
pixels = -pixels
|
|
}
|
|
if pixels == 0 {
|
|
return 0
|
|
}
|
|
return int((pixels + computerUseScrollPixelsPerWheelClick - 1) /
|
|
computerUseScrollPixelsPerWheelClick)
|
|
}
|
|
|
|
func clickActions(
|
|
button string,
|
|
declaredWidth, declaredHeight int,
|
|
x, y int64,
|
|
) ([]DesktopAction, error) {
|
|
actionName, ok := ClickAction(button)
|
|
if ok {
|
|
return []DesktopAction{{
|
|
Action: desktopActionWithCoordinate(
|
|
actionName,
|
|
declaredWidth,
|
|
declaredHeight,
|
|
x,
|
|
y,
|
|
),
|
|
}}, nil
|
|
}
|
|
|
|
navigationKey := ""
|
|
switch button {
|
|
case "back":
|
|
navigationKey = "alt+Left"
|
|
case "forward":
|
|
navigationKey = "alt+Right"
|
|
default:
|
|
return nil, xerrors.Errorf("unsupported OpenAI click button %q", button)
|
|
}
|
|
|
|
keyAction := desktopAction("key", 0, 0)
|
|
keyAction.Text = &navigationKey
|
|
return []DesktopAction{
|
|
{
|
|
Action: desktopActionWithCoordinate(
|
|
"mouse_move",
|
|
declaredWidth,
|
|
declaredHeight,
|
|
x,
|
|
y,
|
|
),
|
|
},
|
|
{Action: keyAction},
|
|
}, nil
|
|
}
|
|
|
|
// DoubleClickAction translates an OpenAI computer-use double-click button
// into a Coder desktop action name. Only the left button (or an omitted
// button, which defaults to left) is accepted; for anything else it returns
// an empty name and false.
func DoubleClickAction(button string) (string, bool) {
	if button == "" || button == "left" {
		return "double_click", true
	}
	return "", false
}
|
|
|
|
// ClickAction maps an OpenAI computer-use click button to a Coder desktop
// action name. An omitted button defaults to a left click, and "wheel" is
// treated as the middle button. The second result is false, with an empty
// name, when the button has no direct click action.
func ClickAction(button string) (string, bool) {
	switch button {
	case "", "left":
		return "left_click", true
	case "right":
		return "right_click", true
	case "middle", "wheel":
		return "middle_click", true
	default:
		return "", false
	}
}
|
|
|
|
// NormalizeComputerUseKeys maps OpenAI keypress tokens to Coder desktop key
|
|
// action tokens.
|
|
func NormalizeKeys(keys []string) (string, error) {
|
|
if len(keys) == 0 {
|
|
return "", xerrors.New("OpenAI keypress action requires at least one key")
|
|
}
|
|
normalized := make([]string, 0, len(keys))
|
|
for _, key := range keys {
|
|
normalizedKey, err := normalizeComputerUseKey(key)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
normalized = append(normalized, normalizedKey)
|
|
}
|
|
return strings.Join(normalized, "+"), nil
|
|
}
|
|
|
|
func normalizeComputerUseKey(key string) (string, error) {
|
|
trimmed := strings.TrimSpace(key)
|
|
if trimmed == "" {
|
|
return "", xerrors.New("OpenAI keypress action contains an empty key")
|
|
}
|
|
|
|
lower := strings.ToLower(trimmed)
|
|
switch lower {
|
|
case "ctrl", "control":
|
|
return "ctrl", nil
|
|
case "cmd", "command", "meta", "super":
|
|
return "meta", nil
|
|
case "shift":
|
|
return "shift", nil
|
|
case "alt", "option":
|
|
return "alt", nil
|
|
case "enter", "return":
|
|
return "Return", nil
|
|
case "escape", "esc":
|
|
return "Escape", nil
|
|
case "tab":
|
|
return "Tab", nil
|
|
case "space":
|
|
return "space", nil
|
|
case "backspace":
|
|
return "BackSpace", nil
|
|
case "delete", "del":
|
|
return "Delete", nil
|
|
case "arrowup", "up":
|
|
return "Up", nil
|
|
case "arrowdown", "down":
|
|
return "Down", nil
|
|
case "arrowleft", "left":
|
|
return "Left", nil
|
|
case "arrowright", "right":
|
|
return "Right", nil
|
|
}
|
|
|
|
if isFunctionKey(lower) {
|
|
return "F" + lower[1:], nil
|
|
}
|
|
|
|
runes := []rune(trimmed)
|
|
if len(runes) == 1 {
|
|
r := runes[0]
|
|
if unicode.IsLetter(r) {
|
|
return strings.ToLower(trimmed), nil
|
|
}
|
|
if unicode.IsDigit(r) {
|
|
return trimmed, nil
|
|
}
|
|
if unicode.IsPunct(r) || unicode.IsSymbol(r) {
|
|
return trimmed, nil
|
|
}
|
|
return "", xerrors.Errorf("unsupported OpenAI keypress %q", trimmed)
|
|
}
|
|
|
|
return "", xerrors.Errorf("unsupported OpenAI keypress %q", trimmed)
|
|
}
|
|
|
|
// isFunctionKey reports whether key is a lowercase function-key token: the
// letter "f" followed by ASCII digits whose value is between 1 and 35
// inclusive. Callers are expected to lowercase the key first.
func isFunctionKey(key string) bool {
	digits, ok := strings.CutPrefix(key, "f")
	if !ok || digits == "" {
		return false
	}

	value := 0
	for _, r := range digits {
		if r < '0' || r > '9' {
			return false
		}
		value = value*10 + int(r-'0')
		// Stop as soon as the value is out of range so that a very long
		// digit string cannot overflow and wrap back into the valid range.
		if value > 35 {
			return false
		}
	}
	return value >= 1
}
|