From 0bb09935bcffc0032fa5aab8b50f5a8b7675c55d Mon Sep 17 00:00:00 2001
From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com>
Date: Mon, 4 May 2026 20:30:50 +0200
Subject: [PATCH] feat: add computer-use provider selection for AI agents
(#24772)
Adds a deployment-wide setting to select the computer-use provider
(Anthropic or OpenAI) for AI agents, plus the OpenAI computer-use runner
needed to honor that selection.
The setting is stored in `site_configs` under
`agents_computer_use_provider`, defaults to Anthropic when unset, and is
exposed via experimental GET/PUT endpoints under
`/api/experimental/chats/config/computer-use-provider`. The chatd
computer-use tool now dispatches to either `runAnthropicComputerUse` or
`runOpenAIComputerUse` based on the resolved provider, with
provider-specific result metadata for OpenAI screenshots.
Frontend adds a provider dropdown to the Agents Experiments settings
page nested under the virtual desktop toggle, with disabled state
handling while virtual desktop is off and skeleton loaders while config
queries are in flight.
Hugo and Codex review follow-up:
- Uses shared provider validation and clearer computer-use constant
names.
- Removes stale OpenAI pending-safety-checks commentary.
- Documents why provider result metadata is needed for OpenAI
screenshots.
- Keeps the computer-use subagent visible when provider credentials are
missing, then returns a clear spawn-time configuration error.
- Uses OpenAI's recommended 1600x900 screenshot geometry to preserve the
native 16:9 aspect ratio.
- Moves OpenAI-specific computer-use helpers into
`coderd/x/chatd/chatopenai/computeruse` after rebasing onto the provider
package refactor in `main`.
- Converts OpenAI pixel scroll deltas to Coder desktop wheel-click
amounts.
- Preserves OpenAI pointer modifiers with key down/up desktop actions
and rejects unsupported non-left double-click buttons explicitly.
- Maps OpenAI back/forward side-button clicks to browser navigation key
actions.
- Defaults omitted OpenAI click buttons to left-click.
- Retries mouse release cleanup if the final OpenAI drag release fails.
- Keeps computer-use subagent availability messages stable when provider
config cannot be loaded, while logging the backend error.
- Releases remaining OpenAI modifier keys if a synthetic key-up cleanup
action fails.
- Updates Storybook interaction stories so provider snapshots show the
selected final provider.
> Mux updated this PR description on behalf of Mike.
---
agent/x/agentdesktop/api.go | 32 +
agent/x/agentdesktop/api_test.go | 53 ++
coderd/coderd.go | 2 +
coderd/database/dbauthz/dbauthz.go | 19 +
coderd/database/dbauthz/dbauthz_test.go | 8 +
coderd/database/dbmetrics/querymetrics.go | 16 +
coderd/database/dbmock/dbmock.go | 29 +
coderd/database/querier.go | 2 +
coderd/database/queries.sql.go | 22 +
coderd/database/queries/siteconfig.sql | 8 +
coderd/exp_chats.go | 53 ++
coderd/exp_chats_test.go | 133 ++++
coderd/x/chatd/chatd.go | 68 +-
coderd/x/chatd/chatd_internal_test.go | 168 +++++
coderd/x/chatd/chatd_test.go | 12 +-
coderd/x/chatd/chatloop/chatloop.go | 49 +-
coderd/x/chatd/chatloop/chatloop_test.go | 134 ++++
.../chatopenai/computeruse/computeruse.go | 494 +++++++++++++
.../computeruse/computeruse_test.go | 199 +++++
coderd/x/chatd/chattool/computeruse.go | 280 +++++--
coderd/x/chatd/chattool/computeruse_test.go | 693 +++++++++++++++++-
coderd/x/chatd/computer_use.go | 167 +++++
coderd/x/chatd/subagent.go | 24 +-
coderd/x/chatd/subagent_catalog.go | 29 +-
coderd/x/chatd/subagent_internal_test.go | 156 +++-
codersdk/chats.go | 40 +
codersdk/workspacesdk/display.go | 16 +
codersdk/workspacesdk/display_test.go | 13 +
go.mod | 2 +-
go.sum | 4 +-
site/src/api/api.ts | 18 +
site/src/api/queries/chats.ts | 16 +
site/src/api/typesGenerated.ts | 18 +
.../AgentSettingsExperimentsPage.tsx | 16 +
...entSettingsExperimentsPageView.stories.tsx | 153 ++++
.../AgentSettingsExperimentsPageView.tsx | 26 +
.../AdminChatDebugLoggingSettings.tsx | 29 +-
.../components/VirtualDesktopSettings.tsx | 121 ++-
38 files changed, 3181 insertions(+), 141 deletions(-)
create mode 100644 coderd/x/chatd/chatopenai/computeruse/computeruse.go
create mode 100644 coderd/x/chatd/chatopenai/computeruse/computeruse_test.go
create mode 100644 coderd/x/chatd/computer_use.go
diff --git a/agent/x/agentdesktop/api.go b/agent/x/agentdesktop/api.go
index 33ff0fb7ca..fc7686b072 100644
--- a/agent/x/agentdesktop/api.go
+++ b/agent/x/agentdesktop/api.go
@@ -183,6 +183,38 @@ func (a *API) handleAction(rw http.ResponseWriter, r *http.Request) {
}
resp.Output = "key action performed"
+ case "key_down":
+ if action.Text == nil {
+ httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
+ Message: "Missing \"text\" for key_down action.",
+ })
+ return
+ }
+ if err := a.desktop.KeyDown(ctx, *action.Text); err != nil {
+ httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
+ Message: "Key down failed.",
+ Detail: err.Error(),
+ })
+ return
+ }
+ resp.Output = "key_down action performed"
+
+ case "key_up":
+ if action.Text == nil {
+ httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
+ Message: "Missing \"text\" for key_up action.",
+ })
+ return
+ }
+ if err := a.desktop.KeyUp(ctx, *action.Text); err != nil {
+ httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
+ Message: "Key up failed.",
+ Detail: err.Error(),
+ })
+ return
+ }
+ resp.Output = "key_up action performed"
+
case "type":
if action.Text == nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
diff --git a/agent/x/agentdesktop/api_test.go b/agent/x/agentdesktop/api_test.go
index 7663d677bc..a8c232d978 100644
--- a/agent/x/agentdesktop/api_test.go
+++ b/agent/x/agentdesktop/api_test.go
@@ -523,6 +523,59 @@ func TestHandleAction_TypeAction(t *testing.T) {
assert.Equal(t, "hello world", fake.lastTyped)
}
+func TestHandleAction_KeyDownAndUp(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ action string
+ wantOutput string
+ }{
+ {name: "KeyDown", action: "key_down", wantOutput: "key_down action performed"},
+ {name: "KeyUp", action: "key_up", wantOutput: "key_up action performed"},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ logger := slogtest.Make(t, nil)
+ fake := &fakeDesktop{
+ startCfg: agentdesktop.DisplayConfig{Width: 1920, Height: 1080},
+ }
+ api := agentdesktop.NewAPI(logger, fake, nil)
+ defer api.Close()
+
+ text := "ctrl"
+ body := agentdesktop.DesktopAction{
+ Action: tt.action,
+ Text: &text,
+ }
+ b, err := json.Marshal(body)
+ require.NoError(t, err)
+
+ rr := httptest.NewRecorder()
+ req := httptest.NewRequest(http.MethodPost, "/action", bytes.NewReader(b))
+ req.Header.Set("Content-Type", "application/json")
+
+ handler := api.Routes()
+ handler.ServeHTTP(rr, req)
+
+ assert.Equal(t, http.StatusOK, rr.Code)
+
+ var resp agentdesktop.DesktopActionResponse
+ err = json.NewDecoder(rr.Body).Decode(&resp)
+ require.NoError(t, err)
+ assert.Equal(t, tt.wantOutput, resp.Output)
+ if tt.action == "key_down" {
+ assert.Equal(t, "ctrl", fake.lastKeyDown)
+ } else {
+ assert.Equal(t, "ctrl", fake.lastKeyUp)
+ }
+ })
+ }
+}
+
func TestHandleAction_HoldKey(t *testing.T) {
t.Parallel()
diff --git a/coderd/coderd.go b/coderd/coderd.go
index 1ff2a5ed48..3901727a06 100644
--- a/coderd/coderd.go
+++ b/coderd/coderd.go
@@ -1196,6 +1196,8 @@ func New(options *Options) *API {
r.Put("/model-override/{context}", api.putChatModelOverride)
r.Get("/desktop-enabled", api.getChatDesktopEnabled)
r.Put("/desktop-enabled", api.putChatDesktopEnabled)
+ r.Get("/computer-use-provider", api.getChatComputerUseProvider)
+ r.Put("/computer-use-provider", api.putChatComputerUseProvider)
r.Get("/debug-logging", api.getChatDebugLogging)
r.Put("/debug-logging", api.putChatDebugLogging)
r.Get("/user-debug-logging", api.getUserChatDebugLogging)
diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go
index 0ab0b52123..7901aec54a 100644
--- a/coderd/database/dbauthz/dbauthz.go
+++ b/coderd/database/dbauthz/dbauthz.go
@@ -2603,6 +2603,18 @@ func (q *querier) GetChatByIDForUpdate(ctx context.Context, id uuid.UUID) (datab
return fetch(q.log, q.auth, q.db.GetChatByIDForUpdate)(ctx, id)
}
+func (q *querier) GetChatComputerUseProvider(ctx context.Context) (string, error) {
+ // The computer-use provider is a deployment-wide runtime chat setting
+ // read by authenticated chat users and chatd. Feature and experiment
+ // access is enforced at caller and API boundaries where applicable, so
+ // this matches peer runtime config getters and only requires an explicit
+ // actor so unauthenticated calls fail closed.
+ if _, ok := ActorFromContext(ctx); !ok {
+ return "", ErrNoActor
+ }
+ return q.db.GetChatComputerUseProvider(ctx)
+}
+
func (q *querier) GetChatCostPerChat(ctx context.Context, arg database.GetChatCostPerChatParams) ([]database.GetChatCostPerChatRow, error) {
// The owner's chats, may cross orgs. AnyOrganization() authorizes
// the caller if they hold read permission on chats owned by
@@ -7437,6 +7449,13 @@ func (q *querier) UpsertChatAutoArchiveDays(ctx context.Context, autoArchiveDays
return q.db.UpsertChatAutoArchiveDays(ctx, autoArchiveDays)
}
+func (q *querier) UpsertChatComputerUseProvider(ctx context.Context, provider string) error {
+ if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceDeploymentConfig); err != nil {
+ return err
+ }
+ return q.db.UpsertChatComputerUseProvider(ctx, provider)
+}
+
func (q *querier) UpsertChatDebugLoggingAllowUsers(ctx context.Context, allowUsers bool) error {
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceDeploymentConfig); err != nil {
return err
diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go
index 2d6b189c8c..41490dcd5a 100644
--- a/coderd/database/dbauthz/dbauthz_test.go
+++ b/coderd/database/dbauthz/dbauthz_test.go
@@ -910,6 +910,10 @@ func (s *MethodTestSuite) TestChats() {
dbm.EXPECT().GetChatDesktopEnabled(gomock.Any()).Return(false, nil).AnyTimes()
check.Args().Asserts()
}))
+ s.Run("GetChatComputerUseProvider", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) {
+ dbm.EXPECT().GetChatComputerUseProvider(gomock.Any()).Return("anthropic", nil).AnyTimes()
+ check.Args().Asserts()
+ }))
s.Run("GetChatGeneralModelOverride", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) {
dbm.EXPECT().GetChatGeneralModelOverride(gomock.Any()).Return("", nil).AnyTimes()
check.Args().Asserts(rbac.ResourceDeploymentConfig, policy.ActionRead)
@@ -1233,6 +1237,10 @@ func (s *MethodTestSuite) TestChats() {
dbm.EXPECT().UpsertChatDesktopEnabled(gomock.Any(), false).Return(nil).AnyTimes()
check.Args(false).Asserts(rbac.ResourceDeploymentConfig, policy.ActionUpdate)
}))
+ s.Run("UpsertChatComputerUseProvider", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) {
+ dbm.EXPECT().UpsertChatComputerUseProvider(gomock.Any(), "anthropic").Return(nil).AnyTimes()
+ check.Args("anthropic").Asserts(rbac.ResourceDeploymentConfig, policy.ActionUpdate)
+ }))
s.Run("UpsertChatGeneralModelOverride", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) {
dbm.EXPECT().UpsertChatGeneralModelOverride(gomock.Any(), "").Return(nil).AnyTimes()
check.Args("").Asserts(rbac.ResourceDeploymentConfig, policy.ActionUpdate)
diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go
index f9b7c9651b..a4e24772d1 100644
--- a/coderd/database/dbmetrics/querymetrics.go
+++ b/coderd/database/dbmetrics/querymetrics.go
@@ -1152,6 +1152,14 @@ func (m queryMetricsStore) GetChatByIDForUpdate(ctx context.Context, id uuid.UUI
return r0, r1
}
+func (m queryMetricsStore) GetChatComputerUseProvider(ctx context.Context) (string, error) {
+ start := time.Now()
+ r0, r1 := m.s.GetChatComputerUseProvider(ctx)
+ m.queryLatencies.WithLabelValues("GetChatComputerUseProvider").Observe(time.Since(start).Seconds())
+ m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "GetChatComputerUseProvider").Inc()
+ return r0, r1
+}
+
func (m queryMetricsStore) GetChatCostPerChat(ctx context.Context, arg database.GetChatCostPerChatParams) ([]database.GetChatCostPerChatRow, error) {
start := time.Now()
r0, r1 := m.s.GetChatCostPerChat(ctx, arg)
@@ -5328,6 +5336,14 @@ func (m queryMetricsStore) UpsertChatAutoArchiveDays(ctx context.Context, autoAr
return r0
}
+func (m queryMetricsStore) UpsertChatComputerUseProvider(ctx context.Context, provider string) error {
+ start := time.Now()
+ r0 := m.s.UpsertChatComputerUseProvider(ctx, provider)
+ m.queryLatencies.WithLabelValues("UpsertChatComputerUseProvider").Observe(time.Since(start).Seconds())
+ m.queryCounts.WithLabelValues(httpmw.ExtractHTTPRoute(ctx), httpmw.ExtractHTTPMethod(ctx), "UpsertChatComputerUseProvider").Inc()
+ return r0
+}
+
func (m queryMetricsStore) UpsertChatDebugLoggingAllowUsers(ctx context.Context, allowUsers bool) error {
start := time.Now()
r0 := m.s.UpsertChatDebugLoggingAllowUsers(ctx, allowUsers)
diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go
index 625c5a53cf..1252d277e0 100644
--- a/coderd/database/dbmock/dbmock.go
+++ b/coderd/database/dbmock/dbmock.go
@@ -2117,6 +2117,21 @@ func (mr *MockStoreMockRecorder) GetChatByIDForUpdate(ctx, id any) *gomock.Call
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetChatByIDForUpdate", reflect.TypeOf((*MockStore)(nil).GetChatByIDForUpdate), ctx, id)
}
+// GetChatComputerUseProvider mocks base method.
+func (m *MockStore) GetChatComputerUseProvider(ctx context.Context) (string, error) {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "GetChatComputerUseProvider", ctx)
+ ret0, _ := ret[0].(string)
+ ret1, _ := ret[1].(error)
+ return ret0, ret1
+}
+
+// GetChatComputerUseProvider indicates an expected call of GetChatComputerUseProvider.
+func (mr *MockStoreMockRecorder) GetChatComputerUseProvider(ctx any) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetChatComputerUseProvider", reflect.TypeOf((*MockStore)(nil).GetChatComputerUseProvider), ctx)
+}
+
// GetChatCostPerChat mocks base method.
func (m *MockStore) GetChatCostPerChat(ctx context.Context, arg database.GetChatCostPerChatParams) ([]database.GetChatCostPerChatRow, error) {
m.ctrl.T.Helper()
@@ -10011,6 +10026,20 @@ func (mr *MockStoreMockRecorder) UpsertChatAutoArchiveDays(ctx, autoArchiveDays
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpsertChatAutoArchiveDays", reflect.TypeOf((*MockStore)(nil).UpsertChatAutoArchiveDays), ctx, autoArchiveDays)
}
+// UpsertChatComputerUseProvider mocks base method.
+func (m *MockStore) UpsertChatComputerUseProvider(ctx context.Context, provider string) error {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "UpsertChatComputerUseProvider", ctx, provider)
+ ret0, _ := ret[0].(error)
+ return ret0
+}
+
+// UpsertChatComputerUseProvider indicates an expected call of UpsertChatComputerUseProvider.
+func (mr *MockStoreMockRecorder) UpsertChatComputerUseProvider(ctx, provider any) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpsertChatComputerUseProvider", reflect.TypeOf((*MockStore)(nil).UpsertChatComputerUseProvider), ctx, provider)
+}
+
// UpsertChatDebugLoggingAllowUsers mocks base method.
func (m *MockStore) UpsertChatDebugLoggingAllowUsers(ctx context.Context, allowUsers bool) error {
m.ctrl.T.Helper()
diff --git a/coderd/database/querier.go b/coderd/database/querier.go
index 6a8cedd3ab..c67ce67231 100644
--- a/coderd/database/querier.go
+++ b/coderd/database/querier.go
@@ -282,6 +282,7 @@ type sqlcQuerier interface {
GetChatAutoArchiveDays(ctx context.Context, defaultAutoArchiveDays int32) (int32, error)
GetChatByID(ctx context.Context, id uuid.UUID) (Chat, error)
GetChatByIDForUpdate(ctx context.Context, id uuid.UUID) (Chat, error)
+ GetChatComputerUseProvider(ctx context.Context) (string, error)
// Per-root-chat cost breakdown for a single user within a date range.
// Groups by root_chat_id so forked chats roll up under their root.
// Only counts assistant-role messages.
@@ -1194,6 +1195,7 @@ type sqlcQuerier interface {
// to JSON before invoking this query.
UpsertChatAdvisorConfig(ctx context.Context, value string) error
UpsertChatAutoArchiveDays(ctx context.Context, autoArchiveDays int32) error
+ UpsertChatComputerUseProvider(ctx context.Context, provider string) error
// UpsertChatDebugLoggingAllowUsers updates the runtime admin setting that
// allows users to opt into chat debug logging.
UpsertChatDebugLoggingAllowUsers(ctx context.Context, allowUsers bool) error
diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go
index ee331ee2f6..c4a80c2b46 100644
--- a/coderd/database/queries.sql.go
+++ b/coderd/database/queries.sql.go
@@ -20568,6 +20568,18 @@ func (q *sqlQuerier) GetChatAutoArchiveDays(ctx context.Context, defaultAutoArch
return auto_archive_days, err
}
+const getChatComputerUseProvider = `-- name: GetChatComputerUseProvider :one
+SELECT
+ COALESCE((SELECT value FROM site_configs WHERE key = 'agents_computer_use_provider'), '') :: text AS provider
+`
+
+func (q *sqlQuerier) GetChatComputerUseProvider(ctx context.Context) (string, error) {
+ row := q.db.QueryRowContext(ctx, getChatComputerUseProvider)
+ var provider string
+ err := row.Scan(&provider)
+ return provider, err
+}
+
const getChatDebugLoggingAllowUsers = `-- name: GetChatDebugLoggingAllowUsers :one
SELECT
COALESCE((SELECT value = 'true' FROM site_configs WHERE key = 'agents_chat_debug_logging_allow_users'), false) :: boolean AS allow_users
@@ -20967,6 +20979,16 @@ func (q *sqlQuerier) UpsertChatAutoArchiveDays(ctx context.Context, autoArchiveD
return err
}
+const upsertChatComputerUseProvider = `-- name: UpsertChatComputerUseProvider :exec
+INSERT INTO site_configs (key, value) VALUES ('agents_computer_use_provider', $1)
+ON CONFLICT (key) DO UPDATE SET value = $1 WHERE site_configs.key = 'agents_computer_use_provider'
+`
+
+func (q *sqlQuerier) UpsertChatComputerUseProvider(ctx context.Context, provider string) error {
+ _, err := q.db.ExecContext(ctx, upsertChatComputerUseProvider, provider)
+ return err
+}
+
const upsertChatDebugLoggingAllowUsers = `-- name: UpsertChatDebugLoggingAllowUsers :exec
INSERT INTO site_configs (key, value)
VALUES (
diff --git a/coderd/database/queries/siteconfig.sql b/coderd/database/queries/siteconfig.sql
index 5c6e591023..629d89fc05 100644
--- a/coderd/database/queries/siteconfig.sql
+++ b/coderd/database/queries/siteconfig.sql
@@ -226,6 +226,14 @@ SELECT
INSERT INTO site_configs (key, value) VALUES ('agents_advisor_config', $1)
ON CONFLICT (key) DO UPDATE SET value = $1 WHERE site_configs.key = 'agents_advisor_config';
+-- name: GetChatComputerUseProvider :one
+SELECT
+ COALESCE((SELECT value FROM site_configs WHERE key = 'agents_computer_use_provider'), '') :: text AS provider;
+
+-- name: UpsertChatComputerUseProvider :exec
+INSERT INTO site_configs (key, value) VALUES ('agents_computer_use_provider', sqlc.arg(provider))
+ON CONFLICT (key) DO UPDATE SET value = sqlc.arg(provider) WHERE site_configs.key = 'agents_computer_use_provider';
+
-- GetChatDebugLoggingAllowUsers returns the runtime admin setting that
-- allows users to opt into chat debug logging when the deployment does
-- not already force debug logging on globally.
diff --git a/coderd/exp_chats.go b/coderd/exp_chats.go
index 9892f36ad4..202bdc1d2f 100644
--- a/coderd/exp_chats.go
+++ b/coderd/exp_chats.go
@@ -49,6 +49,7 @@ import (
"github.com/coder/coder/v2/coderd/wsbuilder"
"github.com/coder/coder/v2/coderd/x/chatd"
"github.com/coder/coder/v2/coderd/x/chatd/chatprovider"
+ "github.com/coder/coder/v2/coderd/x/chatd/chattool"
"github.com/coder/coder/v2/coderd/x/chatfiles"
"github.com/coder/coder/v2/coderd/x/gitsync"
"github.com/coder/coder/v2/codersdk"
@@ -4106,6 +4107,58 @@ func (api *API) putChatDesktopEnabled(rw http.ResponseWriter, r *http.Request) {
rw.WriteHeader(http.StatusNoContent)
}
+// EXPERIMENTAL: this endpoint is experimental and is subject to change.
+//
+//nolint:revive // get-return: revive assumes get* must be a getter, but this is an HTTP handler.
+func (api *API) getChatComputerUseProvider(rw http.ResponseWriter, r *http.Request) {
+ ctx := r.Context()
+ provider, err := api.Database.GetChatComputerUseProvider(ctx)
+ if err != nil {
+ httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
+ Message: "Internal error fetching computer use provider.",
+ Detail: err.Error(),
+ })
+ return
+ }
+ httpapi.Write(ctx, rw, http.StatusOK, codersdk.ChatComputerUseProviderResponse{
+ Provider: chattool.DefaultComputerUseProvider(provider),
+ })
+}
+
+// EXPERIMENTAL: this endpoint is experimental and is subject to change.
+func (api *API) putChatComputerUseProvider(rw http.ResponseWriter, r *http.Request) {
+ ctx := r.Context()
+ if !api.Authorize(r, policy.ActionUpdate, rbac.ResourceDeploymentConfig) {
+ httpapi.Forbidden(rw)
+ return
+ }
+
+ var req codersdk.UpdateChatComputerUseProviderRequest
+ if !httpapi.Read(ctx, rw, r, &req) {
+ return
+ }
+ if !chattool.IsSupportedComputerUseProvider(req.Provider) {
+ httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
+ Message: "Invalid computer use provider.",
+ Detail: fmt.Sprintf(
+ "Expected one of: %s. Got %q.",
+ strings.Join(chattool.SupportedComputerUseProviders(), ", "),
+ req.Provider,
+ ),
+ })
+ return
+ }
+
+ if err := api.Database.UpsertChatComputerUseProvider(ctx, req.Provider); err != nil {
+ httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
+ Message: "Internal error updating computer use provider.",
+ Detail: err.Error(),
+ })
+ return
+ }
+ rw.WriteHeader(http.StatusNoContent)
+}
+
func (api *API) deploymentChatDebugLoggingEnabled() bool {
return api.DeploymentValues != nil && api.DeploymentValues.AI.Chat.DebugLoggingEnabled.Value()
}
diff --git a/coderd/exp_chats_test.go b/coderd/exp_chats_test.go
index c93bf71cef..35fffae06c 100644
--- a/coderd/exp_chats_test.go
+++ b/coderd/exp_chats_test.go
@@ -10417,6 +10417,139 @@ func TestChatDesktopEnabled(t *testing.T) {
})
}
+func TestChatComputerUseProvider(t *testing.T) {
+ t.Parallel()
+
+ t.Run("ReturnsAnthropicWhenUnset", func(t *testing.T) {
+ t.Parallel()
+ ctx := testutil.Context(t, testutil.WaitLong)
+
+ adminClient := newChatClient(t)
+ coderdtest.CreateFirstUser(t, adminClient.Client)
+
+ resp, err := adminClient.GetChatComputerUseProvider(ctx)
+ require.NoError(t, err)
+ require.Equal(t, "anthropic", resp.Provider)
+ })
+
+ t.Run("AdminCanSetAnthropic", func(t *testing.T) {
+ t.Parallel()
+ ctx := testutil.Context(t, testutil.WaitLong)
+
+ adminClient := newChatClient(t)
+ coderdtest.CreateFirstUser(t, adminClient.Client)
+
+ err := adminClient.UpdateChatComputerUseProvider(ctx, codersdk.UpdateChatComputerUseProviderRequest{
+ Provider: "anthropic",
+ })
+ require.NoError(t, err)
+
+ resp, err := adminClient.GetChatComputerUseProvider(ctx)
+ require.NoError(t, err)
+ require.Equal(t, "anthropic", resp.Provider)
+ })
+
+ t.Run("AdminCanSetOpenAI", func(t *testing.T) {
+ t.Parallel()
+ ctx := testutil.Context(t, testutil.WaitLong)
+
+ adminClient := newChatClient(t)
+ coderdtest.CreateFirstUser(t, adminClient.Client)
+
+ err := adminClient.UpdateChatComputerUseProvider(ctx, codersdk.UpdateChatComputerUseProviderRequest{
+ Provider: "openai",
+ })
+ require.NoError(t, err)
+
+ resp, err := adminClient.GetChatComputerUseProvider(ctx)
+ require.NoError(t, err)
+ require.Equal(t, "openai", resp.Provider)
+ })
+
+ t.Run("AdminCanSwitchProviders", func(t *testing.T) {
+ t.Parallel()
+ ctx := testutil.Context(t, testutil.WaitLong)
+
+ adminClient := newChatClient(t)
+ coderdtest.CreateFirstUser(t, adminClient.Client)
+
+ err := adminClient.UpdateChatComputerUseProvider(ctx, codersdk.UpdateChatComputerUseProviderRequest{
+ Provider: "openai",
+ })
+ require.NoError(t, err)
+
+ err = adminClient.UpdateChatComputerUseProvider(ctx, codersdk.UpdateChatComputerUseProviderRequest{
+ Provider: "anthropic",
+ })
+ require.NoError(t, err)
+
+ resp, err := adminClient.GetChatComputerUseProvider(ctx)
+ require.NoError(t, err)
+ require.Equal(t, "anthropic", resp.Provider)
+ })
+
+ t.Run("InvalidProviderRejected", func(t *testing.T) {
+ t.Parallel()
+ ctx := testutil.Context(t, testutil.WaitLong)
+
+ adminClient := newChatClient(t)
+ coderdtest.CreateFirstUser(t, adminClient.Client)
+
+ for _, provider := range []string{"", "invalid"} {
+ err := adminClient.UpdateChatComputerUseProvider(ctx, codersdk.UpdateChatComputerUseProviderRequest{
+ Provider: provider,
+ })
+ requireSDKError(t, err, http.StatusBadRequest)
+ }
+ })
+
+ t.Run("NonAdminCanRead", func(t *testing.T) {
+ t.Parallel()
+ ctx := testutil.Context(t, testutil.WaitLong)
+
+ adminClient := newChatClient(t)
+ firstUser := coderdtest.CreateFirstUser(t, adminClient.Client)
+ memberClientRaw, _ := coderdtest.CreateAnotherUser(t, adminClient.Client, firstUser.OrganizationID)
+ memberClient := codersdk.NewExperimentalClient(memberClientRaw)
+
+ err := adminClient.UpdateChatComputerUseProvider(ctx, codersdk.UpdateChatComputerUseProviderRequest{
+ Provider: "openai",
+ })
+ require.NoError(t, err)
+
+ resp, err := memberClient.GetChatComputerUseProvider(ctx)
+ require.NoError(t, err)
+ require.Equal(t, "openai", resp.Provider)
+ })
+
+ t.Run("NonAdminWriteFails", func(t *testing.T) {
+ t.Parallel()
+ ctx := testutil.Context(t, testutil.WaitLong)
+
+ adminClient := newChatClient(t)
+ firstUser := coderdtest.CreateFirstUser(t, adminClient.Client)
+ memberClientRaw, _ := coderdtest.CreateAnotherUser(t, adminClient.Client, firstUser.OrganizationID)
+ memberClient := codersdk.NewExperimentalClient(memberClientRaw)
+
+ err := memberClient.UpdateChatComputerUseProvider(ctx, codersdk.UpdateChatComputerUseProviderRequest{
+ Provider: "openai",
+ })
+ requireSDKError(t, err, http.StatusForbidden)
+ })
+
+ t.Run("UnauthenticatedReadFails", func(t *testing.T) {
+ t.Parallel()
+ ctx := testutil.Context(t, testutil.WaitLong)
+
+ adminClient := newChatClient(t)
+ coderdtest.CreateFirstUser(t, adminClient.Client)
+
+ anonClient := codersdk.NewExperimentalClient(codersdk.New(adminClient.URL))
+ _, err := anonClient.GetChatComputerUseProvider(ctx)
+ requireSDKError(t, err, http.StatusUnauthorized)
+ })
+}
+
func TestChatDebugLoggingSettings(t *testing.T) {
t.Parallel()
diff --git a/coderd/x/chatd/chatd.go b/coderd/x/chatd/chatd.go
index 97127e13b1..e3788f265c 100644
--- a/coderd/x/chatd/chatd.go
+++ b/coderd/x/chatd/chatd.go
@@ -6146,6 +6146,22 @@ func (p *Server) runChat(
// Detect computer-use subagent via the mode column.
isComputerUse := chat.Mode.Valid && chat.Mode.ChatMode == database.ChatModeComputerUse
+ var (
+ computerUseProvider string
+ computerUseModelProvider string
+ computerUseModelName string
+ )
+ if isComputerUse {
+ var err error
+ computerUseProvider, computerUseModelProvider, computerUseModelName, err = p.computerUseProviderAndModelFromConfig(ctx)
+ if err != nil {
+ return result, xerrors.Errorf(
+ "resolve computer use provider and model: %w",
+ err,
+ )
+ }
+ }
+
// NOTE: Buffering was already started in processChat before
// the running status was published, so message_part events
// are captured from the moment subscribers can see
@@ -6717,24 +6733,16 @@ func (p *Server) runChat(
if isComputerUse {
// Override model for computer use subagent.
- resolvedProvider, resolvedModel, resolveErr := chatprovider.ResolveModelWithProviderHint(
- chattool.ComputerUseModelName,
- chattool.ComputerUseModelProvider,
- )
- if resolveErr != nil {
- return result, xerrors.Errorf("resolve computer use model metadata: %w", resolveErr)
- }
- cuModel, cuDebugEnabled, cuErr := p.newDebugAwareModelFromConfig(
+ cuModel, cuDebugEnabled, resolvedProvider, resolvedModel, cuErr := p.resolveComputerUseModel(
ctx,
chat,
- chattool.ComputerUseModelProvider,
- chattool.ComputerUseModelName,
providerKeys,
- chatprovider.UserAgent(),
- chatprovider.CoderHeaders(chat),
+ computerUseProvider,
+ computerUseModelProvider,
+ computerUseModelName,
)
if cuErr != nil {
- return result, xerrors.Errorf("resolve computer use model: %w", cuErr)
+ return result, cuErr
}
model = cuModel
debugEnabled = cuDebugEnabled
@@ -6900,22 +6908,24 @@ func (p *Server) runChat(
}
}
- if !isPlanModeTurn && !isExploreSubagent && isComputerUse {
- desktopGeometry := workspacesdk.DefaultDesktopGeometry()
- providerTools = append(providerTools, chatloop.ProviderTool{
- Definition: chattool.ComputerUseProviderTool(
- desktopGeometry.DeclaredWidth,
- desktopGeometry.DeclaredHeight,
- ),
- Runner: chattool.NewComputerUseTool(
- desktopGeometry.DeclaredWidth,
- desktopGeometry.DeclaredHeight,
- workspaceCtx.getWorkspaceConn,
- storeChatAttachment,
- quartz.NewReal(),
- p.logger.Named("computer_use"),
- ),
- })
+ providerTools, err = appendComputerUseProviderTool(
+ providerTools,
+ computerUseProviderToolOptions{
+ provider: computerUseProvider,
+ isPlanModeTurn: isPlanModeTurn,
+ isComputerUse: isComputerUse,
+ getWorkspaceConn: workspaceCtx.getWorkspaceConn,
+ storeFile: storeChatAttachment,
+ clock: p.clock,
+ logger: p.logger.Named("computer_use"),
+ },
+ )
+ if err != nil {
+ return result, xerrors.Errorf(
+ "register computer use provider tool for provider %q: %w",
+ computerUseProvider,
+ err,
+ )
}
providerOptions := chatprovider.ProviderOptionsFromChatModelConfig(
diff --git a/coderd/x/chatd/chatd_internal_test.go b/coderd/x/chatd/chatd_internal_test.go
index 896e3e8363..5a351f3371 100644
--- a/coderd/x/chatd/chatd_internal_test.go
+++ b/coderd/x/chatd/chatd_internal_test.go
@@ -18,11 +18,13 @@ import (
"cdr.dev/slog/v3"
"cdr.dev/slog/v3/sloggers/slogtest"
"github.com/coder/coder/v2/coderd/database"
+ "github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbmock"
dbpubsub "github.com/coder/coder/v2/coderd/database/pubsub"
coderdpubsub "github.com/coder/coder/v2/coderd/pubsub"
"github.com/coder/coder/v2/coderd/x/chatd/chaterror"
"github.com/coder/coder/v2/coderd/x/chatd/chatloop"
+ openaicomputeruse "github.com/coder/coder/v2/coderd/x/chatd/chatopenai/computeruse"
"github.com/coder/coder/v2/coderd/x/chatd/chatprovider"
"github.com/coder/coder/v2/coderd/x/chatd/chattest"
"github.com/coder/coder/v2/coderd/x/chatd/chattool"
@@ -75,6 +77,172 @@ func (t *testMCPAgentTool) MCPServerConfigID() uuid.UUID {
return t.configID
}
+func TestComputerUseProviderAndModelFromConfig(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ rawProvider string
+ wantProvider string
+ wantErr string
+ }{
+ {
+ name: "DefaultAnthropic",
+ rawProvider: "",
+ wantProvider: chattool.ComputerUseProviderAnthropic,
+ },
+ {
+ name: "OpenAI",
+ rawProvider: " openai ",
+ wantProvider: chattool.ComputerUseProviderOpenAI,
+ },
+ {
+ name: "Unknown",
+ rawProvider: "bogus",
+ wantErr: `unknown computer-use provider "bogus" configured in agents_computer_use_provider`,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ db := dbmock.NewMockStore(ctrl)
+ server := &Server{db: db}
+
+ db.EXPECT().GetChatComputerUseProvider(gomock.Any()).DoAndReturn(
+ func(ctx context.Context) (string, error) {
+ _, ok := dbauthz.ActorFromContext(ctx)
+ require.True(t, ok, "config reads must have an actor")
+ return tt.rawProvider, nil
+ },
+ )
+
+ provider, modelProvider, modelName, err := server.computerUseProviderAndModelFromConfig(context.Background())
+ if tt.wantErr != "" {
+ require.Error(t, err)
+ require.Contains(t, err.Error(), tt.wantErr)
+ return
+ }
+ require.NoError(t, err)
+ require.Equal(t, tt.wantProvider, provider)
+
+ wantModelProvider, wantModelName, ok := chattool.DefaultComputerUseModel(tt.wantProvider)
+ require.True(t, ok)
+ require.Equal(t, wantModelProvider, modelProvider)
+ require.Equal(t, wantModelName, modelName)
+ })
+ }
+}
+
+func TestResolveComputerUseModel_OpenAIMissingCredentials(t *testing.T) {
+ t.Parallel()
+
+ server := &Server{}
+ provider := chattool.ComputerUseProviderOpenAI
+ modelProvider, modelName, ok := chattool.DefaultComputerUseModel(provider)
+ require.True(t, ok)
+
+ model, debugEnabled, resolvedProvider, resolvedModel, err := server.resolveComputerUseModel(
+ context.Background(),
+ database.Chat{ID: uuid.New(), OwnerID: uuid.New()},
+ chatprovider.ProviderAPIKeys{},
+ provider,
+ modelProvider,
+ modelName,
+ )
+ require.Error(t, err)
+ require.Nil(t, model)
+ require.False(t, debugEnabled)
+ require.Empty(t, resolvedProvider)
+ require.Empty(t, resolvedModel)
+ require.Contains(t, err.Error(), `provider "openai" model "gpt-5.5"`)
+ require.Contains(t, err.Error(), "OPENAI_API_KEY is not set")
+ require.NotContains(t, err.Error(), "ANTHROPIC_API_KEY")
+}
+
+func TestAppendComputerUseProviderTool(t *testing.T) {
+ t.Parallel()
+
+ providerTools, err := appendComputerUseProviderTool(
+ nil,
+ computerUseProviderToolOptions{
+ provider: chattool.ComputerUseProviderOpenAI,
+ isComputerUse: true,
+ logger: slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}),
+ },
+ )
+ require.NoError(t, err)
+ require.Len(t, providerTools, 1)
+ require.True(t, openaicomputeruse.IsTool(providerTools[0].Definition))
+ require.Equal(t, "computer", providerTools[0].Definition.GetName())
+ require.Equal(t, "computer", providerTools[0].Runner.Info().Name)
+ require.NotNil(t, providerTools[0].ResultProviderMetadata)
+
+ metadata := providerTools[0].ResultProviderMetadata(
+ fantasy.NewImageResponse([]byte("png"), "image/png"),
+ )
+ require.NotNil(t, metadata)
+}
+
+func TestAppendComputerUseProviderTool_Gates(t *testing.T) {
+ t.Parallel()
+
+ baseTools := []chatloop.ProviderTool{{
+ Definition: fantasy.ProviderDefinedTool{
+ ID: "web_search",
+ Name: "web_search",
+ },
+ }}
+
+ tests := []struct {
+ name string
+ isPlanModeTurn bool
+ isComputerUse bool
+ }{
+ {name: "PlanMode", isPlanModeTurn: true, isComputerUse: true},
+ // Non-computer-use includes regular, master, general, and explore chats.
+ // Mode cannot be both ChatModeComputerUse and another chat mode.
+ {name: "NonComputerUseModes"},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ providerTools, err := appendComputerUseProviderTool(
+ baseTools,
+ computerUseProviderToolOptions{
+ provider: chattool.ComputerUseProviderOpenAI,
+ isPlanModeTurn: tt.isPlanModeTurn,
+ isComputerUse: tt.isComputerUse,
+ },
+ )
+ require.NoError(t, err)
+ require.Len(t, providerTools, 1)
+ require.Equal(t, "web_search", providerTools[0].Definition.GetName())
+ })
+ }
+}
+
+func TestAppendComputerUseProviderTool_AnthropicHasNoResultMetadata(t *testing.T) {
+ t.Parallel()
+
+ providerTools, err := appendComputerUseProviderTool(
+ nil,
+ computerUseProviderToolOptions{
+ provider: chattool.ComputerUseProviderAnthropic,
+ isComputerUse: true,
+ logger: slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}),
+ },
+ )
+ require.NoError(t, err)
+ require.Len(t, providerTools, 1)
+ require.Equal(t, "computer", providerTools[0].Definition.GetName())
+ require.Nil(t, providerTools[0].ResultProviderMetadata)
+}
+
func TestFilterExternalMCPConfigsForTurn(t *testing.T) {
t.Parallel()
diff --git a/coderd/x/chatd/chatd_test.go b/coderd/x/chatd/chatd_test.go
index 621b1af9bd..46e99fbf89 100644
--- a/coderd/x/chatd/chatd_test.go
+++ b/coderd/x/chatd/chatd_test.go
@@ -6485,6 +6485,10 @@ func TestComputerUseSubagentToolsAndModel(t *testing.T) {
db, ps := dbtestutil.NewDB(t)
ctx := testutil.Context(t, testutil.WaitLong)
+ computerUseModelProvider, computerUseModelName, ok := chattool.DefaultComputerUseModel(chattool.ComputerUseProviderAnthropic)
+ require.True(t, ok)
+ require.Equal(t, chattool.ComputerUseProviderAnthropic, computerUseModelProvider)
+
// Track tools and model from the Anthropic LLM calls (the
// computer use child chat). We use a raw HTTP handler because
// the chattest AnthropicRequest struct does not capture tools.
@@ -6532,7 +6536,7 @@ func TestComputerUseSubagentToolsAndModel(t *testing.T) {
"id": "msg-test",
"type": "message",
"role": "assistant",
- "model": chattool.ComputerUseModelName,
+ "model": computerUseModelName,
"content": []map[string]any{{"type": "text", "text": "Done."}},
"stop_reason": "end_turn",
"usage": map[string]any{"input_tokens": 10, "output_tokens": 5},
@@ -6552,7 +6556,7 @@ func TestComputerUseSubagentToolsAndModel(t *testing.T) {
"id": "msg-test",
"type": "message",
"role": "assistant",
- "model": chattool.ComputerUseModelName,
+ "model": computerUseModelName,
},
},
{
@@ -6713,9 +6717,9 @@ func TestComputerUseSubagentToolsAndModel(t *testing.T) {
childTools := calls[0].Tools
// 1. Verify the model is the computer use model.
- require.Equal(t, chattool.ComputerUseModelName, childModel,
+ require.Equal(t, computerUseModelName, childModel,
"computer use subagent should use %s",
- chattool.ComputerUseModelName)
+ computerUseModelName)
// 2. Verify the computer tool is present.
require.Contains(t, childTools, "computer",
diff --git a/coderd/x/chatd/chatloop/chatloop.go b/coderd/x/chatd/chatloop/chatloop.go
index bda2167dca..d82032015b 100644
--- a/coderd/x/chatd/chatloop/chatloop.go
+++ b/coderd/x/chatd/chatloop/chatloop.go
@@ -195,6 +195,11 @@ type RunOptions struct {
type ProviderTool struct {
Definition fantasy.Tool
Runner fantasy.AgentTool
+ // ResultProviderMetadata extracts provider-specific metadata from successful
+ // local runner responses. The chat loop attaches returned metadata to the tool
+ // result sent back to the model. OpenAI computer-use uses this to request
+ // original screenshot detail for image results.
+ ResultProviderMetadata func(response fantasy.ToolResponse) fantasy.ProviderMetadata
}
// stepResult holds the accumulated output of a single streaming
@@ -1020,13 +1025,22 @@ func executeTools(
toolMap[t.Info().Name] = t
}
providerRunnerNames := make(map[string]struct{}, len(providerTools))
+ resultProviderMetadata := make(
+ map[string]func(fantasy.ToolResponse) fantasy.ProviderMetadata,
+ len(providerTools),
+ )
// Include runners from provider tools so locally-executed
// provider tools (e.g. computer use) can be dispatched.
for _, pt := range providerTools {
- if pt.Runner != nil {
- name := pt.Runner.Info().Name
- toolMap[name] = pt.Runner
- providerRunnerNames[name] = struct{}{}
+ if pt.Runner == nil {
+ continue
+ }
+
+ name := pt.Runner.Info().Name
+ toolMap[name] = pt.Runner
+ providerRunnerNames[name] = struct{}{}
+ if pt.ResultProviderMetadata != nil {
+ resultProviderMetadata[name] = pt.ResultProviderMetadata
}
}
@@ -1052,7 +1066,19 @@ func executeTools(
// accurate individual completion times.
completedAt[i] = dbtime.Now()
}()
- results[i] = executeSingleTool(ctx, toolMap, tc, metrics, logger, provider, model, builtinToolNames, activeTools, providerRunnerNames)
+ results[i] = executeSingleTool(
+ ctx,
+ toolMap,
+ tc,
+ metrics,
+ logger,
+ provider,
+ model,
+ builtinToolNames,
+ activeTools,
+ providerRunnerNames,
+ resultProviderMetadata,
+ )
}()
}
wg.Wait()
@@ -1349,6 +1375,7 @@ func executeSingleTool(
builtinToolNames map[string]bool,
activeTools []string,
providerRunnerNames map[string]struct{},
+ resultProviderMetadata map[string]func(fantasy.ToolResponse) fantasy.ProviderMetadata,
) fantasy.ToolResultContent {
result := fantasy.ToolResultContent{
ToolCallID: tc.ToolCallID,
@@ -1430,6 +1457,18 @@ func executeSingleTool(
Text: strings.ToValidUTF8(resp.Content, "\uFFFD"),
}
}
+
+ if _, isError := result.Result.(fantasy.ToolResultOutputContentError); isError {
+ return result
+ }
+ if len(result.ProviderMetadata) == 0 {
+ if callback := resultProviderMetadata[tc.ToolName]; callback != nil {
+ metadata := callback(resp)
+ if len(metadata) > 0 {
+ result.ProviderMetadata = metadata
+ }
+ }
+ }
return result
}
diff --git a/coderd/x/chatd/chatloop/chatloop_test.go b/coderd/x/chatd/chatloop/chatloop_test.go
index 3b3a4483ed..89f5996e1a 100644
--- a/coderd/x/chatd/chatloop/chatloop_test.go
+++ b/coderd/x/chatd/chatloop/chatloop_test.go
@@ -339,6 +339,137 @@ func TestRun_ActiveToolsAllowsProviderRunnerExecution(t *testing.T) {
"persisted step should include the provider runner result")
}
+func TestRun_ProviderToolResultProviderMetadata(t *testing.T) {
+ t.Parallel()
+
+ expectedMetadata := fantasy.ProviderMetadata{
+ "openai": &testProviderData{data: map[string]any{
+ "detail": "original",
+ }},
+ }
+
+ tests := []struct {
+ name string
+ callback func(fantasy.ToolResponse) fantasy.ProviderMetadata
+ want fantasy.ProviderMetadata
+ }{
+ {
+ name: "callback returns metadata",
+ callback: func(fantasy.ToolResponse) fantasy.ProviderMetadata {
+ return expectedMetadata
+ },
+ want: expectedMetadata,
+ },
+ {
+ name: "callback nil",
+ want: nil,
+ },
+ {
+ name: "callback returns nil",
+ callback: func(fantasy.ToolResponse) fantasy.ProviderMetadata {
+ return nil
+ },
+ want: nil,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ providerRunnerName := "computer"
+ model := &chattest.FakeModel{
+ ProviderName: "fake",
+ StreamFn: func(_ context.Context, _ fantasy.Call) (fantasy.StreamResponse, error) {
+ return streamFromParts([]fantasy.StreamPart{
+ {Type: fantasy.StreamPartTypeToolInputStart, ID: "tc-provider-runner", ToolCallName: providerRunnerName},
+ {Type: fantasy.StreamPartTypeToolInputDelta, ID: "tc-provider-runner", Delta: `{}`},
+ {Type: fantasy.StreamPartTypeToolInputEnd, ID: "tc-provider-runner"},
+ {
+ Type: fantasy.StreamPartTypeToolCall,
+ ID: "tc-provider-runner",
+ ToolCallName: providerRunnerName,
+ ToolCallInput: `{}`,
+ },
+ {Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonToolCalls},
+ }), nil
+ },
+ }
+
+ runnerTool := fantasy.NewAgentTool(
+ providerRunnerName,
+ "provider runner",
+ func(context.Context, struct{}, fantasy.ToolCall) (fantasy.ToolResponse, error) {
+ return fantasy.ToolResponse{
+ Type: "image",
+ Data: []byte("image bytes"),
+ MediaType: "image/png",
+ Content: "screenshot",
+ }, nil
+ },
+ )
+
+ var persistedStep PersistedStep
+ err := Run(context.Background(), RunOptions{
+ Model: model,
+ Messages: []fantasy.Message{
+ textMessage(fantasy.MessageRoleUser, "use the computer"),
+ },
+ ProviderTools: []ProviderTool{
+ {
+ Definition: fantasy.FunctionTool{
+ Name: providerRunnerName,
+ Description: "provider runner",
+ InputSchema: map[string]any{
+ "type": "object",
+ "properties": map[string]any{},
+ },
+ },
+ Runner: runnerTool,
+ ResultProviderMetadata: tt.callback,
+ },
+ },
+ MaxSteps: 1,
+ PersistStep: func(_ context.Context, step PersistedStep) error {
+ persistedStep = step
+ return nil
+ },
+ })
+ require.NoError(t, err)
+
+ var foundResult fantasy.ToolResultContent
+ for _, block := range persistedStep.Content {
+ toolResult, ok := fantasy.AsContentType[fantasy.ToolResultContent](block)
+ if !ok || toolResult.ToolName != providerRunnerName {
+ continue
+ }
+ foundResult = toolResult
+ break
+ }
+ require.NotEmpty(t, foundResult.ToolCallID,
+ "persisted step should include the provider runner result")
+
+ mediaResult, ok := foundResult.Result.(fantasy.ToolResultOutputContentMedia)
+ require.True(t, ok, "expected media result")
+ assert.Equal(t, "image/png", mediaResult.MediaType)
+ assert.Equal(t, tt.want, foundResult.ProviderMetadata)
+
+ if tt.want == nil {
+ return
+ }
+
+ messages := stepResult{content: persistedStep.Content}.toResponseMessages()
+ require.Len(t, messages, 2)
+ require.Equal(t, fantasy.MessageRoleTool, messages[1].Role)
+ require.Len(t, messages[1].Content, 1)
+
+ resultPart, ok := fantasy.AsMessagePart[fantasy.ToolResultPart](messages[1].Content[0])
+ require.True(t, ok, "expected outbound tool result part")
+ assert.Equal(t, fantasy.ProviderOptions(tt.want), resultPart.ProviderOptions)
+ })
+ }
+}
+
func TestProcessStepStream_AnthropicUsageMatchesFinalDelta(t *testing.T) {
t.Parallel()
@@ -3917,6 +4048,7 @@ func TestExecuteSingleTool_MediaBase64Encoding(t *testing.T) {
map[string]bool{},
[]string{"screenshot"},
map[string]struct{}{},
+ nil,
)
media, ok := result.Result.(fantasy.ToolResultOutputContentMedia)
@@ -3963,6 +4095,7 @@ func TestExecuteSingleTool_MediaBase64Encoding(t *testing.T) {
map[string]bool{},
[]string{"screenshot"},
map[string]struct{}{},
+ nil,
)
media, ok := result.Result.(fantasy.ToolResultOutputContentMedia)
@@ -4004,6 +4137,7 @@ func TestExecuteSingleTool_MediaBase64Encoding(t *testing.T) {
map[string]bool{},
[]string{"echo"},
map[string]struct{}{},
+ nil,
)
textOutput, ok := result.Result.(fantasy.ToolResultOutputContentText)
diff --git a/coderd/x/chatd/chatopenai/computeruse/computeruse.go b/coderd/x/chatd/chatopenai/computeruse/computeruse.go
new file mode 100644
index 0000000000..116b2a78bb
--- /dev/null
+++ b/coderd/x/chatd/chatopenai/computeruse/computeruse.go
@@ -0,0 +1,494 @@
+package computeruse
+
+import (
+ "slices"
+ "strings"
+ "unicode"
+
+ "charm.land/fantasy"
+ fantasyopenai "charm.land/fantasy/providers/openai"
+ "golang.org/x/xerrors"
+
+ "github.com/coder/coder/v2/codersdk/workspacesdk"
+)
+
+// ComputerUseTool returns the OpenAI provider-defined computer-use tool.
+func Tool() fantasy.Tool {
+ return fantasyopenai.NewComputerUseTool(nil).Definition()
+}
+
+// IsComputerUseTool reports whether tool is the OpenAI provider-defined
+// computer-use tool.
+func IsTool(tool fantasy.Tool) bool {
+ return fantasyopenai.IsComputerUseTool(tool)
+}
+
+// ParseInput parses an OpenAI computer-use tool call input.
+func ParseInput(input string) (*fantasyopenai.ComputerUseInput, error) {
+ return fantasyopenai.ParseComputerUseInput(input)
+}
+
+// ComputerUseResultProviderMetadata returns metadata that should accompany an
+// OpenAI computer-use screenshot result.
+func ResultProviderMetadata(response fantasy.ToolResponse) fantasy.ProviderMetadata {
+ if response.IsError || response.Type != "image" || len(response.Data) == 0 ||
+ !strings.HasPrefix(response.MediaType, "image/") {
+ return nil
+ }
+
+ return fantasy.ProviderMetadata{
+ fantasyopenai.Name: &fantasyopenai.ComputerCallOutputOptions{
+ Detail: "original",
+ },
+ }
+}
+
+// OpenAI scroll deltas are pixels, but Coder desktop scroll amounts are
+// wheel clicks.
+const computerUseScrollPixelsPerWheelClick int64 = 100
+
+// ComputerUseDesktopAction is a Coder desktop operation requested by an
+// OpenAI computer-use tool call.
+type DesktopAction struct {
+ Action workspacesdk.DesktopAction
+ WaitDurationMillis int64
+ ReleaseMouseOnFailure bool
+ ReleaseKeysOnFailure []string
+}
+
+// ComputerUseDesktopActions converts an OpenAI computer-use tool call into
+// Coder desktop actions. A caller should execute the returned actions in order,
+// wait for WaitDurationMillis entries, and then return a final screenshot.
+func DesktopActions(
+ parsed *fantasyopenai.ComputerUseInput,
+ declaredWidth, declaredHeight int,
+) ([]DesktopAction, error) {
+ if parsed == nil {
+ return nil, xerrors.New("OpenAI computer use input is nil")
+ }
+ var err error
+ actions := make([]DesktopAction, 0, len(parsed.Actions))
+ for _, action := range parsed.Actions {
+ switch action.Type {
+ case "screenshot":
+ // OpenAI returns one screenshot per response; individual screenshot
+ // actions in the batch are fulfilled by the batch-final capture.
+ continue
+ case "move":
+ actions = append(actions, DesktopAction{
+ Action: desktopActionWithCoordinate(
+ "mouse_move",
+ declaredWidth,
+ declaredHeight,
+ action.X,
+ action.Y,
+ ),
+ })
+ case "click":
+ actionSet, err := clickActions(
+ action.Button,
+ declaredWidth,
+ declaredHeight,
+ action.X,
+ action.Y,
+ )
+ if err != nil {
+ return nil, err
+ }
+ actions, err = appendWithModifiers(actions, action.Keys, actionSet)
+ if err != nil {
+ return nil, err
+ }
+ case "double_click":
+ actionName, ok := DoubleClickAction(action.Button)
+ if !ok {
+ return nil, xerrors.Errorf(
+ "unsupported OpenAI double-click button %q",
+ action.Button,
+ )
+ }
+ actionSet := []DesktopAction{{
+ Action: desktopActionWithCoordinate(
+ actionName,
+ declaredWidth,
+ declaredHeight,
+ action.X,
+ action.Y,
+ ),
+ }}
+ actions, err = appendWithModifiers(actions, action.Keys, actionSet)
+ if err != nil {
+ return nil, err
+ }
+ case "drag":
+ if len(action.Path) < 2 {
+ return nil, xerrors.New("OpenAI drag action requires at least two path points")
+ }
+ actionSet := []DesktopAction{
+ {
+ Action: desktopActionWithCoordinate(
+ "mouse_move",
+ declaredWidth,
+ declaredHeight,
+ action.Path[0].X,
+ action.Path[0].Y,
+ ),
+ },
+ {
+ Action: desktopAction(
+ "left_mouse_down",
+ declaredWidth,
+ declaredHeight,
+ ),
+ ReleaseMouseOnFailure: true,
+ },
+ }
+ for _, point := range action.Path[1:] {
+ actionSet = append(actionSet, DesktopAction{
+ Action: desktopActionWithCoordinate(
+ "mouse_move",
+ declaredWidth,
+ declaredHeight,
+ point.X,
+ point.Y,
+ ),
+ ReleaseMouseOnFailure: true,
+ })
+ }
+ actionSet = append(actionSet, DesktopAction{
+ Action: desktopAction(
+ "left_mouse_up",
+ declaredWidth,
+ declaredHeight,
+ ),
+ ReleaseMouseOnFailure: true,
+ })
+ actions, err = appendWithModifiers(actions, action.Keys, actionSet)
+ if err != nil {
+ return nil, err
+ }
+ case "keypress":
+ text, err := NormalizeKeys(action.Keys)
+ if err != nil {
+ return nil, err
+ }
+ desktopAction := desktopAction("key", declaredWidth, declaredHeight)
+ desktopAction.Text = &text
+ actions = append(actions, DesktopAction{Action: desktopAction})
+ case "type":
+ desktopAction := desktopAction("type", declaredWidth, declaredHeight)
+ desktopAction.Text = &action.Text
+ actions = append(actions, DesktopAction{Action: desktopAction})
+ case "scroll":
+ actionSet := computerUseScrollActions(
+ declaredWidth,
+ declaredHeight,
+ action.X,
+ action.Y,
+ action.ScrollX,
+ action.ScrollY,
+ )
+ actions, err = appendWithModifiers(actions, action.Keys, actionSet)
+ if err != nil {
+ return nil, err
+ }
+ case "wait":
+ actions = append(actions, DesktopAction{WaitDurationMillis: 1000})
+ default:
+ return nil, xerrors.Errorf(
+ "unsupported OpenAI computer action type %q",
+ action.Type,
+ )
+ }
+ }
+ return actions, nil
+}
+
+func appendWithModifiers(
+ actions []DesktopAction,
+ keys []string,
+ actionSet []DesktopAction,
+) ([]DesktopAction, error) {
+ if len(keys) == 0 {
+ return append(actions, actionSet...), nil
+ }
+
+ modifiers := make([]string, 0, len(keys))
+ for _, key := range keys {
+ modifier, err := normalizeComputerUseKey(key)
+ if err != nil {
+ return nil, err
+ }
+ modifiers = append(modifiers, modifier)
+ }
+
+ heldKeys := make([]string, 0, len(modifiers))
+ for _, modifier := range modifiers {
+ nextHeldKeys := append(slices.Clone(heldKeys), modifier)
+ desktopAction := desktopAction("key_down", 0, 0)
+ desktopAction.Text = &modifier
+ actions = append(actions, DesktopAction{
+ Action: desktopAction,
+ ReleaseKeysOnFailure: nextHeldKeys,
+ })
+ heldKeys = nextHeldKeys
+ }
+
+ for _, action := range actionSet {
+ action.ReleaseKeysOnFailure = slices.Clone(heldKeys)
+ actions = append(actions, action)
+ }
+
+ for i := len(heldKeys) - 1; i >= 0; i-- {
+ key := heldKeys[i]
+ desktopAction := desktopAction("key_up", 0, 0)
+ desktopAction.Text = &key
+ actions = append(actions, DesktopAction{
+ Action: desktopAction,
+ ReleaseKeysOnFailure: slices.Clone(heldKeys[:i+1]),
+ })
+ }
+ return actions, nil
+}
+
+func computerUseScrollActions(
+ declaredWidth, declaredHeight int,
+ x, y, scrollX, scrollY int64,
+) []DesktopAction {
+ coord := coordinateFromInt64(x, y)
+ moveAction := desktopAction("mouse_move", declaredWidth, declaredHeight)
+ moveAction.Coordinate = &coord
+ actions := []DesktopAction{{Action: moveAction}}
+
+ if scrollY != 0 {
+ direction := "down"
+ if scrollY < 0 {
+ direction = "up"
+ }
+ scrollAction := desktopAction("scroll", declaredWidth, declaredHeight)
+ scrollAction.Coordinate = &coord
+ scrollAction.ScrollDirection = &direction
+ amount := scrollPixelsToWheelClicks(scrollY)
+ scrollAction.ScrollAmount = &amount
+ actions = append(actions, DesktopAction{Action: scrollAction})
+ }
+
+ if scrollX != 0 {
+ direction := "right"
+ if scrollX < 0 {
+ direction = "left"
+ }
+ scrollAction := desktopAction("scroll", declaredWidth, declaredHeight)
+ scrollAction.Coordinate = &coord
+ scrollAction.ScrollDirection = &direction
+ amount := scrollPixelsToWheelClicks(scrollX)
+ scrollAction.ScrollAmount = &amount
+ actions = append(actions, DesktopAction{Action: scrollAction})
+ }
+ return actions
+}
+
+func desktopActionWithCoordinate(
+ action string,
+ declaredWidth, declaredHeight int,
+ x, y int64,
+) workspacesdk.DesktopAction {
+ desktopAction := desktopAction(action, declaredWidth, declaredHeight)
+ coord := coordinateFromInt64(x, y)
+ desktopAction.Coordinate = &coord
+ return desktopAction
+}
+
+func desktopAction(
+ action string,
+ declaredWidth, declaredHeight int,
+) workspacesdk.DesktopAction {
+ return workspacesdk.DesktopAction{
+ Action: action,
+ ScaledWidth: &declaredWidth,
+ ScaledHeight: &declaredHeight,
+ }
+}
+
+func coordinateFromInt64(x, y int64) [2]int {
+ return [2]int{int(x), int(y)}
+}
+
+func scrollPixelsToWheelClicks(pixels int64) int {
+ if pixels < 0 {
+ pixels = -pixels
+ }
+ if pixels == 0 {
+ return 0
+ }
+ return int((pixels + computerUseScrollPixelsPerWheelClick - 1) /
+ computerUseScrollPixelsPerWheelClick)
+}
+
+func clickActions(
+ button string,
+ declaredWidth, declaredHeight int,
+ x, y int64,
+) ([]DesktopAction, error) {
+ actionName, ok := ClickAction(button)
+ if ok {
+ return []DesktopAction{{
+ Action: desktopActionWithCoordinate(
+ actionName,
+ declaredWidth,
+ declaredHeight,
+ x,
+ y,
+ ),
+ }}, nil
+ }
+
+ navigationKey := ""
+ switch button {
+ case "back":
+ navigationKey = "alt+Left"
+ case "forward":
+ navigationKey = "alt+Right"
+ default:
+ return nil, xerrors.Errorf("unsupported OpenAI click button %q", button)
+ }
+
+ keyAction := desktopAction("key", 0, 0)
+ keyAction.Text = &navigationKey
+ return []DesktopAction{
+ {
+ Action: desktopActionWithCoordinate(
+ "mouse_move",
+ declaredWidth,
+ declaredHeight,
+ x,
+ y,
+ ),
+ },
+ {Action: keyAction},
+ }, nil
+}
+
+// DoubleClickAction maps an OpenAI computer-use double-click button to a Coder
+// desktop action name. The desktop API currently supports only left-button
+// double-clicks.
+func DoubleClickAction(button string) (string, bool) {
+ switch button {
+ case "", "left":
+ return "double_click", true
+ default:
+ return "", false
+ }
+}
+
+// ComputerUseClickAction maps an OpenAI computer-use click button to a Coder
+// desktop action name.
+func ClickAction(button string) (string, bool) {
+ switch button {
+ case "", "left":
+ return "left_click", true
+ case "right":
+ return "right_click", true
+ case "middle", "wheel":
+ return "middle_click", true
+ default:
+ return "", false
+ }
+}
+
+// NormalizeComputerUseKeys maps OpenAI keypress tokens to Coder desktop key
+// action tokens.
+func NormalizeKeys(keys []string) (string, error) {
+ if len(keys) == 0 {
+ return "", xerrors.New("OpenAI keypress action requires at least one key")
+ }
+ normalized := make([]string, 0, len(keys))
+ for _, key := range keys {
+ normalizedKey, err := normalizeComputerUseKey(key)
+ if err != nil {
+ return "", err
+ }
+ normalized = append(normalized, normalizedKey)
+ }
+ return strings.Join(normalized, "+"), nil
+}
+
+func normalizeComputerUseKey(key string) (string, error) {
+ trimmed := strings.TrimSpace(key)
+ if trimmed == "" {
+ return "", xerrors.New("OpenAI keypress action contains an empty key")
+ }
+
+ lower := strings.ToLower(trimmed)
+ switch lower {
+ case "ctrl", "control":
+ return "ctrl", nil
+ case "cmd", "command", "meta", "super":
+ return "meta", nil
+ case "shift":
+ return "shift", nil
+ case "alt", "option":
+ return "alt", nil
+ case "enter", "return":
+ return "Return", nil
+ case "escape", "esc":
+ return "Escape", nil
+ case "tab":
+ return "Tab", nil
+ case "space":
+ return "space", nil
+ case "backspace":
+ return "BackSpace", nil
+ case "delete", "del":
+ return "Delete", nil
+ case "arrowup", "up":
+ return "Up", nil
+ case "arrowdown", "down":
+ return "Down", nil
+ case "arrowleft", "left":
+ return "Left", nil
+ case "arrowright", "right":
+ return "Right", nil
+ }
+
+ if isFunctionKey(lower) {
+ return "F" + lower[1:], nil
+ }
+
+ runes := []rune(trimmed)
+ if len(runes) == 1 {
+ r := runes[0]
+ if unicode.IsLetter(r) {
+ return strings.ToLower(trimmed), nil
+ }
+ if unicode.IsDigit(r) {
+ return trimmed, nil
+ }
+ if unicode.IsPunct(r) || unicode.IsSymbol(r) {
+ return trimmed, nil
+ }
+ return "", xerrors.Errorf("unsupported OpenAI keypress %q", trimmed)
+ }
+
+ return "", xerrors.Errorf("unsupported OpenAI keypress %q", trimmed)
+}
+
+func isFunctionKey(key string) bool {
+ if len(key) < 2 || key[0] != 'f' {
+ return false
+ }
+ number, ok := strings.CutPrefix(key, "f")
+ if !ok || number == "" {
+ return false
+ }
+ for _, r := range number {
+ if r < '0' || r > '9' {
+ return false
+ }
+ }
+ value := 0
+ for _, r := range number {
+ value = value*10 + int(r-'0')
+ }
+ return value >= 1 && value <= 35
+}
diff --git a/coderd/x/chatd/chatopenai/computeruse/computeruse_test.go b/coderd/x/chatd/chatopenai/computeruse/computeruse_test.go
new file mode 100644
index 0000000000..f75efc1f8b
--- /dev/null
+++ b/coderd/x/chatd/chatopenai/computeruse/computeruse_test.go
@@ -0,0 +1,199 @@
+package computeruse_test
+
+import (
+ "testing"
+
+ "charm.land/fantasy"
+ fantasyopenai "charm.land/fantasy/providers/openai"
+ "github.com/stretchr/testify/require"
+
+ "github.com/coder/coder/v2/coderd/x/chatd/chatopenai/computeruse"
+)
+
+func TestComputerUseTool(t *testing.T) {
+ t.Parallel()
+
+ tool := computeruse.Tool()
+ require.True(t, computeruse.IsTool(tool))
+ require.Equal(t, "computer", tool.GetName())
+}
+
+func TestComputerUseResultProviderMetadata(t *testing.T) {
+ t.Parallel()
+
+ t.Run("SuccessfulImage", func(t *testing.T) {
+ t.Parallel()
+
+ metadata := computeruse.ResultProviderMetadata(
+ fantasy.NewImageResponse([]byte("png"), "image/png"),
+ )
+ outputOptions, ok := metadata[fantasyopenai.Name].(*fantasyopenai.ComputerCallOutputOptions)
+ require.True(t, ok)
+ require.Equal(t, "original", outputOptions.Detail)
+ })
+
+ tests := []struct {
+ name string
+ response fantasy.ToolResponse
+ }{
+ {name: "Error", response: fantasy.NewTextErrorResponse("failed")},
+ {name: "Text", response: fantasy.NewTextResponse("ok")},
+ {name: "EmptyImage", response: fantasy.NewImageResponse(nil, "image/png")},
+ {
+ name: "NonImageMediaType",
+ response: fantasy.NewImageResponse([]byte("png"), "application/octet-stream"),
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ metadata := computeruse.ResultProviderMetadata(tt.response)
+ require.Nil(t, metadata)
+ })
+ }
+}
+
+func TestDesktopActionsWrapsPointerActionsWithModifiers(t *testing.T) {
+ t.Parallel()
+
+ input, err := computeruse.ParseInput(`{
+ "call_id":"call_click_modifier",
+ "actions":[{"type":"click","button":"left","x":70,"y":80,"keys":["ctrl","shift"]}]
+ }`)
+ require.NoError(t, err)
+
+ actions, err := computeruse.DesktopActions(input, 1440, 900)
+ require.NoError(t, err)
+ require.Len(t, actions, 5)
+
+ require.Equal(t, "key_down", actions[0].Action.Action)
+ require.NotNil(t, actions[0].Action.Text)
+ require.Equal(t, "ctrl", *actions[0].Action.Text)
+ require.Equal(t, []string{"ctrl"}, actions[0].ReleaseKeysOnFailure)
+
+ require.Equal(t, "key_down", actions[1].Action.Action)
+ require.NotNil(t, actions[1].Action.Text)
+ require.Equal(t, "shift", *actions[1].Action.Text)
+ require.Equal(t, []string{"ctrl", "shift"}, actions[1].ReleaseKeysOnFailure)
+
+ require.Equal(t, "left_click", actions[2].Action.Action)
+ require.Equal(t, []string{"ctrl", "shift"}, actions[2].ReleaseKeysOnFailure)
+
+ require.Equal(t, "key_up", actions[3].Action.Action)
+ require.NotNil(t, actions[3].Action.Text)
+ require.Equal(t, "shift", *actions[3].Action.Text)
+ require.Equal(t, []string{"ctrl", "shift"}, actions[3].ReleaseKeysOnFailure)
+
+ require.Equal(t, "key_up", actions[4].Action.Action)
+ require.NotNil(t, actions[4].Action.Text)
+ require.Equal(t, "ctrl", *actions[4].Action.Text)
+ require.Equal(t, []string{"ctrl"}, actions[4].ReleaseKeysOnFailure)
+}
+
+func TestDesktopActionsMarksFinalDragReleaseForCleanup(t *testing.T) {
+ t.Parallel()
+
+ input, err := computeruse.ParseInput(`{
+ "call_id":"call_drag",
+ "actions":[{"type":"drag","path":[{"x":1,"y":2},{"x":3,"y":4}]}]
+ }`)
+ require.NoError(t, err)
+
+ actions, err := computeruse.DesktopActions(input, 1440, 900)
+ require.NoError(t, err)
+ require.Len(t, actions, 4)
+ require.Equal(t, "left_mouse_down", actions[1].Action.Action)
+ require.True(t, actions[1].ReleaseMouseOnFailure)
+ require.Equal(t, "left_mouse_up", actions[3].Action.Action)
+ require.True(t, actions[3].ReleaseMouseOnFailure)
+}
+
+func TestDesktopActionsDefaultsEmptyClickButtonToLeft(t *testing.T) {
+ t.Parallel()
+
+ input, err := computeruse.ParseInput(`{
+ "call_id":"call_empty_button",
+ "actions":[{"type":"click","x":70,"y":80}]
+ }`)
+ require.NoError(t, err)
+
+ actions, err := computeruse.DesktopActions(input, 1440, 900)
+ require.NoError(t, err)
+ require.Len(t, actions, 1)
+ require.Equal(t, "left_click", actions[0].Action.Action)
+}
+
+func TestDesktopActionsMapsBackForwardClickButtons(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ button string
+ wantKey string
+ }{
+ {name: "Back", button: "back", wantKey: "alt+Left"},
+ {name: "Forward", button: "forward", wantKey: "alt+Right"},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ input, err := computeruse.ParseInput(`{
+ "call_id":"call_side_button",
+ "actions":[{"type":"click","button":"` + tt.button + `","x":70,"y":80}]
+ }`)
+ require.NoError(t, err)
+
+ actions, err := computeruse.DesktopActions(input, 1440, 900)
+ require.NoError(t, err)
+ require.Len(t, actions, 2)
+ require.Equal(t, "mouse_move", actions[0].Action.Action)
+ require.Equal(t, "key", actions[1].Action.Action)
+ require.NotNil(t, actions[1].Action.Text)
+ require.Equal(t, tt.wantKey, *actions[1].Action.Text)
+ })
+ }
+}
+
+func TestDesktopActionsRejectsUnsupportedDoubleClickButton(t *testing.T) {
+ t.Parallel()
+
+ input, err := computeruse.ParseInput(`{
+ "call_id":"call_double_click",
+ "actions":[{"type":"double_click","button":"right","x":70,"y":80}]
+ }`)
+ require.NoError(t, err)
+
+ _, err = computeruse.DesktopActions(input, 1440, 900)
+ require.Error(t, err)
+ require.Contains(t, err.Error(), `unsupported OpenAI double-click button "right"`)
+}
+
+func TestDesktopActionsConvertsScrollPixelsToWheelClicks(t *testing.T) {
+ t.Parallel()
+
+ input, err := computeruse.ParseInput(`{
+ "call_id":"call_scroll",
+ "actions":[{"type":"scroll","x":70,"y":80,"scroll_y":401,"scroll_x":-99}]
+ }`)
+ require.NoError(t, err)
+
+ actions, err := computeruse.DesktopActions(input, 1440, 900)
+ require.NoError(t, err)
+ require.Len(t, actions, 3)
+
+ vertical := actions[1].Action
+ require.NotNil(t, vertical.ScrollAmount)
+ require.NotNil(t, vertical.ScrollDirection)
+ require.Equal(t, "down", *vertical.ScrollDirection)
+ require.Equal(t, 5, *vertical.ScrollAmount)
+
+ horizontal := actions[2].Action
+ require.NotNil(t, horizontal.ScrollAmount)
+ require.NotNil(t, horizontal.ScrollDirection)
+ require.Equal(t, "left", *horizontal.ScrollDirection)
+ require.Equal(t, 1, *horizontal.ScrollAmount)
+}
diff --git a/coderd/x/chatd/chattool/computeruse.go b/coderd/x/chatd/chattool/computeruse.go
index 1b7a639148..fcff921b49 100644
--- a/coderd/x/chatd/chattool/computeruse.go
+++ b/coderd/x/chatd/chattool/computeruse.go
@@ -4,28 +4,84 @@ import (
"context"
"encoding/base64"
"fmt"
+ "slices"
+ "strings"
"time"
"charm.land/fantasy"
fantasyanthropic "charm.land/fantasy/providers/anthropic"
+ "golang.org/x/xerrors"
"cdr.dev/slog/v3"
+ openaicomputeruse "github.com/coder/coder/v2/coderd/x/chatd/chatopenai/computeruse"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/quartz"
)
const (
- // ComputerUseModelProvider is the provider for the computer
- // use model.
- ComputerUseModelProvider = "anthropic"
- // ComputerUseModelName is the model used for computer use
- // subagents.
- ComputerUseModelName = "claude-opus-4-6"
+ // ComputerUseProviderAnthropic identifies Anthropic computer use.
+ ComputerUseProviderAnthropic = "anthropic"
+ // ComputerUseProviderOpenAI identifies OpenAI computer use.
+ ComputerUseProviderOpenAI = "openai"
+ // ComputerUseModelProviderDefault is the default model provider name for
+ // computer use, equal to ComputerUseProviderAnthropic.
+ ComputerUseModelProviderDefault = ComputerUseProviderAnthropic
+ // ComputerUseAnthropicModelName is the default Anthropic model used for
+ // computer use subagents.
+ ComputerUseAnthropicModelName = "claude-opus-4-6"
+ // ComputerUseOpenAIModelName is the default OpenAI model used for computer use.
+ ComputerUseOpenAIModelName = "gpt-5.5"
)
-// computerUseTool implements fantasy.AgentTool and
-// chatloop.ToolDefiner for Anthropic computer use.
+// SupportedComputerUseProviders returns the providers supported by computer use.
+// The returned slice is a fresh copy and safe to mutate.
+func SupportedComputerUseProviders() []string {
+ return []string{
+ ComputerUseProviderAnthropic,
+ ComputerUseProviderOpenAI,
+ }
+}
+
+// IsSupportedComputerUseProvider reports whether provider supports computer use.
+func IsSupportedComputerUseProvider(provider string) bool {
+ return slices.Contains(SupportedComputerUseProviders(), provider)
+}
+
+// DefaultComputerUseProvider returns the effective computer use provider.
+func DefaultComputerUseProvider(provider string) string {
+ if provider == "" {
+ return ComputerUseProviderAnthropic
+ }
+ return provider
+}
+
+// DefaultComputerUseModel returns the default model for a computer use provider.
+func DefaultComputerUseModel(provider string) (modelProvider, modelName string, ok bool) {
+ switch DefaultComputerUseProvider(provider) {
+ case ComputerUseProviderAnthropic:
+ return ComputerUseModelProviderDefault, ComputerUseAnthropicModelName, true
+ case ComputerUseProviderOpenAI:
+ // Keep OpenAI isolated here because computer-use models may advance.
+ return ComputerUseProviderOpenAI, ComputerUseOpenAIModelName, true
+ default:
+ return "", "", false
+ }
+}
+
+// DefaultComputerUseDesktopGeometry returns provider-specific model-facing
+// desktop geometry for computer use.
+func DefaultComputerUseDesktopGeometry(provider string) workspacesdk.DesktopGeometry {
+ switch DefaultComputerUseProvider(provider) {
+ case ComputerUseProviderOpenAI:
+ return workspacesdk.DefaultOpenAIComputerUseDesktopGeometry()
+ default:
+ return workspacesdk.DefaultDesktopGeometry()
+ }
+}
+
+// computerUseTool implements fantasy.AgentTool and chatloop.ToolDefiner.
type computerUseTool struct {
+ provider string
declaredWidth int
declaredHeight int
getWorkspaceConn func(ctx context.Context) (workspacesdk.AgentConn, error)
@@ -35,11 +91,12 @@ type computerUseTool struct {
logger slog.Logger
}
-// NewComputerUseTool creates a computer use AgentTool that delegates to the
-// agent's desktop endpoints. declaredWidth and declaredHeight are the
-// model-facing desktop dimensions advertised to Anthropic and requested for
-// screenshots.
+// NewComputerUseTool creates a provider-aware computer use AgentTool that
+// delegates to the agent's desktop endpoints. declaredWidth and declaredHeight
+// are the model-facing desktop dimensions advertised to providers and requested
+// for screenshots.
func NewComputerUseTool(
+ provider string,
declaredWidth, declaredHeight int,
getWorkspaceConn func(ctx context.Context) (workspacesdk.AgentConn, error),
storeFile StoreFileFunc,
@@ -47,6 +104,7 @@ func NewComputerUseTool(
logger slog.Logger,
) fantasy.AgentTool {
return &computerUseTool{
+ provider: DefaultComputerUseProvider(provider),
declaredWidth: declaredWidth,
declaredHeight: declaredHeight,
getWorkspaceConn: getWorkspaceConn,
@@ -67,20 +125,31 @@ func (*computerUseTool) Info() fantasy.ToolInfo {
}
}
-// ComputerUseProviderTool creates the provider-defined Anthropic computer-use
-// tool definition using the declared model-facing desktop geometry.
-func ComputerUseProviderTool(declaredWidth, declaredHeight int) fantasy.Tool {
- // The run callback is nil because execution is handled separately
- // by the AgentTool runner in the chatloop. We extract just the
- // provider-defined tool definition.
- return fantasyanthropic.NewComputerUseTool(
- fantasyanthropic.ComputerUseToolOptions{
- DisplayWidthPx: int64(declaredWidth),
- DisplayHeightPx: int64(declaredHeight),
- ToolVersion: fantasyanthropic.ComputerUse20251124,
- },
- nil,
- ).Definition()
+// ComputerUseProviderTool creates the provider-defined computer-use tool
+// definition using the declared model-facing desktop geometry.
+func ComputerUseProviderTool(provider string, declaredWidth, declaredHeight int) (fantasy.Tool, error) {
+ switch DefaultComputerUseProvider(provider) {
+ case ComputerUseProviderAnthropic:
+ // The run callback is nil because execution is handled separately
+ // by the AgentTool runner in the chatloop. We extract just the
+ // provider-defined tool definition.
+ return fantasyanthropic.NewComputerUseTool(
+ fantasyanthropic.ComputerUseToolOptions{
+ DisplayWidthPx: int64(declaredWidth),
+ DisplayHeightPx: int64(declaredHeight),
+ ToolVersion: fantasyanthropic.ComputerUse20251124,
+ },
+ nil,
+ ).Definition(), nil
+ case ComputerUseProviderOpenAI:
+ // OpenAI's GA computer tool schema does not accept display
+ // dimensions. The declared geometry is applied through screenshot
+ // sizing and desktop action coordinate scaling.
+ return openaicomputeruse.Tool(), nil
+ default:
+ return nil, xerrors.Errorf("unsupported computer use provider %q, supported providers: %s", provider,
+ strings.Join(SupportedComputerUseProviders(), ", "))
+ }
}
func (t *computerUseTool) ProviderOptions() fantasy.ProviderOptions {
@@ -92,6 +161,24 @@ func (t *computerUseTool) SetProviderOptions(opts fantasy.ProviderOptions) {
}
func (t *computerUseTool) Run(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolResponse, error) {
+ switch DefaultComputerUseProvider(t.provider) {
+ case ComputerUseProviderAnthropic:
+ return t.runAnthropicComputerUse(ctx, call)
+ case ComputerUseProviderOpenAI:
+ return t.runOpenAIComputerUse(ctx, call)
+ default:
+ return fantasy.NewTextErrorResponse(fmt.Sprintf(
+ "unsupported computer use provider %q, supported providers: %s",
+ t.provider,
+ strings.Join(SupportedComputerUseProviders(), ", "),
+ )), nil
+ }
+}
+
+func (t *computerUseTool) runAnthropicComputerUse(
+ ctx context.Context,
+ call fantasy.ToolCall,
+) (fantasy.ToolResponse, error) {
input, err := fantasyanthropic.ParseComputerUseInput(call.Input)
if err != nil {
return fantasy.NewTextErrorResponse(
@@ -110,16 +197,7 @@ func (t *computerUseTool) Run(ctx context.Context, call fantasy.ToolCall) (fanta
// For wait actions, sleep then return a screenshot.
if input.Action == fantasyanthropic.ActionWait {
- d := input.Duration
- if d <= 0 {
- d = 1000
- }
- timer := t.clock.NewTimer(time.Duration(d)*time.Millisecond, "computeruse", "wait")
- defer timer.Stop()
- select {
- case <-ctx.Done():
- case <-timer.C:
- }
+ t.wait(ctx, input.Duration)
return t.captureScreenshot(ctx, conn, declaredWidth, declaredHeight)
}
@@ -129,17 +207,13 @@ func (t *computerUseTool) Run(ctx context.Context, call fantasy.ToolCall) (fanta
}
// Build the action request.
- action := workspacesdk.DesktopAction{
- Action: string(input.Action),
- ScaledWidth: &declaredWidth,
- ScaledHeight: &declaredHeight,
- }
+ action := t.desktopAction(string(input.Action), declaredWidth, declaredHeight)
if input.Coordinate != ([2]int64{}) {
- coord := [2]int{int(input.Coordinate[0]), int(input.Coordinate[1])}
+ coord := coordinateFromInt64(input.Coordinate[0], input.Coordinate[1])
action.Coordinate = &coord
}
if input.StartCoordinate != ([2]int64{}) {
- coord := [2]int{int(input.StartCoordinate[0]), int(input.StartCoordinate[1])}
+ coord := coordinateFromInt64(input.StartCoordinate[0], input.StartCoordinate[1])
action.StartCoordinate = &coord
}
if input.Text != "" {
@@ -157,18 +231,124 @@ func (t *computerUseTool) Run(ctx context.Context, call fantasy.ToolCall) (fanta
action.ScrollDirection = &input.ScrollDirection
}
- // Execute the action.
- _, err = conn.ExecuteDesktopAction(ctx, action)
- if err != nil {
- return fantasy.NewTextErrorResponse(
- fmt.Sprintf("action %q failed: %v", input.Action, err),
- ), nil
+ if resp, done := t.executeDesktopAction(ctx, conn, action); done {
+ return resp, nil
}
// Take a screenshot after every action (Anthropic pattern).
return t.captureScreenshot(ctx, conn, declaredWidth, declaredHeight)
}
+func (t *computerUseTool) runOpenAIComputerUse(
+ ctx context.Context,
+ call fantasy.ToolCall,
+) (fantasy.ToolResponse, error) {
+ input, err := openaicomputeruse.ParseInput(call.Input)
+ if err != nil {
+ return fantasy.NewTextErrorResponse(
+ fmt.Sprintf("invalid computer use input: %v", err),
+ ), nil
+ }
+ conn, err := t.getWorkspaceConn(ctx)
+ if err != nil {
+ return fantasy.NewTextErrorResponse(
+ fmt.Sprintf("failed to connect to workspace: %v", err),
+ ), nil
+ }
+
+ declaredWidth, declaredHeight := t.declaredActionDimensions()
+ actions, err := openaicomputeruse.DesktopActions(
+ input,
+ declaredWidth,
+ declaredHeight,
+ )
+ if err != nil {
+ return fantasy.NewTextErrorResponse(err.Error()), nil
+ }
+ for _, action := range actions {
+ if action.WaitDurationMillis > 0 {
+ t.wait(ctx, action.WaitDurationMillis)
+ continue
+ }
+ if resp, done := t.executeDesktopAction(ctx, conn, action.Action); done {
+ if action.ReleaseMouseOnFailure {
+ _, err := conn.ExecuteDesktopAction(
+ ctx,
+ t.desktopAction("left_mouse_up", declaredWidth, declaredHeight),
+ )
+ if err != nil {
+ t.logger.Warn(ctx, "failed to release mouse after OpenAI drag error",
+ slog.Error(err),
+ )
+ }
+ }
+ t.releaseOpenAIModifierKeys(ctx, conn, action.ReleaseKeysOnFailure)
+ return resp, nil
+ }
+ }
+ return t.captureSharedScreenshot(ctx, conn, declaredWidth, declaredHeight)
+}
+
+func (t *computerUseTool) releaseOpenAIModifierKeys(
+ ctx context.Context,
+ conn workspacesdk.AgentConn,
+ keys []string,
+) {
+ for i := len(keys) - 1; i >= 0; i-- {
+ key := keys[i]
+ action := t.desktopAction("key_up", 0, 0)
+ action.Text = &key
+ if _, err := conn.ExecuteDesktopAction(ctx, action); err != nil {
+ t.logger.Warn(ctx, "failed to release OpenAI modifier key",
+ slog.F("key", key),
+ slog.Error(err),
+ )
+ }
+ }
+}
+
+func (*computerUseTool) executeDesktopAction(
+ ctx context.Context,
+ conn workspacesdk.AgentConn,
+ action workspacesdk.DesktopAction,
+) (fantasy.ToolResponse, bool) {
+ _, err := conn.ExecuteDesktopAction(ctx, action)
+ if err != nil {
+ return fantasy.NewTextErrorResponse(
+ fmt.Sprintf("action %q failed: %v", action.Action, err),
+ ), true
+ }
+ return fantasy.ToolResponse{}, false
+}
+
+func (*computerUseTool) desktopAction(
+ action string,
+ declaredWidth, declaredHeight int,
+) workspacesdk.DesktopAction {
+ return workspacesdk.DesktopAction{
+ Action: action,
+ ScaledWidth: &declaredWidth,
+ ScaledHeight: &declaredHeight,
+ }
+}
+
+func (t *computerUseTool) wait(ctx context.Context, durationMillis int64) {
+ d := durationMillis
+ if d <= 0 {
+ d = 1000
+ }
+ timer := t.clock.NewTimer(time.Duration(d)*time.Millisecond, "computeruse", "wait")
+ defer timer.Stop()
+ select {
+ case <-ctx.Done():
+ case <-timer.C:
+ }
+}
+
+func coordinateFromInt64(x, y int64) [2]int {
+ return [2]int{int(x), int(y)}
+}
+
func (t *computerUseTool) captureScreenshot(
ctx context.Context,
conn workspacesdk.AgentConn,
@@ -256,7 +436,7 @@ func executeScreenshotAction(
func (t *computerUseTool) declaredActionDimensions() (declaredWidth, declaredHeight int) {
if t.declaredWidth <= 0 || t.declaredHeight <= 0 {
- geometry := workspacesdk.DefaultDesktopGeometry()
+ geometry := DefaultComputerUseDesktopGeometry(t.provider)
return geometry.DeclaredWidth, geometry.DeclaredHeight
}
return t.declaredWidth, t.declaredHeight
diff --git a/coderd/x/chatd/chattool/computeruse_test.go b/coderd/x/chatd/chattool/computeruse_test.go
index fa92a55449..5138003345 100644
--- a/coderd/x/chatd/chattool/computeruse_test.go
+++ b/coderd/x/chatd/chattool/computeruse_test.go
@@ -5,8 +5,10 @@ import (
"context"
"encoding/base64"
"testing"
+ "time"
"charm.land/fantasy"
+ fantasyanthropic "charm.land/fantasy/providers/anthropic"
"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -14,23 +16,137 @@ import (
"golang.org/x/xerrors"
"cdr.dev/slog/v3/sloggers/slogtest"
+ openaicomputeruse "github.com/coder/coder/v2/coderd/x/chatd/chatopenai/computeruse"
"github.com/coder/coder/v2/coderd/x/chatd/chattool"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/codersdk/workspacesdk/agentconnmock"
+ "github.com/coder/coder/v2/testutil"
"github.com/coder/quartz"
)
+func TestDefaultComputerUseModel(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ provider string
+ wantModelProvider string
+ wantModelName string
+ wantOK bool
+ }{
+ {
+ name: "empty defaults to Anthropic",
+ provider: "",
+ wantModelProvider: chattool.ComputerUseModelProviderDefault,
+ wantModelName: chattool.ComputerUseAnthropicModelName,
+ wantOK: true,
+ },
+ {
+ name: "Anthropic",
+ provider: chattool.ComputerUseProviderAnthropic,
+ wantModelProvider: chattool.ComputerUseModelProviderDefault,
+ wantModelName: chattool.ComputerUseAnthropicModelName,
+ wantOK: true,
+ },
+ {
+ name: "OpenAI",
+ provider: chattool.ComputerUseProviderOpenAI,
+ wantModelProvider: chattool.ComputerUseProviderOpenAI,
+ wantModelName: chattool.ComputerUseOpenAIModelName,
+ wantOK: true,
+ },
+ {
+ name: "unsupported",
+ provider: "unsupported",
+ wantOK: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ modelProvider, modelName, ok := chattool.DefaultComputerUseModel(tt.provider)
+ assert.Equal(t, tt.wantOK, ok)
+ assert.Equal(t, tt.wantModelProvider, modelProvider)
+ assert.Equal(t, tt.wantModelName, modelName)
+ })
+ }
+}
+
+func TestDefaultComputerUseDesktopGeometry(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ provider string
+ declaredWidth int
+ declaredHeight int
+ }{
+ {
+ name: "empty defaults to Anthropic geometry",
+ provider: "",
+ declaredWidth: 1280,
+ declaredHeight: 720,
+ },
+ {
+ name: "Anthropic",
+ provider: chattool.ComputerUseProviderAnthropic,
+ declaredWidth: 1280,
+ declaredHeight: 720,
+ },
+ {
+ name: "OpenAI",
+ provider: chattool.ComputerUseProviderOpenAI,
+ declaredWidth: 1600,
+ declaredHeight: 900,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ geometry := chattool.DefaultComputerUseDesktopGeometry(tt.provider)
+ assert.Equal(t, tt.declaredWidth, geometry.DeclaredWidth)
+ assert.Equal(t, tt.declaredHeight, geometry.DeclaredHeight)
+ })
+ }
+}
+
func TestComputerUseProviderTool(t *testing.T) {
t.Parallel()
geometry := workspacesdk.DefaultDesktopGeometry()
- def := chattool.ComputerUseProviderTool(geometry.DeclaredWidth, geometry.DeclaredHeight)
+ def, err := chattool.ComputerUseProviderTool(
+ chattool.ComputerUseProviderAnthropic,
+ geometry.DeclaredWidth,
+ geometry.DeclaredHeight,
+ )
+ require.NoError(t, err)
pdt, ok := def.(fantasy.ProviderDefinedTool)
require.True(t, ok, "ComputerUseProviderTool should return a ProviderDefinedTool")
+ assert.True(t, fantasyanthropic.IsComputerUseTool(def))
assert.Contains(t, pdt.ID, "computer")
assert.Equal(t, "computer", pdt.Name)
assert.Equal(t, int64(geometry.DeclaredWidth), pdt.Args["display_width_px"])
assert.Equal(t, int64(geometry.DeclaredHeight), pdt.Args["display_height_px"])
+
+ openAITool, err := chattool.ComputerUseProviderTool(
+ chattool.ComputerUseProviderOpenAI,
+ geometry.DeclaredWidth,
+ geometry.DeclaredHeight,
+ )
+ require.NoError(t, err)
+ assert.True(t, openaicomputeruse.IsTool(openAITool))
+
+ _, err = chattool.ComputerUseProviderTool(
+ "unsupported",
+ geometry.DeclaredWidth,
+ geometry.DeclaredHeight,
+ )
+ require.Error(t, err)
+ assert.Contains(t, err.Error(), "unsupported computer use provider")
}
func TestComputerUseTool_Run_Screenshot(t *testing.T) {
@@ -56,7 +172,7 @@ func TestComputerUseTool_Run_Screenshot(t *testing.T) {
}, nil
})
- tool := chattool.NewComputerUseTool(geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
+ tool := chattool.NewComputerUseTool(chattool.ComputerUseProviderAnthropic, geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
return mockConn, nil
}, nil, quartz.NewReal(), slogtest.Make(t, nil))
@@ -100,7 +216,7 @@ func TestComputerUseTool_Run_Screenshot_PersistsAttachment(t *testing.T) {
var storedName string
var storedType string
var storedData []byte
- tool := chattool.NewComputerUseTool(geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
+ tool := chattool.NewComputerUseTool(chattool.ComputerUseProviderAnthropic, geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
return mockConn, nil
}, func(_ context.Context, name string, detectName string, data []byte) (chattool.AttachmentMetadata, error) {
storedName = name
@@ -154,7 +270,7 @@ func TestComputerUseTool_Run_Screenshot_StoreErrorFallsBackToImage(t *testing.T)
ScreenshotHeight: geometry.DeclaredHeight,
}, nil)
- tool := chattool.NewComputerUseTool(geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
+ tool := chattool.NewComputerUseTool(chattool.ComputerUseProviderAnthropic, geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
return mockConn, nil
}, func(_ context.Context, _ string, _ string, _ []byte) (chattool.AttachmentMetadata, error) {
return chattool.AttachmentMetadata{}, xerrors.New("chat already has the maximum of 20 linked files")
@@ -190,7 +306,7 @@ func TestComputerUseTool_Run_Screenshot_OversizedAttachmentFallsBackToImage(t *t
ScreenshotHeight: geometry.DeclaredHeight,
}, nil)
- tool := chattool.NewComputerUseTool(geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
+ tool := chattool.NewComputerUseTool(chattool.ComputerUseProviderAnthropic, geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
return mockConn, nil
}, func(_ context.Context, _ string, _ string, _ []byte) (chattool.AttachmentMetadata, error) {
t.Fatal("storeFile should not be called for oversized screenshots")
@@ -250,7 +366,7 @@ func TestComputerUseTool_Run_LeftClick(t *testing.T) {
}, nil
})
- tool := chattool.NewComputerUseTool(geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
+ tool := chattool.NewComputerUseTool(chattool.ComputerUseProviderAnthropic, geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
return mockConn, nil
}, func(_ context.Context, _ string, _ string, _ []byte) (chattool.AttachmentMetadata, error) {
t.Fatal("storeFile should not be called for left_click follow-up screenshots")
@@ -298,7 +414,7 @@ func TestComputerUseTool_Run_Wait(t *testing.T) {
}, nil
})
- tool := chattool.NewComputerUseTool(geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
+ tool := chattool.NewComputerUseTool(chattool.ComputerUseProviderAnthropic, geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
return mockConn, nil
}, func(_ context.Context, _ string, _ string, _ []byte) (chattool.AttachmentMetadata, error) {
t.Fatal("storeFile should not be called for wait screenshots")
@@ -345,6 +461,7 @@ func TestComputerUseTool_Run_ScreenshotDataIsDecodedBinary(t *testing.T) {
}, nil)
tool := chattool.NewComputerUseTool(
+ chattool.ComputerUseProviderAnthropic,
geometry.DeclaredWidth,
geometry.DeclaredHeight,
func(_ context.Context) (workspacesdk.AgentConn, error) {
@@ -386,7 +503,7 @@ func TestComputerUseTool_Run_ConnError(t *testing.T) {
t.Parallel()
geometry := workspacesdk.DefaultDesktopGeometry()
- tool := chattool.NewComputerUseTool(geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
+ tool := chattool.NewComputerUseTool(chattool.ComputerUseProviderAnthropic, geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
return nil, xerrors.New("workspace not available")
}, nil, quartz.NewReal(), slogtest.Make(t, nil))
@@ -406,7 +523,7 @@ func TestComputerUseTool_Run_InvalidInput(t *testing.T) {
t.Parallel()
geometry := workspacesdk.DefaultDesktopGeometry()
- tool := chattool.NewComputerUseTool(geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
+ tool := chattool.NewComputerUseTool(chattool.ComputerUseProviderAnthropic, geometry.DeclaredWidth, geometry.DeclaredHeight, func(_ context.Context) (workspacesdk.AgentConn, error) {
return nil, xerrors.New("should not be called")
}, nil, quartz.NewReal(), slogtest.Make(t, nil))
@@ -421,3 +538,561 @@ func TestComputerUseTool_Run_InvalidInput(t *testing.T) {
assert.True(t, resp.IsError)
assert.Contains(t, resp.Content, "invalid computer use input")
}
+
+func TestComputerUseTool_Run_OpenAI_BatchedActions(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ const screenshotPNG = "aW1hZ2UtZGF0YQ=="
+ actions := recordDesktopActions(t, mockConn, geometry, 16, screenshotPNG)
+
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_batch",
+ "actions":[
+ {"type":"screenshot"},
+ {"type":"move","x":10,"y":20},
+ {"type":"click","button":"left","x":30,"y":40},
+ {"type":"click","button":"right","x":31,"y":41},
+ {"type":"click","button":"middle","x":32,"y":42},
+ {"type":"double_click","x":50,"y":60},
+ {"type":"drag","path":[{"x":1,"y":2},{"x":3,"y":4},{"x":5,"y":6}]},
+ {"type":"keypress","keys":["ctrl","s"]},
+ {"type":"type","text":"hello"},
+ {"type":"scroll","x":70,"y":80,"scroll_y":500,"scroll_x":-200}
+ ]
+ }`))
+ require.NoError(t, err)
+ assert.Equal(t, "image", resp.Type)
+ assert.Equal(t, "image/png", resp.MediaType)
+ assert.False(t, resp.IsError)
+ expectedImage, err := base64.StdEncoding.DecodeString(screenshotPNG)
+ require.NoError(t, err)
+ assert.Equal(t, expectedImage, resp.Data)
+ attachments, err := chattool.AttachmentsFromMetadata(resp.Metadata)
+ require.NoError(t, err)
+ assert.Empty(t, attachments)
+
+ require.Len(t, *actions, 16)
+ for _, action := range *actions {
+ assertDesktopActionScaled(t, geometry, action)
+ }
+ assertDesktopAction(t, (*actions)[0], "mouse_move", [2]int{10, 20})
+ assertDesktopAction(t, (*actions)[1], "left_click", [2]int{30, 40})
+ assertDesktopAction(t, (*actions)[2], "right_click", [2]int{31, 41})
+ assertDesktopAction(t, (*actions)[3], "middle_click", [2]int{32, 42})
+ assertDesktopAction(t, (*actions)[4], "double_click", [2]int{50, 60})
+ assertDesktopAction(t, (*actions)[5], "mouse_move", [2]int{1, 2})
+ assert.Equal(t, "left_mouse_down", (*actions)[6].Action)
+ assert.Nil(t, (*actions)[6].Coordinate)
+ assertDesktopAction(t, (*actions)[7], "mouse_move", [2]int{3, 4})
+ assertDesktopAction(t, (*actions)[8], "mouse_move", [2]int{5, 6})
+ assert.Equal(t, "left_mouse_up", (*actions)[9].Action)
+ assert.Nil(t, (*actions)[9].Coordinate)
+ assertTextAction(t, (*actions)[10], "key", "ctrl+s")
+ assertTextAction(t, (*actions)[11], "type", "hello")
+ assertDesktopAction(t, (*actions)[12], "mouse_move", [2]int{70, 80})
+ assertScrollAction(t, (*actions)[13], [2]int{70, 80}, "down", 5)
+ assertScrollAction(t, (*actions)[14], [2]int{70, 80}, "left", 2)
+ assert.Equal(t, "screenshot", (*actions)[15].Action)
+ assert.Nil(t, (*actions)[15].Coordinate)
+}
+
+func TestComputerUseTool_Run_OpenAI_EmptyActionsCapturesScreenshotAndStoresAttachment(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ const screenshotPNG = "ZmluYWwtc2NyZWVuc2hvdA=="
+ actions := recordDesktopActions(t, mockConn, geometry, 1, screenshotPNG)
+
+ var storedName string
+ var storedData []byte
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, func(_ context.Context, name string, detectName string, data []byte) (chattool.AttachmentMetadata, error) {
+ storedName = name
+ require.Equal(t, name, detectName)
+ storedData = append([]byte(nil), data...)
+ return chattool.AttachmentMetadata{
+ FileID: uuid.MustParse("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"),
+ MediaType: "image/png",
+ Name: name,
+ }, nil
+ }, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_empty",
+ "actions":[]
+ }`))
+ require.NoError(t, err)
+ assert.Equal(t, "image", resp.Type)
+ require.Len(t, *actions, 1)
+ assert.Equal(t, "screenshot", (*actions)[0].Action)
+ assert.Contains(t, storedName, "screenshot-")
+ expectedData, err := base64.StdEncoding.DecodeString(screenshotPNG)
+ require.NoError(t, err)
+ assert.Equal(t, expectedData, storedData)
+
+ attachments, err := chattool.AttachmentsFromMetadata(resp.Metadata)
+ require.NoError(t, err)
+ require.Len(t, attachments, 1)
+ assert.Equal(t, uuid.MustParse("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"), attachments[0].FileID)
+ assert.Equal(t, "image/png", attachments[0].MediaType)
+}
+
+func TestComputerUseTool_Run_OpenAI_FinalScreenshotStoreErrorFallsBackToImage(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ const screenshotPNG = "ZmluYWwtc2NyZWVuc2hvdA=="
+ recordDesktopActions(t, mockConn, geometry, 1, screenshotPNG)
+
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, func(_ context.Context, _ string, _ string, _ []byte) (chattool.AttachmentMetadata, error) {
+ return chattool.AttachmentMetadata{}, xerrors.New("chat already has the maximum of 20 linked files")
+ }, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_store_error",
+ "actions":[{"type":"screenshot"}]
+ }`))
+ require.NoError(t, err)
+ assert.Equal(t, "image", resp.Type)
+ assert.Equal(t, "image/png", resp.MediaType)
+ assert.False(t, resp.IsError)
+ attachments, err := chattool.AttachmentsFromMetadata(resp.Metadata)
+ require.NoError(t, err)
+ assert.Empty(t, attachments)
+}
+
+func TestComputerUseTool_Run_OpenAI_DragReleaseFailureRetriesMouseUp(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+
+ gomock.InOrder(
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ assertDesktopAction(t, action, "mouse_move", [2]int{1, 2})
+ assertDesktopActionScaled(t, geometry, action)
+ return workspacesdk.DesktopActionResponse{Output: "mouse_move performed"}, nil
+ }),
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ assert.Equal(t, "left_mouse_down", action.Action)
+ assertDesktopActionScaled(t, geometry, action)
+ return workspacesdk.DesktopActionResponse{Output: "mouse_down performed"}, nil
+ }),
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ assertDesktopAction(t, action, "mouse_move", [2]int{3, 4})
+ assertDesktopActionScaled(t, geometry, action)
+ return workspacesdk.DesktopActionResponse{Output: "mouse_move performed"}, nil
+ }),
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ assert.Equal(t, "left_mouse_up", action.Action)
+ assertDesktopActionScaled(t, geometry, action)
+ return workspacesdk.DesktopActionResponse{}, xerrors.New("release failed")
+ }),
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ assert.Equal(t, "left_mouse_up", action.Action)
+ assertDesktopActionScaled(t, geometry, action)
+ return workspacesdk.DesktopActionResponse{Output: "mouse_up performed"}, nil
+ }),
+ )
+
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_release_failure",
+ "actions":[{"type":"drag","path":[{"x":1,"y":2},{"x":3,"y":4}]}]
+ }`))
+ require.NoError(t, err)
+ assert.True(t, resp.IsError)
+ assert.Contains(t, resp.Content, `action "left_mouse_up" failed`)
+}
+
+func TestComputerUseTool_Run_OpenAI_ActionFailureSkipsFinalScreenshot(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+
+ gomock.InOrder(
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ assertDesktopAction(t, action, "mouse_move", [2]int{10, 20})
+ assertDesktopActionScaled(t, geometry, action)
+ return workspacesdk.DesktopActionResponse{Output: "mouse_move performed"}, nil
+ }),
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ assertTextAction(t, action, "type", "fail")
+ assertDesktopActionScaled(t, geometry, action)
+ return workspacesdk.DesktopActionResponse{}, xerrors.New("desktop failed")
+ }),
+ )
+
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_failure",
+ "actions":[
+ {"type":"move","x":10,"y":20},
+ {"type":"type","text":"fail"}
+ ]
+ }`))
+ require.NoError(t, err)
+ assert.True(t, resp.IsError)
+ assert.Contains(t, resp.Content, `action "type" failed`)
+}
+
+func TestComputerUseTool_Run_OpenAI_UnsupportedClickButtons(t *testing.T) {
+ t.Parallel()
+
+ for _, button := range []string{"extra"} {
+ t.Run(button, func(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_unsupported_button",
+ "actions":[{"type":"click","button":"`+button+`","x":10,"y":20}]
+ }`))
+ require.NoError(t, err)
+ assert.True(t, resp.IsError)
+ assert.Contains(t, resp.Content, "unsupported OpenAI click button")
+ })
+ }
+}
+
+func TestComputerUseTool_Run_OpenAI_WheelClickIsMiddle(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ actions := recordDesktopActions(t, mockConn, geometry, 2, "d2hlZWwtY2xpY2s=")
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_wheel_click",
+ "actions":[{"type":"click","button":"wheel","x":10,"y":20}]
+ }`))
+ require.NoError(t, err)
+ assert.False(t, resp.IsError)
+ require.Len(t, *actions, 2)
+ assertDesktopAction(t, (*actions)[0], "middle_click", [2]int{10, 20})
+ assert.Equal(t, "screenshot", (*actions)[1].Action)
+}
+
+func TestComputerUseTool_Run_OpenAI_UnsupportedActionType(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_unknown_action",
+ "actions":[{"type":"hover","x":10,"y":20}]
+ }`))
+ require.NoError(t, err)
+ assert.True(t, resp.IsError)
+ assert.Contains(t, resp.Content, `unsupported OpenAI computer action type "hover"`)
+}
+
+func TestComputerUseTool_Run_OpenAI_InvalidInput(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{invalid json`))
+ require.NoError(t, err)
+ assert.True(t, resp.IsError)
+ assert.Contains(t, resp.Content, "invalid")
+}
+
+func TestComputerUseTool_Run_OpenAI_DragRequiresTwoPoints(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_short_drag",
+ "actions":[{"type":"drag","path":[{"x":10,"y":20}]}]
+ }`))
+ require.NoError(t, err)
+ assert.True(t, resp.IsError)
+ assert.Contains(t, resp.Content, "requires at least two path points")
+}
+
+func TestComputerUseTool_Run_OpenAI_KeyNormalization(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ keysJSON string
+ wantText string
+ }{
+ {name: "ctrl s", keysJSON: `["ctrl","s"]`, wantText: "ctrl+s"},
+ {name: "modifier aliases", keysJSON: `["control","shift","alt","command","A"]`, wantText: "ctrl+shift+alt+meta+a"},
+ {name: "special keys", keysJSON: `["enter","escape","tab","space","backspace","delete"]`, wantText: "Return+Escape+Tab+space+BackSpace+Delete"},
+ {name: "arrows", keysJSON: `["ArrowUp","arrowdown","left","Right"]`, wantText: "Up+Down+Left+Right"},
+ {name: "function letters digits", keysJSON: `["f1","F12","5","Z"]`, wantText: "F1+F12+5+z"},
+ {name: "minus key", keysJSON: `["-"]`, wantText: "-"},
+ {name: "equals key", keysJSON: `["="]`, wantText: "="},
+ {name: "slash key", keysJSON: `["/"]`, wantText: "/"},
+ {name: "period key", keysJSON: `["."]`, wantText: "."},
+ {name: "left bracket key", keysJSON: `["["]`, wantText: "["},
+ {name: "right bracket key", keysJSON: `["]"]`, wantText: "]"},
+ {name: "semicolon key", keysJSON: `[";"]`, wantText: ";"},
+ {name: "apostrophe key", keysJSON: `["'"]`, wantText: "'"},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ actions := recordDesktopActions(t, mockConn, geometry, 2, "a2V5LWltYWdl")
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_key",
+ "actions":[{"type":"keypress","keys":`+tt.keysJSON+`}]
+ }`))
+ require.NoError(t, err)
+ assert.False(t, resp.IsError)
+ require.Len(t, *actions, 2)
+ assertTextAction(t, (*actions)[0], "key", tt.wantText)
+ assert.Equal(t, "screenshot", (*actions)[1].Action)
+ })
+ }
+}
+
+func TestComputerUseTool_Run_OpenAI_KeyNormalizationErrors(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ keysJSON string
+ want string
+ }{
+ {name: "empty array", keysJSON: `[]`, want: "requires at least one key"},
+ {name: "empty token", keysJSON: `["ctrl",""]`, want: "contains an empty key"},
+ {name: "unsupported multi-rune", keysJSON: `["ab"]`, want: `unsupported OpenAI keypress "ab"`},
+ {name: "unsupported function key", keysJSON: `["f99"]`, want: `unsupported OpenAI keypress "f99"`},
+ {name: "unsupported named key", keysJSON: `["PageDown"]`, want: `unsupported OpenAI keypress "PageDown"`},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ t.Parallel()
+
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, quartz.NewReal())
+
+ resp, err := tool.Run(context.Background(), openAIComputerUseCall(`{
+ "call_id":"call_key_error",
+ "actions":[{"type":"keypress","keys":`+tt.keysJSON+`}]
+ }`))
+ require.NoError(t, err)
+ assert.True(t, resp.IsError)
+ assert.Contains(t, resp.Content, tt.want)
+ })
+ }
+}
+
+func TestComputerUseTool_Run_OpenAI_WaitUsesMockClock(t *testing.T) {
+ t.Parallel()
+
+ ctx := testutil.Context(t, testutil.WaitLong)
+ ctrl := gomock.NewController(t)
+ mockConn := agentconnmock.NewMockAgentConn(ctrl)
+ geometry := workspacesdk.DefaultDesktopGeometry()
+ mClock := quartz.NewMock(t)
+ const screenshotPNG = "d2FpdC1zY3JlZW5zaG90"
+
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ assert.Equal(t, "screenshot", action.Action)
+ assertDesktopActionScaled(t, geometry, action)
+ return workspacesdk.DesktopActionResponse{
+ Output: "screenshot",
+ ScreenshotData: screenshotPNG,
+ ScreenshotWidth: geometry.DeclaredWidth,
+ ScreenshotHeight: geometry.DeclaredHeight,
+ }, nil
+ }).Times(1)
+
+ trap := mClock.Trap().NewTimer("computeruse", "wait")
+ tool := newOpenAIComputerUseTool(t, geometry, mockConn, nil, mClock)
+
+ type toolResult struct {
+ resp fantasy.ToolResponse
+ err error
+ }
+ resultCh := make(chan toolResult, 1)
+ go func() {
+ resp, err := tool.Run(ctx, openAIComputerUseCall(`{
+ "call_id":"call_wait",
+ "actions":[{"type":"wait"}]
+ }`))
+ resultCh <- toolResult{resp: resp, err: err}
+ }()
+
+ trap.MustWait(ctx).MustRelease(ctx)
+ trap.Close()
+ mClock.Advance(time.Second).MustWait(ctx)
+
+ result := testutil.RequireReceive(ctx, t, resultCh)
+ require.NoError(t, result.err)
+ assert.Equal(t, "image", result.resp.Type)
+ assert.Equal(t, "image/png", result.resp.MediaType)
+ assert.False(t, result.resp.IsError)
+}
+
+func newOpenAIComputerUseTool(
+ t testing.TB,
+ geometry workspacesdk.DesktopGeometry,
+ conn workspacesdk.AgentConn,
+ storeFile chattool.StoreFileFunc,
+ clock quartz.Clock,
+) fantasy.AgentTool {
+ t.Helper()
+ return chattool.NewComputerUseTool(
+ chattool.ComputerUseProviderOpenAI,
+ geometry.DeclaredWidth,
+ geometry.DeclaredHeight,
+ func(_ context.Context) (workspacesdk.AgentConn, error) {
+ return conn, nil
+ },
+ storeFile,
+ clock,
+ slogtest.Make(t, nil),
+ )
+}
+
+func openAIComputerUseCall(input string) fantasy.ToolCall {
+ return fantasy.ToolCall{
+ ID: "openai-call",
+ Name: "computer",
+ Input: input,
+ }
+}
+
+func recordDesktopActions(
+ t testing.TB,
+ mockConn *agentconnmock.MockAgentConn,
+ geometry workspacesdk.DesktopGeometry,
+ times int,
+ screenshotPNG string,
+) *[]workspacesdk.DesktopAction {
+ t.Helper()
+ actions := make([]workspacesdk.DesktopAction, 0, times)
+ mockConn.EXPECT().ExecuteDesktopAction(
+ gomock.Any(),
+ gomock.AssignableToTypeOf(workspacesdk.DesktopAction{}),
+ ).DoAndReturn(func(_ context.Context, action workspacesdk.DesktopAction) (workspacesdk.DesktopActionResponse, error) {
+ actions = append(actions, action)
+ if action.Action == "screenshot" {
+ return workspacesdk.DesktopActionResponse{
+ Output: "screenshot",
+ ScreenshotData: screenshotPNG,
+ ScreenshotWidth: geometry.DeclaredWidth,
+ ScreenshotHeight: geometry.DeclaredHeight,
+ }, nil
+ }
+ return workspacesdk.DesktopActionResponse{Output: action.Action + " performed"}, nil
+ }).Times(times)
+ return &actions
+}
+
+func assertDesktopActionScaled(
+ t testing.TB,
+ geometry workspacesdk.DesktopGeometry,
+ action workspacesdk.DesktopAction,
+) {
+ t.Helper()
+ require.NotNil(t, action.ScaledWidth)
+ require.NotNil(t, action.ScaledHeight)
+ assert.Equal(t, geometry.DeclaredWidth, *action.ScaledWidth)
+ assert.Equal(t, geometry.DeclaredHeight, *action.ScaledHeight)
+}
+
+func assertDesktopAction(
+ t testing.TB,
+ action workspacesdk.DesktopAction,
+ actionName string,
+ coordinate [2]int,
+) {
+ t.Helper()
+ assert.Equal(t, actionName, action.Action)
+ require.NotNil(t, action.Coordinate)
+ assert.Equal(t, coordinate, *action.Coordinate)
+}
+
+func assertTextAction(
+ t testing.TB,
+ action workspacesdk.DesktopAction,
+ actionName string,
+ text string,
+) {
+ t.Helper()
+ assert.Equal(t, actionName, action.Action)
+ require.NotNil(t, action.Text)
+ assert.Equal(t, text, *action.Text)
+}
+
+func assertScrollAction(
+ t testing.TB,
+ action workspacesdk.DesktopAction,
+ coordinate [2]int,
+ direction string,
+ amount int,
+) {
+ t.Helper()
+ assertDesktopAction(t, action, "scroll", coordinate)
+ require.NotNil(t, action.ScrollDirection)
+ require.NotNil(t, action.ScrollAmount)
+ assert.Equal(t, direction, *action.ScrollDirection)
+ assert.Equal(t, amount, *action.ScrollAmount)
+}
diff --git a/coderd/x/chatd/computer_use.go b/coderd/x/chatd/computer_use.go
new file mode 100644
index 0000000000..d41214f558
--- /dev/null
+++ b/coderd/x/chatd/computer_use.go
@@ -0,0 +1,167 @@
+package chatd
+
+import (
+ "context"
+ "strings"
+
+ "charm.land/fantasy"
+ "golang.org/x/xerrors"
+
+ "cdr.dev/slog/v3"
+ "github.com/coder/coder/v2/coderd/database"
+ "github.com/coder/coder/v2/coderd/database/dbauthz"
+ "github.com/coder/coder/v2/coderd/x/chatd/chatloop"
+ openaicomputeruse "github.com/coder/coder/v2/coderd/x/chatd/chatopenai/computeruse"
+ "github.com/coder/coder/v2/coderd/x/chatd/chatprovider"
+ "github.com/coder/coder/v2/coderd/x/chatd/chattool"
+ "github.com/coder/coder/v2/codersdk/workspacesdk"
+ "github.com/coder/quartz"
+)
+
+// computerUseConfigContext lets internal and worker callers read
+// deployment-wide chat settings when they lack an HTTP-derived actor. HTTP
+// handlers always carry an actor, so the AsChatd fallback never elevates user
+// contexts and this function is a no-op in that path. The setting it gates is
+// global and readable by any authenticated actor, not a back-door.
+func computerUseConfigContext(ctx context.Context) context.Context {
+ if _, ok := dbauthz.ActorFromContext(ctx); ok {
+ return ctx
+ }
+ //nolint:gocritic // Worker contexts may lack an actor.
+ return dbauthz.AsChatd(ctx)
+}
+
+func (p *Server) computerUseProviderAndModelFromConfig(
+ ctx context.Context,
+) (provider, modelProvider, modelName string, err error) {
+ rawProvider, err := p.db.GetChatComputerUseProvider(
+ computerUseConfigContext(ctx),
+ )
+ if err != nil {
+ return "", "", "", xerrors.Errorf("get computer use provider: %w", err)
+ }
+
+ provider = strings.TrimSpace(rawProvider)
+ if provider == "" {
+ provider = chattool.ComputerUseProviderAnthropic
+ }
+
+ modelProvider, modelName, ok := chattool.DefaultComputerUseModel(provider)
+ if !ok {
+ return "", "", "", xerrors.Errorf(
+ "unknown computer-use provider %q configured in agents_computer_use_provider",
+ provider,
+ )
+ }
+
+ return provider, modelProvider, modelName, nil
+}
+
+func (p *Server) resolveComputerUseModel(
+ ctx context.Context,
+ chat database.Chat,
+ providerKeys chatprovider.ProviderAPIKeys,
+ computerUseProvider string,
+ computerUseModelProvider string,
+ computerUseModelName string,
+) (
+ model fantasy.LanguageModel,
+ debugEnabled bool,
+ resolvedProvider string,
+ resolvedModel string,
+ err error,
+) {
+ resolvedProvider, resolvedModel, err = chatprovider.ResolveModelWithProviderHint(
+ computerUseModelName,
+ computerUseModelProvider,
+ )
+ if err != nil {
+ return nil, false, "", "", xerrors.Errorf(
+ "resolve computer use model metadata for provider %q model %q: %w",
+ computerUseProvider,
+ computerUseModelName,
+ err,
+ )
+ }
+
+ model, debugEnabled, err = p.newDebugAwareModelFromConfig(
+ ctx,
+ chat,
+ computerUseModelProvider,
+ computerUseModelName,
+ providerKeys,
+ chatprovider.UserAgent(),
+ chatprovider.CoderHeaders(chat),
+ )
+ if err != nil {
+ return nil, false, "", "", xerrors.Errorf(
+ "resolve computer use model for provider %q model %q: %w",
+ computerUseProvider,
+ computerUseModelName,
+ err,
+ )
+ }
+
+ return model, debugEnabled, resolvedProvider, resolvedModel, nil
+}
+
+type computerUseProviderToolOptions struct {
+ provider string
+ isPlanModeTurn bool
+ isComputerUse bool
+ getWorkspaceConn func(context.Context) (workspacesdk.AgentConn, error)
+ storeFile chattool.StoreFileFunc
+ clock quartz.Clock
+ logger slog.Logger
+}
+
+func appendComputerUseProviderTool(
+ providerTools []chatloop.ProviderTool,
+ opts computerUseProviderToolOptions,
+) ([]chatloop.ProviderTool, error) {
+ // This helper is called for every chat turn. Only chats created by the
+ // computer_use subagent definition have ChatModeComputerUse, which filters
+ // out root, general, and explore chats. Plan mode is separate from Mode, so
+ // planning turns stay gated even for computer-use chats.
+ if opts.isPlanModeTurn || !opts.isComputerUse {
+ return providerTools, nil
+ }
+
+ desktopGeometry := chattool.DefaultComputerUseDesktopGeometry(opts.provider)
+ definition, err := chattool.ComputerUseProviderTool(
+ opts.provider,
+ desktopGeometry.DeclaredWidth,
+ desktopGeometry.DeclaredHeight,
+ )
+ if err != nil {
+ return providerTools, xerrors.Errorf(
+ "build computer use provider tool for provider %q: %w",
+ opts.provider,
+ err,
+ )
+ }
+
+ clock := opts.clock
+ if clock == nil {
+ clock = quartz.NewReal()
+ }
+ providerTool := chatloop.ProviderTool{
+ Definition: definition,
+ Runner: chattool.NewComputerUseTool(
+ opts.provider,
+ desktopGeometry.DeclaredWidth,
+ desktopGeometry.DeclaredHeight,
+ opts.getWorkspaceConn,
+ opts.storeFile,
+ clock,
+ opts.logger,
+ ),
+ }
+ if opts.provider == chattool.ComputerUseProviderOpenAI {
+ // OpenAI computer-use image results need detail metadata so the model receives
+ // the screenshot at original detail when the chat loop sends the tool result.
+ providerTool.ResultProviderMetadata = openaicomputeruse.ResultProviderMetadata
+ }
+
+ return append(providerTools, providerTool), nil
+}
diff --git a/coderd/x/chatd/subagent.go b/coderd/x/chatd/subagent.go
index ccf0d01446..4f1207bda5 100644
--- a/coderd/x/chatd/subagent.go
+++ b/coderd/x/chatd/subagent.go
@@ -77,22 +77,28 @@ type closeAgentArgs struct {
ChatID string `json:"chat_id"`
}
-// isAnthropicConfigured reports whether an Anthropic API key is
-// available, either from static provider keys or from the database.
-func (p *Server) isAnthropicConfigured(ctx context.Context) bool {
- if p.providerAPIKeys.APIKey("anthropic") != "" {
- return true
+// providerConfigured reports whether a provider has an API key from
+// static configuration or from the database provider configuration.
+func (p *Server) providerConfigured(ctx context.Context, provider string) (bool, error) {
+ normalizedProvider := chatprovider.NormalizeProvider(provider)
+ if normalizedProvider == "" {
+ return false, nil
}
+ if p.providerAPIKeys.APIKey(normalizedProvider) != "" {
+ return true, nil
+ }
+
dbProviders, err := p.configCache.EnabledProviders(ctx)
if err != nil {
- return false
+ return false, xerrors.Errorf("list enabled chat providers: %w", err)
}
for _, prov := range dbProviders {
- if chatprovider.NormalizeProvider(prov.Provider) == "anthropic" && strings.TrimSpace(prov.APIKey) != "" {
- return true
+ if chatprovider.NormalizeProvider(prov.Provider) == normalizedProvider &&
+ strings.TrimSpace(prov.APIKey) != "" {
+ return true, nil
}
}
- return false
+ return false, nil
}
func (p *Server) isDesktopEnabled(ctx context.Context) bool {
diff --git a/coderd/x/chatd/subagent_catalog.go b/coderd/x/chatd/subagent_catalog.go
index eea0004475..895bd8ec16 100644
--- a/coderd/x/chatd/subagent_catalog.go
+++ b/coderd/x/chatd/subagent_catalog.go
@@ -8,6 +8,7 @@ import (
"github.com/google/uuid"
"golang.org/x/xerrors"
+ "cdr.dev/slog/v3"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/codersdk"
)
@@ -103,12 +104,34 @@ func allSubagentDefinitions() []subagentDefinition {
if currentChat.PlanMode.Valid && currentChat.PlanMode.ChatPlanMode == database.ChatPlanModePlan {
return `type "computer_use" is unavailable in plan mode`
}
- if !p.isAnthropicConfigured(ctx) || !p.isDesktopEnabled(ctx) {
- return `type "computer_use" is unavailable because computer use is not configured`
+ if !p.isDesktopEnabled(ctx) {
+ return `type "computer_use" is unavailable because desktop access is not enabled`
+ }
+ _, _, _, err := p.computerUseProviderAndModelFromConfig(ctx)
+ if err != nil {
+ p.logger.Warn(ctx, "computer-use provider config is unavailable",
+ slog.F("chat_id", currentChat.ID),
+ slog.Error(err),
+ )
+ return `type "computer_use" is unavailable because its provider configuration could not be loaded`
}
return ""
},
- buildOptions: func(_ context.Context, _ *Server, _ database.Chat, _ database.Chat, _ uuid.UUID, prompt string) (childSubagentChatOptions, error) {
+ buildOptions: func(ctx context.Context, p *Server, _ database.Chat, _ database.Chat, _ uuid.UUID, prompt string) (childSubagentChatOptions, error) {
+ provider, _, _, err := p.computerUseProviderAndModelFromConfig(ctx)
+ if err != nil {
+ return childSubagentChatOptions{}, err
+ }
+ configured, err := p.providerConfigured(ctx, provider)
+ if err != nil {
+ return childSubagentChatOptions{}, err
+ }
+ if !configured {
+ return childSubagentChatOptions{}, xerrors.Errorf(
+ `API key for computer-use provider %q is not configured`,
+ provider,
+ )
+ }
return childSubagentChatOptions{
chatMode: database.NullChatMode{
ChatMode: database.ChatModeComputerUse,
diff --git a/coderd/x/chatd/subagent_internal_test.go b/coderd/x/chatd/subagent_internal_test.go
index 3827a894e4..5df6069209 100644
--- a/coderd/x/chatd/subagent_internal_test.go
+++ b/coderd/x/chatd/subagent_internal_test.go
@@ -1325,7 +1325,7 @@ func TestSpawnAgent_DescriptionListsAllAvailableTypes(t *testing.T) {
require.Contains(t, description, subagentTypeComputerUse)
}
-func TestSpawnAgent_DescriptionOmitsComputerUseWhenUnavailable(t *testing.T) {
+func TestSpawnAgent_DescriptionIncludesComputerUseWithMissingProviderKey(t *testing.T) {
t.Parallel()
db, ps := dbtestutil.NewDB(t)
@@ -1335,7 +1335,7 @@ func TestSpawnAgent_DescriptionOmitsComputerUseWhenUnavailable(t *testing.T) {
ctx := chatdTestContext(t)
user, org, model := seedInternalChatDeps(t, db)
parentChat := createInternalParentChat(
- ctx, t, server, db, org.ID, user.ID, model.ID, "parent-description-unavailable",
+ ctx, t, server, db, org.ID, user.ID, model.ID, "parent-description-missing-key",
)
tools := server.subagentTools(ctx, func() database.Chat { return parentChat }, parentChat.LastModelConfigID)
@@ -1344,7 +1344,7 @@ func TestSpawnAgent_DescriptionOmitsComputerUseWhenUnavailable(t *testing.T) {
description := tool.Info().Description
require.Contains(t, description, subagentTypeGeneral)
require.Contains(t, description, subagentTypeExplore)
- require.NotContains(t, description, subagentTypeComputerUse)
+ require.Contains(t, description, subagentTypeComputerUse)
}
func TestSpawnAgent_PlanModeDescriptionOmitsComputerUse(t *testing.T) {
@@ -1429,7 +1429,7 @@ func TestPlanningOverlaySubagentGuidance_UsesPlanModeSafeDescriptions(t *testing
require.NotContains(t, guidance, "may inspect or modify workspace files")
}
-func TestSpawnAgent_InvalidTypeAndUnavailableTypeAreDistinct(t *testing.T) {
+func TestSpawnAgent_InvalidTypeAndCredentialErrorAreDistinct(t *testing.T) {
t.Parallel()
db, ps := dbtestutil.NewDB(t)
@@ -1452,9 +1452,9 @@ func TestSpawnAgent_InvalidTypeAndUnavailableTypeAreDistinct(t *testing.T) {
spawnAgentArgs{Type: "invalid", Prompt: "delegate work"},
)
require.True(t, invalidResp.IsError)
- require.Contains(t, invalidResp.Content, "type must be one of: general, explore")
+ require.Contains(t, invalidResp.Content, "type must be one of: general, explore, computer_use")
- unavailableResp := runSubagentTool(
+ credentialResp := runSubagentTool(
ctx,
t,
server,
@@ -1463,8 +1463,140 @@ func TestSpawnAgent_InvalidTypeAndUnavailableTypeAreDistinct(t *testing.T) {
spawnAgentToolName,
spawnAgentArgs{Type: subagentTypeComputerUse, Prompt: "open browser"},
)
- require.True(t, unavailableResp.IsError)
- require.Contains(t, unavailableResp.Content, `type "computer_use" is unavailable because computer use is not configured`)
+ require.True(t, credentialResp.IsError)
+ require.Contains(t, credentialResp.Content, "API key")
+ require.Contains(t, credentialResp.Content, "computer-use")
+ require.Contains(t, credentialResp.Content, "anthropic")
+}
+
+func TestSpawnAgent_ComputerUseAvailabilityUsesConfiguredProvider(t *testing.T) {
+ t.Parallel()
+
+ db, ps := dbtestutil.NewDB(t)
+ ctx := chatdTestContext(t)
+ require.NoError(t, db.UpsertChatDesktopEnabled(ctx, true))
+ require.NoError(t, db.UpsertChatComputerUseProvider(
+ ctx,
+ chattool.ComputerUseProviderOpenAI,
+ ))
+ server := newInternalTestServer(t, db, ps, chatprovider.ProviderAPIKeys{})
+
+ user, org, model := seedInternalChatDeps(t, db)
+ parentChat := createInternalParentChat(
+ ctx, t, server, db, org.ID, user.ID, model.ID, "parent-openai-computer-use",
+ )
+
+ ids := availableSubagentTypeIDs(ctx, server, parentChat)
+ require.Contains(t, ids, subagentTypeComputerUse)
+}
+
+func TestSpawnAgent_ComputerUseRejectsMissingConfiguredProvider(t *testing.T) {
+ t.Parallel()
+
+ db, ps := dbtestutil.NewDB(t)
+ ctx := chatdTestContext(t)
+ require.NoError(t, db.UpsertChatDesktopEnabled(ctx, true))
+ require.NoError(t, db.UpsertChatComputerUseProvider(
+ ctx,
+ chattool.ComputerUseProviderOpenAI,
+ ))
+ server := newInternalTestServer(t, db, ps, chatprovider.ProviderAPIKeys{})
+
+ user := dbgen.User(t, db, database.User{})
+ org := dbgen.Organization(t, db, database.Organization{})
+ dbgen.OrganizationMember(t, db, database.OrganizationMember{
+ UserID: user.ID,
+ OrganizationID: org.ID,
+ })
+ model := insertInternalChatModelConfigForProvider(
+ t,
+ db,
+ chattool.ComputerUseProviderOpenAI,
+ "gpt-4o-mini",
+ true,
+ )
+ parentChat := createInternalParentChat(
+ ctx, t, server, db, org.ID, user.ID, model.ID, "parent-openai-missing",
+ )
+
+ ids := availableSubagentTypeIDs(ctx, server, parentChat)
+ require.Contains(t, ids, subagentTypeComputerUse)
+ beforeChats, err := db.GetChats(ctx, database.GetChatsParams{
+ OwnerID: user.ID,
+ AfterID: uuid.Nil,
+ OffsetOpt: 0,
+ LimitOpt: 100,
+ })
+ require.NoError(t, err)
+
+ resp := runSpawnAgentTool(ctx, t, server, parentChat, spawnAgentArgs{
+ Type: subagentTypeComputerUse,
+ Prompt: "open the browser",
+ })
+ require.True(t, resp.IsError)
+ require.Contains(t, resp.Content, "API key")
+ require.Contains(t, resp.Content, "computer-use")
+ require.Contains(t, resp.Content, "openai")
+ afterChats, err := db.GetChats(ctx, database.GetChatsParams{
+ OwnerID: user.ID,
+ AfterID: uuid.Nil,
+ OffsetOpt: 0,
+ LimitOpt: 100,
+ })
+ require.NoError(t, err)
+ require.Len(t, afterChats, len(beforeChats))
+}
+
+func TestSpawnAgent_ComputerUseRejectsInvalidConfiguredProviderWithStableReason(t *testing.T) {
+ t.Parallel()
+
+ db, ps := dbtestutil.NewDB(t)
+ ctx := chatdTestContext(t)
+ require.NoError(t, db.UpsertChatDesktopEnabled(ctx, true))
+ require.NoError(t, db.UpsertChatComputerUseProvider(ctx, "bogus"))
+ logSink := &subagentTestLogSink{}
+ logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).AppendSinks(logSink)
+ server := newInternalTestServerWithLogger(t, db, ps, chatprovider.ProviderAPIKeys{}, logger)
+
+ user, org, model := seedInternalChatDeps(t, db)
+ parentChat := createInternalParentChat(
+ ctx, t, server, db, org.ID, user.ID, model.ID, "parent-invalid-computer-use-provider",
+ )
+
+ resp := runSpawnAgentTool(ctx, t, server, parentChat, spawnAgentArgs{
+ Type: subagentTypeComputerUse,
+ Prompt: "open the browser",
+ })
+ require.True(t, resp.IsError)
+ require.Contains(t, resp.Content, `type "computer_use" is unavailable because its provider configuration could not be loaded`)
+ require.NotContains(t, resp.Content, "bogus")
+ require.NotContains(t, resp.Content, "agents_computer_use_provider")
+ require.NotEmpty(t, logSink.entriesAtLevelWithMessage(
+ slog.LevelWarn,
+ "computer-use provider config is unavailable",
+ ))
+}
+
+func TestSpawnAgent_ComputerUseRejectsDesktopDisabled(t *testing.T) {
+ t.Parallel()
+
+ db, ps := dbtestutil.NewDB(t)
+ server := newInternalTestServer(t, db, ps, chatprovider.ProviderAPIKeys{
+ Anthropic: "test-anthropic-key",
+ })
+
+ ctx := chatdTestContext(t)
+ user, org, model := seedInternalChatDeps(t, db)
+ parentChat := createInternalParentChat(
+ ctx, t, server, db, org.ID, user.ID, model.ID, "parent-desktop-disabled",
+ )
+
+ resp := runSpawnAgentTool(ctx, t, server, parentChat, spawnAgentArgs{
+ Type: subagentTypeComputerUse,
+ Prompt: "open the browser",
+ })
+ require.True(t, resp.IsError)
+ require.Contains(t, resp.Content, `type "computer_use" is unavailable because desktop access is not enabled`)
}
func TestSpawnAgent_BlankTypeReturnsValidOptions(t *testing.T) {
@@ -1779,10 +1911,12 @@ func TestSpawnAgent_ComputerUseUsesComputerUseModelNotParent(t *testing.T) {
require.Equal(t, parentChat.AgentID, childChat.AgentID)
require.True(t, childChat.Mode.Valid)
assert.Equal(t, database.ChatModeComputerUse, childChat.Mode.ChatMode)
- assert.NotEqual(t, model.Provider, chattool.ComputerUseModelProvider,
+ computerUseModelProvider, computerUseModelName, ok := chattool.DefaultComputerUseModel(chattool.ComputerUseProviderAnthropic)
+ require.True(t, ok)
+ assert.NotEqual(t, model.Provider, computerUseModelProvider,
"computer use model provider must differ from parent model provider")
- assert.Equal(t, "anthropic", chattool.ComputerUseModelProvider)
- assert.NotEmpty(t, chattool.ComputerUseModelName)
+ assert.Equal(t, "anthropic", computerUseModelProvider)
+ assert.NotEmpty(t, computerUseModelName)
}
func TestSpawnAgent_ComputerUseInheritsMCPServerIDs(t *testing.T) {
diff --git a/codersdk/chats.go b/codersdk/chats.go
index 1d8682b733..257fc6f9ab 100644
--- a/codersdk/chats.go
+++ b/codersdk/chats.go
@@ -672,6 +672,18 @@ type AdvisorConfig struct {
// the request and response shapes are currently identical.
type UpdateAdvisorConfigRequest = AdvisorConfig
+// ChatComputerUseProviderResponse is the response for getting the computer use
+// provider setting.
+type ChatComputerUseProviderResponse struct {
+ Provider string `json:"provider"`
+}
+
+// UpdateChatComputerUseProviderRequest is the request to update the computer use
+// provider setting.
+type UpdateChatComputerUseProviderRequest struct {
+ Provider string `json:"provider"`
+}
+
// ChatDebugLoggingAdminSettings describes the runtime admin setting
// that allows users to opt into chat debug logging.
type ChatDebugLoggingAdminSettings struct {
@@ -2206,6 +2218,34 @@ func (c *ExperimentalClient) UpdateChatAdvisorConfig(ctx context.Context, req Up
return nil
}
+// GetChatComputerUseProvider returns the deployment-wide computer use provider.
+func (c *ExperimentalClient) GetChatComputerUseProvider(ctx context.Context) (ChatComputerUseProviderResponse, error) {
+ res, err := c.Request(ctx, http.MethodGet, "/api/experimental/chats/config/computer-use-provider", nil)
+ if err != nil {
+ return ChatComputerUseProviderResponse{}, err
+ }
+ defer res.Body.Close()
+ if res.StatusCode != http.StatusOK {
+ return ChatComputerUseProviderResponse{}, ReadBodyAsError(res)
+ }
+ var resp ChatComputerUseProviderResponse
+ return resp, json.NewDecoder(res.Body).Decode(&resp)
+}
+
+// UpdateChatComputerUseProvider updates the deployment-wide computer use
+// provider.
+func (c *ExperimentalClient) UpdateChatComputerUseProvider(ctx context.Context, req UpdateChatComputerUseProviderRequest) error {
+ res, err := c.Request(ctx, http.MethodPut, "/api/experimental/chats/config/computer-use-provider", req)
+ if err != nil {
+ return err
+ }
+ defer res.Body.Close()
+ if res.StatusCode != http.StatusNoContent {
+ return ReadBodyAsError(res)
+ }
+ return nil
+}
+
// GetChatWorkspaceTTL returns the configured chat workspace TTL.
func (c *ExperimentalClient) GetChatWorkspaceTTL(ctx context.Context) (ChatWorkspaceTTLResponse, error) {
res, err := c.Request(ctx, http.MethodGet, "/api/experimental/chats/config/workspace-ttl", nil)
diff --git a/codersdk/workspacesdk/display.go b/codersdk/workspacesdk/display.go
index a3e4e4d040..7f180b4fee 100644
--- a/codersdk/workspacesdk/display.go
+++ b/codersdk/workspacesdk/display.go
@@ -12,6 +12,11 @@ const (
desktopDeclaredMaxLongEdge = 1568
desktopDeclaredMaxTotalPixels = 1_150_000
+
+ // OpenAI recommends 1440x900 or 1600x900 for computer use.
+ // Use 1600x900 so screenshots keep the native 16:9 aspect ratio.
+ desktopOpenAIComputerUseDeclaredWidth = 1600
+ desktopOpenAIComputerUseDeclaredHeight = 900
)
var preferredDeclaredDesktopWidths = []int{1280, 1024}
@@ -31,6 +36,17 @@ func DefaultDesktopGeometry() DesktopGeometry {
return NewDesktopGeometry(DesktopNativeWidth, DesktopNativeHeight)
}
+// DefaultOpenAIComputerUseDesktopGeometry returns the default native desktop
+// geometry with OpenAI's recommended computer-use declared dimensions.
+func DefaultOpenAIComputerUseDesktopGeometry() DesktopGeometry {
+ return NewDesktopGeometryWithDeclared(
+ DesktopNativeWidth,
+ DesktopNativeHeight,
+ desktopOpenAIComputerUseDeclaredWidth,
+ desktopOpenAIComputerUseDeclaredHeight,
+ )
+}
+
// NewDesktopGeometry derives a declared model-facing geometry from the native
// desktop size.
func NewDesktopGeometry(nativeWidth, nativeHeight int) DesktopGeometry {
diff --git a/codersdk/workspacesdk/display_test.go b/codersdk/workspacesdk/display_test.go
index 6ff5606c86..69dae9f0cb 100644
--- a/codersdk/workspacesdk/display_test.go
+++ b/codersdk/workspacesdk/display_test.go
@@ -99,6 +99,19 @@ func TestDefaultDesktopGeometry(t *testing.T) {
assert.Equal(t, 720, geometry.DeclaredHeight)
}
+// TestDefaultOpenAIComputerUseDesktopGeometry pins the model-facing coordinate
+// system for OpenAI computer use so future geometry changes are intentional.
+func TestDefaultOpenAIComputerUseDesktopGeometry(t *testing.T) {
+ t.Parallel()
+
+ geometry := workspacesdk.DefaultOpenAIComputerUseDesktopGeometry()
+
+ assert.Equal(t, 1920, geometry.NativeWidth)
+ assert.Equal(t, 1080, geometry.NativeHeight)
+ assert.Equal(t, 1600, geometry.DeclaredWidth)
+ assert.Equal(t, 900, geometry.DeclaredHeight)
+}
+
func TestDesktopGeometryDeclaredPointToNative(t *testing.T) {
t.Parallel()
diff --git a/go.mod b/go.mod
index 7dcd74dbf2..9d16338994 100644
--- a/go.mod
+++ b/go.mod
@@ -87,7 +87,7 @@ replace github.com/spf13/afero => github.com/aslilac/afero v0.0.0-20250403163713
// replay stored reasoning item references, only replay web_search references
// when paired with reasoning, and validate function_call output pairing.
// See: https://github.com/coder/fantasy/commits/f83367a4a205
-replace charm.land/fantasy => github.com/coder/fantasy v0.0.0-20260426185602-951a49c681df
+replace charm.land/fantasy => github.com/coder/fantasy v0.0.0-20260427164812-d0e6ce2243af
// coder/coder uses a fork of charmbracelet's fork of the Anthropic Go SDK
// with performance improvements and Bedrock header cleanup.
diff --git a/go.sum b/go.sum
index 4037a81e94..d1c51d3d9f 100644
--- a/go.sum
+++ b/go.sum
@@ -322,8 +322,8 @@ github.com/coder/bubbletea v1.2.2-0.20241212190825-007a1cdb2c41 h1:SBN/DA63+ZHwu
github.com/coder/bubbletea v1.2.2-0.20241212190825-007a1cdb2c41/go.mod h1:I9ULxr64UaOSUv7hcb3nX4kowodJCVS7vt7VVJk/kW4=
github.com/coder/clistat v1.2.1 h1:P9/10njXMyj5cWzIU5wkRsSy5LVQH49+tcGMsAgWX0w=
github.com/coder/clistat v1.2.1/go.mod h1:m7SC0uj88eEERgvF8Kn6+w6XF21BeSr+15f7GoLAw0A=
-github.com/coder/fantasy v0.0.0-20260426185602-951a49c681df h1:Xog/dBDcnXxr98lGZqRxOeFrCrhVZUBrFldtXH7v0EY=
-github.com/coder/fantasy v0.0.0-20260426185602-951a49c681df/go.mod h1:wZ0e3lEPqrM0XiIdAUQLvMKCLYhc3gi96MRX2wjbX44=
+github.com/coder/fantasy v0.0.0-20260427164812-d0e6ce2243af h1:5X38dLzIc5FSgVm9EuKkuKgtXt4fNV5iSCraxfgQXns=
+github.com/coder/fantasy v0.0.0-20260427164812-d0e6ce2243af/go.mod h1:wZ0e3lEPqrM0XiIdAUQLvMKCLYhc3gi96MRX2wjbX44=
github.com/coder/flog v1.1.0 h1:kbAes1ai8fIS5OeV+QAnKBQE22ty1jRF/mcAwHpLBa4=
github.com/coder/flog v1.1.0/go.mod h1:UQlQvrkJBvnRGo69Le8E24Tcl5SJleAAR7gYEHzAmdQ=
github.com/coder/go-httpstat v0.0.0-20230801153223-321c88088322 h1:m0lPZjlQ7vdVpRBPKfYIFlmgevoTkBxB10wv6l2gOaU=
diff --git a/site/src/api/api.ts b/site/src/api/api.ts
index 58b29eb768..48b1f7910d 100644
--- a/site/src/api/api.ts
+++ b/site/src/api/api.ts
@@ -3360,6 +3360,24 @@ class ExperimentalApiMethods {
await this.axios.put("/api/experimental/chats/config/advisor", req);
};
+ getChatComputerUseProvider =
+ async (): Promise
diff --git a/site/src/pages/AgentsPage/components/VirtualDesktopSettings.tsx b/site/src/pages/AgentsPage/components/VirtualDesktopSettings.tsx
index 999498831f..69d65914b3 100644
--- a/site/src/pages/AgentsPage/components/VirtualDesktopSettings.tsx
+++ b/site/src/pages/AgentsPage/components/VirtualDesktopSettings.tsx
@@ -3,6 +3,15 @@ import type { FC } from "react";
import type * as TypesGen from "#/api/typesGenerated";
import { Badge } from "#/components/Badge/Badge";
import { Link } from "#/components/Link/Link";
+import {
+ Select,
+ SelectContent,
+ SelectGroup,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from "#/components/Select/Select";
+import { Skeleton } from "#/components/Skeleton/Skeleton";
import { Switch } from "#/components/Switch/Switch";
interface MutationCallbacks {
@@ -12,21 +21,57 @@ interface MutationCallbacks {
interface VirtualDesktopSettingsProps {
desktopEnabledData: TypesGen.ChatDesktopEnabledResponse | undefined;
+ isLoadingDesktopEnabled: boolean;
onSaveDesktopEnabled: (
req: TypesGen.UpdateChatDesktopEnabledRequest,
options?: MutationCallbacks,
) => void;
isSavingDesktopEnabled: boolean;
isSaveDesktopEnabledError: boolean;
+ computerUseProviderData: TypesGen.ChatComputerUseProviderResponse | undefined;
+ isLoadingComputerUseProvider: boolean;
+ onSaveComputerUseProvider: (
+ req: TypesGen.UpdateChatComputerUseProviderRequest,
+ options?: MutationCallbacks,
+ ) => void;
+ isSavingComputerUseProvider: boolean;
+ computerUseProviderSaveError: Error | null;
}
+const computerUseProviderOptions = [
+ { label: "Anthropic", value: "anthropic" },
+ { label: "OpenAI", value: "openai" },
+] as const;
+
+const getComputerUseProviderLabel = (provider: string) => {
+ return (
+ computerUseProviderOptions.find((option) => option.value === provider)
+ ?.label ?? provider
+ );
+};
+
export const VirtualDesktopSettings: FC
@@ -60,15 +111,65 @@ export const VirtualDesktopSettings: FC
+ Select the provider agents use for computer-use actions when virtual
+ desktop is enabled.
+
Failed to save desktop setting.
+ Failed to save computer use provider.
+
+ Computer use provider
+
+