Files
coder/agent/x/agentdesktop/api.go
T
Ethan 3a9080fff6 feat: tag chat-originating agent logs with chat_id (#25019)
Workspace-agent logs emitted while serving chatd-driven requests were
not correlated with the originating chat, making agent logs hard to
attribute to the corresponding/originating chat.

This adds agent-side chat context middleware that parses `Coder-Chat-Id`
once, enriches agent access logs and structured handler/background logs,
and adds a chatd bridge log when chat headers are attached to an agent
connection.

Closes CODAGT-324
2026-05-08 13:25:30 +10:00

771 lines
22 KiB
Go

package agentdesktop
import (
"context"
"encoding/json"
"errors"
"io"
"mime/multipart"
"net/http"
"net/textproto"
"strconv"
"sync"
"time"
"github.com/go-chi/chi/v5"
"github.com/google/uuid"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/agent/agentchat"
"github.com/coder/coder/v2/agent/agentssh"
"github.com/coder/coder/v2/coderd/httpapi"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/quartz"
"github.com/coder/websocket"
)
// DesktopAction is the request body for the desktop action endpoint.
type DesktopAction struct {
Action string `json:"action"`
Coordinate *[2]int `json:"coordinate,omitempty"`
StartCoordinate *[2]int `json:"start_coordinate,omitempty"`
Text *string `json:"text,omitempty"`
Duration *int `json:"duration,omitempty"`
ScrollAmount *int `json:"scroll_amount,omitempty"`
ScrollDirection *string `json:"scroll_direction,omitempty"`
// ScaledWidth and ScaledHeight describe the declared model-facing desktop
// geometry. When provided, input coordinates are mapped from declared space
// to native desktop pixels before dispatching.
ScaledWidth *int `json:"scaled_width,omitempty"`
ScaledHeight *int `json:"scaled_height,omitempty"`
}
// DesktopActionResponse is the response from the desktop action
// endpoint.
type DesktopActionResponse struct {
Output string `json:"output,omitempty"`
ScreenshotData string `json:"screenshot_data,omitempty"`
ScreenshotWidth int `json:"screenshot_width,omitempty"`
ScreenshotHeight int `json:"screenshot_height,omitempty"`
}
// API exposes the desktop streaming HTTP routes for the agent.
type API struct {
logger slog.Logger
desktop Desktop
clock quartz.Clock
closeMu sync.Mutex
closed bool
}
// NewAPI creates a new desktop streaming API.
func NewAPI(logger slog.Logger, desktop Desktop, clock quartz.Clock) *API {
if clock == nil {
clock = quartz.NewReal()
}
return &API{
logger: logger,
desktop: desktop,
clock: clock,
}
}
// Routes returns the chi router for mounting at /api/v0/desktop.
func (a *API) Routes() http.Handler {
r := chi.NewRouter()
r.Get("/vnc", a.handleDesktopVNC)
r.Post("/action", a.handleAction)
r.Route("/recording", func(r chi.Router) {
r.Post("/start", a.handleRecordingStart)
r.Post("/stop", a.handleRecordingStop)
})
return r
}
func (a *API) handleDesktopVNC(rw http.ResponseWriter, r *http.Request) {
ctx := r.Context()
logger := a.logger.With(agentchat.Fields(ctx)...)
// Start the desktop session (idempotent).
_, err := a.desktop.Start(ctx)
if err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Failed to start desktop session.",
Detail: err.Error(),
})
return
}
// Get a VNC connection.
vncConn, err := a.desktop.VNCConn(ctx)
if err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Failed to connect to VNC server.",
Detail: err.Error(),
})
return
}
defer vncConn.Close()
// Accept WebSocket from coderd.
conn, err := websocket.Accept(rw, r, &websocket.AcceptOptions{
CompressionMode: websocket.CompressionDisabled,
})
if err != nil {
logger.Error(ctx, "failed to accept websocket", slog.Error(err))
return
}
// No read limit — RFB framebuffer updates can be large.
conn.SetReadLimit(-1)
wsCtx, wsNetConn := codersdk.WebsocketNetConn(ctx, conn, websocket.MessageBinary)
defer wsNetConn.Close()
// Bicopy raw bytes between WebSocket and VNC TCP.
agentssh.Bicopy(wsCtx, wsNetConn, vncConn)
}
func (a *API) handleAction(rw http.ResponseWriter, r *http.Request) {
ctx := r.Context()
logger := a.logger.With(agentchat.Fields(ctx)...)
handlerStart := a.clock.Now()
// Update last desktop action timestamp for idle recording monitor.
a.desktop.RecordActivity()
// Ensure the desktop is running and grab native dimensions.
cfg, err := a.desktop.Start(ctx)
if err != nil {
logger.Warn(ctx, "handleAction: desktop.Start failed",
slog.Error(err),
slog.F("elapsed_ms", a.clock.Since(handlerStart).Milliseconds()),
)
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Failed to start desktop session.",
Detail: err.Error(),
})
return
}
var action DesktopAction
if err := json.NewDecoder(r.Body).Decode(&action); err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Failed to decode request body.",
Detail: err.Error(),
})
return
}
logger.Info(ctx, "handleAction: started",
slog.F("action", action.Action),
slog.F("elapsed_ms", a.clock.Since(handlerStart).Milliseconds()),
)
geometry := desktopGeometryForAction(cfg, action)
scaleXY := geometry.DeclaredPointToNative
var resp DesktopActionResponse
switch action.Action {
case "key":
if action.Text == nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Missing \"text\" for key action.",
})
return
}
if err := a.desktop.KeyPress(ctx, *action.Text); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Key press failed.",
Detail: err.Error(),
})
return
}
resp.Output = "key action performed"
case "key_down":
if action.Text == nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Missing \"text\" for key_down action.",
})
return
}
if err := a.desktop.KeyDown(ctx, *action.Text); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Key down failed.",
Detail: err.Error(),
})
return
}
resp.Output = "key_down action performed"
case "key_up":
if action.Text == nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Missing \"text\" for key_up action.",
})
return
}
if err := a.desktop.KeyUp(ctx, *action.Text); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Key up failed.",
Detail: err.Error(),
})
return
}
resp.Output = "key_up action performed"
case "type":
if action.Text == nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Missing \"text\" for type action.",
})
return
}
if err := a.desktop.Type(ctx, *action.Text); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Type action failed.",
Detail: err.Error(),
})
return
}
resp.Output = "type action performed"
case "cursor_position":
nativeX, nativeY, err := a.desktop.CursorPosition(ctx)
if err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Cursor position failed.",
Detail: err.Error(),
})
return
}
x, y := geometry.NativePointToDeclared(nativeX, nativeY)
resp.Output = "x=" + strconv.Itoa(x) + ",y=" + strconv.Itoa(y)
case "mouse_move":
x, y, err := coordFromAction(action)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: err.Error(),
})
return
}
x, y = scaleXY(x, y)
if err := a.desktop.Move(ctx, x, y); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Mouse move failed.",
Detail: err.Error(),
})
return
}
resp.Output = "mouse_move action performed"
case "left_click":
x, y, err := coordFromAction(action)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: err.Error(),
})
return
}
x, y = scaleXY(x, y)
stepStart := a.clock.Now()
if err := a.desktop.Click(ctx, x, y, MouseButtonLeft); err != nil {
logger.Warn(ctx, "handleAction: Click failed",
slog.F("action", "left_click"),
slog.F("step", "click"),
slog.F("step_ms", time.Since(stepStart).Milliseconds()),
slog.F("elapsed_ms", a.clock.Since(handlerStart).Milliseconds()),
slog.Error(err),
)
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Left click failed.",
Detail: err.Error(),
})
return
}
logger.Debug(ctx, "handleAction: Click completed",
slog.F("action", "left_click"),
slog.F("step_ms", time.Since(stepStart).Milliseconds()),
slog.F("elapsed_ms", a.clock.Since(handlerStart).Milliseconds()),
)
resp.Output = "left_click action performed"
case "left_click_drag":
if action.Coordinate == nil || action.StartCoordinate == nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Missing \"coordinate\" or \"start_coordinate\" for left_click_drag.",
})
return
}
sx, sy := scaleXY(action.StartCoordinate[0], action.StartCoordinate[1])
ex, ey := scaleXY(action.Coordinate[0], action.Coordinate[1])
if err := a.desktop.Drag(ctx, sx, sy, ex, ey); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Left click drag failed.",
Detail: err.Error(),
})
return
}
resp.Output = "left_click_drag action performed"
case "left_mouse_down":
if err := a.desktop.ButtonDown(ctx, MouseButtonLeft); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Left mouse down failed.",
Detail: err.Error(),
})
return
}
resp.Output = "left_mouse_down action performed"
case "left_mouse_up":
if err := a.desktop.ButtonUp(ctx, MouseButtonLeft); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Left mouse up failed.",
Detail: err.Error(),
})
return
}
resp.Output = "left_mouse_up action performed"
case "right_click":
x, y, err := coordFromAction(action)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: err.Error(),
})
return
}
x, y = scaleXY(x, y)
if err := a.desktop.Click(ctx, x, y, MouseButtonRight); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Right click failed.",
Detail: err.Error(),
})
return
}
resp.Output = "right_click action performed"
case "middle_click":
x, y, err := coordFromAction(action)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: err.Error(),
})
return
}
x, y = scaleXY(x, y)
if err := a.desktop.Click(ctx, x, y, MouseButtonMiddle); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Middle click failed.",
Detail: err.Error(),
})
return
}
resp.Output = "middle_click action performed"
case "double_click":
x, y, err := coordFromAction(action)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: err.Error(),
})
return
}
x, y = scaleXY(x, y)
if err := a.desktop.DoubleClick(ctx, x, y, MouseButtonLeft); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Double click failed.",
Detail: err.Error(),
})
return
}
resp.Output = "double_click action performed"
case "triple_click":
x, y, err := coordFromAction(action)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: err.Error(),
})
return
}
x, y = scaleXY(x, y)
for range 3 {
if err := a.desktop.Click(ctx, x, y, MouseButtonLeft); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Triple click failed.",
Detail: err.Error(),
})
return
}
}
resp.Output = "triple_click action performed"
case "scroll":
x, y, err := coordFromAction(action)
if err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: err.Error(),
})
return
}
x, y = scaleXY(x, y)
amount := 3
if action.ScrollAmount != nil {
amount = *action.ScrollAmount
}
direction := "down"
if action.ScrollDirection != nil {
direction = *action.ScrollDirection
}
var dx, dy int
switch direction {
case "up":
dy = -amount
case "down":
dy = amount
case "left":
dx = -amount
case "right":
dx = amount
default:
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Invalid scroll direction: " + direction,
})
return
}
if err := a.desktop.Scroll(ctx, x, y, dx, dy); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Scroll failed.",
Detail: err.Error(),
})
return
}
resp.Output = "scroll action performed"
case "hold_key":
if action.Text == nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Missing \"text\" for hold_key action.",
})
return
}
dur := 1000
if action.Duration != nil {
dur = *action.Duration
}
if err := a.desktop.KeyDown(ctx, *action.Text); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Key down failed.",
Detail: err.Error(),
})
return
}
timer := a.clock.NewTimer(time.Duration(dur)*time.Millisecond, "agentdesktop", "hold_key")
defer timer.Stop()
select {
case <-ctx.Done():
// Context canceled; release the key immediately.
if err := a.desktop.KeyUp(ctx, *action.Text); err != nil {
logger.Warn(ctx, "handleAction: KeyUp after context cancel", slog.Error(err))
}
return
case <-timer.C:
}
if err := a.desktop.KeyUp(ctx, *action.Text); err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Key up failed.",
Detail: err.Error(),
})
return
}
resp.Output = "hold_key action performed"
case "screenshot":
result, err := a.desktop.Screenshot(ctx, ScreenshotOptions{
TargetWidth: geometry.DeclaredWidth,
TargetHeight: geometry.DeclaredHeight,
})
if err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Screenshot failed.",
Detail: err.Error(),
})
return
}
resp.Output = "screenshot"
resp.ScreenshotData = result.Data
resp.ScreenshotWidth = geometry.DeclaredWidth
resp.ScreenshotHeight = geometry.DeclaredHeight
default:
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Unknown action: " + action.Action,
})
return
}
elapsedMs := a.clock.Since(handlerStart).Milliseconds()
if ctx.Err() != nil {
logger.Error(ctx, "handleAction: context canceled before writing response",
slog.F("action", action.Action),
slog.F("elapsed_ms", elapsedMs),
slog.Error(ctx.Err()),
)
return
}
logger.Info(ctx, "handleAction: writing response",
slog.F("action", action.Action),
slog.F("elapsed_ms", elapsedMs),
)
httpapi.Write(ctx, rw, http.StatusOK, resp)
}
// Close shuts down the desktop session if one is running.
func (a *API) Close() error {
a.closeMu.Lock()
if a.closed {
a.closeMu.Unlock()
return nil
}
a.closed = true
a.closeMu.Unlock()
return a.desktop.Close()
}
// decodeRecordingRequest decodes and validates a recording request
// from the HTTP body, returning the recording ID. Returns false if
// the request was invalid and an error response was already written.
func (*API) decodeRecordingRequest(rw http.ResponseWriter, r *http.Request) (string, bool) {
ctx := r.Context()
var req struct {
RecordingID string `json:"recording_id"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Failed to decode request body.",
Detail: err.Error(),
})
return "", false
}
if req.RecordingID == "" {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Missing recording_id.",
})
return "", false
}
if _, err := uuid.Parse(req.RecordingID); err != nil {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: "Invalid recording_id format.",
Detail: "recording_id must be a valid UUID.",
})
return "", false
}
return req.RecordingID, true
}
func (a *API) handleRecordingStart(rw http.ResponseWriter, r *http.Request) {
ctx := r.Context()
recordingID, ok := a.decodeRecordingRequest(rw, r)
if !ok {
return
}
a.closeMu.Lock()
if a.closed {
a.closeMu.Unlock()
httpapi.Write(ctx, rw, http.StatusServiceUnavailable, codersdk.Response{
Message: "Desktop API is shutting down.",
})
return
}
a.closeMu.Unlock()
if err := a.desktop.StartRecording(ctx, recordingID); err != nil {
if errors.Is(err, ErrDesktopClosed) {
httpapi.Write(ctx, rw, http.StatusServiceUnavailable, codersdk.Response{
Message: "Desktop API is shutting down.",
})
return
}
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Failed to start recording.",
Detail: err.Error(),
})
return
}
httpapi.Write(ctx, rw, http.StatusOK, codersdk.Response{
Message: "Recording started.",
})
}
func (a *API) handleRecordingStop(rw http.ResponseWriter, r *http.Request) {
ctx := r.Context()
logger := a.logger.With(agentchat.Fields(ctx)...)
recordingID, ok := a.decodeRecordingRequest(rw, r)
if !ok {
return
}
a.closeMu.Lock()
if a.closed {
a.closeMu.Unlock()
httpapi.Write(ctx, rw, http.StatusServiceUnavailable, codersdk.Response{
Message: "Desktop API is shutting down.",
})
return
}
a.closeMu.Unlock()
// Stop recording (idempotent).
// Use a context detached from the HTTP request so that if the
// connection drops, the recording process can still shut down
// gracefully. WithoutCancel preserves request-scoped values.
stopCtx, stopCancel := context.WithTimeout(context.WithoutCancel(r.Context()), 30*time.Second)
defer stopCancel()
artifact, err := a.desktop.StopRecording(stopCtx, recordingID)
if err != nil {
if errors.Is(err, ErrUnknownRecording) {
httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{
Message: "Recording not found.",
Detail: err.Error(),
})
return
}
if errors.Is(err, ErrRecordingCorrupted) {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Recording is corrupted.",
Detail: err.Error(),
})
return
}
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Failed to stop recording.",
Detail: err.Error(),
})
return
}
defer artifact.Reader.Close()
defer func() {
if artifact.ThumbnailReader != nil {
_ = artifact.ThumbnailReader.Close()
}
}()
if artifact.Size > workspacesdk.MaxRecordingSize {
logger.Warn(ctx, "recording file exceeds maximum size",
slog.F("recording_id", recordingID),
slog.F("size", artifact.Size),
slog.F("max_size", workspacesdk.MaxRecordingSize),
)
httpapi.Write(ctx, rw, http.StatusRequestEntityTooLarge, codersdk.Response{
Message: "Recording file exceeds maximum allowed size.",
})
return
}
// Discard the thumbnail if it exceeds the maximum size.
// The server-side consumer also enforces this per-part, but
// rejecting it here avoids streaming a large thumbnail over
// the wire for nothing.
if artifact.ThumbnailReader != nil && artifact.ThumbnailSize > workspacesdk.MaxThumbnailSize {
logger.Warn(ctx, "thumbnail file exceeds maximum size, omitting",
slog.F("recording_id", recordingID),
slog.F("size", artifact.ThumbnailSize),
slog.F("max_size", workspacesdk.MaxThumbnailSize),
)
_ = artifact.ThumbnailReader.Close()
artifact.ThumbnailReader = nil
artifact.ThumbnailSize = 0
}
// The multipart response is best-effort: once WriteHeader(200) is
// called, CreatePart failures produce a truncated response without
// the closing boundary. The server-side consumer handles this
// gracefully, preserving any parts read before the error.
mw := multipart.NewWriter(rw)
defer mw.Close()
rw.Header().Set("Content-Type", "multipart/mixed; boundary="+mw.Boundary())
rw.WriteHeader(http.StatusOK)
// Part 1: video/mp4 (always present).
videoPart, err := mw.CreatePart(textproto.MIMEHeader{
"Content-Type": {"video/mp4"},
})
if err != nil {
logger.Warn(ctx, "failed to create video multipart part",
slog.F("recording_id", recordingID),
slog.Error(err))
return
}
if _, err := io.Copy(videoPart, artifact.Reader); err != nil {
logger.Warn(ctx, "failed to write video multipart part",
slog.F("recording_id", recordingID),
slog.Error(err))
return
}
// Part 2: image/jpeg (present only when thumbnail was extracted).
if artifact.ThumbnailReader != nil {
thumbPart, err := mw.CreatePart(textproto.MIMEHeader{
"Content-Type": {"image/jpeg"},
})
if err != nil {
logger.Warn(ctx, "failed to create thumbnail multipart part",
slog.F("recording_id", recordingID),
slog.Error(err))
return
}
_, _ = io.Copy(thumbPart, artifact.ThumbnailReader)
}
}
// coordFromAction extracts the coordinate pair from a DesktopAction,
// returning an error if the coordinate field is missing.
func coordFromAction(action DesktopAction) (x, y int, err error) {
if action.Coordinate == nil {
return 0, 0, &missingFieldError{field: "coordinate", action: action.Action}
}
return action.Coordinate[0], action.Coordinate[1], nil
}
func desktopGeometryForAction(cfg DisplayConfig, action DesktopAction) workspacesdk.DesktopGeometry {
declaredWidth := cfg.Width
declaredHeight := cfg.Height
if action.ScaledWidth != nil && *action.ScaledWidth > 0 {
declaredWidth = *action.ScaledWidth
}
if action.ScaledHeight != nil && *action.ScaledHeight > 0 {
declaredHeight = *action.ScaledHeight
}
return workspacesdk.NewDesktopGeometryWithDeclared(
cfg.Width,
cfg.Height,
declaredWidth,
declaredHeight,
)
}
// missingFieldError is returned when a required field is absent from
// a DesktopAction.
type missingFieldError struct {
field string
action string
}
func (e *missingFieldError) Error() string {
return "Missing \"" + e.field + "\" for " + e.action + " action."
}