From 6972d073a2176d5baace71fa52edf0b2df1d9179 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 16 Mar 2026 13:22:10 -0700 Subject: [PATCH] fix: improve background process handling for agent tools (#23132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Models frequently use shell `&` instead of `run_in_background=true` when starting long-running processes through `/agents`, causing them to die shortly after starting. This happens because: 1. **No guidance in tool schema** — The `ExecuteArgs` struct had zero `description` tags. The model saw `run_in_background: boolean (optional)` with no explanation of when/why to use it. 2. **Shell `&` is silently broken** — `sh -c "command &"` forks the process, the shell exits immediately, and the forked child becomes an orphan not tracked by the process manager. 3. **No process group isolation** — The SSH subsystem sets `Setsid: true` on spawned processes, but the agent process manager set no `SysProcAttr` at all. Signals only hit the top-level `sh`, not child processes. ## Investigation Compared our implementation against **openai/codex** and **coder/mux**: | Aspect | codex | mux | coder/coder (before) | |--------|-------|-----|---------------------| | Background flag | Yield/resume with `session_id` | `run_in_background` with rich description | `run_in_background` with **no description** | | `&` handling | `setsid()` + `killpg()` | `detached: true` + `killProcessTree()` | **Nothing** — orphaned children escape | | Process isolation | `setsid()` on every spawn | `set -m; nohup ... setsid` for background | **No `SysProcAttr` at all** | | Signal delivery | `killpg(pgid, sig)` — entire group | `kill -15 -\$pid` — negative PID | `proc.cmd.Process.Signal()` — **PID only** | ## Changes ### Fix 1: Add descriptions to `ExecuteArgs` (highest impact) The model now sees explicit guidance: *"Use for long-running processes like dev servers, file watchers, or builds. Do NOT use shell & — it will not work correctly."* ### Fix 2: Update tool description The top-level execute tool description now reinforces: *"Use run_in_background=true for long-running processes. Never use shell '&' for backgrounding."* ### Fix 3: Detect trailing `&` and auto-promote to background Defense-in-depth: if the model still uses `command &`, we strip the `&` and promote to `run_in_background=true` automatically. Correctly distinguishes `&` from `&&`. ### Fix 4: Process group isolation (`Setpgid`) New platform-specific files (`proc_other.go` / `proc_windows.go`) following the same pattern as `agentssh/exec_other.go`. Every spawned process gets its own process group. ### Fix 5: Process group signaling `signal()` now uses `syscall.Kill(-pid, sig)` on Unix to signal the entire process group, ensuring child processes from shell pipelines are also cleaned up. ## Testing All existing `agent/agentproc` tests pass. Both packages compile cleanly. --- agent/agentproc/proc_other.go | 26 ++++++++++++++++++++++++++ agent/agentproc/proc_windows.go | 20 ++++++++++++++++++++ agent/agentproc/process.go | 11 +++++++---- coderd/chatd/chattool/execute.go | 20 +++++++++++++++----- 4 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 agent/agentproc/proc_other.go create mode 100644 agent/agentproc/proc_windows.go diff --git a/agent/agentproc/proc_other.go b/agent/agentproc/proc_other.go new file mode 100644 index 0000000000..e56cc5d953 --- /dev/null +++ b/agent/agentproc/proc_other.go @@ -0,0 +1,26 @@ +//go:build !windows + +package agentproc + +import ( + "os" + "syscall" +) + +// procSysProcAttr returns the SysProcAttr to use when spawning +// processes. On Unix, Setpgid creates a new process group so +// that signals can be delivered to the entire group (the shell +// and all its children). +func procSysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{ + Setpgid: true, + } +} + +// signalProcess sends a signal to the process group rooted at p. +// Using the negative PID sends the signal to every process in the +// group, ensuring child processes (e.g. from shell pipelines) are +// also signaled. +func signalProcess(p *os.Process, sig syscall.Signal) error { + return syscall.Kill(-p.Pid, sig) +} diff --git a/agent/agentproc/proc_windows.go b/agent/agentproc/proc_windows.go new file mode 100644 index 0000000000..5efbb3efbb --- /dev/null +++ b/agent/agentproc/proc_windows.go @@ -0,0 +1,20 @@ +package agentproc + +import ( + "os" + "syscall" +) + +// procSysProcAttr returns the SysProcAttr to use when spawning +// processes. On Windows, process groups are not supported in the +// same way as Unix, so this returns an empty struct. +func procSysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{} +} + +// signalProcess sends a signal directly to the process. Windows +// does not support process group signaling, so we fall back to +// sending the signal to the process itself. +func signalProcess(p *os.Process, _ syscall.Signal) error { + return p.Kill() +} diff --git a/agent/agentproc/process.go b/agent/agentproc/process.go index b29446f3dc..57cbb0fbfb 100644 --- a/agent/agentproc/process.go +++ b/agent/agentproc/process.go @@ -113,6 +113,7 @@ func (m *manager) start(req workspacesdk.StartProcessRequest, chatID string) (*p cmd.Dir = req.WorkDir } cmd.Stdin = nil + cmd.SysProcAttr = procSysProcAttr() // WaitDelay ensures cmd.Wait returns promptly after // the process is killed, even if child processes are @@ -272,13 +273,15 @@ func (m *manager) signal(id string, sig string) error { switch sig { case "kill": - if err := proc.cmd.Process.Kill(); err != nil { + // Use process group kill to ensure child processes + // (e.g. from shell pipelines) are also killed. + if err := signalProcess(proc.cmd.Process, syscall.SIGKILL); err != nil { return xerrors.Errorf("kill process: %w", err) } case "terminate": - //nolint:revive // syscall.SIGTERM is portable enough - // for our supported platforms. - if err := proc.cmd.Process.Signal(syscall.SIGTERM); err != nil { + // Use process group signal to ensure child processes + // are also terminated. + if err := signalProcess(proc.cmd.Process, syscall.SIGTERM); err != nil { return xerrors.Errorf("terminate process: %w", err) } default: diff --git a/coderd/chatd/chattool/execute.go b/coderd/chatd/chattool/execute.go index 6dec92eb08..d22e65bea0 100644 --- a/coderd/chatd/chattool/execute.go +++ b/coderd/chatd/chattool/execute.go @@ -78,10 +78,10 @@ type ProcessToolOptions struct { // ExecuteArgs are the parameters accepted by the execute tool. type ExecuteArgs struct { - Command string `json:"command"` - Timeout *string `json:"timeout,omitempty"` - WorkDir *string `json:"workdir,omitempty"` - RunInBackground *bool `json:"run_in_background,omitempty"` + Command string `json:"command" description:"The shell command to execute."` + Timeout *string `json:"timeout,omitempty" description:"Timeout duration (e.g. '30s', '5m'). Default is 10s. Only applies to foreground commands."` + WorkDir *string `json:"workdir,omitempty" description:"Working directory for the command."` + RunInBackground *bool `json:"run_in_background,omitempty" description:"Run this command in the background without blocking. Use for long-running processes like dev servers, file watchers, or builds that run longer than 5 seconds. Do NOT use shell & to background processes — it will not work correctly. Always use this parameter instead."` } // Execute returns an AgentTool that runs a shell command in the @@ -89,7 +89,7 @@ type ExecuteArgs struct { func Execute(options ExecuteOptions) fantasy.AgentTool { return fantasy.NewAgentTool( "execute", - "Execute a shell command in the workspace.", + "Execute a shell command in the workspace. Use run_in_background=true for long-running processes (dev servers, file watchers, builds). Never use shell '&' for backgrounding.", func(ctx context.Context, args ExecuteArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) { if options.GetWorkspaceConn == nil { return fantasy.NewTextErrorResponse("workspace connection resolver is not configured"), nil @@ -122,6 +122,16 @@ func executeTool( background := args.RunInBackground != nil && *args.RunInBackground + // Detect shell-style backgrounding (trailing &) and promote to + // background mode. Models sometimes use "cmd &" instead of the + // run_in_background parameter, which causes the shell to fork + // and exit immediately, leaving an untracked orphan process. + trimmed := strings.TrimSpace(args.Command) + if !background && strings.HasSuffix(trimmed, "&") && !strings.HasSuffix(trimmed, "&&") { + background = true + args.Command = strings.TrimSpace(strings.TrimSuffix(trimmed, "&")) + } + var workDir string if args.WorkDir != nil { workDir = *args.WorkDir