fix: improve background process handling for agent tools (#23132)

## Problem

Models frequently use shell `&` instead of `run_in_background=true` when
starting long-running processes through `/agents`, causing them to die
shortly after starting. This happens because:

1. **No guidance in tool schema** — The `ExecuteArgs` struct had zero
`description` tags. The model saw `run_in_background: boolean
(optional)` with no explanation of when/why to use it.
2. **Shell `&` is silently broken** — `sh -c "command &"` forks the
process, the shell exits immediately, and the forked child becomes an
orphan not tracked by the process manager.
3. **No process group isolation** — The SSH subsystem sets `Setsid:
true` on spawned processes, but the agent process manager set no
`SysProcAttr` at all. Signals only hit the top-level `sh`, not child
processes.

## Investigation

Compared our implementation against **openai/codex** and **coder/mux**:

| Aspect | codex | mux | coder/coder (before) |
|--------|-------|-----|---------------------|
| Background flag | Yield/resume with `session_id` | `run_in_background`
with rich description | `run_in_background` with **no description** |
| `&` handling | `setsid()` + `killpg()` | `detached: true` +
`killProcessTree()` | **Nothing** — orphaned children escape |
| Process isolation | `setsid()` on every spawn | `set -m; nohup ...
setsid` for background | **No `SysProcAttr` at all** |
| Signal delivery | `killpg(pgid, sig)` — entire group | `kill -15
-\$pid` — negative PID | `proc.cmd.Process.Signal()` — **PID only** |

## Changes

### Fix 1: Add descriptions to `ExecuteArgs` (highest impact)
The model now sees explicit guidance: *"Use for long-running processes
like dev servers, file watchers, or builds. Do NOT use shell & — it will
not work correctly."*

### Fix 2: Update tool description
The top-level execute tool description now reinforces: *"Use
run_in_background=true for long-running processes. Never use shell '&'
for backgrounding."*

### Fix 3: Detect trailing `&` and auto-promote to background
Defense-in-depth: if the model still uses `command &`, we strip the `&`
and promote to `run_in_background=true` automatically. Correctly
distinguishes `&` from `&&`.

### Fix 4: Process group isolation (`Setpgid`)
New platform-specific files (`proc_other.go` / `proc_windows.go`)
following the same pattern as `agentssh/exec_other.go`. Every spawned
process gets its own process group.

### Fix 5: Process group signaling
`signal()` now uses `syscall.Kill(-pid, sig)` on Unix to signal the
entire process group, ensuring child processes from shell pipelines are
also cleaned up.

## Testing
All existing `agent/agentproc` tests pass. Both packages compile
cleanly.
This commit is contained in:
Kyle Carberry
2026-03-16 13:22:10 -07:00
committed by GitHub
parent 89bb5bb945
commit 6972d073a2
4 changed files with 68 additions and 9 deletions
+26
View File
@@ -0,0 +1,26 @@
//go:build !windows
package agentproc
import (
"os"
"syscall"
)
// procSysProcAttr returns the SysProcAttr to use when spawning
// processes. On Unix, Setpgid creates a new process group so
// that signals can be delivered to the entire group (the shell
// and all its children).
func procSysProcAttr() *syscall.SysProcAttr {
return &syscall.SysProcAttr{
Setpgid: true,
}
}
// signalProcess sends a signal to the process group rooted at p.
// Using the negative PID sends the signal to every process in the
// group, ensuring child processes (e.g. from shell pipelines) are
// also signaled.
func signalProcess(p *os.Process, sig syscall.Signal) error {
return syscall.Kill(-p.Pid, sig)
}
+20
View File
@@ -0,0 +1,20 @@
package agentproc
import (
"os"
"syscall"
)
// procSysProcAttr returns the SysProcAttr to use when spawning
// processes. On Windows, process groups are not supported in the
// same way as Unix, so this returns an empty struct.
func procSysProcAttr() *syscall.SysProcAttr {
return &syscall.SysProcAttr{}
}
// signalProcess sends a signal directly to the process. Windows
// does not support process group signaling, so we fall back to
// sending the signal to the process itself.
func signalProcess(p *os.Process, _ syscall.Signal) error {
return p.Kill()
}
+7 -4
View File
@@ -113,6 +113,7 @@ func (m *manager) start(req workspacesdk.StartProcessRequest, chatID string) (*p
cmd.Dir = req.WorkDir
}
cmd.Stdin = nil
cmd.SysProcAttr = procSysProcAttr()
// WaitDelay ensures cmd.Wait returns promptly after
// the process is killed, even if child processes are
@@ -272,13 +273,15 @@ func (m *manager) signal(id string, sig string) error {
switch sig {
case "kill":
if err := proc.cmd.Process.Kill(); err != nil {
// Use process group kill to ensure child processes
// (e.g. from shell pipelines) are also killed.
if err := signalProcess(proc.cmd.Process, syscall.SIGKILL); err != nil {
return xerrors.Errorf("kill process: %w", err)
}
case "terminate":
//nolint:revive // syscall.SIGTERM is portable enough
// for our supported platforms.
if err := proc.cmd.Process.Signal(syscall.SIGTERM); err != nil {
// Use process group signal to ensure child processes
// are also terminated.
if err := signalProcess(proc.cmd.Process, syscall.SIGTERM); err != nil {
return xerrors.Errorf("terminate process: %w", err)
}
default:
+15 -5
View File
@@ -78,10 +78,10 @@ type ProcessToolOptions struct {
// ExecuteArgs are the parameters accepted by the execute tool.
type ExecuteArgs struct {
Command string `json:"command"`
Timeout *string `json:"timeout,omitempty"`
WorkDir *string `json:"workdir,omitempty"`
RunInBackground *bool `json:"run_in_background,omitempty"`
Command string `json:"command" description:"The shell command to execute."`
Timeout *string `json:"timeout,omitempty" description:"Timeout duration (e.g. '30s', '5m'). Default is 10s. Only applies to foreground commands."`
WorkDir *string `json:"workdir,omitempty" description:"Working directory for the command."`
RunInBackground *bool `json:"run_in_background,omitempty" description:"Run this command in the background without blocking. Use for long-running processes like dev servers, file watchers, or builds that run longer than 5 seconds. Do NOT use shell & to background processes — it will not work correctly. Always use this parameter instead."`
}
// Execute returns an AgentTool that runs a shell command in the
@@ -89,7 +89,7 @@ type ExecuteArgs struct {
func Execute(options ExecuteOptions) fantasy.AgentTool {
return fantasy.NewAgentTool(
"execute",
"Execute a shell command in the workspace.",
"Execute a shell command in the workspace. Use run_in_background=true for long-running processes (dev servers, file watchers, builds). Never use shell '&' for backgrounding.",
func(ctx context.Context, args ExecuteArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
if options.GetWorkspaceConn == nil {
return fantasy.NewTextErrorResponse("workspace connection resolver is not configured"), nil
@@ -122,6 +122,16 @@ func executeTool(
background := args.RunInBackground != nil && *args.RunInBackground
// Detect shell-style backgrounding (trailing &) and promote to
// background mode. Models sometimes use "cmd &" instead of the
// run_in_background parameter, which causes the shell to fork
// and exit immediately, leaving an untracked orphan process.
trimmed := strings.TrimSpace(args.Command)
if !background && strings.HasSuffix(trimmed, "&") && !strings.HasSuffix(trimmed, "&&") {
background = true
args.Command = strings.TrimSpace(strings.TrimSuffix(trimmed, "&"))
}
var workDir string
if args.WorkDir != nil {
workDir = *args.WorkDir