fix: improve background process handling for agent tools (#23132)

## Problem Models frequently use shell `&` instead of `run_in_background=true` when starting long-running processes through `/agents`, causing them to die shortly after starting. This happens because: 1. **No guidance in tool schema** — The `ExecuteArgs` struct had zero `description` tags. The model saw `run_in_background: boolean (optional)` with no explanation of when/why to use it. 2. **Shell `&` is silently broken** — `sh -c "command &"` forks the process, the shell exits immediately, and the forked child becomes an orphan not tracked by the process manager. 3. **No process group isolation** — The SSH subsystem sets `Setsid: true` on spawned processes, but the agent process manager set no `SysProcAttr` at all. Signals only hit the top-level `sh`, not child processes. ## Investigation Compared our implementation against **openai/codex** and **coder/mux**: | Aspect | codex | mux | coder/coder (before) | |--------|-------|-----|---------------------| | Background flag | Yield/resume with `session_id` | `run_in_background` with rich description | `run_in_background` with **no description** | | `&` handling | `setsid()` + `killpg()` | `detached: true` + `killProcessTree()` | **Nothing** — orphaned children escape | | Process isolation | `setsid()` on every spawn | `set -m; nohup ... setsid` for background | **No `SysProcAttr` at all** | | Signal delivery | `killpg(pgid, sig)` — entire group | `kill -15 -\$pid` — negative PID | `proc.cmd.Process.Signal()` — **PID only** | ## Changes ### Fix 1: Add descriptions to `ExecuteArgs` (highest impact) The model now sees explicit guidance: *"Use for long-running processes like dev servers, file watchers, or builds. Do NOT use shell & — it will not work correctly."* ### Fix 2: Update tool description The top-level execute tool description now reinforces: *"Use run_in_background=true for long-running processes. Never use shell '&' for backgrounding."* ### Fix 3: Detect trailing `&` and auto-promote to background Defense-in-depth: if the model still uses `command &`, we strip the `&` and promote to `run_in_background=true` automatically. Correctly distinguishes `&` from `&&`. ### Fix 4: Process group isolation (`Setpgid`) New platform-specific files (`proc_other.go` / `proc_windows.go`) following the same pattern as `agentssh/exec_other.go`. Every spawned process gets its own process group. ### Fix 5: Process group signaling `signal()` now uses `syscall.Kill(-pid, sig)` on Unix to signal the entire process group, ensuring child processes from shell pipelines are also cleaned up. ## Testing All existing `agent/agentproc` tests pass. Both packages compile cleanly.
2026-06-02 20:48:20 +00:00 · 2026-03-16 13:22:10 -07:00
parent 89bb5bb945
commit 6972d073a2
4 changed files with 68 additions and 9 deletions
@@ -0,0 +1,26 @@
+//go:build !windows
+
+package agentproc
+
+import (
+	"os"
+	"syscall"
+)
+
+// procSysProcAttr returns the SysProcAttr to use when spawning
+// processes. On Unix, Setpgid creates a new process group so
+// that signals can be delivered to the entire group (the shell
+// and all its children).
+func procSysProcAttr() *syscall.SysProcAttr {
+	return &syscall.SysProcAttr{
+		Setpgid: true,
+	}
+}
+
+// signalProcess sends a signal to the process group rooted at p.
+// Using the negative PID sends the signal to every process in the
+// group, ensuring child processes (e.g. from shell pipelines) are
+// also signaled.
+func signalProcess(p *os.Process, sig syscall.Signal) error {
+	return syscall.Kill(-p.Pid, sig)
+}
@@ -0,0 +1,20 @@
+package agentproc
+
+import (
+	"os"
+	"syscall"
+)
+
+// procSysProcAttr returns the SysProcAttr to use when spawning
+// processes. On Windows, process groups are not supported in the
+// same way as Unix, so this returns an empty struct.
+func procSysProcAttr() *syscall.SysProcAttr {
+	return &syscall.SysProcAttr{}
+}
+
+// signalProcess sends a signal directly to the process. Windows
+// does not support process group signaling, so we fall back to
+// sending the signal to the process itself.
+func signalProcess(p *os.Process, _ syscall.Signal) error {
+	return p.Kill()
+}
@@ -113,6 +113,7 @@ func (m *manager) start(req workspacesdk.StartProcessRequest, chatID string) (*p
 		cmd.Dir = req.WorkDir
 	}
 	cmd.Stdin = nil
+	cmd.SysProcAttr = procSysProcAttr()

 	// WaitDelay ensures cmd.Wait returns promptly after
 	// the process is killed, even if child processes are
@@ -272,13 +273,15 @@ func (m *manager) signal(id string, sig string) error {

 	switch sig {
 	case "kill":
-		if err := proc.cmd.Process.Kill(); err != nil {
+		// Use process group kill to ensure child processes
+		// (e.g. from shell pipelines) are also killed.
+		if err := signalProcess(proc.cmd.Process, syscall.SIGKILL); err != nil {
 			return xerrors.Errorf("kill process: %w", err)
 		}
 	case "terminate":
-		//nolint:revive // syscall.SIGTERM is portable enough
-		// for our supported platforms.
-		if err := proc.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+		// Use process group signal to ensure child processes
+		// are also terminated.
+		if err := signalProcess(proc.cmd.Process, syscall.SIGTERM); err != nil {
 			return xerrors.Errorf("terminate process: %w", err)
 		}
 	default:
@@ -78,10 +78,10 @@ type ProcessToolOptions struct {

 // ExecuteArgs are the parameters accepted by the execute tool.
 type ExecuteArgs struct {
-	Command         string  `json:"command"`
-	Timeout         *string `json:"timeout,omitempty"`
-	WorkDir         *string `json:"workdir,omitempty"`
-	RunInBackground *bool   `json:"run_in_background,omitempty"`
+	Command         string  `json:"command" description:"The shell command to execute."`
+	Timeout         *string `json:"timeout,omitempty" description:"Timeout duration (e.g. '30s', '5m'). Default is 10s. Only applies to foreground commands."`
+	WorkDir         *string `json:"workdir,omitempty" description:"Working directory for the command."`
+	RunInBackground *bool   `json:"run_in_background,omitempty" description:"Run this command in the background without blocking. Use for long-running processes like dev servers, file watchers, or builds that run longer than 5 seconds. Do NOT use shell & to background processes — it will not work correctly. Always use this parameter instead."`
 }

 // Execute returns an AgentTool that runs a shell command in the
@@ -89,7 +89,7 @@ type ExecuteArgs struct {
 func Execute(options ExecuteOptions) fantasy.AgentTool {
 	return fantasy.NewAgentTool(
 		"execute",
-		"Execute a shell command in the workspace.",
+		"Execute a shell command in the workspace. Use run_in_background=true for long-running processes (dev servers, file watchers, builds). Never use shell '&' for backgrounding.",
 		func(ctx context.Context, args ExecuteArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
 			if options.GetWorkspaceConn == nil {
 				return fantasy.NewTextErrorResponse("workspace connection resolver is not configured"), nil
@@ -122,6 +122,16 @@ func executeTool(

 	background := args.RunInBackground != nil && *args.RunInBackground

+	// Detect shell-style backgrounding (trailing &) and promote to
+	// background mode. Models sometimes use "cmd &" instead of the
+	// run_in_background parameter, which causes the shell to fork
+	// and exit immediately, leaving an untracked orphan process.
+	trimmed := strings.TrimSpace(args.Command)
+	if !background && strings.HasSuffix(trimmed, "&") && !strings.HasSuffix(trimmed, "&&") {
+		background = true
+		args.Command = strings.TrimSpace(strings.TrimSuffix(trimmed, "&"))
+	}
+
 	var workDir string
 	if args.WorkDir != nil {
 		workDir = *args.WorkDir