mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
fix: improve background process handling for agent tools (#23132)
## Problem Models frequently use shell `&` instead of `run_in_background=true` when starting long-running processes through `/agents`, causing them to die shortly after starting. This happens because: 1. **No guidance in tool schema** — The `ExecuteArgs` struct had zero `description` tags. The model saw `run_in_background: boolean (optional)` with no explanation of when/why to use it. 2. **Shell `&` is silently broken** — `sh -c "command &"` forks the process, the shell exits immediately, and the forked child becomes an orphan not tracked by the process manager. 3. **No process group isolation** — The SSH subsystem sets `Setsid: true` on spawned processes, but the agent process manager set no `SysProcAttr` at all. Signals only hit the top-level `sh`, not child processes. ## Investigation Compared our implementation against **openai/codex** and **coder/mux**: | Aspect | codex | mux | coder/coder (before) | |--------|-------|-----|---------------------| | Background flag | Yield/resume with `session_id` | `run_in_background` with rich description | `run_in_background` with **no description** | | `&` handling | `setsid()` + `killpg()` | `detached: true` + `killProcessTree()` | **Nothing** — orphaned children escape | | Process isolation | `setsid()` on every spawn | `set -m; nohup ... setsid` for background | **No `SysProcAttr` at all** | | Signal delivery | `killpg(pgid, sig)` — entire group | `kill -15 -\$pid` — negative PID | `proc.cmd.Process.Signal()` — **PID only** | ## Changes ### Fix 1: Add descriptions to `ExecuteArgs` (highest impact) The model now sees explicit guidance: *"Use for long-running processes like dev servers, file watchers, or builds. Do NOT use shell & — it will not work correctly."* ### Fix 2: Update tool description The top-level execute tool description now reinforces: *"Use run_in_background=true for long-running processes. Never use shell '&' for backgrounding."* ### Fix 3: Detect trailing `&` and auto-promote to background Defense-in-depth: if the model still uses `command &`, we strip the `&` and promote to `run_in_background=true` automatically. Correctly distinguishes `&` from `&&`. ### Fix 4: Process group isolation (`Setpgid`) New platform-specific files (`proc_other.go` / `proc_windows.go`) following the same pattern as `agentssh/exec_other.go`. Every spawned process gets its own process group. ### Fix 5: Process group signaling `signal()` now uses `syscall.Kill(-pid, sig)` on Unix to signal the entire process group, ensuring child processes from shell pipelines are also cleaned up. ## Testing All existing `agent/agentproc` tests pass. Both packages compile cleanly.
This commit is contained in:
@@ -0,0 +1,26 @@
|
||||
//go:build !windows
|
||||
|
||||
package agentproc
|
||||
|
||||
import (
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// procSysProcAttr returns the SysProcAttr to use when spawning
|
||||
// processes. On Unix, Setpgid creates a new process group so
|
||||
// that signals can be delivered to the entire group (the shell
|
||||
// and all its children).
|
||||
func procSysProcAttr() *syscall.SysProcAttr {
|
||||
return &syscall.SysProcAttr{
|
||||
Setpgid: true,
|
||||
}
|
||||
}
|
||||
|
||||
// signalProcess sends a signal to the process group rooted at p.
|
||||
// Using the negative PID sends the signal to every process in the
|
||||
// group, ensuring child processes (e.g. from shell pipelines) are
|
||||
// also signaled.
|
||||
func signalProcess(p *os.Process, sig syscall.Signal) error {
|
||||
return syscall.Kill(-p.Pid, sig)
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package agentproc
|
||||
|
||||
import (
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// procSysProcAttr returns the SysProcAttr to use when spawning
|
||||
// processes. On Windows, process groups are not supported in the
|
||||
// same way as Unix, so this returns an empty struct.
|
||||
func procSysProcAttr() *syscall.SysProcAttr {
|
||||
return &syscall.SysProcAttr{}
|
||||
}
|
||||
|
||||
// signalProcess sends a signal directly to the process. Windows
|
||||
// does not support process group signaling, so we fall back to
|
||||
// sending the signal to the process itself.
|
||||
func signalProcess(p *os.Process, _ syscall.Signal) error {
|
||||
return p.Kill()
|
||||
}
|
||||
@@ -113,6 +113,7 @@ func (m *manager) start(req workspacesdk.StartProcessRequest, chatID string) (*p
|
||||
cmd.Dir = req.WorkDir
|
||||
}
|
||||
cmd.Stdin = nil
|
||||
cmd.SysProcAttr = procSysProcAttr()
|
||||
|
||||
// WaitDelay ensures cmd.Wait returns promptly after
|
||||
// the process is killed, even if child processes are
|
||||
@@ -272,13 +273,15 @@ func (m *manager) signal(id string, sig string) error {
|
||||
|
||||
switch sig {
|
||||
case "kill":
|
||||
if err := proc.cmd.Process.Kill(); err != nil {
|
||||
// Use process group kill to ensure child processes
|
||||
// (e.g. from shell pipelines) are also killed.
|
||||
if err := signalProcess(proc.cmd.Process, syscall.SIGKILL); err != nil {
|
||||
return xerrors.Errorf("kill process: %w", err)
|
||||
}
|
||||
case "terminate":
|
||||
//nolint:revive // syscall.SIGTERM is portable enough
|
||||
// for our supported platforms.
|
||||
if err := proc.cmd.Process.Signal(syscall.SIGTERM); err != nil {
|
||||
// Use process group signal to ensure child processes
|
||||
// are also terminated.
|
||||
if err := signalProcess(proc.cmd.Process, syscall.SIGTERM); err != nil {
|
||||
return xerrors.Errorf("terminate process: %w", err)
|
||||
}
|
||||
default:
|
||||
|
||||
@@ -78,10 +78,10 @@ type ProcessToolOptions struct {
|
||||
|
||||
// ExecuteArgs are the parameters accepted by the execute tool.
|
||||
type ExecuteArgs struct {
|
||||
Command string `json:"command"`
|
||||
Timeout *string `json:"timeout,omitempty"`
|
||||
WorkDir *string `json:"workdir,omitempty"`
|
||||
RunInBackground *bool `json:"run_in_background,omitempty"`
|
||||
Command string `json:"command" description:"The shell command to execute."`
|
||||
Timeout *string `json:"timeout,omitempty" description:"Timeout duration (e.g. '30s', '5m'). Default is 10s. Only applies to foreground commands."`
|
||||
WorkDir *string `json:"workdir,omitempty" description:"Working directory for the command."`
|
||||
RunInBackground *bool `json:"run_in_background,omitempty" description:"Run this command in the background without blocking. Use for long-running processes like dev servers, file watchers, or builds that run longer than 5 seconds. Do NOT use shell & to background processes — it will not work correctly. Always use this parameter instead."`
|
||||
}
|
||||
|
||||
// Execute returns an AgentTool that runs a shell command in the
|
||||
@@ -89,7 +89,7 @@ type ExecuteArgs struct {
|
||||
func Execute(options ExecuteOptions) fantasy.AgentTool {
|
||||
return fantasy.NewAgentTool(
|
||||
"execute",
|
||||
"Execute a shell command in the workspace.",
|
||||
"Execute a shell command in the workspace. Use run_in_background=true for long-running processes (dev servers, file watchers, builds). Never use shell '&' for backgrounding.",
|
||||
func(ctx context.Context, args ExecuteArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
|
||||
if options.GetWorkspaceConn == nil {
|
||||
return fantasy.NewTextErrorResponse("workspace connection resolver is not configured"), nil
|
||||
@@ -122,6 +122,16 @@ func executeTool(
|
||||
|
||||
background := args.RunInBackground != nil && *args.RunInBackground
|
||||
|
||||
// Detect shell-style backgrounding (trailing &) and promote to
|
||||
// background mode. Models sometimes use "cmd &" instead of the
|
||||
// run_in_background parameter, which causes the shell to fork
|
||||
// and exit immediately, leaving an untracked orphan process.
|
||||
trimmed := strings.TrimSpace(args.Command)
|
||||
if !background && strings.HasSuffix(trimmed, "&") && !strings.HasSuffix(trimmed, "&&") {
|
||||
background = true
|
||||
args.Command = strings.TrimSpace(strings.TrimSuffix(trimmed, "&"))
|
||||
}
|
||||
|
||||
var workDir string
|
||||
if args.WorkDir != nil {
|
||||
workDir = *args.WorkDir
|
||||
|
||||
Reference in New Issue
Block a user