fix: prevent stdio MCP server subprocess from dying after connect (#24035)

## Problem

MCP servers configured in `.mcp.json` with stdio transport are
discovered successfully (tools appear) but die immediately after
connection, making all tool calls fail.

## Root Cause

In `connectServer`, the subprocess is spawned with `connectCtx` — a
30-second timeout context whose `cancel()` is deferred:

```go
connectCtx, cancel := context.WithTimeout(ctx, connectTimeout)
defer cancel()
if err := c.Start(connectCtx); err != nil { ... }
```

The mcp-go stdio transport calls `exec.CommandContext(connectCtx, ...)`.
When `connectServer` returns, `cancel()` fires, and
`exec.CommandContext` sends SIGKILL to the subprocess. The process
immediately becomes a zombie.

Confirmed by checking `/proc/<pid>/status` after context cancellation:
```
State: Z (zombie)
```

## Fix

Pass the parent `ctx` (which is `a.gracefulCtx` — the agent's long-lived
context) to `c.Start()`. `connectCtx` continues to bound only the
`Initialize()` handshake. The subprocess is cleaned up when the Manager
is closed or the parent context is canceled.

## Regression Test

Added `TestConnectServer_StdioProcessSurvivesConnect` which:
- Spawns a real subprocess (re-execs the test binary as a fake MCP
server)
- Calls `connectServer` and lets it return (internal `connectCtx` gets
canceled)
- Verifies the subprocess is still alive by calling `ListTools`

The test **fails** on the old code with `transport error: context
deadline exceeded` and **passes** with the fix.

> Generated with [Coder Agents](https://coder.com/agents)
This commit is contained in:
Kyle Carberry
2026-04-05 08:04:13 -04:00
committed by GitHub
parent 8625543413
commit 5b32c4d79d
2 changed files with 126 additions and 1 deletions
+5 -1
View File
@@ -187,7 +187,11 @@ func (*Manager) connectServer(ctx context.Context, cfg ServerConfig) (*client.Cl
connectCtx, cancel := context.WithTimeout(ctx, connectTimeout)
defer cancel()
if err := c.Start(connectCtx); err != nil {
// Use the parent ctx (not connectCtx) so the subprocess outlives
// the connect/initialize handshake. connectCtx bounds only the
// Initialize call below. The subprocess is cleaned up when the
// Manager is closed or ctx is canceled.
if err := c.Start(ctx); err != nil {
_ = c.Close()
return nil, xerrors.Errorf("start %q: %w", cfg.Name, err)
}
+121
View File
@@ -1,6 +1,11 @@
package agentmcp
import (
"bufio"
"context"
"encoding/json"
"fmt"
"os"
"testing"
"github.com/mark3labs/mcp-go/mcp"
@@ -8,6 +13,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/testutil"
)
func TestSplitToolName(t *testing.T) {
@@ -193,3 +199,118 @@ func TestConvertResult(t *testing.T) {
})
}
}
// TestConnectServer_StdioProcessSurvivesConnect verifies that a stdio MCP
// server subprocess remains alive after connectServer returns. This is a
// regression test for a bug where the subprocess was tied to a short-lived
// connectCtx and killed as soon as the context was canceled.
func TestConnectServer_StdioProcessSurvivesConnect(t *testing.T) {
t.Parallel()
if os.Getenv("TEST_MCP_FAKE_SERVER") == "1" {
// Child process: act as a minimal MCP server over stdio.
runFakeMCPServer()
return
}
// Get the path to the test binary so we can re-exec ourselves
// as a fake MCP server subprocess.
testBin, err := os.Executable()
require.NoError(t, err)
cfg := ServerConfig{
Name: "fake",
Transport: "stdio",
Command: testBin,
Args: []string{"-test.run=^TestConnectServer_StdioProcessSurvivesConnect$"},
Env: map[string]string{"TEST_MCP_FAKE_SERVER": "1"},
}
ctx := testutil.Context(t, testutil.WaitLong)
m := &Manager{}
client, err := m.connectServer(ctx, cfg)
require.NoError(t, err, "connectServer should succeed")
t.Cleanup(func() { _ = client.Close() })
// At this point connectServer has returned and its internal
// connectCtx has been canceled. The subprocess must still be
// alive. Verify by listing tools (requires a live server).
listCtx, listCancel := context.WithTimeout(ctx, testutil.WaitShort)
defer listCancel()
result, err := client.ListTools(listCtx, mcp.ListToolsRequest{})
require.NoError(t, err, "ListTools should succeed — server must be alive after connect")
require.Len(t, result.Tools, 1)
assert.Equal(t, "echo", result.Tools[0].Name)
}
// runFakeMCPServer implements a minimal JSON-RPC / MCP server over
// stdin/stdout, just enough for initialize + tools/list.
func runFakeMCPServer() {
scanner := bufio.NewScanner(os.Stdin)
for scanner.Scan() {
line := scanner.Bytes()
var req struct {
JSONRPC string `json:"jsonrpc"`
ID json.RawMessage `json:"id"`
Method string `json:"method"`
}
if err := json.Unmarshal(line, &req); err != nil {
continue
}
var resp any
switch req.Method {
case "initialize":
resp = map[string]any{
"jsonrpc": "2.0",
"id": req.ID,
"result": map[string]any{
"protocolVersion": "2025-03-26",
"capabilities": map[string]any{
"tools": map[string]any{},
},
"serverInfo": map[string]any{
"name": "fake-server",
"version": "0.0.1",
},
},
}
case "notifications/initialized":
// No response needed for notifications.
continue
case "tools/list":
resp = map[string]any{
"jsonrpc": "2.0",
"id": req.ID,
"result": map[string]any{
"tools": []map[string]any{
{
"name": "echo",
"description": "echoes input",
"inputSchema": map[string]any{
"type": "object",
"properties": map[string]any{},
},
},
},
},
}
default:
resp = map[string]any{
"jsonrpc": "2.0",
"id": req.ID,
"error": map[string]any{
"code": -32601,
"message": "method not found",
},
}
}
out, err := json.Marshal(resp)
if err != nil {
continue
}
_, _ = fmt.Fprintf(os.Stdout, "%s\n", out)
}
}