mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
fix: prevent stdio MCP server subprocess from dying after connect (#24035)
## Problem
MCP servers configured in `.mcp.json` with stdio transport are
discovered successfully (tools appear) but die immediately after
connection, making all tool calls fail.
## Root Cause
In `connectServer`, the subprocess is spawned with `connectCtx` — a
30-second timeout context whose `cancel()` is deferred:
```go
connectCtx, cancel := context.WithTimeout(ctx, connectTimeout)
defer cancel()
if err := c.Start(connectCtx); err != nil { ... }
```
The mcp-go stdio transport calls `exec.CommandContext(connectCtx, ...)`.
When `connectServer` returns, `cancel()` fires, and
`exec.CommandContext` sends SIGKILL to the subprocess. The process
immediately becomes a zombie.
Confirmed by checking `/proc/<pid>/status` after context cancellation:
```
State: Z (zombie)
```
## Fix
Pass the parent `ctx` (which is `a.gracefulCtx` — the agent's long-lived
context) to `c.Start()`. `connectCtx` continues to bound only the
`Initialize()` handshake. The subprocess is cleaned up when the Manager
is closed or the parent context is canceled.
## Regression Test
Added `TestConnectServer_StdioProcessSurvivesConnect` which:
- Spawns a real subprocess (re-execs the test binary as a fake MCP
server)
- Calls `connectServer` and lets it return (internal `connectCtx` gets
canceled)
- Verifies the subprocess is still alive by calling `ListTools`
The test **fails** on the old code with `transport error: context
deadline exceeded` and **passes** with the fix.
> Generated with [Coder Agents](https://coder.com/agents)
This commit is contained in:
@@ -187,7 +187,11 @@ func (*Manager) connectServer(ctx context.Context, cfg ServerConfig) (*client.Cl
|
||||
connectCtx, cancel := context.WithTimeout(ctx, connectTimeout)
|
||||
defer cancel()
|
||||
|
||||
if err := c.Start(connectCtx); err != nil {
|
||||
// Use the parent ctx (not connectCtx) so the subprocess outlives
|
||||
// the connect/initialize handshake. connectCtx bounds only the
|
||||
// Initialize call below. The subprocess is cleaned up when the
|
||||
// Manager is closed or ctx is canceled.
|
||||
if err := c.Start(ctx); err != nil {
|
||||
_ = c.Close()
|
||||
return nil, xerrors.Errorf("start %q: %w", cfg.Name, err)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
package agentmcp
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/mark3labs/mcp-go/mcp"
|
||||
@@ -8,6 +13,7 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/coder/coder/v2/codersdk/workspacesdk"
|
||||
"github.com/coder/coder/v2/testutil"
|
||||
)
|
||||
|
||||
func TestSplitToolName(t *testing.T) {
|
||||
@@ -193,3 +199,118 @@ func TestConvertResult(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestConnectServer_StdioProcessSurvivesConnect verifies that a stdio MCP
|
||||
// server subprocess remains alive after connectServer returns. This is a
|
||||
// regression test for a bug where the subprocess was tied to a short-lived
|
||||
// connectCtx and killed as soon as the context was canceled.
|
||||
func TestConnectServer_StdioProcessSurvivesConnect(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if os.Getenv("TEST_MCP_FAKE_SERVER") == "1" {
|
||||
// Child process: act as a minimal MCP server over stdio.
|
||||
runFakeMCPServer()
|
||||
return
|
||||
}
|
||||
|
||||
// Get the path to the test binary so we can re-exec ourselves
|
||||
// as a fake MCP server subprocess.
|
||||
testBin, err := os.Executable()
|
||||
require.NoError(t, err)
|
||||
|
||||
cfg := ServerConfig{
|
||||
Name: "fake",
|
||||
Transport: "stdio",
|
||||
Command: testBin,
|
||||
Args: []string{"-test.run=^TestConnectServer_StdioProcessSurvivesConnect$"},
|
||||
Env: map[string]string{"TEST_MCP_FAKE_SERVER": "1"},
|
||||
}
|
||||
|
||||
ctx := testutil.Context(t, testutil.WaitLong)
|
||||
m := &Manager{}
|
||||
client, err := m.connectServer(ctx, cfg)
|
||||
require.NoError(t, err, "connectServer should succeed")
|
||||
t.Cleanup(func() { _ = client.Close() })
|
||||
|
||||
// At this point connectServer has returned and its internal
|
||||
// connectCtx has been canceled. The subprocess must still be
|
||||
// alive. Verify by listing tools (requires a live server).
|
||||
listCtx, listCancel := context.WithTimeout(ctx, testutil.WaitShort)
|
||||
defer listCancel()
|
||||
result, err := client.ListTools(listCtx, mcp.ListToolsRequest{})
|
||||
require.NoError(t, err, "ListTools should succeed — server must be alive after connect")
|
||||
require.Len(t, result.Tools, 1)
|
||||
assert.Equal(t, "echo", result.Tools[0].Name)
|
||||
}
|
||||
|
||||
// runFakeMCPServer implements a minimal JSON-RPC / MCP server over
|
||||
// stdin/stdout, just enough for initialize + tools/list.
|
||||
func runFakeMCPServer() {
|
||||
scanner := bufio.NewScanner(os.Stdin)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Bytes()
|
||||
|
||||
var req struct {
|
||||
JSONRPC string `json:"jsonrpc"`
|
||||
ID json.RawMessage `json:"id"`
|
||||
Method string `json:"method"`
|
||||
}
|
||||
if err := json.Unmarshal(line, &req); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var resp any
|
||||
switch req.Method {
|
||||
case "initialize":
|
||||
resp = map[string]any{
|
||||
"jsonrpc": "2.0",
|
||||
"id": req.ID,
|
||||
"result": map[string]any{
|
||||
"protocolVersion": "2025-03-26",
|
||||
"capabilities": map[string]any{
|
||||
"tools": map[string]any{},
|
||||
},
|
||||
"serverInfo": map[string]any{
|
||||
"name": "fake-server",
|
||||
"version": "0.0.1",
|
||||
},
|
||||
},
|
||||
}
|
||||
case "notifications/initialized":
|
||||
// No response needed for notifications.
|
||||
continue
|
||||
case "tools/list":
|
||||
resp = map[string]any{
|
||||
"jsonrpc": "2.0",
|
||||
"id": req.ID,
|
||||
"result": map[string]any{
|
||||
"tools": []map[string]any{
|
||||
{
|
||||
"name": "echo",
|
||||
"description": "echoes input",
|
||||
"inputSchema": map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
default:
|
||||
resp = map[string]any{
|
||||
"jsonrpc": "2.0",
|
||||
"id": req.ID,
|
||||
"error": map[string]any{
|
||||
"code": -32601,
|
||||
"message": "method not found",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
out, err := json.Marshal(resp)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
_, _ = fmt.Fprintf(os.Stdout, "%s\n", out)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user