Files
coder/coderd/chatd/chatloop/chatloop_test.go
T
Kyle Carberry f35b99a4fa fix(chatd): preserve context.Canceled in persistStep during shutdown (#22890)
## Problem

When a chat worker shuts down gracefully (e.g. Kubernetes pod SIGTERM)
while a tool is executing (like `wait_agent` polling for a subagent),
the chat gets stuck in `waiting` status forever — no other worker will
pick it up.

### Root Cause

`persistStep` in `chatd.go` unconditionally returned
`chatloop.ErrInterrupted` for **any** canceled context:

```go
if persistCtx.Err() != nil {
    return chatloop.ErrInterrupted  // BUG: doesn't check WHY the context was canceled
}
```

During shutdown, the context cause is `context.Canceled` (not
`ErrInterrupted`). But because `persistStep` returned `ErrInterrupted`,
the error handling in `processChat` hit the `ErrInterrupted` check first
(line 2011) and set status to `waiting` — the `isShutdownCancellation`
check (line 2017) was never reached:

```go
// Checked FIRST — matches because persistStep returned ErrInterrupted
if errors.Is(err, chatloop.ErrInterrupted) {
    status = database.ChatStatusWaiting  // Stuck forever
    return
}
// NEVER REACHED during shutdown
if isShutdownCancellation(ctx, chatCtx, err) {
    status = database.ChatStatusPending  // Would have been correct
    return
}
```

### Trigger scenario (from production logs)

1. Chat spawns a subagent via `spawn_agent`, then calls `wait_agent`
2. `wait_agent` blocks in `awaitSubagentCompletion` polling loop
3. Worker pod receives SIGTERM → `Close()` cancels server context
4. Context cancellation propagates to `awaitSubagentCompletion` →
returns `context.Canceled`
5. Tool execution completes, `persistStep` is called with canceled
context
6. `persistStep` returns `ErrInterrupted` (wrong!) → status set to
`waiting` (stuck!)

## Fix

Check `context.Cause()` before deciding which error to return:

```go
if persistCtx.Err() != nil {
    if errors.Is(context.Cause(persistCtx), chatloop.ErrInterrupted) {
        return chatloop.ErrInterrupted  // Intentional interruption
    }
    return persistCtx.Err()  // Shutdown → context.Canceled
}
```

This preserves `context.Canceled` for shutdown, allowing
`isShutdownCancellation` to match and set status to `pending` so another
worker retries the chat.

## Test

Added `TestRun_ShutdownDuringToolExecutionReturnsContextCanceled` which:
1. Streams a tool call to a blocking tool (simulating `wait_agent`)
2. Cancels the server context (simulating shutdown) while the tool
blocks
3. Verifies `Run` returns `context.Canceled`, NOT `ErrInterrupted`
2026-03-10 13:01:45 +00:00

515 lines
15 KiB
Go

package chatloop //nolint:testpackage // Uses internal symbols.
import (
"context"
"errors"
"iter"
"strings"
"sync"
"testing"
"charm.land/fantasy"
fantasyanthropic "charm.land/fantasy/providers/anthropic"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
)
const activeToolName = "read_file"
func TestRun_ActiveToolsPrepareBehavior(t *testing.T) {
t.Parallel()
var capturedCall fantasy.Call
model := &loopTestModel{
provider: fantasyanthropic.Name,
streamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
capturedCall = call
return streamFromParts([]fantasy.StreamPart{
{Type: fantasy.StreamPartTypeTextStart, ID: "text-1"},
{Type: fantasy.StreamPartTypeTextDelta, ID: "text-1", Delta: "done"},
{Type: fantasy.StreamPartTypeTextEnd, ID: "text-1"},
{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonStop},
}), nil
},
}
persistStepCalls := 0
var persistedStep PersistedStep
err := Run(context.Background(), RunOptions{
Model: model,
Messages: []fantasy.Message{
textMessage(fantasy.MessageRoleSystem, "sys-1"),
textMessage(fantasy.MessageRoleSystem, "sys-2"),
textMessage(fantasy.MessageRoleUser, "hello"),
textMessage(fantasy.MessageRoleAssistant, "working"),
textMessage(fantasy.MessageRoleUser, "continue"),
},
Tools: []fantasy.AgentTool{
newNoopTool(activeToolName),
newNoopTool("write_file"),
},
MaxSteps: 3,
ActiveTools: []string{activeToolName},
ContextLimitFallback: 4096,
PersistStep: func(_ context.Context, step PersistedStep) error {
persistStepCalls++
persistedStep = step
return nil
},
})
require.NoError(t, err)
require.Equal(t, 1, persistStepCalls)
require.True(t, persistedStep.ContextLimit.Valid)
require.Equal(t, int64(4096), persistedStep.ContextLimit.Int64)
require.NotEmpty(t, capturedCall.Prompt)
require.False(t, containsPromptSentinel(capturedCall.Prompt))
require.Len(t, capturedCall.Tools, 1)
require.Equal(t, activeToolName, capturedCall.Tools[0].GetName())
require.Len(t, capturedCall.Prompt, 5)
require.False(t, hasAnthropicEphemeralCacheControl(capturedCall.Prompt[0]))
require.True(t, hasAnthropicEphemeralCacheControl(capturedCall.Prompt[1]))
require.False(t, hasAnthropicEphemeralCacheControl(capturedCall.Prompt[2]))
require.True(t, hasAnthropicEphemeralCacheControl(capturedCall.Prompt[3]))
require.True(t, hasAnthropicEphemeralCacheControl(capturedCall.Prompt[4]))
}
func TestRun_InterruptedStepPersistsSyntheticToolResult(t *testing.T) {
t.Parallel()
started := make(chan struct{})
model := &loopTestModel{
provider: "fake",
streamFn: func(ctx context.Context, _ fantasy.Call) (fantasy.StreamResponse, error) {
return iter.Seq[fantasy.StreamPart](func(yield func(fantasy.StreamPart) bool) {
parts := []fantasy.StreamPart{
{
Type: fantasy.StreamPartTypeToolInputStart,
ID: "interrupt-tool-1",
ToolCallName: "read_file",
},
{
Type: fantasy.StreamPartTypeToolInputDelta,
ID: "interrupt-tool-1",
ToolCallName: "read_file",
Delta: `{"path":"main.go"`,
},
{Type: fantasy.StreamPartTypeTextStart, ID: "text-1"},
{Type: fantasy.StreamPartTypeTextDelta, ID: "text-1", Delta: "partial assistant output"},
}
for _, part := range parts {
if !yield(part) {
return
}
}
select {
case <-started:
default:
close(started)
}
<-ctx.Done()
_ = yield(fantasy.StreamPart{
Type: fantasy.StreamPartTypeError,
Error: ctx.Err(),
})
}), nil
},
}
ctx, cancel := context.WithCancelCause(context.Background())
defer cancel(nil)
go func() {
<-started
cancel(ErrInterrupted)
}()
persistedAssistantCtxErr := xerrors.New("unset")
var persistedContent []fantasy.Content
err := Run(ctx, RunOptions{
Model: model,
Messages: []fantasy.Message{
textMessage(fantasy.MessageRoleUser, "hello"),
},
Tools: []fantasy.AgentTool{
newNoopTool("read_file"),
},
MaxSteps: 3,
PersistStep: func(persistCtx context.Context, step PersistedStep) error {
persistedAssistantCtxErr = persistCtx.Err()
persistedContent = append([]fantasy.Content(nil), step.Content...)
return nil
},
})
require.ErrorIs(t, err, ErrInterrupted)
require.NoError(t, persistedAssistantCtxErr)
require.NotEmpty(t, persistedContent)
var (
foundText bool
foundToolCall bool
foundToolResult bool
)
for _, block := range persistedContent {
if text, ok := fantasy.AsContentType[fantasy.TextContent](block); ok {
if strings.Contains(text.Text, "partial assistant output") {
foundText = true
}
continue
}
if toolCall, ok := fantasy.AsContentType[fantasy.ToolCallContent](block); ok {
if toolCall.ToolCallID == "interrupt-tool-1" &&
toolCall.ToolName == "read_file" &&
strings.Contains(toolCall.Input, `"path":"main.go"`) {
foundToolCall = true
}
continue
}
if toolResult, ok := fantasy.AsContentType[fantasy.ToolResultContent](block); ok {
if toolResult.ToolCallID == "interrupt-tool-1" &&
toolResult.ToolName == "read_file" {
_, isErr := toolResult.Result.(fantasy.ToolResultOutputContentError)
require.True(t, isErr, "interrupted tool result should be an error")
foundToolResult = true
}
}
}
require.True(t, foundText)
require.True(t, foundToolCall)
require.True(t, foundToolResult)
}
type loopTestModel struct {
provider string
model string
generateFn func(context.Context, fantasy.Call) (*fantasy.Response, error)
streamFn func(context.Context, fantasy.Call) (fantasy.StreamResponse, error)
}
func (m *loopTestModel) Provider() string {
if m.provider != "" {
return m.provider
}
return "fake"
}
func (m *loopTestModel) Model() string {
if m.model != "" {
return m.model
}
return "fake"
}
func (m *loopTestModel) Generate(ctx context.Context, call fantasy.Call) (*fantasy.Response, error) {
if m.generateFn != nil {
return m.generateFn(ctx, call)
}
return &fantasy.Response{}, nil
}
func (m *loopTestModel) Stream(ctx context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
if m.streamFn != nil {
return m.streamFn(ctx, call)
}
return streamFromParts([]fantasy.StreamPart{{
Type: fantasy.StreamPartTypeFinish,
FinishReason: fantasy.FinishReasonStop,
}}), nil
}
func (*loopTestModel) GenerateObject(context.Context, fantasy.ObjectCall) (*fantasy.ObjectResponse, error) {
return nil, xerrors.New("not implemented")
}
func (*loopTestModel) StreamObject(context.Context, fantasy.ObjectCall) (fantasy.ObjectStreamResponse, error) {
return nil, xerrors.New("not implemented")
}
func streamFromParts(parts []fantasy.StreamPart) fantasy.StreamResponse {
return iter.Seq[fantasy.StreamPart](func(yield func(fantasy.StreamPart) bool) {
for _, part := range parts {
if !yield(part) {
return
}
}
})
}
func newNoopTool(name string) fantasy.AgentTool {
return fantasy.NewAgentTool(
name,
"test noop tool",
func(context.Context, struct{}, fantasy.ToolCall) (fantasy.ToolResponse, error) {
return fantasy.ToolResponse{}, nil
},
)
}
func textMessage(role fantasy.MessageRole, text string) fantasy.Message {
return fantasy.Message{
Role: role,
Content: []fantasy.MessagePart{
fantasy.TextPart{Text: text},
},
}
}
func containsPromptSentinel(prompt []fantasy.Message) bool {
for _, message := range prompt {
if message.Role != fantasy.MessageRoleUser || len(message.Content) != 1 {
continue
}
textPart, ok := fantasy.AsMessagePart[fantasy.TextPart](message.Content[0])
if !ok {
continue
}
if strings.HasPrefix(textPart.Text, "__chatd_agent_prompt_sentinel_") {
return true
}
}
return false
}
func TestRun_MultiStepToolExecution(t *testing.T) {
t.Parallel()
var mu sync.Mutex
var streamCalls int
var secondCallPrompt []fantasy.Message
model := &loopTestModel{
provider: "fake",
streamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
mu.Lock()
step := streamCalls
streamCalls++
mu.Unlock()
switch step {
case 0:
// Step 0: produce a tool call.
return streamFromParts([]fantasy.StreamPart{
{Type: fantasy.StreamPartTypeToolInputStart, ID: "tc-1", ToolCallName: "read_file"},
{Type: fantasy.StreamPartTypeToolInputDelta, ID: "tc-1", Delta: `{"path":"main.go"}`},
{Type: fantasy.StreamPartTypeToolInputEnd, ID: "tc-1"},
{
Type: fantasy.StreamPartTypeToolCall,
ID: "tc-1",
ToolCallName: "read_file",
ToolCallInput: `{"path":"main.go"}`,
},
{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonToolCalls},
}), nil
default:
// Step 1: capture the prompt the loop sent us,
// then return plain text.
mu.Lock()
secondCallPrompt = append([]fantasy.Message(nil), call.Prompt...)
mu.Unlock()
return streamFromParts([]fantasy.StreamPart{
{Type: fantasy.StreamPartTypeTextStart, ID: "text-1"},
{Type: fantasy.StreamPartTypeTextDelta, ID: "text-1", Delta: "all done"},
{Type: fantasy.StreamPartTypeTextEnd, ID: "text-1"},
{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonStop},
}), nil
}
},
}
var persistStepCalls int
err := Run(context.Background(), RunOptions{
Model: model,
Messages: []fantasy.Message{
textMessage(fantasy.MessageRoleUser, "please read main.go"),
},
Tools: []fantasy.AgentTool{
newNoopTool("read_file"),
},
MaxSteps: 5,
PersistStep: func(_ context.Context, _ PersistedStep) error {
persistStepCalls++
return nil
},
})
require.NoError(t, err)
// Stream was called twice: once for the tool-call step,
// once for the follow-up text step.
require.Equal(t, 2, streamCalls)
// PersistStep is called once per step.
require.Equal(t, 2, persistStepCalls)
// The second call's prompt must contain the assistant message
// from step 0 (with the tool call) and a tool-result message.
require.NotEmpty(t, secondCallPrompt)
var foundAssistantToolCall bool
var foundToolResult bool
for _, msg := range secondCallPrompt {
if msg.Role == fantasy.MessageRoleAssistant {
for _, part := range msg.Content {
if tc, ok := fantasy.AsMessagePart[fantasy.ToolCallPart](part); ok {
if tc.ToolCallID == "tc-1" && tc.ToolName == "read_file" {
foundAssistantToolCall = true
}
}
}
}
if msg.Role == fantasy.MessageRoleTool {
for _, part := range msg.Content {
if tr, ok := fantasy.AsMessagePart[fantasy.ToolResultPart](part); ok {
if tr.ToolCallID == "tc-1" {
foundToolResult = true
}
}
}
}
}
require.True(t, foundAssistantToolCall, "second call prompt should contain assistant tool call from step 0")
require.True(t, foundToolResult, "second call prompt should contain tool result message")
}
func TestRun_PersistStepErrorPropagates(t *testing.T) {
t.Parallel()
model := &loopTestModel{
provider: "fake",
streamFn: func(_ context.Context, _ fantasy.Call) (fantasy.StreamResponse, error) {
return streamFromParts([]fantasy.StreamPart{
{Type: fantasy.StreamPartTypeTextStart, ID: "text-1"},
{Type: fantasy.StreamPartTypeTextDelta, ID: "text-1", Delta: "hello"},
{Type: fantasy.StreamPartTypeTextEnd, ID: "text-1"},
{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonStop},
}), nil
},
}
persistErr := xerrors.New("database write failed")
err := Run(context.Background(), RunOptions{
Model: model,
Messages: []fantasy.Message{
textMessage(fantasy.MessageRoleUser, "hello"),
},
MaxSteps: 1,
PersistStep: func(_ context.Context, _ PersistedStep) error {
return persistErr
},
})
require.Error(t, err)
require.ErrorContains(t, err, "database write failed")
}
// TestRun_ShutdownDuringToolExecutionReturnsContextCanceled verifies that
// when the parent context is canceled (simulating server shutdown) while
// a tool is blocked, Run returns context.Canceled — not ErrInterrupted.
// This matters because the caller uses the error type to decide whether
// to set chat status to "pending" (retryable on another worker) vs
// "waiting" (stuck forever).
func TestRun_ShutdownDuringToolExecutionReturnsContextCanceled(t *testing.T) {
t.Parallel()
toolStarted := make(chan struct{})
// Model returns a single tool call, then finishes.
model := &loopTestModel{
provider: "fake",
streamFn: func(_ context.Context, _ fantasy.Call) (fantasy.StreamResponse, error) {
return streamFromParts([]fantasy.StreamPart{
{Type: fantasy.StreamPartTypeToolInputStart, ID: "tc-block", ToolCallName: "blocking_tool"},
{Type: fantasy.StreamPartTypeToolInputDelta, ID: "tc-block", Delta: `{}`},
{Type: fantasy.StreamPartTypeToolInputEnd, ID: "tc-block"},
{
Type: fantasy.StreamPartTypeToolCall,
ID: "tc-block",
ToolCallName: "blocking_tool",
ToolCallInput: `{}`,
},
{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonToolCalls},
}), nil
},
}
// Tool that blocks until its context is canceled, simulating
// a long-running operation like wait_agent.
blockingTool := fantasy.NewAgentTool(
"blocking_tool",
"blocks until context canceled",
func(ctx context.Context, _ struct{}, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
close(toolStarted)
<-ctx.Done()
return fantasy.ToolResponse{}, ctx.Err()
},
)
// Simulate the server context (parent) and chat context
// (child). Canceling the parent simulates graceful shutdown.
serverCtx, serverCancel := context.WithCancel(context.Background())
defer serverCancel()
serverCancelDone := make(chan struct{})
go func() {
defer close(serverCancelDone)
<-toolStarted
t.Logf("tool started, canceling server context to simulate shutdown")
serverCancel()
}()
// persistStep mirrors the FIXED chatd.go code: it only returns
// ErrInterrupted when the context was actually canceled due to
// an interruption (cause is ErrInterrupted). For shutdown
// (plain context.Canceled), it returns the original error so
// callers can distinguish the two.
persistStep := func(persistCtx context.Context, _ PersistedStep) error {
if persistCtx.Err() != nil {
if errors.Is(context.Cause(persistCtx), ErrInterrupted) {
return ErrInterrupted
}
return persistCtx.Err()
}
return nil
}
err := Run(serverCtx, RunOptions{
Model: model,
Messages: []fantasy.Message{
textMessage(fantasy.MessageRoleUser, "run the blocking tool"),
},
Tools: []fantasy.AgentTool{blockingTool},
MaxSteps: 3,
PersistStep: persistStep,
})
// Wait for the cancel goroutine to finish to aid flake
// diagnosis if the test ever hangs.
<-serverCancelDone
require.Error(t, err)
// The error must NOT be ErrInterrupted — it should propagate
// as context.Canceled so the caller can distinguish shutdown
// from user interruption. Use assert (not require) so both
// checks are evaluated even if the first fails.
assert.NotErrorIs(t, err, ErrInterrupted, "shutdown cancellation must not be converted to ErrInterrupted")
assert.ErrorIs(t, err, context.Canceled, "shutdown should propagate as context.Canceled")
}
func hasAnthropicEphemeralCacheControl(message fantasy.Message) bool {
if len(message.ProviderOptions) == 0 {
return false
}
options, ok := message.ProviderOptions[fantasyanthropic.Name]
if !ok {
return false
}
cacheOptions, ok := options.(*fantasyanthropic.ProviderCacheControlOptions)
return ok && cacheOptions.CacheControl.Type == "ephemeral"
}