Files
coder/coderd/x/chatd/chatloop/chatloop_internal_test.go
T
Cian Johnston e8508b2d90 fix: recover chatd from poisoned chain anchor on retry (#25097)
When OpenAI's Responses API returns `Previous response with id ... not
found` for a chained turn, classify it as a `ChainBroken` retry, clear
`previous_response_id`, exit chain mode, reload full history, and let
`chatretry` retry. Self-heals chats whose anchor was poisoned before
#25074 stopped truncated streams from being persisted as a successful
turn with a stored response id.

The new state is exposed via the existing
`coderd_chatd_stream_retries_total` counter as a
`chain_broken="true"|"false"` label. Aggregating queries (`sum`, `rate`
over `provider`/`model`/`kind`) keep working without changes; raw-series
matchers without aggregation will now see two series per `(provider,
model, kind)` where they previously saw one. The metric is internal-only
so the blast radius should be small, but if you have dashboards that
index by exact label matchers without aggregation they will need an
extra `sum` or an explicit `chain_broken` selector.

> 🤖 This PR was created with the help of Coder Agents, and was reviewed by a human 🧑‍💻
2026-05-11 17:43:40 +01:00

543 lines
17 KiB
Go

package chatloop
import (
"context"
"iter"
"sync"
"testing"
"charm.land/fantasy"
fantasyanthropic "charm.land/fantasy/providers/anthropic"
fantasyopenai "charm.land/fantasy/providers/openai"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/x/chatd/chatopenai"
"github.com/coder/coder/v2/coderd/x/chatd/chattest"
)
// TestRun_ChainBrokenRecovers covers the main chain-broken recovery path:
// the first stream attempt fails with the chain-broken error, and the retry
// must be sent with the reloaded history and without previous_response_id.
func TestRun_ChainBrokenRecovers(t *testing.T) {
	t.Parallel()

	// Given: a chain-mode run whose previous provider_response_id is present
	// in our database but no longer recognized by the provider for some
	// reason.
	var (
		streamCalls   int
		secondCallOpt fantasy.ProviderOptions
		secondPrompt  []fantasy.Message
	)
	model := &chattest.FakeModel{
		// Use the provider package's name constant, the same key
		// chainModeProviderOptions stores its options under.
		ProviderName: fantasyopenai.Name,
		StreamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
			streamCalls++
			switch streamCalls {
			case 1:
				// First attempt: the provider rejects the chain anchor.
				return nil, xerrors.New(chainBrokenErrorMessage)
			default:
				// Retry: record what Run sent so it can be asserted below.
				secondCallOpt = call.ProviderOptions
				secondPrompt = call.Prompt
				return finishingStream(), nil
			}
		},
	}
	disableCalls := 0
	reloadCalls := 0
	// Full conversation returned by ReloadMessages.
	reloadedHistory := []fantasy.Message{
		{Role: fantasy.MessageRoleSystem, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "sys"}}},
		{Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "hello"}}},
		{Role: fantasy.MessageRoleAssistant, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "hi"}}},
		{Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "follow up"}}},
	}
	// Shortened prompt the run starts with while chain mode is active.
	chainFiltered := []fantasy.Message{
		{Role: fantasy.MessageRoleSystem, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "sys"}}},
		{Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "follow up"}}},
	}

	// When: the first attempt fails with the chain-broken error.
	err := Run(context.Background(), RunOptions{
		Model:                model,
		MaxSteps:             1,
		ContextLimitFallback: 4096,
		Messages:             chainFiltered,
		ProviderOptions:      chainModeProviderOptions("resp_poisoned"),
		PersistStep: func(_ context.Context, _ PersistedStep) error {
			return nil
		},
		DisableChainMode: func() {
			disableCalls++
		},
		ReloadMessages: func(_ context.Context) ([]fantasy.Message, error) {
			reloadCalls++
			return reloadedHistory, nil
		},
	})

	// Then: DisableChainMode and ReloadMessages each run once and the
	// retry attempt sends the full reloaded history without
	// previous_response_id.
	require.NoError(t, err)
	require.Equal(t, 2, streamCalls, "exactly two stream attempts (one failure, one success)")
	require.Equal(t, 1, disableCalls, "DisableChainMode called once on chain-broken recovery")
	require.Equal(t, 1, reloadCalls, "ReloadMessages called once on chain-broken recovery")
	require.False(t,
		chatopenai.HasPreviousResponseID(secondCallOpt),
		"second attempt must not carry previous_response_id; it was poisoned",
	)
	require.Equal(t, reloadedHistory, secondPrompt,
		"second attempt must use full reloaded history, not chain-filtered prompt",
	)
}
// TestRun_ChainBrokenRecoveryPreparesReloadedMessages asserts that, after a
// chain-broken failure, the history returned by ReloadMessages goes through
// PrepareMessages before the retry is sent.
func TestRun_ChainBrokenRecoveryPreparesReloadedMessages(t *testing.T) {
	t.Parallel()

	var (
		attempts    int
		prepared    int
		retryOpts   fantasy.ProviderOptions
		retryPrompt []fantasy.Message
	)

	// The first attempt reports a broken chain; any later attempt records
	// the call it received and finishes normally.
	streamFn := func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
		attempts++
		if attempts == 1 {
			return nil, xerrors.New(chainBrokenErrorMessage)
		}
		retryOpts = call.ProviderOptions
		retryPrompt = call.Prompt
		return finishingStream(), nil
	}
	fullHistory := []fantasy.Message{
		textMessage(fantasy.MessageRoleUser, "full history"),
	}
	startingPrompt := []fantasy.Message{
		textMessage(fantasy.MessageRoleUser, "chain-filtered"),
	}

	runOpts := RunOptions{
		Model: &chattest.FakeModel{
			ProviderName: "openai",
			StreamFn:     streamFn,
		},
		MaxSteps:             1,
		ContextLimitFallback: 4096,
		Messages:             startingPrompt,
		ProviderOptions:      chainModeProviderOptions("resp_poisoned"),
		PersistStep: func(_ context.Context, _ PersistedStep) error {
			return nil
		},
		DisableChainMode: func() {},
		ReloadMessages: func(_ context.Context) ([]fantasy.Message, error) {
			return fullHistory, nil
		},
		// Count invocations and tag the prompt with a marker message so
		// the retry prompt shows whether preparation was applied.
		PrepareMessages: func(msgs []fantasy.Message) []fantasy.Message {
			prepared++
			return append(msgs, textMessage(fantasy.MessageRoleSystem, "prepared"))
		},
	}
	err := Run(context.Background(), runOpts)

	require.NoError(t, err)
	require.Equal(t, 2, attempts)
	require.Equal(t, 2, prepared,
		"reloaded history must be prepared before the retry")
	require.False(t, chatopenai.HasPreviousResponseID(retryOpts))
	requireTextPrompt(t, retryPrompt, "full history")
	requireTextPrompt(t, retryPrompt, "prepared")
}
// TestRun_ChainBrokenRecoveryAppliesProviderPromptPrep runs chain-broken
// recovery against a fake model that reports the Anthropic provider name and
// checks which messages of the retried prompt carry the Anthropic ephemeral
// cache-control marker.
func TestRun_ChainBrokenRecoveryAppliesProviderPromptPrep(t *testing.T) {
	t.Parallel()

	var (
		attempts    int
		retryOpts   fantasy.ProviderOptions
		retryPrompt []fantasy.Message
	)

	// Fail the first attempt with the chain-broken error, then capture the
	// retry's options and prompt.
	streamFn := func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
		attempts++
		if attempts == 1 {
			return nil, xerrors.New(chainBrokenErrorMessage)
		}
		retryOpts = call.ProviderOptions
		retryPrompt = call.Prompt
		return finishingStream(), nil
	}

	fullHistory := []fantasy.Message{
		textMessage(fantasy.MessageRoleSystem, "sys-1"),
		textMessage(fantasy.MessageRoleSystem, "sys-2"),
		textMessage(fantasy.MessageRoleUser, "hello"),
		textMessage(fantasy.MessageRoleAssistant, "hi"),
		textMessage(fantasy.MessageRoleUser, "follow up"),
	}
	startingPrompt := []fantasy.Message{
		textMessage(fantasy.MessageRoleSystem, "sys-2"),
		textMessage(fantasy.MessageRoleUser, "follow up"),
	}

	err := Run(context.Background(), RunOptions{
		Model: &chattest.FakeModel{
			ProviderName: fantasyanthropic.Name,
			StreamFn:     streamFn,
		},
		MaxSteps:             1,
		ContextLimitFallback: 4096,
		Messages:             startingPrompt,
		ProviderOptions:      chainModeProviderOptions("resp_poisoned"),
		PersistStep: func(_ context.Context, _ PersistedStep) error {
			return nil
		},
		DisableChainMode: func() {},
		ReloadMessages: func(_ context.Context) ([]fantasy.Message, error) {
			return fullHistory, nil
		},
	})

	require.NoError(t, err)
	require.Equal(t, 2, attempts)
	require.False(t, chatopenai.HasPreviousResponseID(retryOpts))
	require.Len(t, retryPrompt, 5)

	// markedAt reports whether the retry prompt message at idx carries the
	// Anthropic ephemeral cache-control marker.
	markedAt := func(idx int) bool {
		return hasAnthropicEphemeralCacheControl(retryPrompt[idx])
	}
	// Expected markers: second system message and the final two messages.
	// NOTE(review): presumably these are the provider's cache breakpoints;
	// confirm against the Anthropic prompt preparation code.
	require.False(t, markedAt(0))
	require.True(t, markedAt(1))
	require.False(t, markedAt(2))
	require.True(t, markedAt(3))
	require.True(t, markedAt(4))
}
// TestRun_ChainBrokenReloadWithoutDisableChainModeIsExplicit checks that
// chain-broken recovery still reloads and prepares the history when the
// caller supplies ReloadMessages but leaves DisableChainMode nil.
func TestRun_ChainBrokenReloadWithoutDisableChainModeIsExplicit(t *testing.T) {
	t.Parallel()

	var (
		attempts    int
		prepared    int
		reloads     int
		retryOpts   fantasy.ProviderOptions
		retryPrompt []fantasy.Message
	)

	// Attempt one fails with the chain-broken error; the retry is recorded
	// and completes.
	streamFn := func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
		attempts++
		if attempts == 1 {
			return nil, xerrors.New(chainBrokenErrorMessage)
		}
		retryOpts = call.ProviderOptions
		retryPrompt = call.Prompt
		return finishingStream(), nil
	}

	runOpts := RunOptions{
		Model: &chattest.FakeModel{
			ProviderName: "openai",
			StreamFn:     streamFn,
		},
		MaxSteps:             1,
		ContextLimitFallback: 4096,
		Messages: []fantasy.Message{
			textMessage(fantasy.MessageRoleUser, "chain-filtered"),
		},
		ProviderOptions: chainModeProviderOptions("resp_poisoned"),
		PersistStep: func(_ context.Context, _ PersistedStep) error {
			return nil
		},
		ReloadMessages: func(_ context.Context) ([]fantasy.Message, error) {
			reloads++
			history := []fantasy.Message{
				textMessage(fantasy.MessageRoleUser, "full history"),
			}
			return history, nil
		},
		PrepareMessages: func(msgs []fantasy.Message) []fantasy.Message {
			prepared++
			return append(msgs, textMessage(fantasy.MessageRoleSystem, "prepared"))
		},
		// DisableChainMode is intentionally nil. This covers callers
		// whose ReloadMessages does not depend on chain-mode state.
	}
	err := Run(context.Background(), runOpts)

	require.NoError(t, err)
	require.Equal(t, 2, attempts)
	require.Equal(t, 1, reloads)
	require.Equal(t, 2, prepared)
	require.False(t, chatopenai.HasPreviousResponseID(retryOpts))
	requireTextPrompt(t, retryPrompt, "full history")
	requireTextPrompt(t, retryPrompt, "prepared")
}
// TestRun_ChainBrokenComposesWithPostStepChainExit verifies that once
// chain-broken recovery has run, no later stream call in the same run,
// including a follow-up step after a tool call, carries previous_response_id.
func TestRun_ChainBrokenComposesWithPostStepChainExit(t *testing.T) {
	t.Parallel()

	// Given a chain-mode run whose recovery succeeds and yields a
	// tool call so the step loop continues
	var (
		mu           sync.Mutex
		streamCalls  int
		capturedOpts []fantasy.ProviderOptions
	)

	// recordCall stores the provider options of a stream call under the
	// mutex and returns the 1-based attempt number of that call.
	recordCall := func(opts fantasy.ProviderOptions) int {
		mu.Lock()
		defer mu.Unlock()
		streamCalls++
		capturedOpts = append(capturedOpts, opts)
		return streamCalls
	}

	// Parts of a complete read_file tool call followed by a Finish part
	// with the tool-calls finish reason.
	toolCallParts := []fantasy.StreamPart{
		{Type: fantasy.StreamPartTypeToolInputStart, ID: "tc-1", ToolCallName: "read_file"},
		{Type: fantasy.StreamPartTypeToolInputDelta, ID: "tc-1", Delta: `{"path":"main.go"}`},
		{Type: fantasy.StreamPartTypeToolInputEnd, ID: "tc-1"},
		{
			Type:          fantasy.StreamPartTypeToolCall,
			ID:            "tc-1",
			ToolCallName:  "read_file",
			ToolCallInput: `{"path":"main.go"}`,
		},
		{Type: fantasy.StreamPartTypeFinish, FinishReason: fantasy.FinishReasonToolCalls},
	}

	model := &chattest.FakeModel{
		ProviderName: "openai",
		StreamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
			attempt := recordCall(call.ProviderOptions)
			if attempt == 1 {
				// Initial chained attempt: 404 from provider.
				return nil, xerrors.New(chainBrokenErrorMessage)
			}
			if attempt == 2 {
				// Recovery succeeded; emit a tool call so the
				// step loop continues to a second step.
				return streamFromParts(toolCallParts), nil
			}
			// Follow-up step: end the run.
			return finishingStream(), nil
		},
	}

	// When the second step builds its call from opts.ProviderOptions
	err := Run(context.Background(), RunOptions{
		Model:                model,
		MaxSteps:             3,
		ContextLimitFallback: 4096,
		Messages: []fantasy.Message{
			textMessage(fantasy.MessageRoleUser, "hi"),
		},
		Tools: []fantasy.AgentTool{
			newNoopTool("read_file"),
		},
		ProviderOptions: chainModeProviderOptions("resp_poisoned"),
		PersistStep: func(_ context.Context, _ PersistedStep) error {
			return nil
		},
		DisableChainMode: func() {},
		ReloadMessages: func(_ context.Context) ([]fantasy.Message, error) {
			history := []fantasy.Message{
				textMessage(fantasy.MessageRoleUser, "hi"),
			}
			return history, nil
		},
	})

	// Then it must not re-send the poisoned previous_response_id
	// because chain-broken recovery cleared both the current call and
	// subsequent step options.
	require.NoError(t, err)
	require.Equal(t, 3, streamCalls,
		"expected three stream calls: chain-broken failure, recovered tool-call step, follow-up step")
	// Index 0 is the failed chained attempt; every call after it is checked.
	for idx := 1; idx < len(capturedOpts); idx++ {
		require.False(t,
			chatopenai.HasPreviousResponseID(capturedOpts[idx]),
			"every stream call after recovery (index %d) must have cleared previous_response_id",
			idx,
		)
	}
}
// TestRun_ChainBrokenReloadFailureStillClearsChain covers chain-broken
// recovery when ReloadMessages itself fails: the retry must still drop
// previous_response_id and call DisableChainMode, and it is sent with the
// original chain-filtered prompt.
func TestRun_ChainBrokenReloadFailureStillClearsChain(t *testing.T) {
	t.Parallel()

	// Given: a chain-mode run whose ReloadMessages callback errors.
	var (
		streamCalls   int
		prepareCalls  int
		secondCallOpt fantasy.ProviderOptions
		secondPrompt  []fantasy.Message
	)
	model := &chattest.FakeModel{
		// Use the provider package's name constant, the same key
		// chainModeProviderOptions stores its options under.
		ProviderName: fantasyopenai.Name,
		StreamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
			streamCalls++
			switch streamCalls {
			case 1:
				// First attempt: the provider rejects the chain anchor.
				return nil, xerrors.New(chainBrokenErrorMessage)
			default:
				// Retry: record what Run sent so it can be asserted below.
				secondCallOpt = call.ProviderOptions
				secondPrompt = call.Prompt
				return finishingStream(), nil
			}
		},
	}
	disableCalls := 0
	chainFiltered := []fantasy.Message{
		{Role: fantasy.MessageRoleSystem, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "sys"}}},
		{Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "follow up"}}},
	}

	// When: the chain-broken error fires.
	err := Run(context.Background(), RunOptions{
		Model:                model,
		MaxSteps:             1,
		ContextLimitFallback: 4096,
		Messages:             chainFiltered,
		ProviderOptions:      chainModeProviderOptions("resp_poisoned"),
		PersistStep: func(_ context.Context, _ PersistedStep) error {
			return nil
		},
		DisableChainMode: func() {
			disableCalls++
		},
		ReloadMessages: func(_ context.Context) ([]fantasy.Message, error) {
			return nil, xerrors.New("reload exploded")
		},
		PrepareMessages: func(msgs []fantasy.Message) []fantasy.Message {
			prepareCalls++
			return append(msgs, textMessage(fantasy.MessageRoleSystem, "prepared"))
		},
	})

	// Then: the poisoned previous_response_id is still cleared and
	// DisableChainMode still runs, so the retry has any chance of
	// succeeding against the chain-filtered prompt.
	require.NoError(t, err)
	// Pin the attempt count like the sibling tests do, so an extra retry
	// caused by the failed reload cannot go unnoticed.
	require.Equal(t, 2, streamCalls, "exactly two stream attempts (one failure, one success)")
	require.Equal(t, 1, disableCalls)
	require.Equal(t, 1, prepareCalls)
	require.False(t,
		chatopenai.HasPreviousResponseID(secondCallOpt),
		"chain options must still be cleared even when reload fails",
	)
	requireTextPrompt(t, secondPrompt, "follow up")
	requireTextPrompt(t, secondPrompt, "prepared")
}
func TestRun_ChainBrokenWithoutChainModeIsSafe(t *testing.T) {
t.Parallel()
// Given: a run with no chain-mode options or callbacks
var streamCalls int
model := &chattest.FakeModel{
ProviderName: "openai",
StreamFn: func(_ context.Context, _ fantasy.Call) (fantasy.StreamResponse, error) {
streamCalls++
switch streamCalls {
case 1:
return nil, xerrors.New(chainBrokenErrorMessage)
default:
return finishingStream(), nil
}
},
}
// When: a future provider returns a chain-broken signal,
err := Run(context.Background(), RunOptions{
Model: model,
MaxSteps: 1,
ContextLimitFallback: 4096,
PersistStep: func(_ context.Context, _ PersistedStep) error {
return nil
},
// No ProviderOptions, no DisableChainMode, no ReloadMessages.
})
// Then: the recovery branch must no-op (no panic, no missing
// callbacks) and the retry runs normally.
require.NoError(t, err)
require.Equal(t, 2, streamCalls)
}
// TestRun_NonChainBrokenRetryDoesNotTouchChainState is the negative case for
// chain-broken recovery: a retry caused by a different error (a 503) must
// neither call DisableChainMode or ReloadMessages nor strip
// previous_response_id from the retried call.
func TestRun_NonChainBrokenRetryDoesNotTouchChainState(t *testing.T) {
	t.Parallel()

	// Given: a chain-mode run with a still-valid previous_response_id.
	var (
		streamCalls   int
		secondCallOpt fantasy.ProviderOptions
	)
	model := &chattest.FakeModel{
		// Use the provider package's name constant, the same key
		// chainModeProviderOptions stores its options under.
		ProviderName: fantasyopenai.Name,
		StreamFn: func(_ context.Context, call fantasy.Call) (fantasy.StreamResponse, error) {
			streamCalls++
			switch streamCalls {
			case 1:
				// First attempt: an upstream error unrelated to the chain.
				return nil, xerrors.New("received status 503 from upstream")
			default:
				// Retry: record the options Run sent.
				secondCallOpt = call.ProviderOptions
				return finishingStream(), nil
			}
		},
	}
	disableCalls := 0
	reloadCalls := 0

	// When: a non-chain-broken retryable error fires (503).
	err := Run(context.Background(), RunOptions{
		Model:                model,
		MaxSteps:             1,
		ContextLimitFallback: 4096,
		Messages: []fantasy.Message{
			{Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "hi"}}},
		},
		ProviderOptions: chainModeProviderOptions("resp_still_valid"),
		PersistStep: func(_ context.Context, _ PersistedStep) error {
			return nil
		},
		DisableChainMode: func() {
			disableCalls++
		},
		ReloadMessages: func(_ context.Context) ([]fantasy.Message, error) {
			reloadCalls++
			return nil, nil
		},
	})

	// Then: chain mode stays engaged, ReloadMessages is not called,
	// and the retry preserves previous_response_id.
	require.NoError(t, err)
	// Pin the attempt count like the sibling tests do, so the assertions
	// below are known to describe the single retry.
	require.Equal(t, 2, streamCalls, "exactly two stream attempts (one failure, one success)")
	require.Equal(t, 0, disableCalls,
		"non-chain-broken retry must not exit chain mode")
	require.Equal(t, 0, reloadCalls,
		"non-chain-broken retry must not reload history")
	require.True(t,
		chatopenai.HasPreviousResponseID(secondCallOpt),
		"non-chain-broken retry must preserve previous_response_id",
	)
}
// chainBrokenErrorMessage is what OpenAI returns when previous_response_id
// points at a response it does not have stored. The tests in this file wrap
// it in an error returned from the fake model's first stream attempt.
const chainBrokenErrorMessage = "Previous response with id 'resp_abc' not found."
// finishingStream builds a stream response that yields exactly one part: a
// Finish part whose FinishReason is Stop. The chatloop treats that finish
// reason as "stoppedByModel" and leaves the per-step loop after persisting.
func finishingStream() fantasy.StreamResponse {
	finish := fantasy.StreamPart{
		Type:         fantasy.StreamPartTypeFinish,
		FinishReason: fantasy.FinishReasonStop,
	}
	var stream iter.Seq[fantasy.StreamPart] = func(yield func(fantasy.StreamPart) bool) {
		// Nothing follows the Finish part, so yield's result is not needed.
		yield(finish)
	}
	return stream
}
// chainModeProviderOptions returns provider options holding OpenAI Responses
// options under the OpenAI provider name, with Store enabled and
// PreviousResponseID pointing at previousResponseID. This is the same shape
// chatd builds when chain mode is active.
func chainModeProviderOptions(previousResponseID string) fantasy.ProviderOptions {
	storeEnabled := true
	responsesOpts := &fantasyopenai.ResponsesProviderOptions{
		Store:              &storeEnabled,
		PreviousResponseID: &previousResponseID,
	}
	return fantasy.ProviderOptions{fantasyopenai.Name: responsesOpts}
}