package chatdebug import ( "bytes" "context" "database/sql" "encoding/json" "errors" "fmt" "sync" "sync/atomic" "time" "github.com/google/uuid" "github.com/sqlc-dev/pqtype" "golang.org/x/xerrors" "cdr.dev/slog/v3" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/pubsub" "github.com/coder/quartz" ) // DefaultStaleThreshold is the fallback stale timeout for debug rows // when no caller-provided value is supplied. const DefaultStaleThreshold = 5 * time.Minute // Service persists chat debug rows and fans out lightweight change events. type Service struct { db database.Store log slog.Logger pubsub pubsub.Pubsub clock quartz.Clock alwaysEnable bool // staleAfterNanos stores the stale threshold as nanoseconds in an // atomic.Int64 so SetStaleAfter and FinalizeStale can be called // from concurrent goroutines without a data race. staleAfterNanos atomic.Int64 // thresholdMu protects thresholdChanged. thresholdMu sync.Mutex // thresholdChanged is closed by SetStaleAfter to wake heartbeat // goroutines so they can re-read the (possibly shorter) interval // immediately instead of waiting for the old ticker to fire. thresholdChanged chan struct{} } // ServiceOption configures optional Service behavior. type ServiceOption func(*Service) // WithStaleThreshold overrides the default stale-row finalization // threshold. Callers that already have a configurable in-flight chat // timeout (e.g. chatd's InFlightChatStaleAfter) should pass it here // so the two sweeps stay in sync. func WithStaleThreshold(d time.Duration) ServiceOption { return func(s *Service) { if d > 0 { s.staleAfterNanos.Store(d.Nanoseconds()) } } } // WithAlwaysEnable forces debug logging on for every chat regardless // of the runtime admin and user opt-in settings. This is used for the // deployment-level serpent flag. func WithAlwaysEnable(always bool) ServiceOption { return func(s *Service) { s.alwaysEnable = always } } // WithClock overrides the default real clock. Tests inject // quartz.NewMock(t) to control time-dependent behavior such as // heartbeat tickers and FinalizeStale timestamps. func WithClock(c quartz.Clock) ServiceOption { return func(s *Service) { if c != nil { s.clock = c } } } // CreateRunParams contains friendly inputs for creating a debug run. type CreateRunParams struct { ChatID uuid.UUID RootChatID uuid.UUID ParentChatID uuid.UUID ModelConfigID uuid.UUID TriggerMessageID int64 HistoryTipMessageID int64 Kind RunKind Status Status Provider string Model string Summary any } // UpdateRunParams contains inputs for updating a debug run. // Zero-valued fields are treated as "keep the existing value" by the // COALESCE-based SQL query. Once a field is set it cannot be cleared // back to NULL; this is intentional for the write-once-finalize // lifecycle of debug rows. type UpdateRunParams struct { ID uuid.UUID ChatID uuid.UUID Status Status Summary any FinishedAt time.Time } // CreateStepParams contains friendly inputs for creating a debug step. type CreateStepParams struct { RunID uuid.UUID ChatID uuid.UUID StepNumber int32 Operation Operation Status Status HistoryTipMessageID int64 NormalizedRequest any } // UpdateStepParams contains optional inputs for updating a debug step. // Most payload fields are typed as any and serialized through nullJSON // because their shape varies by provider. The Attempts field uses a // concrete slice for compile-time safety where the schema is stable. // Zero-valued fields are treated as "keep the existing value" by the // COALESCE-based SQL query. Once set, fields cannot be cleared back // to NULL. This is intentional for the write-once-finalize lifecycle // of debug rows. type UpdateStepParams struct { ID uuid.UUID ChatID uuid.UUID Status Status AssistantMessageID int64 NormalizedResponse any Usage any Attempts []Attempt Error any Metadata any FinishedAt time.Time } // NewService constructs a chat debug persistence service. func NewService(db database.Store, log slog.Logger, ps pubsub.Pubsub, opts ...ServiceOption) *Service { if db == nil { panic("chatdebug: nil database.Store") } s := &Service{ db: db, log: log, pubsub: ps, clock: quartz.NewReal(), thresholdChanged: make(chan struct{}), } s.staleAfterNanos.Store(DefaultStaleThreshold.Nanoseconds()) for _, opt := range opts { opt(s) } return s } // SetStaleAfter overrides the in-flight stale threshold used when // finalizing abandoned debug rows. Zero or negative durations are // ignored, leaving the current threshold (initial or previously // overridden) unchanged. Active heartbeat goroutines are woken so // they can re-read the (possibly shorter) interval immediately. func (s *Service) SetStaleAfter(staleAfter time.Duration) { if s == nil || staleAfter <= 0 { return } s.staleAfterNanos.Store(staleAfter.Nanoseconds()) // Wake all heartbeat goroutines by closing the current channel // and replacing it with a fresh one for the next update. s.thresholdMu.Lock() close(s.thresholdChanged) s.thresholdChanged = make(chan struct{}) s.thresholdMu.Unlock() } // thresholdChan returns the current threshold-change notification // channel. Heartbeat goroutines select on this to detect runtime // stale-threshold updates. func (s *Service) thresholdChan() <-chan struct{} { s.thresholdMu.Lock() defer s.thresholdMu.Unlock() return s.thresholdChanged } // staleThreshold returns the current stale timeout. func (s *Service) staleThreshold() time.Duration { ns := s.staleAfterNanos.Load() d := time.Duration(ns) if d <= 0 { return DefaultStaleThreshold } return d } // heartbeatInterval returns a safe ticker interval for stream heartbeats. // It is half the stale threshold so at least one touch lands before the // stale sweep considers the row abandoned. The result is clamped to a // minimum of 1 ms to prevent panics from time.NewTicker(0) with // pathologically small thresholds, while still staying well below any // practical stale timeout. func (s *Service) heartbeatInterval() time.Duration { return max(s.staleThreshold()/2, time.Millisecond) } func chatdContext(ctx context.Context) context.Context { //nolint:gocritic // AsChatd provides narrowly-scoped daemon access for // chat debug persistence reads and writes. return dbauthz.AsChatd(ctx) } // IsEnabled returns whether debug logging is enabled for the given chat. func (s *Service) IsEnabled( ctx context.Context, chatID uuid.UUID, ownerID uuid.UUID, ) bool { if s == nil { return false } if s.alwaysEnable { return true } if s.db == nil { return false } authCtx := chatdContext(ctx) allowUsers, err := s.db.GetChatDebugLoggingAllowUsers(authCtx) if err != nil { if errors.Is(err, sql.ErrNoRows) { return false } s.log.Warn(ctx, "failed to load runtime admin chat debug logging setting", slog.Error(err), ) return false } if !allowUsers { return false } if ownerID == uuid.Nil { s.log.Warn(ctx, "missing chat owner for debug logging enablement check", slog.F("chat_id", chatID), ) return false } enabled, err := s.db.GetUserChatDebugLoggingEnabled(authCtx, ownerID) if err == nil { return enabled } if errors.Is(err, sql.ErrNoRows) { return false } s.log.Warn(ctx, "failed to load user chat debug logging setting", slog.Error(err), slog.F("chat_id", chatID), slog.F("owner_id", ownerID), ) return false } // CreateRun inserts a new debug run and emits a run update event. func (s *Service) CreateRun( ctx context.Context, params CreateRunParams, ) (database.ChatDebugRun, error) { now := s.clock.Now() run, err := s.db.InsertChatDebugRun(chatdContext(ctx), database.InsertChatDebugRunParams{ ChatID: params.ChatID, RootChatID: nullUUID(params.RootChatID), ParentChatID: nullUUID(params.ParentChatID), ModelConfigID: nullUUID(params.ModelConfigID), TriggerMessageID: nullInt64(params.TriggerMessageID), HistoryTipMessageID: nullInt64(params.HistoryTipMessageID), Kind: string(params.Kind), Status: string(params.Status), Provider: nullString(params.Provider), Model: nullString(params.Model), Summary: s.nullJSON(ctx, params.Summary), StartedAt: sql.NullTime{Time: now, Valid: true}, UpdatedAt: sql.NullTime{Time: now, Valid: true}, FinishedAt: sql.NullTime{}, }) if err != nil { return database.ChatDebugRun{}, err } s.publishEvent(ctx, run.ChatID, EventKindRunUpdate, run.ID, uuid.Nil) return run, nil } // UpdateRun updates an existing debug run and emits a run update event. // When a terminal status is set without an explicit FinishedAt, the // service auto-fills the timestamp so the row is immediately visible // to the InsertChatDebugStep atomic guard (finished_at IS NULL). // UpdateChatDebugRun itself enforces finished_at as write-once: once // the column is populated, repeated auto-fills or explicit refreshes // never overwrite the original completion timestamp, so calling this // more than once on an already-finalized run is idempotent. func (s *Service) UpdateRun( ctx context.Context, params UpdateRunParams, ) (database.ChatDebugRun, error) { if params.Status.IsTerminal() && params.FinishedAt.IsZero() { params.FinishedAt = s.clock.Now() } run, err := s.db.UpdateChatDebugRun(chatdContext(ctx), database.UpdateChatDebugRunParams{ RootChatID: uuid.NullUUID{}, ParentChatID: uuid.NullUUID{}, ModelConfigID: uuid.NullUUID{}, TriggerMessageID: sql.NullInt64{}, HistoryTipMessageID: sql.NullInt64{}, Status: nullString(string(params.Status)), Provider: sql.NullString{}, Model: sql.NullString{}, Summary: s.nullJSON(ctx, params.Summary), FinishedAt: nullTime(params.FinishedAt), Now: s.clock.Now(), ID: params.ID, ChatID: params.ChatID, }) if err != nil { return database.ChatDebugRun{}, err } s.publishEvent(ctx, run.ChatID, EventKindRunUpdate, run.ID, uuid.Nil) return run, nil } // errRunFinalized is returned by CreateStep when the parent run has // already reached a terminal state (finished_at IS NOT NULL). This // prevents delayed retries from appending in-progress steps to runs // that FinalizeStale already marked as interrupted. var errRunFinalized = xerrors.New("parent run is already finalized") // errRunNotFound is returned by CreateStep when the parent run cannot // be located (missing run_id or chat_id mismatch). This surfaces // caller-side data bugs instead of conflating them with the legitimate // "already finalized" terminal case. var errRunNotFound = xerrors.New("parent run not found") // CreateStep inserts a new debug step and emits a step update event. // It returns errRunFinalized if the parent run has already finished, // or errRunNotFound if the run_id/chat_id pair does not match an // existing run. The finalization guard is enforced atomically by the // INSERT's CTE, which issues an UPDATE on the parent run (taking a // row lock). This prevents concurrent FinalizeStale from setting // finished_at between the check and the INSERT. func (s *Service) CreateStep( ctx context.Context, params CreateStepParams, ) (database.ChatDebugStep, error) { now := s.clock.Now() insert := database.InsertChatDebugStepParams{ RunID: params.RunID, StepNumber: params.StepNumber, Operation: string(params.Operation), Status: string(params.Status), HistoryTipMessageID: nullInt64(params.HistoryTipMessageID), AssistantMessageID: sql.NullInt64{}, NormalizedRequest: s.nullJSON(ctx, params.NormalizedRequest), NormalizedResponse: pqtype.NullRawMessage{}, Usage: pqtype.NullRawMessage{}, Attempts: pqtype.NullRawMessage{}, Error: pqtype.NullRawMessage{}, Metadata: pqtype.NullRawMessage{}, StartedAt: sql.NullTime{Time: now, Valid: true}, UpdatedAt: sql.NullTime{Time: now, Valid: true}, FinishedAt: sql.NullTime{}, ChatID: params.ChatID, } // Cap retry attempts to prevent infinite loops under // pathological concurrency. Each iteration performs two DB // round-trips (insert + list), so 10 retries is generous. const maxCreateStepRetries = 10 for range maxCreateStepRetries { if err := ctx.Err(); err != nil { return database.ChatDebugStep{}, err } step, err := s.db.InsertChatDebugStep(chatdContext(ctx), insert) if err == nil { // The INSERT CTE atomically bumps the parent run's // updated_at, so no separate touch call is needed. s.publishEvent(ctx, step.ChatID, EventKindStepUpdate, step.RunID, step.ID) return step, nil } // The INSERT's locked_run CTE filters on id, chat_id, and // finished_at IS NULL, so sql.ErrNoRows can mean "run not // found", "chat_id mismatch", or "already finalized." Look // the run up to disambiguate instead of conflating // caller-side data bugs with the legitimate terminal case. if errors.Is(err, sql.ErrNoRows) { return database.ChatDebugStep{}, s.classifyMissingRun(ctx, params) } if !database.IsUniqueViolation(err, database.UniqueIndexChatDebugStepsRunStep) { return database.ChatDebugStep{}, err } steps, listErr := s.db.GetChatDebugStepsByRunID(chatdContext(ctx), params.RunID) if listErr != nil { return database.ChatDebugStep{}, listErr } nextStepNumber := insert.StepNumber + 1 for _, existing := range steps { if existing.StepNumber >= nextStepNumber { nextStepNumber = existing.StepNumber + 1 } } insert.StepNumber = nextStepNumber } return database.ChatDebugStep{}, xerrors.Errorf( "chatdebug: failed to create step after %d retries (run %s)", maxCreateStepRetries, params.RunID, ) } // classifyMissingRun disambiguates the sql.ErrNoRows returned by // InsertChatDebugStep's locked_run CTE. The CTE filters on id, // chat_id, and finished_at IS NULL, so empty RETURNING rows can mean // the run is absent, belongs to a different chat, or has already been // finalized. GetChatDebugRunByID is keyed only by id, which is // sufficient to tell these cases apart. func (s *Service) classifyMissingRun( ctx context.Context, params CreateStepParams, ) error { run, err := s.db.GetChatDebugRunByID(chatdContext(ctx), params.RunID) if errors.Is(err, sql.ErrNoRows) { return errRunNotFound } if err != nil { return xerrors.Errorf("look up parent run after failed step insert: %w", err) } if run.ChatID != params.ChatID { return errRunNotFound } if run.FinishedAt.Valid { return errRunFinalized } // The run matches the caller's (run_id, chat_id) and is still // open, yet the INSERT returned no rows. This is unexpected // under write-once-finalize semantics and likely indicates a // concurrent delete or unrelated defect; surface it instead of // silently masking it as a terminal case. return xerrors.Errorf( "InsertChatDebugStep returned no rows but run is still active (run_id=%s)", params.RunID, ) } // UpdateStep updates an existing debug step and emits a step update event. // When a terminal status is set without an explicit FinishedAt, the // service auto-fills the timestamp so the stale sweep does not leave // terminal rows with finished_at = NULL. func (s *Service) UpdateStep( ctx context.Context, params UpdateStepParams, ) (database.ChatDebugStep, error) { if params.Status.IsTerminal() && params.FinishedAt.IsZero() { params.FinishedAt = s.clock.Now() } step, err := s.db.UpdateChatDebugStep(chatdContext(ctx), database.UpdateChatDebugStepParams{ Status: nullString(string(params.Status)), HistoryTipMessageID: sql.NullInt64{}, AssistantMessageID: nullInt64(params.AssistantMessageID), NormalizedRequest: pqtype.NullRawMessage{}, NormalizedResponse: s.nullJSON(ctx, params.NormalizedResponse), Usage: s.nullJSON(ctx, params.Usage), Attempts: s.nullJSON(ctx, params.Attempts), Error: s.nullJSON(ctx, params.Error), Metadata: s.nullJSON(ctx, params.Metadata), FinishedAt: nullTime(params.FinishedAt), Now: s.clock.Now(), ID: params.ID, ChatID: params.ChatID, }) if err != nil { return database.ChatDebugStep{}, err } s.publishEvent(ctx, step.ChatID, EventKindStepUpdate, step.RunID, step.ID) return step, nil } // TouchStep bumps the step's and its parent run's updated_at timestamps // without changing any other fields. This prevents long-running operations // (e.g. streaming) from being prematurely swept by FinalizeStale, which // first marks runs stale by chat_debug_runs.updated_at and then cascades // to steps whose run_id was just finalized. func (s *Service) TouchStep( ctx context.Context, stepID uuid.UUID, runID uuid.UUID, chatID uuid.UUID, ) error { // Atomically bump both the step and its parent run so // FinalizeStale cannot interleave between the two touches. return s.db.TouchChatDebugStepAndRun(chatdContext(ctx), database.TouchChatDebugStepAndRunParams{ Now: s.clock.Now(), StepID: stepID, RunID: runID, ChatID: chatID, }) } // DeleteByChatID deletes debug data for a chat and emits a delete event. // The startedBefore bound scopes deletion to runs created before that // instant so that retried cleanup does not remove runs created by a // replacement turn that raced ahead of the retry window (for example, // an unarchive that fires between the initial archive-cleanup attempt // and its retry). func (s *Service) DeleteByChatID( ctx context.Context, chatID uuid.UUID, startedBefore time.Time, ) (int64, error) { deleted, err := s.db.DeleteChatDebugDataByChatID( chatdContext(ctx), database.DeleteChatDebugDataByChatIDParams{ ChatID: chatID, StartedBefore: startedBefore, }, ) if err != nil { return 0, err } s.publishEvent(ctx, chatID, EventKindDelete, uuid.Nil, uuid.Nil) return deleted, nil } // DeleteAfterMessageID deletes debug data newer than the given message. // The startedBefore bound scopes deletion to runs created before that // instant so that retried cleanup does not remove runs created by a // replacement turn that raced ahead of the retry window. func (s *Service) DeleteAfterMessageID( ctx context.Context, chatID uuid.UUID, messageID int64, startedBefore time.Time, ) (int64, error) { deleted, err := s.db.DeleteChatDebugDataAfterMessageID( chatdContext(ctx), database.DeleteChatDebugDataAfterMessageIDParams{ ChatID: chatID, MessageID: messageID, StartedBefore: startedBefore, }, ) if err != nil { return 0, err } s.publishEvent(ctx, chatID, EventKindDelete, uuid.Nil, uuid.Nil) return deleted, nil } // FinalizeStale finalizes stale in-flight debug rows and emits a broadcast. func (s *Service) FinalizeStale( ctx context.Context, ) (database.FinalizeStaleChatDebugRowsRow, error) { now := s.clock.Now() result, err := s.db.FinalizeStaleChatDebugRows( chatdContext(ctx), database.FinalizeStaleChatDebugRowsParams{ Now: now, UpdatedBefore: now.Add(-s.staleThreshold()), }, ) if err != nil { return database.FinalizeStaleChatDebugRowsRow{}, err } if result.RunsFinalized > 0 || result.StepsFinalized > 0 { s.publishEvent(ctx, uuid.Nil, EventKindFinalize, uuid.Nil, uuid.Nil) } return result, nil } // FinalizeRunParams bundles the arguments for FinalizeRun. type FinalizeRunParams struct { RunID uuid.UUID ChatID uuid.UUID Status Status SeedSummary map[string]any // Timeout for the aggregate + update calls. Zero defaults to 5s. Timeout time.Duration } // FinalizeRun aggregates the run summary, updates the run status, and // cleans up the step counter. It detaches from the parent context's // cancellation so finalization succeeds even when the request context // is already done. Errors are returned but are always safe to ignore; // callers that treat debug instrumentation as best-effort can discard // them. func (s *Service) FinalizeRun(ctx context.Context, p FinalizeRunParams) error { timeout := p.Timeout if timeout <= 0 { timeout = 5 * time.Second } finalizeCtx, cancel := context.WithTimeout( context.WithoutCancel(ctx), timeout, ) defer cancel() finalSummary := p.SeedSummary if aggregated, aggErr := s.AggregateRunSummary( finalizeCtx, p.RunID, p.SeedSummary, ); aggErr != nil { // Non-fatal: proceed with the seed summary. s.log.Warn(ctx, "failed to aggregate debug run summary", slog.F("chat_id", p.ChatID), slog.F("run_id", p.RunID), slog.Error(aggErr), ) } else { finalSummary = aggregated } if _, err := s.UpdateRun(finalizeCtx, UpdateRunParams{ ID: p.RunID, ChatID: p.ChatID, Status: p.Status, Summary: finalSummary, FinishedAt: s.clock.Now(), }); err != nil { CleanupStepCounter(p.RunID) return xerrors.Errorf("update debug run: %w", err) } CleanupStepCounter(p.RunID) return nil } // ClassifyError maps a run error to the appropriate debug status. // nil → StatusCompleted, context.Canceled → StatusInterrupted, // everything else → StatusError. Callers with additional // classification rules (e.g. ErrInterrupted, ErrDynamicToolCall) // should handle those before falling back to this helper. func ClassifyError(err error) Status { switch { case err == nil: return StatusCompleted case errors.Is(err, context.Canceled): return StatusInterrupted default: return StatusError } } func nullUUID(id uuid.UUID) uuid.NullUUID { return uuid.NullUUID{UUID: id, Valid: id != uuid.Nil} } func nullInt64(v int64) sql.NullInt64 { return sql.NullInt64{Int64: v, Valid: v != 0} } func nullString(value string) sql.NullString { return sql.NullString{String: value, Valid: value != ""} } func nullTime(value time.Time) sql.NullTime { return sql.NullTime{Time: value, Valid: !value.IsZero()} } // jsonClear is a sentinel value that tells nullJSON to emit a valid // JSON null (JSONB 'null') instead of SQL NULL. COALESCE treats SQL // NULL as "keep existing" but replaces with a non-NULL JSONB value, // so passing jsonClear explicitly overwrites a previously set field. type jsonClear struct{} // nullJSON marshals value to a NullRawMessage. When value is nil // (including typed nils such as `var p *T = nil` whose interface // representation carries a type but no value) or marshals to JSON // "null", the result is {Valid: false}. Typed nils fall through the // `value == nil` guard but produce `[]byte("null")` from // json.Marshal, which the `bytes.Equal(data, []byte("null"))` check // catches identically. This is intentional for the write-once-finalize // pattern: combined with the COALESCE-based UPDATE queries, passing // nil (typed or untyped) preserves the existing column value. Fields // accumulate monotonically (request -> response -> usage -> error) and // never need to be cleared during normal operation. The jsonClear // sentinel exists for the sole exception (error retry clearing). func (s *Service) nullJSON(ctx context.Context, value any) pqtype.NullRawMessage { if value == nil { return pqtype.NullRawMessage{} } // Sentinel: emit a valid JSONB null so COALESCE replaces // any previously stored value. if _, ok := value.(jsonClear); ok { return pqtype.NullRawMessage{ RawMessage: json.RawMessage("null"), Valid: true, } } data, err := json.Marshal(value) if err != nil { s.log.Warn(ctx, "failed to marshal chat debug JSON", slog.Error(err), slog.F("value_type", fmt.Sprintf("%T", value)), ) return pqtype.NullRawMessage{} } if bytes.Equal(data, []byte("null")) { return pqtype.NullRawMessage{} } return pqtype.NullRawMessage{RawMessage: data, Valid: true} } func (s *Service) publishEvent( ctx context.Context, chatID uuid.UUID, kind EventKind, runID uuid.UUID, stepID uuid.UUID, ) { if s.pubsub == nil { s.log.Debug(ctx, "chat debug pubsub unavailable; skipping event", slog.F("kind", kind), slog.F("chat_id", chatID), ) return } event := DebugEvent{ Kind: kind, ChatID: chatID, RunID: runID, StepID: stepID, } data, err := json.Marshal(event) if err != nil { s.log.Warn(ctx, "failed to marshal chat debug event", slog.Error(err), slog.F("kind", kind), slog.F("chat_id", chatID), ) return } channel := PubsubChannel(chatID) if err := s.pubsub.Publish(channel, data); err != nil { s.log.Warn(ctx, "failed to publish chat debug event", slog.Error(err), slog.F("channel", channel), slog.F("kind", kind), slog.F("chat_id", chatID), ) } }