// coder/coderd/x/chatd/chatdebug/recorder.go

package chatdebug

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"charm.land/fantasy"
	"github.com/google/uuid"

	"cdr.dev/slog/v3"
)

// RecorderOptions identifies the chat/model context for debug recording.
//
//   - ChatID is the chat that recorded steps are attributed to. When it
//     is uuid.Nil, beginStep falls back to the ChatID carried by the run
//     context.
//   - OwnerID is passed to Service.IsEnabled together with the chat ID
//     when deciding whether recording applies.
//   - Provider and Model presumably name the upstream provider and model
//     being wrapped; they are not read in this part of the file (TODO
//     confirm against the debugModel implementation).
type RecorderOptions struct {
	ChatID   uuid.UUID
	OwnerID  uuid.UUID
	Provider string
	Model    string
}

// WrapModel decorates model with debug recording when a Service is
// supplied. With a nil svc, recording is disabled and model is handed back
// untouched. A nil model is treated as a programmer error and panics.
func WrapModel(
	model fantasy.LanguageModel,
	svc *Service,
	opts RecorderOptions,
) fantasy.LanguageModel {
	switch {
	case model == nil:
		panic("chatdebug: nil LanguageModel")
	case svc == nil:
		// No service means nothing to record into.
		return model
	default:
		return &debugModel{inner: model, svc: svc, opts: opts}
	}
}

// attemptSink collects Attempt records. beginStep creates one sink per
// step handle and exposes it to callees through the context.
//
//   - mu guards attempts.
//   - attempts holds the recorded attempts in insertion order.
//   - attemptCounter hands out attempt numbers; it is atomic, so
//     nextAttemptNumber does not need to take mu.
type attemptSink struct {
	mu             sync.Mutex
	attempts       []Attempt
	attemptCounter atomic.Int32
}

// nextAttemptNumber atomically increments the sink's attempt counter and
// returns the new value, so the first call yields 1. It panics when called
// on a nil sink.
func (s *attemptSink) nextAttemptNumber() int {
	if s != nil {
		return int(s.attemptCounter.Add(1))
	}
	panic("chatdebug: nil attemptSink")
}

// record appends a to the sink's attempt list while holding the sink
// mutex.
func (s *attemptSink) record(a Attempt) {
	s.mu.Lock()
	s.attempts = append(s.attempts, a)
	s.mu.Unlock()
}

// replaceByNumber swaps in a for the first recorded attempt whose Number
// equals number, or appends a when no such attempt exists. It backs the
// provisional-then-upgrade flow used for SSE bodies: Read() records a
// completed attempt on EOF, and Close() may later need to replace that
// entry with a failed attempt when inner.Close() surfaces an error.
func (s *attemptSink) replaceByNumber(number int, a Attempt) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Locate the first entry carrying the requested attempt number.
	slot := -1
	for idx := range s.attempts {
		if s.attempts[idx].Number == number {
			slot = idx
			break
		}
	}

	if slot < 0 {
		s.attempts = append(s.attempts, a)
		return
	}
	s.attempts[slot] = a
}

// snapshot returns a copy of the recorded attempts taken under the sink
// mutex, so callers can use the result without holding the lock. The
// returned slice is never nil, even when nothing has been recorded.
func (s *attemptSink) snapshot() []Attempt {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Start from an allocated, empty slice so that the result stays
	// non-nil when there are no attempts.
	out := make([]Attempt, 0, len(s.attempts))
	return append(out, s.attempts...)
}

// attemptSinkKey is the unexported context key type under which
// withAttemptSink stores the active *attemptSink.
type attemptSinkKey struct{}

// withAttemptSink returns a child of ctx that carries sink under
// attemptSinkKey. It panics when sink is nil.
func withAttemptSink(ctx context.Context, sink *attemptSink) context.Context {
	if sink != nil {
		return context.WithValue(ctx, attemptSinkKey{}, sink)
	}
	panic("chatdebug: nil attemptSink")
}

// attemptSinkFromContext returns the *attemptSink stored in ctx under
// attemptSinkKey, or nil when ctx carries none.
func attemptSinkFromContext(ctx context.Context) *attemptSink {
	if stored, ok := ctx.Value(attemptSinkKey{}).(*attemptSink); ok {
		return stored
	}
	return nil
}

// stepCounters holds the per-run step sequence counters, keyed by RunID.
// Entries are created lazily by nextStepNumber and syncStepCounter and
// are removed by releaseRunRef and CleanupStepCounter.
var stepCounters sync.Map // map[uuid.UUID]*atomic.Int32

// runRefCounts tracks how many live RunContext instances reference each
// RunID. Cleanup of shared state (step counters) is deferred until the
// last RunContext for a given RunID is garbage collected. Counts are
// raised by trackRunRef and lowered by releaseRunRef.
var (
	runRefCounts sync.Map // map[uuid.UUID]*atomic.Int32
	// refCountMu serializes trackRunRef and releaseRunRef so the
	// decrement-to-zero check and subsequent map deletions are
	// atomic with respect to new references being added.
	refCountMu sync.Mutex
)

// trackRunRef registers one more live reference to runID, creating the
// counter on first use. It holds refCountMu so it cannot interleave with
// the zero check and map deletions performed by releaseRunRef. It panics
// if the map holds a value of an unexpected type.
func trackRunRef(runID uuid.UUID) {
	refCountMu.Lock()
	defer refCountMu.Unlock()

	entry, _ := runRefCounts.LoadOrStore(runID, new(atomic.Int32))
	if refs, isCounter := entry.(*atomic.Int32); isCounter {
		refs.Add(1)
		return
	}
	panic("chatdebug: runRefCounts contains non-*atomic.Int32 value")
}

// releaseRunRef drops one reference to runID. When the count reaches zero
// (or below), both the reference counter and the run's step counter are
// removed. The whole operation runs under refCountMu, so trackRunRef
// cannot add a reference between the zero check and the deletions.
// Releasing an untracked runID is a no-op. It panics if the map holds a
// value of an unexpected type.
func releaseRunRef(runID uuid.UUID) {
	refCountMu.Lock()
	defer refCountMu.Unlock()

	entry, found := runRefCounts.Load(runID)
	if !found {
		return
	}
	refs, isCounter := entry.(*atomic.Int32)
	if !isCounter {
		panic("chatdebug: runRefCounts contains non-*atomic.Int32 value")
	}

	if remaining := refs.Add(-1); remaining > 0 {
		// Other RunContexts still reference this run.
		return
	}
	runRefCounts.Delete(runID)
	stepCounters.Delete(runID)
}

// nextStepNumber atomically advances the step counter for runID and
// returns the new value, creating the counter on first use (so the first
// call for a run yields 1). It panics if the map holds a value of an
// unexpected type.
func nextStepNumber(runID uuid.UUID) int32 {
	entry, _ := stepCounters.LoadOrStore(runID, new(atomic.Int32))
	if counter, isCounter := entry.(*atomic.Int32); isCounter {
		return counter.Add(1)
	}
	panic("chatdebug: invalid step counter type")
}

// CleanupStepCounter removes per-run step counter and reference count
// state. This is used by tests and later stacked branches that have a
// real run lifecycle.
//
// The deletions run under refCountMu, the same mutex trackRunRef and
// releaseRunRef hold, so a forced cleanup cannot interleave with a
// reference being added or released for the same run. Callers must not
// already hold refCountMu.
func CleanupStepCounter(runID uuid.UUID) {
	refCountMu.Lock()
	defer refCountMu.Unlock()

	stepCounters.Delete(runID)
	runRefCounts.Delete(runID)
}

// stepFinalizeTimeout bounds how long a step finalization may run.
const stepFinalizeTimeout = 5 * time.Second

// stepFinalizeContext derives the context used to finalize a step. The
// result keeps the values of ctx but is detached from its cancellation,
// and expires after stepFinalizeTimeout. The returned CancelFunc must be
// called to release the timer. It panics when ctx is nil.
func stepFinalizeContext(ctx context.Context) (context.Context, context.CancelFunc) {
	if ctx == nil {
		panic("chatdebug: nil context")
	}
	// Detach first so an already-cancelled request context cannot abort
	// the final write.
	detached := context.WithoutCancel(ctx)
	return context.WithTimeout(detached, stepFinalizeTimeout)
}

// syncStepCounter raises the step counter for runID to at least
// stepNumber, creating the counter on first use. The counter is never
// lowered. It panics if the map holds a value of an unexpected type.
func syncStepCounter(runID uuid.UUID, stepNumber int32) {
	entry, _ := stepCounters.LoadOrStore(runID, new(atomic.Int32))
	counter, isCounter := entry.(*atomic.Int32)
	if !isCounter {
		panic("chatdebug: invalid step counter type")
	}

	// Compare-and-swap loop: reload and retry whenever another goroutine
	// changed the counter between the load and the swap, and stop as soon
	// as the counter is already at or past stepNumber.
	for seen := counter.Load(); seen < stepNumber; seen = counter.Load() {
		if counter.CompareAndSwap(seen, stepNumber) {
			return
		}
	}
}

// stepHandle is the in-memory handle for one debug step created by
// beginStep and finalized through finish.
//
//   - stepCtx identifies the step (IDs, step number, operation).
//   - sink collects the attempts that finish writes with the step.
//   - svc is the Service used to persist updates; opts are the recorder
//     options the step was started with.
//   - mu serializes finish, which is the writer of status, response,
//     usage, err, metadata and hadError.
//   - status, response, usage, err and metadata mirror the arguments of
//     the most recent finish call that was accepted.
type stepHandle struct {
	stepCtx  *StepContext
	sink     *attemptSink
	svc      *Service
	opts     RecorderOptions
	mu       sync.Mutex
	status   Status
	response any
	usage    any
	err      any
	metadata any
	// hadError tracks whether a prior finalization wrote an error
	// payload. Used to decide whether a successful retry needs to
	// explicitly clear the error field via jsonClear.
	hadError bool
}

// beginStep validates preconditions, creates a debug step, and returns a
// handle plus an enriched context carrying StepContext and attemptSink.
// Returns (nil, original ctx) when debug recording should be skipped.
//
// Recording is skipped when svc is nil, when ctx carries no run (or a run
// with a zero RunID), when svc.IsEnabled reports false, or when
// svc.CreateStep fails (the failure is logged as a warning).
//
// When ctx carries a reuse holder, a handle already cached for the same
// run is returned instead of creating a new step; otherwise the newly
// created handle is cached in the holder for later calls.
func beginStep(
	ctx context.Context,
	svc *Service,
	opts RecorderOptions,
	op Operation,
	normalizedReq any,
) (*stepHandle, context.Context) {
	if svc == nil {
		return nil, ctx
	}

	// Steps are always attached to a run; without one there is nothing
	// to record against.
	rc, ok := RunFromContext(ctx)
	if !ok || rc.RunID == uuid.Nil {
		return nil, ctx
	}

	// Prefer the chat ID from the options and fall back to the one on
	// the run context.
	chatID := opts.ChatID
	if chatID == uuid.Nil {
		chatID = rc.ChatID
	}
	if !svc.IsEnabled(ctx, chatID, opts.OwnerID) {
		return nil, ctx
	}

	holder, reuseStep := reuseHolderFromContext(ctx)
	if reuseStep {
		// The deferred unlock keeps holder.mu held until beginStep
		// returns, which covers the CreateStep call and the
		// holder.handle assignment below. A concurrent caller sharing
		// the holder therefore waits and then observes the stored
		// handle.
		holder.mu.Lock()
		defer holder.mu.Unlock()
		// Only reuse the cached handle if it belongs to the same run.
		// A different RunContext means a new logical run, so we must
		// create a fresh step to avoid cross-run attribution.
		if holder.handle != nil && holder.handle.stepCtx.RunID == rc.RunID {
			enriched := ContextWithStep(ctx, holder.handle.stepCtx)
			enriched = withAttemptSink(enriched, holder.handle.sink)
			return holder.handle, enriched
		}
	}

	// Reserve the next local step number before asking the service to
	// create the step. The reservation is not rolled back if creation
	// fails.
	stepNum := nextStepNumber(rc.RunID)
	step, err := svc.CreateStep(ctx, CreateStepParams{
		RunID:               rc.RunID,
		ChatID:              chatID,
		StepNumber:          stepNum,
		Operation:           op,
		Status:              StatusInProgress,
		HistoryTipMessageID: rc.HistoryTipMessageID,
		NormalizedRequest:   normalizedReq,
	})
	if err != nil {
		svc.log.Warn(ctx, "failed to create chat debug step",
			slog.Error(err),
			slog.F("chat_id", chatID),
			slog.F("run_id", rc.RunID),
			slog.F("operation", op),
		)
		return nil, ctx
	}

	// Raise the local counter to the step number on the returned step in
	// case it is higher than the one requested, so later reservations do
	// not fall behind it.
	syncStepCounter(rc.RunID, step.StepNumber)
	// A zero StepNumber on the returned step is treated as "not set";
	// use the locally reserved number instead.
	actualStepNumber := step.StepNumber
	if actualStepNumber == 0 {
		actualStepNumber = stepNum
	}

	sc := &StepContext{
		StepID:              step.ID,
		RunID:               rc.RunID,
		ChatID:              chatID,
		StepNumber:          actualStepNumber,
		Operation:           op,
		HistoryTipMessageID: rc.HistoryTipMessageID,
	}
	// Each new step gets its own, initially empty attempt sink.
	handle := &stepHandle{stepCtx: sc, sink: &attemptSink{}, svc: svc, opts: opts}
	enriched := ContextWithStep(ctx, handle.stepCtx)
	enriched = withAttemptSink(enriched, handle.sink)
	if reuseStep {
		// Cache the handle so later calls sharing this holder reuse
		// the step (holder.mu is still held here).
		holder.handle = handle
	}

	return handle, enriched
}

// finish records the outcome of the step on the handle and persists it
// through the service. Calls are serialized by the handle mutex, so
// concurrent callers (e.g. retried stream wrappers sharing a reuse
// handle) do not race. A later retry may overwrite an earlier failure so
// the step reflects the final outcome, but a stale callback can never
// move a terminal step to a lower-priority status.
//
// finish is a no-op on a nil handle or a handle without a step context.
// When the handle has no service, only the in-memory state is updated.
// Persistence failures are logged as warnings and not returned.
func (h *stepHandle) finish(
	ctx context.Context,
	status Status,
	response any,
	usage any,
	errPayload any,
	metadata any,
) {
	if h == nil || h.stepCtx == nil {
		return
	}

	h.mu.Lock()
	defer h.mu.Unlock()

	// Status priority: in_progress < interrupted < error < completed.
	// Once the step is terminal, drop any update that would lower its
	// priority: a tardy safety-net writing "interrupted" must not
	// clobber a step that a real retry already moved to "completed" or
	// "error". Updates of equal priority pass through so that retries
	// ending in the same terminal class (e.g. error → error under
	// ReuseStep) can refresh the step with newer attempt data.
	regresses := h.status.IsTerminal() && status.Priority() < h.status.Priority()
	if regresses {
		return
	}

	h.status, h.response, h.usage = status, response, usage
	h.err, h.metadata = errPayload, metadata
	// Sticky flag: remembers that some finalization carried an error.
	h.hadError = h.hadError || errPayload != nil

	svc := h.svc
	if svc == nil {
		return
	}

	finalizeCtx, release := stepFinalizeContext(ctx)
	defer release()

	// A successful completion that follows a failed attempt has to wipe
	// the stored error explicitly. Passing plain nil would leave the
	// COALESCE-based SQL untouched, so jsonClear{} (which serializes as a
	// valid JSONB null) is sent instead. This is restricted to steps that
	// really recorded an error before; a clean success must not receive
	// a spurious JSONB null that downstream aggregation could misread as
	// an error.
	errValue := errPayload
	if status == StatusCompleted && errPayload == nil && h.hadError {
		errValue = jsonClear{}
	}

	params := UpdateStepParams{
		ID:                 h.stepCtx.StepID,
		ChatID:             h.stepCtx.ChatID,
		Status:             status,
		NormalizedResponse: response,
		Usage:              usage,
		Attempts:           h.sink.snapshot(),
		Error:              errValue,
		Metadata:           metadata,
		FinishedAt:         svc.clock.Now(),
	}
	_, updateErr := svc.UpdateStep(finalizeCtx, params)
	if updateErr == nil {
		return
	}
	svc.log.Warn(finalizeCtx, "failed to finalize chat debug step",
		slog.Error(updateErr),
		slog.F("step_id", h.stepCtx.StepID),
		slog.F("chat_id", h.stepCtx.ChatID),
		slog.F("status", status),
	)
}