coder/coderd/x/chatd/chatd_debug.go

package chatd

import (
	"context"
	"time"

	"charm.land/fantasy"

	"cdr.dev/slog/v3"
	"github.com/coder/coder/v2/coderd/x/chatd/chatdebug"
	"github.com/coder/coder/v2/coderd/x/chatd/chatprovider"
)

const (
	debugCleanupRetryDelay = 500 * time.Millisecond
	debugCleanupAttempts   = 3
	debugCleanupTimeout    = 5 * time.Second
	// debugCreateRunTimeout caps how long a CreateRun insert can
	// block the caller's critical path. Debug persistence is
	// best-effort, so the turn proceeds without debug rows if the
	// DB is slow or locked. Matches the manual-title budget.
	debugCreateRunTimeout = 5 * time.Second
	// debugCleanupClockSkew gives cleanup cutoffs tolerance for cross-
	// replica clock drift. The cutoff is sampled from the DB
	// (updated_at returned by the status transition), and
	// chat_debug_runs.started_at is stamped by whatever replica
	// processes the replacement turn. If that replica's clock lags
	// the DB, its started_at can land behind a commit-time cutoff
	// even though the insert physically happened after commit.
	// Subtracting this buffer ensures the fast retry path cannot
	// delete replacement rows when clocks drift by up to this
	// amount; rows within the buffer survive the fast cleanup but
	// are still finalized (and eligible for stale-sweep cleanup) by
	// the existing FinalizeStale background loop.
	debugCleanupClockSkew = 30 * time.Second
)

func (p *Server) debugService() *chatdebug.Service {
	if p == nil {
		return nil
	}
	if p.debugSvcFactory == nil {
		return p.debugSvc
	}
	p.debugSvcInit.Do(func() {
		p.debugSvc = p.debugSvcFactory()
		p.debugSvcReady.Store(p.debugSvc != nil)
	})
	return p.debugSvc
}

func (p *Server) existingDebugService() *chatdebug.Service {
	if p == nil {
		return nil
	}
	if p.debugSvcFactory == nil {
		return p.debugSvc
	}
	if !p.debugSvcReady.Load() {
		return nil
	}
	return p.debugSvc
}

func (p *Server) scheduleDebugCleanup(
	ctx context.Context,
	logMessage string,
	fields []slog.Field,
	cleanup func(context.Context, *chatdebug.Service) error,
) {
	debugSvc := p.debugService()
	if debugSvc == nil {
		return
	}

	// Acquire inflightMu around the positive Add so Close() cannot
	// call drainInflight concurrently when the counter is at zero.
	// See drainInflight for the WaitGroup contract this preserves.
	p.inflightMu.Lock()
	p.inflight.Add(1)
	p.inflightMu.Unlock()
	go func() {
		defer p.inflight.Done()

		cleanupCtx := context.WithoutCancel(ctx)
		for attempt := 0; attempt < debugCleanupAttempts; attempt++ {
			if attempt > 0 {
				timer := p.clock.NewTimer(debugCleanupRetryDelay, "chatd", "debug_cleanup")
				<-timer.C
			}

			passCtx, cancel := context.WithTimeout(cleanupCtx, debugCleanupTimeout)
			err := cleanup(passCtx, debugSvc)
			cancel()
			if err == nil {
				return
			}

			logFields := append([]slog.Field{
				slog.F("attempt", attempt+1),
				slog.F("max_attempts", debugCleanupAttempts),
			}, fields...)
			logFields = append(logFields, slog.Error(err))
			p.logger.Warn(cleanupCtx, logMessage, logFields...)
		}
	}()
}

func (p *Server) newDebugAwareModel(
	ctx context.Context,
	req modelClientRequest,
	route resolvedModelRoute,
	opts modelBuildOptions,
) (fantasy.LanguageModel, bool, error) {
	providerHint, err := route.providerHint()
	if err != nil {
		return nil, false, err
	}
	provider, resolvedModel, err := chatprovider.ResolveModelWithProviderHint(req.ModelName, providerHint)
	if err != nil {
		return nil, false, err
	}
	route = route.withProviderHint(provider)
	req.ModelName = resolvedModel

	debugSvc := p.debugService()
	debugEnabled := debugSvc != nil && debugSvc.IsEnabled(ctx, req.Chat.ID, req.Chat.OwnerID)
	opts.RecordHTTP = debugEnabled

	model, err := p.newModel(ctx, req, route, opts)
	if err != nil {
		return nil, debugEnabled, err
	}
	if !debugEnabled {
		return model, false, nil
	}

	return chatdebug.WrapModel(model, debugSvc, chatdebug.RecorderOptions{
		ChatID:   req.Chat.ID,
		OwnerID:  req.Chat.OwnerID,
		Provider: provider,
		Model:    resolvedModel,
	}), true, nil
}