coder/coderd/x/chatd/chatdebug/transport.go

package chatdebug

import (
	"bytes"
	"encoding/json"
	"errors"
	"io"
	"mime"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"golang.org/x/xerrors"
)

// Status values stored on recorded attempts.
const (
	// attemptStatusCompleted is the status recorded when a response
	// body is fully read without transport-level errors.
	attemptStatusCompleted = "completed"

	// attemptStatusFailed is the status recorded when a transport
	// error or body read error occurs.
	attemptStatusFailed = "failed"
)

// Upper bounds on how much of a body is kept in memory for debugging.
const (
	// maxRecordedRequestBodyBytes caps in-memory request capture when
	// GetBody is available.
	maxRecordedRequestBodyBytes = 50_000

	// maxRecordedResponseBodyBytes caps in-memory response capture.
	maxRecordedResponseBodyBytes = 50_000
)

// RecordingTransport captures HTTP request/response data for debug steps.
// When the request context carries an attemptSink, it records each round
// trip. Otherwise it delegates directly.
type RecordingTransport struct {
	// Base is the underlying transport. nil defaults to http.DefaultTransport.
	Base http.RoundTripper
}

// Compile-time check that *RecordingTransport satisfies http.RoundTripper.
var _ http.RoundTripper = (*RecordingTransport)(nil)

// RoundTrip implements http.RoundTripper. When the request context has
// no attemptSink it delegates straight to the base transport. Otherwise
// it captures redacted request metadata before sending, records a failed
// Attempt if the base transport returns an error, and on success wraps
// resp.Body in a recordingBody so the attempt is recorded while the
// caller reads and closes the response body.
//
// RoundTrip panics if req is nil.
func (t *RecordingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	if req == nil {
		panic("chatdebug: nil request")
	}

	base := t.Base
	if base == nil {
		base = http.DefaultTransport
	}

	sink := attemptSinkFromContext(req.Context())
	if sink == nil {
		return base.RoundTrip(req)
	}

	requestHeaders := RedactHeaders(req.Header)

	// Capture method and URL/path from the request.
	method := req.Method
	reqURL := ""
	reqPath := ""
	if req.URL != nil {
		reqURL = redactURL(req.URL)
		reqPath = req.URL.Path
	}

	requestBody, err := captureRequestBody(req)
	if err != nil {
		// The base transport never sees this request, so nothing else
		// will close the body. http.RoundTripper requires the body to
		// be closed even when RoundTrip fails. The close error is
		// ignored because the capture error is the one worth reporting.
		if req.Body != nil {
			_ = req.Body.Close()
		}
		return nil, err
	}
	attemptNumber := sink.nextAttemptNumber()

	startedAt := time.Now()
	resp, err := base.RoundTrip(req)
	finishedAt := time.Now()
	durationMs := finishedAt.Sub(startedAt).Milliseconds()
	if err != nil {
		// Transport-level failure: there is no response body to wrap,
		// so the attempt is recorded immediately as failed.
		sink.record(Attempt{
			Number:         attemptNumber,
			Status:         attemptStatusFailed,
			Method:         method,
			URL:            reqURL,
			Path:           reqPath,
			StartedAt:      startedAt.UTC().Format(time.RFC3339Nano),
			FinishedAt:     finishedAt.UTC().Format(time.RFC3339Nano),
			RequestHeaders: requestHeaders,
			RequestBody:    requestBody,
			Error:          sanitizeErrorString(err.Error()),
			DurationMs:     durationMs,
		})
		return nil, err
	}

	// Defer recording to the body wrapper: the final status depends on
	// how the caller reads and closes the response body.
	respHeaders := RedactHeaders(resp.Header)
	resp.Body = &recordingBody{
		inner:         resp.Body,
		sink:          sink,
		startedAt:     startedAt,
		contentLength: resp.ContentLength,
		contentType:   resp.Header.Get("Content-Type"),
		base: Attempt{
			Number:          attemptNumber,
			Method:          method,
			URL:             reqURL,
			Path:            reqPath,
			RequestHeaders:  requestHeaders,
			RequestBody:     requestBody,
			ResponseStatus:  resp.StatusCode,
			ResponseHeaders: respHeaders,
			DurationMs:      durationMs,
		},
	}

	return resp, nil
}

// urlInErrorPattern matches URL-like substrings that transports or
// retry middleware may embed in error messages. Credentials can
// appear in userinfo or query parameters. A match starts at "http://"
// or "https://" and extends up to the next whitespace or quote
// character.
var urlInErrorPattern = regexp.MustCompile(`https?://[^\s"']+`)

// sanitizeErrorString redacts URL-like substrings that may contain
// credentials (userinfo, query parameters) from transport error
// messages before they are persisted in debug attempts.
func sanitizeErrorString(errMsg string) string {
	return urlInErrorPattern.ReplaceAllStringFunc(errMsg, func(rawURL string) string {
		parsed, err := url.Parse(rawURL)
		if err != nil {
			return "[REDACTED_URL]"
		}
		return redactURL(parsed)
	})
}

// redactURL returns the string form of u with userinfo removed and the
// values of sensitive query parameters replaced by RedactedValue. The
// input URL is not modified; a nil URL yields the empty string. The
// query string is re-encoded from its parsed form.
func redactURL(u *url.URL) string {
	if u == nil {
		return ""
	}

	sanitized := *u
	sanitized.User = nil

	params := sanitized.Query()
	for name, vals := range params {
		if !isSensitiveName(name) && !isSensitiveJSONKey(name) {
			continue
		}
		// vals shares its backing array with the map entry, so
		// overwriting in place updates params[name] as well.
		for idx := range vals {
			vals[idx] = RedactedValue
		}
	}
	sanitized.RawQuery = params.Encode()

	return sanitized.String()
}

// captureRequestBody returns a redacted copy of the request body for
// debug recording, or nil when the body cannot be captured cheaply.
//
// Capture only happens when req.GetBody is set. Without GetBody we
// cannot safely capture the request body without fully consuming a
// potentially large or streaming body before the request is sent, so
// capture is skipped to keep debug logging lightweight and
// non-invasive. Bodies larger than maxRecordedRequestBodyBytes are
// reported as "[TRUNCATED]".
//
// A non-nil error is returned only when the request body could not be
// restored after the capture read.
func captureRequestBody(req *http.Request) ([]byte, error) {
	if req == nil || req.Body == nil || req.GetBody == nil {
		return nil, nil
	}

	snapshot, getErr := req.GetBody()
	if getErr != nil {
		// Best effort: no capture, the request proceeds untouched.
		return nil, nil
	}

	// Read one byte past the cap so oversized bodies can be detected.
	captured, readErr := io.ReadAll(io.LimitReader(snapshot, maxRecordedRequestBodyBytes+1))
	_ = snapshot.Close()

	// Some SDKs return the active body from GetBody instead of an
	// independent reader. Restore the request body from GetBody so
	// the upstream transport still receives the original bytes.
	if resetErr := resetRequestBody(req); resetErr != nil {
		return nil, xerrors.Errorf("chatdebug: reset request body: %w", resetErr)
	}

	switch {
	case readErr != nil:
		return nil, nil
	case len(captured) > maxRecordedRequestBodyBytes:
		return []byte("[TRUNCATED]"), nil
	default:
		return RedactJSONSecrets(captured), nil
	}
}

// resetRequestBody swaps req.Body for a fresh reader obtained from
// req.GetBody. The previous body, if any, is closed first; when that
// close fails the fresh reader is closed again, req.Body is left
// unchanged, and the close error is returned. An error from GetBody is
// returned as-is without touching req.Body.
//
// Callers must ensure req.GetBody is non-nil.
func resetRequestBody(req *http.Request) error {
	fresh, err := req.GetBody()
	if err != nil {
		return err
	}

	if previous := req.Body; previous != nil {
		if closeErr := previous.Close(); closeErr != nil {
			_ = fresh.Close()
			return closeErr
		}
	}

	req.Body = fresh
	return nil
}

// recordingBody wraps a response body so that the bytes the caller
// reads are mirrored into a bounded buffer and the attempt is published
// to the sink once the body's outcome is known.
type recordingBody struct {
	// inner is the wrapped response body.
	inner io.ReadCloser
	// contentLength is the response's declared length as copied from
	// resp.ContentLength; Close treats negative values as unknown.
	contentLength int64
	contentType   string // from resp.Header.Get (case-insensitive)
	// sink receives the finished Attempt.
	sink *attemptSink
	// base holds the request/response metadata captured in RoundTrip;
	// buildAttemptLocked copies it and fills in body, timing and status.
	base Attempt
	// startedAt is when the round trip began.
	startedAt time.Time

	// mu guards buf, truncated, sawEOF, bytesRead and
	// recordedProvisional.
	mu sync.Mutex
	// buf holds up to maxRecordedResponseBodyBytes of the body read so far.
	buf bytes.Buffer
	// truncated is set once the body outgrew buf.
	truncated bool
	// sawEOF is set once a read returned io.EOF.
	sawEOF bool
	// bytesRead counts every byte read from inner, captured or not.
	bytesRead int64
	// recordedProvisional is true when recordProvisional() has fired
	// for an SSE body's Read-path EOF but Close() has not yet run. A
	// subsequent inner.Close() error in Close() upgrades the
	// provisional entry in the sink so the close error is not lost.
	recordedProvisional bool

	// recordOnce ensures the attempt is appended to the sink at most once.
	recordOnce sync.Once
	// closeOnce ensures inner.Close is called at most once.
	closeOnce sync.Once
}

// accumulateReadLocked folds the outcome of one read into the body's
// bookkeeping: it bumps the byte counter, appends as much of data[:n]
// as still fits under maxRecordedResponseBodyBytes, flags truncation
// when not everything fits, and notes io.EOF. The caller must hold
// r.mu.
func (r *recordingBody) accumulateReadLocked(data []byte, n int, err error) {
	r.bytesRead += int64(n)

	if n > 0 && !r.truncated {
		switch room := maxRecordedResponseBodyBytes - r.buf.Len(); {
		case room <= 0:
			// Buffer is already full; nothing more can be captured.
			r.truncated = true
		case n > room:
			// Only a prefix of this read fits.
			r.truncated = true
			_, _ = r.buf.Write(data[:room])
		default:
			_, _ = r.buf.Write(data[:n])
		}
	}

	if errors.Is(err, io.EOF) {
		r.sawEOF = true
	}
}

// Read forwards to the wrapped body, mirrors the result into the
// capture buffer, and records the attempt when the read outcome already
// determines it.
func (r *recordingBody) Read(p []byte) (int, error) {
	n, err := r.inner.Read(p)

	r.mu.Lock()
	r.accumulateReadLocked(p, n, err)
	r.mu.Unlock()

	switch {
	case err == nil:
		// Nothing to record yet.
	case !errors.Is(err, io.EOF):
		// Non-EOF errors are recorded immediately.
		r.record(err)
	case isSSEContentType(r.contentType):
		// For server-sent-events bodies, record eagerly on EOF.
		// Streaming consumers like fantasy's Anthropic SSE adapter
		// iterate the response to EOF and abandon it without calling
		// Close(), so the Close-only recording path would never fire
		// and the attempt would be lost. The recording is provisional
		// so Close() can still upgrade it to failed if inner.Close()
		// surfaces a transport error.
		r.recordProvisional(io.EOF)
	}
	// EOF on non-SSE bodies stays on the Close-only path so that JSON
	// integrity, content-length validation, and inner-Close errors
	// keep their existing semantics.
	return n, err
}

// Close closes the wrapped body (at most once, via closeOnce) and then
// publishes the attempt's final status to the sink. An error from the
// inner Close is recorded as a failure and returned. Otherwise the
// status is derived from what was read: seeing EOF, reading the declared
// content length, a response that carries no body, or a complete JSON
// document all count as completed, while a body abandoned part-way is
// recorded as io.ErrUnexpectedEOF.
func (r *recordingBody) Close() error {
	// Snapshot the read state under the lock before touching inner.
	r.mu.Lock()
	sawEOF := r.sawEOF
	bytesRead := r.bytesRead
	contentLength := r.contentLength
	truncated := r.truncated
	responseBody := append([]byte(nil), r.buf.Bytes()...)
	r.mu.Unlock()

	contentType := r.contentType
	// Draining is only attempted when the length is unknown, EOF has
	// not been seen, and the captured bytes already form a complete
	// JSON document.
	shouldDrainUnknownLengthJSON := contentLength < 0 &&
		!sawEOF &&
		bytesRead > 0 &&
		!truncated &&
		isCompleteUnknownLengthJSONBody(contentType, responseBody)

	// Always close the inner reader first so that stalled chunked
	// bodies cannot block drainToEOF indefinitely.  Once inner is
	// closed, reads return immediately with an error or EOF.
	var closeErr error
	r.closeOnce.Do(func() {
		closeErr = r.inner.Close()
	})
	if closeErr != nil {
		// Hold r.mu across the flag check AND the publish/replace so a
		// concurrent recordProvisional cannot slip its recordOnce
		// publish between our read of recordedProvisional and our call
		// into the sink. Without this serialization, Close() could
		// observe recordedProvisional=false, then lose the race and
		// see r.record(closeErr) become a no-op once recordOnce has
		// already fired from the SSE EOF path.
		r.mu.Lock()
		if r.recordedProvisional {
			// The SSE EOF path already appended a completed attempt.
			// inner.Close() surfaced a transport error, so upgrade
			// that entry to failed instead of losing the close error.
			upgraded := r.buildAttemptLocked(closeErr)
			r.sink.replaceByNumber(upgraded.Number, upgraded)
			r.recordedProvisional = false
		} else {
			r.recordOnce.Do(func() {
				r.sink.record(r.buildAttemptLocked(closeErr))
			})
		}
		r.mu.Unlock()
		return closeErr
	}

	// Drain remaining bytes that may already be buffered inside the
	// HTTP transport after close.  Because inner is closed, this
	// finishes immediately rather than blocking on the network.
	if shouldDrainUnknownLengthJSON {
		// Best-effort drain; ignore errors since inner is closed.
		_ = r.drainToEOF()
	}

	// Re-read the state: the drain above may have changed it.
	r.mu.Lock()
	sawEOF = r.sawEOF
	bytesRead = r.bytesRead
	contentLength = r.contentLength
	truncated = r.truncated
	responseBody = append([]byte(nil), r.buf.Bytes()...)
	r.mu.Unlock()

	// Case order matters: the first matching case decides the status.
	switch {
	// Only check JSON completeness when the recording buffer is
	// not truncated. A truncated buffer is an incomplete prefix
	// of the body, so the completeness check would false-positive.
	case sawEOF && !truncated && contentLength < 0 && isJSONLikeContentType(contentType) && !isCompleteUnknownLengthJSONBody(contentType, responseBody):
		r.record(io.ErrUnexpectedEOF)
	case sawEOF:
		r.record(io.EOF)
	case responseHasNoBody(r.base.Method, r.base.ResponseStatus):
		r.record(nil)
	case contentLength >= 0 && bytesRead >= contentLength:
		r.record(nil)
	case contentLength < 0 && !truncated && isCompleteUnknownLengthJSONBody(contentType, responseBody):
		r.record(nil)
	// Truncated unknown-length bodies: the caller consumed the
	// response successfully but the recording buffer exceeded
	// maxRecordedResponseBodyBytes. This is not a transport
	// failure - mark as completed with the truncated capture.
	case contentLength < 0 && truncated:
		r.record(nil)
	default:
		r.record(io.ErrUnexpectedEOF)
	}
	return nil
}

// responseHasNoBody reports whether a response to the given method with
// the given status code carries no body: any response to HEAD, plus
// 1xx, 204 No Content and 304 Not Modified responses.
func responseHasNoBody(method string, statusCode int) bool {
	switch {
	case method == http.MethodHead:
		return true
	case statusCode == http.StatusNoContent, statusCode == http.StatusNotModified:
		return true
	default:
		return 100 <= statusCode && statusCode <= 199
	}
}

// parseMediaType returns the media type portion of a Content-Type
// header value. It prefers mime.ParseMediaType; when that rejects the
// header, it falls back to the text before the first ";", trimmed and
// lower-cased.
func parseMediaType(contentType string) string {
	if parsed, _, err := mime.ParseMediaType(contentType); err == nil {
		return parsed
	}
	head, _, _ := strings.Cut(contentType, ";")
	return strings.ToLower(strings.TrimSpace(head))
}

// isJSONLikeContentType reports whether contentType names
// application/json or a media type with a "+json" suffix.
func isJSONLikeContentType(contentType string) bool {
	mt := parseMediaType(contentType)
	if mt == "application/json" {
		return true
	}
	return strings.HasSuffix(mt, "+json")
}

// isNDJSONContentType reports whether contentType is
// newline-delimited JSON (application/x-ndjson).
func isNDJSONContentType(contentType string) bool {
	const ndjsonMediaType = "application/x-ndjson"
	mt := parseMediaType(contentType)
	return mt == ndjsonMediaType
}

// isSSEContentType reports whether contentType describes a
// server-sent-events stream (text/event-stream).
func isSSEContentType(contentType string) bool {
	const sseMediaType = "text/event-stream"
	mt := parseMediaType(contentType)
	return mt == sseMediaType
}

// maxDrainBytes is the upper bound on the number of trailing bytes
// drainToEOF will consume, so that Close() cannot block indefinitely
// on a misbehaving or extremely large chunked body.
const maxDrainBytes = 64 << 10 // 64 KiB

// drainToEOF reads the wrapped body in 4 KiB chunks, feeding each chunk
// through accumulateReadLocked. It returns nil on io.EOF, the read
// error on any other failure, and io.ErrUnexpectedEOF if at least
// maxDrainBytes have been consumed without reaching EOF.
func (r *recordingBody) drainToEOF() error {
	var (
		scratch = make([]byte, 4*1024)
		total   int64
	)

	// The loop condition is the safety valve: stop draining after
	// maxDrainBytes to prevent Close() from blocking indefinitely on
	// a chunked body.
	for total < maxDrainBytes {
		n, err := r.inner.Read(scratch)

		r.mu.Lock()
		r.accumulateReadLocked(scratch, n, err)
		r.mu.Unlock()
		total += int64(n)

		switch {
		case err == nil:
			// Keep draining.
		case errors.Is(err, io.EOF):
			return nil
		default:
			return err
		}
	}
	return io.ErrUnexpectedEOF
}

// isCompleteUnknownLengthJSONBody reports whether body, for a JSON-like
// contentType, consists of exactly one decodable JSON value with
// nothing but whitespace around it. Non-JSON content types and empty
// bodies yield false.
func isCompleteUnknownLengthJSONBody(contentType string, body []byte) bool {
	if !isJSONLikeContentType(contentType) {
		return false
	}

	payload := bytes.TrimSpace(body)
	if len(payload) == 0 {
		return false
	}

	dec := json.NewDecoder(bytes.NewReader(payload))

	// The first value must decode cleanly.
	var first any
	if dec.Decode(&first) != nil {
		return false
	}

	// A second decode must hit EOF; anything else means trailing data.
	var trailing any
	err := dec.Decode(&trailing)
	return errors.Is(err, io.EOF)
}

// buildAttemptLocked assembles the final Attempt from r.base, the bytes
// buffered so far, and err. Both the record-once append path and the
// provisional-upgrade replace path go through it, so redaction and
// status rules are applied identically at each site. A nil or io.EOF
// err yields a completed attempt; any other err yields a failed one
// with a sanitized error string. The caller must hold r.mu for the
// duration of the call.
func (r *recordingBody) buildAttemptLocked(err error) Attempt {
	now := time.Now()

	attempt := r.base
	captured := append([]byte(nil), r.buf.Bytes()...)
	mediaHeader := r.contentType

	if r.truncated {
		attempt.ResponseBody = []byte("[TRUNCATED]")
	} else if isNDJSONContentType(mediaHeader) {
		attempt.ResponseBody = RedactNDJSONSecrets(captured)
	} else if mediaHeader == "" || isJSONLikeContentType(mediaHeader) {
		// Redact JSON secrets when the content type is JSON-like
		// or absent (unknown). For unknown types, RedactJSONSecrets
		// fails closed by replacing non-JSON payloads with a
		// diagnostic message.
		attempt.ResponseBody = RedactJSONSecrets(captured)
	} else {
		// Non-JSON content types (SSE, text/plain, HTML, etc.)
		// are preserved as-is to avoid losing debug content.
		attempt.ResponseBody = captured
	}

	attempt.StartedAt = r.startedAt.UTC().Format(time.RFC3339Nano)
	attempt.FinishedAt = now.UTC().Format(time.RFC3339Nano)
	// Recompute duration to include body read time.
	attempt.DurationMs = now.Sub(r.startedAt).Milliseconds()

	attempt.Status = attemptStatusCompleted
	if err != nil && !errors.Is(err, io.EOF) {
		attempt.Error = sanitizeErrorString(err.Error())
		attempt.Status = attemptStatusFailed
	}
	return attempt
}

// record publishes the attempt built from err to the sink. Only the
// first publish through recordOnce takes effect; later calls are no-ops.
//
// record acquires r.mu before entering recordOnce.Do so it shares a
// single lock-acquisition order with recordProvisional. Without this,
// a concurrent Read (in recordProvisional, holding r.mu) and Close (in
// record, about to take r.mu inside the Do callback) would deadlock:
// the Do winner would block on r.mu while the loser would block on
// recordOnce. Callers must not hold r.mu.
func (r *recordingBody) record(err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.recordOnce.Do(func() {
		// r.mu is held here, as buildAttemptLocked requires.
		r.sink.record(r.buildAttemptLocked(err))
	})
}

// recordProvisional records err via recordOnce and marks the entry as
// eligible for a later upgrade from Close(). Safe to call multiple
// times; only the first call appends. The publish and the provisional
// flag are committed atomically under r.mu so a concurrent Close()
// that takes r.mu to inspect the flag cannot observe a half-finished
// state where the attempt is in the sink but recordedProvisional is
// still false. Callers must not hold r.mu.
func (r *recordingBody) recordProvisional(err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.recordOnce.Do(func() {
		r.sink.record(r.buildAttemptLocked(err))
		// Set only when this call actually published, so Close() never
		// tries to upgrade an entry that another path recorded.
		r.recordedProvisional = true
	})
}