Files
coder/coderd/database/queries/chatdebug.sql
T
Michael Suchacz 2874d4b4cd feat: add chat debug retention purge (#24943)
> Mux is acting on Mike's behalf.

Adds configurable retention for chat debug data, including the purge
query, updated_at index, site config, experimental API, SDK types,
frontend lifecycle setting, and docs.

The purge deletes debug runs older than the configured retention window
and relies on existing cascades to delete steps. The default retention
is 30 days, and setting the value to 0 disables the purge.
2026-05-05 22:37:13 +02:00

309 lines
12 KiB
SQL

-- Retention is measured from updated_at (see DeleteOldChatDebugRuns),
-- so every write path has to stamp it.
-- name: InsertChatDebugRun :one
-- Creates a debug run and returns the inserted row. Optional arguments
-- that arrive as NULL fall back to defaults: summary becomes an empty
-- JSON object, started_at and updated_at become NOW(). finished_at
-- stays NULL unless the caller supplies a value.
INSERT INTO chat_debug_runs (
    chat_id,
    root_chat_id,
    parent_chat_id,
    model_config_id,
    trigger_message_id,
    history_tip_message_id,
    kind,
    status,
    provider,
    model,
    summary,
    started_at,
    updated_at,
    finished_at
)
VALUES (
    @chat_id::uuid,
    sqlc.narg('root_chat_id')::uuid,
    sqlc.narg('parent_chat_id')::uuid,
    sqlc.narg('model_config_id')::uuid,
    sqlc.narg('trigger_message_id')::bigint,
    sqlc.narg('history_tip_message_id')::bigint,
    @kind::text,
    @status::text,
    sqlc.narg('provider')::text,
    sqlc.narg('model')::text,
    COALESCE(sqlc.narg('summary')::jsonb, '{}'::jsonb),
    COALESCE(sqlc.narg('started_at')::timestamptz, NOW()),
    COALESCE(sqlc.narg('updated_at')::timestamptz, NOW()),
    sqlc.narg('finished_at')::timestamptz
)
RETURNING *;
-- name: UpdateChatDebugRun :one
-- Patches a debug run identified by (id, chat_id) and returns the
-- updated row. A NULL argument means "leave the stored value alone"
-- (COALESCE), which fits the write-once-finalize lifecycle of debug
-- rows: fields are set at creation or finalization and are never
-- reset to NULL.
--
-- updated_at is always overwritten with @now so it follows the
-- caller's clock. It is also the retention clock read by
-- DeleteOldChatDebugRuns.
--
-- finished_at is write-once at the SQL level: the stored value takes
-- precedence over the argument, so a status or summary refresh that
-- arrives after the run has finalized cannot move the original
-- completion timestamp. Duration and ordering calculations therefore
-- stay stable no matter how often the row is updated.
UPDATE chat_debug_runs
SET
    root_chat_id = COALESCE(sqlc.narg('root_chat_id')::uuid, root_chat_id),
    parent_chat_id = COALESCE(sqlc.narg('parent_chat_id')::uuid, parent_chat_id),
    model_config_id = COALESCE(sqlc.narg('model_config_id')::uuid, model_config_id),
    trigger_message_id = COALESCE(sqlc.narg('trigger_message_id')::bigint, trigger_message_id),
    history_tip_message_id = COALESCE(sqlc.narg('history_tip_message_id')::bigint, history_tip_message_id),
    status = COALESCE(sqlc.narg('status')::text, status),
    provider = COALESCE(sqlc.narg('provider')::text, provider),
    model = COALESCE(sqlc.narg('model')::text, model),
    summary = COALESCE(sqlc.narg('summary')::jsonb, summary),
    finished_at = COALESCE(finished_at, sqlc.narg('finished_at')::timestamptz),
    updated_at = @now::timestamptz
WHERE id = @id::uuid
    AND chat_id = @chat_id::uuid
RETURNING *;
-- name: InsertChatDebugStep :one
-- Inserts a step under an unfinished run and returns the new row.
--
-- The CTE locks the parent run with an UPDATE and bumps its
-- updated_at in the same statement, so no separate
-- TouchChatDebugRunUpdatedAt call is needed. It is also the
-- finalization guard: when the run already has finished_at set (or
-- the run_id/chat_id pair does not match), the UPDATE returns zero
-- rows, the INSERT has no source rows, and the caller gets
-- sql.ErrNoRows. The UPDATE additionally serializes this statement
-- with a concurrent FinalizeStale under READ COMMITTED isolation.
--
-- NULL arguments default as follows: normalized_request and metadata
-- to an empty JSON object, attempts to an empty JSON array, and
-- started_at / updated_at to NOW().
WITH parent_run AS (
    UPDATE chat_debug_runs
    SET updated_at = COALESCE(sqlc.narg('updated_at')::timestamptz, NOW())
    WHERE id = @run_id::uuid
        AND chat_id = @chat_id::uuid
        AND finished_at IS NULL
    RETURNING chat_id
)
INSERT INTO chat_debug_steps (
    run_id,
    chat_id,
    step_number,
    operation,
    status,
    history_tip_message_id,
    assistant_message_id,
    normalized_request,
    normalized_response,
    usage,
    attempts,
    error,
    metadata,
    started_at,
    updated_at,
    finished_at
)
SELECT
    @run_id::uuid,
    parent_run.chat_id,
    @step_number::int,
    @operation::text,
    @status::text,
    sqlc.narg('history_tip_message_id')::bigint,
    sqlc.narg('assistant_message_id')::bigint,
    COALESCE(sqlc.narg('normalized_request')::jsonb, '{}'::jsonb),
    sqlc.narg('normalized_response')::jsonb,
    sqlc.narg('usage')::jsonb,
    COALESCE(sqlc.narg('attempts')::jsonb, '[]'::jsonb),
    sqlc.narg('error')::jsonb,
    COALESCE(sqlc.narg('metadata')::jsonb, '{}'::jsonb),
    COALESCE(sqlc.narg('started_at')::timestamptz, NOW()),
    COALESCE(sqlc.narg('updated_at')::timestamptz, NOW()),
    sqlc.narg('finished_at')::timestamptz
FROM parent_run
RETURNING *;
-- name: UpdateChatDebugStep :one
-- Patches a debug step identified by (id, chat_id) and returns the
-- updated row. A NULL argument means "leave the stored value alone"
-- (COALESCE), which fits the write-once-finalize lifecycle of debug
-- rows: fields are set at creation or finalization and are never
-- reset to NULL.
--
-- updated_at is always overwritten with @now so it follows the
-- caller's clock, matching the injectable quartz.Clock used by
-- FinalizeStale sweeps.
--
-- Unlike UpdateChatDebugRun, finished_at is not write-once here: a
-- non-NULL argument replaces whatever value is already stored.
UPDATE chat_debug_steps
SET
    status = COALESCE(sqlc.narg('status')::text, status),
    history_tip_message_id = COALESCE(sqlc.narg('history_tip_message_id')::bigint, history_tip_message_id),
    assistant_message_id = COALESCE(sqlc.narg('assistant_message_id')::bigint, assistant_message_id),
    normalized_request = COALESCE(sqlc.narg('normalized_request')::jsonb, normalized_request),
    normalized_response = COALESCE(sqlc.narg('normalized_response')::jsonb, normalized_response),
    usage = COALESCE(sqlc.narg('usage')::jsonb, usage),
    attempts = COALESCE(sqlc.narg('attempts')::jsonb, attempts),
    error = COALESCE(sqlc.narg('error')::jsonb, error),
    metadata = COALESCE(sqlc.narg('metadata')::jsonb, metadata),
    finished_at = COALESCE(sqlc.narg('finished_at')::timestamptz, finished_at),
    updated_at = @now::timestamptz
WHERE id = @id::uuid
    AND chat_id = @chat_id::uuid
RETURNING *;
-- name: TouchChatDebugRunUpdatedAt :exec
-- Sets updated_at on one run to @now and leaves every other column
-- as it is.
--
-- This exists for tests: after the InsertChatDebugStep CTE has bumped
-- the run to NOW(), a test can stamp a specific timestamp so the
-- stale-row finalization paths run deterministically. The chatdebug
-- service does not call it; heartbeats use TouchChatDebugStepAndRun,
-- and step creation updates the parent run through the
-- InsertChatDebugStep CTE.
UPDATE chat_debug_runs
SET updated_at = @now::timestamptz
WHERE id = @id::uuid
    AND chat_id = @chat_id::uuid;
-- name: TouchChatDebugStepAndRun :exec
-- Heartbeat: sets updated_at to @now on a step and on its parent run
-- in one statement, so FinalizeStale cannot slip in between the two
-- touches and finalize a run whose step heartbeat was just written.
--
-- The step UPDATE joins against the run CTE through FROM and reads
-- its RETURNING rows. Per the PostgreSQL WITH semantics, RETURNING is
-- the only way to pass values from a data-modifying CTE to the main
-- query, and consuming those rows forces the run UPDATE to complete
-- before the step UPDATE. That is the same lock order that
-- FinalizeStaleChatDebugRows uses, which avoids a deadlock between
-- concurrent heartbeats and stale sweeps.
--
-- The join also restricts the step update to the given run, so a
-- mismatched (run_id, step_id) pair cannot silently refresh an
-- unrelated step.
WITH bumped_run AS (
    UPDATE chat_debug_runs
    SET updated_at = @now::timestamptz
    WHERE id = @run_id::uuid
        AND chat_id = @chat_id::uuid
    RETURNING id, chat_id
)
UPDATE chat_debug_steps
SET updated_at = @now::timestamptz
FROM bumped_run
WHERE chat_debug_steps.id = @step_id::uuid
    AND chat_debug_steps.run_id = bumped_run.id
    AND chat_debug_steps.chat_id = bumped_run.chat_id;
-- name: GetChatDebugRunsByChatID :many
-- Lists a chat's debug runs newest-first (started_at, then id as the
-- tiebreaker). The caller has to pass an explicit limit so the result
-- set is always bounded.
SELECT *
FROM chat_debug_runs
WHERE chat_id = @chat_id::uuid
ORDER BY
    started_at DESC,
    id DESC
LIMIT @limit_val::int;
-- name: GetChatDebugRunByID :one
-- Fetches a single debug run by its id. The lookup is not scoped to a
-- chat.
SELECT *
FROM chat_debug_runs
WHERE id = @id::uuid;
-- name: GetChatDebugStepsByRunID :many
-- Lists every step of a run in ascending step_number order, using
-- started_at (ascending) to order steps that share a step number.
SELECT *
FROM chat_debug_steps
WHERE run_id = @run_id::uuid
ORDER BY
    step_number,
    started_at;
-- name: DeleteChatDebugDataByChatID :execrows
-- Deletes the chat's debug runs that started strictly before
-- @started_before and reports how many rows were removed.
--
-- The started_before bound protects against retried cleanup: a run
-- created by a replacement turn that races ahead of the retry window
-- (for example, when an unarchive races with a pending
-- archive-cleanup retry) is left in place.
DELETE FROM chat_debug_runs
WHERE chat_id = @chat_id::uuid
    AND started_at < @started_before::timestamptz;
-- name: DeleteChatDebugDataAfterMessageID :execrows
-- Deletes the chat's debug runs (and their cascaded steps) that
-- reference a message ID above @message_id, and reports how many runs
-- were removed. A run qualifies when its own trigger or history-tip
-- message exceeds the cutoff, or when any of its steps has an
-- assistant or history-tip message that does.
--
-- The started_before bound prevents retried cleanup from deleting
-- runs created by a replacement turn that raced ahead of the retry
-- window.
WITH affected_runs AS (
    -- UNION already removes duplicate ids across and within both
    -- branches, so neither branch needs its own DISTINCT.
    SELECT run.id
    FROM chat_debug_runs AS run
    WHERE run.chat_id = @chat_id::uuid
        AND run.started_at < @started_before::timestamptz
        AND (
            run.history_tip_message_id > @message_id::bigint
            OR run.trigger_message_id > @message_id::bigint
        )
    UNION
    SELECT step.run_id AS id
    FROM chat_debug_steps AS step
    INNER JOIN chat_debug_runs AS run
        ON run.id = step.run_id
        AND run.chat_id = step.chat_id
    WHERE step.chat_id = @chat_id::uuid
        AND run.started_at < @started_before::timestamptz
        AND (
            step.assistant_message_id > @message_id::bigint
            OR step.history_tip_message_id > @message_id::bigint
        )
)
DELETE FROM chat_debug_runs
WHERE chat_id = @chat_id::uuid
    AND id IN (SELECT id FROM affected_runs);
-- updated_at is the retention clock, so the window starts after the run
-- stops being written to.
-- Intentionally no finished_at IS NOT NULL guard: abandoned in-flight rows
-- older than the cutoff are also purged.
-- name: DeleteOldChatDebugRuns :execrows
-- Purges at most @limit_count runs whose updated_at is strictly older
-- than @before_time, oldest first, and reports how many were removed.
-- The count can be lower than the number of candidates selected when
-- a candidate is written to while the purge is running.
WITH deletable AS (
    SELECT id, chat_id
    FROM chat_debug_runs
    WHERE updated_at < @before_time::timestamptz
    ORDER BY updated_at ASC
    LIMIT @limit_count::int
)
DELETE FROM chat_debug_runs
USING deletable
WHERE chat_debug_runs.id = deletable.id
    AND chat_debug_runs.chat_id = deletable.chat_id
    -- Repeat the cutoff on the target row. Under READ COMMITTED a row
    -- that was concurrently updated is re-checked against this WHERE
    -- clause, so a run whose updated_at was bumped after the CTE
    -- picked it is skipped instead of purged.
    AND chat_debug_runs.updated_at < @before_time::timestamptz;
-- name: FinalizeStaleChatDebugRows :one
-- Marks orphaned in-progress rows as interrupted so they do not stay
-- in a non-terminal state forever. The NOT IN list must match the
-- terminal statuses defined by ChatDebugStatus in codersdk/chats.go.
--
-- A row is treated as stale when its updated_at is strictly older
-- than @updated_before, its finished_at is NULL, and its status is
-- not one of the terminal values. Stale rows get status
-- 'interrupted', and both updated_at and finished_at are set to @now.
--
-- The steps CTE also catches steps whose parent run was just finalized
-- (via run_id IN), because PostgreSQL data-modifying CTEs share the
-- same snapshot and cannot see each other's row updates. Without this,
-- a step with a recent updated_at would survive its run's finalization
-- and remain in 'in_progress' state permanently.
--
-- @now is the caller's clock timestamp so that mock-clock tests stay
-- consistent with the @updated_before cutoff.
--
-- Returns a single row with two counters: runs_finalized and
-- steps_finalized, the number of rows each CTE updated.
WITH finalized_runs AS (
UPDATE chat_debug_runs
SET
status = 'interrupted',
updated_at = @now::timestamptz,
finished_at = @now::timestamptz
WHERE updated_at < @updated_before::timestamptz
AND finished_at IS NULL
AND status NOT IN ('completed', 'error', 'interrupted')
RETURNING id
), finalized_steps AS (
UPDATE chat_debug_steps
SET
status = 'interrupted',
updated_at = @now::timestamptz,
finished_at = @now::timestamptz
WHERE (
updated_at < @updated_before::timestamptz
OR run_id IN (SELECT id FROM finalized_runs)
)
AND finished_at IS NULL
AND status NOT IN ('completed', 'error', 'interrupted')
RETURNING 1
)
SELECT
(SELECT COUNT(*) FROM finalized_runs)::bigint AS runs_finalized,
(SELECT COUNT(*) FROM finalized_steps)::bigint AS steps_finalized;