This commit is contained in:
Hugo Dutka
2026-05-22 11:22:17 +00:00
parent ef9bcfc335
commit 658a04d28f
65 changed files with 16045 additions and 15422 deletions
+30 -315
View File
@@ -1,18 +1,12 @@
package dbpurge
import (
"cmp"
"context"
"errors"
"io"
"net/http"
"slices"
"strconv"
"sync/atomic"
"time"
"github.com/dustin/go-humanize"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"
@@ -21,9 +15,7 @@ import (
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/pproflabel"
"github.com/coder/coder/v2/coderd/util/slice"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/quartz"
)
@@ -52,18 +44,8 @@ const (
// Chat debug run deletions can cascade into steps with large JSONB
// payloads, so they use the same conservative batch size.
chatDebugRunsBatchSize = 1000
// chatAutoArchiveDigestMaxChats bounds how many chat titles a
// single digest body lists. Past the cap, surplus titles are
// summarized as "...and N more". 25 is a readable email-friendly
// length; the cap is unrelated to chatAutoArchiveBatchSize, which
// bounds work per tick.
chatAutoArchiveDigestMaxChats = 25
)
// defaultChatAutoArchiveBatchSize bounds how many root chats one
// tick will archive by default.
const defaultChatAutoArchiveBatchSize int32 = 1000
type Option func(*instance)
// WithClock overrides the clock used by the purger. Defaults to
@@ -72,34 +54,12 @@ func WithClock(clk quartz.Clock) Option {
return func(i *instance) { i.clk = clk }
}
// WithChatAutoArchiveBatchSize overrides how many root chats a
// single tick will auto-archive. Defaults to
// defaultChatAutoArchiveBatchSize (1000).
func WithChatAutoArchiveBatchSize(n int32) Option {
return func(i *instance) { i.chatAutoArchiveBatchSize = n }
}
// WithNotificationsEnqueuer sets the enqueuer used for digest
// notifications. Defaults to notifications.NewNoopEnqueuer(). Panics
// if e is nil: a nil enqueuer would NPE on the first dispatch tick,
// and failing fast at option-apply time surfaces the misuse at
// startup rather than minutes later.
func WithNotificationsEnqueuer(e notifications.Enqueuer) Option {
if e == nil {
panic("developer error: WithNotificationsEnqueuer called with nil enqueuer")
}
return func(i *instance) { i.enqueuer = e }
}
// New creates a new periodically purging database instance.
// Callers must Close the returned instance.
//
// The auditor pointer is loaded on each dispatch tick so runtime
// entitlement changes (e.g. toggling the audit-log feature) take
// effect without restarting the process. Notifications enqueuer
// defaults to no-op. Use WithNotificationsEnqueuer to pass a real
// one.
func New(ctx context.Context, logger slog.Logger, db database.Store, vals *codersdk.DeploymentValues, reg prometheus.Registerer, auditor *atomic.Pointer[audit.Auditor], opts ...Option) io.Closer {
// The auditor pointer is accepted for compatibility with other background
// services. Dbpurge does not emit audit logs directly.
func New(ctx context.Context, logger slog.Logger, db database.Store, vals *codersdk.DeploymentValues, reg prometheus.Registerer, _ *atomic.Pointer[audit.Auditor], opts ...Option) io.Closer {
closed := make(chan struct{})
ctx, cancelFunc := context.WithCancel(ctx)
@@ -123,26 +83,14 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, vals *coder
}, []string{"record_type"})
reg.MustRegister(recordsPurged)
chatAutoArchiveRecords := prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "coderd",
Subsystem: "chat_auto_archive",
Name: "records_archived_total",
Help: "Total number of chats archived by the auto-archive job (counting both roots and cascaded children).",
})
reg.MustRegister(chatAutoArchiveRecords)
inst := &instance{
cancel: cancelFunc,
closed: closed,
logger: logger,
vals: vals,
clk: quartz.NewReal(),
auditor: auditor,
enqueuer: notifications.NewNoopEnqueuer(),
iterationDuration: iterationDuration,
recordsPurged: recordsPurged,
chatAutoArchiveRecords: chatAutoArchiveRecords,
chatAutoArchiveBatchSize: defaultChatAutoArchiveBatchSize,
cancel: cancelFunc,
closed: closed,
logger: logger,
vals: vals,
clk: quartz.NewReal(),
iterationDuration: iterationDuration,
recordsPurged: recordsPurged,
}
for _, opt := range opts {
opt(inst)
@@ -185,19 +133,14 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, vals *coder
func (i *instance) purgeTick(ctx context.Context, db database.Store, start time.Time) error {
// Read chat configs outside the tx so a corrupt value can't
// poison subsequent queries. On config read errors, log and stash
// the error, then run unrelated purges best-effort. Retention and
// auto-archive errors skip only the conversation purge and
// auto-archive work. Debug retention errors skip only the debug
// purge. purgeTick returns chatConfigErr after the tx so the failed
// iteration is operator-visible via metric and logs.
// the error, then run unrelated purges best-effort. Retention
// errors skip only the conversation purge. Debug retention errors
// skip only the debug purge. purgeTick returns chatConfigErr after
// the tx so the failed iteration is operator-visible via metric and
// logs.
chatRetentionDays, chatRetentionErr := db.GetChatRetentionDays(ctx)
if chatRetentionErr != nil {
i.logger.Error(ctx, "failed to read chat retention config: skipping chat purge and auto-archive this tick", slog.Error(chatRetentionErr))
}
chatAutoArchiveDays, chatAutoArchiveErr := db.GetChatAutoArchiveDays(ctx, codersdk.DefaultChatAutoArchiveDays)
if chatAutoArchiveErr != nil {
i.logger.Error(ctx, "failed to read chat auto-archive config: skipping chat purge and auto-archive this tick", slog.Error(chatAutoArchiveErr))
i.logger.Error(ctx, "failed to read chat retention config: skipping chat purge this tick", slog.Error(chatRetentionErr))
}
chatDebugRetentionDays, chatDebugRetentionErr := db.GetChatDebugRetentionDays(ctx, codersdk.DefaultChatDebugRetentionDays)
@@ -205,11 +148,7 @@ func (i *instance) purgeTick(ctx context.Context, db database.Store, start time.
i.logger.Error(ctx, "failed to read chat debug retention config: skipping chat debug purge this tick", slog.Error(chatDebugRetentionErr))
}
chatRetentionConfigErr := errors.Join(chatRetentionErr, chatAutoArchiveErr)
chatConfigErr := errors.Join(chatRetentionConfigErr, chatDebugRetentionErr)
// Populated inside the tx; dispatched post-commit.
var archivedChats []database.AutoArchiveInactiveChatsRow
chatConfigErr := errors.Join(chatRetentionErr, chatDebugRetentionErr)
// Start a transaction to grab advisory lock, we don't want to run
// multiple purges at the same time (multiple replicas).
@@ -316,8 +255,8 @@ func (i *instance) purgeTick(ctx context.Context, db database.Store, start time.
}
var purgedChats, purgedChatFiles, purgedChatDebugRuns int64
if chatRetentionConfigErr == nil {
purgedChats, purgedChatFiles, archivedChats, err = i.purgeChatsInTx(ctx, tx, start, chatRetentionDays, chatAutoArchiveDays)
if chatRetentionErr == nil {
purgedChats, purgedChatFiles, err = i.purgeChatsInTx(ctx, tx, start, chatRetentionDays)
if err != nil {
return xerrors.Errorf("failed to purge chats: %w", err)
}
@@ -345,7 +284,6 @@ func (i *instance) purgeTick(ctx context.Context, db database.Store, start time.
slog.F("chats", purgedChats),
slog.F("chat_files", purgedChatFiles),
slog.F("chat_debug_runs", purgedChatDebugRuns),
slog.F("auto_archived_chats", len(archivedChats)),
slog.F("duration", i.clk.Since(start)),
)
@@ -379,35 +317,17 @@ func (i *instance) purgeTick(ctx context.Context, db database.Store, start time.
return xerrors.Errorf("chat config read failed this tick: %w", chatConfigErr)
}
// Dispatch audits and digests post-commit. Detached context for audit
// so that ticker cancellation cannot truncate the audit trail.
// Notification enqueue uses the cancellable parent context to avoid
// stalling shutdown.
// Owners with more eligible chats than batch size will get a
// notification per tick until their backlog drains.
// If this is deemed too noisy, users can disable the
// "Chats Auto-Archived" template from their notification preferences.
if len(archivedChats) > 0 {
i.chatAutoArchiveRecords.Add(float64(len(archivedChats)))
auditCtx := context.WithoutCancel(ctx)
i.dispatchChatAutoArchive(auditCtx, ctx, start, chatAutoArchiveDays, chatRetentionDays, archivedChats)
}
return nil
}
type instance struct {
cancel context.CancelFunc
closed chan struct{}
logger slog.Logger
vals *codersdk.DeploymentValues
clk quartz.Clock
auditor *atomic.Pointer[audit.Auditor]
enqueuer notifications.Enqueuer
iterationDuration *prometheus.HistogramVec
recordsPurged *prometheus.CounterVec
chatAutoArchiveRecords prometheus.Counter
chatAutoArchiveBatchSize int32
cancel context.CancelFunc
closed chan struct{}
logger slog.Logger
vals *codersdk.DeploymentValues
clk quartz.Clock
iterationDuration *prometheus.HistogramVec
recordsPurged *prometheus.CounterVec
}
func (i *instance) Close() error {
@@ -416,73 +336,8 @@ func (i *instance) Close() error {
return nil
}
// chatFromAutoArchiveRow reshapes the query row into a database.Chat for
// audit.Auditable[database.Chat].
func chatFromAutoArchiveRow(logger slog.Logger, r database.AutoArchiveInactiveChatsRow) database.Chat {
var labels database.StringMap
// sqlc's StringMap override doesn't reach CTE-aliased columns, so Labels
// arrives as raw JSON bytes. StringMap.Scan handles []byte and nil.
if err := labels.Scan([]byte(r.Labels)); err != nil {
logger.Warn(context.Background(), "failed to parse chat labels from auto-archive row",
slog.F("chat_id", r.ID),
slog.F("raw_labels", string(r.Labels)),
slog.Error(err),
)
}
var userACL database.ChatACL
if err := userACL.Scan([]byte(r.UserACL)); err != nil {
logger.Warn(context.Background(), "failed to parse chat user ACL from auto-archive row",
slog.F("chat_id", r.ID),
slog.F("raw_user_acl", string(r.UserACL)),
slog.Error(err),
)
}
var groupACL database.ChatACL
if err := groupACL.Scan([]byte(r.GroupACL)); err != nil {
logger.Warn(context.Background(), "failed to parse chat group ACL from auto-archive row",
slog.F("chat_id", r.ID),
slog.F("raw_group_acl", string(r.GroupACL)),
slog.Error(err),
)
}
return database.Chat{
ID: r.ID,
OwnerID: r.OwnerID,
OrganizationID: r.OrganizationID,
WorkspaceID: r.WorkspaceID,
BuildID: r.BuildID,
AgentID: r.AgentID,
Title: r.Title,
Status: r.Status,
WorkerID: r.WorkerID,
StartedAt: r.StartedAt,
HeartbeatAt: r.HeartbeatAt,
CreatedAt: r.CreatedAt,
UpdatedAt: r.UpdatedAt,
ParentChatID: r.ParentChatID,
RootChatID: r.RootChatID,
LastModelConfigID: r.LastModelConfigID,
Archived: r.Archived,
LastError: r.LastError,
Mode: r.Mode,
MCPServerIDs: r.MCPServerIDs,
Labels: labels,
UserACL: userACL,
GroupACL: groupACL,
PinOrder: r.PinOrder,
LastReadMessageID: r.LastReadMessageID,
LastInjectedContext: r.LastInjectedContext,
DynamicTools: r.DynamicTools,
PlanMode: r.PlanMode,
ClientType: r.ClientType,
}
}
// purgeChatsInTx MUST BE CALLED WITH A TRANSACTION
func (i *instance) purgeChatsInTx(ctx context.Context, tx database.Store, start time.Time, chatRetentionDays, chatAutoArchiveDays int32) (purgedChats, purgedChatFiles int64, archivedChats []database.AutoArchiveInactiveChatsRow, err error) {
func (*instance) purgeChatsInTx(ctx context.Context, tx database.Store, start time.Time, chatRetentionDays int32) (purgedChats, purgedChatFiles int64, err error) {
// Delete old archived chats first, then orphaned files
// (cascade clears chat_file_links but not chat_files).
if chatRetentionDays > 0 {
@@ -492,7 +347,7 @@ func (i *instance) purgeChatsInTx(ctx context.Context, tx database.Store, start
LimitCount: chatsBatchSize,
})
if err != nil {
return 0, 0, nil, xerrors.Errorf("failed to delete old chats: %w", err)
return 0, 0, xerrors.Errorf("failed to delete old chats: %w", err)
}
purgedChatFiles, err = tx.DeleteOldChatFiles(ctx, database.DeleteOldChatFilesParams{
@@ -500,149 +355,9 @@ func (i *instance) purgeChatsInTx(ctx context.Context, tx database.Store, start
LimitCount: chatFilesBatchSize,
})
if err != nil {
return 0, 0, nil, xerrors.Errorf("failed to delete old chat files: %w", err)
return 0, 0, xerrors.Errorf("failed to delete old chat files: %w", err)
}
}
// Auto-archive runs after the delete pass so newly
// archived chats aren't eligible for deletion this tick.
// Eligibility uses UTC day boundaries: a chat is archived on the
// start of the UTC day after its inactivity period has elapsed.
if chatAutoArchiveDays > 0 {
today := dbtime.StartOfDay(start)
archiveCutoff := today.Add(-time.Duration(chatAutoArchiveDays) * 24 * time.Hour)
archivedChats, err = tx.AutoArchiveInactiveChats(ctx, database.AutoArchiveInactiveChatsParams{
ArchiveCutoff: archiveCutoff,
LimitCount: i.chatAutoArchiveBatchSize,
})
if err != nil {
return 0, 0, nil, xerrors.Errorf("failed to auto-archive inactive chats: %w", err)
}
}
return purgedChats, purgedChatFiles, archivedChats, nil
}
// dispatchChatAutoArchive audits every archived root chat and enqueues one
// notification per owner covering the roots archived in this tick. Children
// inherit their root's archival decision and are skipped for audit, matching
// the manual archive path (patchChat audits the root only). Enqueue is
// per-tick: owners whose backlog spans multiple ticks receive multiple
// notifications; notification_messages dedupe does not collapse them because
// each tick's payload differs.
//
// auditCtx is detached from the ticker so audits always complete. enqueueCtx
// is the cancellable parent: on shutdown we abandon any remaining digests
// rather than blocking Close.
func (i *instance) dispatchChatAutoArchive(auditCtx, enqueueCtx context.Context, tickStart time.Time, autoArchiveDays, retentionDays int32, archived []database.AutoArchiveInactiveChatsRow) {
// Children inherit their root's archival decision and are skipped
// for both audit and digest. Partition once so the two loops
// cannot drift apart if the cascade shape ever changes.
roots := slice.Filter(archived, func(r database.AutoArchiveInactiveChatsRow) bool {
return !r.ParentChatID.Valid
})
auditor := *i.auditor.Load()
for _, row := range roots {
after := chatFromAutoArchiveRow(i.logger, row)
before := after
before.Archived = false
audit.BackgroundAudit(auditCtx, &audit.BackgroundAuditParams[database.Chat]{
Audit: auditor,
Log: i.logger,
UserID: row.OwnerID,
OrganizationID: row.OrganizationID,
Action: database.AuditActionWrite,
Old: before,
New: after,
Status: http.StatusOK,
AdditionalFields: audit.BackgroundTaskFieldsBytes(auditCtx, i.logger, audit.BackgroundSubsystemChatAutoArchive),
})
}
// Group archived roots by owner. Inline because this is the
// only call site and the loop body is self-explanatory.
rootsByOwner := make(map[uuid.UUID][]database.AutoArchiveInactiveChatsRow, len(roots))
for _, row := range roots {
rootsByOwner[row.OwnerID] = append(rootsByOwner[row.OwnerID], row)
}
// Sort owner IDs so shutdown abandons a deterministic tail of the dispatch list.
ownerIDs := make([]uuid.UUID, 0, len(rootsByOwner))
for id := range rootsByOwner {
ownerIDs = append(ownerIDs, id)
}
slices.SortFunc(ownerIDs, func(a, b uuid.UUID) int {
return cmp.Compare(a.String(), b.String())
})
dispatched := 0
for _, ownerID := range ownerIDs {
// Check between iterations so shutdown unblocks promptly. A
// hung in-flight enqueue is unblocked by enqueueCtx propagating
// cancellation into the DB call. Skipped owners are not
// re-notified on the next tick because AutoArchiveInactiveChats
// only returns rows with archived = false; we accept that
// tradeoff over hanging shutdown.
if err := enqueueCtx.Err(); err != nil {
i.logger.Warn(enqueueCtx, "chat auto-archive digest dispatch canceled",
slog.F("remaining_owners", len(ownerIDs)-dispatched),
slog.Error(err))
return
}
dispatched++
ownerRoots := rootsByOwner[ownerID]
data := buildDigestData(ownerRoots, autoArchiveDays, retentionDays, tickStart)
// nolint:gocritic // Background digest runs as the notifier subject.
if _, err := i.enqueuer.EnqueueWithData(
dbauthz.AsNotifier(enqueueCtx),
ownerID,
notifications.TemplateChatAutoArchiveDigest,
map[string]string{},
data,
string(audit.BackgroundSubsystemChatAutoArchive),
); err != nil {
i.logger.Warn(enqueueCtx, "failed to enqueue chat auto-archive digest",
slog.F("owner_id", ownerID),
slog.Error(err))
}
}
}
// buildDigestData builds the notification payload; shape mirrors the
// golden fixtures in coderd/notifications/testdata. Truncation keeps
// the oldest archived roots (created_at ASC from the query) to
// preserve index-driven ordering; revisit if the digest becomes the
// primary surface for reviewing archived chats.
func buildDigestData(rows []database.AutoArchiveInactiveChatsRow, autoArchiveDays, retentionDays int32, tickStart time.Time) map[string]any {
// Cap titles; overflow surfaces as "...and N more" via the template.
overflow := 0
if len(rows) > chatAutoArchiveDigestMaxChats {
overflow = len(rows) - chatAutoArchiveDigestMaxChats
rows = rows[:chatAutoArchiveDigestMaxChats]
}
chats := make([]map[string]any, 0, len(rows))
for _, r := range rows {
chats = append(chats, map[string]any{
"title": r.Title,
"last_activity_humanized": humanize.RelTime(r.LastActivityAt, tickStart, "ago", "from now"),
})
}
// Stringify the int32 config values: the template's
// {{if eq .Data.retention_days "0"}} branch requires both
// operands to share a type, and Go templates do not coerce
// numeric ↔ string. Storing a raw int here would silently
// take the deletion-warning branch on every notification.
data := map[string]any{
"auto_archive_days": strconv.Itoa(int(autoArchiveDays)),
"retention_days": strconv.Itoa(int(retentionDays)),
"archived_chats": chats,
}
if overflow > 0 {
data["additional_archived_count"] = strconv.Itoa(overflow)
}
return data
return purgedChats, purgedChatFiles, nil
}
File diff suppressed because it is too large Load Diff