mirror of
https://github.com/coder/coder.git
synced 2026-06-04 13:38:21 +00:00
59b71f296f
Closes #21440

The `TestDBPurgeAuthorization` test was overfitting by calling each purge method individually, which reimplemented dbpurge logic in the test and created a maintenance burden. When new purge steps are added, they either need to be reflected in the test or there will be a testing blind spot.

This change extracts the `doTick` closure into an exported `PurgeTick` function that returns an error, making the core purge logic testable. The test now calls `PurgeTick` directly to exercise the actual dbpurge behavior rather than reimplementing it. Retention values are configured to ensure all purge operations run, so we test RBAC permissions for all code paths.

- Tests actual dbpurge behavior instead of reimplementing it
- Automatically covers new purge steps when they're added
- Still validates that all operations have proper RBAC permissions

The test focuses on authorization (checking for RBAC errors) rather than verifying deletion behavior, which is already covered by other tests like `TestDeleteExpiredAPIKeys` and `TestDeleteOldAuditLogs`.
256 lines
9.4 KiB
Go
256 lines
9.4 KiB
Go
package dbpurge
|
|
|
|
import (
|
|
"context"
|
|
"io"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
"github.com/coder/coder/v2/coderd/database/dbtime"
|
|
"github.com/coder/coder/v2/coderd/pproflabel"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
const (
	// delay is the interval of the purge ticker: the ticker is created with
	// it and reset to it after every purge iteration.
	delay = 10 * time.Minute
	// Connection events are now inserted into the `connection_logs` table.
	// We'll slowly remove old connection events from the `audit_logs` table.
	// The `connection_logs` table is purged based on the configured retention.
	maxAuditLogConnectionEventAge    = 90 * 24 * time.Hour // 90 days
	auditLogConnectionEventBatchSize = 1000
	// Batch size for connection log deletion.
	connectionLogsBatchSize = 10000
	// Batch size for audit log deletion.
	auditLogsBatchSize = 10000
	// Telemetry heartbeats are used to deduplicate events across replicas. We
	// don't need to persist heartbeat rows for longer than 24 hours, as they
	// are only used for deduplication across replicas. The time needs to be
	// long enough to cover the maximum interval of a heartbeat event (currently
	// 1 hour) plus some buffer.
	maxTelemetryHeartbeatAge = 24 * time.Hour
)
|
|
|
|
// New creates a new periodically purging database instance.
|
|
// It is the caller's responsibility to call Close on the returned instance.
|
|
//
|
|
// This is for cleaning up old, unused resources from the database that take up space.
|
|
func New(ctx context.Context, logger slog.Logger, db database.Store, vals *codersdk.DeploymentValues, clk quartz.Clock, reg prometheus.Registerer) io.Closer {
|
|
closed := make(chan struct{})
|
|
|
|
ctx, cancelFunc := context.WithCancel(ctx)
|
|
//nolint:gocritic // Use dbpurge-specific subject with minimal permissions.
|
|
ctx = dbauthz.AsDBPurge(ctx)
|
|
|
|
iterationDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "dbpurge",
|
|
Name: "iteration_duration_seconds",
|
|
Help: "Duration of each dbpurge iteration in seconds.",
|
|
Buckets: []float64{1, 5, 10, 30, 60, 300, 600}, // 1s to 10min
|
|
}, []string{"success"})
|
|
reg.MustRegister(iterationDuration)
|
|
|
|
recordsPurged := prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "dbpurge",
|
|
Name: "records_purged_total",
|
|
Help: "Total number of records purged by type.",
|
|
}, []string{"record_type"})
|
|
reg.MustRegister(recordsPurged)
|
|
|
|
inst := &instance{
|
|
cancel: cancelFunc,
|
|
closed: closed,
|
|
logger: logger,
|
|
vals: vals,
|
|
clk: clk,
|
|
iterationDuration: iterationDuration,
|
|
recordsPurged: recordsPurged,
|
|
}
|
|
|
|
// Start the ticker with the initial delay.
|
|
ticker := clk.NewTicker(delay)
|
|
doTick := func(ctx context.Context, start time.Time) {
|
|
defer ticker.Reset(delay)
|
|
err := inst.purgeTick(ctx, db, start)
|
|
if err != nil {
|
|
logger.Error(ctx, "failed to purge old database entries", slog.Error(err))
|
|
|
|
// Record metrics for failed purge iteration.
|
|
duration := clk.Since(start)
|
|
iterationDuration.WithLabelValues("false").Observe(duration.Seconds())
|
|
}
|
|
}
|
|
|
|
pproflabel.Go(ctx, pproflabel.Service(pproflabel.ServiceDBPurge), func(ctx context.Context) {
|
|
defer close(closed)
|
|
defer ticker.Stop()
|
|
// Force an initial tick.
|
|
doTick(ctx, dbtime.Time(clk.Now()).UTC())
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case tick := <-ticker.C:
|
|
ticker.Stop()
|
|
doTick(ctx, dbtime.Time(tick).UTC())
|
|
}
|
|
}
|
|
})
|
|
return inst
|
|
}
|
|
|
|
// purgeTick performs a single purge iteration. It returns an error if the
|
|
// purge fails.
|
|
func (i *instance) purgeTick(ctx context.Context, db database.Store, start time.Time) error {
|
|
// Start a transaction to grab advisory lock, we don't want to run
|
|
// multiple purges at the same time (multiple replicas).
|
|
return db.InTx(func(tx database.Store) error {
|
|
// Acquire a lock to ensure that only one instance of the
|
|
// purge is running at a time.
|
|
ok, err := tx.TryAcquireLock(ctx, database.LockIDDBPurge)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !ok {
|
|
i.logger.Debug(ctx, "unable to acquire lock for purging old database entries, skipping")
|
|
return nil
|
|
}
|
|
|
|
var purgedWorkspaceAgentLogs int64
|
|
workspaceAgentLogsRetention := i.vals.Retention.WorkspaceAgentLogs.Value()
|
|
if workspaceAgentLogsRetention > 0 {
|
|
deleteOldWorkspaceAgentLogsBefore := start.Add(-workspaceAgentLogsRetention)
|
|
purgedWorkspaceAgentLogs, err = tx.DeleteOldWorkspaceAgentLogs(ctx, deleteOldWorkspaceAgentLogsBefore)
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to delete old workspace agent logs: %w", err)
|
|
}
|
|
}
|
|
if err := tx.DeleteOldWorkspaceAgentStats(ctx); err != nil {
|
|
return xerrors.Errorf("failed to delete old workspace agent stats: %w", err)
|
|
}
|
|
if err := tx.DeleteOldProvisionerDaemons(ctx); err != nil {
|
|
return xerrors.Errorf("failed to delete old provisioner daemons: %w", err)
|
|
}
|
|
if err := tx.DeleteOldNotificationMessages(ctx); err != nil {
|
|
return xerrors.Errorf("failed to delete old notification messages: %w", err)
|
|
}
|
|
if err := tx.ExpirePrebuildsAPIKeys(ctx, dbtime.Time(start)); err != nil {
|
|
return xerrors.Errorf("failed to expire prebuilds user api keys: %w", err)
|
|
}
|
|
|
|
var expiredAPIKeys int64
|
|
apiKeysRetention := i.vals.Retention.APIKeys.Value()
|
|
if apiKeysRetention > 0 {
|
|
// Delete keys that have been expired for at least the retention period.
|
|
// A higher retention period allows the backend to return a more helpful
|
|
// error message when a user tries to use an expired key.
|
|
deleteExpiredKeysBefore := start.Add(-apiKeysRetention)
|
|
expiredAPIKeys, err = tx.DeleteExpiredAPIKeys(ctx, database.DeleteExpiredAPIKeysParams{
|
|
Before: dbtime.Time(deleteExpiredKeysBefore),
|
|
// There could be a lot of expired keys here, so set a limit to prevent
|
|
// this taking too long. This runs every 10 minutes, so it deletes
|
|
// ~1.5m keys per day at most.
|
|
LimitCount: 10000,
|
|
})
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to delete expired api keys: %w", err)
|
|
}
|
|
}
|
|
deleteOldTelemetryLocksBefore := start.Add(-maxTelemetryHeartbeatAge)
|
|
if err := tx.DeleteOldTelemetryLocks(ctx, deleteOldTelemetryLocksBefore); err != nil {
|
|
return xerrors.Errorf("failed to delete old telemetry locks: %w", err)
|
|
}
|
|
|
|
deleteOldAuditLogConnectionEventsBefore := start.Add(-maxAuditLogConnectionEventAge)
|
|
if err := tx.DeleteOldAuditLogConnectionEvents(ctx, database.DeleteOldAuditLogConnectionEventsParams{
|
|
BeforeTime: deleteOldAuditLogConnectionEventsBefore,
|
|
LimitCount: auditLogConnectionEventBatchSize,
|
|
}); err != nil {
|
|
return xerrors.Errorf("failed to delete old audit log connection events: %w", err)
|
|
}
|
|
|
|
var purgedAIBridgeRecords int64
|
|
aibridgeRetention := i.vals.AI.BridgeConfig.Retention.Value()
|
|
if aibridgeRetention > 0 {
|
|
deleteAIBridgeRecordsBefore := start.Add(-aibridgeRetention)
|
|
// nolint:gocritic // Needs to run as aibridge context.
|
|
purgedAIBridgeRecords, err = tx.DeleteOldAIBridgeRecords(dbauthz.AsAIBridged(ctx), deleteAIBridgeRecordsBefore)
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to delete old aibridge records: %w", err)
|
|
}
|
|
}
|
|
|
|
var purgedConnectionLogs int64
|
|
connectionLogsRetention := i.vals.Retention.ConnectionLogs.Value()
|
|
if connectionLogsRetention > 0 {
|
|
deleteConnectionLogsBefore := start.Add(-connectionLogsRetention)
|
|
purgedConnectionLogs, err = tx.DeleteOldConnectionLogs(ctx, database.DeleteOldConnectionLogsParams{
|
|
BeforeTime: deleteConnectionLogsBefore,
|
|
LimitCount: connectionLogsBatchSize,
|
|
})
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to delete old connection logs: %w", err)
|
|
}
|
|
}
|
|
|
|
var purgedAuditLogs int64
|
|
auditLogsRetention := i.vals.Retention.AuditLogs.Value()
|
|
if auditLogsRetention > 0 {
|
|
deleteAuditLogsBefore := start.Add(-auditLogsRetention)
|
|
purgedAuditLogs, err = tx.DeleteOldAuditLogs(ctx, database.DeleteOldAuditLogsParams{
|
|
BeforeTime: deleteAuditLogsBefore,
|
|
LimitCount: auditLogsBatchSize,
|
|
})
|
|
if err != nil {
|
|
return xerrors.Errorf("failed to delete old audit logs: %w", err)
|
|
}
|
|
}
|
|
|
|
i.logger.Debug(ctx, "purged old database entries",
|
|
slog.F("workspace_agent_logs", purgedWorkspaceAgentLogs),
|
|
slog.F("expired_api_keys", expiredAPIKeys),
|
|
slog.F("aibridge_records", purgedAIBridgeRecords),
|
|
slog.F("connection_logs", purgedConnectionLogs),
|
|
slog.F("audit_logs", purgedAuditLogs),
|
|
slog.F("duration", i.clk.Since(start)),
|
|
)
|
|
|
|
if i.iterationDuration != nil {
|
|
duration := i.clk.Since(start)
|
|
i.iterationDuration.WithLabelValues("true").Observe(duration.Seconds())
|
|
}
|
|
if i.recordsPurged != nil {
|
|
i.recordsPurged.WithLabelValues("workspace_agent_logs").Add(float64(purgedWorkspaceAgentLogs))
|
|
i.recordsPurged.WithLabelValues("expired_api_keys").Add(float64(expiredAPIKeys))
|
|
i.recordsPurged.WithLabelValues("aibridge_records").Add(float64(purgedAIBridgeRecords))
|
|
i.recordsPurged.WithLabelValues("connection_logs").Add(float64(purgedConnectionLogs))
|
|
i.recordsPurged.WithLabelValues("audit_logs").Add(float64(purgedAuditLogs))
|
|
}
|
|
|
|
return nil
|
|
}, database.DefaultTXOptions().WithID("db_purge"))
|
|
}
|
|
|
|
type instance struct {
|
|
cancel context.CancelFunc
|
|
closed chan struct{}
|
|
logger slog.Logger
|
|
vals *codersdk.DeploymentValues
|
|
clk quartz.Clock
|
|
iterationDuration *prometheus.HistogramVec
|
|
recordsPurged *prometheus.CounterVec
|
|
}
|
|
|
|
func (i *instance) Close() error {
|
|
i.cancel()
|
|
<-i.closed
|
|
return nil
|
|
}
|