feat(coderd/database/dbpurge): add retention for audit logs (#21025)

Add configurable retention policy for audit logs. The DeleteOldAuditLogs
query excludes deprecated connection events (connect, disconnect, open,
close) which are handled separately by DeleteOldAuditLogConnectionEvents.

Disabled (0) by default.

Depends on #21021
Updates #20743
This commit is contained in:
Mathias Fredriksson
2025-12-02 16:50:09 +02:00
committed by GitHub
parent fa7bbe2f55
commit c85d79bcdb
9 changed files with 296 additions and 0 deletions
+7
View File
@@ -1749,6 +1749,13 @@ func (q *querier) DeleteOldAuditLogConnectionEvents(ctx context.Context, thresho
return q.db.DeleteOldAuditLogConnectionEvents(ctx, threshold)
}
func (q *querier) DeleteOldAuditLogs(ctx context.Context, arg database.DeleteOldAuditLogsParams) (int64, error) {
if err := q.authorizeContext(ctx, policy.ActionDelete, rbac.ResourceSystem); err != nil {
return 0, err
}
return q.db.DeleteOldAuditLogs(ctx, arg)
}
func (q *querier) DeleteOldConnectionLogs(ctx context.Context, arg database.DeleteOldConnectionLogsParams) (int64, error) {
if err := q.authorizeContext(ctx, policy.ActionDelete, rbac.ResourceSystem); err != nil {
return 0, err
+4
View File
@@ -324,6 +324,10 @@ func (s *MethodTestSuite) TestAuditLogs() {
dbm.EXPECT().DeleteOldAuditLogConnectionEvents(gomock.Any(), database.DeleteOldAuditLogConnectionEventsParams{}).Return(nil).AnyTimes()
check.Args(database.DeleteOldAuditLogConnectionEventsParams{}).Asserts(rbac.ResourceSystem, policy.ActionDelete)
}))
s.Run("DeleteOldAuditLogs", s.Mocked(func(dbm *dbmock.MockStore, _ *gofakeit.Faker, check *expects) {
dbm.EXPECT().DeleteOldAuditLogs(gomock.Any(), database.DeleteOldAuditLogsParams{}).Return(int64(0), nil).AnyTimes()
check.Args(database.DeleteOldAuditLogsParams{}).Asserts(rbac.ResourceSystem, policy.ActionDelete)
}))
}
func (s *MethodTestSuite) TestConnectionLogs() {
@@ -410,6 +410,13 @@ func (m queryMetricsStore) DeleteOldAuditLogConnectionEvents(ctx context.Context
return r0
}
func (m queryMetricsStore) DeleteOldAuditLogs(ctx context.Context, arg database.DeleteOldAuditLogsParams) (int64, error) {
start := time.Now()
r0, r1 := m.s.DeleteOldAuditLogs(ctx, arg)
m.queryLatencies.WithLabelValues("DeleteOldAuditLogs").Observe(time.Since(start).Seconds())
return r0, r1
}
func (m queryMetricsStore) DeleteOldConnectionLogs(ctx context.Context, arg database.DeleteOldConnectionLogsParams) (int64, error) {
start := time.Now()
r0, r1 := m.s.DeleteOldConnectionLogs(ctx, arg)
+15
View File
@@ -753,6 +753,21 @@ func (mr *MockStoreMockRecorder) DeleteOldAuditLogConnectionEvents(ctx, arg any)
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteOldAuditLogConnectionEvents", reflect.TypeOf((*MockStore)(nil).DeleteOldAuditLogConnectionEvents), ctx, arg)
}
// DeleteOldAuditLogs mocks base method.
func (m *MockStore) DeleteOldAuditLogs(ctx context.Context, arg database.DeleteOldAuditLogsParams) (int64, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "DeleteOldAuditLogs", ctx, arg)
ret0, _ := ret[0].(int64)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// DeleteOldAuditLogs indicates an expected call of DeleteOldAuditLogs.
func (mr *MockStoreMockRecorder) DeleteOldAuditLogs(ctx, arg any) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteOldAuditLogs", reflect.TypeOf((*MockStore)(nil).DeleteOldAuditLogs), ctx, arg)
}
// DeleteOldConnectionLogs mocks base method.
func (m *MockStore) DeleteOldConnectionLogs(ctx context.Context, arg database.DeleteOldConnectionLogsParams) (int64, error) {
m.ctrl.T.Helper()
+16
View File
@@ -27,6 +27,8 @@ const (
auditLogConnectionEventBatchSize = 1000
// Batch size for connection log deletion.
connectionLogsBatchSize = 10000
// Batch size for audit log deletion.
auditLogsBatchSize = 10000
// Telemetry heartbeats are used to deduplicate events across replicas. We
// don't need to persist heartbeat rows for longer than 24 hours, as they
// are only used for deduplication across replicas. The time needs to be
@@ -126,10 +128,24 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, vals *coder
}
}
var purgedAuditLogs int64
auditLogsRetention := vals.Retention.AuditLogs.Value()
if auditLogsRetention > 0 {
deleteAuditLogsBefore := start.Add(-auditLogsRetention)
purgedAuditLogs, err = tx.DeleteOldAuditLogs(ctx, database.DeleteOldAuditLogsParams{
BeforeTime: deleteAuditLogsBefore,
LimitCount: auditLogsBatchSize,
})
if err != nil {
return xerrors.Errorf("failed to delete old audit logs: %w", err)
}
}
logger.Debug(ctx, "purged old database entries",
slog.F("expired_api_keys", expiredAPIKeys),
slog.F("aibridge_records", purgedAIBridgeRecords),
slog.F("connection_logs", purgedConnectionLogs),
slog.F("audit_logs", purgedAuditLogs),
slog.F("duration", clk.Since(start)),
)
+195
View File
@@ -1050,3 +1050,198 @@ func TestDeleteOldAIBridgeRecords(t *testing.T) {
require.NoError(t, err)
require.Len(t, newToolUsages, 1, "near threshold tool usages should not be deleted")
}
func TestDeleteOldAuditLogs(t *testing.T) {
t.Parallel()
now := time.Date(2025, 1, 15, 7, 30, 0, 0, time.UTC)
retentionPeriod := 30 * 24 * time.Hour
afterThreshold := now.Add(-retentionPeriod).Add(-24 * time.Hour) // 31 days ago (older than threshold)
beforeThreshold := now.Add(-15 * 24 * time.Hour) // 15 days ago (newer than threshold)
testCases := []struct {
name string
retentionConfig codersdk.RetentionConfig
oldLogTime time.Time
recentLogTime *time.Time // nil means no recent log created
expectOldDeleted bool
expectedLogsRemaining int
}{
{
name: "RetentionEnabled",
retentionConfig: codersdk.RetentionConfig{
AuditLogs: serpent.Duration(retentionPeriod),
},
oldLogTime: afterThreshold,
recentLogTime: &beforeThreshold,
expectOldDeleted: true,
expectedLogsRemaining: 1, // only recent log remains
},
{
name: "RetentionDisabled",
retentionConfig: codersdk.RetentionConfig{
AuditLogs: serpent.Duration(0),
},
oldLogTime: now.Add(-365 * 24 * time.Hour), // 1 year ago
recentLogTime: nil,
expectOldDeleted: false,
expectedLogsRemaining: 1, // old log is kept
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
ctx := testutil.Context(t, testutil.WaitShort)
clk := quartz.NewMock(t)
clk.Set(now).MustWait(ctx)
db, _ := dbtestutil.NewDB(t, dbtestutil.WithDumpOnFailure())
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
// Setup test fixtures.
user := dbgen.User(t, db, database.User{})
org := dbgen.Organization(t, db, database.Organization{})
// Create old audit log.
oldLog := dbgen.AuditLog(t, db, database.AuditLog{
UserID: user.ID,
OrganizationID: org.ID,
Time: tc.oldLogTime,
Action: database.AuditActionCreate,
ResourceType: database.ResourceTypeWorkspace,
})
// Create recent audit log if specified.
var recentLog database.AuditLog
if tc.recentLogTime != nil {
recentLog = dbgen.AuditLog(t, db, database.AuditLog{
UserID: user.ID,
OrganizationID: org.ID,
Time: *tc.recentLogTime,
Action: database.AuditActionCreate,
ResourceType: database.ResourceTypeWorkspace,
})
}
// Run the purge.
done := awaitDoTick(ctx, t, clk)
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{
Retention: tc.retentionConfig,
}, clk)
defer closer.Close()
testutil.TryReceive(ctx, t, done)
// Verify results.
logs, err := db.GetAuditLogsOffset(ctx, database.GetAuditLogsOffsetParams{
LimitOpt: 100,
})
require.NoError(t, err)
require.Len(t, logs, tc.expectedLogsRemaining, "unexpected number of logs remaining")
logIDs := make([]uuid.UUID, len(logs))
for i, log := range logs {
logIDs[i] = log.AuditLog.ID
}
if tc.expectOldDeleted {
require.NotContains(t, logIDs, oldLog.ID, "old audit log should be deleted")
} else {
require.Contains(t, logIDs, oldLog.ID, "old audit log should NOT be deleted")
}
if tc.recentLogTime != nil {
require.Contains(t, logIDs, recentLog.ID, "recent audit log should be kept")
}
})
}
// ConnectionEventsNotDeleted is a special case that tests multiple audit
// action types, so it's kept as a separate subtest.
t.Run("ConnectionEventsNotDeleted", func(t *testing.T) {
t.Parallel()
ctx := testutil.Context(t, testutil.WaitShort)
clk := quartz.NewMock(t)
clk.Set(now).MustWait(ctx)
db, _ := dbtestutil.NewDB(t, dbtestutil.WithDumpOnFailure())
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
user := dbgen.User(t, db, database.User{})
org := dbgen.Organization(t, db, database.Organization{})
// Create old connection events (should NOT be deleted by audit logs retention).
oldConnectLog := dbgen.AuditLog(t, db, database.AuditLog{
UserID: user.ID,
OrganizationID: org.ID,
Time: afterThreshold,
Action: database.AuditActionConnect,
ResourceType: database.ResourceTypeWorkspace,
})
oldDisconnectLog := dbgen.AuditLog(t, db, database.AuditLog{
UserID: user.ID,
OrganizationID: org.ID,
Time: afterThreshold,
Action: database.AuditActionDisconnect,
ResourceType: database.ResourceTypeWorkspace,
})
oldOpenLog := dbgen.AuditLog(t, db, database.AuditLog{
UserID: user.ID,
OrganizationID: org.ID,
Time: afterThreshold,
Action: database.AuditActionOpen,
ResourceType: database.ResourceTypeWorkspace,
})
oldCloseLog := dbgen.AuditLog(t, db, database.AuditLog{
UserID: user.ID,
OrganizationID: org.ID,
Time: afterThreshold,
Action: database.AuditActionClose,
ResourceType: database.ResourceTypeWorkspace,
})
// Create old non-connection audit log (should be deleted).
oldCreateLog := dbgen.AuditLog(t, db, database.AuditLog{
UserID: user.ID,
OrganizationID: org.ID,
Time: afterThreshold,
Action: database.AuditActionCreate,
ResourceType: database.ResourceTypeWorkspace,
})
// Run the purge with audit logs retention enabled.
done := awaitDoTick(ctx, t, clk)
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{
Retention: codersdk.RetentionConfig{
AuditLogs: serpent.Duration(retentionPeriod),
},
}, clk)
defer closer.Close()
testutil.TryReceive(ctx, t, done)
// Verify results.
logs, err := db.GetAuditLogsOffset(ctx, database.GetAuditLogsOffsetParams{
LimitOpt: 100,
})
require.NoError(t, err)
require.Len(t, logs, 4, "should have 4 connection event logs remaining")
logIDs := make([]uuid.UUID, len(logs))
for i, log := range logs {
logIDs[i] = log.AuditLog.ID
}
// Connection events should NOT be deleted by audit logs retention.
require.Contains(t, logIDs, oldConnectLog.ID, "old connect log should NOT be deleted by audit logs retention")
require.Contains(t, logIDs, oldDisconnectLog.ID, "old disconnect log should NOT be deleted by audit logs retention")
require.Contains(t, logIDs, oldOpenLog.ID, "old open log should NOT be deleted by audit logs retention")
require.Contains(t, logIDs, oldCloseLog.ID, "old close log should NOT be deleted by audit logs retention")
// Non-connection event should be deleted.
require.NotContains(t, logIDs, oldCreateLog.ID, "old create log should be deleted by audit logs retention")
})
}
+4
View File
@@ -106,6 +106,10 @@ type sqlcQuerier interface {
// Cumulative count.
DeleteOldAIBridgeRecords(ctx context.Context, beforeTime time.Time) (int32, error)
DeleteOldAuditLogConnectionEvents(ctx context.Context, arg DeleteOldAuditLogConnectionEventsParams) error
// Deletes old audit logs based on retention policy, excluding deprecated
// connection events (connect, disconnect, open, close) which are handled
// separately by DeleteOldAuditLogConnectionEvents.
DeleteOldAuditLogs(ctx context.Context, arg DeleteOldAuditLogsParams) (int64, error)
DeleteOldConnectionLogs(ctx context.Context, arg DeleteOldConnectionLogsParams) (int64, error)
// Delete all notification messages which have not been updated for over a week.
DeleteOldNotificationMessages(ctx context.Context) error
+31
View File
@@ -1637,6 +1637,37 @@ func (q *sqlQuerier) DeleteOldAuditLogConnectionEvents(ctx context.Context, arg
return err
}
const deleteOldAuditLogs = `-- name: DeleteOldAuditLogs :execrows
WITH old_logs AS (
SELECT id
FROM audit_logs
WHERE
"time" < $1::timestamp with time zone
AND action NOT IN ('connect', 'disconnect', 'open', 'close')
ORDER BY "time" ASC
LIMIT $2
)
DELETE FROM audit_logs
USING old_logs
WHERE audit_logs.id = old_logs.id
`
type DeleteOldAuditLogsParams struct {
BeforeTime time.Time `db:"before_time" json:"before_time"`
LimitCount int32 `db:"limit_count" json:"limit_count"`
}
// Deletes old audit logs based on retention policy, excluding deprecated
// connection events (connect, disconnect, open, close) which are handled
// separately by DeleteOldAuditLogConnectionEvents.
func (q *sqlQuerier) DeleteOldAuditLogs(ctx context.Context, arg DeleteOldAuditLogsParams) (int64, error) {
result, err := q.db.ExecContext(ctx, deleteOldAuditLogs, arg.BeforeTime, arg.LimitCount)
if err != nil {
return 0, err
}
return result.RowsAffected()
}
const getAuditLogsOffset = `-- name: GetAuditLogsOffset :many
SELECT audit_logs.id, audit_logs.time, audit_logs.user_id, audit_logs.organization_id, audit_logs.ip, audit_logs.user_agent, audit_logs.resource_type, audit_logs.resource_id, audit_logs.resource_target, audit_logs.action, audit_logs.diff, audit_logs.status_code, audit_logs.additional_fields, audit_logs.request_id, audit_logs.resource_icon,
-- sqlc.embed(users) would be nice but it does not seem to play well with
+17
View File
@@ -253,3 +253,20 @@ WHERE id IN (
ORDER BY "time" ASC
LIMIT @limit_count
);
-- name: DeleteOldAuditLogs :execrows
-- Deletes old audit logs based on retention policy, excluding deprecated
-- connection events (connect, disconnect, open, close) which are handled
-- separately by DeleteOldAuditLogConnectionEvents.
WITH old_logs AS (
SELECT id
FROM audit_logs
WHERE
"time" < @before_time::timestamp with time zone
AND action NOT IN ('connect', 'disconnect', 'open', 'close')
ORDER BY "time" ASC
LIMIT @limit_count
)
DELETE FROM audit_logs
USING old_logs
WHERE audit_logs.id = old_logs.id;