mirror of
https://github.com/coder/coder.git
synced 2026-06-03 13:08:25 +00:00
0a7a3da178
The provisioner state for a workspace build was being loaded for every long-lived agent RPC connection. Since this state can be anywhere from kilobytes to megabytes, this can cause the `coderd` memory footprint to grow gradually over time. It also causes a lot of unnecessary allocations for every query that fetches a workspace build, since only a few callers ever actually reference the provisioner state. This PR removes it from the returned workspace build and adds a query to fetch the provisioner state explicitly.
947 lines
31 KiB
Go
947 lines
31 KiB
Go
package jobreaper_test
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"fmt"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
"go.uber.org/goleak"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/coderd/coderdtest"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
"github.com/coder/coder/v2/coderd/database/dbfake"
|
|
"github.com/coder/coder/v2/coderd/database/dbgen"
|
|
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
|
"github.com/coder/coder/v2/coderd/jobreaper"
|
|
"github.com/coder/coder/v2/coderd/provisionerdserver"
|
|
"github.com/coder/coder/v2/coderd/rbac"
|
|
"github.com/coder/coder/v2/provisionersdk"
|
|
"github.com/coder/coder/v2/testutil"
|
|
)
|
|
|
|
func TestMain(m *testing.M) {
|
|
goleak.VerifyTestMain(m, testutil.GoleakOptions...)
|
|
}
|
|
|
|
func TestDetectorNoJobs(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- time.Now()
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Empty(t, stats.TerminatedJobIDs)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorNoHungJobs(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
// Insert some jobs that are running and haven't been updated in a while,
|
|
// but not enough to be considered hung.
|
|
now := time.Now()
|
|
org := dbgen.Organization(t, db, database.Organization{})
|
|
user := dbgen.User(t, db, database.User{})
|
|
file := dbgen.File(t, db, database.File{})
|
|
for i := 0; i < 5; i++ {
|
|
dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{
|
|
CreatedAt: now.Add(-time.Minute * 5),
|
|
UpdatedAt: now.Add(-time.Minute * time.Duration(i)),
|
|
StartedAt: sql.NullTime{
|
|
Time: now.Add(-time.Minute * 5),
|
|
Valid: true,
|
|
},
|
|
OrganizationID: org.ID,
|
|
InitiatorID: user.ID,
|
|
Provisioner: database.ProvisionerTypeEcho,
|
|
StorageMethod: database.ProvisionerStorageMethodFile,
|
|
FileID: file.ID,
|
|
Type: database.ProvisionerJobTypeWorkspaceBuild,
|
|
Input: []byte("{}"),
|
|
})
|
|
}
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Empty(t, stats.TerminatedJobIDs)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorHungWorkspaceBuild(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
twentyMinAgo = now.Add(-time.Minute * 20)
|
|
tenMinAgo = now.Add(-time.Minute * 10)
|
|
sixMinAgo = now.Add(-time.Minute * 6)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
expectedWorkspaceBuildState = []byte(`{"dean":"cool","colin":"also cool"}`)
|
|
)
|
|
|
|
// Previous build (completed successfully).
|
|
previousBuild := dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{
|
|
OrganizationID: org.ID,
|
|
OwnerID: user.ID,
|
|
}).Pubsub(pubsub).Seed(database.WorkspaceBuild{}).
|
|
ProvisionerState(expectedWorkspaceBuildState).
|
|
Succeeded(dbfake.WithJobCompletedAt(twentyMinAgo)).
|
|
Do()
|
|
|
|
// Current build (hung - running job with UpdatedAt > 5 min ago).
|
|
currentBuild := dbfake.WorkspaceBuild(t, db, previousBuild.Workspace).
|
|
Pubsub(pubsub).
|
|
Seed(database.WorkspaceBuild{BuildNumber: 2}).
|
|
Starting(dbfake.WithJobStartedAt(tenMinAgo), dbfake.WithJobUpdatedAt(sixMinAgo)).
|
|
Do()
|
|
|
|
t.Log("previous job ID: ", previousBuild.Build.JobID)
|
|
t.Log("current job ID: ", currentBuild.Build.JobID)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 1)
|
|
require.Equal(t, currentBuild.Build.JobID, stats.TerminatedJobIDs[0])
|
|
|
|
// Check that the current provisioner job was updated.
|
|
job, err := db.GetProvisionerJobByID(ctx, currentBuild.Build.JobID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as hung")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
// Check that the provisioner state was copied.
|
|
build, err := db.GetWorkspaceBuildByID(ctx, currentBuild.Build.ID)
|
|
require.NoError(t, err)
|
|
provisionerStateRow, err := db.GetWorkspaceBuildProvisionerStateByID(ctx, build.ID)
|
|
require.NoError(t, err)
|
|
require.Equal(t, expectedWorkspaceBuildState, provisionerStateRow.ProvisionerState)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
twentyMinAgo = now.Add(-time.Minute * 20)
|
|
tenMinAgo = now.Add(-time.Minute * 10)
|
|
sixMinAgo = now.Add(-time.Minute * 6)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
expectedWorkspaceBuildState = []byte(`{"dean":"cool","colin":"also cool"}`)
|
|
)
|
|
|
|
// Previous build (completed successfully).
|
|
previousBuild := dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{
|
|
OrganizationID: org.ID,
|
|
OwnerID: user.ID,
|
|
}).Pubsub(pubsub).Seed(database.WorkspaceBuild{}).
|
|
ProvisionerState([]byte(`{"dean":"NOT cool","colin":"also NOT cool"}`)).
|
|
Succeeded(dbfake.WithJobCompletedAt(twentyMinAgo)).
|
|
Do()
|
|
|
|
// Current build (hung - running job with UpdatedAt > 5 min ago).
|
|
// This build already has provisioner state, which should NOT be overridden.
|
|
currentBuild := dbfake.WorkspaceBuild(t, db, previousBuild.Workspace).
|
|
Pubsub(pubsub).
|
|
Seed(database.WorkspaceBuild{
|
|
BuildNumber: 2,
|
|
}).ProvisionerState(expectedWorkspaceBuildState).
|
|
Starting(dbfake.WithJobStartedAt(tenMinAgo), dbfake.WithJobUpdatedAt(sixMinAgo)).
|
|
Do()
|
|
|
|
t.Log("previous job ID: ", previousBuild.Build.JobID)
|
|
t.Log("current job ID: ", currentBuild.Build.JobID)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 1)
|
|
require.Equal(t, currentBuild.Build.JobID, stats.TerminatedJobIDs[0])
|
|
|
|
// Check that the current provisioner job was updated.
|
|
job, err := db.GetProvisionerJobByID(ctx, currentBuild.Build.JobID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as hung")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
// Check that the provisioner state was NOT copied.
|
|
build, err := db.GetWorkspaceBuildByID(ctx, currentBuild.Build.ID)
|
|
require.NoError(t, err)
|
|
provisionerStateRow, err := db.GetWorkspaceBuildProvisionerStateByID(ctx, build.ID)
|
|
require.NoError(t, err)
|
|
require.Equal(t, expectedWorkspaceBuildState, provisionerStateRow.ProvisionerState)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
tenMinAgo = now.Add(-time.Minute * 10)
|
|
sixMinAgo = now.Add(-time.Minute * 6)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
expectedWorkspaceBuildState = []byte(`{"dean":"cool","colin":"also cool"}`)
|
|
)
|
|
|
|
// First build (hung - no previous build exists).
|
|
// This build has provisioner state, which should NOT be overridden.
|
|
currentBuild := dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{
|
|
OrganizationID: org.ID,
|
|
OwnerID: user.ID,
|
|
}).Pubsub(pubsub).Seed(database.WorkspaceBuild{}).
|
|
ProvisionerState(expectedWorkspaceBuildState).
|
|
Starting(dbfake.WithJobStartedAt(tenMinAgo), dbfake.WithJobUpdatedAt(sixMinAgo)).
|
|
Do()
|
|
|
|
t.Log("current job ID: ", currentBuild.Build.JobID)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 1)
|
|
require.Equal(t, currentBuild.Build.JobID, stats.TerminatedJobIDs[0])
|
|
|
|
// Check that the current provisioner job was updated.
|
|
job, err := db.GetProvisionerJobByID(ctx, currentBuild.Build.JobID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as hung")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
// Check that the provisioner state was NOT updated.
|
|
build, err := db.GetWorkspaceBuildByID(ctx, currentBuild.Build.ID)
|
|
require.NoError(t, err)
|
|
provisionerStateRow, err := db.GetWorkspaceBuildProvisionerStateByID(ctx, build.ID)
|
|
require.NoError(t, err)
|
|
require.Equal(t, expectedWorkspaceBuildState, provisionerStateRow.ProvisionerState)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorPendingWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
thirtyFiveMinAgo = now.Add(-time.Minute * 35)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
expectedWorkspaceBuildState = []byte(`{"dean":"cool","colin":"also cool"}`)
|
|
)
|
|
|
|
// First build (hung pending - no previous build exists).
|
|
// This build has provisioner state, which should NOT be overridden.
|
|
currentBuild := dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{
|
|
OrganizationID: org.ID,
|
|
OwnerID: user.ID,
|
|
}).Pubsub(pubsub).Seed(database.WorkspaceBuild{}).
|
|
ProvisionerState(expectedWorkspaceBuildState).
|
|
Pending(dbfake.WithJobCreatedAt(thirtyFiveMinAgo), dbfake.WithJobUpdatedAt(thirtyFiveMinAgo)).
|
|
Do()
|
|
|
|
t.Log("current job ID: ", currentBuild.Build.JobID)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 1)
|
|
require.Equal(t, currentBuild.Build.JobID, stats.TerminatedJobIDs[0])
|
|
|
|
// Check that the current provisioner job was updated.
|
|
job, err := db.GetProvisionerJobByID(ctx, currentBuild.Build.JobID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.StartedAt.Valid)
|
|
require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as pending")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
// Check that the provisioner state was NOT updated.
|
|
build, err := db.GetWorkspaceBuildByID(ctx, currentBuild.Build.ID)
|
|
require.NoError(t, err)
|
|
provisionerStateRow, err := db.GetWorkspaceBuildProvisionerStateByID(ctx, build.ID)
|
|
require.NoError(t, err)
|
|
require.Equal(t, expectedWorkspaceBuildState, provisionerStateRow.ProvisionerState)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
// TestDetectorWorkspaceBuildForDormantWorkspace ensures that the jobreaper has
|
|
// enough permissions to fix dormant workspaces.
|
|
//
|
|
// Dormant workspaces are treated as rbac.ResourceWorkspaceDormant rather than
|
|
// rbac.ResourceWorkspace, which resulted in a bug where the jobreaper would
|
|
// be able to see but not fix dormant workspaces.
|
|
func TestDetectorWorkspaceBuildForDormantWorkspace(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
tenMinAgo = now.Add(-time.Minute * 10)
|
|
sixMinAgo = now.Add(-time.Minute * 6)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
expectedWorkspaceBuildState = []byte(`{"dean":"cool","colin":"also cool"}`)
|
|
)
|
|
|
|
// First build (hung - running job with UpdatedAt > 5 min ago).
|
|
// This build has provisioner state, which should NOT be overridden.
|
|
// The workspace is dormant from the start.
|
|
currentBuild := dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{
|
|
OrganizationID: org.ID,
|
|
OwnerID: user.ID,
|
|
DormantAt: sql.NullTime{
|
|
Time: now.Add(-time.Hour),
|
|
Valid: true,
|
|
},
|
|
}).Pubsub(pubsub).Seed(database.WorkspaceBuild{}).
|
|
ProvisionerState(expectedWorkspaceBuildState).
|
|
Starting(dbfake.WithJobStartedAt(tenMinAgo), dbfake.WithJobUpdatedAt(sixMinAgo)).
|
|
Do()
|
|
|
|
t.Log("current job ID: ", currentBuild.Build.JobID)
|
|
|
|
// Ensure the RBAC is the dormant type to ensure we're testing the right
|
|
// thing.
|
|
require.Equal(t, rbac.ResourceWorkspaceDormant.Type, currentBuild.Workspace.RBACObject().Type)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 1)
|
|
require.Equal(t, currentBuild.Build.JobID, stats.TerminatedJobIDs[0])
|
|
|
|
// Check that the current provisioner job was updated.
|
|
job, err := db.GetProvisionerJobByID(ctx, currentBuild.Build.JobID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as hung")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorHungOtherJobTypes(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
tenMinAgo = now.Add(-time.Minute * 10)
|
|
sixMinAgo = now.Add(-time.Minute * 6)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
file = dbgen.File(t, db, database.File{})
|
|
|
|
// Template import job.
|
|
templateImportJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{
|
|
CreatedAt: tenMinAgo,
|
|
UpdatedAt: sixMinAgo,
|
|
StartedAt: sql.NullTime{
|
|
Time: tenMinAgo,
|
|
Valid: true,
|
|
},
|
|
OrganizationID: org.ID,
|
|
InitiatorID: user.ID,
|
|
Provisioner: database.ProvisionerTypeEcho,
|
|
StorageMethod: database.ProvisionerStorageMethodFile,
|
|
FileID: file.ID,
|
|
Type: database.ProvisionerJobTypeTemplateVersionImport,
|
|
Input: []byte("{}"),
|
|
})
|
|
_ = dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
|
OrganizationID: org.ID,
|
|
JobID: templateImportJob.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
)
|
|
|
|
// Template dry-run job.
|
|
dryRunVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
|
OrganizationID: org.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
input, err := json.Marshal(provisionerdserver.TemplateVersionDryRunJob{
|
|
TemplateVersionID: dryRunVersion.ID,
|
|
})
|
|
require.NoError(t, err)
|
|
templateDryRunJob := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{
|
|
CreatedAt: tenMinAgo,
|
|
UpdatedAt: sixMinAgo,
|
|
StartedAt: sql.NullTime{
|
|
Time: tenMinAgo,
|
|
Valid: true,
|
|
},
|
|
OrganizationID: org.ID,
|
|
InitiatorID: user.ID,
|
|
Provisioner: database.ProvisionerTypeEcho,
|
|
StorageMethod: database.ProvisionerStorageMethodFile,
|
|
FileID: file.ID,
|
|
Type: database.ProvisionerJobTypeTemplateVersionDryRun,
|
|
Input: input,
|
|
})
|
|
|
|
t.Log("template import job ID: ", templateImportJob.ID)
|
|
t.Log("template dry-run job ID: ", templateDryRunJob.ID)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 2)
|
|
require.Contains(t, stats.TerminatedJobIDs, templateImportJob.ID)
|
|
require.Contains(t, stats.TerminatedJobIDs, templateDryRunJob.ID)
|
|
|
|
// Check that the template import job was updated.
|
|
job, err := db.GetProvisionerJobByID(ctx, templateImportJob.ID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as hung")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
// Check that the template dry-run job was updated.
|
|
job, err = db.GetProvisionerJobByID(ctx, templateDryRunJob.ID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as hung")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorPendingOtherJobTypes(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
thirtyFiveMinAgo = now.Add(-time.Minute * 35)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
file = dbgen.File(t, db, database.File{})
|
|
|
|
// Template import job.
|
|
templateImportJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{
|
|
CreatedAt: thirtyFiveMinAgo,
|
|
UpdatedAt: thirtyFiveMinAgo,
|
|
StartedAt: sql.NullTime{
|
|
Time: time.Time{},
|
|
Valid: false,
|
|
},
|
|
OrganizationID: org.ID,
|
|
InitiatorID: user.ID,
|
|
Provisioner: database.ProvisionerTypeEcho,
|
|
StorageMethod: database.ProvisionerStorageMethodFile,
|
|
FileID: file.ID,
|
|
Type: database.ProvisionerJobTypeTemplateVersionImport,
|
|
Input: []byte("{}"),
|
|
})
|
|
_ = dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
|
OrganizationID: org.ID,
|
|
JobID: templateImportJob.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
)
|
|
|
|
// Template dry-run job.
|
|
dryRunVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
|
OrganizationID: org.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
input, err := json.Marshal(provisionerdserver.TemplateVersionDryRunJob{
|
|
TemplateVersionID: dryRunVersion.ID,
|
|
})
|
|
require.NoError(t, err)
|
|
templateDryRunJob := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{
|
|
CreatedAt: thirtyFiveMinAgo,
|
|
UpdatedAt: thirtyFiveMinAgo,
|
|
StartedAt: sql.NullTime{
|
|
Time: time.Time{},
|
|
Valid: false,
|
|
},
|
|
OrganizationID: org.ID,
|
|
InitiatorID: user.ID,
|
|
Provisioner: database.ProvisionerTypeEcho,
|
|
StorageMethod: database.ProvisionerStorageMethodFile,
|
|
FileID: file.ID,
|
|
Type: database.ProvisionerJobTypeTemplateVersionDryRun,
|
|
Input: input,
|
|
})
|
|
|
|
t.Log("template import job ID: ", templateImportJob.ID)
|
|
t.Log("template dry-run job ID: ", templateDryRunJob.ID)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 2)
|
|
require.Contains(t, stats.TerminatedJobIDs, templateImportJob.ID)
|
|
require.Contains(t, stats.TerminatedJobIDs, templateDryRunJob.ID)
|
|
|
|
// Check that the template import job was updated.
|
|
job, err := db.GetProvisionerJobByID(ctx, templateImportJob.ID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.StartedAt.Valid)
|
|
require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as pending")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
// Check that the template dry-run job was updated.
|
|
job, err = db.GetProvisionerJobByID(ctx, templateDryRunJob.ID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.StartedAt.Valid)
|
|
require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as pending")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorHungCanceledJob(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
tenMinAgo = now.Add(-time.Minute * 10)
|
|
sixMinAgo = now.Add(-time.Minute * 6)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
file = dbgen.File(t, db, database.File{})
|
|
|
|
// Template import job.
|
|
templateImportJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{
|
|
CreatedAt: tenMinAgo,
|
|
CanceledAt: sql.NullTime{
|
|
Time: tenMinAgo,
|
|
Valid: true,
|
|
},
|
|
UpdatedAt: sixMinAgo,
|
|
StartedAt: sql.NullTime{
|
|
Time: tenMinAgo,
|
|
Valid: true,
|
|
},
|
|
OrganizationID: org.ID,
|
|
InitiatorID: user.ID,
|
|
Provisioner: database.ProvisionerTypeEcho,
|
|
StorageMethod: database.ProvisionerStorageMethodFile,
|
|
FileID: file.ID,
|
|
Type: database.ProvisionerJobTypeTemplateVersionImport,
|
|
Input: []byte("{}"),
|
|
})
|
|
_ = dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
|
OrganizationID: org.ID,
|
|
JobID: templateImportJob.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
)
|
|
|
|
t.Log("template import job ID: ", templateImportJob.ID)
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 1)
|
|
require.Contains(t, stats.TerminatedJobIDs, templateImportJob.ID)
|
|
|
|
// Check that the job was updated.
|
|
job, err := db.GetProvisionerJobByID(ctx, templateImportJob.ID)
|
|
require.NoError(t, err)
|
|
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
|
|
require.True(t, job.CompletedAt.Valid)
|
|
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
|
|
require.True(t, job.Error.Valid)
|
|
require.Contains(t, job.Error.String, "Build has been detected as hung")
|
|
require.False(t, job.ErrorCode.Valid)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
func TestDetectorPushesLogs(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []struct {
|
|
name string
|
|
preLogCount int
|
|
preLogStage string
|
|
expectStage string
|
|
}{
|
|
{
|
|
name: "WithExistingLogs",
|
|
preLogCount: 10,
|
|
preLogStage: "Stage Name",
|
|
expectStage: "Stage Name",
|
|
},
|
|
{
|
|
name: "WithExistingLogsNoStage",
|
|
preLogCount: 10,
|
|
preLogStage: "",
|
|
expectStage: "Unknown",
|
|
},
|
|
{
|
|
name: "WithoutExistingLogs",
|
|
preLogCount: 0,
|
|
expectStage: "Unknown",
|
|
},
|
|
}
|
|
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
)
|
|
|
|
var (
|
|
now = time.Now()
|
|
tenMinAgo = now.Add(-time.Minute * 10)
|
|
sixMinAgo = now.Add(-time.Minute * 6)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
file = dbgen.File(t, db, database.File{})
|
|
|
|
// Template import job.
|
|
templateImportJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{
|
|
CreatedAt: tenMinAgo,
|
|
UpdatedAt: sixMinAgo,
|
|
StartedAt: sql.NullTime{
|
|
Time: tenMinAgo,
|
|
Valid: true,
|
|
},
|
|
OrganizationID: org.ID,
|
|
InitiatorID: user.ID,
|
|
Provisioner: database.ProvisionerTypeEcho,
|
|
StorageMethod: database.ProvisionerStorageMethodFile,
|
|
FileID: file.ID,
|
|
Type: database.ProvisionerJobTypeTemplateVersionImport,
|
|
Input: []byte("{}"),
|
|
})
|
|
_ = dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
|
OrganizationID: org.ID,
|
|
JobID: templateImportJob.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
)
|
|
|
|
t.Log("template import job ID: ", templateImportJob.ID)
|
|
|
|
// Insert some logs at the start of the job.
|
|
if c.preLogCount > 0 {
|
|
insertParams := database.InsertProvisionerJobLogsParams{
|
|
JobID: templateImportJob.ID,
|
|
}
|
|
for i := 0; i < c.preLogCount; i++ {
|
|
insertParams.CreatedAt = append(insertParams.CreatedAt, tenMinAgo.Add(time.Millisecond*time.Duration(i)))
|
|
insertParams.Level = append(insertParams.Level, database.LogLevelInfo)
|
|
insertParams.Stage = append(insertParams.Stage, c.preLogStage)
|
|
insertParams.Source = append(insertParams.Source, database.LogSourceProvisioner)
|
|
insertParams.Output = append(insertParams.Output, fmt.Sprintf("Output %d", i))
|
|
}
|
|
logs, err := db.InsertProvisionerJobLogs(ctx, insertParams)
|
|
require.NoError(t, err)
|
|
require.Len(t, logs, 10)
|
|
}
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
|
|
// Create pubsub subscription to listen for new log events.
|
|
pubsubCalled := make(chan int64, 1)
|
|
pubsubCancel, err := pubsub.Subscribe(provisionersdk.ProvisionerJobLogsNotifyChannel(templateImportJob.ID), func(ctx context.Context, message []byte) {
|
|
defer close(pubsubCalled)
|
|
var event provisionersdk.ProvisionerJobLogsNotifyMessage
|
|
err := json.Unmarshal(message, &event)
|
|
if !assert.NoError(t, err) {
|
|
return
|
|
}
|
|
|
|
assert.True(t, event.EndOfLogs)
|
|
pubsubCalled <- event.CreatedAfter
|
|
})
|
|
require.NoError(t, err)
|
|
defer pubsubCancel()
|
|
|
|
tickCh <- now
|
|
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 1)
|
|
require.Contains(t, stats.TerminatedJobIDs, templateImportJob.ID)
|
|
|
|
after := <-pubsubCalled
|
|
|
|
// Get the jobs after the given time and check that they are what we
|
|
// expect.
|
|
logs, err := db.GetProvisionerLogsAfterID(ctx, database.GetProvisionerLogsAfterIDParams{
|
|
JobID: templateImportJob.ID,
|
|
CreatedAfter: after,
|
|
})
|
|
require.NoError(t, err)
|
|
threshold := jobreaper.HungJobDuration
|
|
jobType := jobreaper.Hung
|
|
if templateImportJob.JobStatus == database.ProvisionerJobStatusPending {
|
|
threshold = jobreaper.PendingJobDuration
|
|
jobType = jobreaper.Pending
|
|
}
|
|
expectedLogs := jobreaper.JobLogMessages(jobType, threshold)
|
|
require.Len(t, logs, len(expectedLogs))
|
|
for i, log := range logs {
|
|
assert.Equal(t, database.LogLevelError, log.Level)
|
|
assert.Equal(t, c.expectStage, log.Stage)
|
|
assert.Equal(t, database.LogSourceProvisionerDaemon, log.Source)
|
|
assert.Equal(t, expectedLogs[i], log.Output)
|
|
}
|
|
|
|
// Double check the full log count.
|
|
logs, err = db.GetProvisionerLogsAfterID(ctx, database.GetProvisionerLogsAfterIDParams{
|
|
JobID: templateImportJob.ID,
|
|
CreatedAfter: 0,
|
|
})
|
|
require.NoError(t, err)
|
|
require.Len(t, logs, c.preLogCount+len(expectedLogs))
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestDetectorMaxJobsPerRun(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var (
|
|
ctx = testutil.Context(t, testutil.WaitLong)
|
|
db, pubsub = dbtestutil.NewDB(t)
|
|
log = testutil.Logger(t)
|
|
tickCh = make(chan time.Time)
|
|
statsCh = make(chan jobreaper.Stats)
|
|
org = dbgen.Organization(t, db, database.Organization{})
|
|
user = dbgen.User(t, db, database.User{})
|
|
file = dbgen.File(t, db, database.File{})
|
|
)
|
|
|
|
// Create MaxJobsPerRun + 1 hung jobs.
|
|
now := time.Now()
|
|
for i := 0; i < jobreaper.MaxJobsPerRun+1; i++ {
|
|
pj := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{
|
|
CreatedAt: now.Add(-time.Hour),
|
|
UpdatedAt: now.Add(-time.Hour),
|
|
StartedAt: sql.NullTime{
|
|
Time: now.Add(-time.Hour),
|
|
Valid: true,
|
|
},
|
|
OrganizationID: org.ID,
|
|
InitiatorID: user.ID,
|
|
Provisioner: database.ProvisionerTypeEcho,
|
|
StorageMethod: database.ProvisionerStorageMethodFile,
|
|
FileID: file.ID,
|
|
Type: database.ProvisionerJobTypeTemplateVersionImport,
|
|
Input: []byte("{}"),
|
|
})
|
|
_ = dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
|
OrganizationID: org.ID,
|
|
JobID: pj.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
}
|
|
|
|
detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh)
|
|
detector.Start()
|
|
tickCh <- now
|
|
|
|
// Make sure that only MaxJobsPerRun jobs are terminated.
|
|
stats := <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, jobreaper.MaxJobsPerRun)
|
|
|
|
// Run the detector again and make sure that only the remaining job is
|
|
// terminated.
|
|
tickCh <- now
|
|
stats = <-statsCh
|
|
require.NoError(t, stats.Error)
|
|
require.Len(t, stats.TerminatedJobIDs, 1)
|
|
|
|
detector.Close()
|
|
detector.Wait()
|
|
}
|
|
|
|
// wrapDBAuthz adds our Authorization/RBAC around the given database store, to
|
|
// ensure the reaper has the right permissions to do its work.
|
|
func wrapDBAuthz(db database.Store, logger slog.Logger) database.Store {
|
|
return dbauthz.New(
|
|
db,
|
|
rbac.NewStrictCachingAuthorizer(prometheus.NewRegistry()),
|
|
logger,
|
|
coderdtest.AccessControlStorePointer(),
|
|
)
|
|
}
|