From 4c180342604eaf2e0cef981e89fd4769b468e727 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Fri, 2 Sep 2022 13:24:47 +0300 Subject: [PATCH] fix: Prevent autobuild executor from slowing down API requests (#3726) With just a few workspaces, the autobuild executor can slow down API requests every time it runs. This is because we started a long running transaction and checked all eligible (for autostart) workspaces inside that transaction. PostgreSQL doesn't know if we're modifying rows and as such is locking the tables for read operations. This commit changes the behavior so each workspace is checked in its own transaction reducing the time the table/rows needs to stay locked. For now concurrency has been arbitrarily limited to 10 workspaces at a time, this could be made configurable or adjusted as the need arises. --- .../autobuild/executor/lifecycle_executor.go | 184 +++++++++++------- coderd/database/databasefake/databasefake.go | 17 +- coderd/database/querier.go | 1 - coderd/database/queries.sql.go | 50 ----- coderd/database/queries/workspaces.sql | 14 -- 5 files changed, 112 insertions(+), 154 deletions(-) diff --git a/coderd/autobuild/executor/lifecycle_executor.go b/coderd/autobuild/executor/lifecycle_executor.go index 97582f10de..6cd45823bb 100644 --- a/coderd/autobuild/executor/lifecycle_executor.go +++ b/coderd/autobuild/executor/lifecycle_executor.go @@ -5,14 +5,14 @@ import ( "encoding/json" "time" - "cdr.dev/slog" - - "github.com/coder/coder/coderd/autobuild/schedule" - "github.com/coder/coder/coderd/database" - "github.com/google/uuid" "github.com/moby/moby/pkg/namesgenerator" + "golang.org/x/sync/errgroup" "golang.org/x/xerrors" + + "cdr.dev/slog" + "github.com/coder/coder/coderd/autobuild/schedule" + "github.com/coder/coder/coderd/database" ) // Executor automatically starts or stops workspaces. @@ -89,80 +89,116 @@ func (e *Executor) runOnce(t time.Time) Stats { stats.Error = err }() currentTick := t.Truncate(time.Minute) - err = e.db.InTx(func(db database.Store) error { - // TTL is set at the workspace level, and deadline at the workspace build level. - // When a workspace build is created, its deadline initially starts at zero. - // When provisionerd successfully completes a provision job, the deadline is - // set to now + TTL if the associated workspace has a TTL set. This deadline - // is what we compare against when performing autostop operations, rounded down - // to the minute. - // - // NOTE: If a workspace build is created with a given TTL and then the user either - // changes or unsets the TTL, the deadline for the workspace build will not - // have changed. This behavior is as expected per #2229. - eligibleWorkspaces, err := db.GetWorkspacesAutostart(e.ctx) - if err != nil { - return xerrors.Errorf("get eligible workspaces for autostart or autostop: %w", err) - } - for _, ws := range eligibleWorkspaces { - // Determine the workspace state based on its latest build. - priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID) - if err != nil { - e.log.Warn(e.ctx, "get latest workspace build", - slog.F("workspace_id", ws.ID), - slog.Error(err), - ) - continue - } - - priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID) - if err != nil { - e.log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", - slog.F("workspace_id", ws.ID), - slog.Error(err), - ) - continue - } - - validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob) - if err != nil { - e.log.Debug(e.ctx, "skipping workspace", - slog.Error(err), - slog.F("workspace_id", ws.ID), - ) - continue - } - - if currentTick.Before(nextTransition) { - e.log.Debug(e.ctx, "skipping workspace: too early", - slog.F("workspace_id", ws.ID), - slog.F("next_transition_at", nextTransition), - slog.F("transition", validTransition), - slog.F("current_tick", currentTick), - ) - continue - } - - e.log.Info(e.ctx, "scheduling workspace transition", - slog.F("workspace_id", ws.ID), - slog.F("transition", validTransition), - ) - - stats.Transitions[ws.ID] = validTransition - if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil { - e.log.Error(e.ctx, "unable to transition workspace", - slog.F("workspace_id", ws.ID), - slog.F("transition", validTransition), - slog.Error(err), - ) - } - } - return nil + // TTL is set at the workspace level, and deadline at the workspace build level. + // When a workspace build is created, its deadline initially starts at zero. + // When provisionerd successfully completes a provision job, the deadline is + // set to now + TTL if the associated workspace has a TTL set. This deadline + // is what we compare against when performing autostop operations, rounded down + // to the minute. + // + // NOTE: If a workspace build is created with a given TTL and then the user either + // changes or unsets the TTL, the deadline for the workspace build will not + // have changed. This behavior is as expected per #2229. + workspaces, err := e.db.GetWorkspaces(e.ctx, database.GetWorkspacesParams{ + Deleted: false, }) + if err != nil { + e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err)) + return stats + } + + var eligibleWorkspaceIDs []uuid.UUID + for _, ws := range workspaces { + if isEligibleForAutoStartStop(ws) { + eligibleWorkspaceIDs = append(eligibleWorkspaceIDs, ws.ID) + } + } + + // We only use errgroup here for convenience of API, not for early + // cancellation. This means we only return nil errors in th eg.Go. + eg := errgroup.Group{} + // Limit the concurrency to avoid overloading the database. + eg.SetLimit(10) + + for _, wsID := range eligibleWorkspaceIDs { + wsID := wsID + log := e.log.With(slog.F("workspace_id", wsID)) + + eg.Go(func() error { + err := e.db.InTx(func(db database.Store) error { + // Re-check eligibility since the first check was outside the + // transaction and the workspace settings may have changed. + ws, err := db.GetWorkspaceByID(e.ctx, wsID) + if err != nil { + log.Error(e.ctx, "get workspace autostart failed", slog.Error(err)) + return nil + } + if !isEligibleForAutoStartStop(ws) { + return nil + } + + // Determine the workspace state based on its latest build. + priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID) + if err != nil { + log.Warn(e.ctx, "get latest workspace build", slog.Error(err)) + return nil + } + + priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID) + if err != nil { + log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err)) + return nil + } + + validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob) + if err != nil { + log.Debug(e.ctx, "skipping workspace", slog.Error(err)) + return nil + } + + if currentTick.Before(nextTransition) { + log.Debug(e.ctx, "skipping workspace: too early", + slog.F("next_transition_at", nextTransition), + slog.F("transition", validTransition), + slog.F("current_tick", currentTick), + ) + return nil + } + + log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition)) + + stats.Transitions[ws.ID] = validTransition + if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil { + log.Error(e.ctx, "unable to transition workspace", + slog.F("transition", validTransition), + slog.Error(err), + ) + return nil + } + + return nil + }) + if err != nil { + log.Error(e.ctx, "workspace scheduling failed", slog.Error(err)) + } + return nil + }) + } + + // This should not happen since we don't want early cancellation. + err = eg.Wait() + if err != nil { + e.log.Error(e.ctx, "workspace scheduling errgroup failed", slog.Error(err)) + } + return stats } +func isEligibleForAutoStartStop(ws database.Workspace) bool { + return !ws.Deleted && (ws.AutostartSchedule.String != "" || ws.Ttl.Int64 > 0) +} + func getNextTransition( ws database.Workspace, priorHistory database.WorkspaceBuild, diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index 0958d2e7ff..a8b0e56429 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -588,20 +588,6 @@ func (q *fakeQuerier) GetWorkspaceAppsByAgentIDs(_ context.Context, ids []uuid.U return apps, nil } -func (q *fakeQuerier) GetWorkspacesAutostart(_ context.Context) ([]database.Workspace, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - workspaces := make([]database.Workspace, 0) - for _, ws := range q.workspaces { - if ws.AutostartSchedule.String != "" { - workspaces = append(workspaces, ws) - } else if ws.Ttl.Valid { - workspaces = append(workspaces, ws) - } - } - return workspaces, nil -} - func (q *fakeQuerier) GetWorkspaceOwnerCountsByTemplateIDs(_ context.Context, templateIDs []uuid.UUID) ([]database.GetWorkspaceOwnerCountsByTemplateIDsRow, error) { q.mutex.RLock() defer q.mutex.RUnlock() @@ -2383,7 +2369,8 @@ func (q *fakeQuerier) GetDeploymentID(_ context.Context) (string, error) { } func (q *fakeQuerier) InsertLicense( - _ context.Context, arg database.InsertLicenseParams) (database.License, error) { + _ context.Context, arg database.InsertLicenseParams, +) (database.License, error) { q.mutex.Lock() defer q.mutex.Unlock() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 12985422f5..23cd258f49 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -99,7 +99,6 @@ type querier interface { GetWorkspaceResourcesByJobID(ctx context.Context, jobID uuid.UUID) ([]WorkspaceResource, error) GetWorkspaceResourcesCreatedAfter(ctx context.Context, createdAt time.Time) ([]WorkspaceResource, error) GetWorkspaces(ctx context.Context, arg GetWorkspacesParams) ([]Workspace, error) - GetWorkspacesAutostart(ctx context.Context) ([]Workspace, error) InsertAPIKey(ctx context.Context, arg InsertAPIKeyParams) (APIKey, error) InsertAgentStat(ctx context.Context, arg InsertAgentStatParams) (AgentStat, error) InsertAuditLog(ctx context.Context, arg InsertAuditLogParams) (AuditLog, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 9d63a7e494..213f5ecf0b 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -4762,56 +4762,6 @@ func (q *sqlQuerier) GetWorkspaces(ctx context.Context, arg GetWorkspacesParams) return items, nil } -const getWorkspacesAutostart = `-- name: GetWorkspacesAutostart :many -SELECT - id, created_at, updated_at, owner_id, organization_id, template_id, deleted, name, autostart_schedule, ttl, last_used_at -FROM - workspaces -WHERE - deleted = false -AND -( - (autostart_schedule IS NOT NULL AND autostart_schedule <> '') - OR - (ttl IS NOT NULL AND ttl > 0) -) -` - -func (q *sqlQuerier) GetWorkspacesAutostart(ctx context.Context) ([]Workspace, error) { - rows, err := q.db.QueryContext(ctx, getWorkspacesAutostart) - if err != nil { - return nil, err - } - defer rows.Close() - var items []Workspace - for rows.Next() { - var i Workspace - if err := rows.Scan( - &i.ID, - &i.CreatedAt, - &i.UpdatedAt, - &i.OwnerID, - &i.OrganizationID, - &i.TemplateID, - &i.Deleted, - &i.Name, - &i.AutostartSchedule, - &i.Ttl, - &i.LastUsedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - const insertWorkspace = `-- name: InsertWorkspace :one INSERT INTO workspaces ( diff --git a/coderd/database/queries/workspaces.sql b/coderd/database/queries/workspaces.sql index 3af1ca4965..f2c1723f84 100644 --- a/coderd/database/queries/workspaces.sql +++ b/coderd/database/queries/workspaces.sql @@ -50,20 +50,6 @@ WHERE END ; --- name: GetWorkspacesAutostart :many -SELECT - * -FROM - workspaces -WHERE - deleted = false -AND -( - (autostart_schedule IS NOT NULL AND autostart_schedule <> '') - OR - (ttl IS NOT NULL AND ttl > 0) -); - -- name: GetWorkspaceByOwnerIDAndName :one SELECT *