fix(coderd): allow agent auth during workspace shutdown (#21538)

Agents were losing authentication during workspace shutdown, causing
shutdown scripts to fail. The auth query required agents to belong to
the latest build, but during shutdown a `stop` build becomes latest while
the `start` build's agents are still running.

Modified the auth query to allow `start` build agents to authenticate
temporarily during `stop` execution. The query allows auth when:

- Agent's `start` build job succeeded
- Latest build is `stop` with `pending`/`running` job status
- Builds are adjacent (`stop` is `build_number + 1`)
- Template versions match

Auth closes once `stop` completes.

Renamed `GetWorkspaceAgentAndLatestBuildByAuthToken` to
`GetAuthenticatedWorkspaceAgentAndBuildByAuthToken` since it returns the
agent's build (not always latest) during shutdown.

Closes coder/internal#1249
Fixes #19467
This commit is contained in:
Mathias Fredriksson
2026-01-21 15:18:43 +02:00
committed by GitHub
parent a14a22eb54
commit 97e8a5b093
15 changed files with 726 additions and 72 deletions
+44 -16
View File
@@ -18024,7 +18024,7 @@ func (q *sqlQuerier) DeleteWorkspaceSubAgentByID(ctx context.Context, id uuid.UU
return err
}
const getWorkspaceAgentAndLatestBuildByAuthToken = `-- name: GetWorkspaceAgentAndLatestBuildByAuthToken :one
const getAuthenticatedWorkspaceAgentAndBuildByAuthToken = `-- name: GetAuthenticatedWorkspaceAgentAndBuildByAuthToken :one
SELECT
workspaces.id, workspaces.created_at, workspaces.updated_at, workspaces.owner_id, workspaces.organization_id, workspaces.template_id, workspaces.deleted, workspaces.name, workspaces.autostart_schedule, workspaces.ttl, workspaces.last_used_at, workspaces.dormant_at, workspaces.deleting_at, workspaces.automatic_updates, workspaces.favorite, workspaces.next_start_at, workspaces.group_acl, workspaces.user_acl,
workspace_agents.id, workspace_agents.created_at, workspace_agents.updated_at, workspace_agents.name, workspace_agents.first_connected_at, workspace_agents.last_connected_at, workspace_agents.disconnected_at, workspace_agents.resource_id, workspace_agents.auth_token, workspace_agents.auth_instance_id, workspace_agents.architecture, workspace_agents.environment_variables, workspace_agents.operating_system, workspace_agents.instance_metadata, workspace_agents.resource_metadata, workspace_agents.directory, workspace_agents.version, workspace_agents.last_connected_replica_id, workspace_agents.connection_timeout_seconds, workspace_agents.troubleshooting_url, workspace_agents.motd_file, workspace_agents.lifecycle_state, workspace_agents.expanded_directory, workspace_agents.logs_length, workspace_agents.logs_overflowed, workspace_agents.started_at, workspace_agents.ready_at, workspace_agents.subsystems, workspace_agents.display_apps, workspace_agents.api_version, workspace_agents.display_order, workspace_agents.parent_id, workspace_agents.api_key_scope, workspace_agents.deleted,
@@ -18054,29 +18054,57 @@ WHERE
AND workspaces.deleted = FALSE
-- Filter out deleted sub agents.
AND workspace_agents.deleted = FALSE
-- Filter out builds that are not the latest.
AND workspace_build_with_user.build_number = (
-- Select from workspace_builds as it's one less join compared
-- to workspace_build_with_user.
SELECT
MAX(build_number)
FROM
workspace_builds
WHERE
workspace_id = workspace_build_with_user.workspace_id
)
-- Filter out builds that are not the latest, with exception for shutdown case.
-- Use CASE for short-circuiting: check normal case first (most common), then shutdown case.
AND CASE
-- Normal case: Agent's build is the latest build.
WHEN workspace_build_with_user.build_number = (
SELECT
MAX(build_number)
FROM
workspace_builds
WHERE
workspace_id = workspace_build_with_user.workspace_id
) THEN TRUE
-- Shutdown case: Agent from previous START build during STOP build execution.
WHEN workspace_build_with_user.transition = 'start'
-- Agent's START build job succeeded.
AND (SELECT job_status FROM provisioner_jobs WHERE id = workspace_build_with_user.job_id) = 'succeeded'
-- Latest build is a STOP build whose job is still active,
-- and agent's build is immediately previous.
AND EXISTS (
SELECT 1
FROM workspace_builds latest
JOIN provisioner_jobs pj ON pj.id = latest.job_id
WHERE latest.workspace_id = workspace_build_with_user.workspace_id
AND latest.build_number = workspace_build_with_user.build_number + 1
AND latest.build_number = (
SELECT MAX(build_number)
FROM workspace_builds l2
WHERE l2.workspace_id = latest.workspace_id
)
AND latest.transition = 'stop'
AND pj.job_status IN ('pending', 'running')
) THEN TRUE
ELSE FALSE
END
`
type GetWorkspaceAgentAndLatestBuildByAuthTokenRow struct {
type GetAuthenticatedWorkspaceAgentAndBuildByAuthTokenRow struct {
WorkspaceTable WorkspaceTable `db:"workspace_table" json:"workspace_table"`
WorkspaceAgent WorkspaceAgent `db:"workspace_agent" json:"workspace_agent"`
WorkspaceBuild WorkspaceBuild `db:"workspace_build" json:"workspace_build"`
TaskID uuid.NullUUID `db:"task_id" json:"task_id"`
}
func (q *sqlQuerier) GetWorkspaceAgentAndLatestBuildByAuthToken(ctx context.Context, authToken uuid.UUID) (GetWorkspaceAgentAndLatestBuildByAuthTokenRow, error) {
row := q.db.QueryRowContext(ctx, getWorkspaceAgentAndLatestBuildByAuthToken, authToken)
var i GetWorkspaceAgentAndLatestBuildByAuthTokenRow
// GetAuthenticatedWorkspaceAgentAndBuildByAuthToken returns an authenticated
// workspace agent and its associated build. During normal operation, this is
// the latest build. During shutdown, this may be the previous START build while
// the STOP build is executing, allowing shutdown scripts to authenticate (see
// issue #19467).
func (q *sqlQuerier) GetAuthenticatedWorkspaceAgentAndBuildByAuthToken(ctx context.Context, authToken uuid.UUID) (GetAuthenticatedWorkspaceAgentAndBuildByAuthTokenRow, error) {
row := q.db.QueryRowContext(ctx, getAuthenticatedWorkspaceAgentAndBuildByAuthToken, authToken)
var i GetAuthenticatedWorkspaceAgentAndBuildByAuthTokenRow
err := row.Scan(
&i.WorkspaceTable.ID,
&i.WorkspaceTable.CreatedAt,