fix(coderd): cut DB fan-out on agent instance-identity auth (backport #24973) (#24982)

Backport of #24973 to `release/2.33`.

## Summary

Restores `v2.33.0-rc.2`-equivalent query cost for agent
instance-identity auth, which currently saturates the pgx pool when
multiple agents share an instance ID. Customer report against rc.3
traced 233x `Internal error fetching provisioner job resource` 500s
during a 50-minute incident window to this path.

## Changes

1. **System fast-path on `authorizeProvisionerJob`**
(`coderd/database/dbauthz/dbauthz.go`): Short-circuits the per-job RBAC
fan-out through `GetWorkspaceBuildByJobID` -> `GetWorkspaceByID` for
`AsSystemRestricted` callers.
2. **Drop survivor re-fetch in `handleAuthInstanceID`**
(`coderd/workspaceresourceauth.go`): Captures the provisioner job
alongside each candidate during the filter loop so the post-selection
code reads it directly instead of re-querying.

## Conflict resolution

One conflict in `coderd/database/dbauthz/dbauthz_test.go`: the
`TestAsAutostart` test function (from an unrelated commit on `main`) was
brought in as surrounding context during the cherry-pick. It was removed
since it tests functionality (`ResourceUserSecret.Read` for the
Autostart role) not present on the release branch.

## Tests

- `TestAuthorizeProvisionerJob_SystemFastPath` (3 sub-tests): all pass
- `TestPostWorkspaceAuthAWSInstanceIdentity/Ambiguous/*` (7 sub-tests):
all pass

> Generated by Coder Agents

Co-authored-by: Dean Sheather <dean@deansheather.com>
This commit is contained in:
Garrett Delfosse
2026-05-05 15:54:04 -04:00
committed by GitHub
parent 17635dde5c
commit f009c17217
3 changed files with 159 additions and 36 deletions
+22
View File
@@ -1502,6 +1502,28 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole,
}
func (q *querier) authorizeProvisionerJob(ctx context.Context, job database.ProvisionerJob) error {
// System-restricted callers (e.g. instance-identity agent auth via
// AsSystemRestricted) have already passed an outer authz check before
// reaching the provisioner job. Skip the per-job RBAC fan-out through
// GetWorkspaceBuildByJobID -> GetWorkspaceByID, which serializes 2
// extra DB queries + 1 RBAC eval per call. Under saturated pgx pools
// this cascade can block agent auth past the HTTP write timeout (see
// incident report against v2.33.0-rc.3 with multi-agent
// instance-identity templates).
//
// We check the subject type directly rather than calling
// authorizeContext(ResourceSystem) so we do not record a site-scoped
// authz call on every provisioner-job lookup; tests like
// TestCreateUserWorkspace/AuthzStory assert that workspace creation
// only emits org-scoped authz calls. The same actor.Type check is
// already used elsewhere in this file (see GetChatDiffStatusesByChatIDs).
//
// If a future system actor needs the same fast-path, add its
// SubjectType here explicitly rather than broadening to a permission
// check.
if actor, ok := ActorFromContext(ctx); ok && actor.Type == rbac.SubjectTypeSystemRestricted {
return nil
}
switch job.Type {
case database.ProvisionerJobTypeWorkspaceBuild:
// Authorized call to get workspace build. If we can read the build, we can
+108
View File
@@ -6198,6 +6198,114 @@ func TestGetWorkspaceAgentByID_FastPath(t *testing.T) {
})
}
// TestAuthorizeProvisionerJob_SystemFastPath verifies that
// authorizeProvisionerJob short-circuits for system-restricted callers
// instead of fanning out into GetWorkspaceBuildByJobID -> GetWorkspaceByID.
// That cascade adds 2 SQL queries + 1 RBAC eval per provisioner-job lookup
// and saturates the pgx pool when called repeatedly from agent
// instance-identity auth (see incident report against v2.33.0-rc.3).
func TestAuthorizeProvisionerJob_SystemFastPath(t *testing.T) {
t.Parallel()
jobID := uuid.New()
job := database.ProvisionerJob{
ID: jobID,
Type: database.ProvisionerJobTypeWorkspaceBuild,
}
authorizer := rbac.NewAuthorizer(prometheus.NewRegistry())
t.Run("AsSystemRestricted/SkipsCascade", func(t *testing.T) {
t.Parallel()
ctrl := gomock.NewController(t)
mockDB := dbmock.NewMockStore(ctrl)
mockDB.EXPECT().Wrappers().Return([]string{})
// The fast-path must short-circuit before GetWorkspaceBuildByJobID
// or GetWorkspaceByID can be called. The strict mock will fail
// the test if either is invoked.
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil)
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
ctx := dbauthz.AsSystemRestricted(context.Background())
got, err := q.GetProvisionerJobByID(ctx, jobID)
require.NoError(t, err)
require.Equal(t, job, got)
})
t.Run("AsSystemRestricted/TemplateVersion/SkipsCascade", func(t *testing.T) {
t.Parallel()
// The fast-path is type-agnostic: it must short-circuit the
// template-version cascade as well, so neither
// GetTemplateVersionByJobID nor GetTemplateByID is invoked.
tvJobID := uuid.New()
tvJob := database.ProvisionerJob{
ID: tvJobID,
Type: database.ProvisionerJobTypeTemplateVersionImport,
}
ctrl := gomock.NewController(t)
mockDB := dbmock.NewMockStore(ctrl)
mockDB.EXPECT().Wrappers().Return([]string{})
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), tvJobID).Return(tvJob, nil)
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
ctx := dbauthz.AsSystemRestricted(context.Background())
got, err := q.GetProvisionerJobByID(ctx, tvJobID)
require.NoError(t, err)
require.Equal(t, tvJob, got)
})
t.Run("NonSystemActor/StillCascades", func(t *testing.T) {
t.Parallel()
// An auditor has no ResourceSystem permission, so the fast-path
// must fall through to the workspace-build cascade. That cascade
// then fails authz on the workspace because auditors cannot read
// arbitrary workspaces. The error type is what we assert: it
// proves the cascade ran rather than the fast-path short-circuiting.
orgID := uuid.New()
wsID := uuid.New()
workspace := database.Workspace{
ID: wsID,
OwnerID: uuid.New(),
OrganizationID: orgID,
}
build := database.WorkspaceBuild{
ID: uuid.New(),
WorkspaceID: wsID,
JobID: jobID,
}
auditor := rbac.Subject{
ID: uuid.NewString(),
Roles: rbac.RoleIdentifiers{rbac.RoleAuditor()},
Groups: []string{orgID.String()},
Scope: rbac.ScopeAll,
}
ctrl := gomock.NewController(t)
mockDB := dbmock.NewMockStore(ctrl)
mockDB.EXPECT().Wrappers().Return([]string{})
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil)
mockDB.EXPECT().GetWorkspaceBuildByJobID(gomock.Any(), jobID).Return(build, nil)
mockDB.EXPECT().GetWorkspaceByID(gomock.Any(), wsID).Return(workspace, nil)
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
ctx := dbauthz.As(context.Background(), auditor)
_, err := q.GetProvisionerJobByID(ctx, jobID)
require.Error(t, err)
require.True(t, dbauthz.IsNotAuthorizedError(err),
"cascade must run and produce a NotAuthorized error for auditor: got %v", err)
})
}
func TestAsChatd(t *testing.T) {
t.Parallel()
+29 -36
View File
@@ -148,7 +148,18 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
// Template version agents can share an instance ID with workspace build
// agents. Keep only workspace build agents before resolving ambiguity so
// template version agents do not force CODER_AGENT_NAME.
buildAgents := agents[:0]
//
// We attach the provisioner job to each candidate during the filter
// loop so the post-selection code below can read it directly from the
// chosen candidate instead of re-querying. The previous code re-fetched
// the resource and job for the surviving agent, firing the
// resource->job->build->workspace dbauthz cascade twice and saturating
// the pgx pool under load.
type instanceCandidate struct {
agent database.WorkspaceAgent
job database.ProvisionerJob
}
buildCandidates := make([]instanceCandidate, 0, len(agents))
for _, candidate := range agents {
resource, err := api.Database.GetWorkspaceResourceByID(systemCtx, candidate.ResourceID)
if err != nil {
@@ -167,40 +178,42 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
return
}
if job.Type == database.ProvisionerJobTypeWorkspaceBuild {
buildAgents = append(buildAgents, candidate)
buildCandidates = append(buildCandidates, instanceCandidate{
agent: candidate,
job: job,
})
}
}
agents = buildAgents
if len(agents) == 0 {
if len(buildCandidates) == 0 {
httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{
Message: fmt.Sprintf("Instance with id %q not found.", instanceID),
})
return
}
var agent database.WorkspaceAgent
var selected instanceCandidate
if agentName != "" {
for _, candidate := range agents {
if candidate.Name == agentName {
agent = candidate
for _, candidate := range buildCandidates {
if candidate.agent.Name == agentName {
selected = candidate
break
}
}
if agent.ID == uuid.Nil {
if selected.agent.ID == uuid.Nil {
httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{
Message: fmt.Sprintf("No agent found with instance ID %q and name %q.", instanceID, agentName),
})
return
}
} else {
if len(agents) != 1 {
if len(buildCandidates) != 1 {
// Include agent names in the error message to help operators
// configure CODER_AGENT_NAME. The caller has already proven
// cloud instance identity, so agent names are not sensitive
// here.
names := make([]string, len(agents))
for i, candidate := range agents {
names[i] = candidate.Name
names := make([]string, len(buildCandidates))
for i, candidate := range buildCandidates {
names[i] = candidate.agent.Name
}
sort.Strings(names)
httpapi.Write(ctx, rw, http.StatusConflict, codersdk.Response{
@@ -212,30 +225,10 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
})
return
}
agent = agents[0]
}
resource, err := api.Database.GetWorkspaceResourceByID(systemCtx, agent.ResourceID)
if err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Internal error fetching provisioner job resource.",
Detail: err.Error(),
})
return
}
job, err := api.Database.GetProvisionerJobByID(systemCtx, resource.JobID)
if err != nil {
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Internal error fetching provisioner job.",
Detail: err.Error(),
})
return
}
if job.Type != database.ProvisionerJobTypeWorkspaceBuild {
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
Message: fmt.Sprintf("%q jobs cannot be authenticated.", job.Type),
})
return
selected = buildCandidates[0]
}
agent := selected.agent
job := selected.job
var jobData provisionerdserver.WorkspaceProvisionJob
err = json.Unmarshal(job.Input, &jobData)
if err != nil {