From f009c17217e6bad9a61ba511d23735bc1ce94da0 Mon Sep 17 00:00:00 2001 From: Garrett Delfosse Date: Tue, 5 May 2026 15:54:04 -0400 Subject: [PATCH] fix(coderd): cut DB fan-out on agent instance-identity auth (backport #24973) (#24982) Backport of #24973 to `release/2.33`. ## Summary Restores `v2.33.0-rc.2`-equivalent query cost for agent instance-identity auth, which currently saturates the pgx pool when multiple agents share an instance ID. Customer report against rc.3 traced 233x `Internal error fetching provisioner job resource` 500s during a 50-minute incident window to this path. ## Changes 1. **System fast-path on `authorizeProvisionerJob`** (`coderd/database/dbauthz/dbauthz.go`): Short-circuits the per-job RBAC fan-out through `GetWorkspaceBuildByJobID` -> `GetWorkspaceByID` for `AsSystemRestricted` callers. 2. **Drop survivor re-fetch in `handleAuthInstanceID`** (`coderd/workspaceresourceauth.go`): Captures the provisioner job alongside each candidate during the filter loop so the post-selection code reads it directly instead of re-querying. ## Conflict resolution One conflict in `coderd/database/dbauthz/dbauthz_test.go`: the `TestAsAutostart` test function (from an unrelated commit on `main`) was brought in as surrounding context during the cherry-pick. It was removed since it tests functionality (`ResourceUserSecret.Read` for the Autostart role) not present on the release branch. ## Tests - `TestAuthorizeProvisionerJob_SystemFastPath` (3 sub-tests): all pass - `TestPostWorkspaceAuthAWSInstanceIdentity/Ambiguous/*` (7 sub-tests): all pass > Generated by Coder Agents Co-authored-by: Dean Sheather --- coderd/database/dbauthz/dbauthz.go | 22 +++++ coderd/database/dbauthz/dbauthz_test.go | 108 ++++++++++++++++++++++++ coderd/workspaceresourceauth.go | 65 +++++++------- 3 files changed, 159 insertions(+), 36 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index cd60befe57..8b20ba7f46 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1502,6 +1502,28 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole, } func (q *querier) authorizeProvisionerJob(ctx context.Context, job database.ProvisionerJob) error { + // System-restricted callers (e.g. instance-identity agent auth via + // AsSystemRestricted) have already passed an outer authz check before + // reaching the provisioner job. Skip the per-job RBAC fan-out through + // GetWorkspaceBuildByJobID -> GetWorkspaceByID, which serializes 2 + // extra DB queries + 1 RBAC eval per call. Under saturated pgx pools + // this cascade can block agent auth past the HTTP write timeout (see + // incident report against v2.33.0-rc.3 with multi-agent + // instance-identity templates). + // + // We check the subject type directly rather than calling + // authorizeContext(ResourceSystem) so we do not record a site-scoped + // authz call on every provisioner-job lookup; tests like + // TestCreateUserWorkspace/AuthzStory assert that workspace creation + // only emits org-scoped authz calls. The same actor.Type check is + // already used elsewhere in this file (see GetChatDiffStatusesByChatIDs). + // + // If a future system actor needs the same fast-path, add its + // SubjectType here explicitly rather than broadening to a permission + // check. + if actor, ok := ActorFromContext(ctx); ok && actor.Type == rbac.SubjectTypeSystemRestricted { + return nil + } switch job.Type { case database.ProvisionerJobTypeWorkspaceBuild: // Authorized call to get workspace build. If we can read the build, we can diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 469d563650..706ed9354f 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -6198,6 +6198,114 @@ func TestGetWorkspaceAgentByID_FastPath(t *testing.T) { }) } +// TestAuthorizeProvisionerJob_SystemFastPath verifies that +// authorizeProvisionerJob short-circuits for system-restricted callers +// instead of fanning out into GetWorkspaceBuildByJobID -> GetWorkspaceByID. +// That cascade adds 2 SQL queries + 1 RBAC eval per provisioner-job lookup +// and saturates the pgx pool when called repeatedly from agent +// instance-identity auth (see incident report against v2.33.0-rc.3). +func TestAuthorizeProvisionerJob_SystemFastPath(t *testing.T) { + t.Parallel() + + jobID := uuid.New() + job := database.ProvisionerJob{ + ID: jobID, + Type: database.ProvisionerJobTypeWorkspaceBuild, + } + + authorizer := rbac.NewAuthorizer(prometheus.NewRegistry()) + + t.Run("AsSystemRestricted/SkipsCascade", func(t *testing.T) { + t.Parallel() + + ctrl := gomock.NewController(t) + mockDB := dbmock.NewMockStore(ctrl) + + mockDB.EXPECT().Wrappers().Return([]string{}) + // The fast-path must short-circuit before GetWorkspaceBuildByJobID + // or GetWorkspaceByID can be called. The strict mock will fail + // the test if either is invoked. + mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil) + + q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer()) + ctx := dbauthz.AsSystemRestricted(context.Background()) + + got, err := q.GetProvisionerJobByID(ctx, jobID) + require.NoError(t, err) + require.Equal(t, job, got) + }) + + t.Run("AsSystemRestricted/TemplateVersion/SkipsCascade", func(t *testing.T) { + t.Parallel() + + // The fast-path is type-agnostic: it must short-circuit the + // template-version cascade as well, so neither + // GetTemplateVersionByJobID nor GetTemplateByID is invoked. + tvJobID := uuid.New() + tvJob := database.ProvisionerJob{ + ID: tvJobID, + Type: database.ProvisionerJobTypeTemplateVersionImport, + } + + ctrl := gomock.NewController(t) + mockDB := dbmock.NewMockStore(ctrl) + + mockDB.EXPECT().Wrappers().Return([]string{}) + mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), tvJobID).Return(tvJob, nil) + + q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer()) + ctx := dbauthz.AsSystemRestricted(context.Background()) + + got, err := q.GetProvisionerJobByID(ctx, tvJobID) + require.NoError(t, err) + require.Equal(t, tvJob, got) + }) + + t.Run("NonSystemActor/StillCascades", func(t *testing.T) { + t.Parallel() + + // An auditor has no ResourceSystem permission, so the fast-path + // must fall through to the workspace-build cascade. That cascade + // then fails authz on the workspace because auditors cannot read + // arbitrary workspaces. The error type is what we assert: it + // proves the cascade ran rather than the fast-path short-circuiting. + orgID := uuid.New() + wsID := uuid.New() + workspace := database.Workspace{ + ID: wsID, + OwnerID: uuid.New(), + OrganizationID: orgID, + } + build := database.WorkspaceBuild{ + ID: uuid.New(), + WorkspaceID: wsID, + JobID: jobID, + } + auditor := rbac.Subject{ + ID: uuid.NewString(), + Roles: rbac.RoleIdentifiers{rbac.RoleAuditor()}, + Groups: []string{orgID.String()}, + Scope: rbac.ScopeAll, + } + + ctrl := gomock.NewController(t) + mockDB := dbmock.NewMockStore(ctrl) + + mockDB.EXPECT().Wrappers().Return([]string{}) + mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil) + mockDB.EXPECT().GetWorkspaceBuildByJobID(gomock.Any(), jobID).Return(build, nil) + mockDB.EXPECT().GetWorkspaceByID(gomock.Any(), wsID).Return(workspace, nil) + + q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer()) + ctx := dbauthz.As(context.Background(), auditor) + + _, err := q.GetProvisionerJobByID(ctx, jobID) + require.Error(t, err) + require.True(t, dbauthz.IsNotAuthorizedError(err), + "cascade must run and produce a NotAuthorized error for auditor: got %v", err) + }) +} + func TestAsChatd(t *testing.T) { t.Parallel() diff --git a/coderd/workspaceresourceauth.go b/coderd/workspaceresourceauth.go index f414c3a828..290b98d07b 100644 --- a/coderd/workspaceresourceauth.go +++ b/coderd/workspaceresourceauth.go @@ -148,7 +148,18 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in // Template version agents can share an instance ID with workspace build // agents. Keep only workspace build agents before resolving ambiguity so // template version agents do not force CODER_AGENT_NAME. - buildAgents := agents[:0] + // + // We attach the provisioner job to each candidate during the filter + // loop so the post-selection code below can read it directly from the + // chosen candidate instead of re-querying. The previous code re-fetched + // the resource and job for the surviving agent, firing the + // resource->job->build->workspace dbauthz cascade twice and saturating + // the pgx pool under load. + type instanceCandidate struct { + agent database.WorkspaceAgent + job database.ProvisionerJob + } + buildCandidates := make([]instanceCandidate, 0, len(agents)) for _, candidate := range agents { resource, err := api.Database.GetWorkspaceResourceByID(systemCtx, candidate.ResourceID) if err != nil { @@ -167,40 +178,42 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in return } if job.Type == database.ProvisionerJobTypeWorkspaceBuild { - buildAgents = append(buildAgents, candidate) + buildCandidates = append(buildCandidates, instanceCandidate{ + agent: candidate, + job: job, + }) } } - agents = buildAgents - if len(agents) == 0 { + if len(buildCandidates) == 0 { httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{ Message: fmt.Sprintf("Instance with id %q not found.", instanceID), }) return } - var agent database.WorkspaceAgent + var selected instanceCandidate if agentName != "" { - for _, candidate := range agents { - if candidate.Name == agentName { - agent = candidate + for _, candidate := range buildCandidates { + if candidate.agent.Name == agentName { + selected = candidate break } } - if agent.ID == uuid.Nil { + if selected.agent.ID == uuid.Nil { httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{ Message: fmt.Sprintf("No agent found with instance ID %q and name %q.", instanceID, agentName), }) return } } else { - if len(agents) != 1 { + if len(buildCandidates) != 1 { // Include agent names in the error message to help operators // configure CODER_AGENT_NAME. The caller has already proven // cloud instance identity, so agent names are not sensitive // here. - names := make([]string, len(agents)) - for i, candidate := range agents { - names[i] = candidate.Name + names := make([]string, len(buildCandidates)) + for i, candidate := range buildCandidates { + names[i] = candidate.agent.Name } sort.Strings(names) httpapi.Write(ctx, rw, http.StatusConflict, codersdk.Response{ @@ -212,30 +225,10 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in }) return } - agent = agents[0] - } - resource, err := api.Database.GetWorkspaceResourceByID(systemCtx, agent.ResourceID) - if err != nil { - httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ - Message: "Internal error fetching provisioner job resource.", - Detail: err.Error(), - }) - return - } - job, err := api.Database.GetProvisionerJobByID(systemCtx, resource.JobID) - if err != nil { - httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ - Message: "Internal error fetching provisioner job.", - Detail: err.Error(), - }) - return - } - if job.Type != database.ProvisionerJobTypeWorkspaceBuild { - httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ - Message: fmt.Sprintf("%q jobs cannot be authenticated.", job.Type), - }) - return + selected = buildCandidates[0] } + agent := selected.agent + job := selected.job var jobData provisionerdserver.WorkspaceProvisionJob err = json.Unmarshal(job.Input, &jobData) if err != nil {