mirror of
https://github.com/coder/coder.git
synced 2026-06-03 04:58:23 +00:00
Backport of #24973 to `release/2.33`. ## Summary Restores `v2.33.0-rc.2`-equivalent query cost for agent instance-identity auth, which currently saturates the pgx pool when multiple agents share an instance ID. Customer report against rc.3 traced 233x `Internal error fetching provisioner job resource` 500s during a 50-minute incident window to this path. ## Changes 1. **System fast-path on `authorizeProvisionerJob`** (`coderd/database/dbauthz/dbauthz.go`): Short-circuits the per-job RBAC fan-out through `GetWorkspaceBuildByJobID` -> `GetWorkspaceByID` for `AsSystemRestricted` callers. 2. **Drop survivor re-fetch in `handleAuthInstanceID`** (`coderd/workspaceresourceauth.go`): Captures the provisioner job alongside each candidate during the filter loop so the post-selection code reads it directly instead of re-querying. ## Conflict resolution One conflict in `coderd/database/dbauthz/dbauthz_test.go`: the `TestAsAutostart` test function (from an unrelated commit on `main`) was brought in as surrounding context during the cherry-pick. It was removed since it tests functionality (`ResourceUserSecret.Read` for the Autostart role) not present on the release branch. ## Tests - `TestAuthorizeProvisionerJob_SystemFastPath` (3 sub-tests): all pass - `TestPostWorkspaceAuthAWSInstanceIdentity/Ambiguous/*` (7 sub-tests): all pass > Generated by Coder Agents Co-authored-by: Dean Sheather <dean@deansheather.com>
This commit is contained in:
@@ -1502,6 +1502,28 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole,
|
||||
}
|
||||
|
||||
func (q *querier) authorizeProvisionerJob(ctx context.Context, job database.ProvisionerJob) error {
|
||||
// System-restricted callers (e.g. instance-identity agent auth via
|
||||
// AsSystemRestricted) have already passed an outer authz check before
|
||||
// reaching the provisioner job. Skip the per-job RBAC fan-out through
|
||||
// GetWorkspaceBuildByJobID -> GetWorkspaceByID, which serializes 2
|
||||
// extra DB queries + 1 RBAC eval per call. Under saturated pgx pools
|
||||
// this cascade can block agent auth past the HTTP write timeout (see
|
||||
// incident report against v2.33.0-rc.3 with multi-agent
|
||||
// instance-identity templates).
|
||||
//
|
||||
// We check the subject type directly rather than calling
|
||||
// authorizeContext(ResourceSystem) so we do not record a site-scoped
|
||||
// authz call on every provisioner-job lookup; tests like
|
||||
// TestCreateUserWorkspace/AuthzStory assert that workspace creation
|
||||
// only emits org-scoped authz calls. The same actor.Type check is
|
||||
// already used elsewhere in this file (see GetChatDiffStatusesByChatIDs).
|
||||
//
|
||||
// If a future system actor needs the same fast-path, add its
|
||||
// SubjectType here explicitly rather than broadening to a permission
|
||||
// check.
|
||||
if actor, ok := ActorFromContext(ctx); ok && actor.Type == rbac.SubjectTypeSystemRestricted {
|
||||
return nil
|
||||
}
|
||||
switch job.Type {
|
||||
case database.ProvisionerJobTypeWorkspaceBuild:
|
||||
// Authorized call to get workspace build. If we can read the build, we can
|
||||
|
||||
@@ -6198,6 +6198,114 @@ func TestGetWorkspaceAgentByID_FastPath(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestAuthorizeProvisionerJob_SystemFastPath verifies that
|
||||
// authorizeProvisionerJob short-circuits for system-restricted callers
|
||||
// instead of fanning out into GetWorkspaceBuildByJobID -> GetWorkspaceByID.
|
||||
// That cascade adds 2 SQL queries + 1 RBAC eval per provisioner-job lookup
|
||||
// and saturates the pgx pool when called repeatedly from agent
|
||||
// instance-identity auth (see incident report against v2.33.0-rc.3).
|
||||
func TestAuthorizeProvisionerJob_SystemFastPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
jobID := uuid.New()
|
||||
job := database.ProvisionerJob{
|
||||
ID: jobID,
|
||||
Type: database.ProvisionerJobTypeWorkspaceBuild,
|
||||
}
|
||||
|
||||
authorizer := rbac.NewAuthorizer(prometheus.NewRegistry())
|
||||
|
||||
t.Run("AsSystemRestricted/SkipsCascade", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
mockDB := dbmock.NewMockStore(ctrl)
|
||||
|
||||
mockDB.EXPECT().Wrappers().Return([]string{})
|
||||
// The fast-path must short-circuit before GetWorkspaceBuildByJobID
|
||||
// or GetWorkspaceByID can be called. The strict mock will fail
|
||||
// the test if either is invoked.
|
||||
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil)
|
||||
|
||||
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
|
||||
ctx := dbauthz.AsSystemRestricted(context.Background())
|
||||
|
||||
got, err := q.GetProvisionerJobByID(ctx, jobID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, job, got)
|
||||
})
|
||||
|
||||
t.Run("AsSystemRestricted/TemplateVersion/SkipsCascade", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// The fast-path is type-agnostic: it must short-circuit the
|
||||
// template-version cascade as well, so neither
|
||||
// GetTemplateVersionByJobID nor GetTemplateByID is invoked.
|
||||
tvJobID := uuid.New()
|
||||
tvJob := database.ProvisionerJob{
|
||||
ID: tvJobID,
|
||||
Type: database.ProvisionerJobTypeTemplateVersionImport,
|
||||
}
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
mockDB := dbmock.NewMockStore(ctrl)
|
||||
|
||||
mockDB.EXPECT().Wrappers().Return([]string{})
|
||||
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), tvJobID).Return(tvJob, nil)
|
||||
|
||||
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
|
||||
ctx := dbauthz.AsSystemRestricted(context.Background())
|
||||
|
||||
got, err := q.GetProvisionerJobByID(ctx, tvJobID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, tvJob, got)
|
||||
})
|
||||
|
||||
t.Run("NonSystemActor/StillCascades", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// An auditor has no ResourceSystem permission, so the fast-path
|
||||
// must fall through to the workspace-build cascade. That cascade
|
||||
// then fails authz on the workspace because auditors cannot read
|
||||
// arbitrary workspaces. The error type is what we assert: it
|
||||
// proves the cascade ran rather than the fast-path short-circuiting.
|
||||
orgID := uuid.New()
|
||||
wsID := uuid.New()
|
||||
workspace := database.Workspace{
|
||||
ID: wsID,
|
||||
OwnerID: uuid.New(),
|
||||
OrganizationID: orgID,
|
||||
}
|
||||
build := database.WorkspaceBuild{
|
||||
ID: uuid.New(),
|
||||
WorkspaceID: wsID,
|
||||
JobID: jobID,
|
||||
}
|
||||
auditor := rbac.Subject{
|
||||
ID: uuid.NewString(),
|
||||
Roles: rbac.RoleIdentifiers{rbac.RoleAuditor()},
|
||||
Groups: []string{orgID.String()},
|
||||
Scope: rbac.ScopeAll,
|
||||
}
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
mockDB := dbmock.NewMockStore(ctrl)
|
||||
|
||||
mockDB.EXPECT().Wrappers().Return([]string{})
|
||||
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil)
|
||||
mockDB.EXPECT().GetWorkspaceBuildByJobID(gomock.Any(), jobID).Return(build, nil)
|
||||
mockDB.EXPECT().GetWorkspaceByID(gomock.Any(), wsID).Return(workspace, nil)
|
||||
|
||||
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
|
||||
ctx := dbauthz.As(context.Background(), auditor)
|
||||
|
||||
_, err := q.GetProvisionerJobByID(ctx, jobID)
|
||||
require.Error(t, err)
|
||||
require.True(t, dbauthz.IsNotAuthorizedError(err),
|
||||
"cascade must run and produce a NotAuthorized error for auditor: got %v", err)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAsChatd(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -148,7 +148,18 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
|
||||
// Template version agents can share an instance ID with workspace build
|
||||
// agents. Keep only workspace build agents before resolving ambiguity so
|
||||
// template version agents do not force CODER_AGENT_NAME.
|
||||
buildAgents := agents[:0]
|
||||
//
|
||||
// We attach the provisioner job to each candidate during the filter
|
||||
// loop so the post-selection code below can read it directly from the
|
||||
// chosen candidate instead of re-querying. The previous code re-fetched
|
||||
// the resource and job for the surviving agent, firing the
|
||||
// resource->job->build->workspace dbauthz cascade twice and saturating
|
||||
// the pgx pool under load.
|
||||
type instanceCandidate struct {
|
||||
agent database.WorkspaceAgent
|
||||
job database.ProvisionerJob
|
||||
}
|
||||
buildCandidates := make([]instanceCandidate, 0, len(agents))
|
||||
for _, candidate := range agents {
|
||||
resource, err := api.Database.GetWorkspaceResourceByID(systemCtx, candidate.ResourceID)
|
||||
if err != nil {
|
||||
@@ -167,40 +178,42 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
|
||||
return
|
||||
}
|
||||
if job.Type == database.ProvisionerJobTypeWorkspaceBuild {
|
||||
buildAgents = append(buildAgents, candidate)
|
||||
buildCandidates = append(buildCandidates, instanceCandidate{
|
||||
agent: candidate,
|
||||
job: job,
|
||||
})
|
||||
}
|
||||
}
|
||||
agents = buildAgents
|
||||
if len(agents) == 0 {
|
||||
if len(buildCandidates) == 0 {
|
||||
httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{
|
||||
Message: fmt.Sprintf("Instance with id %q not found.", instanceID),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
var agent database.WorkspaceAgent
|
||||
var selected instanceCandidate
|
||||
if agentName != "" {
|
||||
for _, candidate := range agents {
|
||||
if candidate.Name == agentName {
|
||||
agent = candidate
|
||||
for _, candidate := range buildCandidates {
|
||||
if candidate.agent.Name == agentName {
|
||||
selected = candidate
|
||||
break
|
||||
}
|
||||
}
|
||||
if agent.ID == uuid.Nil {
|
||||
if selected.agent.ID == uuid.Nil {
|
||||
httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{
|
||||
Message: fmt.Sprintf("No agent found with instance ID %q and name %q.", instanceID, agentName),
|
||||
})
|
||||
return
|
||||
}
|
||||
} else {
|
||||
if len(agents) != 1 {
|
||||
if len(buildCandidates) != 1 {
|
||||
// Include agent names in the error message to help operators
|
||||
// configure CODER_AGENT_NAME. The caller has already proven
|
||||
// cloud instance identity, so agent names are not sensitive
|
||||
// here.
|
||||
names := make([]string, len(agents))
|
||||
for i, candidate := range agents {
|
||||
names[i] = candidate.Name
|
||||
names := make([]string, len(buildCandidates))
|
||||
for i, candidate := range buildCandidates {
|
||||
names[i] = candidate.agent.Name
|
||||
}
|
||||
sort.Strings(names)
|
||||
httpapi.Write(ctx, rw, http.StatusConflict, codersdk.Response{
|
||||
@@ -212,30 +225,10 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
|
||||
})
|
||||
return
|
||||
}
|
||||
agent = agents[0]
|
||||
}
|
||||
resource, err := api.Database.GetWorkspaceResourceByID(systemCtx, agent.ResourceID)
|
||||
if err != nil {
|
||||
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
|
||||
Message: "Internal error fetching provisioner job resource.",
|
||||
Detail: err.Error(),
|
||||
})
|
||||
return
|
||||
}
|
||||
job, err := api.Database.GetProvisionerJobByID(systemCtx, resource.JobID)
|
||||
if err != nil {
|
||||
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
|
||||
Message: "Internal error fetching provisioner job.",
|
||||
Detail: err.Error(),
|
||||
})
|
||||
return
|
||||
}
|
||||
if job.Type != database.ProvisionerJobTypeWorkspaceBuild {
|
||||
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
|
||||
Message: fmt.Sprintf("%q jobs cannot be authenticated.", job.Type),
|
||||
})
|
||||
return
|
||||
selected = buildCandidates[0]
|
||||
}
|
||||
agent := selected.agent
|
||||
job := selected.job
|
||||
var jobData provisionerdserver.WorkspaceProvisionJob
|
||||
err = json.Unmarshal(job.Input, &jobData)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user