mirror of
https://github.com/coder/coder.git
synced 2026-06-04 05:28:20 +00:00
Backport of #24973 to `release/2.33`. ## Summary Restores `v2.33.0-rc.2`-equivalent query cost for agent instance-identity auth, which currently saturates the pgx pool when multiple agents share an instance ID. Customer report against rc.3 traced 233x `Internal error fetching provisioner job resource` 500s during a 50-minute incident window to this path. ## Changes 1. **System fast-path on `authorizeProvisionerJob`** (`coderd/database/dbauthz/dbauthz.go`): Short-circuits the per-job RBAC fan-out through `GetWorkspaceBuildByJobID` -> `GetWorkspaceByID` for `AsSystemRestricted` callers. 2. **Drop survivor re-fetch in `handleAuthInstanceID`** (`coderd/workspaceresourceauth.go`): Captures the provisioner job alongside each candidate during the filter loop so the post-selection code reads it directly instead of re-querying. ## Conflict resolution One conflict in `coderd/database/dbauthz/dbauthz_test.go`: the `TestAsAutostart` test function (from an unrelated commit on `main`) was brought in as surrounding context during the cherry-pick. It was removed since it tests functionality (`ResourceUserSecret.Read` for the Autostart role) not present on the release branch. ## Tests - `TestAuthorizeProvisionerJob_SystemFastPath` (3 sub-tests): all pass - `TestPostWorkspaceAuthAWSInstanceIdentity/Ambiguous/*` (7 sub-tests): all pass > Generated by Coder Agents Co-authored-by: Dean Sheather <dean@deansheather.com>
This commit is contained in:
@@ -1502,6 +1502,28 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole,
|
||||
}
|
||||
|
||||
func (q *querier) authorizeProvisionerJob(ctx context.Context, job database.ProvisionerJob) error {
|
||||
// System-restricted callers (e.g. instance-identity agent auth via
|
||||
// AsSystemRestricted) have already passed an outer authz check before
|
||||
// reaching the provisioner job. Skip the per-job RBAC fan-out through
|
||||
// GetWorkspaceBuildByJobID -> GetWorkspaceByID, which serializes 2
|
||||
// extra DB queries + 1 RBAC eval per call. Under saturated pgx pools
|
||||
// this cascade can block agent auth past the HTTP write timeout (see
|
||||
// incident report against v2.33.0-rc.3 with multi-agent
|
||||
// instance-identity templates).
|
||||
//
|
||||
// We check the subject type directly rather than calling
|
||||
// authorizeContext(ResourceSystem) so we do not record a site-scoped
|
||||
// authz call on every provisioner-job lookup; tests like
|
||||
// TestCreateUserWorkspace/AuthzStory assert that workspace creation
|
||||
// only emits org-scoped authz calls. The same actor.Type check is
|
||||
// already used elsewhere in this file (see GetChatDiffStatusesByChatIDs).
|
||||
//
|
||||
// If a future system actor needs the same fast-path, add its
|
||||
// SubjectType here explicitly rather than broadening to a permission
|
||||
// check.
|
||||
if actor, ok := ActorFromContext(ctx); ok && actor.Type == rbac.SubjectTypeSystemRestricted {
|
||||
return nil
|
||||
}
|
||||
switch job.Type {
|
||||
case database.ProvisionerJobTypeWorkspaceBuild:
|
||||
// Authorized call to get workspace build. If we can read the build, we can
|
||||
|
||||
@@ -6198,6 +6198,114 @@ func TestGetWorkspaceAgentByID_FastPath(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestAuthorizeProvisionerJob_SystemFastPath verifies that
|
||||
// authorizeProvisionerJob short-circuits for system-restricted callers
|
||||
// instead of fanning out into GetWorkspaceBuildByJobID -> GetWorkspaceByID.
|
||||
// That cascade adds 2 SQL queries + 1 RBAC eval per provisioner-job lookup
|
||||
// and saturates the pgx pool when called repeatedly from agent
|
||||
// instance-identity auth (see incident report against v2.33.0-rc.3).
|
||||
func TestAuthorizeProvisionerJob_SystemFastPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
jobID := uuid.New()
|
||||
job := database.ProvisionerJob{
|
||||
ID: jobID,
|
||||
Type: database.ProvisionerJobTypeWorkspaceBuild,
|
||||
}
|
||||
|
||||
authorizer := rbac.NewAuthorizer(prometheus.NewRegistry())
|
||||
|
||||
t.Run("AsSystemRestricted/SkipsCascade", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
mockDB := dbmock.NewMockStore(ctrl)
|
||||
|
||||
mockDB.EXPECT().Wrappers().Return([]string{})
|
||||
// The fast-path must short-circuit before GetWorkspaceBuildByJobID
|
||||
// or GetWorkspaceByID can be called. The strict mock will fail
|
||||
// the test if either is invoked.
|
||||
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil)
|
||||
|
||||
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
|
||||
ctx := dbauthz.AsSystemRestricted(context.Background())
|
||||
|
||||
got, err := q.GetProvisionerJobByID(ctx, jobID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, job, got)
|
||||
})
|
||||
|
||||
t.Run("AsSystemRestricted/TemplateVersion/SkipsCascade", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// The fast-path is type-agnostic: it must short-circuit the
|
||||
// template-version cascade as well, so neither
|
||||
// GetTemplateVersionByJobID nor GetTemplateByID is invoked.
|
||||
tvJobID := uuid.New()
|
||||
tvJob := database.ProvisionerJob{
|
||||
ID: tvJobID,
|
||||
Type: database.ProvisionerJobTypeTemplateVersionImport,
|
||||
}
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
mockDB := dbmock.NewMockStore(ctrl)
|
||||
|
||||
mockDB.EXPECT().Wrappers().Return([]string{})
|
||||
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), tvJobID).Return(tvJob, nil)
|
||||
|
||||
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
|
||||
ctx := dbauthz.AsSystemRestricted(context.Background())
|
||||
|
||||
got, err := q.GetProvisionerJobByID(ctx, tvJobID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, tvJob, got)
|
||||
})
|
||||
|
||||
t.Run("NonSystemActor/StillCascades", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// An auditor has no ResourceSystem permission, so the fast-path
|
||||
// must fall through to the workspace-build cascade. That cascade
|
||||
// then fails authz on the workspace because auditors cannot read
|
||||
// arbitrary workspaces. The error type is what we assert: it
|
||||
// proves the cascade ran rather than the fast-path short-circuiting.
|
||||
orgID := uuid.New()
|
||||
wsID := uuid.New()
|
||||
workspace := database.Workspace{
|
||||
ID: wsID,
|
||||
OwnerID: uuid.New(),
|
||||
OrganizationID: orgID,
|
||||
}
|
||||
build := database.WorkspaceBuild{
|
||||
ID: uuid.New(),
|
||||
WorkspaceID: wsID,
|
||||
JobID: jobID,
|
||||
}
|
||||
auditor := rbac.Subject{
|
||||
ID: uuid.NewString(),
|
||||
Roles: rbac.RoleIdentifiers{rbac.RoleAuditor()},
|
||||
Groups: []string{orgID.String()},
|
||||
Scope: rbac.ScopeAll,
|
||||
}
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
mockDB := dbmock.NewMockStore(ctrl)
|
||||
|
||||
mockDB.EXPECT().Wrappers().Return([]string{})
|
||||
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil)
|
||||
mockDB.EXPECT().GetWorkspaceBuildByJobID(gomock.Any(), jobID).Return(build, nil)
|
||||
mockDB.EXPECT().GetWorkspaceByID(gomock.Any(), wsID).Return(workspace, nil)
|
||||
|
||||
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
|
||||
ctx := dbauthz.As(context.Background(), auditor)
|
||||
|
||||
_, err := q.GetProvisionerJobByID(ctx, jobID)
|
||||
require.Error(t, err)
|
||||
require.True(t, dbauthz.IsNotAuthorizedError(err),
|
||||
"cascade must run and produce a NotAuthorized error for auditor: got %v", err)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAsChatd(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user