feat: support multiple agents with shared instance-identity auth (#24325)

> This PR was authored by Mux on behalf of Mike.

## Summary

Adds support for multiple peer root workspace agents sharing the same
`auth_instance_id`, so AWS, Azure, and GCP instance-identity auth can
issue the correct session token for a selected agent instead of assuming a
single root agent per instance.

## Problem

When a Terraform template attaches two or more `coder_agent` resources
(with `auth = "aws-instance-identity"`) to a single compute instance,
every agent shares the same cloud instance ID. The existing singular
lookup picks whichever agent was created most recently, silently ignoring
the others.

## Solution

Introduce an optional pre-auth agent selector (`CODER_AGENT_NAME`) and
make the server-side lookup ambiguity-aware.

**Database layer:**
- `GetWorkspaceAgentsByInstanceID` (`:many`): returns all matching root
  agents for an instance ID.
- `GetWorkspaceAgentByInstanceIDAndName` (`:one`): returns the named root
  agent for disambiguation.

**SDK and CLI:**
- `agent_name` field added to AWS, Azure, and GCP request structs
  (`omitempty` for backward compatibility).
- `CODER_AGENT_NAME` env var and `--agent-name` flag wired into the agent
  bootstrap before instance-identity auth runs.

**Server handler (`handleAuthInstanceID`):**
- When `agent_name` is present: direct lookup by (instance ID, name).
- When absent: legacy lookup, then resource-scoped ambiguity check.
  Returns 409 with available agent names if multiple root agents match.
- Whitespace-only names are trimmed and treated as unspecified.
- Sub-agents remain excluded (`parent_id IS NULL` filter).

**Verification template:**
- `examples/templates/aws-multi-agent/` provisions one EC2 instance with
  two agents (`main` and `dev`), both using instance-identity auth with
  `CODER_AGENT_NAME` set in the cloud-init user data.

## Backward compatibility

Existing single-agent deployments work unchanged. The `agent_name` field
is optional with `omitempty`, and the unnamed path preserves today's
behavior when only one root agent matches.
This commit is contained in:
Michael Suchacz
2026-04-16 13:59:09 +02:00
committed by GitHub
parent 1cf0354f72
commit e5707a13d6
29 changed files with 1563 additions and 286 deletions
+119 -26
View File
@@ -7184,38 +7184,55 @@ func TestGetWorkspaceAgentsByParentID(t *testing.T) {
})
}
func TestGetWorkspaceAgentByInstanceID(t *testing.T) {
// setupWorkspaceAgentQueryResources seeds the fixtures shared by the
// workspace-agent query tests: one organization, one template-version-import
// provisioner job owned by that organization, and count workspace resources
// that all belong to that job. The resources are returned in creation order.
func setupWorkspaceAgentQueryResources(t *testing.T, db database.Store, count int) []database.WorkspaceResource {
	t.Helper()

	organization := dbgen.Organization(t, db, database.Organization{})
	importJob := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
		Type:           database.ProvisionerJobTypeTemplateVersionImport,
		OrganizationID: organization.ID,
	})

	// Every resource is attached to the same job, so one seed value is reused.
	seed := database.WorkspaceResource{JobID: importJob.ID}

	created := make([]database.WorkspaceResource, 0, count)
	for len(created) < count {
		created = append(created, dbgen.WorkspaceResource(t, db, seed))
	}
	return created
}
// markWorkspaceAgentDeleted sets deleted = TRUE on the workspace_agents row
// identified by agentID by running a raw UPDATE through sqlDB. The test fails
// immediately if the statement returns an error.
func markWorkspaceAgentDeleted(ctx context.Context, t *testing.T, sqlDB *sql.DB, agentID uuid.UUID) {
	t.Helper()

	const markDeleted = "UPDATE workspace_agents SET deleted = TRUE WHERE id = $1"

	_, execErr := sqlDB.ExecContext(ctx, markDeleted, agentID)
	require.NoError(t, execErr)
}
func TestGetWorkspaceAgentsByInstanceID(t *testing.T) {
t.Parallel()
// Context: https://github.com/coder/coder/pull/22196
t.Run("DoesNotReturnSubAgents", func(t *testing.T) {
t.Run("ReturnsAllMatchingRootAgents", func(t *testing.T) {
t.Parallel()
// Given: A parent workspace agent with an AuthInstanceID and a
// sub-agent that shares the same AuthInstanceID.
db, _ := dbtestutil.NewDB(t)
org := dbgen.Organization(t, db, database.Organization{})
job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
Type: database.ProvisionerJobTypeTemplateVersionImport,
OrganizationID: org.ID,
})
resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{
JobID: job.ID,
})
resources := setupWorkspaceAgentQueryResources(t, db, 2)
authInstanceID := fmt.Sprintf("instance-%s-%d", t.Name(), time.Now().UnixNano())
parentAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resource.ID,
olderCreatedAt := dbtime.Now().Add(-time.Hour)
newerCreatedAt := dbtime.Now()
olderAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resources[0].ID,
CreatedAt: olderCreatedAt,
AuthInstanceID: sql.NullString{
String: authInstanceID,
Valid: true,
},
})
// Create a sub-agent with the same AuthInstanceID (simulating
// the old behavior before the fix).
_ = dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ParentID: uuid.NullUUID{UUID: parentAgent.ID, Valid: true},
ResourceID: resource.ID,
newerAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resources[1].ID,
CreatedAt: newerCreatedAt,
AuthInstanceID: sql.NullString{
String: authInstanceID,
Valid: true,
@@ -7224,13 +7241,89 @@ func TestGetWorkspaceAgentByInstanceID(t *testing.T) {
ctx := testutil.Context(t, testutil.WaitShort)
// When: We look up the agent by instance ID.
agent, err := db.GetWorkspaceAgentByInstanceID(ctx, authInstanceID)
agents, err := db.GetWorkspaceAgentsByInstanceID(ctx, authInstanceID)
require.NoError(t, err)
require.Len(t, agents, 2)
assert.Equal(t, []uuid.UUID{newerAgent.ID, olderAgent.ID}, []uuid.UUID{agents[0].ID, agents[1].ID})
})
// Then: The result must be the parent agent, not the sub-agent.
assert.Equal(t, parentAgent.ID, agent.ID, "instance ID lookup should return the parent agent, not a sub-agent")
assert.False(t, agent.ParentID.Valid, "returned agent should not have a parent (should be the parent itself)")
// ExcludesDeletedAndSubAgents verifies that GetWorkspaceAgentsByInstanceID
// returns only the live root agent when a sub-agent and a deleted root agent
// share the same auth instance ID.
t.Run("ExcludesDeletedAndSubAgents", func(t *testing.T) {
t.Parallel()
// The raw *sql.DB handle is required by markWorkspaceAgentDeleted below.
db, _, sqlDB := dbtestutil.NewDBWithSQLDB(t)
resources := setupWorkspaceAgentQueryResources(t, db, 2)
authInstanceID := fmt.Sprintf("instance-%s-%d", t.Name(), time.Now().UnixNano())
baseCreatedAt := dbtime.Now()
// Given: A root agent that is the oldest of the three agents created here.
rootAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resources[0].ID,
CreatedAt: baseCreatedAt.Add(-time.Hour),
AuthInstanceID: sql.NullString{
String: authInstanceID,
Valid: true,
},
})
// And: A sub-agent of that root agent with the same AuthInstanceID.
_ = dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ParentID: uuid.NullUUID{UUID: rootAgent.ID, Valid: true},
ResourceID: resources[0].ID,
CreatedAt: baseCreatedAt,
AuthInstanceID: sql.NullString{
String: authInstanceID,
Valid: true,
},
})
// And: A second root agent on the other resource with the same
// AuthInstanceID, which is marked deleted before the query runs.
deletedRootAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resources[1].ID,
CreatedAt: baseCreatedAt.Add(time.Minute),
AuthInstanceID: sql.NullString{
String: authInstanceID,
Valid: true,
},
})
ctx := testutil.Context(t, testutil.WaitShort)
markWorkspaceAgentDeleted(ctx, t, sqlDB, deletedRootAgent.ID)
// When: We list the agents for the shared instance ID.
agents, err := db.GetWorkspaceAgentsByInstanceID(ctx, authInstanceID)
require.NoError(t, err)
// Then: Only the live root agent is returned, and it has no parent.
require.Len(t, agents, 1)
assert.Equal(t, rootAgent.ID, agents[0].ID)
assert.False(t, agents[0].ParentID.Valid)
})
// OrdersNewestFirst verifies that GetWorkspaceAgentsByInstanceID returns
// agents sharing an auth instance ID with the most recently created one first.
// NOTE(review): the fixture and ordering assertion here mirror the
// ReturnsAllMatchingRootAgents subtest; consider merging the two.
t.Run("OrdersNewestFirst", func(t *testing.T) {
t.Parallel()
db, _ := dbtestutil.NewDB(t)
resources := setupWorkspaceAgentQueryResources(t, db, 2)
authInstanceID := fmt.Sprintf("instance-%s-%d", t.Name(), time.Now().UnixNano())
// Given: Two root agents on separate resources that share the same
// AuthInstanceID, with CreatedAt values one hour apart.
olderCreatedAt := dbtime.Now().Add(-time.Hour)
newerCreatedAt := dbtime.Now()
olderAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resources[0].ID,
CreatedAt: olderCreatedAt,
AuthInstanceID: sql.NullString{
String: authInstanceID,
Valid: true,
},
})
newerAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resources[1].ID,
CreatedAt: newerCreatedAt,
AuthInstanceID: sql.NullString{
String: authInstanceID,
Valid: true,
},
})
ctx := testutil.Context(t, testutil.WaitShort)
// When: We list the agents for the shared instance ID.
agents, err := db.GetWorkspaceAgentsByInstanceID(ctx, authInstanceID)
require.NoError(t, err)
// Then: Both agents are returned, newer before older.
require.Len(t, agents, 2)
assert.Equal(t, newerAgent.ID, agents[0].ID)
assert.Equal(t, olderAgent.ID, agents[1].ID)
})
}