mirror of
https://github.com/coder/coder.git
synced 2026-06-03 04:58:23 +00:00
5b6b7719df
## Problem When a prebuilt workspace is claimed, the agent reinitializes via a single fire-and-forget pubsub event over SSE. If the agent's SSE connection is interrupted at claim time, the event is permanently lost — the workspace is stuck with no self-healing path. Additionally, regular (non-prebuild) workspaces had no way to opt out of the `/reinit` polling loop — agents would reconnect indefinitely to an endpoint that would never send them anything useful. ## Root Cause `workspaceAgentReinit` fetches the workspace (with its current `owner_id`) via `GetWorkspaceByAgentID`, but never checked whether a claim already happened. It only subscribed to pubsub for future events. The database already has durable claim state (`owner_id` changes from `PrebuildsSystemUserID` to the real user), but no layer ever consulted it on reconnection. ## Solution ### Server-side durable check with first-build-initiator gating **TOCTOU-safe ordering**: Subscribe to pubsub claim events *before* any durable checks, so a claim that fires during the check is buffered in the channel rather than lost. **First-build-initiator gating**: When `!workspace.IsPrebuild()` (owner is no longer the system user), look up the first build's `InitiatorID`. The prebuild reconciler always uses `PrebuildsSystemUserID` as the initiator. This distinguishes claimed prebuilds from regular workspaces without any SQL schema changes. 
- **Regular workspace** (first build initiator ≠ system user) → **409 Conflict**, agent stops reconnecting - **Claimed prebuild, build completed** → pre-seed channel with reinit event and close it, transmitter delivers one-shot then exits - **Claimed prebuild, build in-progress** → fall through to pubsub subscription, agent waits for completion event - **Unclaimed prebuild** → pubsub subscription (existing happy path) ### Declarative reinit events (defense-in-depth) - Added `UserID` field to `ReinitializationEvent` with JSON tags - Switched pubsub serialization from raw string to JSON (with backward-compat fallback for rolling upgrades) - Populated `UserID` at both the publish site and the durable check ### Agent SDK: 409 handling `WaitForReinitLoop` detects 409 Conflict from the server and closes the `reinitEvents` channel, cleanly exiting the retry goroutine. ### Agent CLI: fixed two bugs + added reinitCtx - **Closed channel (`!ok`)**: now blocks on `<-ctx.Done()` instead of `continue`, keeping the current agent running. Previously this would leak agents by skipping `agnt.Close()` and re-entering the loop. - **Duplicate owner reinit**: cancels `reinitCtx` (stops the reinit goroutine), then blocks on `<-ctx.Done()`. Previously `continue` would skip cleanup and create a new agent on the next loop iteration. - **`reinitCtx`**: a cancellable child of `ctx` passed to `WaitForReinitLoop`, allowing the agent to stop the reinit HTTP polling after reinit completes. ### Agent-side idempotency Tracks `lastOwnerID` in the agent reinit loop — duplicate events for the same owner are skipped. 
## Testing - **"unclaimed prebuild receives reinit via pubsub"**: prebuild owned by system user, pubsub event triggers reinit - **"claimed prebuild receives one-shot reinit on reconnect"**: first build by system user, owner changed, build completed → immediate reinit (no pubsub needed) - **"claimed prebuild waits during in-progress claim build"**: claimed but build still running → no reinit until build completes - **"regular workspace gets 409"**: first build by real user → 409 Conflict, agent stops polling - Updated claim publisher/listener tests: verify `UserID` survives JSON round-trip + backward compat with raw string payloads - Updated SSE round-trip test: verify `UserID` survives transmit → receive cycle Fixes #22359 ## Rolling upgrade note During a rolling deploy where old coderd instances coexist with new ones, the pubsub `ReinitializationEvent` has a new owner-ID field (the `UserID` field described above, asserted as `OwnerID` in the tests below) that is carried in the JSON payload. Old publishers send a raw reason string instead of JSON; the new listener gracefully falls back by treating the entire payload as the reason and filling in `WorkspaceID` from context. The only visible effect during the upgrade window is that the owner ID may be the zero UUID in agent-side logs — this is cosmetic and resolves once all instances are updated.
141 lines
4.1 KiB
Go
141 lines
4.1 KiB
Go
package prebuilds_test
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/stretchr/testify/require"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3/sloggers/slogtest"
|
|
"github.com/coder/coder/v2/coderd/database/pubsub"
|
|
"github.com/coder/coder/v2/coderd/prebuilds"
|
|
"github.com/coder/coder/v2/codersdk/agentsdk"
|
|
"github.com/coder/coder/v2/testutil"
|
|
)
|
|
|
|
func TestPubsubWorkspaceClaimPublisher(t *testing.T) {
|
|
t.Parallel()
|
|
t.Run("published claim is received by a listener for the same workspace", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
ctx := testutil.Context(t, testutil.WaitShort)
|
|
logger := testutil.Logger(t)
|
|
ps := pubsub.NewInMemory()
|
|
workspaceID := uuid.New()
|
|
publisher := prebuilds.NewPubsubWorkspaceClaimPublisher(ps)
|
|
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, logger)
|
|
|
|
events, cancel, err := listener.ListenForWorkspaceClaims(ctx, workspaceID)
|
|
require.NoError(t, err)
|
|
defer cancel()
|
|
|
|
userID := uuid.New()
|
|
claim := agentsdk.ReinitializationEvent{
|
|
WorkspaceID: workspaceID,
|
|
Reason: agentsdk.ReinitializeReasonPrebuildClaimed,
|
|
OwnerID: userID,
|
|
}
|
|
err = publisher.PublishWorkspaceClaim(claim)
|
|
require.NoError(t, err)
|
|
|
|
gotEvent := testutil.RequireReceive(ctx, t, events)
|
|
require.Equal(t, workspaceID, gotEvent.WorkspaceID)
|
|
require.Equal(t, claim.Reason, gotEvent.Reason)
|
|
require.Equal(t, userID, gotEvent.OwnerID)
|
|
})
|
|
|
|
t.Run("fail to publish claim", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
ps := &brokenPubsub{}
|
|
|
|
publisher := prebuilds.NewPubsubWorkspaceClaimPublisher(ps)
|
|
claim := agentsdk.ReinitializationEvent{
|
|
WorkspaceID: uuid.New(),
|
|
Reason: agentsdk.ReinitializeReasonPrebuildClaimed,
|
|
}
|
|
|
|
err := publisher.PublishWorkspaceClaim(claim)
|
|
require.ErrorContains(t, err, "failed to trigger prebuilt workspace agent reinitialization")
|
|
})
|
|
}
|
|
|
|
func TestPubsubWorkspaceClaimListener(t *testing.T) {
|
|
t.Parallel()
|
|
t.Run("finds claim events for its workspace", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
ps := pubsub.NewInMemory()
|
|
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))
|
|
|
|
workspaceID := uuid.New()
|
|
events, cancelFunc, err := listener.ListenForWorkspaceClaims(context.Background(), workspaceID)
|
|
require.NoError(t, err)
|
|
defer cancelFunc()
|
|
|
|
// Publish a claim
|
|
channel := agentsdk.PrebuildClaimedChannel(workspaceID)
|
|
reason := agentsdk.ReinitializeReasonPrebuildClaimed
|
|
err = ps.Publish(channel, []byte(reason))
|
|
require.NoError(t, err)
|
|
|
|
// Verify we receive the claim
|
|
ctx := testutil.Context(t, testutil.WaitShort)
|
|
claim := testutil.RequireReceive(ctx, t, events)
|
|
require.Equal(t, workspaceID, claim.WorkspaceID)
|
|
require.Equal(t, reason, claim.Reason)
|
|
require.Equal(t, uuid.Nil, claim.OwnerID)
|
|
})
|
|
|
|
t.Run("ignores claim events for other workspaces", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
ps := pubsub.NewInMemory()
|
|
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))
|
|
|
|
workspaceID := uuid.New()
|
|
otherWorkspaceID := uuid.New()
|
|
events, cancelFunc, err := listener.ListenForWorkspaceClaims(context.Background(), workspaceID)
|
|
require.NoError(t, err)
|
|
defer cancelFunc()
|
|
|
|
// Publish a claim for a different workspace
|
|
channel := agentsdk.PrebuildClaimedChannel(otherWorkspaceID)
|
|
err = ps.Publish(channel, []byte(agentsdk.ReinitializeReasonPrebuildClaimed))
|
|
require.NoError(t, err)
|
|
|
|
// Verify we don't receive the claim
|
|
select {
|
|
case <-events:
|
|
t.Fatal("received claim for wrong workspace")
|
|
case <-time.After(100 * time.Millisecond):
|
|
// Expected - no claim received
|
|
}
|
|
})
|
|
|
|
t.Run("communicates the error if it can't subscribe", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
ps := &brokenPubsub{}
|
|
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))
|
|
|
|
_, _, err := listener.ListenForWorkspaceClaims(context.Background(), uuid.New())
|
|
require.ErrorContains(t, err, "failed to subscribe to prebuild claimed channel")
|
|
})
|
|
}
|
|
|
|
type brokenPubsub struct {
|
|
pubsub.Pubsub
|
|
}
|
|
|
|
func (brokenPubsub) Subscribe(_ string, _ pubsub.Listener) (func(), error) {
|
|
return nil, xerrors.New("broken")
|
|
}
|
|
|
|
func (brokenPubsub) Publish(_ string, _ []byte) error {
|
|
return xerrors.New("broken")
|
|
}
|