mirror of
https://github.com/coder/coder.git
synced 2026-06-03 04:58:23 +00:00
5b6b7719df
## Problem When a prebuilt workspace is claimed, the agent reinitializes via a single fire-and-forget pubsub event over SSE. If the agent's SSE connection is interrupted at claim time, the event is permanently lost — the workspace is stuck with no self-healing path. Additionally, regular (non-prebuild) workspaces had no way to opt out of the `/reinit` polling loop — agents would reconnect indefinitely to an endpoint that would never send them anything useful. ## Root Cause `workspaceAgentReinit` fetches the workspace (with its current `owner_id`) via `GetWorkspaceByAgentID`, but never checked whether a claim already happened. It only subscribed to pubsub for future events. The database already has durable claim state (`owner_id` changes from `PrebuildsSystemUserID` to the real user), but no layer ever consulted it on reconnection. ## Solution ### Server-side durable check with first-build-initiator gating **TOCTOU-safe ordering**: Subscribe to pubsub claim events *before* any durable checks, so a claim that fires during the check is buffered in the channel rather than lost. **First-build-initiator gating**: When `!workspace.IsPrebuild()` (owner is no longer the system user), look up the first build's `InitiatorID`. The prebuild reconciler always uses `PrebuildsSystemUserID` as the initiator. This distinguishes claimed prebuilds from regular workspaces without any SQL schema changes. - **Regular workspace** (first build initiator ≠ system user) → **409 Conflict**, agent stops reconnecting - **Claimed prebuild, build completed** → pre-seed channel with reinit event and close it, transmitter delivers one-shot then exits - **Claimed prebuild, build in-progress** → fall through to pubsub subscription, agent waits for completion event - **Unclaimed prebuild** → pubsub subscription (existing happy path) ### Declarative reinit events (defense-in-depth) - Added `UserID` field to `ReinitializationEvent` with JSON tags - Switched pubsub serialization from raw string to JSON (with backward-compat fallback for rolling upgrades) - Populated `UserID` at both the publish site and the durable check ### Agent SDK: 409 handling `WaitForReinitLoop` detects 409 Conflict from the server and closes the `reinitEvents` channel, cleanly exiting the retry goroutine. ### Agent CLI: fixed two bugs + added reinitCtx - **Closed channel (`!ok`)**: now blocks on `<-ctx.Done()` instead of `continue`, keeping the current agent running. Previously this would leak agents by skipping `agnt.Close()` and re-entering the loop. - **Duplicate owner reinit**: cancels `reinitCtx` (stops the reinit goroutine), then blocks on `<-ctx.Done()`. Previously `continue` would skip cleanup and create a new agent on the next loop iteration. - **`reinitCtx`**: a cancellable child of `ctx` passed to `WaitForReinitLoop`, allowing the agent to stop the reinit HTTP polling after reinit completes. ### Agent-side idempotency Tracks `lastOwnerID` in the agent reinit loop — duplicate events for the same owner are skipped. ## Testing - **"unclaimed prebuild receives reinit via pubsub"**: prebuild owned by system user, pubsub event triggers reinit - **"claimed prebuild receives one-shot reinit on reconnect"**: first build by system user, owner changed, build completed → immediate reinit (no pubsub needed) - **"claimed prebuild waits during in-progress claim build"**: claimed but build still running → no reinit until build completes - **"regular workspace gets 409"**: first build by real user → 409 Conflict, agent stops polling - Updated claim publisher/listener tests: verify `UserID` survives JSON round-trip + backward compat with raw string payloads - Updated SSE round-trip test: verify `UserID` survives transmit → receive cycle Fixes #22359 ## Rolling upgrade note During a rolling deploy where old coderd instances coexist with new ones, the pubsub `ReinitializationEvent` has a new `workspace_id` field (JSON key `workspace_id`). Old publishers send a raw reason string instead of JSON; the new listener gracefully falls back by treating the entire payload as the reason and filling in `WorkspaceID` from context. The only visible effect during the upgrade window is that `WorkspaceID` may be the zero UUID in agent-side logs — this is cosmetic and resolves once all instances are updated.
189 lines
5.9 KiB
Go
189 lines
5.9 KiB
Go
package agentsdk_test
|
|
|
|
import (
|
|
"context"
|
|
"io"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"net/url"
|
|
"testing"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/stretchr/testify/require"
|
|
"tailscale.com/tailcfg"
|
|
|
|
"cdr.dev/slog/v3/sloggers/slogtest"
|
|
"github.com/coder/coder/v2/codersdk/agentsdk"
|
|
"github.com/coder/coder/v2/testutil"
|
|
)
|
|
|
|
func TestStreamAgentReinitEvents(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
t.Run("transmitted events are received", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
eventToSend := agentsdk.ReinitializationEvent{
|
|
WorkspaceID: uuid.New(),
|
|
Reason: agentsdk.ReinitializeReasonPrebuildClaimed,
|
|
OwnerID: uuid.New(),
|
|
}
|
|
|
|
events := make(chan agentsdk.ReinitializationEvent, 1)
|
|
events <- eventToSend
|
|
|
|
transmitCtx := testutil.Context(t, testutil.WaitShort)
|
|
transmitErrCh := make(chan error, 1)
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
transmitter := agentsdk.NewSSEAgentReinitTransmitter(slogtest.Make(t, nil), w, r)
|
|
transmitErrCh <- transmitter.Transmit(transmitCtx, events)
|
|
}))
|
|
defer srv.Close()
|
|
|
|
requestCtx := testutil.Context(t, testutil.WaitShort)
|
|
req, err := http.NewRequestWithContext(requestCtx, "GET", srv.URL, nil)
|
|
require.NoError(t, err)
|
|
client := &http.Client{}
|
|
resp, err := client.Do(req)
|
|
require.NoError(t, err)
|
|
defer resp.Body.Close()
|
|
|
|
receiveCtx := testutil.Context(t, testutil.WaitShort)
|
|
receiver := agentsdk.NewSSEAgentReinitReceiver(resp.Body)
|
|
sentEvent, receiveErr := receiver.Receive(receiveCtx)
|
|
require.Nil(t, receiveErr)
|
|
require.Equal(t, eventToSend, *sentEvent)
|
|
})
|
|
|
|
t.Run("doesn't transmit events if the transmitter context is canceled", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
eventToSend := agentsdk.ReinitializationEvent{
|
|
WorkspaceID: uuid.New(),
|
|
Reason: agentsdk.ReinitializeReasonPrebuildClaimed,
|
|
}
|
|
|
|
events := make(chan agentsdk.ReinitializationEvent, 1)
|
|
events <- eventToSend
|
|
|
|
transmitCtx, cancelTransmit := context.WithCancel(testutil.Context(t, testutil.WaitShort))
|
|
cancelTransmit()
|
|
transmitErrCh := make(chan error, 1)
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
transmitter := agentsdk.NewSSEAgentReinitTransmitter(slogtest.Make(t, nil), w, r)
|
|
transmitErrCh <- transmitter.Transmit(transmitCtx, events)
|
|
}))
|
|
|
|
defer srv.Close()
|
|
|
|
requestCtx := testutil.Context(t, testutil.WaitShort)
|
|
req, err := http.NewRequestWithContext(requestCtx, "GET", srv.URL, nil)
|
|
require.NoError(t, err)
|
|
client := &http.Client{}
|
|
resp, err := client.Do(req)
|
|
require.NoError(t, err)
|
|
defer resp.Body.Close()
|
|
|
|
receiveCtx := testutil.Context(t, testutil.WaitShort)
|
|
receiver := agentsdk.NewSSEAgentReinitReceiver(resp.Body)
|
|
sentEvent, receiveErr := receiver.Receive(receiveCtx)
|
|
require.Nil(t, sentEvent)
|
|
require.ErrorIs(t, receiveErr, io.EOF)
|
|
})
|
|
|
|
t.Run("does not receive events if the receiver context is canceled", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
eventToSend := agentsdk.ReinitializationEvent{
|
|
WorkspaceID: uuid.New(),
|
|
Reason: agentsdk.ReinitializeReasonPrebuildClaimed,
|
|
}
|
|
|
|
events := make(chan agentsdk.ReinitializationEvent, 1)
|
|
events <- eventToSend
|
|
|
|
transmitCtx := testutil.Context(t, testutil.WaitShort)
|
|
transmitErrCh := make(chan error, 1)
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
transmitter := agentsdk.NewSSEAgentReinitTransmitter(slogtest.Make(t, nil), w, r)
|
|
transmitErrCh <- transmitter.Transmit(transmitCtx, events)
|
|
}))
|
|
defer srv.Close()
|
|
|
|
requestCtx := testutil.Context(t, testutil.WaitShort)
|
|
req, err := http.NewRequestWithContext(requestCtx, "GET", srv.URL, nil)
|
|
require.NoError(t, err)
|
|
client := &http.Client{}
|
|
resp, err := client.Do(req)
|
|
require.NoError(t, err)
|
|
defer resp.Body.Close()
|
|
|
|
receiveCtx, cancelReceive := context.WithCancel(context.Background())
|
|
cancelReceive()
|
|
receiver := agentsdk.NewSSEAgentReinitReceiver(resp.Body)
|
|
sentEvent, receiveErr := receiver.Receive(receiveCtx)
|
|
require.Nil(t, sentEvent)
|
|
require.ErrorIs(t, receiveErr, context.Canceled)
|
|
})
|
|
}
|
|
|
|
func TestRewriteDERPMap(t *testing.T) {
|
|
t.Parallel()
|
|
// This test ensures that RewriteDERPMap mutates built-in DERPs with the
|
|
// client access URL.
|
|
dm := &tailcfg.DERPMap{
|
|
Regions: map[int]*tailcfg.DERPRegion{
|
|
1: {
|
|
EmbeddedRelay: true,
|
|
RegionID: 1,
|
|
Nodes: []*tailcfg.DERPNode{{
|
|
HostName: "bananas.org",
|
|
DERPPort: 1,
|
|
}},
|
|
},
|
|
},
|
|
}
|
|
parsed, err := url.Parse("https://coconuts.org:44558")
|
|
require.NoError(t, err)
|
|
client := agentsdk.New(parsed, agentsdk.WithFixedToken("unused"))
|
|
client.RewriteDERPMap(dm)
|
|
region := dm.Regions[1]
|
|
require.True(t, region.EmbeddedRelay)
|
|
require.Len(t, region.Nodes, 1)
|
|
node := region.Nodes[0]
|
|
require.Equal(t, "coconuts.org", node.HostName)
|
|
require.Equal(t, 44558, node.DERPPort)
|
|
}
|
|
|
|
func TestExternalAuthRequestQuery(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
t.Run("IncludesGitRefFieldsAndOmitsWorkdir", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
require.Equal(t, "/api/v2/workspaceagents/me/external-auth", r.URL.Path)
|
|
require.Equal(t, "true", r.URL.Query().Get("listen"))
|
|
require.Equal(t, "main", r.URL.Query().Get("git_branch"))
|
|
require.Equal(t, "https://github.com/coder/coder.git", r.URL.Query().Get("git_remote_origin"))
|
|
require.Equal(t, "test-chat-id", r.URL.Query().Get("chat_id"))
|
|
require.False(t, r.URL.Query().Has("workdir"))
|
|
_, _ = w.Write([]byte(`{"type":"github","access_token":"token"}`))
|
|
}))
|
|
defer srv.Close()
|
|
|
|
parsedURL, err := url.Parse(srv.URL)
|
|
require.NoError(t, err)
|
|
|
|
client := agentsdk.New(parsedURL, agentsdk.WithFixedToken("token"))
|
|
_, err = client.ExternalAuth(testutil.Context(t, testutil.WaitShort), agentsdk.ExternalAuthRequest{
|
|
Match: "github.com",
|
|
Listen: true,
|
|
GitBranch: "main",
|
|
GitRemoteOrigin: "https://github.com/coder/coder.git",
|
|
ChatID: "test-chat-id",
|
|
})
|
|
require.NoError(t, err)
|
|
})
|
|
}
|