From d9e3e206ccaa7c38c81d2d75d48da3486367cdd2 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 23 Apr 2026 11:13:41 +0100 Subject: [PATCH] fix(enterprise/coderd/x/chatd): deflake relay drain test for multiple timers (#24609) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes https://github.com/coder/internal/issues/1474. PR #24549 introduced a `quartz.NewMock` clock + `Trap().NewTimer("drain")` to remove the wall-clock race. However, the trap consumed only **one** `NewTimer("drain")` call via `MustWait/MustRelease`. The merge loop has two code paths that create drain timers with the same tag: - Relay result handler (`drainAndClose` path in `relayReadyCh` case): when an async dial completes after `drainAndClose` was set. - Status notification handler (`relayParts != nil` branch in `statusNotifications` case): when `status!=running` arrives while an active relay exists. Depending on goroutine scheduling, one or both paths fire. When two calls hit the trap, the second blocks the merge loop in `matchCallLocked` (quartz waits for all traps to release). Since the test already moved past `MustWait`, nobody reads the second call from the trap channel, deadlocking the test. - Replace the single `MustWait/MustRelease/Advance` with a goroutine that loops over `trapDrain.Wait`, releasing and advancing for every drain timer. - No production code changes. > 🤖 --- enterprise/coderd/x/chatd/chatd_test.go | 33 ++++++++++++++++++++----- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/enterprise/coderd/x/chatd/chatd_test.go b/enterprise/coderd/x/chatd/chatd_test.go index a59a61780b..ef7f39c5c5 100644 --- a/enterprise/coderd/x/chatd/chatd_test.go +++ b/enterprise/coderd/x/chatd/chatd_test.go @@ -1545,15 +1545,36 @@ func TestSubscribeRelayDrainWithinGraceLeavesBufferRetained(t *testing.T) { } }, testutil.IntervalFast) - // Wait for the drain timer to be armed by the relay manager, - // then release and advance the subscriber clock past the drain - // timeout. This deterministically fires the drain without - // relying on wall-clock timing. - trapDrain.MustWait(ctx).MustRelease(ctx) - subscriberClock.Advance(200 * time.Millisecond).MustWait(ctx) + // Drain all NewTimer("drain") calls in a background goroutine. + // The merge loop may create one or two drain timers depending + // on the relative ordering of the status=WAITING pubsub + // notification and the async relay dial completion. Each + // trapped call must be released so the production goroutine + // is unblocked, and the clock must be advanced past the + // 200ms drain timeout to fire the timer. + var drainsFired atomic.Int32 + go func() { + for { + call, err := trapDrain.Wait(ctx) + if err != nil { + return + } + if err := call.Release(ctx); err != nil { + return + } + subscriberClock.Advance(200 * time.Millisecond) + drainsFired.Add(1) + } + }() + // Wait for DB status=waiting AND at least one drain timer to + // have fired. Checking drainsFired proves the relay was torn + // down by the drain path, not by context cancellation. evCtx2 := testutil.Context(t, testutil.WaitLong) testutil.Eventually(evCtx2, t, func(ctx context.Context) bool { + if drainsFired.Load() == 0 { + return false + } fromDB, dbErr := db.GetChatByID(ctx, chat.ID) if dbErr != nil { return false