fix(coderd/notifications): serialize pending updates gauge writes (#25495)

Fixes a race where concurrent notification dispatch goroutines could
overwrite `coderd_notifications_pending_updates` with an older
buffer-length snapshot. Pending update snapshots now serialize count
evaluation with the gauge write, and inhibited dispatch results refresh
the metric when buffered.
This commit is contained in:
dylanhuff-at-coder
2026-05-29 14:02:13 -04:00
committed by GitHub
parent 5cdc9e28a9
commit 0401ed3af5
5 changed files with 135 additions and 18 deletions
+2 -4
View File
@@ -237,9 +237,7 @@ func (m *Manager) BufferedUpdatesCount() (success int, failure int) {
// syncUpdates updates messages in the store based on the given successful and failed message dispatch results. // syncUpdates updates messages in the store based on the given successful and failed message dispatch results.
func (m *Manager) syncUpdates(ctx context.Context) { func (m *Manager) syncUpdates(ctx context.Context) {
// Ensure we update the metrics to reflect the current state after each invocation. // Ensure we update the metrics to reflect the current state after each invocation.
defer func() { defer m.metrics.pendingUpdatesGauge.set(func() int { return len(m.success) + len(m.failure) })
m.metrics.PendingUpdates.Set(float64(len(m.success) + len(m.failure)))
}()
select { select {
case <-ctx.Done(): case <-ctx.Done():
@@ -250,7 +248,7 @@ func (m *Manager) syncUpdates(ctx context.Context) {
nSuccess := len(m.success) nSuccess := len(m.success)
nFailure := len(m.failure) nFailure := len(m.failure)
m.metrics.PendingUpdates.Set(float64(nSuccess + nFailure)) m.metrics.pendingUpdatesGauge.set(func() int { return len(m.success) + len(m.failure) })
// Nothing to do. // Nothing to do.
if nSuccess+nFailure == 0 { if nSuccess+nFailure == 0 {
+31 -5
View File
@@ -3,6 +3,7 @@ package notifications
import ( import (
"fmt" "fmt"
"strings" "strings"
"sync"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/client_golang/prometheus/promauto"
@@ -17,8 +18,28 @@ type Metrics struct {
InflightDispatches *prometheus.GaugeVec InflightDispatches *prometheus.GaugeVec
DispatcherSendSeconds *prometheus.HistogramVec DispatcherSendSeconds *prometheus.HistogramVec
PendingUpdates prometheus.Gauge PendingUpdates prometheus.Collector
SyncedUpdates prometheus.Counter SyncedUpdates prometheus.Counter
pendingUpdatesGauge *pendingUpdatesGauge
}
// pendingUpdatesGauge serializes count evaluation with the gauge write,
// preventing stale snapshots when concurrent goroutines race to update
// the metric.
type pendingUpdatesGauge struct {
gauge prometheus.Gauge
mu sync.Mutex
}
// set evaluates count under the lock and writes the result to the gauge.
// count is a function, not a value, so the channel length is read atomically
// with the write; passing a pre-evaluated int would reintroduce the race.
func (g *pendingUpdatesGauge) set(count func() int) {
g.mu.Lock()
defer g.mu.Unlock()
g.gauge.Set(float64(count()))
} }
const ( const (
@@ -35,6 +56,11 @@ const (
) )
func NewMetrics(reg prometheus.Registerer) *Metrics { func NewMetrics(reg prometheus.Registerer) *Metrics {
pendingUpdates := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Name: "pending_updates", Namespace: ns, Subsystem: subsystem,
Help: "The number of dispatch attempt results waiting to be flushed to the store.",
})
return &Metrics{ return &Metrics{
DispatchAttempts: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ DispatchAttempts: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Name: "dispatch_attempts_total", Namespace: ns, Subsystem: subsystem, Name: "dispatch_attempts_total", Namespace: ns, Subsystem: subsystem,
@@ -68,10 +94,10 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
}, []string{LabelMethod}), }, []string{LabelMethod}),
// Currently no requirement to discriminate between success and failure updates which are pending. // Currently no requirement to discriminate between success and failure updates which are pending.
PendingUpdates: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ PendingUpdates: pendingUpdates,
Name: "pending_updates", Namespace: ns, Subsystem: subsystem, pendingUpdatesGauge: &pendingUpdatesGauge{
Help: "The number of dispatch attempt results waiting to be flushed to the store.", gauge: pendingUpdates,
}), },
SyncedUpdates: promauto.With(reg).NewCounter(prometheus.CounterOpts{ SyncedUpdates: promauto.With(reg).NewCounter(prometheus.CounterOpts{
Name: "synced_updates_total", Namespace: ns, Subsystem: subsystem, Name: "synced_updates_total", Namespace: ns, Subsystem: subsystem,
Help: "The number of dispatch attempt results flushed to the store.", Help: "The number of dispatch attempt results flushed to the store.",
@@ -0,0 +1,85 @@
package notifications
import (
"sync"
"testing"
"github.com/prometheus/client_golang/prometheus"
promtest "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"github.com/coder/coder/v2/testutil"
)
func TestMetricsSetPendingUpdatesSerializesGaugeWrites(t *testing.T) {
t.Parallel()
realGauge := prometheus.NewGauge(prometheus.GaugeOpts{
Name: "test_pending_updates",
Help: "test pending updates gauge",
})
blockingGauge := &pendingUpdatesBlockingGauge{
Gauge: realGauge,
blockValue: 3,
entered: make(chan struct{}),
release: make(chan struct{}),
}
metrics := &Metrics{
PendingUpdates: blockingGauge,
pendingUpdatesGauge: &pendingUpdatesGauge{gauge: blockingGauge},
}
success := make(chan dispatchResult, 4)
failure := make(chan dispatchResult, 4)
success <- dispatchResult{}
success <- dispatchResult{}
firstDone := make(chan struct{})
go func() {
defer close(firstDone)
failure <- dispatchResult{}
// The first writer observes total=3 and blocks inside Set(3)
// while still holding the pendingUpdatesGauge mutex.
metrics.pendingUpdatesGauge.set(func() int { return len(success) + len(failure) })
}()
testutil.TryReceive(testutil.Context(t, testutil.WaitShort), t, blockingGauge.entered)
// The main goroutine raises the real total to 4 before a second
// writer queues behind the locked gauge.
success <- dispatchResult{}
secondDone := make(chan struct{})
go func() {
defer close(secondDone)
// This count must be evaluated after release, while holding the
// mutex, so the final gauge value cannot regress to 3.
metrics.pendingUpdatesGauge.set(func() int { return len(success) + len(failure) })
}()
close(blockingGauge.release)
testutil.TryReceive(testutil.Context(t, testutil.WaitShort), t, firstDone)
testutil.TryReceive(testutil.Context(t, testutil.WaitShort), t, secondDone)
require.Equal(t, 4, len(success)+len(failure))
require.EqualValues(t, 4, promtest.ToFloat64(metrics.PendingUpdates))
}
type pendingUpdatesBlockingGauge struct {
prometheus.Gauge
blockValue float64
entered chan struct{}
release chan struct{}
once sync.Once
}
func (g *pendingUpdatesBlockingGauge) Set(value float64) {
if value == g.blockValue {
g.once.Do(func() {
close(g.entered)
<-g.release
})
}
g.Gauge.Set(value)
}
+14 -7
View File
@@ -276,17 +276,24 @@ func TestPendingUpdatesMetric(t *testing.T) {
mClock.Advance(cfg.FetchInterval.Value()).MustWait(ctx) mClock.Advance(cfg.FetchInterval.Value()).MustWait(ctx)
// THEN: // THEN:
// handler has dispatched the given notifications. // Both handlers have dispatched the given notifications, and their
func() { // results are pending in the metrics.
require.EventuallyWithT(t, func(ct *assert.CollectT) {
handler.mu.RLock() handler.mu.RLock()
inboxHandler.mu.RLock()
defer handler.mu.RUnlock() defer handler.mu.RUnlock()
defer inboxHandler.mu.RUnlock()
require.Len(t, handler.succeeded, 1) assert.Len(ct, handler.succeeded, 1)
require.Len(t, handler.failed, 1) assert.Len(ct, handler.failed, 1)
}() assert.Len(ct, inboxHandler.succeeded, 1)
assert.Len(ct, inboxHandler.failed, 1)
// Both handler calls should be pending in the metrics. success, failure := mgr.BufferedUpdatesCount()
require.EqualValues(t, 4, promtest.ToFloat64(metrics.PendingUpdates)) assert.Equal(ct, 2, success)
assert.Equal(ct, 2, failure)
assert.EqualValues(ct, 4, promtest.ToFloat64(metrics.PendingUpdates))
}, testutil.WaitShort, testutil.IntervalFast)
// THEN: // THEN:
// Trigger syncing updates // Trigger syncing updates
+3 -2
View File
@@ -172,6 +172,7 @@ func (n *notifier) process(ctx context.Context, success chan<- dispatchResult, f
// If a notification template has been disabled by the user after a notification was enqueued, mark it as inhibited // If a notification template has been disabled by the user after a notification was enqueued, mark it as inhibited
if msg.Disabled { if msg.Disabled {
failure <- n.newInhibitedDispatch(msg) failure <- n.newInhibitedDispatch(msg)
n.metrics.pendingUpdatesGauge.set(func() int { return len(success) + len(failure) })
continue continue
} }
@@ -184,7 +185,7 @@ func (n *notifier) process(ctx context.Context, success chan<- dispatchResult, f
n.log.Error(ctx, "dispatcher construction failed", slog.F("msg_id", msg.ID), slog.Error(err)) n.log.Error(ctx, "dispatcher construction failed", slog.F("msg_id", msg.ID), slog.Error(err))
} }
failure <- n.newFailedDispatch(msg, err, xerrors.Is(err, decorateHelpersError{})) failure <- n.newFailedDispatch(msg, err, xerrors.Is(err, decorateHelpersError{}))
n.metrics.PendingUpdates.Set(float64(len(success) + len(failure))) n.metrics.pendingUpdatesGauge.set(func() int { return len(success) + len(failure) })
continue continue
} }
@@ -316,7 +317,7 @@ func (n *notifier) deliver(ctx context.Context, msg database.AcquireNotification
logger.Debug(ctx, "message dispatch succeeded") logger.Debug(ctx, "message dispatch succeeded")
} }
} }
n.metrics.PendingUpdates.Set(float64(len(success) + len(failure))) n.metrics.pendingUpdatesGauge.set(func() int { return len(success) + len(failure) })
return nil return nil
} }