feat: add ai provider status and reload freshness metrics (#25770)

Add metrics for `aibridged` and `aibridgeproxyd`'s provider statuses. AI providers can be modified, and possibly misconfigured, at runtime. These metrics help operators understand the state of these provider definitions in case unexpected behaviour is observed.
This commit is contained in:
Danny Kopping
2026-05-28 14:57:33 +02:00
committed by GitHub
parent 637855e276
commit 12520ee964
18 changed files with 704 additions and 125 deletions
+94
View File
@@ -0,0 +1,94 @@
package aibridged
import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// Metrics is the prometheus surface for aibridged provider reloads.
// Its methods check for a nil *Metrics receiver and return without
// doing anything, so a nil value can be used to disable metric updates.
type Metrics struct {
// registerer is the Registerer passed to NewMetrics; it is retained
// so Unregister can remove the collectors again.
registerer prometheus.Registerer
// ProviderInfo is one series per configured provider; value is
// always 1 and the status label carries the alertable signal.
// Labels: provider_name, provider_type, status.
ProviderInfo *prometheus.GaugeVec
// ProvidersLastReloadTimestampSeconds is the unix timestamp of the
// last reload attempt, success or failure.
ProvidersLastReloadTimestampSeconds prometheus.Gauge
// ProvidersLastReloadSuccessTimestampSeconds is the unix timestamp
// of the last reload that successfully refreshed the pool. A gap
// against ProvidersLastReloadTimestampSeconds means the loop is
// firing but the refresh function is failing.
ProvidersLastReloadSuccessTimestampSeconds prometheus.Gauge
}
// NewMetrics creates the provider gauges through a promauto factory
// bound to reg and returns them bundled in a Metrics. reg is kept on
// the result so Unregister can remove the collectors later.
//
// NOTE(review): the metric names below carry no namespace, while one
// help string refers to a coder_aibridged_ prefixed name; presumably
// the caller passes a prefix-wrapping Registerer. Confirm at the call
// site.
func NewMetrics(reg prometheus.Registerer) *Metrics {
	auto := promauto.With(reg)

	// Collectors are created in the same order as the struct fields.
	providerInfo := auto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "provider_info",
			Help: "One series per configured AI provider. Value is always 1; the status label (enabled, disabled, error) carries the alertable signal.",
		},
		[]string{"provider_name", "provider_type", "status"},
	)
	lastAttempt := auto.NewGauge(prometheus.GaugeOpts{
		Name: "providers_last_reload_timestamp_seconds",
		Help: "Unix timestamp of the last provider reload attempt, success or failure.",
	})
	lastSuccess := auto.NewGauge(prometheus.GaugeOpts{
		Name: "providers_last_reload_success_timestamp_seconds",
		Help: "Unix timestamp of the last provider reload that successfully refreshed the pool. A gap against coder_aibridged_providers_last_reload_timestamp_seconds means the loop is firing but the refresh function is failing.",
	})

	metrics := new(Metrics)
	metrics.registerer = reg
	metrics.ProviderInfo = providerInfo
	metrics.ProvidersLastReloadTimestampSeconds = lastAttempt
	metrics.ProvidersLastReloadSuccessTimestampSeconds = lastSuccess
	return metrics
}
// Unregister removes the provider metrics from the registerer they
// were created against.
//
// It is a no-op on a nil receiver. It is also a no-op when the Metrics
// holds no registerer, which is what NewMetrics(nil) produces: a
// promauto factory built from a nil Registerer does not register the
// collectors it creates, so there is nothing to remove and calling a
// method on the nil interface would panic.
func (m *Metrics) Unregister() {
	if m == nil || m.registerer == nil {
		return
	}
	// The boolean results are deliberately ignored: false only means
	// the collector was not registered, which needs no handling here.
	m.registerer.Unregister(m.ProviderInfo)
	m.registerer.Unregister(m.ProvidersLastReloadTimestampSeconds)
	m.registerer.Unregister(m.ProvidersLastReloadSuccessTimestampSeconds)
}
// RecordReloadAttempt sets the last-attempt gauge to the current unix
// time in whole seconds. Callers invoke it when a reload begins, so a
// reload that stalls shows up as this gauge running ahead of
// ProvidersLastReloadSuccessTimestampSeconds. A nil receiver is a
// no-op.
func (m *Metrics) RecordReloadAttempt() {
	if m == nil {
		return
	}
	startedAt := time.Now().Unix()
	m.ProvidersLastReloadTimestampSeconds.Set(float64(startedAt))
}
// RecordReloadSuccess replaces the contents of the ProviderInfo
// GaugeVec with one series per entry in outcomes, then sets the
// last-success gauge to the current unix time in whole seconds.
// Because the vector is reset before being rewritten, providers that
// are no longer in outcomes stop being reported. A nil receiver is a
// no-op.
func (m *Metrics) RecordReloadSuccess(outcomes []ProviderOutcome) {
	if m == nil {
		return
	}

	// Publish the snapshot first so the success stamp is never newer
	// than the series it vouches for.
	WriteProviderInfoSnapshot(m.ProviderInfo, outcomes)

	finishedAt := time.Now().Unix()
	m.ProvidersLastReloadSuccessTimestampSeconds.Set(float64(finishedAt))
}
// WriteProviderInfoSnapshot resets info and then writes one series,
// with value 1, per outcome, labelled with the outcome's name, type
// and status. Both aibridged and aibridgeproxyd use this so the
// provider_info recording contract stays in one place.
//
// A nil info is tolerated and ignored, matching the nil-safe Metrics
// methods, so a caller that runs without metrics does not have to
// guard the call. A nil or empty outcomes slice simply leaves info
// empty after the reset.
//
// NOTE(review): the reset and the subsequent writes are separate
// operations on the vector, so a scrape that lands between them may
// presumably observe a partial snapshot. Confirm this is acceptable.
func WriteProviderInfoSnapshot(info *prometheus.GaugeVec, outcomes []ProviderOutcome) {
	if info == nil {
		return
	}
	info.Reset()
	for _, o := range outcomes {
		info.WithLabelValues(o.Name, o.Type, string(o.Status)).Set(1)
	}
}
+84
View File
@@ -0,0 +1,84 @@
package aibridged_test
import (
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
promtest "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/aibridged"
)
// TestMetricsRecordReloadSuccess covers the provider_info GaugeVec
// surface and both reload timestamps: after one attempt+success pass
// the vector holds exactly one series per outcome, each with value 1
// under the expected literal label values, and both timestamps fall
// inside the window in which the calls were made.
func TestMetricsRecordReloadSuccess(t *testing.T) {
	t.Parallel()

	reg := prometheus.NewRegistry()
	m := aibridged.NewMetrics(reg)

	outcomes := []aibridged.ProviderOutcome{
		{Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled},
		{Name: "beta", Type: "anthropic", Status: aibridged.ProviderStatusDisabled},
		{Name: "gamma", Type: "openai", Status: aibridged.ProviderStatusError, Err: xerrors.New("bad config")},
	}

	before := time.Now().Unix()
	m.RecordReloadAttempt()
	m.RecordReloadSuccess(outcomes)
	after := time.Now().Unix()

	// Count before the per-series lookups below: WithLabelValues
	// creates a missing series on access, which would distort the
	// count. An exact count also catches spurious extra series that
	// the value assertions alone would not notice.
	require.Equal(t, len(outcomes), promtest.CollectAndCount(m.ProviderInfo),
		"expected exactly one provider_info series per outcome")

	// Label values are spelled out literally to pin the emitted
	// strings rather than the Go constants.
	assert.Equal(t, 1.0, promtest.ToFloat64(m.ProviderInfo.WithLabelValues("alpha", "openai", "enabled")))
	assert.Equal(t, 1.0, promtest.ToFloat64(m.ProviderInfo.WithLabelValues("beta", "anthropic", "disabled")))
	assert.Equal(t, 1.0, promtest.ToFloat64(m.ProviderInfo.WithLabelValues("gamma", "openai", "error")))

	attemptTS := int64(promtest.ToFloat64(m.ProvidersLastReloadTimestampSeconds))
	successTS := int64(promtest.ToFloat64(m.ProvidersLastReloadSuccessTimestampSeconds))
	assert.GreaterOrEqual(t, attemptTS, before)
	assert.LessOrEqual(t, attemptTS, after)
	assert.GreaterOrEqual(t, successTS, before)
	assert.LessOrEqual(t, successTS, after)
	// The attempt is stamped before the success, so it must not be
	// the later of the two.
	assert.LessOrEqual(t, attemptTS, successTS)
}
// TestMetricsResetsStaleProviderSeries verifies that providers removed
// from the outcome set between reloads do not leave behind stale
// series.
func TestMetricsResetsStaleProviderSeries(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := aibridged.NewMetrics(reg)
m.RecordReloadSuccess([]aibridged.ProviderOutcome{
{Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled},
{Name: "beta", Type: "anthropic", Status: aibridged.ProviderStatusEnabled},
})
require.Equal(t, 2, promtest.CollectAndCount(m.ProviderInfo))
m.RecordReloadSuccess([]aibridged.ProviderOutcome{
{Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled},
})
assert.Equal(t, 1, promtest.CollectAndCount(m.ProviderInfo),
"beta should have been Reset out of the GaugeVec")
assert.Equal(t, 1.0, promtest.ToFloat64(m.ProviderInfo.WithLabelValues("alpha", "openai", "enabled")))
}
// TestMetricsNilSafe calls every Metrics method on a nil *Metrics and
// requires that none of them panics, which is what lets callers pass
// `nil` to disable metric updates instead of guarding each call site.
func TestMetricsNilSafe(t *testing.T) {
	t.Parallel()

	nilMetrics := (*aibridged.Metrics)(nil)
	callEveryMethod := func() {
		nilMetrics.RecordReloadAttempt()
		nilMetrics.RecordReloadSuccess(nil)
		nilMetrics.Unregister()
	}

	require.NotPanics(t, callEveryMethod)
}
+28
View File
@@ -0,0 +1,28 @@
package aibridged
// ProviderStatus is the lifecycle state of a configured AI provider.
// It is a string type; WriteProviderInfoSnapshot converts it with
// string(...) and uses the result as the status label value of the
// provider_info metric.
type ProviderStatus string
// The defined ProviderStatus values. Their string literals are emitted
// verbatim as the status label of provider_info, so changing a literal
// changes the series that are exported.
const (
// ProviderStatusEnabled indicates the provider is configured and
// valid, and is included in the active pool snapshot.
ProviderStatusEnabled ProviderStatus = "enabled"
// ProviderStatusDisabled indicates the provider is configured but
// intentionally turned off by an operator.
ProviderStatusDisabled ProviderStatus = "disabled"
// ProviderStatusError indicates the provider is configured but
// cannot be constructed (missing keys, unsupported type, malformed
// settings).
ProviderStatusError ProviderStatus = "error"
)
// ProviderOutcome classifies one ai_providers row, including disabled
// and errored rows the pool excludes. Err is populated only when
// Status == ProviderStatusError; the build error is already logged at
// the call site.
type ProviderOutcome struct {
// Name is written as the provider_name label of provider_info.
Name string
// Type is written as the provider_type label of provider_info.
Type string
// Status is written, as its string value, as the status label of
// provider_info.
Status ProviderStatus
// Err is not read by the metrics code in this package; it travels
// with the outcome for callers that want the failure reason.
Err error
}