diff --git a/cli/aibridged.go b/cli/aibridged.go index da28a74dee..caf67082fc 100644 --- a/cli/aibridged.go +++ b/cli/aibridged.go @@ -37,6 +37,7 @@ func newAIBridgeDaemon(coderAPI *coderd.API, providers []aibridge.Provider, cfg reg := prometheus.WrapRegistererWithPrefix("coder_aibridged_", coderAPI.PrometheusRegistry) metrics := aibridge.NewMetrics(reg) + providerMetrics := aibridged.NewMetrics(reg) tracer := coderAPI.TracerProvider.Tracer(tracing.TracerName) // Create pool for reusable stateful [aibridge.RequestBridge] instances (one per user). @@ -50,10 +51,11 @@ func newAIBridgeDaemon(coderAPI *coderd.API, providers []aibridge.Provider, cfg // derives from env config and serves as a fallback if the database // load fails inside the reloader. reloader := &poolDBReloader{ - pool: pool, - db: coderAPI.Database, - cfg: cfg, - logger: logger.Named("provider-loader"), + pool: pool, + db: coderAPI.Database, + cfg: cfg, + logger: logger.Named("provider-loader"), + metrics: providerMetrics, } unsubscribe, err := aibridged.SubscribeProviderReload(ctx, coderAPI.Pubsub, reloader, logger.Named("provider-reload")) if err != nil { @@ -78,14 +80,16 @@ func newAIBridgeDaemon(coderAPI *coderd.API, providers []aibridge.Provider, cfg // the live provider set from the database and forwarding it to the // pool. 
type poolDBReloader struct { - pool *aibridged.CachedBridgePool - db database.Store - cfg codersdk.AIBridgeConfig - logger slog.Logger + pool *aibridged.CachedBridgePool + db database.Store + cfg codersdk.AIBridgeConfig + logger slog.Logger + metrics *aibridged.Metrics } func (r *poolDBReloader) Reload(ctx context.Context) error { - providers, err := BuildProviders(ctx, r.db, r.cfg, r.logger) + r.metrics.RecordReloadAttempt() + providers, outcomes, err := BuildProviders(ctx, r.db, r.cfg, r.logger) if err != nil { // Keep the previous snapshot in place: dropping all providers // because the DB read failed would compound the visible failure @@ -93,19 +97,15 @@ func (r *poolDBReloader) Reload(ctx context.Context) error { return xerrors.Errorf("load ai providers from database: %w", err) } r.pool.ReplaceProviders(providers) + r.metrics.RecordReloadSuccess(outcomes) return nil } -// BuildProviders loads every enabled ai_providers row, attaches its -// keys, and constructs the equivalent [aibridge.Provider] instances. -// The database is the single source of truth for runtime provider -// configuration. -// -// Per-provider construction errors are logged and the offending row is -// excluded from the returned snapshot; only a failure of the DB query -// itself is propagated. This keeps a single misconfigured row from -// taking the whole daemon down. -func BuildProviders(ctx context.Context, db database.Store, cfg codersdk.AIBridgeConfig, logger slog.Logger) ([]aibridge.Provider, error) { +// BuildProviders loads every ai_providers row (including disabled) +// and returns the active provider list plus per-row outcomes. Per-row +// build errors are logged and excluded from providers but recorded in +// outcomes; only DB query failures propagate. 
+func BuildProviders(ctx context.Context, db database.Store, cfg codersdk.AIBridgeConfig, logger slog.Logger) ([]aibridge.Provider, []aibridged.ProviderOutcome, error) { //nolint:gocritic // AsAIBridged has a minimal permission set for this purpose. authCtx := dbauthz.AsAIBridged(ctx) @@ -117,7 +117,7 @@ func BuildProviders(ctx context.Context, db database.Store, cfg codersdk.AIBridg err := db.InTx(func(tx database.Store) error { var err error rows, err = tx.GetAIProviders(authCtx, database.GetAIProvidersParams{ - IncludeDisabled: false, + IncludeDisabled: true, }) if err != nil { return xerrors.Errorf("load ai providers: %w", err) @@ -129,9 +129,15 @@ func BuildProviders(ctx context.Context, db database.Store, cfg codersdk.AIBridg // Load keys only for the enabled providers to avoid materializing // secrets for disabled rows. - ids := make([]uuid.UUID, len(rows)) - for i, r := range rows { - ids[i] = r.ID + ids := make([]uuid.UUID, 0, len(rows)) + for _, r := range rows { + if !r.Enabled { + continue + } + ids = append(ids, r.ID) + } + if len(ids) == 0 { + return nil } keyRows, err := tx.GetAIProviderKeysByProviderIDs(authCtx, ids) if err != nil { @@ -143,13 +149,28 @@ func BuildProviders(ctx context.Context, db database.Store, cfg codersdk.AIBridg return nil }, &database.TxOptions{ReadOnly: true, TxIdentifier: "build_ai_providers"}) if err != nil { - return nil, err + return nil, nil, err } - out := make([]aibridge.Provider, 0, len(rows)) + providers := make([]aibridge.Provider, 0, len(rows)) + outcomes := make([]aibridged.ProviderOutcome, 0, len(rows)) + enabledCount := 0 for _, row := range rows { + outcome := aibridged.ProviderOutcome{ + Name: row.Name, + Type: string(row.Type), + } + if !row.Enabled { + outcome.Status = aibridged.ProviderStatusDisabled + outcomes = append(outcomes, outcome) + continue + } + enabledCount++ prov, err := buildAIProviderFromRow(row, keysByProvider[row.ID], cfg) if err != nil { + outcome.Status = aibridged.ProviderStatusError + 
outcome.Err = err + outcomes = append(outcomes, outcome) logger.Error(ctx, "skipping misconfigured ai provider", slog.F("provider_id", row.ID), slog.F("provider_name", row.Name), @@ -158,14 +179,16 @@ func BuildProviders(ctx context.Context, db database.Store, cfg codersdk.AIBridg ) continue } - out = append(out, prov) + outcome.Status = aibridged.ProviderStatusEnabled + outcomes = append(outcomes, outcome) + providers = append(providers, prov) } - if len(rows) > 0 && len(out) == 0 { + if enabledCount > 0 && len(providers) == 0 { logger.Warn(ctx, "all enabled ai providers failed to build; daemon will start with zero providers") } - return out, nil + return providers, outcomes, nil } // buildAIProviderFromRow decodes the settings blob and constructs the diff --git a/cli/aibridged_internal_test.go b/cli/aibridged_internal_test.go index 1f9f512678..0226974520 100644 --- a/cli/aibridged_internal_test.go +++ b/cli/aibridged_internal_test.go @@ -13,6 +13,7 @@ import ( "github.com/coder/coder/v2/aibridge" "github.com/coder/coder/v2/coderd" agplaibridge "github.com/coder/coder/v2/coderd/aibridge" + "github.com/coder/coder/v2/coderd/aibridged" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" @@ -35,7 +36,8 @@ func buildFromEnv(t *testing.T, cfg codersdk.AIBridgeConfig) ([]aibridge.Provide if err := coderd.SeedAIProvidersFromEnv(ctx, db, cfg, logger); err != nil { return nil, err } - return BuildProviders(ctx, db, cfg, logger) + providers, _, err := BuildProviders(ctx, db, cfg, logger) + return providers, err } func TestBuildProviders(t *testing.T) { @@ -323,28 +325,35 @@ func TestBuildProvidersSkipsBadRows(t *testing.T) { Settings: sql.NullString{String: "not-json", Valid: true}, }) - providers, err := BuildProviders(ctx, db, codersdk.AIBridgeConfig{}, logger) + providers, outcomes, err := BuildProviders(ctx, db, codersdk.AIBridgeConfig{}, logger) require.NoError(t, err) 
assert.Empty(t, providers) + require.Len(t, outcomes, 1) + assert.Equal(t, "anthropic-broken", outcomes[0].Name) + assert.Equal(t, aibridged.ProviderStatusError, outcomes[0].Status) + assert.Error(t, outcomes[0].Err) }) - t.Run("UnsupportedType", func(t *testing.T) { + t.Run("EnabledButNoKeys", func(t *testing.T) { t.Parallel() db, _ := dbtestutil.NewDB(t) ctx := testutil.Context(t, testutil.WaitShort) logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) - // Azure is a valid DB-level provider type but has no runtime - // builder yet; it must hit the default branch and be skipped. + // Azure routes through the OpenAI-family builder, which rejects + // rows without keys when BYOK is disabled. The row must be + // classified as error and excluded from the snapshot. dbgen.AIProvider(t, db, database.AIProvider{ Type: database.AiProviderTypeAzure, Name: "azure-openai", BaseUrl: "https://example.openai.azure.com/", }) - providers, err := BuildProviders(ctx, db, codersdk.AIBridgeConfig{}, logger) + providers, outcomes, err := BuildProviders(ctx, db, codersdk.AIBridgeConfig{}, logger) require.NoError(t, err) assert.Empty(t, providers) + require.Len(t, outcomes, 1) + assert.Equal(t, aibridged.ProviderStatusError, outcomes[0].Status) }) t.Run("BadRowDoesNotBlockGoodRow", func(t *testing.T) { @@ -369,10 +378,40 @@ func TestBuildProvidersSkipsBadRows(t *testing.T) { APIKey: "sk-good", }) - providers, err := BuildProviders(ctx, db, codersdk.AIBridgeConfig{}, logger) + providers, outcomes, err := BuildProviders(ctx, db, codersdk.AIBridgeConfig{}, logger) require.NoError(t, err) require.Len(t, providers, 1) assert.Equal(t, "openai-good", providers[0].Name()) + require.Len(t, outcomes, 2) + byName := map[string]aibridged.ProviderOutcome{} + for _, o := range outcomes { + byName[o.Name] = o + } + assert.Equal(t, aibridged.ProviderStatusError, byName["anthropic-broken"].Status) + assert.Equal(t, aibridged.ProviderStatusEnabled, byName["openai-good"].Status) + }) + + 
t.Run("DisabledRowClassifiedAsDisabled", func(t *testing.T) { + t.Parallel() + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + logger := slogtest.Make(t, nil) + + dbgen.AIProvider(t, db, database.AIProvider{ + Type: database.AiProviderTypeOpenai, + Name: "openai-off", + BaseUrl: "https://api.openai.com/", + }, func(p *database.InsertAIProviderParams) { + p.Enabled = false + }) + + providers, outcomes, err := BuildProviders(ctx, db, codersdk.AIBridgeConfig{}, logger) + require.NoError(t, err) + assert.Empty(t, providers, "disabled providers must not be in the active snapshot") + require.Len(t, outcomes, 1) + assert.Equal(t, "openai-off", outcomes[0].Name) + assert.Equal(t, aibridged.ProviderStatusDisabled, outcomes[0].Status) + assert.NoError(t, outcomes[0].Err) }) } diff --git a/cli/server.go b/cli/server.go index 0a15645e50..b2fa89fd3b 100644 --- a/cli/server.go +++ b/cli/server.go @@ -1041,7 +1041,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. // unconditionally when the bridge feature is enabled by config so // chatd can use it regardless of license entitlement. if vals.AI.BridgeConfig.Enabled.Value() { - aibridgeProviders, err := BuildProviders(aibridgeInitCtx, options.Database, vals.AI.BridgeConfig, logger.Named("aibridge.providers")) + aibridgeProviders, _, err := BuildProviders(aibridgeInitCtx, options.Database, vals.AI.BridgeConfig, logger.Named("aibridge.providers")) if err != nil { return xerrors.Errorf("build AI providers: %w", err) } diff --git a/coderd/aibridged/metrics.go b/coderd/aibridged/metrics.go new file mode 100644 index 0000000000..b06a9c067c --- /dev/null +++ b/coderd/aibridged/metrics.go @@ -0,0 +1,94 @@ +package aibridged + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +// Metrics is the prometheus surface for aibridged provider reloads. 
+type Metrics struct { + registerer prometheus.Registerer + + // ProviderInfo is one series per configured provider; value is + // always 1 and the status label carries the alertable signal. + // Labels: provider_name, provider_type, status. + ProviderInfo *prometheus.GaugeVec + + // ProvidersLastReloadTimestampSeconds is the unix timestamp of the + // last reload attempt, success or failure. + ProvidersLastReloadTimestampSeconds prometheus.Gauge + + // ProvidersLastReloadSuccessTimestampSeconds is the unix timestamp + // of the last reload that successfully refreshed the pool. A gap + // against ProvidersLastReloadTimestampSeconds means the loop is + // firing but the refresh function is failing. + ProvidersLastReloadSuccessTimestampSeconds prometheus.Gauge +} + +// NewMetrics registers the provider metrics against reg. +func NewMetrics(reg prometheus.Registerer) *Metrics { + factory := promauto.With(reg) + + return &Metrics{ + registerer: reg, + + ProviderInfo: factory.NewGaugeVec(prometheus.GaugeOpts{ + Name: "provider_info", + Help: "One series per configured AI provider. Value is always 1; the status label (enabled, disabled, error) carries the alertable signal.", + }, []string{"provider_name", "provider_type", "status"}), + + ProvidersLastReloadTimestampSeconds: factory.NewGauge(prometheus.GaugeOpts{ + Name: "providers_last_reload_timestamp_seconds", + Help: "Unix timestamp of the last provider reload attempt, success or failure.", + }), + + ProvidersLastReloadSuccessTimestampSeconds: factory.NewGauge(prometheus.GaugeOpts{ + Name: "providers_last_reload_success_timestamp_seconds", + Help: "Unix timestamp of the last provider reload that successfully refreshed the pool. A gap against coder_aibridged_providers_last_reload_timestamp_seconds means the loop is firing but the refresh function is failing.", + }), + } +} + +// Unregister removes the provider metrics from the registerer. 
+func (m *Metrics) Unregister() { + if m == nil { + return + } + m.registerer.Unregister(m.ProviderInfo) + m.registerer.Unregister(m.ProvidersLastReloadTimestampSeconds) + m.registerer.Unregister(m.ProvidersLastReloadSuccessTimestampSeconds) +} + +// RecordReloadAttempt stamps the attempt-time gauge at the start of a +// reload. A reload that hangs mid-flight is detected by watching the +// gap between this gauge and ProvidersLastReloadSuccessTimestampSeconds. +func (m *Metrics) RecordReloadAttempt() { + if m == nil { + return + } + m.ProvidersLastReloadTimestampSeconds.Set(float64(time.Now().Unix())) +} + +// RecordReloadSuccess rewrites the ProviderInfo GaugeVec from the +// outcomes and stamps the success-time gauge. Reset clears series for +// providers that have left the configuration so they don't linger as +// stale. +func (m *Metrics) RecordReloadSuccess(outcomes []ProviderOutcome) { + if m == nil { + return + } + WriteProviderInfoSnapshot(m.ProviderInfo, outcomes) + m.ProvidersLastReloadSuccessTimestampSeconds.Set(float64(time.Now().Unix())) +} + +// WriteProviderInfoSnapshot Resets info and writes one series per +// outcome. Both aibridged and aibridgeproxyd use this so the +// provider_info recording contract stays in one place. 
+func WriteProviderInfoSnapshot(info *prometheus.GaugeVec, outcomes []ProviderOutcome) { + info.Reset() + for _, o := range outcomes { + info.WithLabelValues(o.Name, o.Type, string(o.Status)).Set(1) + } +} diff --git a/coderd/aibridged/metrics_test.go b/coderd/aibridged/metrics_test.go new file mode 100644 index 0000000000..008c79dd34 --- /dev/null +++ b/coderd/aibridged/metrics_test.go @@ -0,0 +1,84 @@ +package aibridged_test + +import ( + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + promtest "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/xerrors" + + "github.com/coder/coder/v2/coderd/aibridged" +) + +// TestMetricsRecordReloadSuccess covers the provider_info GaugeVec +// surface: every reload pass rewrites the series for the current +// outcomes and the Reset on each pass drops stale series. +func TestMetricsRecordReloadSuccess(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + m := aibridged.NewMetrics(reg) + + outcomes := []aibridged.ProviderOutcome{ + {Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled}, + {Name: "beta", Type: "anthropic", Status: aibridged.ProviderStatusDisabled}, + {Name: "gamma", Type: "openai", Status: aibridged.ProviderStatusError, Err: xerrors.New("bad config")}, + } + + before := time.Now().Unix() + m.RecordReloadAttempt() + m.RecordReloadSuccess(outcomes) + after := time.Now().Unix() + + assert.Equal(t, 1.0, promtest.ToFloat64(m.ProviderInfo.WithLabelValues("alpha", "openai", "enabled"))) + assert.Equal(t, 1.0, promtest.ToFloat64(m.ProviderInfo.WithLabelValues("beta", "anthropic", "disabled"))) + assert.Equal(t, 1.0, promtest.ToFloat64(m.ProviderInfo.WithLabelValues("gamma", "openai", "error"))) + + attemptTS := int64(promtest.ToFloat64(m.ProvidersLastReloadTimestampSeconds)) + successTS := int64(promtest.ToFloat64(m.ProvidersLastReloadSuccessTimestampSeconds)) + 
assert.GreaterOrEqual(t, attemptTS, before) + assert.LessOrEqual(t, attemptTS, after) + assert.GreaterOrEqual(t, successTS, before) + assert.LessOrEqual(t, successTS, after) +} + +// TestMetricsResetsStaleProviderSeries verifies that providers removed +// from the outcome set between reloads do not leave behind stale +// series. +func TestMetricsResetsStaleProviderSeries(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + m := aibridged.NewMetrics(reg) + + m.RecordReloadSuccess([]aibridged.ProviderOutcome{ + {Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled}, + {Name: "beta", Type: "anthropic", Status: aibridged.ProviderStatusEnabled}, + }) + require.Equal(t, 2, promtest.CollectAndCount(m.ProviderInfo)) + + m.RecordReloadSuccess([]aibridged.ProviderOutcome{ + {Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled}, + }) + + assert.Equal(t, 1, promtest.CollectAndCount(m.ProviderInfo), + "beta should have been Reset out of the GaugeVec") + assert.Equal(t, 1.0, promtest.ToFloat64(m.ProviderInfo.WithLabelValues("alpha", "openai", "enabled"))) +} + +// TestMetricsNilSafe asserts the helpers tolerate a nil receiver so +// callers can pass `nil` to disable metric updates without guarding +// every call site. +func TestMetricsNilSafe(t *testing.T) { + t.Parallel() + + var m *aibridged.Metrics + require.NotPanics(t, func() { + m.RecordReloadAttempt() + m.RecordReloadSuccess(nil) + m.Unregister() + }) +} diff --git a/coderd/aibridged/provider.go b/coderd/aibridged/provider.go new file mode 100644 index 0000000000..6fb53e1a93 --- /dev/null +++ b/coderd/aibridged/provider.go @@ -0,0 +1,28 @@ +package aibridged + +// ProviderStatus is the lifecycle state of a configured AI provider. +type ProviderStatus string + +const ( + // ProviderStatusEnabled indicates the provider is configured and + // valid, and is included in the active pool snapshot. 
+ ProviderStatusEnabled ProviderStatus = "enabled" + // ProviderStatusDisabled indicates the provider is configured but + // intentionally turned off by an operator. + ProviderStatusDisabled ProviderStatus = "disabled" + // ProviderStatusError indicates the provider is configured but + // cannot be constructed (missing keys, unsupported type, malformed + // settings). + ProviderStatusError ProviderStatus = "error" +) + +// ProviderOutcome classifies one ai_providers row, including disabled +// and errored rows the pool excludes. Err is populated only when +// Status == ProviderStatusError; the build error is already logged at +// the call site. +type ProviderOutcome struct { + Name string + Type string + Status ProviderStatus + Err error +} diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index 210f22d040..acaf3e0641 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -120,11 +120,17 @@ deployment. They will always be available from the agent. | `coder_aibridged_non_injected_tool_selections_total` | counter | The number of times an AI model selected a tool to be invoked by the client. | `model` `name` `provider` | | `coder_aibridged_passthrough_total` | counter | The count of requests which were not intercepted but passed through to the upstream. | `method` `provider` `route` | | `coder_aibridged_prompts_total` | counter | The number of prompts issued by users (initiators). | `initiator_id` `model` `provider` | +| `coder_aibridged_provider_info` | gauge | One series per configured AI provider. Value is always 1; the status label (enabled, disabled, error) carries the alertable signal. | `provider_name` `provider_type` `status` | +| `coder_aibridged_providers_last_reload_success_timestamp_seconds` | gauge | Unix timestamp of the last provider reload that successfully refreshed the pool. 
A gap against coder_aibridged_providers_last_reload_timestamp_seconds means the loop is firing but the refresh function is failing. | | +| `coder_aibridged_providers_last_reload_timestamp_seconds` | gauge | Unix timestamp of the last provider reload attempt, success or failure. | | | `coder_aibridged_tokens_total` | counter | The number of tokens used by intercepted requests. | `initiator_id` `model` `provider` `type` | | `coder_aibridgeproxyd_connect_sessions_total` | counter | Total number of CONNECT sessions established. | `type` | | `coder_aibridgeproxyd_inflight_mitm_requests` | gauge | Number of MITM requests currently being processed. | `provider` | | `coder_aibridgeproxyd_mitm_requests_total` | counter | Total number of MITM requests handled by the proxy. | `provider` | | `coder_aibridgeproxyd_mitm_responses_total` | counter | Total number of MITM responses by HTTP status code class. | `code` `provider` | +| `coder_aibridgeproxyd_provider_info` | gauge | One series per configured AI provider. Value is always 1; the status label (enabled, disabled, error) carries the alertable signal. | `provider_name` `provider_type` `status` | +| `coder_aibridgeproxyd_providers_last_reload_success_timestamp_seconds` | gauge | Unix timestamp of the last provider reload that successfully refreshed the router. A gap against coder_aibridgeproxyd_providers_last_reload_timestamp_seconds means the loop is firing but the refresh function is failing. | | +| `coder_aibridgeproxyd_providers_last_reload_timestamp_seconds` | gauge | Unix timestamp of the last provider reload attempt, success or failure. | | | `coder_derp_server_accepts_total` | counter | Total DERP connections accepted. | | | `coder_derp_server_average_queue_duration_ms` | gauge | Average queue duration in milliseconds. | | | `coder_derp_server_bytes_received_total` | counter | Total bytes received. 
| | diff --git a/enterprise/aibridgeproxyd/aibridgeproxyd_test.go b/enterprise/aibridgeproxyd/aibridgeproxyd_test.go index fbf77956a2..50224aa98c 100644 --- a/enterprise/aibridgeproxyd/aibridgeproxyd_test.go +++ b/enterprise/aibridgeproxyd/aibridgeproxyd_test.go @@ -35,6 +35,7 @@ import ( "cdr.dev/slog/v3/sloggers/slogtest" "github.com/coder/coder/v2/aibridge" agplaibridge "github.com/coder/coder/v2/coderd/aibridge" + "github.com/coder/coder/v2/coderd/aibridged" "github.com/coder/coder/v2/enterprise/aibridgeproxyd" "github.com/coder/coder/v2/testutil" ) @@ -209,10 +210,12 @@ func withProviderHosts(hosts ...string) testProxyOption { host = h } providers = append(providers, aibridgeproxyd.ReloadedProvider{ - Name: name, - Type: "openai", - Host: strings.ToLower(host), - Status: aibridgeproxyd.ProviderStatusEnabled, + ProviderOutcome: aibridged.ProviderOutcome{ + Name: name, + Type: "openai", + Status: aibridged.ProviderStatusEnabled, + }, + Host: strings.ToLower(host), }) } cfg.providers = providers @@ -296,8 +299,8 @@ func newTestProxy(t *testing.T, opts ...testProxyOption) *aibridgeproxyd.Server // loopback, are reachable. Tests that verify IP blocking override this. 
allowedPrivateCIDRs: []string{"127.0.0.1/32"}, providers: []aibridgeproxyd.ReloadedProvider{ - {Name: "test-provider", Type: "openai", Host: "127.0.0.1", Status: aibridgeproxyd.ProviderStatusEnabled}, - {Name: "test-provider", Type: "openai", Host: "localhost", Status: aibridgeproxyd.ProviderStatusEnabled}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "test-provider", Type: "openai", Status: aibridged.ProviderStatusEnabled}, Host: "127.0.0.1"}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "test-provider", Type: "openai", Status: aibridged.ProviderStatusEnabled}, Host: "localhost"}, }, } for _, opt := range opts { @@ -2077,10 +2080,12 @@ func TestProxy_MITM_CustomProvider(t *testing.T) { srv := newTestProxy(t, withCoderAccessURL(aibridgedServer.URL), withProviders(aibridgeproxyd.ReloadedProvider{ - Name: openrouterProvider, - Type: "openai", - Host: openrouterDomain, - Status: aibridgeproxyd.ProviderStatusEnabled, + ProviderOutcome: aibridged.ProviderOutcome{ + Name: openrouterProvider, + Type: "openai", + Status: aibridged.ProviderStatusEnabled, + }, + Host: openrouterDomain, }), ) diff --git a/enterprise/aibridgeproxyd/metrics.go b/enterprise/aibridgeproxyd/metrics.go index 55a1fa4177..ccfd334aa7 100644 --- a/enterprise/aibridgeproxyd/metrics.go +++ b/enterprise/aibridgeproxyd/metrics.go @@ -30,6 +30,21 @@ type Metrics struct { // Labels: code (HTTP status code), provider // Cardinality is bounded: ~100 used status codes x few providers. MITMResponsesTotal *prometheus.CounterVec + + // ProviderInfo is one series per configured provider; value is + // always 1 and the status label carries the alertable signal. + // Labels: provider_name, provider_type, status. + ProviderInfo *prometheus.GaugeVec + + // ProvidersLastReloadTimestampSeconds is the unix timestamp of the + // last reload attempt, success or failure. 
+ ProvidersLastReloadTimestampSeconds prometheus.Gauge + + // ProvidersLastReloadSuccessTimestampSeconds is the unix timestamp + // of the last reload that successfully refreshed the router. A gap + // against ProvidersLastReloadTimestampSeconds means the loop is + // firing but the refresh function is failing. + ProvidersLastReloadSuccessTimestampSeconds prometheus.Gauge } // NewMetrics creates and registers all metrics for aibridgeproxyd. @@ -58,6 +73,21 @@ func NewMetrics(reg prometheus.Registerer) *Metrics { Name: "mitm_responses_total", Help: "Total number of MITM responses by HTTP status code class.", }, []string{"code", "provider"}), + + ProviderInfo: factory.NewGaugeVec(prometheus.GaugeOpts{ + Name: "provider_info", + Help: "One series per configured AI provider. Value is always 1; the status label (enabled, disabled, error) carries the alertable signal.", + }, []string{"provider_name", "provider_type", "status"}), + + ProvidersLastReloadTimestampSeconds: factory.NewGauge(prometheus.GaugeOpts{ + Name: "providers_last_reload_timestamp_seconds", + Help: "Unix timestamp of the last provider reload attempt, success or failure.", + }), + + ProvidersLastReloadSuccessTimestampSeconds: factory.NewGauge(prometheus.GaugeOpts{ + Name: "providers_last_reload_success_timestamp_seconds", + Help: "Unix timestamp of the last provider reload that successfully refreshed the router. 
A gap against coder_aibridgeproxyd_providers_last_reload_timestamp_seconds means the loop is firing but the refresh function is failing.", + }), } } @@ -67,4 +97,7 @@ func (m *Metrics) Unregister() { m.registerer.Unregister(m.MITMRequestsTotal) m.registerer.Unregister(m.InflightMITMRequests) m.registerer.Unregister(m.MITMResponsesTotal) + m.registerer.Unregister(m.ProviderInfo) + m.registerer.Unregister(m.ProvidersLastReloadTimestampSeconds) + m.registerer.Unregister(m.ProvidersLastReloadSuccessTimestampSeconds) } diff --git a/enterprise/aibridgeproxyd/metrics_internal_test.go b/enterprise/aibridgeproxyd/metrics_internal_test.go new file mode 100644 index 0000000000..6ebefbd56b --- /dev/null +++ b/enterprise/aibridgeproxyd/metrics_internal_test.go @@ -0,0 +1,135 @@ +package aibridgeproxyd + +import ( + "context" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + promtest "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/xerrors" + + "cdr.dev/slog/v3/sloggers/slogtest" + "github.com/coder/coder/v2/coderd/aibridged" + "github.com/coder/coder/v2/testutil" +) + +// TestReloadUpdatesProviderMetrics covers the provider_info GaugeVec +// surface: every reload pass rewrites the series for the current +// snapshot, including disabled and errored rows; the Reset on each +// reload drops series for providers that have left the configuration. 
+func TestReloadUpdatesProviderMetrics(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + metrics := NewMetrics(reg) + + reload := ProviderReload{Providers: []ReloadedProvider{ + {ProviderOutcome: aibridged.ProviderOutcome{Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled}, Host: "alpha.example.com"}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "beta", Type: "anthropic", Status: aibridged.ProviderStatusDisabled}}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "gamma", Type: "openai", Status: aibridged.ProviderStatusError, Err: xerrors.New("bad config")}}, + }} + + ctx := testutil.Context(t, testutil.WaitShort) + srv := &Server{ + ctx: ctx, + logger: slogtest.Make(t, nil), + allowedPorts: []string{"443"}, + metrics: metrics, + refreshProviders: func(context.Context) (ProviderReload, error) { + return reload, nil + }, + } + srv.providerRouter.Store(emptyProviderRouter) + + before := time.Now().Unix() + require.NoError(t, srv.Reload(ctx)) + after := time.Now().Unix() + + assert.Equal(t, 1.0, promtest.ToFloat64(metrics.ProviderInfo.WithLabelValues("alpha", "openai", "enabled"))) + assert.Equal(t, 1.0, promtest.ToFloat64(metrics.ProviderInfo.WithLabelValues("beta", "anthropic", "disabled"))) + assert.Equal(t, 1.0, promtest.ToFloat64(metrics.ProviderInfo.WithLabelValues("gamma", "openai", "error"))) + + attemptTS := int64(promtest.ToFloat64(metrics.ProvidersLastReloadTimestampSeconds)) + successTS := int64(promtest.ToFloat64(metrics.ProvidersLastReloadSuccessTimestampSeconds)) + assert.GreaterOrEqual(t, attemptTS, before) + assert.LessOrEqual(t, attemptTS, after) + assert.GreaterOrEqual(t, successTS, before) + assert.LessOrEqual(t, successTS, after) +} + +// TestReloadResetsStaleProviderSeries verifies that providers removed +// between reloads do not leave behind stale series. 
Without Reset, a +// removed provider's last-seen value would persist for 5+ minutes and +// could fire alerts despite the provider no longer being configured. +func TestReloadResetsStaleProviderSeries(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + metrics := NewMetrics(reg) + + current := ProviderReload{Providers: []ReloadedProvider{ + {ProviderOutcome: aibridged.ProviderOutcome{Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled}, Host: "alpha.example.com"}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "beta", Type: "anthropic", Status: aibridged.ProviderStatusEnabled}, Host: "beta.example.com"}, + }} + + ctx := testutil.Context(t, testutil.WaitShort) + srv := &Server{ + ctx: ctx, + logger: slogtest.Make(t, nil), + allowedPorts: []string{"443"}, + metrics: metrics, + refreshProviders: func(context.Context) (ProviderReload, error) { + return current, nil + }, + } + srv.providerRouter.Store(emptyProviderRouter) + + require.NoError(t, srv.Reload(ctx)) + require.Equal(t, 2, promtest.CollectAndCount(metrics.ProviderInfo)) + + current = ProviderReload{Providers: []ReloadedProvider{ + {ProviderOutcome: aibridged.ProviderOutcome{Name: "alpha", Type: "openai", Status: aibridged.ProviderStatusEnabled}, Host: "alpha.example.com"}, + }} + require.NoError(t, srv.Reload(ctx)) + + assert.Equal(t, 1, promtest.CollectAndCount(metrics.ProviderInfo), + "beta should have been Reset out of the GaugeVec") + assert.Equal(t, 1.0, promtest.ToFloat64(metrics.ProviderInfo.WithLabelValues("alpha", "openai", "enabled"))) +} + +// TestReloadAttemptTimestampUpdatesOnFailure asserts the attempt-time +// gauge advances even when the refresh function fails, while the +// success-time gauge does not. 
+func TestReloadAttemptTimestampUpdatesOnFailure(t *testing.T) { + t.Parallel() + + reg := prometheus.NewRegistry() + metrics := NewMetrics(reg) + refreshErr := xerrors.New("simulated failure") + + ctx := testutil.Context(t, testutil.WaitShort) + srv := &Server{ + ctx: ctx, + logger: slogtest.Make(t, nil), + allowedPorts: []string{"443"}, + metrics: metrics, + refreshProviders: func(context.Context) (ProviderReload, error) { + return ProviderReload{}, refreshErr + }, + } + srv.providerRouter.Store(emptyProviderRouter) + + before := time.Now().Unix() + err := srv.Reload(ctx) + require.ErrorIs(t, err, refreshErr) + after := time.Now().Unix() + + attemptTS := int64(promtest.ToFloat64(metrics.ProvidersLastReloadTimestampSeconds)) + successTS := int64(promtest.ToFloat64(metrics.ProvidersLastReloadSuccessTimestampSeconds)) + assert.GreaterOrEqual(t, attemptTS, before) + assert.LessOrEqual(t, attemptTS, after) + assert.Equal(t, int64(0), successTS, "success timestamp must not advance on failure") +} diff --git a/enterprise/aibridgeproxyd/reload.go b/enterprise/aibridgeproxyd/reload.go index 686bab1130..04b1f5438b 100644 --- a/enterprise/aibridgeproxyd/reload.go +++ b/enterprise/aibridgeproxyd/reload.go @@ -5,40 +5,21 @@ import ( "net/http" "slices" "strings" + "time" "github.com/elazarl/goproxy" "golang.org/x/xerrors" "cdr.dev/slog/v3" + "github.com/coder/coder/v2/coderd/aibridged" ) -// ProviderStatus describes the lifecycle state of a configured AI -// provider for observability and routing purposes. -type ProviderStatus string - -const ( - // ProviderStatusEnabled means the provider is configured, valid, and - // included in the active routing snapshot. - ProviderStatusEnabled ProviderStatus = "enabled" - // ProviderStatusDisabled means the provider exists in configuration - // but is intentionally turned off by an operator. 
- ProviderStatusDisabled ProviderStatus = "disabled" - // ProviderStatusError means the provider exists in configuration but - // cannot be routed to because of a validation failure (missing or - // invalid base URL, duplicate host, etc.). - ProviderStatusError ProviderStatus = "error" -) - -// ReloadedProvider is one row from the provider configuration together -// with the outcome of evaluating it for routing. Host is populated only -// when Status == ProviderStatusEnabled; Err is populated only when -// Status == ProviderStatusError. +// ReloadedProvider is the classification of one ai_providers row. +// Host is the routable hostname; it's populated only when the embedded +// outcome's Status == aibridged.ProviderStatusEnabled. type ReloadedProvider struct { - Name string - Type string - Host string - Status ProviderStatus - Err error + aibridged.ProviderOutcome + Host string } // ProviderReload is the result of a single refresh pass: every @@ -47,8 +28,8 @@ type ProviderReload struct { Providers []ReloadedProvider } -// RefreshProvidersFunc returns the live provider classification used by -// Reload to rebuild the proxy's routing snapshot. +// RefreshProvidersFunc returns the live provider classification used +// by Reload to rebuild the proxy's routing snapshot. type RefreshProvidersFunc func(ctx context.Context) (ProviderReload, error) // Reload refreshes proxy routing from the configured provider source. 
@@ -57,6 +38,7 @@ func (s *Server) Reload(ctx context.Context) error { if s.refreshProviders == nil { return nil } + s.recordReloadAttempt() reload, err := s.refreshProviders(ctx) if err != nil { return xerrors.Errorf("refresh ai providers for proxy routing: %w", err) @@ -67,13 +49,14 @@ func (s *Server) Reload(ctx context.Context) error { } s.providerRouter.Store(router) for _, p := range reload.Providers { - if p.Status == ProviderStatusError { + if p.Status == aibridged.ProviderStatusError { s.logger.Warn(s.ctx, "provider excluded from routing", slog.F("provider", p.Name), slog.Error(p.Err), ) } } + s.recordReloadSuccess(reload) s.logger.Debug(s.ctx, "aibridgeproxyd router reloaded", slog.F("provider_count", len(reload.Providers)), slog.F("mitm_host_count", len(router.mitmHosts)), @@ -82,6 +65,32 @@ func (s *Server) Reload(ctx context.Context) error { return nil } +// recordReloadAttempt stamps the attempt-time gauge at the start of a +// Reload. A reload that hangs mid-flight is detected by watching the +// gap between this gauge and ProvidersLastReloadSuccessTimestampSeconds. +func (s *Server) recordReloadAttempt() { + if s.metrics == nil { + return + } + s.metrics.ProvidersLastReloadTimestampSeconds.Set(float64(time.Now().Unix())) +} + +// recordReloadSuccess rewrites the provider_info GaugeVec from the +// classified reload and stamps the success-time gauge. Reset clears +// series for providers that have left the configuration so they don't +// linger as stale. 
+func (s *Server) recordReloadSuccess(reload ProviderReload) { + if s.metrics == nil { + return + } + outcomes := make([]aibridged.ProviderOutcome, len(reload.Providers)) + for i, p := range reload.Providers { + outcomes[i] = p.ProviderOutcome + } + aibridged.WriteProviderInfoSnapshot(s.metrics.ProviderInfo, outcomes) + s.metrics.ProvidersLastReloadSuccessTimestampSeconds.Set(float64(time.Now().Unix())) +} + func (s *Server) loadProviderRouter() *providerRouter { if p := s.providerRouter.Load(); p != nil { return p @@ -103,16 +112,17 @@ func (s *Server) mitmHostsCondition() goproxy.ReqConditionFunc { } // buildProviderRouter constructs a router snapshot from a classified -// provider reload. Only providers with Status == ProviderStatusEnabled -// are included in the active routing tables; the refresh function is -// responsible for classifying disabled and errored rows. First entry -// wins on duplicate hostnames as a defense-in-depth measure even though -// the refresh function should mark duplicates as errors. +// provider reload. Only providers with Status == +// aibridged.ProviderStatusEnabled are included in the active routing +// tables; the refresh function is responsible for classifying disabled +// and errored rows. First entry wins on duplicate hostnames as a +// defense-in-depth measure even though the refresh function should +// mark duplicates as errors. 
func buildProviderRouter(reload ProviderReload, allowedPorts []string) (*providerRouter, error) { nameByHost := make(map[string]string, len(reload.Providers)) domains := make([]string, 0, len(reload.Providers)) for _, p := range reload.Providers { - if p.Status != ProviderStatusEnabled { + if p.Status != aibridged.ProviderStatusEnabled { continue } host := strings.ToLower(p.Host) diff --git a/enterprise/aibridgeproxyd/reload_internal_test.go b/enterprise/aibridgeproxyd/reload_internal_test.go index fb985445f3..5ccba37ec7 100644 --- a/enterprise/aibridgeproxyd/reload_internal_test.go +++ b/enterprise/aibridgeproxyd/reload_internal_test.go @@ -9,15 +9,18 @@ import ( "golang.org/x/xerrors" "cdr.dev/slog/v3/sloggers/slogtest" + "github.com/coder/coder/v2/coderd/aibridged" "github.com/coder/coder/v2/testutil" ) func enabledProvider(name, host string) ReloadedProvider { return ReloadedProvider{ - Name: name, - Type: "openai", - Host: host, - Status: ProviderStatusEnabled, + ProviderOutcome: aibridged.ProviderOutcome{ + Name: name, + Type: "openai", + Status: aibridged.ProviderStatusEnabled, + }, + Host: host, } } @@ -96,8 +99,8 @@ func TestBuildProviderRouter(t *testing.T) { enabledProvider("custom", "custom-llm.example.com"), // Host is populated on the non-enabled rows so the Status // guard, not the empty-host guard, is what excludes them. 
- {Name: "off", Type: "openai", Host: "disabled.example.com", Status: ProviderStatusDisabled}, - {Name: "bad", Type: "openai", Host: "errored.example.com", Status: ProviderStatusError, Err: xerrors.New("nope")}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "off", Type: "openai", Status: aibridged.ProviderStatusDisabled}, Host: "disabled.example.com"}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "bad", Type: "openai", Status: aibridged.ProviderStatusError, Err: xerrors.New("nope")}, Host: "errored.example.com"}, }} router, err := buildProviderRouter(reload, []string{"443"}) @@ -121,7 +124,7 @@ func TestBuildProviderRouter(t *testing.T) { t.Parallel() reload := ProviderReload{Providers: []ReloadedProvider{ - {Name: "provider", Type: "openai", Host: "API.Example.COM", Status: ProviderStatusEnabled}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "provider", Type: "openai", Status: aibridged.ProviderStatusEnabled}, Host: "API.Example.COM"}, }} router, err := buildProviderRouter(reload, []string{"443"}) @@ -152,7 +155,7 @@ func TestBuildProviderRouter(t *testing.T) { t.Parallel() reload := ProviderReload{Providers: []ReloadedProvider{ - {Name: "no-host", Type: "openai", Status: ProviderStatusEnabled}, + {ProviderOutcome: aibridged.ProviderOutcome{Name: "no-host", Type: "openai", Status: aibridged.ProviderStatusEnabled}}, enabledProvider("good", "api.good.example.com"), }} diff --git a/enterprise/aibridgeproxyd/reload_test.go b/enterprise/aibridgeproxyd/reload_test.go index e55d45a372..bfc90338d4 100644 --- a/enterprise/aibridgeproxyd/reload_test.go +++ b/enterprise/aibridgeproxyd/reload_test.go @@ -11,10 +11,13 @@ import ( "sync" "testing" + "github.com/prometheus/client_golang/prometheus" + promtest "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/xerrors" + "github.com/coder/coder/v2/coderd/aibridged" 
"github.com/coder/coder/v2/enterprise/aibridgeproxyd" "github.com/coder/coder/v2/testutil" ) @@ -28,6 +31,7 @@ type reloadTestHarness struct { client *http.Client bridged *httptest.Server recorder *aibridgedRecorder + metrics *aibridgeproxyd.Metrics } // aibridgedRecorder captures the path of the last request received by @@ -106,32 +110,34 @@ func (s *providerStore) refresh(context.Context) (aibridgeproxyd.ProviderReload, // classifyRaw mirrors the production classifier in enterprise/cli so // the reload tests exercise the same validation rules end-to-end. func classifyRaw(p rawProvider, seenHost map[string]string) aibridgeproxyd.ReloadedProvider { - out := aibridgeproxyd.ReloadedProvider{Name: p.name, Type: "openai"} + out := aibridgeproxyd.ReloadedProvider{ + ProviderOutcome: aibridged.ProviderOutcome{Name: p.name, Type: "openai"}, + } if strings.TrimSpace(p.baseURL) == "" { - out.Status = aibridgeproxyd.ProviderStatusError + out.Status = aibridged.ProviderStatusError out.Err = xerrors.New("base url is empty") return out } u, err := url.Parse(p.baseURL) if err != nil { - out.Status = aibridgeproxyd.ProviderStatusError + out.Status = aibridged.ProviderStatusError out.Err = xerrors.Errorf("invalid base url %q: %w", p.baseURL, err) return out } host := strings.ToLower(u.Hostname()) if host == "" { - out.Status = aibridgeproxyd.ProviderStatusError + out.Status = aibridged.ProviderStatusError out.Err = xerrors.Errorf("base url %q has no hostname", p.baseURL) return out } if claimedBy, taken := seenHost[host]; taken { - out.Status = aibridgeproxyd.ProviderStatusError + out.Status = aibridged.ProviderStatusError out.Err = xerrors.Errorf("hostname %q already claimed by provider %q", host, claimedBy) return out } seenHost[host] = p.name out.Host = host - out.Status = aibridgeproxyd.ProviderStatusEnabled + out.Status = aibridged.ProviderStatusEnabled return out } @@ -151,10 +157,12 @@ func newReloadTestHarness(t *testing.T) *reloadTestHarness { t.Cleanup(bridged.Close) 
store := &providerStore{} + metrics := aibridgeproxyd.NewMetrics(prometheus.NewRegistry()) srv := newTestProxy(t, withCoderAccessURL(bridged.URL), withAllowedPorts("443"), withRefreshProviders(store.refresh), + withMetrics(metrics), ) certPool := getProxyCertPool(t) @@ -169,6 +177,7 @@ func newReloadTestHarness(t *testing.T) *reloadTestHarness { return &reloadTestHarness{ srv: srv, store: store, + metrics: metrics, client: client, bridged: bridged, recorder: recorder, @@ -236,6 +245,25 @@ func (h *reloadTestHarness) expectNotRouted(t *testing.T, targetURL string) { "aibridged must not be reached for non-routed host %s", targetURL) } +// expectProviderStatus asserts the provider_info series for (name, +// status) is present with value 1. +func (h *reloadTestHarness) expectProviderStatus(t *testing.T, name, status string) { + t.Helper() + assert.Equal(t, 1.0, promtest.ToFloat64(h.metrics.ProviderInfo.WithLabelValues(name, "openai", status)), + "expected provider_info{provider_name=%q, status=%q} == 1", name, status) +} + +// expectProviderAbsent asserts no series exists for the provider name +// in any status. This verifies the GaugeVec.Reset on each reload +// clears stale entries. 
+func (h *reloadTestHarness) expectProviderAbsent(t *testing.T, name string) { + t.Helper() + for _, status := range []string{"enabled", "disabled", "error"} { + assert.Equal(t, 0.0, promtest.ToFloat64(h.metrics.ProviderInfo.WithLabelValues(name, "openai", status)), + "expected no provider_info series for %q, found status %q", name, status) + } +} + // TestProxy_StaleTunnelStopsRoutingAfterProviderChange is the // regression test for a bug where a long-lived CONNECT tunnel that was // established while a provider was enabled kept routing decrypted @@ -377,14 +405,18 @@ func TestProxy_HotReloadRoutingCRUD(t *testing.T) { }) require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://alpha.invalid/v1/messages", "/api/v2/aibridge/alpha/v1/messages") + h.expectProviderStatus(t, "alpha", "enabled") // UpdateProviderName: the same BaseURL with a new name must route - // under the new name on the next Reload. + // under the new name on the next Reload. The renamed provider must + // not leave a stale alpha series behind. h.store.set([]rawProvider{ {name: "alpha-v2", baseURL: "https://alpha.invalid/v1"}, }) require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://alpha.invalid/v1/messages", "/api/v2/aibridge/alpha-v2/v1/messages") + h.expectProviderStatus(t, "alpha-v2", "enabled") + h.expectProviderAbsent(t, "alpha") // UpdateProviderBaseURLHost: moving the provider to a new host must // start MITM'ing the new host and stop MITM'ing the old one. @@ -394,6 +426,7 @@ func TestProxy_HotReloadRoutingCRUD(t *testing.T) { require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://alpha-new.invalid/v1/messages", "/api/v2/aibridge/alpha-v2/v1/messages") h.expectNotRouted(t, "https://alpha.invalid/v1/messages") + h.expectProviderStatus(t, "alpha-v2", "enabled") // AddSecondProvider: a second provider added in the same Reload must // route independently from the first. 
@@ -404,15 +437,19 @@ func TestProxy_HotReloadRoutingCRUD(t *testing.T) { require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://alpha-new.invalid/v1/messages", "/api/v2/aibridge/alpha-v2/v1/messages") h.expectRoutedTo(t, "https://beta.invalid/v1/chat/completions", "/api/v2/aibridge/beta/v1/chat/completions") + h.expectProviderStatus(t, "alpha-v2", "enabled") + h.expectProviderStatus(t, "beta", "enabled") // DeleteOneProvider: removing alpha must keep beta routed and stop - // routing alpha. + // routing alpha. The deleted name disappears from provider_info. h.store.set([]rawProvider{ {name: "beta", baseURL: "https://beta.invalid/v1"}, }) require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://beta.invalid/v1/chat/completions", "/api/v2/aibridge/beta/v1/chat/completions") h.expectNotRouted(t, "https://alpha-new.invalid/v1/messages") + h.expectProviderStatus(t, "beta", "enabled") + h.expectProviderAbsent(t, "alpha-v2") // DeleteAllProviders: an empty Reload must collapse the router to // the fail-closed state with no host MITM'd. @@ -420,6 +457,7 @@ func TestProxy_HotReloadRoutingCRUD(t *testing.T) { require.NoError(t, h.srv.Reload(t.Context())) h.expectNotRouted(t, "https://beta.invalid/v1/chat/completions") h.expectNotRouted(t, "https://alpha-new.invalid/v1/messages") + h.expectProviderAbsent(t, "beta") // RecreateAfterDelete: reintroducing a previously-deleted provider // must route again without restart, confirming the swap is @@ -429,6 +467,11 @@ func TestProxy_HotReloadRoutingCRUD(t *testing.T) { }) require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://alpha.invalid/v1/messages", "/api/v2/aibridge/alpha/v1/messages") + h.expectProviderStatus(t, "alpha", "enabled") + + // Both timestamp gauges must have advanced through this sequence. 
+ assert.Positive(t, promtest.ToFloat64(h.metrics.ProvidersLastReloadTimestampSeconds)) + assert.Positive(t, promtest.ToFloat64(h.metrics.ProvidersLastReloadSuccessTimestampSeconds)) } // TestProxy_HotReloadRoutingInvalidProviders covers the resilience @@ -453,6 +496,8 @@ func TestProxy_HotReloadRoutingInvalidProviders(t *testing.T) { require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://valid.invalid/v1/messages", "/api/v2/aibridge/valid/v1/messages") + h.expectProviderStatus(t, "no-url", "error") + h.expectProviderStatus(t, "valid", "enabled") }) t.Run("MalformedBaseURLSkipped", func(t *testing.T) { @@ -470,6 +515,9 @@ func TestProxy_HotReloadRoutingInvalidProviders(t *testing.T) { require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://valid.invalid/v1/messages", "/api/v2/aibridge/valid/v1/messages") + h.expectProviderStatus(t, "malformed", "error") + h.expectProviderStatus(t, "no-host", "error") + h.expectProviderStatus(t, "valid", "enabled") }) t.Run("DuplicateHostFirstWins", func(t *testing.T) { @@ -485,6 +533,8 @@ func TestProxy_HotReloadRoutingInvalidProviders(t *testing.T) { require.NoError(t, h.srv.Reload(t.Context())) h.expectRoutedTo(t, "https://shared.invalid/v1/messages", "/api/v2/aibridge/first/v1/messages") + h.expectProviderStatus(t, "first", "enabled") + h.expectProviderStatus(t, "second", "error") }) t.Run("AllInvalidYieldsEmptyRouter", func(t *testing.T) { diff --git a/enterprise/cli/aibridgeproxyd.go b/enterprise/cli/aibridgeproxyd.go index 00cbefaee6..08641f5769 100644 --- a/enterprise/cli/aibridgeproxyd.go +++ b/enterprise/cli/aibridgeproxyd.go @@ -118,37 +118,39 @@ func refreshProxyProviders(db database.Store) aibridgeproxyd.RefreshProvidersFun // hostname so later duplicates can be flagged as errors. 
func classifyProviderRow(row database.AIProvider, seenHost map[string]string) aibridgeproxyd.ReloadedProvider { out := aibridgeproxyd.ReloadedProvider{ - Name: row.Name, - Type: string(row.Type), + ProviderOutcome: aibridged.ProviderOutcome{ + Name: row.Name, + Type: string(row.Type), + }, } if !row.Enabled { - out.Status = aibridgeproxyd.ProviderStatusDisabled + out.Status = aibridged.ProviderStatusDisabled return out } if strings.TrimSpace(row.BaseUrl) == "" { - out.Status = aibridgeproxyd.ProviderStatusError + out.Status = aibridged.ProviderStatusError out.Err = xerrors.New("base url is empty") return out } u, err := url.Parse(row.BaseUrl) if err != nil { - out.Status = aibridgeproxyd.ProviderStatusError + out.Status = aibridged.ProviderStatusError out.Err = xerrors.Errorf("invalid base url %q: %w", row.BaseUrl, err) return out } host := strings.ToLower(u.Hostname()) if host == "" { - out.Status = aibridgeproxyd.ProviderStatusError + out.Status = aibridged.ProviderStatusError out.Err = xerrors.Errorf("base url %q has no hostname", row.BaseUrl) return out } if claimedBy, taken := seenHost[host]; taken { - out.Status = aibridgeproxyd.ProviderStatusError + out.Status = aibridged.ProviderStatusError out.Err = xerrors.Errorf("hostname %q already claimed by provider %q", host, claimedBy) return out } seenHost[host] = row.Name out.Host = host - out.Status = aibridgeproxyd.ProviderStatusEnabled + out.Status = aibridged.ProviderStatusEnabled return out } diff --git a/enterprise/cli/aibridgeproxyd_internal_test.go b/enterprise/cli/aibridgeproxyd_internal_test.go index 54c6c25f78..2c8520878b 100644 --- a/enterprise/cli/aibridgeproxyd_internal_test.go +++ b/enterprise/cli/aibridgeproxyd_internal_test.go @@ -7,8 +7,8 @@ import ( "github.com/stretchr/testify/assert" + "github.com/coder/coder/v2/coderd/aibridged" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/enterprise/aibridgeproxyd" ) // TestClassifyProviderRow covers every branch of the classifier 
so the @@ -34,7 +34,7 @@ func TestClassifyProviderRow(t *testing.T) { got := classifyProviderRow(enabledRow("openai", "https://api.openai.com/v1"), seen) assert.Equal(t, "openai", got.Name) assert.Equal(t, string(database.AiProviderTypeOpenai), got.Type) - assert.Equal(t, aibridgeproxyd.ProviderStatusEnabled, got.Status) + assert.Equal(t, aibridged.ProviderStatusEnabled, got.Status) assert.Equal(t, "api.openai.com", got.Host) assert.NoError(t, got.Err) assert.Equal(t, "openai", seen["api.openai.com"]) @@ -47,7 +47,7 @@ func TestClassifyProviderRow(t *testing.T) { row := enabledRow("off", "https://api.off.example.com/v1") row.Enabled = false got := classifyProviderRow(row, seen) - assert.Equal(t, aibridgeproxyd.ProviderStatusDisabled, got.Status) + assert.Equal(t, aibridged.ProviderStatusDisabled, got.Status) assert.Empty(t, got.Host, "disabled provider must not claim a host") assert.NoError(t, got.Err) assert.Empty(t, seen, "disabled provider must not occupy a host slot") @@ -58,7 +58,7 @@ func TestClassifyProviderRow(t *testing.T) { seen := map[string]string{} got := classifyProviderRow(enabledRow("no-url", " "), seen) - assert.Equal(t, aibridgeproxyd.ProviderStatusError, got.Status) + assert.Equal(t, aibridged.ProviderStatusError, got.Status) assert.Empty(t, got.Host) assert.ErrorContains(t, got.Err, "base url is empty") }) @@ -68,7 +68,7 @@ func TestClassifyProviderRow(t *testing.T) { seen := map[string]string{} got := classifyProviderRow(enabledRow("bad", "://not-a-url"), seen) - assert.Equal(t, aibridgeproxyd.ProviderStatusError, got.Status) + assert.Equal(t, aibridged.ProviderStatusError, got.Status) assert.ErrorContains(t, got.Err, "invalid base url") }) @@ -77,7 +77,7 @@ func TestClassifyProviderRow(t *testing.T) { seen := map[string]string{} got := classifyProviderRow(enabledRow("no-host", "https://"), seen) - assert.Equal(t, aibridgeproxyd.ProviderStatusError, got.Status) + assert.Equal(t, aibridged.ProviderStatusError, got.Status) assert.ErrorContains(t, 
got.Err, "no hostname") }) @@ -86,10 +86,10 @@ func TestClassifyProviderRow(t *testing.T) { seen := map[string]string{} first := classifyProviderRow(enabledRow("first", "https://shared.example.com/v1"), seen) - assert.Equal(t, aibridgeproxyd.ProviderStatusEnabled, first.Status) + assert.Equal(t, aibridged.ProviderStatusEnabled, first.Status) second := classifyProviderRow(enabledRow("second", "https://shared.example.com/v2"), seen) - assert.Equal(t, aibridgeproxyd.ProviderStatusError, second.Status) + assert.Equal(t, aibridged.ProviderStatusError, second.Status) assert.ErrorContains(t, second.Err, "already claimed by provider \"first\"") assert.Equal(t, "first", seen["shared.example.com"], "first wins must not be overwritten") }) @@ -99,7 +99,7 @@ func TestClassifyProviderRow(t *testing.T) { seen := map[string]string{} got := classifyProviderRow(enabledRow("mixed", "https://API.Example.COM/v1"), seen) - assert.Equal(t, aibridgeproxyd.ProviderStatusEnabled, got.Status) + assert.Equal(t, aibridged.ProviderStatusEnabled, got.Status) assert.Equal(t, "api.example.com", got.Host) }) } diff --git a/enterprise/coderd/aibridge_reload_test.go b/enterprise/coderd/aibridge_reload_test.go index 6aee3afa91..65f678df6e 100644 --- a/enterprise/coderd/aibridge_reload_test.go +++ b/enterprise/coderd/aibridge_reload_test.go @@ -9,6 +9,8 @@ import ( "sync/atomic" "testing" + "github.com/prometheus/client_golang/prometheus" + promtest "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.opentelemetry.io/otel" @@ -54,7 +56,7 @@ func newMockUpstream(t *testing.T, name string) *mockUpstream { // the supplied API and subscribes it to ai_providers change events. // This mirrors what cli/server.go does in production so /api/v2/aibridge // requests dispatch through the real pool and reloader. 
-func startTestAIBridgeDaemon(t *testing.T, api *coderd.API) { +func startTestAIBridgeDaemon(t *testing.T, api *coderd.API) *aibridged.Metrics { t.Helper() ctx := context.Background() @@ -62,14 +64,15 @@ func startTestAIBridgeDaemon(t *testing.T, api *coderd.API) { cfg := api.DeploymentValues.AI.BridgeConfig tracer := otel.Tracer("aibridge-reload-test") - providers, err := cli.BuildProviders(ctx, api.Database, cfg, logger) + providers, _, err := cli.BuildProviders(ctx, api.Database, cfg, logger) require.NoError(t, err) pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, logger.Named("pool"), nil, tracer) require.NoError(t, err) t.Cleanup(func() { _ = pool.Shutdown(context.Background()) }) - reloader := &testPoolReloader{pool: pool, db: api.Database, cfg: cfg, logger: logger.Named("reloader")} + metrics := aibridged.NewMetrics(prometheus.NewRegistry()) + reloader := &testPoolReloader{pool: pool, db: api.Database, cfg: cfg, logger: logger.Named("reloader"), metrics: metrics} unsubscribe, err := aibridged.SubscribeProviderReload(ctx, api.Pubsub, reloader, logger.Named("subscriber")) require.NoError(t, err) t.Cleanup(unsubscribe) @@ -81,21 +84,25 @@ func startTestAIBridgeDaemon(t *testing.T, api *coderd.API) { t.Cleanup(func() { _ = srv.Close() }) api.RegisterInMemoryAIBridgedHTTPHandler(srv) + return metrics } type testPoolReloader struct { - pool *aibridged.CachedBridgePool - db database.Store - cfg codersdk.AIBridgeConfig - logger slog.Logger + pool *aibridged.CachedBridgePool + db database.Store + cfg codersdk.AIBridgeConfig + logger slog.Logger + metrics *aibridged.Metrics } func (r *testPoolReloader) Reload(ctx context.Context) error { - providers, err := cli.BuildProviders(ctx, r.db, r.cfg, r.logger) + r.metrics.RecordReloadAttempt() + providers, outcomes, err := cli.BuildProviders(ctx, r.db, r.cfg, r.logger) if err != nil { return err } r.pool.ReplaceProviders(providers) + r.metrics.RecordReloadSuccess(outcomes) return nil
} @@ -124,7 +131,34 @@ func TestAIBridgeProviderHotReload(t *testing.T) { }, }) - startTestAIBridgeDaemon(t, api.AGPL) + metrics := startTestAIBridgeDaemon(t, api.AGPL) + + // requireProviderStatus polls until the provider_info series for + // (name, status) settles to value 1. Reloads happen via pubsub, so + // the assertion has to be eventual. + requireProviderStatus := func(t *testing.T, name, status string) { + t.Helper() + require.Eventuallyf(t, func() bool { + return promtest.ToFloat64(metrics.ProviderInfo.WithLabelValues(name, "openai", status)) == 1 + }, testutil.WaitShort, testutil.IntervalFast, + "expected provider_info{provider_name=%q, status=%q} == 1", name, status) + } + + // requireProviderAbsent polls until no series exists for the + // provider name in any status. After a delete the Reset on the + // next reload must clear all previous status labels for the name. + requireProviderAbsent := func(t *testing.T, name string) { + t.Helper() + require.Eventuallyf(t, func() bool { + for _, status := range []string{"enabled", "disabled", "error"} { + if promtest.ToFloat64(metrics.ProviderInfo.WithLabelValues(name, "openai", status)) != 0 { + return false + } + } + return true + }, testutil.WaitShort, testutil.IntervalFast, + "expected provider_info series for %q to be cleared after delete", name) + } ctx := testutil.Context(t, testutil.WaitLong) @@ -188,6 +222,7 @@ func TestAIBridgeProviderHotReload(t *testing.T) { require.NoError(t, err) require.Equal(t, "primary", created.Name) requireRoutesTo(t, "primary", upstreamA) + requireProviderStatus(t, "primary", "enabled") // 2. Update BaseURL: same name, now points at upstream B. newBaseURL := upstreamB.server.URL @@ -196,15 +231,17 @@ func TestAIBridgeProviderHotReload(t *testing.T) { }) require.NoError(t, err) requireRoutesTo(t, "primary", upstreamB) + requireProviderStatus(t, "primary", "enabled") // 3. Disable: the provider drops out of the snapshot, requests - // stop reaching any upstream. 
+ // stop reaching any upstream. The metric flips to "disabled". disabled := false _, err = client.UpdateAIProvider(ctx, "primary", codersdk.UpdateAIProviderRequest{ Enabled: &disabled, }) require.NoError(t, err) requireRoutingGone(t, "primary") + requireProviderStatus(t, "primary", "disabled") // 4. Re-enable: routing comes back at the most recent BaseURL. enabled := true @@ -213,6 +250,7 @@ func TestAIBridgeProviderHotReload(t *testing.T) { }) require.NoError(t, err) requireRoutesTo(t, "primary", upstreamB) + requireProviderStatus(t, "primary", "enabled") // 5. Add a second provider; both names must route independently. _, err = client.CreateAIProvider(ctx, codersdk.CreateAIProviderRequest{ @@ -225,9 +263,19 @@ func TestAIBridgeProviderHotReload(t *testing.T) { require.NoError(t, err) requireRoutesTo(t, "primary", upstreamB) requireRoutesTo(t, "secondary", upstreamA) + requireProviderStatus(t, "primary", "enabled") + requireProviderStatus(t, "secondary", "enabled") - // 6. Delete primary: only secondary remains routable. + // 6. Delete primary: only secondary remains routable. The + // provider_info series for primary disappears entirely on the + // next reload's Reset. require.NoError(t, client.DeleteAIProvider(ctx, "primary")) requireRoutingGone(t, "primary") requireRoutesTo(t, "secondary", upstreamA) + requireProviderAbsent(t, "primary") + requireProviderStatus(t, "secondary", "enabled") + + // Both timestamp gauges must have advanced during this test. 
+ assert.Positive(t, promtest.ToFloat64(metrics.ProvidersLastReloadTimestampSeconds)) + assert.Positive(t, promtest.ToFloat64(metrics.ProvidersLastReloadSuccessTimestampSeconds)) } diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 653de99241..036ac496a1 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -208,3 +208,21 @@ coder_aibridgeproxyd_mitm_requests_total{provider=""} 0 # HELP coder_aibridgeproxyd_mitm_responses_total Total number of MITM responses by HTTP status code class. # TYPE coder_aibridgeproxyd_mitm_responses_total counter coder_aibridgeproxyd_mitm_responses_total{code="",provider=""} 0 +# HELP coder_aibridged_provider_info One series per configured AI provider. Value is always 1; the status label (enabled, disabled, error) carries the alertable signal. +# TYPE coder_aibridged_provider_info gauge +coder_aibridged_provider_info{provider_name="",provider_type="",status=""} 0 +# HELP coder_aibridged_providers_last_reload_timestamp_seconds Unix timestamp of the last provider reload attempt, success or failure. +# TYPE coder_aibridged_providers_last_reload_timestamp_seconds gauge +coder_aibridged_providers_last_reload_timestamp_seconds 0 +# HELP coder_aibridged_providers_last_reload_success_timestamp_seconds Unix timestamp of the last provider reload that successfully refreshed the pool. A gap against coder_aibridged_providers_last_reload_timestamp_seconds means the loop is firing but the refresh function is failing. +# TYPE coder_aibridged_providers_last_reload_success_timestamp_seconds gauge +coder_aibridged_providers_last_reload_success_timestamp_seconds 0 +# HELP coder_aibridgeproxyd_provider_info One series per configured AI provider. Value is always 1; the status label (enabled, disabled, error) carries the alertable signal. 
+# TYPE coder_aibridgeproxyd_provider_info gauge +coder_aibridgeproxyd_provider_info{provider_name="",provider_type="",status=""} 0 +# HELP coder_aibridgeproxyd_providers_last_reload_timestamp_seconds Unix timestamp of the last provider reload attempt, success or failure. +# TYPE coder_aibridgeproxyd_providers_last_reload_timestamp_seconds gauge +coder_aibridgeproxyd_providers_last_reload_timestamp_seconds 0 +# HELP coder_aibridgeproxyd_providers_last_reload_success_timestamp_seconds Unix timestamp of the last provider reload that successfully refreshed the router. A gap against coder_aibridgeproxyd_providers_last_reload_timestamp_seconds means the loop is firing but the refresh function is failing. +# TYPE coder_aibridgeproxyd_providers_last_reload_success_timestamp_seconds gauge +coder_aibridgeproxyd_providers_last_reload_success_timestamp_seconds 0 diff --git a/scripts/metricsdocgen/scanner/scanner.go b/scripts/metricsdocgen/scanner/scanner.go index eee4166e49..f7ab57f9d4 100644 --- a/scripts/metricsdocgen/scanner/scanner.go +++ b/scripts/metricsdocgen/scanner/scanner.go @@ -40,6 +40,7 @@ var scanDirs = []string{ // // eliminate the need for this skip list. var skipPaths = []string{ + "coderd/aibridged/metrics.go", "enterprise/aibridgeproxyd/metrics.go", }