From 04b0253e8a67f44a12978f38c8b7d7cd4fae33aa Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 29 Jan 2026 13:50:15 +0100 Subject: [PATCH] feat: add Prometheus metrics for license warnings and errors (#21749) Fixes: coder/internal#767 Adds two new Prometheus metrics for license health monitoring: - `coderd_license_warnings` - count of active license warnings - `coderd_license_errors` - count of active license errors Metrics endpoint after startup of a deployment with license enabled: ``` ... # HELP coderd_license_errors The number of active license errors. # TYPE coderd_license_errors gauge coderd_license_errors 0 ... # HELP coderd_license_warnings The number of active license warnings. # TYPE coderd_license_warnings gauge coderd_license_warnings 0 ... ``` --- coderd/entitlements/entitlements.go | 6 + docs/admin/integrations/prometheus.md | 2 + enterprise/coderd/license/metricscollector.go | 22 +++ .../coderd/license/metricscollector_test.go | 134 ++++++++++++++++-- .../license/testdata/license-metrics.json | 4 +- scripts/metricsdocgen/metrics | 6 + 6 files changed, 164 insertions(+), 10 deletions(-) diff --git a/coderd/entitlements/entitlements.go b/coderd/entitlements/entitlements.go index 1be422b476..6da2bc17b5 100644 --- a/coderd/entitlements/entitlements.go +++ b/coderd/entitlements/entitlements.go @@ -162,6 +162,12 @@ func (l *Set) Errors() []string { return slices.Clone(l.entitlements.Errors) } +func (l *Set) Warnings() []string { + l.entitlementsMu.RLock() + defer l.entitlementsMu.RUnlock() + return slices.Clone(l.entitlements.Warnings) +} + func (l *Set) HasLicense() bool { l.entitlementsMu.RLock() defer l.entitlementsMu.RUnlock() diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index aabb8ec95b..ffa1957b73 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -147,8 +147,10 @@ deployment. They will always be available from the agent. | `coderd_insights_parameters` | gauge | The parameter usage per template. | `parameter_name` `parameter_type` `parameter_value` `template_name` | | `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `template_name` | | `coderd_license_active_users` | gauge | The number of active users. | | +| `coderd_license_errors` | gauge | The number of active license errors. | | | `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | | | `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | | +| `coderd_license_warnings` | gauge | The number of active license warnings. | | | `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | | `coderd_oauth2_external_requests_rate_limit` | gauge | The total number of allowed requests per interval. | `name` `resource` | | `coderd_oauth2_external_requests_rate_limit_next_reset_unix` | gauge | Unix timestamp of the next interval | `name` `resource` | diff --git a/enterprise/coderd/license/metricscollector.go b/enterprise/coderd/license/metricscollector.go index 8c0ccd83fb..a9888f4c22 100644 --- a/enterprise/coderd/license/metricscollector.go +++ b/enterprise/coderd/license/metricscollector.go @@ -11,6 +11,10 @@ var ( activeUsersDesc = prometheus.NewDesc("coderd_license_active_users", "The number of active users.", nil, nil) limitUsersDesc = prometheus.NewDesc("coderd_license_limit_users", "The user seats limit based on the active Coder license.", nil, nil) userLimitEnabledDesc = prometheus.NewDesc("coderd_license_user_limit_enabled", "Returns 1 if the current license enforces the user limit.", nil, nil) + + // Metrics for license warnings and errors. + licenseWarningsDesc = prometheus.NewDesc("coderd_license_warnings", "The number of active license warnings.", nil, nil) + licenseErrorsDesc = prometheus.NewDesc("coderd_license_errors", "The number of active license errors.", nil, nil) ) type MetricsCollector struct { @@ -23,9 +27,19 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { descCh <- activeUsersDesc descCh <- limitUsersDesc descCh <- userLimitEnabledDesc + descCh <- licenseWarningsDesc + descCh <- licenseErrorsDesc } func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { + // Collect user limit metrics. + mc.collectUserLimit(metricsCh) + + // Collect license warnings and errors metrics. + mc.collectWarningsAndErrors(metricsCh) +} + +func (mc *MetricsCollector) collectUserLimit(metricsCh chan<- prometheus.Metric) { userLimitEntitlement, ok := mc.Entitlements.Feature(codersdk.FeatureUserLimit) if !ok { return @@ -45,3 +59,11 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { metricsCh <- prometheus.MustNewConstMetric(limitUsersDesc, prometheus.GaugeValue, float64(*userLimitEntitlement.Limit)) } } + +func (mc *MetricsCollector) collectWarningsAndErrors(metricsCh chan<- prometheus.Metric) { + warnings := mc.Entitlements.Warnings() + errors := mc.Entitlements.Errors() + + metricsCh <- prometheus.MustNewConstMetric(licenseWarningsDesc, prometheus.GaugeValue, float64(len(warnings))) + metricsCh <- prometheus.MustNewConstMetric(licenseErrorsDesc, prometheus.GaugeValue, float64(len(errors))) +} diff --git a/enterprise/coderd/license/metricscollector_test.go b/enterprise/coderd/license/metricscollector_test.go index 3c2e7860b6..48083b85ed 100644 --- a/enterprise/coderd/license/metricscollector_test.go +++ b/enterprise/coderd/license/metricscollector_test.go @@ -7,6 +7,7 @@ import ( "github.com/aws/smithy-go/ptr" "github.com/prometheus/client_golang/prometheus" + prometheus_client "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" "github.com/coder/coder/v2/coderd/entitlements" @@ -48,16 +49,131 @@ func TestCollectLicenseMetrics(t *testing.T) { err = json.Unmarshal(goldenFile, &golden) require.NoError(t, err) - collected := map[string]int{} - for _, metric := range metrics { - switch metric.GetName() { - case "coderd_license_active_users", "coderd_license_limit_users", "coderd_license_user_limit_enabled": - for _, m := range metric.Metric { - collected[metric.GetName()] = int(m.Gauge.GetValue()) + for name, expected := range golden { + actual, ok := findMetric(metrics, name) + require.True(t, ok, "metric %s not found", name) + require.Equal(t, expected, actual, "metric %s", name) + } +} + +func TestCollectLicenseMetrics_WarningsAndErrors(t *testing.T) { + t.Parallel() + + t.Run("NoWarningsOrErrors", func(t *testing.T) { + t.Parallel() + + registry := prometheus.NewRegistry() + var sut license.MetricsCollector + sut.Entitlements = entitlements.New() + + registry.Register(&sut) + + metrics, err := registry.Gather() + require.NoError(t, err) + + warnings, ok := findMetric(metrics, "coderd_license_warnings") + require.True(t, ok) + require.Zero(t, warnings) + + errors, ok := findMetric(metrics, "coderd_license_errors") + require.True(t, ok) + require.Zero(t, errors) + }) + + t.Run("WithWarnings", func(t *testing.T) { + t.Parallel() + + registry := prometheus.NewRegistry() + var sut license.MetricsCollector + sut.Entitlements = entitlements.New() + sut.Entitlements.Modify(func(entitlements *codersdk.Entitlements) { + entitlements.Warnings = []string{ + "License expires in 30 days", + "User limit is at 90% capacity", + } + }) + + registry.Register(&sut) + + metrics, err := registry.Gather() + require.NoError(t, err) + + warnings, ok := findMetric(metrics, "coderd_license_warnings") + require.True(t, ok) + require.Equal(t, 2, warnings) + + errors, ok := findMetric(metrics, "coderd_license_errors") + require.True(t, ok) + require.Zero(t, errors) + }) + + t.Run("WithErrors", func(t *testing.T) { + t.Parallel() + + registry := prometheus.NewRegistry() + var sut license.MetricsCollector + sut.Entitlements = entitlements.New() + sut.Entitlements.Modify(func(entitlements *codersdk.Entitlements) { + entitlements.Errors = []string{ + "License has expired", + } + }) + + registry.Register(&sut) + + metrics, err := registry.Gather() + require.NoError(t, err) + + warnings, ok := findMetric(metrics, "coderd_license_warnings") + require.True(t, ok) + require.Zero(t, warnings) + + errors, ok := findMetric(metrics, "coderd_license_errors") + require.True(t, ok) + require.Equal(t, 1, errors) + }) + + t.Run("WithBothWarningsAndErrors", func(t *testing.T) { + t.Parallel() + + registry := prometheus.NewRegistry() + var sut license.MetricsCollector + sut.Entitlements = entitlements.New() + sut.Entitlements.Modify(func(entitlements *codersdk.Entitlements) { + entitlements.Warnings = []string{ + "License expires in 7 days", + "User limit is at 95% capacity", + "Feature X is deprecated", + } + entitlements.Errors = []string{ + "Invalid license signature", + "License UUID mismatch", + } + }) + + registry.Register(&sut) + + metrics, err := registry.Gather() + require.NoError(t, err) + + warnings, ok := findMetric(metrics, "coderd_license_warnings") + require.True(t, ok) + require.Equal(t, 3, warnings) + + errors, ok := findMetric(metrics, "coderd_license_errors") + require.True(t, ok) + require.Equal(t, 2, errors) + }) +} + +// findMetric searches for a metric by name and returns its value. +func findMetric(metrics []*prometheus_client.MetricFamily, name string) (int, bool) { + for _, metric := range metrics { + if metric.GetName() == name { + for _, m := range metric.Metric { + return int(m.Gauge.GetValue()), true } - default: - require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } - require.EqualValues(t, golden, collected) + return 0, false } diff --git a/enterprise/coderd/license/testdata/license-metrics.json b/enterprise/coderd/license/testdata/license-metrics.json index 3b4740ba15..bba78687f5 100644 --- a/enterprise/coderd/license/testdata/license-metrics.json +++ b/enterprise/coderd/license/testdata/license-metrics.json @@ -1,5 +1,7 @@ { "coderd_license_active_users": 4, "coderd_license_limit_users": 7, - "coderd_license_user_limit_enabled": 1 + "coderd_license_user_limit_enabled": 1, + "coderd_license_warnings": 0, + "coderd_license_errors": 0 } diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 7c9c8ef845..78b5772f07 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -978,3 +978,9 @@ coderd_agentapi_metadata_flushed_total 71 # HELP coderd_agentapi_metadata_publish_errors_total Total number of metadata batch pubsub publish calls that have resulted in an error. # TYPE coderd_agentapi_metadata_publish_errors_total counter coderd_agentapi_metadata_publish_errors_total 0 +# HELP coderd_license_warnings The number of active license warnings. +# TYPE coderd_license_warnings gauge +coderd_license_warnings 0 +# HELP coderd_license_errors The number of active license errors. +# TYPE coderd_license_errors gauge +coderd_license_errors 0