feat: add Prometheus metrics for license warnings and errors (#21749)

Fixes: coder/internal#767

Adds two new Prometheus metrics for license health monitoring:

- `coderd_license_warnings` - count of active license warnings
- `coderd_license_errors` - count of active license errors

Example output from the metrics endpoint after starting a deployment with a license enabled:

```
...
# HELP coderd_license_errors The number of active license errors.
# TYPE coderd_license_errors gauge
coderd_license_errors 0
...
# HELP coderd_license_warnings The number of active license warnings.
# TYPE coderd_license_warnings gauge
coderd_license_warnings 0
...
```
This commit is contained in:
Marcin Tojek
2026-01-29 13:50:15 +01:00
committed by GitHub
parent 06e396188f
commit 04b0253e8a
6 changed files with 164 additions and 10 deletions
+6
View File
@@ -162,6 +162,12 @@ func (l *Set) Errors() []string {
return slices.Clone(l.entitlements.Errors) return slices.Clone(l.entitlements.Errors)
} }
// Warnings returns a copy of the warnings held by the current entitlements.
// The read lock is held while the slice is cloned, and because the result is
// a clone, callers may modify it without touching the set's own slice.
func (l *Set) Warnings() []string {
	l.entitlementsMu.RLock()
	defer l.entitlementsMu.RUnlock()

	current := l.entitlements.Warnings
	return slices.Clone(current)
}
func (l *Set) HasLicense() bool { func (l *Set) HasLicense() bool {
l.entitlementsMu.RLock() l.entitlementsMu.RLock()
defer l.entitlementsMu.RUnlock() defer l.entitlementsMu.RUnlock()
+2
View File
@@ -147,8 +147,10 @@ deployment. They will always be available from the agent.
| `coderd_insights_parameters` | gauge | The parameter usage per template. | `parameter_name` `parameter_type` `parameter_value` `template_name` | | `coderd_insights_parameters` | gauge | The parameter usage per template. | `parameter_name` `parameter_type` `parameter_value` `template_name` |
| `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `template_name` | | `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `template_name` |
| `coderd_license_active_users` | gauge | The number of active users. | | | `coderd_license_active_users` | gauge | The number of active users. | |
| `coderd_license_errors` | gauge | The number of active license errors. | |
| `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | | | `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | |
| `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | | | `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | |
| `coderd_license_warnings` | gauge | The number of active license warnings. | |
| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | | `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | |
| `coderd_oauth2_external_requests_rate_limit` | gauge | The total number of allowed requests per interval. | `name` `resource` | | `coderd_oauth2_external_requests_rate_limit` | gauge | The total number of allowed requests per interval. | `name` `resource` |
| `coderd_oauth2_external_requests_rate_limit_next_reset_unix` | gauge | Unix timestamp of the next interval | `name` `resource` | | `coderd_oauth2_external_requests_rate_limit_next_reset_unix` | gauge | Unix timestamp of the next interval | `name` `resource` |
@@ -11,6 +11,10 @@ var (
activeUsersDesc = prometheus.NewDesc("coderd_license_active_users", "The number of active users.", nil, nil) activeUsersDesc = prometheus.NewDesc("coderd_license_active_users", "The number of active users.", nil, nil)
limitUsersDesc = prometheus.NewDesc("coderd_license_limit_users", "The user seats limit based on the active Coder license.", nil, nil) limitUsersDesc = prometheus.NewDesc("coderd_license_limit_users", "The user seats limit based on the active Coder license.", nil, nil)
userLimitEnabledDesc = prometheus.NewDesc("coderd_license_user_limit_enabled", "Returns 1 if the current license enforces the user limit.", nil, nil) userLimitEnabledDesc = prometheus.NewDesc("coderd_license_user_limit_enabled", "Returns 1 if the current license enforces the user limit.", nil, nil)
// Metrics for license warnings and errors.
licenseWarningsDesc = prometheus.NewDesc("coderd_license_warnings", "The number of active license warnings.", nil, nil)
licenseErrorsDesc = prometheus.NewDesc("coderd_license_errors", "The number of active license errors.", nil, nil)
) )
type MetricsCollector struct { type MetricsCollector struct {
@@ -23,9 +27,19 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- activeUsersDesc descCh <- activeUsersDesc
descCh <- limitUsersDesc descCh <- limitUsersDesc
descCh <- userLimitEnabledDesc descCh <- userLimitEnabledDesc
descCh <- licenseWarningsDesc
descCh <- licenseErrorsDesc
} }
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
// Collect user limit metrics.
mc.collectUserLimit(metricsCh)
// Collect license warnings and errors metrics.
mc.collectWarningsAndErrors(metricsCh)
}
func (mc *MetricsCollector) collectUserLimit(metricsCh chan<- prometheus.Metric) {
userLimitEntitlement, ok := mc.Entitlements.Feature(codersdk.FeatureUserLimit) userLimitEntitlement, ok := mc.Entitlements.Feature(codersdk.FeatureUserLimit)
if !ok { if !ok {
return return
@@ -45,3 +59,11 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
metricsCh <- prometheus.MustNewConstMetric(limitUsersDesc, prometheus.GaugeValue, float64(*userLimitEntitlement.Limit)) metricsCh <- prometheus.MustNewConstMetric(limitUsersDesc, prometheus.GaugeValue, float64(*userLimitEntitlement.Limit))
} }
} }
// collectWarningsAndErrors emits two gauges on metricsCh: the number of
// warnings and the number of errors currently reported by the entitlements.
// Both gauges are always sent, so they read 0 when there is nothing to report.
func (mc *MetricsCollector) collectWarningsAndErrors(metricsCh chan<- prometheus.Metric) {
	warningCount := len(mc.Entitlements.Warnings())
	errorCount := len(mc.Entitlements.Errors())

	metricsCh <- prometheus.MustNewConstMetric(
		licenseWarningsDesc, prometheus.GaugeValue, float64(warningCount),
	)
	metricsCh <- prometheus.MustNewConstMetric(
		licenseErrorsDesc, prometheus.GaugeValue, float64(errorCount),
	)
}
@@ -7,6 +7,7 @@ import (
"github.com/aws/smithy-go/ptr" "github.com/aws/smithy-go/ptr"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
prometheus_client "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/coder/coder/v2/coderd/entitlements" "github.com/coder/coder/v2/coderd/entitlements"
@@ -48,16 +49,131 @@ func TestCollectLicenseMetrics(t *testing.T) {
err = json.Unmarshal(goldenFile, &golden) err = json.Unmarshal(goldenFile, &golden)
require.NoError(t, err) require.NoError(t, err)
collected := map[string]int{} for name, expected := range golden {
for _, metric := range metrics { actual, ok := findMetric(metrics, name)
switch metric.GetName() { require.True(t, ok, "metric %s not found", name)
case "coderd_license_active_users", "coderd_license_limit_users", "coderd_license_user_limit_enabled": require.Equal(t, expected, actual, "metric %s", name)
for _, m := range metric.Metric { }
collected[metric.GetName()] = int(m.Gauge.GetValue()) }
func TestCollectLicenseMetrics_WarningsAndErrors(t *testing.T) {
t.Parallel()
t.Run("NoWarningsOrErrors", func(t *testing.T) {
t.Parallel()
registry := prometheus.NewRegistry()
var sut license.MetricsCollector
sut.Entitlements = entitlements.New()
registry.Register(&sut)
metrics, err := registry.Gather()
require.NoError(t, err)
warnings, ok := findMetric(metrics, "coderd_license_warnings")
require.True(t, ok)
require.Zero(t, warnings)
errors, ok := findMetric(metrics, "coderd_license_errors")
require.True(t, ok)
require.Zero(t, errors)
})
t.Run("WithWarnings", func(t *testing.T) {
t.Parallel()
registry := prometheus.NewRegistry()
var sut license.MetricsCollector
sut.Entitlements = entitlements.New()
sut.Entitlements.Modify(func(entitlements *codersdk.Entitlements) {
entitlements.Warnings = []string{
"License expires in 30 days",
"User limit is at 90% capacity",
}
})
registry.Register(&sut)
metrics, err := registry.Gather()
require.NoError(t, err)
warnings, ok := findMetric(metrics, "coderd_license_warnings")
require.True(t, ok)
require.Equal(t, 2, warnings)
errors, ok := findMetric(metrics, "coderd_license_errors")
require.True(t, ok)
require.Zero(t, errors)
})
t.Run("WithErrors", func(t *testing.T) {
t.Parallel()
registry := prometheus.NewRegistry()
var sut license.MetricsCollector
sut.Entitlements = entitlements.New()
sut.Entitlements.Modify(func(entitlements *codersdk.Entitlements) {
entitlements.Errors = []string{
"License has expired",
}
})
registry.Register(&sut)
metrics, err := registry.Gather()
require.NoError(t, err)
warnings, ok := findMetric(metrics, "coderd_license_warnings")
require.True(t, ok)
require.Zero(t, warnings)
errors, ok := findMetric(metrics, "coderd_license_errors")
require.True(t, ok)
require.Equal(t, 1, errors)
})
t.Run("WithBothWarningsAndErrors", func(t *testing.T) {
t.Parallel()
registry := prometheus.NewRegistry()
var sut license.MetricsCollector
sut.Entitlements = entitlements.New()
sut.Entitlements.Modify(func(entitlements *codersdk.Entitlements) {
entitlements.Warnings = []string{
"License expires in 7 days",
"User limit is at 95% capacity",
"Feature X is deprecated",
}
entitlements.Errors = []string{
"Invalid license signature",
"License UUID mismatch",
}
})
registry.Register(&sut)
metrics, err := registry.Gather()
require.NoError(t, err)
warnings, ok := findMetric(metrics, "coderd_license_warnings")
require.True(t, ok)
require.Equal(t, 3, warnings)
errors, ok := findMetric(metrics, "coderd_license_errors")
require.True(t, ok)
require.Equal(t, 2, errors)
})
}
// findMetric searches for a metric by name and returns its value.
func findMetric(metrics []*prometheus_client.MetricFamily, name string) (int, bool) {
for _, metric := range metrics {
if metric.GetName() == name {
for _, m := range metric.Metric {
return int(m.Gauge.GetValue()), true
} }
default:
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
} }
} }
require.EqualValues(t, golden, collected) return 0, false
} }
+3 -1
View File
@@ -1,5 +1,7 @@
{ {
"coderd_license_active_users": 4, "coderd_license_active_users": 4,
"coderd_license_limit_users": 7, "coderd_license_limit_users": 7,
"coderd_license_user_limit_enabled": 1 "coderd_license_user_limit_enabled": 1,
"coderd_license_warnings": 0,
"coderd_license_errors": 0
} }
+6
View File
@@ -978,3 +978,9 @@ coderd_agentapi_metadata_flushed_total 71
# HELP coderd_agentapi_metadata_publish_errors_total Total number of metadata batch pubsub publish calls that have resulted in an error. # HELP coderd_agentapi_metadata_publish_errors_total Total number of metadata batch pubsub publish calls that have resulted in an error.
# TYPE coderd_agentapi_metadata_publish_errors_total counter # TYPE coderd_agentapi_metadata_publish_errors_total counter
coderd_agentapi_metadata_publish_errors_total 0 coderd_agentapi_metadata_publish_errors_total 0
# HELP coderd_license_warnings The number of active license warnings.
# TYPE coderd_license_warnings gauge
coderd_license_warnings 0
# HELP coderd_license_errors The number of active license errors.
# TYPE coderd_license_errors gauge
coderd_license_errors 0