mirror of
https://github.com/coder/coder.git
synced 2026-06-03 04:58:23 +00:00
bddb808b25
Fixes all our Go file imports to match the preferred spec that we've _mostly_ been using. For example: ``` import ( "context" "time" "github.com/prometheus/client_golang/prometheus" "golang.org/x/xerrors" "gopkg.in/natefinch/lumberjack.v2" "cdr.dev/slog/v3" "github.com/coder/coder/v2/codersdk/agentsdk" "github.com/coder/serpent" ) ``` 3 groups: standard library, 3rd-party libs, Coder libs. This PR makes the change across the codebase. The PR in the stack above modifies our formatting to maintain this state of affairs, and is a separate PR so it's possible to review that one in detail.
543 lines
15 KiB
Go
543 lines
15 KiB
Go
package prometheusmetrics
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/common/model"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3"
|
|
agentproto "github.com/coder/coder/v2/agent/proto"
|
|
"github.com/coder/coder/v2/coderd/agentmetrics"
|
|
"github.com/coder/coder/v2/coderd/pproflabel"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
// metricHelpForAgent is the help string that replaces the help message of
// every agent metric. A registry cannot hold conflicting help messages for the
// same metric within one "gather", which could otherwise happen when coder
// agents run different versions.
const metricHelpForAgent = "Metrics are forwarded from workspace agents connected to this instance of coderd."
|
|
|
|
// loggerName is the name given to the aggregator's logger.
const loggerName = "prometheusmetrics"

// Buffer capacities of the aggregator's request channels.
const (
	sizeCollectCh = 10
	sizeUpdateCh  = 4096
)

// defaultMetricsCleanupInterval is the cleanup interval used when the caller
// does not supply a positive duration.
const defaultMetricsCleanupInterval = 2 * time.Minute
|
|
|
|
// MetricLabelValueEncoder escapes a label value by prefixing each backslash,
// "|", "," and "=" with a backslash. hashKey applies it to label values before
// joining them into a single string.
var MetricLabelValueEncoder = strings.NewReplacer(
	`\`, `\\`,
	`|`, `\|`,
	`,`, `\,`,
	`=`, `\=`,
)
|
|
|
|
type descCacheEntry struct {
|
|
desc *prometheus.Desc
|
|
lastUsed time.Time
|
|
}
|
|
|
|
type MetricsAggregator struct {
|
|
store map[metricKey]annotatedMetric
|
|
|
|
log slog.Logger
|
|
metricsCleanupInterval time.Duration
|
|
clock quartz.Clock
|
|
|
|
collectCh chan (chan []prometheus.Metric)
|
|
updateCh chan updateRequest
|
|
|
|
storeSizeGauge prometheus.Gauge
|
|
updateHistogram prometheus.Histogram
|
|
cleanupHistogram prometheus.Histogram
|
|
aggregateByLabels []string
|
|
// per-aggregator cache of descriptors
|
|
descCache map[string]descCacheEntry
|
|
}
|
|
|
|
type updateRequest struct {
|
|
username string
|
|
workspaceName string
|
|
agentName string
|
|
templateName string
|
|
|
|
metrics []*agentproto.Stats_Metric
|
|
|
|
timestamp time.Time
|
|
}
|
|
|
|
type annotatedMetric struct {
|
|
*agentproto.Stats_Metric
|
|
|
|
username string
|
|
workspaceName string
|
|
agentName string
|
|
templateName string
|
|
|
|
expiryDate time.Time
|
|
|
|
aggregateByLabels []string
|
|
}
|
|
|
|
// metricKey identifies one metric series in the aggregator's store: the
// reporting agent's identity, the metric name, and the serialized label set
// produced by hashKey.
type metricKey struct {
	username, workspaceName, agentName, templateName string

	metricName, labelsStr string
}
|
|
|
|
func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey {
|
|
labelPairs := make(sort.StringSlice, 0, len(m.GetLabels()))
|
|
for _, label := range m.GetLabels() {
|
|
if label.Value == "" {
|
|
continue
|
|
}
|
|
labelPairs = append(labelPairs, fmt.Sprintf("%s=%s", label.Name, MetricLabelValueEncoder.Replace(label.Value)))
|
|
}
|
|
labelPairs.Sort()
|
|
return metricKey{
|
|
username: req.username,
|
|
workspaceName: req.workspaceName,
|
|
agentName: req.agentName,
|
|
templateName: req.templateName,
|
|
metricName: m.Name,
|
|
labelsStr: strings.Join(labelPairs, ","),
|
|
}
|
|
}
|
|
|
|
// Compile-time check that *MetricsAggregator satisfies prometheus.Collector.
var _ prometheus.Collector = (*MetricsAggregator)(nil)
|
|
|
|
// getFieldByLabel returns the related field value for a given label
|
|
func (am *annotatedMetric) getFieldByLabel(label string) (string, error) {
|
|
var labelVal string
|
|
switch label {
|
|
case agentmetrics.LabelWorkspaceName:
|
|
labelVal = am.workspaceName
|
|
case agentmetrics.LabelTemplateName:
|
|
labelVal = am.templateName
|
|
case agentmetrics.LabelAgentName:
|
|
labelVal = am.agentName
|
|
case agentmetrics.LabelUsername:
|
|
labelVal = am.username
|
|
default:
|
|
return "", xerrors.Errorf("unexpected label: %q", label)
|
|
}
|
|
|
|
return labelVal, nil
|
|
}
|
|
|
|
func (am *annotatedMetric) shallowCopy() annotatedMetric {
|
|
stats := &agentproto.Stats_Metric{
|
|
Name: am.Name,
|
|
Type: am.Type,
|
|
Value: am.Value,
|
|
Labels: am.Labels,
|
|
}
|
|
|
|
return annotatedMetric{
|
|
Stats_Metric: stats,
|
|
username: am.username,
|
|
workspaceName: am.workspaceName,
|
|
agentName: am.agentName,
|
|
templateName: am.templateName,
|
|
expiryDate: am.expiryDate,
|
|
}
|
|
}
|
|
|
|
func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration, aggregateByLabels []string, options ...func(*MetricsAggregator)) (*MetricsAggregator, error) {
|
|
metricsCleanupInterval := defaultMetricsCleanupInterval
|
|
if duration > 0 {
|
|
metricsCleanupInterval = duration
|
|
}
|
|
|
|
storeSizeGauge := prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "prometheusmetrics",
|
|
Name: "metrics_aggregator_store_size",
|
|
Help: "The number of metrics stored in the aggregator",
|
|
})
|
|
err := registerer.Register(storeSizeGauge)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
updateHistogram := prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "prometheusmetrics",
|
|
Name: "metrics_aggregator_execution_update_seconds",
|
|
Help: "Histogram for duration of metrics aggregator update in seconds.",
|
|
Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
|
|
})
|
|
err = registerer.Register(updateHistogram)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
cleanupHistogram := prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "prometheusmetrics",
|
|
Name: "metrics_aggregator_execution_cleanup_seconds",
|
|
Help: "Histogram for duration of metrics aggregator cleanup in seconds.",
|
|
Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
|
|
})
|
|
err = registerer.Register(cleanupHistogram)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ma := &MetricsAggregator{
|
|
log: logger.Named(loggerName),
|
|
metricsCleanupInterval: metricsCleanupInterval,
|
|
clock: quartz.NewReal(),
|
|
|
|
store: map[metricKey]annotatedMetric{},
|
|
|
|
collectCh: make(chan (chan []prometheus.Metric), sizeCollectCh),
|
|
updateCh: make(chan updateRequest, sizeUpdateCh),
|
|
|
|
storeSizeGauge: storeSizeGauge,
|
|
updateHistogram: updateHistogram,
|
|
cleanupHistogram: cleanupHistogram,
|
|
|
|
aggregateByLabels: aggregateByLabels,
|
|
}
|
|
|
|
for _, option := range options {
|
|
option(ma)
|
|
}
|
|
|
|
return ma, nil
|
|
}
|
|
|
|
func WithClock(clock quartz.Clock) func(*MetricsAggregator) {
|
|
return func(ma *MetricsAggregator) {
|
|
ma.clock = clock
|
|
}
|
|
}
|
|
|
|
// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels.
|
|
type labelAggregator struct {
|
|
aggregations map[string]float64
|
|
metrics map[string]annotatedMetric
|
|
}
|
|
|
|
func newLabelAggregator(size int) *labelAggregator {
|
|
return &labelAggregator{
|
|
aggregations: make(map[string]float64, size),
|
|
metrics: make(map[string]annotatedMetric, size),
|
|
}
|
|
}
|
|
|
|
func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error {
|
|
// Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering.
|
|
labelSet := make(model.LabelSet, len(labels))
|
|
|
|
for _, label := range labels {
|
|
val, err := am.getFieldByLabel(label)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
labelSet[model.LabelName(label)] = model.LabelValue(val)
|
|
}
|
|
|
|
// Memoize based on the metric name & the unique combination of labels.
|
|
key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint())
|
|
|
|
// Aggregate the value based on the key.
|
|
a.aggregations[key] += am.Value
|
|
|
|
metric, found := a.metrics[key]
|
|
if !found {
|
|
// Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers.
|
|
metric = am.shallowCopy()
|
|
}
|
|
|
|
// Store the metric.
|
|
metric.aggregateByLabels = labels
|
|
metric.Value = a.aggregations[key]
|
|
|
|
a.metrics[key] = metric
|
|
|
|
return nil
|
|
}
|
|
|
|
func (a *labelAggregator) listMetrics() []annotatedMetric {
|
|
var out []annotatedMetric
|
|
for _, am := range a.metrics {
|
|
out = append(out, am)
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (ma *MetricsAggregator) Run(ctx context.Context) func() {
|
|
ctx, cancelFunc := context.WithCancel(ctx)
|
|
done := make(chan struct{})
|
|
|
|
cleanupTicker := time.NewTicker(ma.metricsCleanupInterval)
|
|
pproflabel.Go(ctx, pproflabel.Service(pproflabel.ServiceAgentMetricAggregator), func(ctx context.Context) {
|
|
defer close(done)
|
|
defer cleanupTicker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case req := <-ma.updateCh:
|
|
ma.log.Debug(ctx, "update metrics")
|
|
|
|
timer := prometheus.NewTimer(ma.updateHistogram)
|
|
for _, m := range req.metrics {
|
|
key := hashKey(&req, m)
|
|
|
|
if val, ok := ma.store[key]; ok {
|
|
val.Stats_Metric.Value = m.Value
|
|
val.expiryDate = req.timestamp.Add(ma.metricsCleanupInterval)
|
|
ma.store[key] = val
|
|
} else {
|
|
ma.store[key] = annotatedMetric{
|
|
Stats_Metric: m,
|
|
username: req.username,
|
|
workspaceName: req.workspaceName,
|
|
agentName: req.agentName,
|
|
templateName: req.templateName,
|
|
expiryDate: req.timestamp.Add(ma.metricsCleanupInterval),
|
|
}
|
|
}
|
|
}
|
|
timer.ObserveDuration()
|
|
|
|
ma.storeSizeGauge.Set(float64(len(ma.store)))
|
|
case outputCh := <-ma.collectCh:
|
|
ma.log.Debug(ctx, "collect metrics")
|
|
|
|
var input []annotatedMetric
|
|
output := make([]prometheus.Metric, 0, len(ma.store))
|
|
|
|
if len(ma.aggregateByLabels) == 0 {
|
|
ma.aggregateByLabels = agentmetrics.LabelAll
|
|
}
|
|
|
|
// If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation.
|
|
// This results in higher cardinality, but may be desirable in larger deployments.
|
|
//
|
|
// Default behavior.
|
|
if len(ma.aggregateByLabels) == len(agentmetrics.LabelAll) {
|
|
for _, m := range ma.store {
|
|
// Aggregate by all available metrics.
|
|
m.aggregateByLabels = defaultAgentMetricsLabels
|
|
input = append(input, m)
|
|
}
|
|
} else {
|
|
// However, if custom aggregations have been chosen, we need to aggregate the values from the annotated
|
|
// metrics because we cannot register multiple metric series with the same labels.
|
|
la := newLabelAggregator(len(ma.store))
|
|
|
|
for _, m := range ma.store {
|
|
if err := la.aggregate(m, ma.aggregateByLabels); err != nil {
|
|
ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err))
|
|
}
|
|
}
|
|
|
|
input = la.listMetrics()
|
|
}
|
|
|
|
for _, m := range input {
|
|
promMetric, err := ma.asPrometheus(&m)
|
|
if err != nil {
|
|
ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err))
|
|
continue
|
|
}
|
|
output = append(output, promMetric)
|
|
}
|
|
|
|
outputCh <- output
|
|
close(outputCh)
|
|
case <-cleanupTicker.C:
|
|
ma.log.Debug(ctx, "clean expired metrics")
|
|
|
|
timer := prometheus.NewTimer(ma.cleanupHistogram)
|
|
now := ma.clock.Now()
|
|
|
|
for key, val := range ma.store {
|
|
if now.After(val.expiryDate) {
|
|
delete(ma.store, key)
|
|
}
|
|
}
|
|
|
|
ma.cleanupDescCache()
|
|
|
|
timer.ObserveDuration()
|
|
cleanupTicker.Reset(ma.metricsCleanupInterval)
|
|
ma.storeSizeGauge.Set(float64(len(ma.store)))
|
|
|
|
case <-ctx.Done():
|
|
ma.log.Debug(ctx, "metrics aggregator is stopped")
|
|
return
|
|
}
|
|
}
|
|
})
|
|
return func() {
|
|
cancelFunc()
|
|
<-done
|
|
}
|
|
}
|
|
|
|
// Describe function does not have any knowledge about the metrics schema,
|
|
// so it does not emit anything.
|
|
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) {
|
|
}
|
|
|
|
// cacheKeyForDesc is used to determine the cache key for a set of labels/extra labels. Used with the aggregators description cache.
|
|
// for strings.Builder returned errors from these functions are always nil.
|
|
// nolint:revive
|
|
func cacheKeyForDesc(name string, baseLabelNames []string, extraLabels []*agentproto.Stats_Metric_Label) string {
|
|
var b strings.Builder
|
|
hint := len(name) + (len(baseLabelNames)+len(extraLabels))*8
|
|
b.Grow(hint)
|
|
b.WriteString(name)
|
|
for _, ln := range baseLabelNames {
|
|
b.WriteByte('|')
|
|
b.WriteString(ln)
|
|
}
|
|
for _, l := range extraLabels {
|
|
b.WriteByte('|')
|
|
b.WriteString(l.Name)
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
// getOrCreateDec checks if we already have a metric description in the aggregators cache for a given combination of base
|
|
// labels and extra labels. If we do not, we create a new description and cache it.
|
|
func (ma *MetricsAggregator) getOrCreateDesc(name string, help string, baseLabelNames []string, extraLabels []*agentproto.Stats_Metric_Label) *prometheus.Desc {
|
|
if ma.descCache == nil {
|
|
ma.descCache = make(map[string]descCacheEntry)
|
|
}
|
|
key := cacheKeyForDesc(name, baseLabelNames, extraLabels)
|
|
if d, ok := ma.descCache[key]; ok {
|
|
d.lastUsed = ma.clock.Now()
|
|
ma.descCache[key] = d
|
|
return d.desc
|
|
}
|
|
nBase := len(baseLabelNames)
|
|
nExtra := len(extraLabels)
|
|
labels := make([]string, nBase+nExtra)
|
|
copy(labels, baseLabelNames)
|
|
for i, l := range extraLabels {
|
|
labels[nBase+i] = l.Name
|
|
}
|
|
d := prometheus.NewDesc(name, help, labels, nil)
|
|
ma.descCache[key] = descCacheEntry{d, ma.clock.Now()}
|
|
return d
|
|
}
|
|
|
|
// asPrometheus returns the annotatedMetric as a prometheus.Metric, it preallocates/fills by index, uses the aggregators
|
|
// metric description cache, and a small stack buffer for values in order to reduce memory allocations.
|
|
func (ma *MetricsAggregator) asPrometheus(am *annotatedMetric) (prometheus.Metric, error) {
|
|
baseLabelNames := am.aggregateByLabels
|
|
extraLabels := am.Labels
|
|
|
|
nBase := len(baseLabelNames)
|
|
nExtra := len(extraLabels)
|
|
nTotal := nBase + nExtra
|
|
|
|
var scratch [16]string
|
|
var labelValues []string
|
|
if nTotal <= len(scratch) {
|
|
labelValues = scratch[:nTotal]
|
|
} else {
|
|
labelValues = make([]string, nTotal)
|
|
}
|
|
|
|
for i, label := range baseLabelNames {
|
|
val, err := am.getFieldByLabel(label)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
labelValues[i] = val
|
|
}
|
|
for i, l := range extraLabels {
|
|
labelValues[nBase+i] = l.Value
|
|
}
|
|
|
|
desc := ma.getOrCreateDesc(am.Name, metricHelpForAgent, baseLabelNames, extraLabels)
|
|
valueType, err := asPrometheusValueType(am.Type)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil
|
|
}
|
|
|
|
// defaultAgentMetricsLabels is the label set attached to every metric when
// metrics are collected without pre-aggregation.
var defaultAgentMetricsLabels = []string{
	agentmetrics.LabelUsername,
	agentmetrics.LabelWorkspaceName,
	agentmetrics.LabelAgentName,
	agentmetrics.LabelTemplateName,
}
|
|
|
|
// AgentMetricLabels are the labels used to decorate an agent's metrics.
// The fields should stay in sync with the labels listed in
// defaultAgentMetricsLabels.
type AgentMetricLabels struct {
	Username, WorkspaceName, AgentName, TemplateName string
}
|
|
|
|
func (ma *MetricsAggregator) Collect(ch chan<- prometheus.Metric) {
|
|
output := make(chan []prometheus.Metric, 1)
|
|
|
|
select {
|
|
case ma.collectCh <- output:
|
|
default:
|
|
ma.log.Error(context.Background(), "collect queue is full")
|
|
return
|
|
}
|
|
|
|
for s := range output {
|
|
for _, m := range s {
|
|
ch <- m
|
|
}
|
|
}
|
|
}
|
|
|
|
func (ma *MetricsAggregator) Update(ctx context.Context, labels AgentMetricLabels, metrics []*agentproto.Stats_Metric) {
|
|
select {
|
|
case ma.updateCh <- updateRequest{
|
|
username: labels.Username,
|
|
workspaceName: labels.WorkspaceName,
|
|
agentName: labels.AgentName,
|
|
templateName: labels.TemplateName,
|
|
metrics: metrics,
|
|
|
|
timestamp: ma.clock.Now(),
|
|
}:
|
|
case <-ctx.Done():
|
|
ma.log.Debug(ctx, "update request is canceled")
|
|
default:
|
|
ma.log.Error(ctx, "update queue is full")
|
|
}
|
|
}
|
|
|
|
// Move to a function for testability
|
|
func (ma *MetricsAggregator) cleanupDescCache() {
|
|
now := ma.clock.Now()
|
|
for key, entry := range ma.descCache {
|
|
if now.Sub(entry.lastUsed) > ma.metricsCleanupInterval {
|
|
delete(ma.descCache, key)
|
|
}
|
|
}
|
|
}
|
|
|
|
func asPrometheusValueType(metricType agentproto.Stats_Metric_Type) (prometheus.ValueType, error) {
|
|
switch metricType {
|
|
case agentproto.Stats_Metric_GAUGE:
|
|
return prometheus.GaugeValue, nil
|
|
case agentproto.Stats_Metric_COUNTER:
|
|
return prometheus.CounterValue, nil
|
|
default:
|
|
return -1, xerrors.Errorf("unsupported value type: %s", metricType)
|
|
}
|
|
}
|