diff --git a/coderd/coderd.go b/coderd/coderd.go index 39e2f2bc07..e05e174cec 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -99,6 +99,7 @@ import ( "github.com/coder/coder/v2/provisionersdk" "github.com/coder/coder/v2/site" "github.com/coder/coder/v2/tailnet" + "github.com/coder/coder/v2/tailnet/derpmetrics" "github.com/coder/quartz" "github.com/coder/serpent" ) @@ -899,17 +900,18 @@ func New(options *Options) *API { apiRateLimiter := httpmw.RateLimit(options.APIRateLimit, time.Minute) // Register DERP on expvar HTTP handler, which we serve below in the router, c.f. expvar.Handler() - // These are the metrics the DERP server exposes. - // TODO: export via prometheus expDERPOnce.Do(func() { // We need to do this via a global Once because expvar registry is global and panics if we // register multiple times. In production there is only one Coderd and one DERP server per // process, but in testing, we create multiple of both, so the Once protects us from // panicking. - if options.DERPServer != nil { + if options.DERPServer != nil && expvar.Get("derp") == nil { expvar.Publish("derp", api.DERPServer.ExpVar()) } }) + if options.PrometheusRegistry != nil && options.DERPServer != nil { + options.PrometheusRegistry.MustRegister(derpmetrics.NewDERPExpvarCollector(options.DERPServer)) + } cors := httpmw.Cors(options.DeploymentValues.Dangerous.AllowAllCors.Value()) prometheusMW := httpmw.Prometheus(options.PrometheusRegistry) diff --git a/coderd/coderd_test.go b/coderd/coderd_test.go index c77ddf50a5..49612b2b40 100644 --- a/coderd/coderd_test.go +++ b/coderd/coderd_test.go @@ -390,3 +390,29 @@ func TestCSRFExempt(t *testing.T) { require.NotContains(t, string(data), "CSRF") }) } + +func TestDERPMetrics(t *testing.T) { + t.Parallel() + + _, _, api := coderdtest.NewWithAPI(t, nil) + + require.NotNil(t, api.Options.DERPServer, "DERP server should be configured") + require.NotNil(t, api.Options.PrometheusRegistry, "Prometheus registry should be configured") + + // The registry is created internally by coderd. Gather from it + // to verify DERP metrics were registered during startup. + metrics, err := api.Options.PrometheusRegistry.Gather() + require.NoError(t, err) + + names := make(map[string]struct{}) + for _, m := range metrics { + names[m.GetName()] = struct{}{} + } + + assert.Contains(t, names, "coder_derp_server_connections", + "expected coder_derp_server_connections to be registered") + assert.Contains(t, names, "coder_derp_server_bytes_received_total", + "expected coder_derp_server_bytes_received_total to be registered") + assert.Contains(t, names, "coder_derp_server_packets_dropped_reason_total", + "expected coder_derp_server_packets_dropped_reason_total to be registered") +} diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index 2353b819ae..c9ab350b65 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -125,6 +125,31 @@ deployment. They will always be available from the agent. | `coder_aibridgeproxyd_inflight_mitm_requests` | gauge | Number of MITM requests currently being processed. | `provider` | | `coder_aibridgeproxyd_mitm_requests_total` | counter | Total number of MITM requests handled by the proxy. | `provider` | | `coder_aibridgeproxyd_mitm_responses_total` | counter | Total number of MITM responses by HTTP status code class. | `code` `provider` | +| `coder_derp_server_accepts_total` | counter | Total DERP connections accepted. | | +| `coder_derp_server_average_queue_duration_ms` | gauge | Average queue duration in milliseconds. | | +| `coder_derp_server_bytes_received_total` | counter | Total bytes received. | | +| `coder_derp_server_bytes_sent_total` | counter | Total bytes sent. | | +| `coder_derp_server_clients` | gauge | Total clients (local + remote). | | +| `coder_derp_server_clients_local` | gauge | Local clients. | | +| `coder_derp_server_clients_remote` | gauge | Remote (mesh) clients. | | +| `coder_derp_server_connections` | gauge | Current DERP connections. | | +| `coder_derp_server_got_ping_total` | counter | Total pings received. | | +| `coder_derp_server_home_connections` | gauge | Current home DERP connections. | | +| `coder_derp_server_home_moves_in_total` | counter | Total home moves in. | | +| `coder_derp_server_home_moves_out_total` | counter | Total home moves out. | | +| `coder_derp_server_packets_dropped_reason_total` | counter | Packets dropped by reason. | `reason` | +| `coder_derp_server_packets_dropped_total` | counter | Total packets dropped. | | +| `coder_derp_server_packets_dropped_type_total` | counter | Packets dropped by type. | `type` | +| `coder_derp_server_packets_forwarded_in_total` | counter | Total packets forwarded in from mesh peers. | | +| `coder_derp_server_packets_forwarded_out_total` | counter | Total packets forwarded out to mesh peers. | | +| `coder_derp_server_packets_received_kind_total` | counter | Packets received by kind. | `kind` | +| `coder_derp_server_packets_received_total` | counter | Total packets received. | | +| `coder_derp_server_packets_sent_total` | counter | Total packets sent. | | +| `coder_derp_server_peer_gone_disconnected_total` | counter | Total peer gone (disconnected) frames sent. | | +| `coder_derp_server_peer_gone_not_here_total` | counter | Total peer gone (not here) frames sent. | | +| `coder_derp_server_sent_pong_total` | counter | Total pongs sent. | | +| `coder_derp_server_unknown_frames_total` | counter | Total unknown frames received. | | +| `coder_derp_server_watchers` | gauge | Current watchers. | | | `coder_pubsub_connected` | gauge | Whether we are connected (1) or not connected (0) to postgres | | | `coder_pubsub_current_events` | gauge | The current number of pubsub event channels listened for | | | `coder_pubsub_current_subscribers` | gauge | The current number of active pubsub subscribers | | diff --git a/enterprise/wsproxy/wsproxy.go b/enterprise/wsproxy/wsproxy.go index 2b033115f5..4359213d4e 100644 --- a/enterprise/wsproxy/wsproxy.go +++ b/enterprise/wsproxy/wsproxy.go @@ -4,6 +4,7 @@ import ( "context" "crypto/tls" "errors" + "expvar" "fmt" "net/http" "net/url" @@ -42,8 +43,14 @@ import ( sharedhttpmw "github.com/coder/coder/v2/httpmw" "github.com/coder/coder/v2/site" "github.com/coder/coder/v2/tailnet" + "github.com/coder/coder/v2/tailnet/derpmetrics" ) +// expDERPOnce guards the global expvar.Publish call for the DERP server. +// expvar panics on duplicate registration, and tests may create multiple +// servers in the same process. +var expDERPOnce sync.Once + type Options struct { Logger slog.Logger Experiments codersdk.Experiments @@ -196,6 +203,17 @@ func New(ctx context.Context, opts *Options) (*Server, error) { return nil, xerrors.Errorf("create DERP mesh tls config: %w", err) } derpServer := derp.NewServer(key.NewNode(), tailnet.Logger(opts.Logger.Named("net.derp"))) + // Publish DERP stats to expvar, available via the pprof + // debug server (--pprof-enable) at /debug/vars. This avoids + // exposing expvar on the public HTTP router. + expDERPOnce.Do(func() { + if expvar.Get("derp") == nil { + expvar.Publish("derp", derpServer.ExpVar()) + } + }) + if opts.PrometheusRegistry != nil { + opts.PrometheusRegistry.MustRegister(derpmetrics.NewDERPExpvarCollector(derpServer)) + } ctx, cancel := context.WithCancel(context.Background()) diff --git a/enterprise/wsproxy/wsproxy_test.go b/enterprise/wsproxy/wsproxy_test.go index 3cb51e320c..8115e4ae15 100644 --- a/enterprise/wsproxy/wsproxy_test.go +++ b/enterprise/wsproxy/wsproxy_test.go @@ -1223,3 +1223,55 @@ func createProxyReplicas(ctx context.Context, t *testing.T, opts *createProxyRep return proxies } + +func TestWorkspaceProxyDERPMetrics(t *testing.T) { + t.Parallel() + + deploymentValues := coderdtest.DeploymentValues(t) + deploymentValues.Experiments = []string{"*"} + + client, closer, api, _ := coderdenttest.NewWithAPI(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + DeploymentValues: deploymentValues, + AppHostname: "*.primary.test.coder.com", + IncludeProvisionerDaemon: true, + RealIPConfig: &httpmw.RealIPConfig{ + TrustedOrigins: []*net.IPNet{{ + IP: net.ParseIP("127.0.0.1"), + Mask: net.CIDRMask(8, 32), + }}, + TrustedHeaders: []string{ + "CF-Connecting-IP", + }, + }, + }, + LicenseOptions: &coderdenttest.LicenseOptions{ + Features: license.Features{ + codersdk.FeatureWorkspaceProxy: 1, + }, + }, + }) + t.Cleanup(func() { + _ = closer.Close() + }) + + proxy := coderdenttest.NewWorkspaceProxyReplica(t, api, client, &coderdenttest.ProxyOptions{ + Name: "metrics-test-proxy", + }) + + // Gather metrics from the wsproxy's Prometheus registry. + metrics, err := proxy.PrometheusRegistry.Gather() + require.NoError(t, err) + + names := make(map[string]struct{}) + for _, m := range metrics { + names[m.GetName()] = struct{}{} + } + + assert.Contains(t, names, "coder_derp_server_connections", + "expected coder_derp_server_connections to be registered") + assert.Contains(t, names, "coder_derp_server_bytes_received_total", + "expected coder_derp_server_bytes_received_total to be registered") + assert.Contains(t, names, "coder_derp_server_packets_dropped_reason_total", + "expected coder_derp_server_packets_dropped_reason_total to be registered") +} diff --git a/go.mod b/go.mod index beef7a97c3..81a0ed70ce 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ replace github.com/tcnksm/go-httpstat => github.com/coder/go-httpstat v0.0.0-202 // There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here: // https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main -replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20250829055706-6eafe0f9199e +replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20260306035934-af5c6fc52433 // This is replaced to include // 1. a fix for a data race: c.f. https://github.com/tailscale/wireguard-go/pull/25 @@ -115,7 +115,7 @@ require ( github.com/coder/wgtunnel v0.2.0 github.com/coreos/go-oidc/v3 v3.17.0 github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf - github.com/creack/pty v1.1.21 + github.com/creack/pty v1.1.24 github.com/dave/dst v0.27.2 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc github.com/dblohm7/wingoes v0.0.0-20240820181039-f2b84150679e @@ -289,9 +289,8 @@ require ( github.com/containerd/continuity v0.4.5 // indirect github.com/coreos/go-iptables v0.6.0 // indirect github.com/dlclark/regexp2 v1.11.5 // indirect - github.com/docker/cli v28.3.2+incompatible // indirect - github.com/docker/docker v28.3.3+incompatible // indirect - github.com/docker/go-connections v0.5.0 // indirect + github.com/docker/cli v29.2.0+incompatible // indirect + github.com/docker/go-connections v0.6.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/dop251/goja v0.0.0-20241024094426-79f3a7efcdbd // indirect github.com/dustin/go-humanize v1.0.1 @@ -537,8 +536,11 @@ require ( github.com/clipperhouse/uax29/v2 v2.5.0 // indirect github.com/cncf/xds/go v0.0.0-20260202195803-dba9d589def2 // indirect github.com/coder/paralleltestctx v0.0.1 // indirect + github.com/containerd/errdefs v1.0.0 // indirect + github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/daixiang0/gci v0.13.7 // indirect + github.com/distribution/reference v0.6.0 // indirect github.com/envoyproxy/go-control-plane/envoy v1.37.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.3.3 // indirect github.com/esiqveland/notify v0.13.3 // indirect @@ -562,6 +564,8 @@ require ( github.com/klauspost/cpuid/v2 v2.2.10 // indirect github.com/landlock-lsm/go-landlock v0.0.0-20251103212306-430f8e5cd97c // indirect github.com/mattn/go-shellwords v1.0.12 // indirect + github.com/moby/moby/api v1.54.0 // indirect + github.com/moby/moby/client v0.3.0 // indirect github.com/moby/sys/user v0.4.0 // indirect github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect github.com/openai/openai-go v1.12.0 // indirect @@ -592,6 +596,7 @@ require ( go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.65.0 // indirect go.opentelemetry.io/otel/sdk/metric v1.40.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect go.yaml.in/yaml/v4 v4.0.0-rc.3 // indirect golang.org/x/telemetry v0.0.0-20260209163413-e7419c687ee4 // indirect google.golang.org/genai v1.47.0 // indirect diff --git a/go.sum b/go.sum index 5b322e14a5..4e256ac3e3 100644 --- a/go.sum +++ b/go.sum @@ -347,8 +347,8 @@ github.com/coder/serpent v0.14.0 h1:g7vt2zBMp3nWyAvyhvQduaI53Ku65U3wITMi01+/8pU= github.com/coder/serpent v0.14.0/go.mod h1:7OIvFBYMd+OqarMy5einBl8AtRr8LliopVU7pyrwucY= github.com/coder/ssh v0.0.0-20231128192721-70855dedb788 h1:YoUSJ19E8AtuUFVYBpXuOD6a/zVP3rcxezNsoDseTUw= github.com/coder/ssh v0.0.0-20231128192721-70855dedb788/go.mod h1:aGQbuCLyhRLMzZF067xc84Lh7JDs1FKwCmF1Crl9dxQ= -github.com/coder/tailscale v1.1.1-0.20250829055706-6eafe0f9199e h1:9RKGKzGLHtTvVBQublzDGtCtal3cXP13diCHoAIGPeI= -github.com/coder/tailscale v1.1.1-0.20250829055706-6eafe0f9199e/go.mod h1:jU9T1vEs+DOs8NtGp1F2PT0/TOGVwtg/JCCKYRgvMOs= +github.com/coder/tailscale v1.1.1-0.20260306035934-af5c6fc52433 h1:NxqWSEZFuCeIR/N7lZ9cx+434urbNvrrA7ZyNPTwnmc= +github.com/coder/tailscale v1.1.1-0.20260306035934-af5c6fc52433/go.mod h1:q+R4UL4pPb0CpaSNVUTDsg0kZeL/OlqjRNO9XbJxU5g= github.com/coder/terraform-config-inspect v0.0.0-20250107175719-6d06d90c630e h1:JNLPDi2P73laR1oAclY6jWzAbucf70ASAvf5mh2cME0= github.com/coder/terraform-config-inspect v0.0.0-20250107175719-6d06d90c630e/go.mod h1:Gz/z9Hbn+4KSp8A2FBtNszfLSdT2Tn/uAKGuVqqWmDI= github.com/coder/terraform-provider-coder/v2 v2.13.1 h1:dtPaJUvueFm+XwBPUMWQCc5Z1QUQBW4B4RNyzX4h4y8= @@ -382,8 +382,8 @@ github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHf github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= -github.com/creack/pty v1.1.21 h1:1/QdRyBaHHJP61QkWMXlOIBfsgdDeeKfK8SYVUWJKf0= -github.com/creack/pty v1.1.21/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/cyphar/filepath-securejoin v0.5.1 h1:eYgfMq5yryL4fbWfkLpFFy2ukSELzaJOTaUTuh+oF48= github.com/cyphar/filepath-securejoin v0.5.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/daixiang0/gci v0.13.7 h1:+0bG5eK9vlI08J+J/NWGbWPTNiXPG4WhNLJOkSxWITQ= @@ -420,12 +420,12 @@ github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZ github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI= github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ= -github.com/docker/cli v28.3.2+incompatible h1:mOt9fcLE7zaACbxW1GeS65RI67wIJrTnqS3hP2huFsY= -github.com/docker/cli v28.3.2+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= +github.com/docker/cli v29.2.0+incompatible h1:9oBd9+YM7rxjZLfyMGxjraKBKE4/nVyvVfN4qNl9XRM= +github.com/docker/cli v29.2.0+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= +github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= +github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/dop251/goja v0.0.0-20241024094426-79f3a7efcdbd h1:QMSNEh9uQkDjyPwu/J541GgSH+4hw+0skJDIj9HJ3mE= @@ -872,6 +872,10 @@ github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3N github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= github.com/moby/go-archive v0.1.0 h1:Kk/5rdW/g+H8NHdJW2gsXyZ7UnzvJNOy6VKJqueWdcQ= github.com/moby/go-archive v0.1.0/go.mod h1:G9B+YoujNohJmrIYFBpSd54GTUB4lt9S+xVQvsJyFuo= +github.com/moby/moby/api v1.54.0 h1:7kbUgyiKcoBhm0UrWbdrMs7RX8dnwzURKVbZGy2GnL0= +github.com/moby/moby/api v1.54.0/go.mod h1:8mb+ReTlisw4pS6BRzCMts5M49W5M7bKt1cJy/YbAqc= +github.com/moby/moby/client v0.3.0 h1:UUGL5okry+Aomj3WhGt9Aigl3ZOxZGqR7XPo+RLPlKs= +github.com/moby/moby/client v0.3.0/go.mod h1:HJgFbJRvogDQjbM8fqc1MCEm4mIAGMLjXbgwoZp6jCQ= github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk= github.com/moby/patternmatcher v0.6.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= @@ -1525,6 +1529,8 @@ kernel.org/pub/linux/libs/security/libcap/psx v1.2.77 h1:Z06sMOzc0GNCwp6efaVrIrz kernel.org/pub/linux/libs/security/libcap/psx v1.2.77/go.mod h1:+l6Ee2F59XiJ2I6WR5ObpC1utCQJZ/VLsEbQCD8RG24= mvdan.cc/gofumpt v0.8.0 h1:nZUCeC2ViFaerTcYKstMmfysj6uhQrA2vJe+2vwGU6k= mvdan.cc/gofumpt v0.8.0/go.mod h1:vEYnSzyGPmjvFkqJWtXkh79UwPWP9/HMxQdGEXZHjpg= +pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= +pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/qr v0.2.0 h1:6vBLea5/NRMVTz8V66gipeLycZMl/+UlFmk8DvqQ6WY= rsc.io/qr v0.2.0/go.mod h1:IF+uZjkb9fqyeF/4tlBoynqmQxUoPfWEKh921coOuXs= sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= diff --git a/scripts/metricsdocgen/generated_metrics b/scripts/metricsdocgen/generated_metrics index ce024a0a66..fae3de129a 100644 --- a/scripts/metricsdocgen/generated_metrics +++ b/scripts/metricsdocgen/generated_metrics @@ -7,6 +7,81 @@ agent_boundary_log_proxy_batches_forwarded_total 0 # HELP agent_boundary_log_proxy_logs_dropped_total Total number of individual boundary log entries dropped before reaching coderd. Reason: buffer_full = the agent's internal buffer is full; forward_failed = the agent failed to send the batch to coderd; boundary_channel_full = boundary's internal send channel overflowed, meaning boundary is generating logs faster than it can batch and send them; boundary_batch_full = boundary's outgoing batch buffer overflowed after a failed flush, meaning boundary could not write to the agent's socket. # TYPE agent_boundary_log_proxy_logs_dropped_total counter agent_boundary_log_proxy_logs_dropped_total{reason=""} 0 +# HELP coder_derp_server_accepts_total Total DERP connections accepted. +# TYPE coder_derp_server_accepts_total counter +coder_derp_server_accepts_total 0 +# HELP coder_derp_server_average_queue_duration_ms Average queue duration in milliseconds. +# TYPE coder_derp_server_average_queue_duration_ms gauge +coder_derp_server_average_queue_duration_ms 0 +# HELP coder_derp_server_bytes_received_total Total bytes received. +# TYPE coder_derp_server_bytes_received_total counter +coder_derp_server_bytes_received_total 0 +# HELP coder_derp_server_bytes_sent_total Total bytes sent. +# TYPE coder_derp_server_bytes_sent_total counter +coder_derp_server_bytes_sent_total 0 +# HELP coder_derp_server_clients Total clients (local + remote). +# TYPE coder_derp_server_clients gauge +coder_derp_server_clients 0 +# HELP coder_derp_server_clients_local Local clients. +# TYPE coder_derp_server_clients_local gauge +coder_derp_server_clients_local 0 +# HELP coder_derp_server_clients_remote Remote (mesh) clients. +# TYPE coder_derp_server_clients_remote gauge +coder_derp_server_clients_remote 0 +# HELP coder_derp_server_connections Current DERP connections. +# TYPE coder_derp_server_connections gauge +coder_derp_server_connections 0 +# HELP coder_derp_server_got_ping_total Total pings received. +# TYPE coder_derp_server_got_ping_total counter +coder_derp_server_got_ping_total 0 +# HELP coder_derp_server_home_connections Current home DERP connections. +# TYPE coder_derp_server_home_connections gauge +coder_derp_server_home_connections 0 +# HELP coder_derp_server_home_moves_in_total Total home moves in. +# TYPE coder_derp_server_home_moves_in_total counter +coder_derp_server_home_moves_in_total 0 +# HELP coder_derp_server_home_moves_out_total Total home moves out. +# TYPE coder_derp_server_home_moves_out_total counter +coder_derp_server_home_moves_out_total 0 +# HELP coder_derp_server_packets_dropped_reason_total Packets dropped by reason. +# TYPE coder_derp_server_packets_dropped_reason_total counter +coder_derp_server_packets_dropped_reason_total{reason=""} 0 +# HELP coder_derp_server_packets_dropped_total Total packets dropped. +# TYPE coder_derp_server_packets_dropped_total counter +coder_derp_server_packets_dropped_total 0 +# HELP coder_derp_server_packets_dropped_type_total Packets dropped by type. +# TYPE coder_derp_server_packets_dropped_type_total counter +coder_derp_server_packets_dropped_type_total{type=""} 0 +# HELP coder_derp_server_packets_forwarded_in_total Total packets forwarded in from mesh peers. +# TYPE coder_derp_server_packets_forwarded_in_total counter +coder_derp_server_packets_forwarded_in_total 0 +# HELP coder_derp_server_packets_forwarded_out_total Total packets forwarded out to mesh peers. +# TYPE coder_derp_server_packets_forwarded_out_total counter +coder_derp_server_packets_forwarded_out_total 0 +# HELP coder_derp_server_packets_received_kind_total Packets received by kind. +# TYPE coder_derp_server_packets_received_kind_total counter +coder_derp_server_packets_received_kind_total{kind=""} 0 +# HELP coder_derp_server_packets_received_total Total packets received. +# TYPE coder_derp_server_packets_received_total counter +coder_derp_server_packets_received_total 0 +# HELP coder_derp_server_packets_sent_total Total packets sent. +# TYPE coder_derp_server_packets_sent_total counter +coder_derp_server_packets_sent_total 0 +# HELP coder_derp_server_peer_gone_disconnected_total Total peer gone (disconnected) frames sent. +# TYPE coder_derp_server_peer_gone_disconnected_total counter +coder_derp_server_peer_gone_disconnected_total 0 +# HELP coder_derp_server_peer_gone_not_here_total Total peer gone (not here) frames sent. +# TYPE coder_derp_server_peer_gone_not_here_total counter +coder_derp_server_peer_gone_not_here_total 0 +# HELP coder_derp_server_sent_pong_total Total pongs sent. +# TYPE coder_derp_server_sent_pong_total counter +coder_derp_server_sent_pong_total 0 +# HELP coder_derp_server_unknown_frames_total Total unknown frames received. +# TYPE coder_derp_server_unknown_frames_total counter +coder_derp_server_unknown_frames_total 0 +# HELP coder_derp_server_watchers Current watchers. +# TYPE coder_derp_server_watchers gauge +coder_derp_server_watchers 0 # HELP coder_pubsub_connected Whether we are connected (1) or not connected (0) to postgres # TYPE coder_pubsub_connected gauge coder_pubsub_connected 0 diff --git a/scripts/metricsdocgen/scanner/scanner.go b/scripts/metricsdocgen/scanner/scanner.go index 551220b7ff..e38da99876 100644 --- a/scripts/metricsdocgen/scanner/scanner.go +++ b/scripts/metricsdocgen/scanner/scanner.go @@ -30,6 +30,7 @@ var scanDirs = []string{ "coderd", "enterprise", "provisionerd", + "tailnet", } // skipPaths lists files that should be excluded from scanning. Their metrics diff --git a/tailnet/derpmetrics/metrics.go b/tailnet/derpmetrics/metrics.go new file mode 100644 index 0000000000..91c5cd1086 --- /dev/null +++ b/tailnet/derpmetrics/metrics.go @@ -0,0 +1,214 @@ +package derpmetrics + +import ( + "expvar" + "strconv" + + "github.com/prometheus/client_golang/prometheus" + "tailscale.com/derp" +) + +// DERPExpvarCollector exports a DERP server's expvar stats as +// properly typed Prometheus metrics. +type DERPExpvarCollector struct { + server *derp.Server + + // Counters. + accepts *prometheus.Desc + bytesReceived *prometheus.Desc + bytesSent *prometheus.Desc + packetsReceived *prometheus.Desc + packetsSent *prometheus.Desc + packetsDropped *prometheus.Desc + packetsForwardedIn *prometheus.Desc + packetsForwardedOut *prometheus.Desc + homeMovesIn *prometheus.Desc + homeMovesOut *prometheus.Desc + gotPing *prometheus.Desc + sentPong *prometheus.Desc + peerGoneDisconnected *prometheus.Desc + peerGoneNotHere *prometheus.Desc + unknownFrames *prometheus.Desc + + // Labeled counters. + packetsDroppedByReason *prometheus.Desc + packetsDroppedByType *prometheus.Desc + packetsReceivedByKind *prometheus.Desc + + // Gauges. + connections *prometheus.Desc + homeConnections *prometheus.Desc + clientsTotal *prometheus.Desc + clientsLocal *prometheus.Desc + clientsRemote *prometheus.Desc + watchers *prometheus.Desc + avgQueueDurMS *prometheus.Desc +} + +// NewDERPExpvarCollector creates a Prometheus collector that reads +// stats from a DERP server's expvar on each scrape. +func NewDERPExpvarCollector(server *derp.Server) *DERPExpvarCollector { + return &DERPExpvarCollector{ + server: server, + + accepts: prometheus.NewDesc("coder_derp_server_accepts_total", "Total DERP connections accepted.", nil, nil), + bytesReceived: prometheus.NewDesc("coder_derp_server_bytes_received_total", "Total bytes received.", nil, nil), + bytesSent: prometheus.NewDesc("coder_derp_server_bytes_sent_total", "Total bytes sent.", nil, nil), + packetsReceived: prometheus.NewDesc("coder_derp_server_packets_received_total", "Total packets received.", nil, nil), + packetsSent: prometheus.NewDesc("coder_derp_server_packets_sent_total", "Total packets sent.", nil, nil), + packetsDropped: prometheus.NewDesc("coder_derp_server_packets_dropped_total", "Total packets dropped.", nil, nil), + packetsForwardedIn: prometheus.NewDesc("coder_derp_server_packets_forwarded_in_total", "Total packets forwarded in from mesh peers.", nil, nil), + packetsForwardedOut: prometheus.NewDesc("coder_derp_server_packets_forwarded_out_total", "Total packets forwarded out to mesh peers.", nil, nil), + homeMovesIn: prometheus.NewDesc("coder_derp_server_home_moves_in_total", "Total home moves in.", nil, nil), + homeMovesOut: prometheus.NewDesc("coder_derp_server_home_moves_out_total", "Total home moves out.", nil, nil), + gotPing: prometheus.NewDesc("coder_derp_server_got_ping_total", "Total pings received.", nil, nil), + sentPong: prometheus.NewDesc("coder_derp_server_sent_pong_total", "Total pongs sent.", nil, nil), + peerGoneDisconnected: prometheus.NewDesc("coder_derp_server_peer_gone_disconnected_total", "Total peer gone (disconnected) frames sent.", nil, nil), + peerGoneNotHere: prometheus.NewDesc("coder_derp_server_peer_gone_not_here_total", "Total peer gone (not here) frames sent.", nil, nil), + unknownFrames: prometheus.NewDesc("coder_derp_server_unknown_frames_total", "Total unknown frames received.", nil, nil), + + packetsDroppedByReason: prometheus.NewDesc("coder_derp_server_packets_dropped_reason_total", "Packets dropped by reason.", []string{"reason"}, nil), + packetsDroppedByType: prometheus.NewDesc("coder_derp_server_packets_dropped_type_total", "Packets dropped by type.", []string{"type"}, nil), + packetsReceivedByKind: prometheus.NewDesc("coder_derp_server_packets_received_kind_total", "Packets received by kind.", []string{"kind"}, nil), + + connections: prometheus.NewDesc("coder_derp_server_connections", "Current DERP connections.", nil, nil), + homeConnections: prometheus.NewDesc("coder_derp_server_home_connections", "Current home DERP connections.", nil, nil), + clientsTotal: prometheus.NewDesc("coder_derp_server_clients", "Total clients (local + remote).", nil, nil), + clientsLocal: prometheus.NewDesc("coder_derp_server_clients_local", "Local clients.", nil, nil), + clientsRemote: prometheus.NewDesc("coder_derp_server_clients_remote", "Remote (mesh) clients.", nil, nil), + watchers: prometheus.NewDesc("coder_derp_server_watchers", "Current watchers.", nil, nil), + avgQueueDurMS: prometheus.NewDesc("coder_derp_server_average_queue_duration_ms", "Average queue duration in milliseconds.", nil, nil), + } +} + +func (c *DERPExpvarCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.accepts + ch <- c.bytesReceived + ch <- c.bytesSent + ch <- c.packetsReceived + ch <- c.packetsSent + ch <- c.packetsDropped + ch <- c.packetsForwardedIn + ch <- c.packetsForwardedOut + ch <- c.homeMovesIn + ch <- c.homeMovesOut + ch <- c.gotPing + ch <- c.sentPong + ch <- c.peerGoneDisconnected + ch <- c.peerGoneNotHere + ch <- c.unknownFrames + ch <- c.packetsDroppedByReason + ch <- c.packetsDroppedByType + ch <- c.packetsReceivedByKind + ch <- c.connections + ch <- c.homeConnections + ch <- c.clientsTotal + ch <- c.clientsLocal + ch <- c.clientsRemote + ch <- c.watchers + ch <- c.avgQueueDurMS +} + +// Collect reads the DERP server's expvar stats and emits them as +// Prometheus metrics. Called on each /metrics scrape. +func (c *DERPExpvarCollector) Collect(ch chan<- prometheus.Metric) { + vars, ok := c.server.ExpVar().(interface { + Do(func(expvar.KeyValue)) + }) + if !ok { + return + } + + vars.Do(func(kv expvar.KeyValue) { + switch kv.Key { + case "accepts": + emitCounter(ch, c.accepts, kv.Value) + case "bytes_received": + emitCounter(ch, c.bytesReceived, kv.Value) + case "bytes_sent": + emitCounter(ch, c.bytesSent, kv.Value) + case "packets_received": + emitCounter(ch, c.packetsReceived, kv.Value) + case "packets_sent": + emitCounter(ch, c.packetsSent, kv.Value) + case "packets_dropped": + emitCounter(ch, c.packetsDropped, kv.Value) + case "packets_forwarded_in": + emitCounter(ch, c.packetsForwardedIn, kv.Value) + case "packets_forwarded_out": + emitCounter(ch, c.packetsForwardedOut, kv.Value) + case "home_moves_in": + emitCounter(ch, c.homeMovesIn, kv.Value) + case "home_moves_out": + emitCounter(ch, c.homeMovesOut, kv.Value) + case "got_ping": + emitCounter(ch, c.gotPing, kv.Value) + case "sent_pong": + emitCounter(ch, c.sentPong, kv.Value) + case "peer_gone_disconnected_frames": + emitCounter(ch, c.peerGoneDisconnected, kv.Value) + case "peer_gone_not_here_frames": + emitCounter(ch, c.peerGoneNotHere, kv.Value) + case "unknown_frames": + emitCounter(ch, c.unknownFrames, kv.Value) + + case "counter_packets_dropped_reason": + emitLabeledCounters(ch, c.packetsDroppedByReason, kv.Value) + case "counter_packets_dropped_type": + emitLabeledCounters(ch, c.packetsDroppedByType, kv.Value) + case "counter_packets_received_kind": + emitLabeledCounters(ch, c.packetsReceivedByKind, kv.Value) + + case "gauge_current_connections": + emitGauge(ch, c.connections, kv.Value) + case "gauge_current_home_connections": + emitGauge(ch, c.homeConnections, kv.Value) + case "gauge_clients_total": + emitGauge(ch, c.clientsTotal, kv.Value) + case "gauge_clients_local": + emitGauge(ch, c.clientsLocal, kv.Value) + case "gauge_clients_remote": + emitGauge(ch, c.clientsRemote, kv.Value) + case "gauge_watchers": + emitGauge(ch, c.watchers, kv.Value) + case "average_queue_duration_ms": + emitGauge(ch, c.avgQueueDurMS, kv.Value) + } + }) +} + +func emitCounter(ch chan<- prometheus.Metric, desc *prometheus.Desc, v expvar.Var) { + if f, ok := parseExpvarFloat(v); ok { + ch <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, f) + } +} + +func emitGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, v expvar.Var) { + if f, ok := parseExpvarFloat(v); ok { + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, f) + } +} + +func emitLabeledCounters(ch chan<- prometheus.Metric, desc *prometheus.Desc, v expvar.Var) { + sub, ok := v.(interface{ Do(func(expvar.KeyValue)) }) + if !ok { + return + } + sub.Do(func(kv expvar.KeyValue) { + if f, ok := parseExpvarFloat(kv.Value); ok { + ch <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, f, kv.Key) + } + }) +} + +func parseExpvarFloat(v expvar.Var) (float64, bool) { + switch val := v.(type) { + case *expvar.Int: + return float64(val.Value()), true + case *expvar.Float: + return val.Value(), true + default: + f, err := strconv.ParseFloat(v.String(), 64) + return f, err == nil + } +} diff --git a/tailnet/derpmetrics/metrics_test.go b/tailnet/derpmetrics/metrics_test.go new file mode 100644 index 0000000000..a4f63217c6 --- /dev/null +++ b/tailnet/derpmetrics/metrics_test.go @@ -0,0 +1,177 @@ +package derpmetrics_test + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + ptestutil "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "tailscale.com/derp" + "tailscale.com/types/key" + + "github.com/coder/coder/v2/tailnet/derpmetrics" +) + +func TestDERPExpvarCollector(t *testing.T) { + t.Parallel() + + t.Run("RegistersAndCollects", func(t *testing.T) { + t.Parallel() + + server := derp.NewServer(key.NewNode(), func(format string, args ...any) {}) + defer server.Close() + + reg := prometheus.NewRegistry() + collector := derpmetrics.NewDERPExpvarCollector(server) + require.NoError(t, reg.Register(collector)) + + // Verify we can gather without error. + metrics, err := reg.Gather() + require.NoError(t, err) + require.NotEmpty(t, metrics, "expected at least one metric family") + + // Verify expected metric names are present. + names := make(map[string]struct{}) + for _, m := range metrics { + names[m.GetName()] = struct{}{} + } + + expectedCounters := []string{ + "coder_derp_server_accepts_total", + "coder_derp_server_bytes_received_total", + "coder_derp_server_bytes_sent_total", + "coder_derp_server_packets_received_total", + "coder_derp_server_packets_sent_total", + "coder_derp_server_packets_dropped_total", + "coder_derp_server_packets_forwarded_in_total", + "coder_derp_server_packets_forwarded_out_total", + "coder_derp_server_home_moves_in_total", + "coder_derp_server_home_moves_out_total", + "coder_derp_server_got_ping_total", + "coder_derp_server_sent_pong_total", + "coder_derp_server_peer_gone_disconnected_total", + "coder_derp_server_peer_gone_not_here_total", + "coder_derp_server_unknown_frames_total", + } + expectedGauges := []string{ + "coder_derp_server_connections", + "coder_derp_server_home_connections", + "coder_derp_server_clients", + "coder_derp_server_clients_local", + "coder_derp_server_clients_remote", + "coder_derp_server_watchers", + "coder_derp_server_average_queue_duration_ms", + } + expectedLabeled := []string{ + "coder_derp_server_packets_dropped_reason_total", + "coder_derp_server_packets_dropped_type_total", + "coder_derp_server_packets_received_kind_total", + } + + for _, name := range expectedCounters { + assert.Contains(t, names, name, "missing counter %s", name) + } + for _, name := range expectedGauges { + assert.Contains(t, names, name, "missing gauge %s", name) + } + for _, name := range expectedLabeled { + assert.Contains(t, names, name, "missing labeled counter %s", name) + } + }) + + t.Run("CounterTypes", func(t *testing.T) { + t.Parallel() + + server := derp.NewServer(key.NewNode(), func(format string, args ...any) {}) + defer server.Close() + + reg := prometheus.NewRegistry() + collector := derpmetrics.NewDERPExpvarCollector(server) + require.NoError(t, reg.Register(collector)) + + // Counters should report as counter type. + count := ptestutil.CollectAndCount(collector) + assert.Greater(t, count, 0, "expected metrics to be collected") + + // Verify a known counter starts at zero. + metrics, err := reg.Gather() + require.NoError(t, err) + for _, m := range metrics { + if m.GetName() == "coder_derp_server_bytes_received_total" { + require.Len(t, m.GetMetric(), 1) + assert.Equal(t, float64(0), m.GetMetric()[0].GetCounter().GetValue()) + return + } + } + t.Fatal("coder_derp_server_bytes_received_total not found") + }) + + t.Run("GaugeTypes", func(t *testing.T) { + t.Parallel() + + server := derp.NewServer(key.NewNode(), func(format string, args ...any) {}) + defer server.Close() + + reg := prometheus.NewRegistry() + collector := derpmetrics.NewDERPExpvarCollector(server) + require.NoError(t, reg.Register(collector)) + + metrics, err := reg.Gather() + require.NoError(t, err) + for _, m := range metrics { + if m.GetName() == "coder_derp_server_connections" { + require.Len(t, m.GetMetric(), 1) + // Gauge type check — GetGauge should be non-nil. + assert.NotNil(t, m.GetMetric()[0].GetGauge()) + assert.Equal(t, float64(0), m.GetMetric()[0].GetGauge().GetValue()) + return + } + } + t.Fatal("coder_derp_server_connections not found") + }) + + t.Run("LabeledCounters", func(t *testing.T) { + t.Parallel() + + server := derp.NewServer(key.NewNode(), func(format string, args ...any) {}) + defer server.Close() + + reg := prometheus.NewRegistry() + collector := derpmetrics.NewDERPExpvarCollector(server) + require.NoError(t, reg.Register(collector)) + + metrics, err := reg.Gather() + require.NoError(t, err) + + for _, m := range metrics { + if m.GetName() == "coder_derp_server_packets_dropped_reason_total" { + // Should have labeled sub-metrics (one per reason). + require.NotEmpty(t, m.GetMetric(), "expected labeled metrics for drop reasons") + // Each metric should have a "reason" label. + for _, metric := range m.GetMetric() { + labels := metric.GetLabel() + require.Len(t, labels, 1) + assert.Equal(t, "reason", labels[0].GetName()) + } + return + } + } + t.Fatal("coder_derp_server_packets_dropped_reason_total not found") + }) + + t.Run("NoDuplicateRegistration", func(t *testing.T) { + t.Parallel() + + server := derp.NewServer(key.NewNode(), func(format string, args ...any) {}) + defer server.Close() + + reg := prometheus.NewRegistry() + c1 := derpmetrics.NewDERPExpvarCollector(server) + require.NoError(t, reg.Register(c1)) + + c2 := derpmetrics.NewDERPExpvarCollector(server) + err := reg.Register(c2) + assert.Error(t, err, "registering a second collector should fail") + }) +}