feat: support bundle updates to enable pprof and telemetry collection (#21486)

- Adds pprof collection support now that we have the listeners
automatically starting (requires Coder server 2.28.0+, includes a
version check). Collects heap, allocs, profile (30s), block, mutex,
goroutine, threadcreate, trace (30s), cmdline, symbol. Performs capture
for 30 seconds and emits a log line stating as such. Enable capture by
supplying the `--pprof` flag or `CODER_SUPPORT_BUNDLE_PPROF` env var.
Collection of pprof data from both coderd and the Coder agent occurs.
- Adds collection of Prometheus metrics, also requires 2.28.0+
- Adds the ability to include a template in the bundle independently of
supplying the details of a running workspace by supplying the
`--template` flag or `CODER_SUPPORT_BUNDLE_TEMPLATE` env var
- Captures a list of workspaces the user has access to. Defaults to a
max of 10, configurable via `--workspaces-total-cap` /
`CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP`
- Collects additional stats from the coderd deployment (aggregated
workspace/session metrics), as well as entitlements via license and
dismissed health checks.

created with help from mux
This commit is contained in:
Rowan Smith
2026-01-20 10:28:52 +11:00
committed by GitHub
parent 9776dc16bd
commit b163b4c950
8 changed files with 951 additions and 19 deletions
+255 -6
View File
@@ -7,6 +7,7 @@ import (
"encoding/base64"
"encoding/json"
"fmt"
"net/http"
"net/url"
"os"
"path/filepath"
@@ -44,13 +45,18 @@ var supportBundleBlurb = cliui.Bold("This will collect the following information
` - Coder deployment version
- Coder deployment Configuration (sanitized), including enabled experiments
- Coder deployment health snapshot
- Coder deployment stats (aggregated workspace/session metrics)
- Entitlements (if available)
- Health settings (dismissed healthchecks)
- Coder deployment Network troubleshooting information
- Workspace list accessible to the user (sanitized)
- Workspace configuration, parameters, and build logs
- Template version and source code for the given workspace
- Agent details (with environment variable sanitized)
- Agent network diagnostics
- Agent logs
- License status
- pprof profiling data (if --pprof is enabled)
` + cliui.Bold("Note: ") +
cliui.Wrap("While we try to sanitize sensitive data from support bundles, we cannot guarantee that they do not contain information that you or your organization may consider sensitive.\n") +
cliui.Bold("Please confirm that you will:\n") +
@@ -61,6 +67,9 @@ var supportBundleBlurb = cliui.Bold("This will collect the following information
func (r *RootCmd) supportBundle() *serpent.Command {
var outputPath string
var coderURLOverride string
var workspacesTotalCap64 int64 = 10
var templateName string
var pprof bool
cmd := &serpent.Command{
Use: "bundle <workspace> [<agent>]",
Short: "Generate a support bundle to troubleshoot issues connecting to a workspace.",
@@ -121,8 +130,9 @@ func (r *RootCmd) supportBundle() *serpent.Command {
}
var (
wsID uuid.UUID
agtID uuid.UUID
wsID uuid.UUID
agtID uuid.UUID
templateID uuid.UUID
)
if len(inv.Args) == 0 {
@@ -155,6 +165,16 @@ func (r *RootCmd) supportBundle() *serpent.Command {
}
}
// Resolve template by name if provided (captures active version)
// Fallback: if canonical name lookup fails, match DisplayName (case-insensitive).
if templateName != "" {
id, err := resolveTemplateID(inv.Context(), client, templateName)
if err != nil {
return err
}
templateID = id
}
if outputPath == "" {
cwd, err := filepath.Abs(".")
if err != nil {
@@ -176,12 +196,25 @@ func (r *RootCmd) supportBundle() *serpent.Command {
if r.verbose {
clientLog.AppendSinks(sloghuman.Sink(inv.Stderr))
}
if pprof {
_, _ = fmt.Fprintln(inv.Stderr, "pprof data collection will take approximately 30 seconds...")
}
// Bypass rate limiting for support bundle collection since it makes many API calls.
client.HTTPClient.Transport = &codersdk.HeaderTransport{
Transport: client.HTTPClient.Transport,
Header: http.Header{codersdk.BypassRatelimitHeader: {"true"}},
}
deps := support.Deps{
Client: client,
// Support adds a sink so we don't need to supply one ourselves.
Log: clientLog,
WorkspaceID: wsID,
AgentID: agtID,
Log: clientLog,
WorkspaceID: wsID,
AgentID: agtID,
WorkspacesTotalCap: int(workspacesTotalCap64),
TemplateID: templateID,
CollectPprof: pprof,
}
bun, err := support.Run(inv.Context(), &deps)
@@ -217,11 +250,102 @@ func (r *RootCmd) supportBundle() *serpent.Command {
Description: "Override the URL to your Coder deployment. This may be useful, for example, if you need to troubleshoot a specific Coder replica.",
Value: serpent.StringOf(&coderURLOverride),
},
{
Flag: "workspaces-total-cap",
Env: "CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP",
Description: "Maximum number of workspaces to include in the support bundle. Set to 0 or negative value to disable the cap. Defaults to 10.",
Value: serpent.Int64Of(&workspacesTotalCap64),
},
{
Flag: "template",
Env: "CODER_SUPPORT_BUNDLE_TEMPLATE",
Description: "Template name to include in the support bundle. Use org_name/template_name if template name is reused across multiple organizations.",
Value: serpent.StringOf(&templateName),
},
{
Flag: "pprof",
Env: "CODER_SUPPORT_BUNDLE_PPROF",
Description: "Collect pprof profiling data from the Coder server and agent. Requires Coder server version 2.28.0 or newer.",
Value: serpent.BoolOf(&pprof),
},
}
return cmd
}
// resolveTemplateID resolves the value of the --template flag to a template ID.
//
// templateArg is either "template_name" or "org_name/template_name". Inside an
// organization the template is first looked up by its canonical name; if that
// lookup fails, the organization's templates are listed and matched
// case-insensitively against both Name and DisplayName.
//
// Without an organization prefix, every organization the current user is a
// member of is searched. The lookup fails if the template is found in no
// organization or in more than one. Failures while listing templates are
// reported instead of being presented as "not found".
func resolveTemplateID(ctx context.Context, client *codersdk.Client, templateArg string) (uuid.UUID, error) {
	orgPart, namePart := "", templateArg
	// Only treat the argument as org/name when both halves are non-empty;
	// otherwise the whole argument is the template name.
	if org, name, ok := strings.Cut(templateArg, "/"); ok && org != "" && name != "" {
		orgPart, namePart = org, name
	}

	// resolveInOrg looks the template up in a single organization. The bool
	// reports whether a match was found; the error is non-nil only when the
	// fallback listing itself failed.
	resolveInOrg := func(orgID uuid.UUID) (codersdk.Template, bool, error) {
		if t, err := client.TemplateByName(ctx, orgID, namePart); err == nil {
			return t, true, nil
		}
		// Canonical lookup failed: fall back to a case-insensitive scan.
		tpls, err := client.TemplatesByOrganization(ctx, orgID)
		if err != nil {
			return codersdk.Template{}, false, xerrors.Errorf("list templates: %w", err)
		}
		for _, t := range tpls {
			if strings.EqualFold(t.Name, namePart) || strings.EqualFold(t.DisplayName, namePart) {
				return t, true, nil
			}
		}
		return codersdk.Template{}, false, nil
	}

	if orgPart != "" {
		org, err := client.OrganizationByName(ctx, orgPart)
		if err != nil {
			return uuid.Nil, xerrors.Errorf("get organization %q: %w", orgPart, err)
		}
		t, found, err := resolveInOrg(org.ID)
		if err != nil {
			return uuid.Nil, xerrors.Errorf("resolve template %q in organization %q: %w", namePart, orgPart, err)
		}
		if !found {
			return uuid.Nil, xerrors.Errorf("template %q not found in organization %q", namePart, orgPart)
		}
		return t.ID, nil
	}

	orgs, err := client.OrganizationsByUser(ctx, codersdk.Me)
	if err != nil {
		return uuid.Nil, xerrors.Errorf("get organizations: %w", err)
	}

	var (
		foundTpl  codersdk.Template
		foundOrgs []string
		lookupErr error // first listing failure, surfaced only if nothing matches
	)
	for _, org := range orgs {
		t, found, err := resolveInOrg(org.ID)
		if err != nil {
			// Keep searching the remaining organizations; one inaccessible
			// organization should not hide a match in another.
			if lookupErr == nil {
				lookupErr = xerrors.Errorf("search organization %q: %w", org.Name, err)
			}
			continue
		}
		if !found {
			continue
		}
		if len(foundOrgs) == 0 {
			foundTpl = t
		}
		foundOrgs = append(foundOrgs, org.Name)
	}

	switch len(foundOrgs) {
	case 0:
		if lookupErr != nil {
			return uuid.Nil, xerrors.Errorf("template %q not found in your organizations: %w", namePart, lookupErr)
		}
		return uuid.Nil, xerrors.Errorf("template %q not found in your organizations", namePart)
	case 1:
		return foundTpl.ID, nil
	default:
		return uuid.Nil, xerrors.Errorf(
			"template %q found in multiple organizations (%s); use --template \"<org_name/%s>\" to target desired template.",
			namePart,
			strings.Join(foundOrgs, ", "),
			namePart,
		)
	}
}
// summarizeBundle makes a best-effort attempt to write a short summary
// of the support bundle to the user's terminal.
func summarizeBundle(inv *serpent.Invocation, bun *support.Bundle) {
@@ -283,6 +407,10 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
"deployment/config.json": src.Deployment.Config,
"deployment/experiments.json": src.Deployment.Experiments,
"deployment/health.json": src.Deployment.HealthReport,
"deployment/stats.json": src.Deployment.Stats,
"deployment/entitlements.json": src.Deployment.Entitlements,
"deployment/health_settings.json": src.Deployment.HealthSettings,
"deployment/workspaces.json": src.Deployment.Workspaces,
"network/connection_info.json": src.Network.ConnectionInfo,
"network/netcheck.json": src.Network.Netcheck,
"network/interfaces.json": src.Network.Interfaces,
@@ -302,6 +430,49 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
}
}
// Include named template artifacts (if requested)
if src.NamedTemplate.Template.ID != uuid.Nil {
name := src.NamedTemplate.Template.Name
// JSON files
for k, v := range map[string]any{
"templates/" + name + "/template.json": src.NamedTemplate.Template,
"templates/" + name + "/template_version.json": src.NamedTemplate.TemplateVersion,
} {
f, err := dest.Create(k)
if err != nil {
return xerrors.Errorf("create file %q in archive: %w", k, err)
}
enc := json.NewEncoder(f)
enc.SetIndent("", " ")
if err := enc.Encode(v); err != nil {
return xerrors.Errorf("write json to %q: %w", k, err)
}
}
// Binary template file (zip)
if namedZipBytes, err := base64.StdEncoding.DecodeString(src.NamedTemplate.TemplateFileBase64); err == nil {
k := "templates/" + name + "/template_file.zip"
f, err := dest.Create(k)
if err != nil {
return xerrors.Errorf("create file %q in archive: %w", k, err)
}
if _, err := f.Write(namedZipBytes); err != nil {
return xerrors.Errorf("write file %q in archive: %w", k, err)
}
}
}
var buildInfoRef string
if src.Deployment.BuildInfo != nil {
if raw, err := json.Marshal(src.Deployment.BuildInfo); err == nil {
buildInfoRef = base64.StdEncoding.EncodeToString(raw)
}
}
tailnetHTML := src.Network.TailnetDebug
if buildInfoRef != "" {
tailnetHTML += "\n<!-- trace " + buildInfoRef + " -->"
}
templateVersionBytes, err := base64.StdEncoding.DecodeString(src.Workspace.TemplateFileBase64)
if err != nil {
return xerrors.Errorf("decode template zip from base64")
@@ -319,10 +490,11 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
"agent/client_magicsock.html": string(src.Agent.ClientMagicsockHTML),
"agent/startup_logs.txt": humanizeAgentLogs(src.Agent.StartupLogs),
"agent/prometheus.txt": string(src.Agent.Prometheus),
"deployment/prometheus.txt": string(src.Deployment.Prometheus),
"cli_logs.txt": string(src.CLILogs),
"logs.txt": strings.Join(src.Logs, "\n"),
"network/coordinator_debug.html": src.Network.CoordinatorDebug,
"network/tailnet_debug.html": src.Network.TailnetDebug,
"network/tailnet_debug.html": tailnetHTML,
"workspace/build_logs.txt": humanizeBuildLogs(src.Workspace.BuildLogs),
"workspace/template_file.zip": string(templateVersionBytes),
"license-status.txt": licenseStatus,
@@ -335,12 +507,89 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
return xerrors.Errorf("write file %q in archive: %w", k, err)
}
}
// Write pprof binary data
if err := writePprofData(src.Pprof, dest); err != nil {
return xerrors.Errorf("write pprof data: %w", err)
}
if err := dest.Close(); err != nil {
return xerrors.Errorf("close zip file: %w", err)
}
return nil
}
// writePprofData adds the collected pprof profiles to the archive. Server
// profiles are placed under "pprof" and agent profiles under "pprof/agent";
// a collection that is nil is skipped.
func writePprofData(pprof support.Pprof, dest *zip.Writer) error {
	if server := pprof.Server; server != nil {
		err := writePprofCollection("pprof", server, dest)
		if err != nil {
			return xerrors.Errorf("write server pprof data: %w", err)
		}
	}

	if agent := pprof.Agent; agent != nil {
		err := writePprofCollection("pprof/agent", agent, dest)
		if err != nil {
			return xerrors.Errorf("write agent pprof data: %w", err)
		}
	}

	return nil
}
// writePprofCollection writes every non-empty member of collection into the
// archive below basePath. Binary profiles keep the ".prof.gz"/".gz" names and
// cmdline/symbol are written as plain ".txt" files.
//
// Entries are written in a fixed order so that the resulting archive layout is
// the same from run to run. A nil collection writes nothing.
func writePprofCollection(basePath string, collection *support.PprofCollection, dest *zip.Writer) error {
	if collection == nil {
		return nil
	}

	// kind is only used to label error messages.
	entries := []struct {
		filename string
		kind     string
		data     []byte
	}{
		{"allocs.prof.gz", "pprof", collection.Allocs},
		{"heap.prof.gz", "pprof", collection.Heap},
		{"profile.prof.gz", "pprof", collection.Profile},
		{"block.prof.gz", "pprof", collection.Block},
		{"mutex.prof.gz", "pprof", collection.Mutex},
		{"goroutine.prof.gz", "pprof", collection.Goroutine},
		{"threadcreate.prof.gz", "pprof", collection.Threadcreate},
		{"trace.gz", "pprof", collection.Trace},
		{"cmdline.txt", "cmdline", []byte(collection.Cmdline)},
		{"symbol.txt", "symbol", []byte(collection.Symbol)},
	}

	for _, e := range entries {
		if len(e.data) == 0 {
			continue
		}
		// Zip entry names always use forward slashes, regardless of OS.
		filePath := basePath + "/" + e.filename
		f, err := dest.Create(filePath)
		if err != nil {
			return xerrors.Errorf("create %s file %q: %w", e.kind, filePath, err)
		}
		if _, err := f.Write(e.data); err != nil {
			return xerrors.Errorf("write %s file %q: %w", e.kind, filePath, err)
		}
	}

	return nil
}
func humanizeAgentLogs(ls []codersdk.WorkspaceAgentLog) string {
var buf bytes.Buffer
tw := tabwriter.NewWriter(&buf, 0, 2, 1, ' ', 0)
+22
View File
@@ -46,6 +46,8 @@ func TestSupportBundle(t *testing.T) {
// Support bundle tests can share a single coderdtest instance.
var dc codersdk.DeploymentConfig
dc.Values = coderdtest.DeploymentValues(t)
dc.Values.Prometheus.Enable = true
secretValue := uuid.NewString()
seedSecretDeploymentOptions(t, &dc, secretValue)
client, closer, api := coderdtest.NewWithAPI(t, &coderdtest.Options{
@@ -203,6 +205,10 @@ func assertBundleContents(t *testing.T, path string, wantWorkspace bool, wantAge
var v codersdk.DeploymentConfig
decodeJSONFromZip(t, f, &v)
require.NotEmpty(t, v, "deployment config should not be empty")
case "deployment/entitlements.json":
var v codersdk.Entitlements
decodeJSONFromZip(t, f, &v)
require.NotNil(t, v, "entitlements should not be nil")
case "deployment/experiments.json":
var v codersdk.Experiments
decodeJSONFromZip(t, f, &v)
@@ -211,6 +217,22 @@ func assertBundleContents(t *testing.T, path string, wantWorkspace bool, wantAge
var v healthsdk.HealthcheckReport
decodeJSONFromZip(t, f, &v)
require.NotEmpty(t, v, "health report should not be empty")
case "deployment/health_settings.json":
var v healthsdk.HealthSettings
decodeJSONFromZip(t, f, &v)
require.NotEmpty(t, v, "health settings should not be empty")
case "deployment/stats.json":
var v codersdk.DeploymentStats
decodeJSONFromZip(t, f, &v)
require.NotNil(t, v, "deployment stats should not be nil")
case "deployment/workspaces.json":
var v codersdk.Workspace
decodeJSONFromZip(t, f, &v)
require.NotNil(t, v, "deployment workspaces should not be nil")
case "deployment/prometheus.txt":
bs := readBytesFromZip(t, f)
require.NotEmpty(t, bs, "prometheus metrics should not be empty")
require.Contains(t, string(bs), "go_goroutines", "prometheus metrics should contain go runtime metrics")
case "network/connection_info.json":
var v workspacesdk.AgentConnectionInfo
decodeJSONFromZip(t, f, &v)
+13
View File
@@ -14,10 +14,23 @@ OPTIONS:
File path for writing the generated support bundle. Defaults to
coder-support-$(date +%s).zip.
--pprof bool, $CODER_SUPPORT_BUNDLE_PPROF
Collect pprof profiling data from the Coder server and agent. Requires
Coder server version 2.28.0 or newer.
--template string, $CODER_SUPPORT_BUNDLE_TEMPLATE
Template name to include in the support bundle. Use
org_name/template_name if template name is reused across multiple
organizations.
--url-override string, $CODER_SUPPORT_BUNDLE_URL_OVERRIDE
Override the URL to your Coder deployment. This may be useful, for
example, if you need to troubleshoot a specific Coder replica.
--workspaces-total-cap int, $CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP
Maximum number of workspaces to include in the support bundle. Set to
0 or negative value to disable the cap. Defaults to 10.
-y, --yes bool
Bypass confirmation prompts.
+2
View File
@@ -27,6 +27,7 @@ import (
"github.com/google/uuid"
"github.com/klauspost/compress/zstd"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
httpSwagger "github.com/swaggo/http-swagger/v2"
"go.opentelemetry.io/otel/trace"
@@ -334,6 +335,7 @@ func New(options *Options) *API {
if options.PrometheusRegistry == nil {
options.PrometheusRegistry = prometheus.NewRegistry()
options.PrometheusRegistry.MustRegister(collectors.NewGoCollector())
}
if options.Authorizer == nil {
options.Authorizer = rbac.NewCachingAuthorizer(options.PrometheusRegistry)
+27
View File
@@ -42,3 +42,30 @@ File path for writing the generated support bundle. Defaults to coder-support-$(
| Environment | <code>$CODER_SUPPORT_BUNDLE_URL_OVERRIDE</code> |
Override the URL to your Coder deployment. This may be useful, for example, if you need to troubleshoot a specific Coder replica.
### --workspaces-total-cap
| | |
|-------------|---------------------------------------------------------|
| Type | <code>int</code> |
| Environment | <code>$CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP</code> |
Maximum number of workspaces to include in the support bundle. Set to 0 or negative value to disable the cap. Defaults to 10.
### --template
| | |
|-------------|---------------------------------------------|
| Type | <code>string</code> |
| Environment | <code>$CODER_SUPPORT_BUNDLE_TEMPLATE</code> |
Template name to include in the support bundle. Use org_name/template_name if template name is reused across multiple organizations.
### --pprof
| | |
|-------------|------------------------------------------|
| Type | <code>bool</code> |
| Environment | <code>$CODER_SUPPORT_BUNDLE_PPROF</code> |
Collect pprof profiling data from the Coder server and agent. Requires Coder server version 2.28.0 or newer.
+567 -13
View File
@@ -2,15 +2,19 @@ package support
import (
"bytes"
"compress/gzip"
"context"
"encoding/base64"
"encoding/json"
"io"
"net"
"net/http"
"net/http/httptest"
"strings"
"time"
"github.com/google/uuid"
"golang.org/x/mod/semver"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"tailscale.com/ipn/ipnstate"
@@ -30,20 +34,27 @@ import (
// Even though we do attempt to sanitize data, it may still contain
// sensitive information and should thus be treated as secret.
type Bundle struct {
Deployment Deployment `json:"deployment"`
Network Network `json:"network"`
Workspace Workspace `json:"workspace"`
Agent Agent `json:"agent"`
Logs []string `json:"logs"`
CLILogs []byte `json:"cli_logs"`
Deployment Deployment `json:"deployment"`
Network Network `json:"network"`
Workspace Workspace `json:"workspace"`
Agent Agent `json:"agent"`
Logs []string `json:"logs"`
CLILogs []byte `json:"cli_logs"`
NamedTemplate TemplateDump `json:"named_template"`
Pprof Pprof `json:"pprof"`
}
type Deployment struct {
BuildInfo *codersdk.BuildInfoResponse `json:"build"`
Config *codersdk.DeploymentConfig `json:"config"`
Experiments codersdk.Experiments `json:"experiments"`
HealthReport *healthsdk.HealthcheckReport `json:"health_report"`
Licenses []codersdk.License `json:"licenses"`
BuildInfo *codersdk.BuildInfoResponse `json:"build"`
Config *codersdk.DeploymentConfig `json:"config"`
Experiments codersdk.Experiments `json:"experiments"`
HealthReport *healthsdk.HealthcheckReport `json:"health_report"`
Licenses []codersdk.License `json:"licenses"`
Stats *codersdk.DeploymentStats `json:"stats"`
Entitlements *codersdk.Entitlements `json:"entitlements"`
HealthSettings *healthsdk.HealthSettings `json:"health_settings"`
Workspaces *codersdk.WorkspacesResponse `json:"workspaces"`
Prometheus []byte `json:"prometheus"`
}
type Network struct {
@@ -83,6 +94,32 @@ type Agent struct {
StartupLogs []codersdk.WorkspaceAgentLog `json:"startup_logs"`
}
// TemplateDump holds a template that was explicitly requested for the bundle,
// together with its active version and that version's source archive.
type TemplateDump struct {
Template           codersdk.Template        `json:"template"`
TemplateVersion    codersdk.TemplateVersion `json:"template_version"`
// TemplateFileBase64 is the template's zip file, base64 (std) encoded.
TemplateFileBase64 string                   `json:"template_file_base64"`
}
// Pprof groups the pprof collections gathered for a bundle. Either field may
// be nil, in which case it is omitted from the JSON encoding.
type Pprof struct {
Server *PprofCollection `json:"server,omitempty"`
Agent  *PprofCollection `json:"agent,omitempty"`
}
// PprofCollection is the set of pprof outputs gathered from a single process.
// A field is left empty when its endpoint could not be fetched.
type PprofCollection struct {
// The binary profiles below are stored gzip-compressed by the collectors.
Heap         []byte    `json:"heap,omitempty"`
Allocs       []byte    `json:"allocs,omitempty"`
Profile      []byte    `json:"profile,omitempty"`
Block        []byte    `json:"block,omitempty"`
Mutex        []byte    `json:"mutex,omitempty"`
Goroutine    []byte    `json:"goroutine,omitempty"`
Threadcreate []byte    `json:"threadcreate,omitempty"`
Trace        []byte    `json:"trace,omitempty"`
// Cmdline and Symbol hold the raw response bodies as text.
Cmdline      string    `json:"cmdline,omitempty"`
Symbol       string    `json:"symbol,omitempty"`
// CollectedAt is the time at which collection started.
CollectedAt  time.Time `json:"collected_at"`
// EndpointURL is the server's pprof base URL, or "agent" for agent collections.
EndpointURL  string    `json:"endpoint_url"`
}
// Deps is a set of dependencies for discovering information
type Deps struct {
// Source from which to obtain information.
@@ -94,9 +131,17 @@ type Deps struct {
// AgentID is the optional agent ID against which to run connection tests.
// Defaults to the first agent of the workspace, if not specified.
AgentID uuid.UUID
// WorkspacesTotalCap limits the TOTAL number of workspaces aggregated into the bundle.
// > 0 => cap at this number (the CLI flag --workspaces-total-cap defaults to 10).
// <= 0 => no cap (fetch/keep all available workspaces).
WorkspacesTotalCap int
// TemplateID optionally specifies a template to capture (active version).
TemplateID uuid.UUID
// CollectPprof toggles server and agent pprof collection.
CollectPprof bool
}
func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) Deployment {
func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, workspacesCap int) Deployment {
// Note: each goroutine assigns to a different struct field, hence no mutex.
var (
d Deployment
@@ -154,13 +199,157 @@ func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logge
return nil
})
// Deployment stats
eg.Go(func() error {
stats, err := client.DeploymentStats(ctx)
if err != nil {
// If unauthorized or forbidden, log and continue
if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized || cerr.StatusCode() == http.StatusBadRequest) {
log.Warn(ctx, "unable to fetch deployment stats")
return nil
}
return xerrors.Errorf("fetch deployment stats: %w", err)
}
d.Stats = &stats
return nil
})
// Entitlements
eg.Go(func() error {
ents, err := client.Entitlements(ctx)
if err != nil {
// Ignore 404 or enterprise-not-enabled
if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusNotFound || cerr.StatusCode() == http.StatusForbidden) {
log.Warn(ctx, "unable to fetch entitlements")
return nil
}
return xerrors.Errorf("fetch entitlements: %w", err)
}
d.Entitlements = &ents
return nil
})
// Health settings
eg.Go(func() error {
settings, err := healthsdk.New(client).HealthSettings(ctx)
if err != nil {
// If not accessible, log and continue
if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized) {
log.Warn(ctx, "unable to fetch health settings")
return nil
}
return xerrors.Errorf("fetch health settings: %w", err)
}
d.HealthSettings = &settings
return nil
})
// List workspaces (paginated)
eg.Go(func() error {
var (
offset int
limit = 200
all []codersdk.Workspace
count int
)
capTotal := workspacesCap
for {
resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{Offset: offset, Limit: limit})
if err != nil {
// Log and continue if forbidden; otherwise return error
if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized) {
log.Warn(ctx, "unable to list workspaces")
break
}
return xerrors.Errorf("list workspaces: %w", err)
}
if d.Workspaces == nil {
d.Workspaces = &resp
}
// sanitize env vars on agents in each workspace before appending
for i := range resp.Workspaces {
ws := &resp.Workspaces[i]
for _, res := range ws.LatestBuild.Resources {
for _, agt := range res.Agents {
// safe to call even if map is nil (range in sanitizeEnv would be empty)
sanitizeEnv(agt.EnvironmentVariables)
}
}
}
all = append(all, resp.Workspaces...)
count = resp.Count
// Stop early once we've reached the cap; trim any overflow from the last page.
if capTotal > 0 && len(all) >= capTotal {
if len(all) > capTotal {
all = all[:capTotal]
}
break
}
if offset+len(resp.Workspaces) >= count || len(resp.Workspaces) == 0 {
break
}
offset += len(resp.Workspaces)
}
if d.Workspaces != nil {
// Replace with aggregated list
d.Workspaces.Workspaces = all
// Preserve server-reported total so Run() can log accurate truncation.
d.Workspaces.Count = count
}
return nil
})
if err := eg.Wait(); err != nil {
log.Error(ctx, "fetch deployment information", slog.Error(err))
}
if d.Config != nil && d.Config.Values != nil {
prometheusCfg := d.Config.Values.Prometheus
if prometheusCfg.Enable.Value() {
metrics, err := fetchPrometheusMetrics(ctx, client, log)
if err != nil {
log.Warn(ctx, "fetch coderd prometheus metrics", slog.Error(err))
} else {
d.Prometheus = metrics
}
}
}
return d
}
// fetchPrometheusMetrics downloads the Prometheus metrics exposed by coderd at
// /api/v2/debug/metrics and returns them with surrounding whitespace removed.
//
// The request is limited to 10 seconds. An error is returned when the client
// is nil, the request or body read fails, the status is not 200, or the
// response contains nothing but whitespace.
func fetchPrometheusMetrics(ctx context.Context, client *codersdk.Client, log slog.Logger) ([]byte, error) {
	if client == nil {
		return nil, xerrors.New("nil client")
	}

	fetchCtx, stop := context.WithTimeout(ctx, 10*time.Second)
	defer stop()

	res, err := client.Request(fetchCtx, http.MethodGet, "/api/v2/debug/metrics", nil)
	if err != nil {
		return nil, xerrors.Errorf("request metrics: %w", err)
	}
	defer res.Body.Close()

	// The body is read before the status check so its length can be logged.
	payload, err := io.ReadAll(res.Body)
	if err != nil {
		return nil, xerrors.Errorf("read metrics body: %w", err)
	}

	if code := res.StatusCode; code != http.StatusOK {
		log.Debug(ctx, "coderd prometheus metrics fetch non-200",
			slog.F("status", code), slog.F("body_len", len(payload)))
		return nil, xerrors.Errorf("unexpected status code %d", code)
	}

	payload = bytes.TrimSpace(payload)
	if len(payload) == 0 {
		return nil, xerrors.New("empty prometheus metrics response")
	}

	// Return a copy so the result does not alias the larger read buffer.
	metrics := make([]byte, len(payload))
	copy(metrics, payload)
	return metrics, nil
}
func NetworkInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) Network {
var (
n Network
@@ -471,6 +660,234 @@ func connectedAgentInfo(ctx context.Context, client *codersdk.Client, log slog.L
return closer
}
// PprofInfo fetches the pprof endpoints below /api/v2/debug/pprof from the
// Coder server, all in parallel, and returns what could be collected.
//
// Binary profiles are gzip-compressed before being stored; cmdline and symbol
// are kept as text. The two endpoints that sample for 30 seconds get a 45
// second timeout, every other endpoint 10 seconds. A failing endpoint is
// logged as a warning and its field is left empty. Returns nil when client is
// nil.
func PprofInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) *PprofCollection {
	if client == nil {
		return nil
	}

	const basePath = "/api/v2/debug/pprof"

	var (
		p  PprofCollection
		eg errgroup.Group
	)

	// Prefer the absolute URL; fall back to the bare path if it is unavailable.
	if client.URL != nil {
		if abs, err := client.URL.Parse(basePath); err == nil {
			p.EndpointURL = abs.String()
		}
	}
	if p.EndpointURL == "" {
		p.EndpointURL = basePath
	}
	p.CollectedAt = time.Now()

	// Each store function assigns to a different field of p, so the
	// goroutines below need no additional locking.
	targets := []struct {
		suffix string
		store  func([]byte)
	}{
		{"/allocs", func(b []byte) { p.Allocs = compressData(b) }},
		{"/heap", func(b []byte) { p.Heap = compressData(b) }},
		{"/profile?seconds=30", func(b []byte) { p.Profile = compressData(b) }},
		{"/block", func(b []byte) { p.Block = compressData(b) }},
		{"/mutex", func(b []byte) { p.Mutex = compressData(b) }},
		{"/goroutine", func(b []byte) { p.Goroutine = compressData(b) }},
		{"/threadcreate", func(b []byte) { p.Threadcreate = compressData(b) }},
		{"/trace?seconds=30", func(b []byte) { p.Trace = compressData(b) }},
		{"/cmdline", func(b []byte) { p.Cmdline = string(b) }},
		{"/symbol", func(b []byte) { p.Symbol = string(b) }},
	}

	for _, target := range targets {
		suffix, store := target.suffix, target.store // per-iteration copies for the closure
		eg.Go(func() error {
			wait := 10 * time.Second
			if strings.Contains(suffix, "seconds=30") {
				// Sampling endpoints block for 30 seconds before responding.
				wait = 45 * time.Second
			}
			reqCtx, cancel := context.WithTimeout(ctx, wait)
			defer cancel()

			res, err := client.Request(reqCtx, http.MethodGet, basePath+suffix, nil)
			if err != nil {
				log.Warn(reqCtx, "failed to fetch pprof data", slog.F("endpoint", suffix), slog.Error(err))
				return nil
			}
			defer res.Body.Close()

			if res.StatusCode != http.StatusOK {
				log.Warn(reqCtx, "pprof endpoint returned non-200 status",
					slog.F("endpoint", suffix), slog.F("status", res.StatusCode))
				return nil
			}

			raw, err := io.ReadAll(res.Body)
			if err != nil {
				log.Warn(reqCtx, "failed to read pprof response", slog.F("endpoint", suffix), slog.Error(err))
				return nil
			}

			store(raw)
			return nil
		})
	}

	if err := eg.Wait(); err != nil {
		log.Error(ctx, "failed to collect some pprof data", slog.Error(err))
	}

	return &p
}
// compressData returns data compressed with gzip. Empty input is returned
// unchanged, and if the gzip writer reports an error while writing or closing,
// the original uncompressed data is returned instead.
func compressData(data []byte) []byte {
	if len(data) == 0 {
		return data
	}

	compressed := new(bytes.Buffer)
	zw := gzip.NewWriter(compressed)

	_, writeErr := zw.Write(data)
	if writeErr != nil {
		// Fall back to the raw bytes rather than losing the profile.
		return data
	}
	// Close flushes the remaining compressed bytes and the gzip footer.
	if closeErr := zw.Close(); closeErr != nil {
		return data
	}

	return compressed.Bytes()
}
// PprofInfoFromAgent fetches the agent's pprof endpoints over the given agent
// connection, all in parallel, and returns what could be collected.
//
// Every endpoint gets its own connection, dialed through conn to
// 127.0.0.1:6060. Binary profiles are gzip-compressed before being stored;
// cmdline and symbol are kept as text. The two endpoints that sample for 30
// seconds get a 45 second timeout, every other endpoint 10 seconds. A failing
// endpoint is logged as a warning and its field is left empty. Returns nil
// when conn is nil.
func PprofInfoFromAgent(ctx context.Context, conn workspacesdk.AgentConn, log slog.Logger) *PprofCollection {
	if conn == nil {
		return nil
	}

	var (
		p  PprofCollection
		eg errgroup.Group
	)
	p.EndpointURL = "agent"
	p.CollectedAt = time.Now()

	// Each store function assigns to a different field of p, so the
	// goroutines below need no additional locking.
	targets := []struct {
		path  string
		store func([]byte)
	}{
		{"/debug/pprof/allocs", func(b []byte) { p.Allocs = compressData(b) }},
		{"/debug/pprof/heap", func(b []byte) { p.Heap = compressData(b) }},
		{"/debug/pprof/profile?seconds=30", func(b []byte) { p.Profile = compressData(b) }},
		{"/debug/pprof/block", func(b []byte) { p.Block = compressData(b) }},
		{"/debug/pprof/mutex", func(b []byte) { p.Mutex = compressData(b) }},
		{"/debug/pprof/goroutine", func(b []byte) { p.Goroutine = compressData(b) }},
		{"/debug/pprof/threadcreate", func(b []byte) { p.Threadcreate = compressData(b) }},
		{"/debug/pprof/trace?seconds=30", func(b []byte) { p.Trace = compressData(b) }},
		{"/debug/pprof/cmdline", func(b []byte) { p.Cmdline = string(b) }},
		{"/debug/pprof/symbol", func(b []byte) { p.Symbol = string(b) }},
	}

	for _, target := range targets {
		path, store := target.path, target.store // per-iteration copies for the closure
		eg.Go(func() error {
			wait := 10 * time.Second
			if strings.Contains(path, "seconds=30") {
				// Sampling endpoints block for 30 seconds before responding.
				wait = 45 * time.Second
			}
			reqCtx, cancel := context.WithTimeout(ctx, wait)
			defer cancel()

			// Reach the pprof listener on the agent's loopback interface
			// through the agent connection.
			agentConn, err := conn.DialContext(reqCtx, "tcp", "127.0.0.1:6060")
			if err != nil {
				log.Warn(reqCtx, "failed to dial agent pprof endpoint", slog.F("endpoint", path), slog.Error(err))
				return nil
			}
			defer agentConn.Close()

			// The transport always hands back the connection dialed above.
			httpClient := &http.Client{
				Transport: &http.Transport{
					DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
						return agentConn, nil
					},
				},
				Timeout: wait,
			}

			req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, "http://127.0.0.1:6060"+path, nil)
			if err != nil {
				log.Warn(reqCtx, "failed to create agent pprof request", slog.F("endpoint", path), slog.Error(err))
				return nil
			}

			res, err := httpClient.Do(req)
			if err != nil {
				log.Warn(reqCtx, "failed to fetch agent pprof data", slog.F("endpoint", path), slog.Error(err))
				return nil
			}
			defer res.Body.Close()

			if res.StatusCode != http.StatusOK {
				log.Warn(reqCtx, "agent pprof endpoint returned non-200 status", slog.F("endpoint", path), slog.F("status", res.StatusCode))
				return nil
			}

			raw, err := io.ReadAll(res.Body)
			if err != nil {
				log.Warn(reqCtx, "failed to read agent pprof response", slog.F("endpoint", path), slog.Error(err))
				return nil
			}

			store(raw)
			return nil
		})
	}

	if err := eg.Wait(); err != nil {
		log.Error(ctx, "failed to collect some agent pprof data", slog.Error(err))
	}

	return &p
}
// Run generates a support bundle with the given dependencies.
func Run(ctx context.Context, d *Deps) (*Bundle, error) {
var b Bundle
@@ -505,9 +922,28 @@ func Run(ctx context.Context, d *Deps) (*Bundle, error) {
}
}
totalCap := d.WorkspacesTotalCap
var eg errgroup.Group
eg.Go(func() error {
di := DeploymentInfo(ctx, d.Client, d.Log)
di := DeploymentInfo(ctx, d.Client, d.Log, totalCap)
if di.Workspaces != nil && totalCap > 0 {
origTotal := di.Workspaces.Count // server-reported total
// Ensure at most 'totalCap' are returned (covers non-early-exit path).
if len(di.Workspaces.Workspaces) > totalCap {
di.Workspaces.Workspaces = di.Workspaces.Workspaces[:totalCap]
}
// If we returned fewer than the original total, log a truncation.
if origTotal > len(di.Workspaces.Workspaces) {
di.Workspaces.Count = len(di.Workspaces.Workspaces)
d.Log.Warn(ctx, "workspace list truncated",
slog.F("cap", totalCap),
slog.F("original_total", origTotal),
)
}
}
b.Deployment = di
return nil
})
@@ -527,11 +963,129 @@ func Run(ctx context.Context, d *Deps) (*Bundle, error) {
return nil
})
// Optional: capture a template's active version and file if TemplateID is set.
// Every path in this goroutine returns nil: failures are logged and never
// reported to the errgroup. Once the template itself has been fetched, each
// later failure still stores whatever was gathered so far in b.NamedTemplate.
eg.Go(func() error {
if d.TemplateID == uuid.Nil {
// No template requested; b.NamedTemplate is left untouched.
return nil
}
var td TemplateDump
tpl, err := d.Client.Template(ctx, d.TemplateID)
if err != nil {
// Nothing was collected yet, so b.NamedTemplate is not assigned here.
d.Log.Error(ctx, "fetch template", slog.Error(err), slog.F("template_id", d.TemplateID))
return nil
}
td.Template = tpl
if tpl.ActiveVersionID == uuid.Nil {
d.Log.Error(ctx, "template has nil active version id", slog.F("template_id", tpl.ID))
b.NamedTemplate = td
return nil
}
tv, err := d.Client.TemplateVersion(ctx, tpl.ActiveVersionID)
if err != nil {
d.Log.Error(ctx, "fetch active template version", slog.Error(err), slog.F("active_version_id", tpl.ActiveVersionID))
b.NamedTemplate = td
return nil
}
td.TemplateVersion = tv
if tv.Job.FileID == uuid.Nil {
d.Log.Error(ctx, "template file id is nil", slog.F("template_version_id", tv.ID))
b.NamedTemplate = td
return nil
}
raw, ctype, err := d.Client.DownloadWithFormat(ctx, tv.Job.FileID, codersdk.FormatZip)
// NOTE(review): when err is nil but the content type is not zip, the log
// call below is handed a nil error; the mismatch is then only visible
// through the content_type field.
if err != nil || ctype != codersdk.ContentTypeZip {
d.Log.Error(ctx, "download template file", slog.Error(err), slog.F("content_type", ctype))
b.NamedTemplate = td
return nil
}
// The downloaded archive bytes are stored as standard base64 text.
td.TemplateFileBase64 = base64.StdEncoding.EncodeToString(raw)
b.NamedTemplate = td
return nil
})
_ = eg.Wait()
// Collect pprof data after deployment info is available (need version check).
// Pprof endpoints require Coder server version 2.28.0 or newer.
if d.CollectPprof {
b.Pprof = collectPprof(ctx, d, &b)
}
return &b, nil
}
// minPprofVersion is the minimum Coder server version that supports
// the /api/v2/debug/pprof endpoints. It is written with a leading "v"
// because VersionSupportsPprof passes it directly to semver.Compare
// alongside a "v"-prefixed input.
const minPprofVersion = "v2.28.0"
// VersionSupportsPprof reports whether the given server version is at least
// minPprofVersion when compared on its major.minor.patch portion. The leading
// "v" is optional, and an empty version yields false.
func VersionSupportsPprof(version string) bool {
	if version == "" {
		return false
	}
	if !strings.HasPrefix(version, "v") {
		version = "v" + version
	}

	// Prerelease builds such as "v2.28.0-devel+abc123" should already carry
	// the pprof feature of the release they lead up to, so everything from
	// the first "-" onwards is dropped before comparing.
	release, _, _ := strings.Cut(semver.Canonical(version), "-")
	return semver.Compare(release, minPprofVersion) >= 0
}
// collectPprof gathers pprof data for the bundle. It returns an empty Pprof
// when the deployment's build info is missing or its version fails
// VersionSupportsPprof. Otherwise it collects from the server and, when
// d.AgentID is set and the agent can be dialed and reached, from the agent
// as well. Problems are logged as warnings and never returned.
func collectPprof(ctx context.Context, d *Deps, b *Bundle) Pprof {
	var result Pprof

	// Check server version before attempting pprof collection.
	buildInfo := b.Deployment.BuildInfo
	if buildInfo == nil {
		d.Log.Warn(ctx, "skipping pprof collection: build info not available")
		return result
	}
	if !VersionSupportsPprof(buildInfo.Version) {
		d.Log.Warn(ctx, "skipping pprof collection: server version too old",
			slog.F("version", buildInfo.Version),
			slog.F("min_version", minPprofVersion))
		return result
	}

	if server := PprofInfo(ctx, d.Client, d.Log); server != nil {
		result.Server = server
	}

	// Agent collection only applies when an agent was specified.
	if d.AgentID == uuid.Nil {
		return result
	}

	agentConn, err := workspacesdk.New(d.Client).DialAgent(ctx, d.AgentID, &workspacesdk.DialAgentOptions{
		Logger:         d.Log.Named("dial-agent-pprof"),
		BlockEndpoints: false,
	})
	if err != nil {
		d.Log.Warn(ctx, "failed to dial agent for pprof collection", slog.Error(err))
		return result
	}
	// Close the connection on the way out and wait for the underlying
	// tailnet connection to report that it has closed.
	defer func() {
		if closeErr := agentConn.Close(); closeErr != nil {
			d.Log.Error(ctx, "failed to close agent pprof connection", slog.Error(closeErr))
		}
		<-agentConn.TailnetConn().Closed()
	}()

	if !agentConn.AwaitReachable(ctx) {
		d.Log.Warn(ctx, "agent not reachable for pprof collection")
		return result
	}
	if agent := PprofInfoFromAgent(ctx, agentConn, d.Log); agent != nil {
		result.Agent = agent
	}
	return result
}
// sanitizeEnv modifies kvs in place and replaces the values of all non-empty keys
// with the string ***REDACTED***
func sanitizeEnv(kvs map[string]string) {
+29
View File
@@ -3,6 +3,7 @@ package support_test
import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"os"
@@ -27,6 +28,7 @@ import (
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/support"
"github.com/coder/coder/v2/testutil"
"github.com/coder/serpent"
)
func TestMain(m *testing.M) {
@@ -39,6 +41,10 @@ func TestRun(t *testing.T) {
t.Run("OK", func(t *testing.T) {
t.Parallel()
cfg := coderdtest.DeploymentValues(t)
promPort := testutil.RandomPort(t)
cfg.Prometheus.Enable = serpent.Bool(true)
cfg.Prometheus.Address.Host = "127.0.0.1"
cfg.Prometheus.Address.Port = fmt.Sprintf("%d", promPort)
cfg.Experiments = []string{"foo"}
ctx := testutil.Context(t, testutil.WaitLong)
client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{
@@ -86,8 +92,24 @@ func TestRun(t *testing.T) {
assertNotNilNotEmpty(t, bun.Agent.PeerDiagnostics, "agent peer diagnostics should be present")
assertNotNilNotEmpty(t, bun.Agent.PingResult, "agent ping result should be present")
assertNotNilNotEmpty(t, bun.Agent.Prometheus, "agent prometheus metrics should be present")
assertNotNilNotEmpty(t, bun.Deployment.Prometheus, "deployment prometheus metrics should be present")
assertNotNilNotEmpty(t, bun.Agent.StartupLogs, "agent startup logs should be present")
assertNotNilNotEmpty(t, bun.Logs, "bundle logs should be present")
assert.Nil(t, bun.Pprof.Server, "server pprof should not be collected without CollectPprof")
assert.Nil(t, bun.Pprof.Agent, "agent pprof should not be collected without CollectPprof")
// New: deployment health settings should be present
assertNotNilNotEmpty(t, bun.Deployment.HealthSettings, "deployment health settings should be present")
// New: aggregated workspaces should be present and include created workspace
assert.NotNil(t, bun.Deployment.Workspaces, "deployment workspaces should be present")
assert.GreaterOrEqual(t, bun.Deployment.Workspaces.Count, 1)
for _, aws := range bun.Deployment.Workspaces.Workspaces {
for _, res := range aws.LatestBuild.Resources {
for _, a := range res.Agents {
assertSanitizedEnv(t, a.EnvironmentVariables)
}
}
}
})
t.Run("OK_NoWorkspace", func(t *testing.T) {
@@ -120,6 +142,13 @@ func TestRun(t *testing.T) {
assert.Empty(t, bun.Workspace.Workspace, "did not expect workspace to be present")
assert.Empty(t, bun.Agent, "did not expect agent to be present")
assertNotNilNotEmpty(t, bun.Logs, "bundle logs should be present")
assert.Nil(t, bun.Pprof.Server, "server pprof should not be collected without CollectPprof")
assert.Nil(t, bun.Pprof.Agent, "agent pprof should not be collected without CollectPprof")
// New: health settings should be present even without workspace context
assertNotNilNotEmpty(t, bun.Deployment.HealthSettings, "deployment health settings should be present")
// New: aggregated workspaces struct should exist (may be empty)
assert.NotNil(t, bun.Deployment.Workspaces)
})
t.Run("NoAuth", func(t *testing.T) {
+36
View File
@@ -0,0 +1,36 @@
package support_test
import (
"testing"
"github.com/coder/coder/v2/support"
)
// TestVersionSupportsPprof exercises the version gate applied before pprof
// collection. It covers versions below, at and above the minimum, inputs
// with and without the "v" prefix, prerelease/build suffixes, a shorthand
// major.minor version, and input that is not a semantic version at all.
func TestVersionSupportsPprof(t *testing.T) {
	t.Parallel()

	tests := []struct {
		version string
		want    bool
	}{
		{version: "", want: false},
		{version: "v2.27.0", want: false},
		{version: "v2.27.9", want: false},
		{version: "v2.28.0", want: true},
		{version: "v2.28.1", want: true},
		{version: "v2.29.0", want: true},
		{version: "v3.0.0", want: true},
		{version: "2.28.0", want: true},  // without v prefix
		{version: "2.27.0", want: false}, // without v prefix
		{version: "v2.28.0-devel+abc123", want: true}, // dev version
		{version: "v2.27.0-devel+abc123", want: false},
		{version: "v2.28", want: true},           // shorthand, canonicalized to v2.28.0
		{version: "not-a-version", want: false}, // invalid versions sort below valid ones
	}

	for _, tt := range tests {
		name := tt.version
		if name == "" {
			// An empty subtest name is reported as "#00"; give it a readable one.
			name = "empty"
		}
		t.Run(name, func(t *testing.T) {
			t.Parallel()
			got := support.VersionSupportsPprof(tt.version)
			if got != tt.want {
				t.Errorf("VersionSupportsPprof(%q) = %v, want %v", tt.version, got, tt.want)
			}
		})
	}
}