Files
coder/support/support.go
T
Kacper Sawicki df2360f56a feat(coderd): add consolidated /debug/profile endpoint for pprof collection (#22892)
## Summary

Adds a new `GET /api/v2/debug/profile` endpoint that collects multiple
pprof profiles in a single request and returns them as a tar.gz archive.
This allows collecting profiles (including block and mutex) without
requiring `CODER_PPROF_ENABLE` to be set, and without restarting
`coderd`.

Closes #21679

## What it does

The endpoint:
- Temporarily enables block and mutex profiling (normally disabled at
runtime)
- Runs CPU profile and/or trace for a configurable duration (default
10s, max 60s)
- Collects snapshot profiles (heap, allocs, block, mutex, goroutine,
threadcreate)
- Returns a tar.gz archive containing all requested `.prof` files
- Uses an atomic bool to prevent concurrent collections (returns 409
Conflict)
- Is protected by the existing debug endpoint RBAC (owner-only)

**Supported profile types:** cpu, heap, allocs, block, mutex, goroutine,
threadcreate, trace

**Query parameters:**
- `duration`: How long to run timed profiles (default: `10s`, max:
`60s`)
- `profiles`: Comma-separated list of profile types (default:
`cpu,heap,allocs,block,mutex,goroutine`)

## Additional changes

- **SDK client method** (`codersdk.Client.DebugCollectProfile`) for easy
programmatic access
- **`coder support bundle --pprof` integration**: tries the consolidated
endpoint first, falls back to individual `/debug/pprof/*` endpoints for
older servers
- **8 new tests** covering defaults, custom profiles, trace+CPU,
validation errors, authorization, and conflict detection
2026-03-13 14:09:39 +00:00

1195 lines
34 KiB
Go

package support
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"encoding/base64"
"encoding/json"
"errors"
"io"
"net"
"net/http"
"net/http/httptest"
"path"
"strings"
"time"
"github.com/google/uuid"
"golang.org/x/mod/semver"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"tailscale.com/ipn/ipnstate"
"tailscale.com/net/netcheck"
"cdr.dev/slog/v3"
"cdr.dev/slog/v3/sloggers/sloghuman"
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/codersdk/healthsdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/tailnet"
)
// Bundle is a set of information discovered about a deployment.
// Even though we do attempt to sanitize data, it may still contain
// sensitive information and should thus be treated as secret.
//
// Run assembles a Bundle from the individual collectors in this package.
type Bundle struct {
// Deployment is the deployment-wide data returned by DeploymentInfo.
Deployment Deployment `json:"deployment"`
// Network is the networking data returned by NetworkInfo.
Network Network `json:"network"`
// Workspace is the data returned by WorkspaceInfo for Deps.WorkspaceID.
Workspace Workspace `json:"workspace"`
// Agent is the data returned by AgentInfo for Deps.AgentID.
Agent Agent `json:"agent"`
// Logs is the log output captured while Run executed, split on newlines.
Logs []string `json:"logs"`
// CLILogs is not populated by Run.
// NOTE(review): presumably filled in by the CLI caller — confirm.
CLILogs []byte `json:"cli_logs"`
// NamedTemplate is the template dump captured when Deps.TemplateID is set.
NamedTemplate TemplateDump `json:"named_template"`
// Pprof holds profiling data; Run only collects it when Deps.CollectPprof is true.
Pprof Pprof `json:"pprof"`
}
// Deployment is the deployment-wide information gathered by DeploymentInfo.
// Each field is only assigned when its request succeeds, so pointer fields
// may be nil.
type Deployment struct {
// BuildInfo is the server build information.
BuildInfo *codersdk.BuildInfoResponse `json:"build"`
// Config is the deployment configuration.
Config *codersdk.DeploymentConfig `json:"config"`
// Experiments is the experiment list reported by the server.
Experiments codersdk.Experiments `json:"experiments"`
// HealthReport is the result of the debug health endpoint.
HealthReport *healthsdk.HealthcheckReport `json:"health_report"`
// Licenses lists the licenses returned by the licenses endpoint.
Licenses []codersdk.License `json:"licenses"`
// Stats is the deployment statistics.
Stats *codersdk.DeploymentStats `json:"stats"`
// Entitlements is the entitlement set reported by the server.
Entitlements *codersdk.Entitlements `json:"entitlements"`
// HealthSettings is the health check settings.
HealthSettings *healthsdk.HealthSettings `json:"health_settings"`
// Workspaces is the workspace list aggregated across all fetched pages,
// subject to the workspace cap passed to DeploymentInfo.
Workspaces *codersdk.WorkspacesResponse `json:"workspaces"`
// Prometheus is the trimmed body of /api/v2/debug/metrics. It is only
// fetched when the deployment config reports Prometheus as enabled.
Prometheus []byte `json:"prometheus"`
}
// Network is the networking information gathered by NetworkInfo.
type Network struct {
// ConnectionInfo is the generic agent connection info. It carries no JSON
// tag, so encoding/json emits it under the Go field name "ConnectionInfo".
ConnectionInfo workspacesdk.AgentConnectionInfo
// CoordinatorDebug is the response body of /api/v2/debug/coordinator.
CoordinatorDebug string `json:"coordinator_debug"`
// Netcheck is the DERP health report run against ConnectionInfo's DERP map.
// It stays nil when the connection info could not be fetched.
Netcheck *derphealth.Report `json:"netcheck"`
// TailnetDebug is the response body of /api/v2/debug/tailnet.
TailnetDebug string `json:"tailnet_debug"`
// Interfaces is the result of healthsdk.RunInterfacesReport.
Interfaces healthsdk.InterfacesReport `json:"interfaces"`
}
// Netcheck pairs a netcheck report with an error string and log lines.
// NOTE(review): nothing in the visible part of this file references this
// type (Network.Netcheck uses *derphealth.Report) — confirm it is still needed.
type Netcheck struct {
// Report is the netcheck report itself.
Report *netcheck.Report `json:"report"`
// Error presumably holds the failure message, if any — confirm with callers.
Error string `json:"error"`
// Logs presumably holds log lines produced while generating the report.
Logs []string `json:"logs"`
}
// Workspace is the information gathered by WorkspaceInfo about one workspace.
type Workspace struct {
// Workspace is the workspace itself, with agent environment variable
// values redacted by sanitizeEnv.
Workspace codersdk.Workspace `json:"workspace"`
// Parameters are the build parameters of the workspace's latest build.
Parameters []codersdk.WorkspaceBuildParameter `json:"parameters"`
// Template is the template identified by the workspace's TemplateID.
Template codersdk.Template `json:"template"`
// TemplateVersion is the version identified by the workspace's
// TemplateActiveVersionID.
TemplateVersion codersdk.TemplateVersion `json:"template_version"`
// TemplateFileBase64 is the zip archive of TemplateVersion's job file,
// encoded with standard base64.
TemplateFileBase64 string `json:"template_file_base64"`
// BuildLogs are the provisioner job logs of the workspace's latest build.
BuildLogs []codersdk.ProvisionerJobLog `json:"build_logs"`
}
// Agent is the information gathered by AgentInfo about one workspace agent.
// Fields obtained over a live agent connection stay at their zero value when
// the agent cannot be dialed or reached.
type Agent struct {
// Agent is the agent record, with environment variable values redacted.
Agent *codersdk.WorkspaceAgent `json:"agent"`
// ConnectionInfo is not assigned anywhere in the visible part of this file.
// NOTE(review): confirm whether it is still populated elsewhere.
ConnectionInfo *workspacesdk.AgentConnectionInfo `json:"connection_info"`
// ListeningPorts is the agent's listening ports, fetched over the connection.
ListeningPorts *codersdk.WorkspaceAgentListeningPortsResponse `json:"listening_ports"`
// Logs is the agent's own debug log output, fetched over the connection.
Logs []byte `json:"logs"`
// ClientMagicsockHTML is the magicsock debug page rendered by the local
// (client-side) tailnet connection.
ClientMagicsockHTML []byte `json:"client_magicsock_html"`
// AgentMagicsockHTML is the magicsock debug page fetched from the agent.
AgentMagicsockHTML []byte `json:"agent_magicsock_html"`
// Manifest is the agent manifest, with environment variable values redacted.
Manifest *agentsdk.Manifest `json:"manifest"`
// PeerDiagnostics is the peer diagnostics reported by the agent connection.
PeerDiagnostics *tailnet.PeerDiagnostics `json:"peer_diagnostics"`
// PingResult is the result of pinging the agent over the connection.
PingResult *ipnstate.PingResult `json:"ping_result"`
// Prometheus is the agent's Prometheus metrics payload.
Prometheus []byte `json:"prometheus"`
// StartupLogs are the agent logs fetched through the coderd API.
StartupLogs []codersdk.WorkspaceAgentLog `json:"startup_logs"`
}
// TemplateDump is a snapshot of a template selected via Deps.TemplateID.
// Run fills it in step by step, so later fields stay at their zero value
// when an earlier fetch fails.
type TemplateDump struct {
// Template is the template itself.
Template codersdk.Template `json:"template"`
// TemplateVersion is the template's active version.
TemplateVersion codersdk.TemplateVersion `json:"template_version"`
// TemplateFileBase64 is the zip archive of the active version's job file,
// encoded with standard base64.
TemplateFileBase64 string `json:"template_file_base64"`
}
// Pprof groups the profiling data collected from the server and, optionally,
// from a workspace agent. Either side is omitted from JSON when nil.
type Pprof struct {
// Server is the profiling data collected from coderd.
Server *PprofCollection `json:"server,omitempty"`
// Agent is the profiling data collected from the agent given by Deps.AgentID.
Agent *PprofCollection `json:"agent,omitempty"`
}
// PprofCollection is one set of pprof profiles from a single source.
//
// The binary profile fields are passed through compressData before being
// stored, so they normally contain gzip data (compressData returns its input
// unchanged if compression fails or the input is empty).
type PprofCollection struct {
// Heap is the heap profile.
Heap []byte `json:"heap,omitempty"`
// Allocs is the allocations profile.
Allocs []byte `json:"allocs,omitempty"`
// Profile is the CPU profile.
Profile []byte `json:"profile,omitempty"`
// Block is the block profile.
Block []byte `json:"block,omitempty"`
// Mutex is the mutex profile.
Mutex []byte `json:"mutex,omitempty"`
// Goroutine is the goroutine profile.
Goroutine []byte `json:"goroutine,omitempty"`
// Threadcreate is the threadcreate profile.
Threadcreate []byte `json:"threadcreate,omitempty"`
// Trace is the execution trace.
Trace []byte `json:"trace,omitempty"`
// Cmdline is the raw /cmdline response. Only the per-endpoint collectors
// (PprofInfo, PprofInfoFromAgent) set it; PprofInfoFromArchive does not.
Cmdline string `json:"cmdline,omitempty"`
// Symbol is the raw /symbol response. Only the per-endpoint collectors set it.
Symbol string `json:"symbol,omitempty"`
// CollectedAt is the local time recorded by the collector.
CollectedAt time.Time `json:"collected_at"`
// EndpointURL identifies the source: the resolved server endpoint URL (or
// its path when the client has no URL), or "agent" for agent collections.
EndpointURL string `json:"endpoint_url"`
}
// Deps is the set of dependencies and options Run uses to discover
// information. Client is required; Run returns an error when it is nil.
type Deps struct {
// Source from which to obtain information.
Client *codersdk.Client
// Log is where to log any informational or warning messages.
// Run replaces it with a logger that additionally captures output for the
// bundle and installs that logger on Client.
Log slog.Logger
// WorkspaceID is the optional workspace against which to run connection tests.
WorkspaceID uuid.UUID
// AgentID is the optional agent ID against which to run connection tests.
// Defaults to the first agent of the workspace, if not specified.
// NOTE(review): no defaulting happens in the visible part of this file;
// presumably the caller resolves it — confirm.
AgentID uuid.UUID
// WorkspacesTotalCap limits the TOTAL number of workspaces aggregated into the bundle.
// > 0 => cap at this number (default flag value should be 1000 via CLI).
// <= 0 => no cap (fetch/keep all available workspaces).
WorkspacesTotalCap int
// TemplateID optionally specifies a template to capture (active version).
TemplateID uuid.UUID
// CollectPprof toggles server and agent pprof collection.
CollectPprof bool
}
// DeploymentInfo collects deployment-wide information from coderd.
//
// All requests run concurrently and collection is best-effort: the first hard
// failure is logged and whatever was gathered is still returned, so any field
// of the result may be left at its zero value. Requests the caller is not
// permitted to make are downgraded to warnings.
//
// workspacesCap limits the total number of workspaces aggregated across
// pages; a value <= 0 means no cap. Once the concurrent phase is done,
// coderd's Prometheus metrics are fetched if the deployment config reports
// Prometheus as enabled.
func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, workspacesCap int) Deployment {
	// Note: each goroutine assigns to a different struct field, hence no mutex.
	var (
		d  Deployment
		eg errgroup.Group
	)

	eg.Go(func() error {
		bi, err := client.BuildInfo(ctx)
		if err != nil {
			return xerrors.Errorf("fetch build info: %w", err)
		}
		d.BuildInfo = &bi
		return nil
	})

	eg.Go(func() error {
		dc, err := client.DeploymentConfig(ctx)
		if err != nil {
			return xerrors.Errorf("fetch deployment config: %w", err)
		}
		d.Config = dc
		return nil
	})

	eg.Go(func() error {
		hr, err := healthsdk.New(client).DebugHealth(ctx)
		if err != nil {
			return xerrors.Errorf("fetch health report: %w", err)
		}
		d.HealthReport = &hr
		return nil
	})

	eg.Go(func() error {
		exp, err := client.Experiments(ctx)
		if err != nil {
			return xerrors.Errorf("fetch experiments: %w", err)
		}
		d.Experiments = exp
		return nil
	})

	eg.Go(func() error {
		licenses, err := client.Licenses(ctx)
		if err != nil {
			// Ignore 404 because AGPL doesn't have this endpoint. Every other
			// failure is reported, including errors that are not codersdk
			// errors (e.g. transport failures), which used to be dropped
			// silently.
			cerr, ok := codersdk.AsError(err)
			if !ok || cerr.StatusCode() != http.StatusNotFound {
				return xerrors.Errorf("fetch license status: %w", err)
			}
		}
		if licenses == nil {
			licenses = make([]codersdk.License, 0)
		}
		d.Licenses = licenses
		return nil
	})

	// Deployment stats
	eg.Go(func() error {
		stats, err := client.DeploymentStats(ctx)
		if err != nil {
			// If unauthorized or forbidden, log and continue
			if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized || cerr.StatusCode() == http.StatusBadRequest) {
				log.Warn(ctx, "unable to fetch deployment stats", slog.Error(err))
				return nil
			}
			return xerrors.Errorf("fetch deployment stats: %w", err)
		}
		d.Stats = &stats
		return nil
	})

	// Entitlements
	eg.Go(func() error {
		ents, err := client.Entitlements(ctx)
		if err != nil {
			// Ignore 404 or enterprise-not-enabled
			if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusNotFound || cerr.StatusCode() == http.StatusForbidden) {
				log.Warn(ctx, "unable to fetch entitlements", slog.Error(err))
				return nil
			}
			return xerrors.Errorf("fetch entitlements: %w", err)
		}
		d.Entitlements = &ents
		return nil
	})

	// Health settings
	eg.Go(func() error {
		settings, err := healthsdk.New(client).HealthSettings(ctx)
		if err != nil {
			// If not accessible, log and continue
			if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized) {
				log.Warn(ctx, "unable to fetch health settings", slog.Error(err))
				return nil
			}
			return xerrors.Errorf("fetch health settings: %w", err)
		}
		d.HealthSettings = &settings
		return nil
	})

	// List workspaces (paginated)
	eg.Go(func() error {
		var (
			offset int
			limit  = 200
			all    []codersdk.Workspace
			count  int
		)
		capTotal := workspacesCap
		for {
			resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{Offset: offset, Limit: limit})
			if err != nil {
				// Log and continue if forbidden; otherwise return error
				if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized) {
					log.Warn(ctx, "unable to list workspaces", slog.Error(err))
					break
				}
				return xerrors.Errorf("list workspaces: %w", err)
			}
			// Keep the first page's response as the container; its workspace
			// list and count are overwritten with the aggregate below.
			if d.Workspaces == nil {
				d.Workspaces = &resp
			}
			// sanitize env vars on agents in each workspace before appending
			for i := range resp.Workspaces {
				ws := &resp.Workspaces[i]
				for _, res := range ws.LatestBuild.Resources {
					for _, agt := range res.Agents {
						// safe to call even if map is nil (range in sanitizeEnv would be empty)
						sanitizeEnv(agt.EnvironmentVariables)
					}
				}
			}
			all = append(all, resp.Workspaces...)
			count = resp.Count
			// Stop early once we've reached the cap; trim any overflow from the last page.
			if capTotal > 0 && len(all) >= capTotal {
				if len(all) > capTotal {
					all = all[:capTotal]
				}
				break
			}
			// Stop when the server-reported total is covered, or when an
			// empty page would otherwise make the loop spin forever.
			if offset+len(resp.Workspaces) >= count || len(resp.Workspaces) == 0 {
				break
			}
			offset += len(resp.Workspaces)
		}
		if d.Workspaces != nil {
			// Replace with aggregated list
			d.Workspaces.Workspaces = all
			// Preserve server-reported total so Run() can log accurate truncation.
			d.Workspaces.Count = count
		}
		return nil
	})

	if err := eg.Wait(); err != nil {
		log.Error(ctx, "fetch deployment information", slog.Error(err))
	}

	// The metrics fetch depends on the deployment config, so it can only run
	// after the concurrent phase has finished.
	if d.Config != nil && d.Config.Values != nil {
		prometheusCfg := d.Config.Values.Prometheus
		if prometheusCfg.Enable.Value() {
			metrics, err := fetchPrometheusMetrics(ctx, client, log)
			if err != nil {
				log.Warn(ctx, "fetch coderd prometheus metrics", slog.Error(err))
			} else {
				d.Prometheus = metrics
			}
		}
	}

	return d
}
// fetchPrometheusMetrics downloads coderd's Prometheus metrics from
// /api/v2/debug/metrics and returns the body with surrounding whitespace
// removed.
//
// The request is bounded by a 10 second timeout derived from ctx. An error is
// returned for a nil client, a failed request or read, a non-200 status, or a
// body that is empty after trimming. The returned slice is a fresh copy and
// does not alias the response buffer.
func fetchPrometheusMetrics(ctx context.Context, client *codersdk.Client, log slog.Logger) ([]byte, error) {
	if client == nil {
		return nil, xerrors.New("nil client")
	}

	timeoutCtx, stop := context.WithTimeout(ctx, 10*time.Second)
	defer stop()

	res, err := client.Request(timeoutCtx, http.MethodGet, "/api/v2/debug/metrics", nil)
	if err != nil {
		return nil, xerrors.Errorf("request metrics: %w", err)
	}
	defer res.Body.Close()

	// The body is read before the status check so its length can be logged.
	raw, err := io.ReadAll(res.Body)
	if err != nil {
		return nil, xerrors.Errorf("read metrics body: %w", err)
	}

	if status := res.StatusCode; status != http.StatusOK {
		log.Debug(ctx, "coderd prometheus metrics fetch non-200",
			slog.F("status", status), slog.F("body_len", len(raw)))
		return nil, xerrors.Errorf("unexpected status code %d", status)
	}

	metrics := bytes.TrimSpace(raw)
	if len(metrics) == 0 {
		return nil, xerrors.New("empty prometheus metrics response")
	}

	// Detach the result from the (potentially larger) read buffer.
	out := make([]byte, len(metrics))
	copy(out, metrics)
	return out, nil
}
// NetworkInfo gathers networking diagnostics: the coordinator and tailnet
// debug pages from coderd, a DERP health report based on the generic agent
// connection info, and a report on the local network interfaces.
//
// The four collectors run concurrently and each writes to its own field of
// the result. Collection is best-effort: the first failure is logged and the
// partially filled Network is returned.
func NetworkInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) Network {
	var (
		info  Network
		group errgroup.Group
	)

	// Coordinator debug page.
	group.Go(func() error {
		res, err := client.Request(ctx, http.MethodGet, "/api/v2/debug/coordinator", nil)
		if err != nil {
			return xerrors.Errorf("fetch coordinator debug page: %w", err)
		}
		defer res.Body.Close()

		page, err := io.ReadAll(res.Body)
		if err != nil {
			return xerrors.Errorf("read coordinator debug page: %w", err)
		}
		info.CoordinatorDebug = string(page)
		return nil
	})

	// Tailnet debug page.
	group.Go(func() error {
		res, err := client.Request(ctx, http.MethodGet, "/api/v2/debug/tailnet", nil)
		if err != nil {
			return xerrors.Errorf("fetch tailnet debug page: %w", err)
		}
		defer res.Body.Close()

		page, err := io.ReadAll(res.Body)
		if err != nil {
			return xerrors.Errorf("read tailnet debug page: %w", err)
		}
		info.TailnetDebug = string(page)
		return nil
	})

	// DERP health report; the DERP map comes from the connection info.
	group.Go(func() error {
		connInfo, err := workspacesdk.New(client).AgentConnectionInfoGeneric(ctx)
		if err != nil {
			// Not fatal: the rest of the network information is still useful.
			log.Warn(ctx, "unable to fetch generic agent connection info")
			return nil
		}
		info.ConnectionInfo = connInfo

		report := new(derphealth.Report)
		report.Run(ctx, &derphealth.ReportOptions{
			DERPMap: connInfo.DERPMap,
		})
		info.Netcheck = report
		return nil
	})

	// Local network interfaces.
	group.Go(func() error {
		ifaces, err := healthsdk.RunInterfacesReport()
		if err != nil {
			return xerrors.Errorf("run interfaces report: %w", err)
		}
		info.Interfaces = ifaces
		return nil
	})

	if err := group.Wait(); err != nil {
		log.Error(ctx, "fetch network information", slog.Error(err))
	}
	return info
}
// WorkspaceInfo gathers information about a single workspace: the workspace
// itself (with agent environment variable values redacted), the provisioner
// logs and parameters of its latest build, its template, and the template's
// active version together with the base64-encoded template archive.
//
// The workspace is fetched first because everything else depends on it; the
// remaining requests run concurrently, each writing to its own field.
// Collection is best-effort: failures are logged and whatever was gathered is
// returned. A nil workspaceID or a failed workspace fetch yields the zero
// Workspace.
func WorkspaceInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, workspaceID uuid.UUID) Workspace {
	var (
		w  Workspace
		eg errgroup.Group
	)

	if workspaceID == uuid.Nil {
		log.Error(ctx, "no workspace id specified")
		return w
	}

	// dependency, cannot fetch concurrently
	ws, err := client.Workspace(ctx, workspaceID)
	if err != nil {
		log.Error(ctx, "fetch workspace", slog.Error(err), slog.F("workspace_id", workspaceID))
		return w
	}
	// Maps are reference types, so redacting through the range copies
	// modifies the workspace's own environment maps.
	for _, res := range ws.LatestBuild.Resources {
		for _, agt := range res.Agents {
			sanitizeEnv(agt.EnvironmentVariables)
		}
	}
	w.Workspace = ws

	eg.Go(func() error {
		buildLogCh, closer, err := client.WorkspaceBuildLogsAfter(ctx, ws.LatestBuild.ID, 0)
		if err != nil {
			return xerrors.Errorf("fetch provisioner job logs: %w", err)
		}
		defer closer.Close()
		// Named entry rather than log so the logger is not shadowed.
		for entry := range buildLogCh {
			w.BuildLogs = append(w.BuildLogs, entry)
		}
		return nil
	})

	eg.Go(func() error {
		if w.Workspace.TemplateActiveVersionID == uuid.Nil {
			return xerrors.New("workspace has nil template active version id")
		}
		tv, err := client.TemplateVersion(ctx, w.Workspace.TemplateActiveVersionID)
		if err != nil {
			return xerrors.Errorf("fetch template active version: %w", err)
		}
		w.TemplateVersion = tv

		if tv.Job.FileID == uuid.Nil {
			return xerrors.New("template file id is nil")
		}
		raw, ctype, err := client.DownloadWithFormat(ctx, tv.Job.FileID, codersdk.FormatZip)
		if err != nil {
			return xerrors.Errorf("download template file: %w", err)
		}
		if ctype != codersdk.ContentTypeZip {
			return xerrors.Errorf("expected content-type %s, got %s", codersdk.ContentTypeZip, ctype)
		}
		w.TemplateFileBase64 = base64.StdEncoding.EncodeToString(raw)
		return nil
	})

	eg.Go(func() error {
		if w.Workspace.TemplateID == uuid.Nil {
			return xerrors.New("workspace has nil template id")
		}
		tpl, err := client.Template(ctx, w.Workspace.TemplateID)
		if err != nil {
			return xerrors.Errorf("fetch template: %w", err)
		}
		w.Template = tpl
		return nil
	})

	eg.Go(func() error {
		if ws.LatestBuild.ID == uuid.Nil {
			return xerrors.New("workspace has nil latest build id")
		}
		params, err := client.WorkspaceBuildParameters(ctx, ws.LatestBuild.ID)
		if err != nil {
			return xerrors.Errorf("fetch workspace build parameters: %w", err)
		}
		w.Parameters = params
		return nil
	})

	if err := eg.Wait(); err != nil {
		log.Error(ctx, "fetch workspace information", slog.Error(err))
	}

	return w
}
// AgentInfo gathers information about a single workspace agent.
//
// The agent record and its logs are fetched through the coderd API, while
// connectedAgentInfo schedules the collectors that need a live connection to
// the agent on the same errgroup. Collection is best-effort: the first
// failure is logged and the partially filled Agent is returned. A nil agentID
// yields the zero Agent.
func AgentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, agentID uuid.UUID) Agent {
	var (
		info  Agent
		group errgroup.Group
	)

	if agentID == uuid.Nil {
		log.Error(ctx, "no agent id specified")
		return info
	}

	// Agent record, with environment variable values redacted.
	group.Go(func() error {
		agent, err := client.WorkspaceAgent(ctx, agentID)
		if err != nil {
			return xerrors.Errorf("fetch workspace agent: %w", err)
		}
		sanitizeEnv(agent.EnvironmentVariables)
		info.Agent = &agent
		return nil
	})

	// Agent logs, delivered in chunks until the channel is closed.
	group.Go(func() error {
		chunks, logCloser, err := client.WorkspaceAgentLogsAfter(ctx, agentID, 0, false)
		if err != nil {
			return xerrors.Errorf("fetch agent startup logs: %w", err)
		}
		defer logCloser.Close()

		var startup []codersdk.WorkspaceAgentLog
		for chunk := range chunks {
			startup = append(startup, chunk...)
		}
		info.StartupLogs = startup
		return nil
	})

	// to simplify control flow, fetching information directly from
	// the agent is handled in a separate function
	closeConn := connectedAgentInfo(ctx, client, log, agentID, &group, &info)
	// Runs after group.Wait below, i.e. once every collector using the
	// connection has finished.
	defer closeConn()

	if err := group.Wait(); err != nil {
		log.Error(ctx, "fetch agent information", slog.Error(err))
	}
	return info
}
// connectedAgentInfo dials the agent and schedules, on eg, the collectors
// that need a live connection. Each collector fills in a distinct field of a.
//
// The returned closer is never nil. When the agent was dialed and is
// reachable, it closes the connection and waits for the tailnet connection to
// shut down; the caller must invoke it only after eg.Wait() has returned,
// because the scheduled goroutines use the connection. If the dial fails or
// the agent does not become reachable, nothing is scheduled and the closer is
// a no-op. In the unreachable case the connection is closed before returning
// so that it is not leaked.
func connectedAgentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, agentID uuid.UUID, eg *errgroup.Group, a *Agent) (closer func()) {
	noop := func() {}

	conn, err := workspacesdk.New(client).
		DialAgent(ctx, agentID, &workspacesdk.DialAgentOptions{
			Logger:         log.Named("dial-agent"),
			BlockEndpoints: false,
		})
	if err != nil {
		log.Error(ctx, "dial agent", slog.Error(err))
		return noop
	}

	closeConn := func() {
		if err := conn.Close(); err != nil {
			log.Error(ctx, "failed to close agent connection", slog.Error(err))
		}
		<-conn.TailnetConn().Closed()
	}

	if !conn.AwaitReachable(ctx) {
		log.Error(ctx, "timed out waiting for agent")
		// The dial succeeded, so the connection must still be released even
		// though no collector will use it.
		closeConn()
		return noop
	}

	eg.Go(func() error {
		// The magicsock debug handler is an http.Handler; drive it with a
		// synthetic request and capture the rendered page in a recorder.
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost/", nil)
		if err != nil {
			return xerrors.Errorf("create request: %w", err)
		}
		rr := httptest.NewRecorder()
		conn.TailnetConn().MagicsockServeHTTPDebug(rr, req)
		a.ClientMagicsockHTML = rr.Body.Bytes()
		return nil
	})

	eg.Go(func() error {
		promRes, err := conn.PrometheusMetrics(ctx)
		if err != nil {
			return xerrors.Errorf("fetch agent prometheus metrics: %w", err)
		}
		a.Prometheus = promRes
		return nil
	})

	eg.Go(func() error {
		_, _, pingRes, err := conn.Ping(ctx)
		if err != nil {
			return xerrors.Errorf("ping agent: %w", err)
		}
		a.PingResult = pingRes
		return nil
	})

	eg.Go(func() error {
		pds := conn.GetPeerDiagnostics()
		a.PeerDiagnostics = &pds
		return nil
	})

	eg.Go(func() error {
		msBytes, err := conn.DebugMagicsock(ctx)
		if err != nil {
			return xerrors.Errorf("get agent magicsock page: %w", err)
		}
		a.AgentMagicsockHTML = msBytes
		return nil
	})

	eg.Go(func() error {
		manifestRes, err := conn.DebugManifest(ctx)
		if err != nil {
			return xerrors.Errorf("fetch manifest: %w", err)
		}
		if err := json.NewDecoder(bytes.NewReader(manifestRes)).Decode(&a.Manifest); err != nil {
			return xerrors.Errorf("decode agent manifest: %w", err)
		}
		// A JSON "null" payload decodes successfully but leaves the pointer
		// nil, so guard before touching the manifest.
		if a.Manifest != nil {
			sanitizeEnv(a.Manifest.EnvironmentVariables)
		}
		return nil
	})

	eg.Go(func() error {
		logBytes, err := conn.DebugLogs(ctx)
		if err != nil {
			return xerrors.Errorf("fetch coder agent logs: %w", err)
		}
		a.Logs = logBytes
		return nil
	})

	eg.Go(func() error {
		lps, err := conn.ListeningPorts(ctx)
		if err != nil {
			return xerrors.Errorf("get listening ports: %w", err)
		}
		a.ListeningPorts = &lps
		return nil
	})

	return closeConn
}
// PprofInfo collects pprof data from coderd through the individual
// /api/v2/debug/pprof/* endpoints.
//
// Every endpoint is requested concurrently. Ordinary endpoints get a 10
// second timeout; the two 30-second sampling endpoints (CPU profile and
// trace) get 45 seconds. Collection is best-effort: a failed request, a
// non-200 status or a failed read is logged as a warning and leaves the
// corresponding field empty. Binary profiles are stored via compressData,
// while cmdline and symbol are stored as plain text.
//
// It returns nil only when client is nil.
func PprofInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) *PprofCollection {
	if client == nil {
		return nil
	}

	const basePath = "/api/v2/debug/pprof"

	var (
		result PprofCollection
		group  errgroup.Group
	)

	// Prefer the absolute URL; fall back to the bare path if the client has
	// no base URL or resolving against it fails.
	if client.URL != nil {
		if resolved, err := client.URL.Parse(basePath); err == nil {
			result.EndpointURL = resolved.String()
		}
	}
	if result.EndpointURL == "" {
		result.EndpointURL = basePath
	}
	result.CollectedAt = time.Now()

	// Sinks for the two kinds of payload. Each one targets a distinct field,
	// so the concurrent writers never touch the same memory.
	compressed := func(dst *[]byte) func([]byte) {
		return func(raw []byte) { *dst = compressData(raw) }
	}
	plain := func(dst *string) func([]byte) {
		return func(raw []byte) { *dst = string(raw) }
	}

	targets := []struct {
		suffix string
		store  func([]byte)
	}{
		{"/allocs", compressed(&result.Allocs)},
		{"/heap", compressed(&result.Heap)},
		{"/profile?seconds=30", compressed(&result.Profile)},
		{"/block", compressed(&result.Block)},
		{"/mutex", compressed(&result.Mutex)},
		{"/goroutine", compressed(&result.Goroutine)},
		{"/threadcreate", compressed(&result.Threadcreate)},
		{"/trace?seconds=30", compressed(&result.Trace)},
		{"/cmdline", plain(&result.Cmdline)},
		{"/symbol", plain(&result.Symbol)},
	}

	// fetch builds the task that downloads one endpoint and hands the body
	// to store. It never returns an error; failures are only logged.
	fetch := func(suffix string, store func([]byte)) func() error {
		return func() error {
			timeout := 10 * time.Second
			if strings.Contains(suffix, "seconds=30") {
				timeout = 45 * time.Second
			}
			reqCtx, cancel := context.WithTimeout(ctx, timeout)
			defer cancel()

			resp, err := client.Request(reqCtx, http.MethodGet, basePath+suffix, nil)
			if err != nil {
				log.Warn(reqCtx, "failed to fetch pprof data", slog.F("endpoint", suffix), slog.Error(err))
				return nil
			}
			defer resp.Body.Close()

			if resp.StatusCode != http.StatusOK {
				log.Warn(reqCtx, "pprof endpoint returned non-200 status",
					slog.F("endpoint", suffix), slog.F("status", resp.StatusCode))
				return nil
			}

			payload, err := io.ReadAll(resp.Body)
			if err != nil {
				log.Warn(reqCtx, "failed to read pprof response", slog.F("endpoint", suffix), slog.Error(err))
				return nil
			}
			store(payload)
			return nil
		}
	}

	for _, target := range targets {
		group.Go(fetch(target.suffix, target.store))
	}

	if err := group.Wait(); err != nil {
		log.Error(ctx, "failed to collect some pprof data", slog.Error(err))
	}
	return &result
}
// compressData gzips data with the default compression level and returns the
// compressed bytes.
//
// Compression is best-effort: empty input is returned as-is, and if writing
// or finalizing the gzip stream fails the original, uncompressed data is
// returned instead.
func compressData(data []byte) []byte {
	if len(data) == 0 {
		return data
	}

	out := new(bytes.Buffer)
	zw := gzip.NewWriter(out)

	_, err := zw.Write(data)
	if err == nil {
		// Close flushes the remaining data and writes the gzip footer.
		err = zw.Close()
	}
	if err != nil {
		return data
	}
	return out.Bytes()
}
// PprofInfoFromArchive uses the consolidated /api/v2/debug/profile endpoint
// to collect pprof data in a single request. The server temporarily enables
// block/mutex profiling, runs time-based profiles for the given duration,
// takes snapshots, and returns a tar.gz archive.
//
// duration is forwarded to the server as the length of the time-based
// profiles. Archive entries are matched by base name ("cpu.prof",
// "heap.prof", "trace.out", ...) and stored via compressData; unknown
// entries are logged at debug level and skipped.
//
// It returns an error, and no collection, when client is nil, the request or
// download fails, or the archive cannot be parsed.
func PprofInfoFromArchive(ctx context.Context, client *codersdk.Client, log slog.Logger, duration time.Duration) (*PprofCollection, error) {
	if client == nil {
		return nil, xerrors.New("client is nil")
	}

	body, err := client.DebugCollectProfile(ctx, codersdk.DebugProfileOptions{
		Duration: duration,
		// Use the server defaults plus trace.
		Profiles: []string{"cpu", "heap", "allocs", "block", "mutex", "goroutine", "threadcreate", "trace"},
	})
	if err != nil {
		return nil, xerrors.Errorf("fetch consolidated profile: %w", err)
	}
	defer body.Close()

	data, err := io.ReadAll(body)
	if err != nil {
		return nil, xerrors.Errorf("read profile archive: %w", err)
	}

	var p PprofCollection
	if client.URL != nil {
		if u, err := client.URL.Parse("/api/v2/debug/profile"); err == nil {
			p.EndpointURL = u.String()
		}
	}
	if p.EndpointURL == "" {
		p.EndpointURL = "/api/v2/debug/profile"
	}
	p.CollectedAt = time.Now()

	// Parse the tar.gz archive and populate the PprofCollection.
	gr, err := gzip.NewReader(bytes.NewReader(data))
	if err != nil {
		return nil, xerrors.Errorf("open gzip reader: %w", err)
	}
	defer gr.Close()

	tr := tar.NewReader(gr)
	for {
		hdr, err := tr.Next()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			// hdr is nil whenever Next fails, so it must not be dereferenced
			// to build the error message.
			return nil, xerrors.Errorf("read tar entry: %w", err)
		}

		content, err := io.ReadAll(tr)
		if err != nil {
			log.Warn(ctx, "failed to read tar entry", slog.F("name", hdr.Name), slog.Error(err))
			continue
		}

		// Files in the archive are named like "cpu.prof", "heap.prof",
		// "trace.out", etc. Compress binary profile data for storage in
		// the bundle, matching what PprofInfo() does.
		base := path.Base(hdr.Name)
		switch base {
		case "cpu.prof":
			p.Profile = compressData(content)
		case "heap.prof":
			p.Heap = compressData(content)
		case "allocs.prof":
			p.Allocs = compressData(content)
		case "block.prof":
			p.Block = compressData(content)
		case "mutex.prof":
			p.Mutex = compressData(content)
		case "goroutine.prof":
			p.Goroutine = compressData(content)
		case "threadcreate.prof":
			p.Threadcreate = compressData(content)
		case "trace.out":
			p.Trace = compressData(content)
		default:
			log.Debug(ctx, "unknown profile in archive", slog.F("name", hdr.Name))
		}
	}

	return &p, nil
}
// PprofInfoFromAgent collects pprof data from a workspace agent over an
// established agent connection.
//
// Each endpoint is fetched concurrently. Every fetch dials 127.0.0.1:6060
// through conn and issues a plain HTTP GET over that dedicated connection.
// Ordinary endpoints get a 10 second timeout; the two 30-second sampling
// endpoints (CPU profile and trace) get 45 seconds. Collection is
// best-effort: failures are logged as warnings and leave the corresponding
// field empty. Binary profiles are stored via compressData, while cmdline
// and symbol are stored as plain text.
//
// It returns nil only when conn is nil.
func PprofInfoFromAgent(ctx context.Context, conn workspacesdk.AgentConn, log slog.Logger) *PprofCollection {
	if conn == nil {
		return nil
	}

	// Agent pprof server runs on 127.0.0.1:6060 by default
	const agentPprofAddr = "127.0.0.1:6060"

	var (
		result PprofCollection
		group  errgroup.Group
	)
	result.EndpointURL = "agent"
	result.CollectedAt = time.Now()

	// Sinks for the two kinds of payload. Each one targets a distinct field,
	// so the concurrent writers never touch the same memory.
	compressed := func(dst *[]byte) func([]byte) {
		return func(raw []byte) { *dst = compressData(raw) }
	}
	plain := func(dst *string) func([]byte) {
		return func(raw []byte) { *dst = string(raw) }
	}

	// Agent pprof endpoints; these are reached through the agent connection.
	targets := []struct {
		endpoint string
		store    func([]byte)
	}{
		{"/debug/pprof/allocs", compressed(&result.Allocs)},
		{"/debug/pprof/heap", compressed(&result.Heap)},
		{"/debug/pprof/profile?seconds=30", compressed(&result.Profile)},
		{"/debug/pprof/block", compressed(&result.Block)},
		{"/debug/pprof/mutex", compressed(&result.Mutex)},
		{"/debug/pprof/goroutine", compressed(&result.Goroutine)},
		{"/debug/pprof/threadcreate", compressed(&result.Threadcreate)},
		{"/debug/pprof/trace?seconds=30", compressed(&result.Trace)},
		{"/debug/pprof/cmdline", plain(&result.Cmdline)},
		{"/debug/pprof/symbol", plain(&result.Symbol)},
	}

	// fetch builds the task that downloads one endpoint and hands the body
	// to store. It never returns an error; failures are only logged.
	fetch := func(endpoint string, store func([]byte)) func() error {
		return func() error {
			// Profile and trace sample for 30 seconds, so they need more time.
			timeout := 10 * time.Second
			if strings.Contains(endpoint, "seconds=30") {
				timeout = 45 * time.Second
			}
			reqCtx, cancel := context.WithTimeout(ctx, timeout)
			defer cancel()

			tunnel, err := conn.DialContext(reqCtx, "tcp", agentPprofAddr)
			if err != nil {
				log.Warn(reqCtx, "failed to dial agent pprof endpoint", slog.F("endpoint", endpoint), slog.Error(err))
				return nil
			}
			defer tunnel.Close()

			// The transport ignores the requested address and always hands
			// back the connection that was dialed through the agent.
			httpClient := &http.Client{
				Transport: &http.Transport{
					DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
						return tunnel, nil
					},
				},
				Timeout: timeout,
			}

			req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, "http://"+agentPprofAddr+endpoint, nil)
			if err != nil {
				log.Warn(reqCtx, "failed to create agent pprof request", slog.F("endpoint", endpoint), slog.Error(err))
				return nil
			}

			resp, err := httpClient.Do(req)
			if err != nil {
				log.Warn(reqCtx, "failed to fetch agent pprof data", slog.F("endpoint", endpoint), slog.Error(err))
				return nil
			}
			defer resp.Body.Close()

			if resp.StatusCode != http.StatusOK {
				log.Warn(reqCtx, "agent pprof endpoint returned non-200 status", slog.F("endpoint", endpoint), slog.F("status", resp.StatusCode))
				return nil
			}

			payload, err := io.ReadAll(resp.Body)
			if err != nil {
				log.Warn(reqCtx, "failed to read agent pprof response", slog.F("endpoint", endpoint), slog.Error(err))
				return nil
			}
			store(payload)
			return nil
		}
	}

	// Collect each endpoint in parallel
	for _, target := range targets {
		group.Go(fetch(target.endpoint, target.store))
	}

	if err := group.Wait(); err != nil {
		log.Error(ctx, "failed to collect some agent pprof data", slog.Error(err))
	}
	return &result
}
// Run generates a support bundle with the given dependencies.
//
// It first verifies that the caller may read the deployment config, then runs
// the deployment, workspace, network, agent and optional template collectors
// concurrently, and finally collects pprof data if d.CollectPprof is set.
//
// Run modifies d: d.Log is replaced with a logger that also writes to an
// in-memory sink, and that logger is installed on d.Client. The captured
// output ends up in Bundle.Logs.
//
// It returns a nil bundle when d.Client is nil. If the authorization check
// fails, it returns a non-nil bundle (containing only logs) together with the
// error. Failures inside the collectors are logged rather than returned.
func Run(ctx context.Context, d *Deps) (*Bundle, error) {
var b Bundle
if d.Client == nil {
return nil, xerrors.Errorf("developer error: missing client!")
}
authChecks := map[string]codersdk.AuthorizationCheck{
"Read DeploymentValues": {
Object: codersdk.AuthorizationObject{
ResourceType: codersdk.ResourceDeploymentConfig,
},
Action: codersdk.ActionRead,
},
}
// Ensure we capture logs from the client.
var logw strings.Builder
d.Log = d.Log.AppendSinks(sloghuman.Sink(&logw))
d.Client.SetLogger(d.Log)
// Callers receive &b, so this assignment made on the way out is visible to
// them on every return path below.
defer func() {
b.Logs = strings.Split(logw.String(), "\n")
}()
authResp, err := d.Client.AuthCheck(ctx, codersdk.AuthorizationRequest{Checks: authChecks})
if err != nil {
return &b, xerrors.Errorf("check authorization: %w", err)
}
for k, v := range authResp {
if !v {
return &b, xerrors.Errorf("failed authorization check: cannot %s", k)
}
}
totalCap := d.WorkspacesTotalCap
// Each goroutine below writes to a different field of b and always returns
// nil; the collectors log their own failures.
var eg errgroup.Group
eg.Go(func() error {
di := DeploymentInfo(ctx, d.Client, d.Log, totalCap)
if di.Workspaces != nil && totalCap > 0 {
origTotal := di.Workspaces.Count // server-reported total
// Ensure at most 'totalCap' are returned (covers non-early-exit path).
if len(di.Workspaces.Workspaces) > totalCap {
di.Workspaces.Workspaces = di.Workspaces.Workspaces[:totalCap]
}
// If we returned fewer than the original total, log a truncation.
if origTotal > len(di.Workspaces.Workspaces) {
di.Workspaces.Count = len(di.Workspaces.Workspaces)
d.Log.Warn(ctx, "workspace list truncated",
slog.F("cap", totalCap),
slog.F("original_total", origTotal),
)
}
}
b.Deployment = di
return nil
})
eg.Go(func() error {
wi := WorkspaceInfo(ctx, d.Client, d.Log, d.WorkspaceID)
b.Workspace = wi
return nil
})
eg.Go(func() error {
ni := NetworkInfo(ctx, d.Client, d.Log)
b.Network = ni
return nil
})
eg.Go(func() error {
ai := AgentInfo(ctx, d.Client, d.Log, d.AgentID)
b.Agent = ai
return nil
})
// Optional: capture a template's active version and file if TemplateID is set.
// Whatever was fetched before a failure is still stored in b.NamedTemplate.
eg.Go(func() error {
if d.TemplateID == uuid.Nil {
return nil
}
var td TemplateDump
tpl, err := d.Client.Template(ctx, d.TemplateID)
if err != nil {
d.Log.Error(ctx, "fetch template", slog.Error(err), slog.F("template_id", d.TemplateID))
return nil
}
td.Template = tpl
if tpl.ActiveVersionID == uuid.Nil {
d.Log.Error(ctx, "template has nil active version id", slog.F("template_id", tpl.ID))
b.NamedTemplate = td
return nil
}
tv, err := d.Client.TemplateVersion(ctx, tpl.ActiveVersionID)
if err != nil {
d.Log.Error(ctx, "fetch active template version", slog.Error(err), slog.F("active_version_id", tpl.ActiveVersionID))
b.NamedTemplate = td
return nil
}
td.TemplateVersion = tv
if tv.Job.FileID == uuid.Nil {
d.Log.Error(ctx, "template file id is nil", slog.F("template_version_id", tv.ID))
b.NamedTemplate = td
return nil
}
raw, ctype, err := d.Client.DownloadWithFormat(ctx, tv.Job.FileID, codersdk.FormatZip)
// A download error and an unexpected content type share one log message;
// in the content-type case err is nil when it is logged.
if err != nil || ctype != codersdk.ContentTypeZip {
d.Log.Error(ctx, "download template file", slog.Error(err), slog.F("content_type", ctype))
b.NamedTemplate = td
return nil
}
td.TemplateFileBase64 = base64.StdEncoding.EncodeToString(raw)
b.NamedTemplate = td
return nil
})
// No goroutine above returns an error, so the result carries no information.
_ = eg.Wait()
// Collect pprof data after deployment info is available (need version check).
// Pprof endpoints require Coder server version 2.28.0 or newer.
if d.CollectPprof {
b.Pprof = collectPprof(ctx, d, &b)
}
return &b, nil
}
// minPprofVersion is the minimum Coder server version that supports
// the /api/v2/debug/pprof endpoints.
const minPprofVersion = "v2.28.0"

// VersionSupportsPprof reports whether a server with the given version
// exposes the pprof endpoints, i.e. whether the version is at least
// minPprofVersion.
//
// The leading "v" is optional. An empty version is reported as unsupported.
func VersionSupportsPprof(version string) bool {
	if version == "" {
		return false
	}
	if !strings.HasPrefix(version, "v") {
		version = "v" + version
	}

	// For prerelease versions like "v2.28.0-devel+abc123", we compare
	// the major.minor.patch portion since prereleases of 2.28.0 should
	// have the pprof feature. Everything from the first "-" on is dropped.
	release, _, _ := strings.Cut(semver.Canonical(version), "-")
	return semver.Compare(release, minPprofVersion) >= 0
}
// collectPprof gathers pprof data from the server and, when d.AgentID is set,
// from that agent.
//
// Collection is skipped (and an empty Pprof returned) when the bundle has no
// build info or the server version is older than minPprofVersion. For the
// server, the consolidated profile endpoint is tried first with a 30 second
// profiling duration; on any error the per-profile endpoints are used
// instead. Agent collection dials a dedicated connection, which is closed
// before this function returns.
func collectPprof(ctx context.Context, d *Deps, b *Bundle) Pprof {
	var collected Pprof

	// Check server version before attempting pprof collection.
	buildInfo := b.Deployment.BuildInfo
	if buildInfo == nil {
		d.Log.Warn(ctx, "skipping pprof collection: build info not available")
		return collected
	}
	if !VersionSupportsPprof(buildInfo.Version) {
		d.Log.Warn(ctx, "skipping pprof collection: server version too old",
			slog.F("version", buildInfo.Version),
			slog.F("min_version", minPprofVersion))
		return collected
	}

	// Try the consolidated /debug/profile endpoint first. It
	// temporarily enables block/mutex profiling on the server and
	// returns a single tar.gz archive.
	server, err := PprofInfoFromArchive(ctx, d.Client, d.Log, 30*time.Second)
	if err != nil {
		d.Log.Warn(ctx, "consolidated profile endpoint unavailable, falling back to individual endpoints",
			slog.Error(err))
		// Fall back to the legacy per-profile endpoint approach.
		server = PprofInfo(ctx, d.Client, d.Log)
	}
	collected.Server = server

	if d.AgentID == uuid.Nil {
		return collected
	}

	conn, err := workspacesdk.New(d.Client).
		DialAgent(ctx, d.AgentID, &workspacesdk.DialAgentOptions{
			Logger:         d.Log.Named("dial-agent-pprof"),
			BlockEndpoints: false,
		})
	if err != nil {
		d.Log.Warn(ctx, "failed to dial agent for pprof collection", slog.Error(err))
		return collected
	}
	// Release the connection on every path from here on, including the
	// unreachable case, and wait for the tailnet connection to shut down.
	defer func() {
		if err := conn.Close(); err != nil {
			d.Log.Error(ctx, "failed to close agent pprof connection", slog.Error(err))
		}
		<-conn.TailnetConn().Closed()
	}()

	if !conn.AwaitReachable(ctx) {
		d.Log.Warn(ctx, "agent not reachable for pprof collection")
		return collected
	}

	collected.Agent = PprofInfoFromAgent(ctx, conn, d.Log)
	return collected
}
// sanitizeEnv redacts kvs in place: every non-empty value is replaced with
// the string ***REDACTED***. Keys and empty values are left untouched, and a
// nil map is a no-op.
func sanitizeEnv(kvs map[string]string) {
	const redacted = "***REDACTED***"
	for name := range kvs {
		if kvs[name] == "" {
			continue
		}
		// Overwriting an existing key while ranging is safe in Go.
		kvs[name] = redacted
	}
}