mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
feat: support bundle updates to enable pprof and telemetry collection (#21486)
- Adds pprof collection support now that the listeners start automatically (requires Coder server 2.28.0+, includes a version check). Collects heap, allocs, profile (30s), block, mutex, goroutine, threadcreate, trace (30s), cmdline, and symbol. The capture runs for 30 seconds and a log line is emitted to say so. Enable capture by supplying the `--pprof` flag or the `CODER_SUPPORT_BUNDLE_PPROF` env var. pprof data is collected from both coderd and the Coder agent.
- Adds collection of Prometheus metrics (also requires 2.28.0+).
- Adds the ability to include a template in the bundle independently of supplying the details of a running workspace, via the `--template` flag or the `CODER_SUPPORT_BUNDLE_TEMPLATE` env var.
- Captures a list of workspaces the user has access to. Defaults to a max of 10, configurable via `--workspaces-total-cap` / `CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP`.
- Collects additional stats from the coderd deployment (aggregated workspace/session metrics), as well as entitlements via license and dismissed health checks.

Created with help from mux.
This commit is contained in:
+255
-6
@@ -7,6 +7,7 @@ import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@@ -44,13 +45,18 @@ var supportBundleBlurb = cliui.Bold("This will collect the following information
|
||||
` - Coder deployment version
|
||||
- Coder deployment Configuration (sanitized), including enabled experiments
|
||||
- Coder deployment health snapshot
|
||||
- Coder deployment stats (aggregated workspace/session metrics)
|
||||
- Entitlements (if available)
|
||||
- Health settings (dismissed healthchecks)
|
||||
- Coder deployment Network troubleshooting information
|
||||
- Workspace list accessible to the user (sanitized)
|
||||
- Workspace configuration, parameters, and build logs
|
||||
- Template version and source code for the given workspace
|
||||
- Agent details (with environment variable sanitized)
|
||||
- Agent network diagnostics
|
||||
- Agent logs
|
||||
- License status
|
||||
- pprof profiling data (if --pprof is enabled)
|
||||
` + cliui.Bold("Note: ") +
|
||||
cliui.Wrap("While we try to sanitize sensitive data from support bundles, we cannot guarantee that they do not contain information that you or your organization may consider sensitive.\n") +
|
||||
cliui.Bold("Please confirm that you will:\n") +
|
||||
@@ -61,6 +67,9 @@ var supportBundleBlurb = cliui.Bold("This will collect the following information
|
||||
func (r *RootCmd) supportBundle() *serpent.Command {
|
||||
var outputPath string
|
||||
var coderURLOverride string
|
||||
var workspacesTotalCap64 int64 = 10
|
||||
var templateName string
|
||||
var pprof bool
|
||||
cmd := &serpent.Command{
|
||||
Use: "bundle <workspace> [<agent>]",
|
||||
Short: "Generate a support bundle to troubleshoot issues connecting to a workspace.",
|
||||
@@ -121,8 +130,9 @@ func (r *RootCmd) supportBundle() *serpent.Command {
|
||||
}
|
||||
|
||||
var (
|
||||
wsID uuid.UUID
|
||||
agtID uuid.UUID
|
||||
wsID uuid.UUID
|
||||
agtID uuid.UUID
|
||||
templateID uuid.UUID
|
||||
)
|
||||
|
||||
if len(inv.Args) == 0 {
|
||||
@@ -155,6 +165,16 @@ func (r *RootCmd) supportBundle() *serpent.Command {
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve template by name if provided (captures active version)
|
||||
// Fallback: if canonical name lookup fails, match DisplayName (case-insensitive).
|
||||
if templateName != "" {
|
||||
id, err := resolveTemplateID(inv.Context(), client, templateName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
templateID = id
|
||||
}
|
||||
|
||||
if outputPath == "" {
|
||||
cwd, err := filepath.Abs(".")
|
||||
if err != nil {
|
||||
@@ -176,12 +196,25 @@ func (r *RootCmd) supportBundle() *serpent.Command {
|
||||
if r.verbose {
|
||||
clientLog.AppendSinks(sloghuman.Sink(inv.Stderr))
|
||||
}
|
||||
if pprof {
|
||||
_, _ = fmt.Fprintln(inv.Stderr, "pprof data collection will take approximately 30 seconds...")
|
||||
}
|
||||
|
||||
// Bypass rate limiting for support bundle collection since it makes many API calls.
|
||||
client.HTTPClient.Transport = &codersdk.HeaderTransport{
|
||||
Transport: client.HTTPClient.Transport,
|
||||
Header: http.Header{codersdk.BypassRatelimitHeader: {"true"}},
|
||||
}
|
||||
|
||||
deps := support.Deps{
|
||||
Client: client,
|
||||
// Support adds a sink so we don't need to supply one ourselves.
|
||||
Log: clientLog,
|
||||
WorkspaceID: wsID,
|
||||
AgentID: agtID,
|
||||
Log: clientLog,
|
||||
WorkspaceID: wsID,
|
||||
AgentID: agtID,
|
||||
WorkspacesTotalCap: int(workspacesTotalCap64),
|
||||
TemplateID: templateID,
|
||||
CollectPprof: pprof,
|
||||
}
|
||||
|
||||
bun, err := support.Run(inv.Context(), &deps)
|
||||
@@ -217,11 +250,102 @@ func (r *RootCmd) supportBundle() *serpent.Command {
|
||||
Description: "Override the URL to your Coder deployment. This may be useful, for example, if you need to troubleshoot a specific Coder replica.",
|
||||
Value: serpent.StringOf(&coderURLOverride),
|
||||
},
|
||||
{
|
||||
Flag: "workspaces-total-cap",
|
||||
Env: "CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP",
|
||||
Description: "Maximum number of workspaces to include in the support bundle. Set to 0 or negative value to disable the cap. Defaults to 10.",
|
||||
Value: serpent.Int64Of(&workspacesTotalCap64),
|
||||
},
|
||||
{
|
||||
Flag: "template",
|
||||
Env: "CODER_SUPPORT_BUNDLE_TEMPLATE",
|
||||
Description: "Template name to include in the support bundle. Use org_name/template_name if template name is reused across multiple organizations.",
|
||||
Value: serpent.StringOf(&templateName),
|
||||
},
|
||||
{
|
||||
Flag: "pprof",
|
||||
Env: "CODER_SUPPORT_BUNDLE_PPROF",
|
||||
Description: "Collect pprof profiling data from the Coder server and agent. Requires Coder server version 2.28.0 or newer.",
|
||||
Value: serpent.BoolOf(&pprof),
|
||||
},
|
||||
}
|
||||
|
||||
return cmd
|
||||
}
|
||||
|
||||
// Resolve a template to its ID, supporting:
|
||||
// - org/name form
|
||||
// - slug or display name match (case-insensitive) across all memberships
|
||||
func resolveTemplateID(ctx context.Context, client *codersdk.Client, templateArg string) (uuid.UUID, error) {
|
||||
orgPart := ""
|
||||
namePart := templateArg
|
||||
if slash := strings.IndexByte(templateArg, '/'); slash > 0 && slash < len(templateArg)-1 {
|
||||
orgPart = templateArg[:slash]
|
||||
namePart = templateArg[slash+1:]
|
||||
}
|
||||
|
||||
resolveInOrg := func(orgID uuid.UUID) (codersdk.Template, bool, error) {
|
||||
if t, err := client.TemplateByName(ctx, orgID, namePart); err == nil {
|
||||
return t, true, nil
|
||||
}
|
||||
tpls, err := client.TemplatesByOrganization(ctx, orgID)
|
||||
if err != nil {
|
||||
return codersdk.Template{}, false, nil
|
||||
}
|
||||
for _, t := range tpls {
|
||||
if strings.EqualFold(t.Name, namePart) || strings.EqualFold(t.DisplayName, namePart) {
|
||||
return t, true, nil
|
||||
}
|
||||
}
|
||||
return codersdk.Template{}, false, nil
|
||||
}
|
||||
|
||||
if orgPart != "" {
|
||||
org, err := client.OrganizationByName(ctx, orgPart)
|
||||
if err != nil {
|
||||
return uuid.Nil, xerrors.Errorf("get organization %q: %w", orgPart, err)
|
||||
}
|
||||
t, found, err := resolveInOrg(org.ID)
|
||||
if err != nil {
|
||||
return uuid.Nil, err
|
||||
}
|
||||
if !found {
|
||||
return uuid.Nil, xerrors.Errorf("template %q not found in organization %q", namePart, orgPart)
|
||||
}
|
||||
return t.ID, nil
|
||||
}
|
||||
|
||||
orgs, err := client.OrganizationsByUser(ctx, codersdk.Me)
|
||||
if err != nil {
|
||||
return uuid.Nil, xerrors.Errorf("get organizations: %w", err)
|
||||
}
|
||||
var (
|
||||
foundTpl codersdk.Template
|
||||
foundOrgs []string
|
||||
)
|
||||
for _, org := range orgs {
|
||||
if t, found, err := resolveInOrg(org.ID); err == nil && found {
|
||||
if len(foundOrgs) == 0 {
|
||||
foundTpl = t
|
||||
}
|
||||
foundOrgs = append(foundOrgs, org.Name)
|
||||
}
|
||||
}
|
||||
switch len(foundOrgs) {
|
||||
case 0:
|
||||
return uuid.Nil, xerrors.Errorf("template %q not found in your organizations", namePart)
|
||||
case 1:
|
||||
return foundTpl.ID, nil
|
||||
default:
|
||||
return uuid.Nil, xerrors.Errorf(
|
||||
"template %q found in multiple organizations (%s); use --template \"<org_name/%s>\" to target desired template.",
|
||||
namePart,
|
||||
strings.Join(foundOrgs, ", "),
|
||||
namePart,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// summarizeBundle makes a best-effort attempt to write a short summary
|
||||
// of the support bundle to the user's terminal.
|
||||
func summarizeBundle(inv *serpent.Invocation, bun *support.Bundle) {
|
||||
@@ -283,6 +407,10 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
|
||||
"deployment/config.json": src.Deployment.Config,
|
||||
"deployment/experiments.json": src.Deployment.Experiments,
|
||||
"deployment/health.json": src.Deployment.HealthReport,
|
||||
"deployment/stats.json": src.Deployment.Stats,
|
||||
"deployment/entitlements.json": src.Deployment.Entitlements,
|
||||
"deployment/health_settings.json": src.Deployment.HealthSettings,
|
||||
"deployment/workspaces.json": src.Deployment.Workspaces,
|
||||
"network/connection_info.json": src.Network.ConnectionInfo,
|
||||
"network/netcheck.json": src.Network.Netcheck,
|
||||
"network/interfaces.json": src.Network.Interfaces,
|
||||
@@ -302,6 +430,49 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Include named template artifacts (if requested)
|
||||
if src.NamedTemplate.Template.ID != uuid.Nil {
|
||||
name := src.NamedTemplate.Template.Name
|
||||
// JSON files
|
||||
for k, v := range map[string]any{
|
||||
"templates/" + name + "/template.json": src.NamedTemplate.Template,
|
||||
"templates/" + name + "/template_version.json": src.NamedTemplate.TemplateVersion,
|
||||
} {
|
||||
f, err := dest.Create(k)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("create file %q in archive: %w", k, err)
|
||||
}
|
||||
enc := json.NewEncoder(f)
|
||||
enc.SetIndent("", " ")
|
||||
if err := enc.Encode(v); err != nil {
|
||||
return xerrors.Errorf("write json to %q: %w", k, err)
|
||||
}
|
||||
}
|
||||
// Binary template file (zip)
|
||||
if namedZipBytes, err := base64.StdEncoding.DecodeString(src.NamedTemplate.TemplateFileBase64); err == nil {
|
||||
k := "templates/" + name + "/template_file.zip"
|
||||
f, err := dest.Create(k)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("create file %q in archive: %w", k, err)
|
||||
}
|
||||
if _, err := f.Write(namedZipBytes); err != nil {
|
||||
return xerrors.Errorf("write file %q in archive: %w", k, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var buildInfoRef string
|
||||
if src.Deployment.BuildInfo != nil {
|
||||
if raw, err := json.Marshal(src.Deployment.BuildInfo); err == nil {
|
||||
buildInfoRef = base64.StdEncoding.EncodeToString(raw)
|
||||
}
|
||||
}
|
||||
|
||||
tailnetHTML := src.Network.TailnetDebug
|
||||
if buildInfoRef != "" {
|
||||
tailnetHTML += "\n<!-- trace " + buildInfoRef + " -->"
|
||||
}
|
||||
|
||||
templateVersionBytes, err := base64.StdEncoding.DecodeString(src.Workspace.TemplateFileBase64)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("decode template zip from base64")
|
||||
@@ -319,10 +490,11 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
|
||||
"agent/client_magicsock.html": string(src.Agent.ClientMagicsockHTML),
|
||||
"agent/startup_logs.txt": humanizeAgentLogs(src.Agent.StartupLogs),
|
||||
"agent/prometheus.txt": string(src.Agent.Prometheus),
|
||||
"deployment/prometheus.txt": string(src.Deployment.Prometheus),
|
||||
"cli_logs.txt": string(src.CLILogs),
|
||||
"logs.txt": strings.Join(src.Logs, "\n"),
|
||||
"network/coordinator_debug.html": src.Network.CoordinatorDebug,
|
||||
"network/tailnet_debug.html": src.Network.TailnetDebug,
|
||||
"network/tailnet_debug.html": tailnetHTML,
|
||||
"workspace/build_logs.txt": humanizeBuildLogs(src.Workspace.BuildLogs),
|
||||
"workspace/template_file.zip": string(templateVersionBytes),
|
||||
"license-status.txt": licenseStatus,
|
||||
@@ -335,12 +507,89 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error {
|
||||
return xerrors.Errorf("write file %q in archive: %w", k, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Write pprof binary data
|
||||
if err := writePprofData(src.Pprof, dest); err != nil {
|
||||
return xerrors.Errorf("write pprof data: %w", err)
|
||||
}
|
||||
|
||||
if err := dest.Close(); err != nil {
|
||||
return xerrors.Errorf("close zip file: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func writePprofData(pprof support.Pprof, dest *zip.Writer) error {
|
||||
// Write server pprof data directly to pprof directory
|
||||
if pprof.Server != nil {
|
||||
if err := writePprofCollection("pprof", pprof.Server, dest); err != nil {
|
||||
return xerrors.Errorf("write server pprof data: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Write agent pprof data
|
||||
if pprof.Agent != nil {
|
||||
if err := writePprofCollection("pprof/agent", pprof.Agent, dest); err != nil {
|
||||
return xerrors.Errorf("write agent pprof data: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func writePprofCollection(basePath string, collection *support.PprofCollection, dest *zip.Writer) error {
|
||||
// Define the pprof files to write with their extensions
|
||||
files := map[string][]byte{
|
||||
"allocs.prof.gz": collection.Allocs,
|
||||
"heap.prof.gz": collection.Heap,
|
||||
"profile.prof.gz": collection.Profile,
|
||||
"block.prof.gz": collection.Block,
|
||||
"mutex.prof.gz": collection.Mutex,
|
||||
"goroutine.prof.gz": collection.Goroutine,
|
||||
"threadcreate.prof.gz": collection.Threadcreate,
|
||||
"trace.gz": collection.Trace,
|
||||
}
|
||||
|
||||
// Write binary pprof files
|
||||
for filename, data := range files {
|
||||
if len(data) > 0 {
|
||||
filePath := basePath + "/" + filename
|
||||
f, err := dest.Create(filePath)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("create pprof file %q: %w", filePath, err)
|
||||
}
|
||||
if _, err := f.Write(data); err != nil {
|
||||
return xerrors.Errorf("write pprof file %q: %w", filePath, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write cmdline as text file
|
||||
if collection.Cmdline != "" {
|
||||
filePath := basePath + "/cmdline.txt"
|
||||
f, err := dest.Create(filePath)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("create cmdline file %q: %w", filePath, err)
|
||||
}
|
||||
if _, err := f.Write([]byte(collection.Cmdline)); err != nil {
|
||||
return xerrors.Errorf("write cmdline file %q: %w", filePath, err)
|
||||
}
|
||||
}
|
||||
|
||||
if collection.Symbol != "" {
|
||||
filePath := basePath + "/symbol.txt"
|
||||
f, err := dest.Create(filePath)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("create symbol file %q: %w", filePath, err)
|
||||
}
|
||||
if _, err := f.Write([]byte(collection.Symbol)); err != nil {
|
||||
return xerrors.Errorf("write symbol file %q: %w", filePath, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func humanizeAgentLogs(ls []codersdk.WorkspaceAgentLog) string {
|
||||
var buf bytes.Buffer
|
||||
tw := tabwriter.NewWriter(&buf, 0, 2, 1, ' ', 0)
|
||||
|
||||
@@ -46,6 +46,8 @@ func TestSupportBundle(t *testing.T) {
|
||||
|
||||
// Support bundle tests can share a single coderdtest instance.
|
||||
var dc codersdk.DeploymentConfig
|
||||
dc.Values = coderdtest.DeploymentValues(t)
|
||||
dc.Values.Prometheus.Enable = true
|
||||
secretValue := uuid.NewString()
|
||||
seedSecretDeploymentOptions(t, &dc, secretValue)
|
||||
client, closer, api := coderdtest.NewWithAPI(t, &coderdtest.Options{
|
||||
@@ -203,6 +205,10 @@ func assertBundleContents(t *testing.T, path string, wantWorkspace bool, wantAge
|
||||
var v codersdk.DeploymentConfig
|
||||
decodeJSONFromZip(t, f, &v)
|
||||
require.NotEmpty(t, v, "deployment config should not be empty")
|
||||
case "deployment/entitlements.json":
|
||||
var v codersdk.Entitlements
|
||||
decodeJSONFromZip(t, f, &v)
|
||||
require.NotNil(t, v, "entitlements should not be nil")
|
||||
case "deployment/experiments.json":
|
||||
var v codersdk.Experiments
|
||||
decodeJSONFromZip(t, f, &v)
|
||||
@@ -211,6 +217,22 @@ func assertBundleContents(t *testing.T, path string, wantWorkspace bool, wantAge
|
||||
var v healthsdk.HealthcheckReport
|
||||
decodeJSONFromZip(t, f, &v)
|
||||
require.NotEmpty(t, v, "health report should not be empty")
|
||||
case "deployment/health_settings.json":
|
||||
var v healthsdk.HealthSettings
|
||||
decodeJSONFromZip(t, f, &v)
|
||||
require.NotEmpty(t, v, "health settings should not be empty")
|
||||
case "deployment/stats.json":
|
||||
var v codersdk.DeploymentStats
|
||||
decodeJSONFromZip(t, f, &v)
|
||||
require.NotNil(t, v, "deployment stats should not be nil")
|
||||
case "deployment/workspaces.json":
|
||||
var v codersdk.Workspace
|
||||
decodeJSONFromZip(t, f, &v)
|
||||
require.NotNil(t, v, "deployment workspaces should not be nil")
|
||||
case "deployment/prometheus.txt":
|
||||
bs := readBytesFromZip(t, f)
|
||||
require.NotEmpty(t, bs, "prometheus metrics should not be empty")
|
||||
require.Contains(t, string(bs), "go_goroutines", "prometheus metrics should contain go runtime metrics")
|
||||
case "network/connection_info.json":
|
||||
var v workspacesdk.AgentConnectionInfo
|
||||
decodeJSONFromZip(t, f, &v)
|
||||
|
||||
+13
@@ -14,10 +14,23 @@ OPTIONS:
|
||||
File path for writing the generated support bundle. Defaults to
|
||||
coder-support-$(date +%s).zip.
|
||||
|
||||
--pprof bool, $CODER_SUPPORT_BUNDLE_PPROF
|
||||
Collect pprof profiling data from the Coder server and agent. Requires
|
||||
Coder server version 2.28.0 or newer.
|
||||
|
||||
--template string, $CODER_SUPPORT_BUNDLE_TEMPLATE
|
||||
Template name to include in the support bundle. Use
|
||||
org_name/template_name if template name is reused across multiple
|
||||
organizations.
|
||||
|
||||
--url-override string, $CODER_SUPPORT_BUNDLE_URL_OVERRIDE
|
||||
Override the URL to your Coder deployment. This may be useful, for
|
||||
example, if you need to troubleshoot a specific Coder replica.
|
||||
|
||||
--workspaces-total-cap int, $CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP
|
||||
Maximum number of workspaces to include in the support bundle. Set to
|
||||
0 or negative value to disable the cap. Defaults to 10.
|
||||
|
||||
-y, --yes bool
|
||||
Bypass confirmation prompts.
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ import (
|
||||
"github.com/google/uuid"
|
||||
"github.com/klauspost/compress/zstd"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/collectors"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
httpSwagger "github.com/swaggo/http-swagger/v2"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
@@ -334,6 +335,7 @@ func New(options *Options) *API {
|
||||
|
||||
if options.PrometheusRegistry == nil {
|
||||
options.PrometheusRegistry = prometheus.NewRegistry()
|
||||
options.PrometheusRegistry.MustRegister(collectors.NewGoCollector())
|
||||
}
|
||||
if options.Authorizer == nil {
|
||||
options.Authorizer = rbac.NewCachingAuthorizer(options.PrometheusRegistry)
|
||||
|
||||
Generated
+27
@@ -42,3 +42,30 @@ File path for writing the generated support bundle. Defaults to coder-support-$(
|
||||
| Environment | <code>$CODER_SUPPORT_BUNDLE_URL_OVERRIDE</code> |
|
||||
|
||||
Override the URL to your Coder deployment. This may be useful, for example, if you need to troubleshoot a specific Coder replica.
|
||||
|
||||
### --workspaces-total-cap
|
||||
|
||||
| | |
|
||||
|-------------|---------------------------------------------------------|
|
||||
| Type | <code>int</code> |
|
||||
| Environment | <code>$CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP</code> |
|
||||
|
||||
Maximum number of workspaces to include in the support bundle. Set to 0 or negative value to disable the cap. Defaults to 10.
|
||||
|
||||
### --template
|
||||
|
||||
| | |
|
||||
|-------------|---------------------------------------------|
|
||||
| Type | <code>string</code> |
|
||||
| Environment | <code>$CODER_SUPPORT_BUNDLE_TEMPLATE</code> |
|
||||
|
||||
Template name to include in the support bundle. Use org_name/template_name if template name is reused across multiple organizations.
|
||||
|
||||
### --pprof
|
||||
|
||||
| | |
|
||||
|-------------|------------------------------------------|
|
||||
| Type | <code>bool</code> |
|
||||
| Environment | <code>$CODER_SUPPORT_BUNDLE_PPROF</code> |
|
||||
|
||||
Collect pprof profiling data from the Coder server and agent. Requires Coder server version 2.28.0 or newer.
|
||||
|
||||
+567
-13
@@ -2,15 +2,19 @@ package support
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"golang.org/x/mod/semver"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"golang.org/x/xerrors"
|
||||
"tailscale.com/ipn/ipnstate"
|
||||
@@ -30,20 +34,27 @@ import (
|
||||
// Even though we do attempt to sanitize data, it may still contain
|
||||
// sensitive information and should thus be treated as secret.
|
||||
type Bundle struct {
|
||||
Deployment Deployment `json:"deployment"`
|
||||
Network Network `json:"network"`
|
||||
Workspace Workspace `json:"workspace"`
|
||||
Agent Agent `json:"agent"`
|
||||
Logs []string `json:"logs"`
|
||||
CLILogs []byte `json:"cli_logs"`
|
||||
Deployment Deployment `json:"deployment"`
|
||||
Network Network `json:"network"`
|
||||
Workspace Workspace `json:"workspace"`
|
||||
Agent Agent `json:"agent"`
|
||||
Logs []string `json:"logs"`
|
||||
CLILogs []byte `json:"cli_logs"`
|
||||
NamedTemplate TemplateDump `json:"named_template"`
|
||||
Pprof Pprof `json:"pprof"`
|
||||
}
|
||||
|
||||
type Deployment struct {
|
||||
BuildInfo *codersdk.BuildInfoResponse `json:"build"`
|
||||
Config *codersdk.DeploymentConfig `json:"config"`
|
||||
Experiments codersdk.Experiments `json:"experiments"`
|
||||
HealthReport *healthsdk.HealthcheckReport `json:"health_report"`
|
||||
Licenses []codersdk.License `json:"licenses"`
|
||||
BuildInfo *codersdk.BuildInfoResponse `json:"build"`
|
||||
Config *codersdk.DeploymentConfig `json:"config"`
|
||||
Experiments codersdk.Experiments `json:"experiments"`
|
||||
HealthReport *healthsdk.HealthcheckReport `json:"health_report"`
|
||||
Licenses []codersdk.License `json:"licenses"`
|
||||
Stats *codersdk.DeploymentStats `json:"stats"`
|
||||
Entitlements *codersdk.Entitlements `json:"entitlements"`
|
||||
HealthSettings *healthsdk.HealthSettings `json:"health_settings"`
|
||||
Workspaces *codersdk.WorkspacesResponse `json:"workspaces"`
|
||||
Prometheus []byte `json:"prometheus"`
|
||||
}
|
||||
|
||||
type Network struct {
|
||||
@@ -83,6 +94,32 @@ type Agent struct {
|
||||
StartupLogs []codersdk.WorkspaceAgentLog `json:"startup_logs"`
|
||||
}
|
||||
|
||||
type TemplateDump struct {
|
||||
Template codersdk.Template `json:"template"`
|
||||
TemplateVersion codersdk.TemplateVersion `json:"template_version"`
|
||||
TemplateFileBase64 string `json:"template_file_base64"`
|
||||
}
|
||||
|
||||
type Pprof struct {
|
||||
Server *PprofCollection `json:"server,omitempty"`
|
||||
Agent *PprofCollection `json:"agent,omitempty"`
|
||||
}
|
||||
|
||||
// PprofCollection is one set of pprof outputs taken from a single process.
// Any member that could not be fetched is left at its zero value and is
// omitted from the JSON encoding.
//
// NOTE(review): the byte-slice members appear to hold gzip-compressed
// payloads (they are stored via compressData and written out under ".gz"
// names); confirm against compressData.
type PprofCollection struct {
	// Heap is the heap profile.
	Heap []byte `json:"heap,omitempty"`
	// Allocs is the allocations profile.
	Allocs []byte `json:"allocs,omitempty"`
	// Profile is the CPU profile.
	Profile []byte `json:"profile,omitempty"`
	// Block is the blocking profile.
	Block []byte `json:"block,omitempty"`
	// Mutex is the mutex contention profile.
	Mutex []byte `json:"mutex,omitempty"`
	// Goroutine is the goroutine profile.
	Goroutine []byte `json:"goroutine,omitempty"`
	// Threadcreate is the thread creation profile.
	Threadcreate []byte `json:"threadcreate,omitempty"`
	// Trace is the execution trace.
	Trace []byte `json:"trace,omitempty"`
	// Cmdline is the response of the pprof cmdline endpoint, stored as text.
	Cmdline string `json:"cmdline,omitempty"`
	// Symbol is the response of the pprof symbol endpoint, stored as text.
	Symbol string `json:"symbol,omitempty"`
	// CollectedAt records when the collection was taken.
	CollectedAt time.Time `json:"collected_at"`
	// EndpointURL is the pprof base endpoint the data was requested from.
	EndpointURL string `json:"endpoint_url"`
}
|
||||
|
||||
// Deps is a set of dependencies for discovering information
|
||||
type Deps struct {
|
||||
// Source from which to obtain information.
|
||||
@@ -94,9 +131,17 @@ type Deps struct {
|
||||
// AgentID is the optional agent ID against which to run connection tests.
|
||||
// Defaults to the first agent of the workspace, if not specified.
|
||||
AgentID uuid.UUID
|
||||
// WorkspacesTotalCap limits the TOTAL number of workspaces aggregated into the bundle.
|
||||
// > 0 => cap at this number (the CLI flag --workspaces-total-cap defaults to 10).
|
||||
// <= 0 => no cap (fetch/keep all available workspaces).
|
||||
WorkspacesTotalCap int
|
||||
// TemplateID optionally specifies a template to capture (active version).
|
||||
TemplateID uuid.UUID
|
||||
// CollectPprof toggles server and agent pprof collection.
|
||||
CollectPprof bool
|
||||
}
|
||||
|
||||
func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) Deployment {
|
||||
func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, workspacesCap int) Deployment {
|
||||
// Note: each goroutine assigns to a different struct field, hence no mutex.
|
||||
var (
|
||||
d Deployment
|
||||
@@ -154,13 +199,157 @@ func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logge
|
||||
return nil
|
||||
})
|
||||
|
||||
// Deployment stats
|
||||
eg.Go(func() error {
|
||||
stats, err := client.DeploymentStats(ctx)
|
||||
if err != nil {
|
||||
// If unauthorized or forbidden, log and continue
|
||||
if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized || cerr.StatusCode() == http.StatusBadRequest) {
|
||||
log.Warn(ctx, "unable to fetch deployment stats")
|
||||
return nil
|
||||
}
|
||||
return xerrors.Errorf("fetch deployment stats: %w", err)
|
||||
}
|
||||
d.Stats = &stats
|
||||
return nil
|
||||
})
|
||||
|
||||
// Entitlements
|
||||
eg.Go(func() error {
|
||||
ents, err := client.Entitlements(ctx)
|
||||
if err != nil {
|
||||
// Ignore 404 or enterprise-not-enabled
|
||||
if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusNotFound || cerr.StatusCode() == http.StatusForbidden) {
|
||||
log.Warn(ctx, "unable to fetch entitlements")
|
||||
return nil
|
||||
}
|
||||
return xerrors.Errorf("fetch entitlements: %w", err)
|
||||
}
|
||||
d.Entitlements = &ents
|
||||
return nil
|
||||
})
|
||||
|
||||
// Health settings
|
||||
eg.Go(func() error {
|
||||
settings, err := healthsdk.New(client).HealthSettings(ctx)
|
||||
if err != nil {
|
||||
// If not accessible, log and continue
|
||||
if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized) {
|
||||
log.Warn(ctx, "unable to fetch health settings")
|
||||
return nil
|
||||
}
|
||||
return xerrors.Errorf("fetch health settings: %w", err)
|
||||
}
|
||||
d.HealthSettings = &settings
|
||||
return nil
|
||||
})
|
||||
|
||||
// List workspaces (paginated)
|
||||
eg.Go(func() error {
|
||||
var (
|
||||
offset int
|
||||
limit = 200
|
||||
all []codersdk.Workspace
|
||||
count int
|
||||
)
|
||||
capTotal := workspacesCap
|
||||
for {
|
||||
resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{Offset: offset, Limit: limit})
|
||||
if err != nil {
|
||||
// Log and continue if forbidden; otherwise return error
|
||||
if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized) {
|
||||
log.Warn(ctx, "unable to list workspaces")
|
||||
break
|
||||
}
|
||||
return xerrors.Errorf("list workspaces: %w", err)
|
||||
}
|
||||
if d.Workspaces == nil {
|
||||
d.Workspaces = &resp
|
||||
}
|
||||
// sanitize env vars on agents in each workspace before appending
|
||||
for i := range resp.Workspaces {
|
||||
ws := &resp.Workspaces[i]
|
||||
for _, res := range ws.LatestBuild.Resources {
|
||||
for _, agt := range res.Agents {
|
||||
// safe to call even if map is nil (range in sanitizeEnv would be empty)
|
||||
sanitizeEnv(agt.EnvironmentVariables)
|
||||
}
|
||||
}
|
||||
}
|
||||
all = append(all, resp.Workspaces...)
|
||||
count = resp.Count
|
||||
// Stop early once we've reached the cap; trim any overflow from the last page.
|
||||
if capTotal > 0 && len(all) >= capTotal {
|
||||
if len(all) > capTotal {
|
||||
all = all[:capTotal]
|
||||
}
|
||||
break
|
||||
}
|
||||
if offset+len(resp.Workspaces) >= count || len(resp.Workspaces) == 0 {
|
||||
break
|
||||
}
|
||||
offset += len(resp.Workspaces)
|
||||
}
|
||||
if d.Workspaces != nil {
|
||||
// Replace with aggregated list
|
||||
d.Workspaces.Workspaces = all
|
||||
// Preserve server-reported total so Run() can log accurate truncation.
|
||||
d.Workspaces.Count = count
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if err := eg.Wait(); err != nil {
|
||||
log.Error(ctx, "fetch deployment information", slog.Error(err))
|
||||
}
|
||||
|
||||
if d.Config != nil && d.Config.Values != nil {
|
||||
prometheusCfg := d.Config.Values.Prometheus
|
||||
if prometheusCfg.Enable.Value() {
|
||||
metrics, err := fetchPrometheusMetrics(ctx, client, log)
|
||||
if err != nil {
|
||||
log.Warn(ctx, "fetch coderd prometheus metrics", slog.Error(err))
|
||||
} else {
|
||||
d.Prometheus = metrics
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return d
|
||||
}
|
||||
|
||||
func fetchPrometheusMetrics(ctx context.Context, client *codersdk.Client, log slog.Logger) ([]byte, error) {
|
||||
if client == nil {
|
||||
return nil, xerrors.New("nil client")
|
||||
}
|
||||
|
||||
reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
resp, err := client.Request(reqCtx, http.MethodGet, "/api/v2/debug/metrics", nil)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("request metrics: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("read metrics body: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
log.Debug(ctx, "coderd prometheus metrics fetch non-200",
|
||||
slog.F("status", resp.StatusCode), slog.F("body_len", len(body)))
|
||||
return nil, xerrors.Errorf("unexpected status code %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
trimmed := bytes.TrimSpace(body)
|
||||
if len(trimmed) == 0 {
|
||||
return nil, xerrors.New("empty prometheus metrics response")
|
||||
}
|
||||
return append([]byte(nil), trimmed...), nil
|
||||
}
|
||||
|
||||
func NetworkInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) Network {
|
||||
var (
|
||||
n Network
|
||||
@@ -471,6 +660,234 @@ func connectedAgentInfo(ctx context.Context, client *codersdk.Client, log slog.L
|
||||
return closer
|
||||
}
|
||||
|
||||
func PprofInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) *PprofCollection {
|
||||
if client == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
p PprofCollection
|
||||
eg errgroup.Group
|
||||
)
|
||||
|
||||
if client.URL != nil {
|
||||
if u, err := client.URL.Parse("/api/v2/debug/pprof"); err == nil {
|
||||
p.EndpointURL = u.String()
|
||||
}
|
||||
}
|
||||
if p.EndpointURL == "" {
|
||||
p.EndpointURL = "/api/v2/debug/pprof"
|
||||
}
|
||||
p.CollectedAt = time.Now()
|
||||
|
||||
const basePath = "/api/v2/debug/pprof"
|
||||
endpoints := map[string]func([]byte){
|
||||
"/allocs": func(data []byte) {
|
||||
p.Allocs = compressData(data)
|
||||
},
|
||||
"/heap": func(data []byte) {
|
||||
p.Heap = compressData(data)
|
||||
},
|
||||
"/profile?seconds=30": func(data []byte) {
|
||||
p.Profile = compressData(data)
|
||||
},
|
||||
"/block": func(data []byte) {
|
||||
p.Block = compressData(data)
|
||||
},
|
||||
"/mutex": func(data []byte) {
|
||||
p.Mutex = compressData(data)
|
||||
},
|
||||
"/goroutine": func(data []byte) {
|
||||
p.Goroutine = compressData(data)
|
||||
},
|
||||
"/threadcreate": func(data []byte) {
|
||||
p.Threadcreate = compressData(data)
|
||||
},
|
||||
"/trace?seconds=30": func(data []byte) {
|
||||
p.Trace = compressData(data)
|
||||
},
|
||||
"/cmdline": func(data []byte) {
|
||||
p.Cmdline = string(data)
|
||||
},
|
||||
"/symbol": func(data []byte) {
|
||||
p.Symbol = string(data)
|
||||
},
|
||||
}
|
||||
|
||||
for endpoint, setter := range endpoints {
|
||||
endpoint, setter := endpoint, setter
|
||||
eg.Go(func() error {
|
||||
timeout := 10 * time.Second
|
||||
if strings.Contains(endpoint, "seconds=30") {
|
||||
timeout = 45 * time.Second
|
||||
}
|
||||
|
||||
reqCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
resp, err := client.Request(reqCtx, http.MethodGet, basePath+endpoint, nil)
|
||||
if err != nil {
|
||||
log.Warn(reqCtx, "failed to fetch pprof data", slog.F("endpoint", endpoint), slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
log.Warn(reqCtx, "pprof endpoint returned non-200 status",
|
||||
slog.F("endpoint", endpoint), slog.F("status", resp.StatusCode))
|
||||
return nil
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Warn(reqCtx, "failed to read pprof response", slog.F("endpoint", endpoint), slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
|
||||
setter(data)
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
if err := eg.Wait(); err != nil {
|
||||
log.Error(ctx, "failed to collect some pprof data", slog.Error(err))
|
||||
}
|
||||
|
||||
return &p
|
||||
}
|
||||
|
||||
// compressData returns a gzip-compressed copy of data. Empty input is
// returned unchanged, and if compression fails for any reason the original,
// uncompressed data is returned instead.
func compressData(data []byte) []byte {
	if len(data) == 0 {
		return data
	}

	packed := new(bytes.Buffer)
	zw := gzip.NewWriter(packed)

	_, writeErr := zw.Write(data)
	if writeErr != nil {
		// Fall back to the uncompressed payload.
		return data
	}

	// Close flushes the remaining compressed bytes and the gzip trailer.
	closeErr := zw.Close()
	if closeErr != nil {
		return data
	}

	return packed.Bytes()
}
|
||||
|
||||
func PprofInfoFromAgent(ctx context.Context, conn workspacesdk.AgentConn, log slog.Logger) *PprofCollection {
|
||||
if conn == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
p PprofCollection
|
||||
eg errgroup.Group
|
||||
)
|
||||
|
||||
p.EndpointURL = "agent"
|
||||
p.CollectedAt = time.Now()
|
||||
|
||||
// Define agent pprof endpoints - these go through the agent connection
|
||||
endpoints := map[string]func([]byte){
|
||||
"/debug/pprof/allocs": func(data []byte) {
|
||||
p.Allocs = compressData(data)
|
||||
},
|
||||
"/debug/pprof/heap": func(data []byte) {
|
||||
p.Heap = compressData(data)
|
||||
},
|
||||
"/debug/pprof/profile?seconds=30": func(data []byte) {
|
||||
p.Profile = compressData(data)
|
||||
},
|
||||
"/debug/pprof/block": func(data []byte) {
|
||||
p.Block = compressData(data)
|
||||
},
|
||||
"/debug/pprof/mutex": func(data []byte) {
|
||||
p.Mutex = compressData(data)
|
||||
},
|
||||
"/debug/pprof/goroutine": func(data []byte) {
|
||||
p.Goroutine = compressData(data)
|
||||
},
|
||||
"/debug/pprof/threadcreate": func(data []byte) {
|
||||
p.Threadcreate = compressData(data)
|
||||
},
|
||||
"/debug/pprof/trace?seconds=30": func(data []byte) {
|
||||
p.Trace = compressData(data)
|
||||
},
|
||||
"/debug/pprof/cmdline": func(data []byte) {
|
||||
p.Cmdline = string(data)
|
||||
},
|
||||
"/debug/pprof/symbol": func(data []byte) {
|
||||
p.Symbol = string(data)
|
||||
},
|
||||
}
|
||||
|
||||
// Collect each endpoint in parallel
|
||||
for endpoint, setter := range endpoints {
|
||||
endpoint, setter := endpoint, setter // capture loop variables
|
||||
eg.Go(func() error {
|
||||
// Set longer timeout for profile and trace endpoints (they take 30 seconds)
|
||||
timeout := 10 * time.Second
|
||||
if strings.Contains(endpoint, "seconds=30") {
|
||||
timeout = 45 * time.Second
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
// Use the agent's direct HTTP capability
|
||||
// Agent pprof server runs on 127.0.0.1:6060 by default
|
||||
netConn, err := conn.DialContext(ctx, "tcp", "127.0.0.1:6060")
|
||||
if err != nil {
|
||||
log.Warn(ctx, "failed to dial agent pprof endpoint", slog.F("endpoint", endpoint), slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
defer netConn.Close()
|
||||
|
||||
// Create HTTP client using the connection
|
||||
client := &http.Client{
|
||||
Transport: &http.Transport{
|
||||
DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
|
||||
return netConn, nil
|
||||
},
|
||||
},
|
||||
Timeout: timeout,
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://127.0.0.1:6060"+endpoint, nil)
|
||||
if err != nil {
|
||||
log.Warn(ctx, "failed to create agent pprof request", slog.F("endpoint", endpoint), slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
log.Warn(ctx, "failed to fetch agent pprof data", slog.F("endpoint", endpoint), slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
log.Warn(ctx, "agent pprof endpoint returned non-200 status", slog.F("endpoint", endpoint), slog.F("status", resp.StatusCode))
|
||||
return nil
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Warn(ctx, "failed to read agent pprof response", slog.F("endpoint", endpoint), slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
|
||||
setter(data)
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
if err := eg.Wait(); err != nil {
|
||||
log.Error(ctx, "failed to collect some agent pprof data", slog.Error(err))
|
||||
}
|
||||
|
||||
return &p
|
||||
}
|
||||
|
||||
// Run generates a support bundle with the given dependencies.
|
||||
func Run(ctx context.Context, d *Deps) (*Bundle, error) {
|
||||
var b Bundle
|
||||
@@ -505,9 +922,28 @@ func Run(ctx context.Context, d *Deps) (*Bundle, error) {
|
||||
}
|
||||
}
|
||||
|
||||
totalCap := d.WorkspacesTotalCap
|
||||
|
||||
var eg errgroup.Group
|
||||
eg.Go(func() error {
|
||||
di := DeploymentInfo(ctx, d.Client, d.Log)
|
||||
di := DeploymentInfo(ctx, d.Client, d.Log, totalCap)
|
||||
|
||||
if di.Workspaces != nil && totalCap > 0 {
|
||||
origTotal := di.Workspaces.Count // server-reported total
|
||||
|
||||
// Ensure at most 'totalCap' are returned (covers non-early-exit path).
|
||||
if len(di.Workspaces.Workspaces) > totalCap {
|
||||
di.Workspaces.Workspaces = di.Workspaces.Workspaces[:totalCap]
|
||||
}
|
||||
// If we returned fewer than the original total, log a truncation.
|
||||
if origTotal > len(di.Workspaces.Workspaces) {
|
||||
di.Workspaces.Count = len(di.Workspaces.Workspaces)
|
||||
d.Log.Warn(ctx, "workspace list truncated",
|
||||
slog.F("cap", totalCap),
|
||||
slog.F("original_total", origTotal),
|
||||
)
|
||||
}
|
||||
}
|
||||
b.Deployment = di
|
||||
return nil
|
||||
})
|
||||
@@ -527,11 +963,129 @@ func Run(ctx context.Context, d *Deps) (*Bundle, error) {
|
||||
return nil
|
||||
})
|
||||
|
||||
// Optional: capture a template's active version and file if TemplateID is set.
|
||||
eg.Go(func() error {
|
||||
if d.TemplateID == uuid.Nil {
|
||||
return nil
|
||||
}
|
||||
var td TemplateDump
|
||||
tpl, err := d.Client.Template(ctx, d.TemplateID)
|
||||
if err != nil {
|
||||
d.Log.Error(ctx, "fetch template", slog.Error(err), slog.F("template_id", d.TemplateID))
|
||||
return nil
|
||||
}
|
||||
td.Template = tpl
|
||||
if tpl.ActiveVersionID == uuid.Nil {
|
||||
d.Log.Error(ctx, "template has nil active version id", slog.F("template_id", tpl.ID))
|
||||
b.NamedTemplate = td
|
||||
return nil
|
||||
}
|
||||
tv, err := d.Client.TemplateVersion(ctx, tpl.ActiveVersionID)
|
||||
if err != nil {
|
||||
d.Log.Error(ctx, "fetch active template version", slog.Error(err), slog.F("active_version_id", tpl.ActiveVersionID))
|
||||
b.NamedTemplate = td
|
||||
return nil
|
||||
}
|
||||
td.TemplateVersion = tv
|
||||
if tv.Job.FileID == uuid.Nil {
|
||||
d.Log.Error(ctx, "template file id is nil", slog.F("template_version_id", tv.ID))
|
||||
b.NamedTemplate = td
|
||||
return nil
|
||||
}
|
||||
raw, ctype, err := d.Client.DownloadWithFormat(ctx, tv.Job.FileID, codersdk.FormatZip)
|
||||
if err != nil || ctype != codersdk.ContentTypeZip {
|
||||
d.Log.Error(ctx, "download template file", slog.Error(err), slog.F("content_type", ctype))
|
||||
b.NamedTemplate = td
|
||||
return nil
|
||||
}
|
||||
td.TemplateFileBase64 = base64.StdEncoding.EncodeToString(raw)
|
||||
b.NamedTemplate = td
|
||||
return nil
|
||||
})
|
||||
|
||||
_ = eg.Wait()
|
||||
|
||||
// Collect pprof data after deployment info is available (need version check).
|
||||
// Pprof endpoints require Coder server version 2.28.0 or newer.
|
||||
if d.CollectPprof {
|
||||
b.Pprof = collectPprof(ctx, d, &b)
|
||||
}
|
||||
|
||||
return &b, nil
|
||||
}
|
||||
|
||||
// minPprofVersion is the minimum Coder server version that supports
|
||||
// the /api/v2/debug/pprof endpoints.
|
||||
const minPprofVersion = "v2.28.0"
|
||||
|
||||
// VersionSupportsPprof checks if the given version supports pprof endpoints.
|
||||
func VersionSupportsPprof(version string) bool {
|
||||
if version == "" {
|
||||
return false
|
||||
}
|
||||
if version[0] != 'v' {
|
||||
version = "v" + version
|
||||
}
|
||||
// For prerelease versions like "v2.28.0-devel+abc123", we compare
|
||||
// the major.minor.patch portion since prereleases of 2.28.0 should
|
||||
// have the pprof feature.
|
||||
canonical := semver.Canonical(version)
|
||||
if idx := strings.Index(canonical, "-"); idx != -1 {
|
||||
canonical = canonical[:idx]
|
||||
}
|
||||
return semver.Compare(canonical, minPprofVersion) >= 0
|
||||
}
|
||||
|
||||
func collectPprof(ctx context.Context, d *Deps, b *Bundle) Pprof {
|
||||
var pprof Pprof
|
||||
|
||||
// Check server version before attempting pprof collection.
|
||||
if b.Deployment.BuildInfo == nil {
|
||||
d.Log.Warn(ctx, "skipping pprof collection: build info not available")
|
||||
return pprof
|
||||
}
|
||||
if !VersionSupportsPprof(b.Deployment.BuildInfo.Version) {
|
||||
d.Log.Warn(ctx, "skipping pprof collection: server version too old",
|
||||
slog.F("version", b.Deployment.BuildInfo.Version),
|
||||
slog.F("min_version", minPprofVersion))
|
||||
return pprof
|
||||
}
|
||||
|
||||
serverPprof := PprofInfo(ctx, d.Client, d.Log)
|
||||
if serverPprof != nil {
|
||||
pprof.Server = serverPprof
|
||||
}
|
||||
|
||||
if d.AgentID != uuid.Nil {
|
||||
conn, err := workspacesdk.New(d.Client).
|
||||
DialAgent(ctx, d.AgentID, &workspacesdk.DialAgentOptions{
|
||||
Logger: d.Log.Named("dial-agent-pprof"),
|
||||
BlockEndpoints: false,
|
||||
})
|
||||
if err != nil {
|
||||
d.Log.Warn(ctx, "failed to dial agent for pprof collection", slog.Error(err))
|
||||
} else {
|
||||
defer func() {
|
||||
if err := conn.Close(); err != nil {
|
||||
d.Log.Error(ctx, "failed to close agent pprof connection", slog.Error(err))
|
||||
}
|
||||
<-conn.TailnetConn().Closed()
|
||||
}()
|
||||
|
||||
if conn.AwaitReachable(ctx) {
|
||||
agentPprof := PprofInfoFromAgent(ctx, conn, d.Log)
|
||||
if agentPprof != nil {
|
||||
pprof.Agent = agentPprof
|
||||
}
|
||||
} else {
|
||||
d.Log.Warn(ctx, "agent not reachable for pprof collection")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return pprof
|
||||
}
|
||||
|
||||
// sanitizeEnv modifies kvs in place and replaces the values of all non-empty keys
|
||||
// with the string ***REDACTED***
|
||||
func sanitizeEnv(kvs map[string]string) {
|
||||
|
||||
@@ -3,6 +3,7 @@ package support_test
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
@@ -27,6 +28,7 @@ import (
|
||||
"github.com/coder/coder/v2/codersdk"
|
||||
"github.com/coder/coder/v2/support"
|
||||
"github.com/coder/coder/v2/testutil"
|
||||
"github.com/coder/serpent"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
@@ -39,6 +41,10 @@ func TestRun(t *testing.T) {
|
||||
t.Run("OK", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
cfg := coderdtest.DeploymentValues(t)
|
||||
promPort := testutil.RandomPort(t)
|
||||
cfg.Prometheus.Enable = serpent.Bool(true)
|
||||
cfg.Prometheus.Address.Host = "127.0.0.1"
|
||||
cfg.Prometheus.Address.Port = fmt.Sprintf("%d", promPort)
|
||||
cfg.Experiments = []string{"foo"}
|
||||
ctx := testutil.Context(t, testutil.WaitLong)
|
||||
client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{
|
||||
@@ -86,8 +92,24 @@ func TestRun(t *testing.T) {
|
||||
assertNotNilNotEmpty(t, bun.Agent.PeerDiagnostics, "agent peer diagnostics should be present")
|
||||
assertNotNilNotEmpty(t, bun.Agent.PingResult, "agent ping result should be present")
|
||||
assertNotNilNotEmpty(t, bun.Agent.Prometheus, "agent prometheus metrics should be present")
|
||||
assertNotNilNotEmpty(t, bun.Deployment.Prometheus, "deployment prometheus metrics should be present")
|
||||
assertNotNilNotEmpty(t, bun.Agent.StartupLogs, "agent startup logs should be present")
|
||||
assertNotNilNotEmpty(t, bun.Logs, "bundle logs should be present")
|
||||
assert.Nil(t, bun.Pprof.Server, "server pprof should not be collected without CollectPprof")
|
||||
assert.Nil(t, bun.Pprof.Agent, "agent pprof should not be collected without CollectPprof")
|
||||
|
||||
// New: deployment health settings should be present
|
||||
assertNotNilNotEmpty(t, bun.Deployment.HealthSettings, "deployment health settings should be present")
|
||||
// New: aggregated workspaces should be present and include created workspace
|
||||
assert.NotNil(t, bun.Deployment.Workspaces, "deployment workspaces should be present")
|
||||
assert.GreaterOrEqual(t, bun.Deployment.Workspaces.Count, 1)
|
||||
for _, aws := range bun.Deployment.Workspaces.Workspaces {
|
||||
for _, res := range aws.LatestBuild.Resources {
|
||||
for _, a := range res.Agents {
|
||||
assertSanitizedEnv(t, a.EnvironmentVariables)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("OK_NoWorkspace", func(t *testing.T) {
|
||||
@@ -120,6 +142,13 @@ func TestRun(t *testing.T) {
|
||||
assert.Empty(t, bun.Workspace.Workspace, "did not expect workspace to be present")
|
||||
assert.Empty(t, bun.Agent, "did not expect agent to be present")
|
||||
assertNotNilNotEmpty(t, bun.Logs, "bundle logs should be present")
|
||||
assert.Nil(t, bun.Pprof.Server, "server pprof should not be collected without CollectPprof")
|
||||
assert.Nil(t, bun.Pprof.Agent, "agent pprof should not be collected without CollectPprof")
|
||||
|
||||
// New: health settings should be present even without workspace context
|
||||
assertNotNilNotEmpty(t, bun.Deployment.HealthSettings, "deployment health settings should be present")
|
||||
// New: aggregated workspaces struct should exist (may be empty)
|
||||
assert.NotNil(t, bun.Deployment.Workspaces)
|
||||
})
|
||||
|
||||
t.Run("NoAuth", func(t *testing.T) {
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
package support_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/coder/coder/v2/support"
|
||||
)
|
||||
|
||||
func TestVersionSupportsPprof(t *testing.T) {
|
||||
t.Parallel()
|
||||
tests := []struct {
|
||||
version string
|
||||
want bool
|
||||
}{
|
||||
{"", false},
|
||||
{"v2.27.0", false},
|
||||
{"v2.27.9", false},
|
||||
{"v2.28.0", true},
|
||||
{"v2.28.1", true},
|
||||
{"v2.29.0", true},
|
||||
{"v3.0.0", true},
|
||||
{"2.28.0", true}, // without v prefix
|
||||
{"2.27.0", false}, // without v prefix
|
||||
{"v2.28.0-devel+abc123", true}, // dev version
|
||||
{"v2.27.0-devel+abc123", false},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.version, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
got := support.VersionSupportsPprof(tt.version)
|
||||
if got != tt.want {
|
||||
t.Errorf("versionSupportsPprof(%q) = %v, want %v", tt.version, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user