diff --git a/cli/support.go b/cli/support.go index f181f76131..83a9945084 100644 --- a/cli/support.go +++ b/cli/support.go @@ -7,6 +7,7 @@ import ( "encoding/base64" "encoding/json" "fmt" + "net/http" "net/url" "os" "path/filepath" @@ -44,13 +45,18 @@ var supportBundleBlurb = cliui.Bold("This will collect the following information ` - Coder deployment version - Coder deployment Configuration (sanitized), including enabled experiments - Coder deployment health snapshot + - Coder deployment stats (aggregated workspace/session metrics) + - Entitlements (if available) + - Health settings (dismissed healthchecks) - Coder deployment Network troubleshooting information + - Workspace list accessible to the user (sanitized) - Workspace configuration, parameters, and build logs - Template version and source code for the given workspace - Agent details (with environment variable sanitized) - Agent network diagnostics - Agent logs - License status + - pprof profiling data (if --pprof is enabled) ` + cliui.Bold("Note: ") + cliui.Wrap("While we try to sanitize sensitive data from support bundles, we cannot guarantee that they do not contain information that you or your organization may consider sensitive.\n") + cliui.Bold("Please confirm that you will:\n") + @@ -61,6 +67,9 @@ var supportBundleBlurb = cliui.Bold("This will collect the following information func (r *RootCmd) supportBundle() *serpent.Command { var outputPath string var coderURLOverride string + var workspacesTotalCap64 int64 = 10 + var templateName string + var pprof bool cmd := &serpent.Command{ Use: "bundle []", Short: "Generate a support bundle to troubleshoot issues connecting to a workspace.", @@ -121,8 +130,9 @@ func (r *RootCmd) supportBundle() *serpent.Command { } var ( - wsID uuid.UUID - agtID uuid.UUID + wsID uuid.UUID + agtID uuid.UUID + templateID uuid.UUID ) if len(inv.Args) == 0 { @@ -155,6 +165,16 @@ func (r *RootCmd) supportBundle() *serpent.Command { } } + // Resolve template by name if provided 
(captures active version) + // Fallback: if canonical name lookup fails, match DisplayName (case-insensitive). + if templateName != "" { + id, err := resolveTemplateID(inv.Context(), client, templateName) + if err != nil { + return err + } + templateID = id + } + if outputPath == "" { cwd, err := filepath.Abs(".") if err != nil { @@ -176,12 +196,25 @@ func (r *RootCmd) supportBundle() *serpent.Command { if r.verbose { clientLog.AppendSinks(sloghuman.Sink(inv.Stderr)) } + if pprof { + _, _ = fmt.Fprintln(inv.Stderr, "pprof data collection will take approximately 30 seconds...") + } + + // Bypass rate limiting for support bundle collection since it makes many API calls. + client.HTTPClient.Transport = &codersdk.HeaderTransport{ + Transport: client.HTTPClient.Transport, + Header: http.Header{codersdk.BypassRatelimitHeader: {"true"}}, + } + deps := support.Deps{ Client: client, // Support adds a sink so we don't need to supply one ourselves. - Log: clientLog, - WorkspaceID: wsID, - AgentID: agtID, + Log: clientLog, + WorkspaceID: wsID, + AgentID: agtID, + WorkspacesTotalCap: int(workspacesTotalCap64), + TemplateID: templateID, + CollectPprof: pprof, } bun, err := support.Run(inv.Context(), &deps) @@ -217,11 +250,102 @@ func (r *RootCmd) supportBundle() *serpent.Command { Description: "Override the URL to your Coder deployment. This may be useful, for example, if you need to troubleshoot a specific Coder replica.", Value: serpent.StringOf(&coderURLOverride), }, + { + Flag: "workspaces-total-cap", + Env: "CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP", + Description: "Maximum number of workspaces to include in the support bundle. Set to 0 or negative value to disable the cap. Defaults to 10.", + Value: serpent.Int64Of(&workspacesTotalCap64), + }, + { + Flag: "template", + Env: "CODER_SUPPORT_BUNDLE_TEMPLATE", + Description: "Template name to include in the support bundle. 
Use org_name/template_name if template name is reused across multiple organizations.", + Value: serpent.StringOf(&templateName), + }, + { + Flag: "pprof", + Env: "CODER_SUPPORT_BUNDLE_PPROF", + Description: "Collect pprof profiling data from the Coder server and agent. Requires Coder server version 2.28.0 or newer.", + Value: serpent.BoolOf(&pprof), + }, } return cmd } +// Resolve a template to its ID, supporting: +// - org/name form +// - slug or display name match (case-insensitive) across all memberships +func resolveTemplateID(ctx context.Context, client *codersdk.Client, templateArg string) (uuid.UUID, error) { + orgPart := "" + namePart := templateArg + if slash := strings.IndexByte(templateArg, '/'); slash > 0 && slash < len(templateArg)-1 { + orgPart = templateArg[:slash] + namePart = templateArg[slash+1:] + } + + resolveInOrg := func(orgID uuid.UUID) (codersdk.Template, bool, error) { + if t, err := client.TemplateByName(ctx, orgID, namePart); err == nil { + return t, true, nil + } + tpls, err := client.TemplatesByOrganization(ctx, orgID) + if err != nil { + return codersdk.Template{}, false, nil + } + for _, t := range tpls { + if strings.EqualFold(t.Name, namePart) || strings.EqualFold(t.DisplayName, namePart) { + return t, true, nil + } + } + return codersdk.Template{}, false, nil + } + + if orgPart != "" { + org, err := client.OrganizationByName(ctx, orgPart) + if err != nil { + return uuid.Nil, xerrors.Errorf("get organization %q: %w", orgPart, err) + } + t, found, err := resolveInOrg(org.ID) + if err != nil { + return uuid.Nil, err + } + if !found { + return uuid.Nil, xerrors.Errorf("template %q not found in organization %q", namePart, orgPart) + } + return t.ID, nil + } + + orgs, err := client.OrganizationsByUser(ctx, codersdk.Me) + if err != nil { + return uuid.Nil, xerrors.Errorf("get organizations: %w", err) + } + var ( + foundTpl codersdk.Template + foundOrgs []string + ) + for _, org := range orgs { + if t, found, err := resolveInOrg(org.ID); 
err == nil && found { + if len(foundOrgs) == 0 { + foundTpl = t + } + foundOrgs = append(foundOrgs, org.Name) + } + } + switch len(foundOrgs) { + case 0: + return uuid.Nil, xerrors.Errorf("template %q not found in your organizations", namePart) + case 1: + return foundTpl.ID, nil + default: + return uuid.Nil, xerrors.Errorf( + "template %q found in multiple organizations (%s); use --template \"\" to target desired template.", + namePart, + strings.Join(foundOrgs, ", "), + namePart, + ) + } +} + // summarizeBundle makes a best-effort attempt to write a short summary // of the support bundle to the user's terminal. func summarizeBundle(inv *serpent.Invocation, bun *support.Bundle) { @@ -283,6 +407,10 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error { "deployment/config.json": src.Deployment.Config, "deployment/experiments.json": src.Deployment.Experiments, "deployment/health.json": src.Deployment.HealthReport, + "deployment/stats.json": src.Deployment.Stats, + "deployment/entitlements.json": src.Deployment.Entitlements, + "deployment/health_settings.json": src.Deployment.HealthSettings, + "deployment/workspaces.json": src.Deployment.Workspaces, "network/connection_info.json": src.Network.ConnectionInfo, "network/netcheck.json": src.Network.Netcheck, "network/interfaces.json": src.Network.Interfaces, @@ -302,6 +430,49 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error { } } + // Include named template artifacts (if requested) + if src.NamedTemplate.Template.ID != uuid.Nil { + name := src.NamedTemplate.Template.Name + // JSON files + for k, v := range map[string]any{ + "templates/" + name + "/template.json": src.NamedTemplate.Template, + "templates/" + name + "/template_version.json": src.NamedTemplate.TemplateVersion, + } { + f, err := dest.Create(k) + if err != nil { + return xerrors.Errorf("create file %q in archive: %w", k, err) + } + enc := json.NewEncoder(f) + enc.SetIndent("", " ") + if err := enc.Encode(v); err != nil { + return 
xerrors.Errorf("write json to %q: %w", k, err) + } + } + // Binary template file (zip) + if namedZipBytes, err := base64.StdEncoding.DecodeString(src.NamedTemplate.TemplateFileBase64); err == nil { + k := "templates/" + name + "/template_file.zip" + f, err := dest.Create(k) + if err != nil { + return xerrors.Errorf("create file %q in archive: %w", k, err) + } + if _, err := f.Write(namedZipBytes); err != nil { + return xerrors.Errorf("write file %q in archive: %w", k, err) + } + } + } + + var buildInfoRef string + if src.Deployment.BuildInfo != nil { + if raw, err := json.Marshal(src.Deployment.BuildInfo); err == nil { + buildInfoRef = base64.StdEncoding.EncodeToString(raw) + } + } + + tailnetHTML := src.Network.TailnetDebug + if buildInfoRef != "" { + tailnetHTML += "\n" + } + templateVersionBytes, err := base64.StdEncoding.DecodeString(src.Workspace.TemplateFileBase64) if err != nil { return xerrors.Errorf("decode template zip from base64") @@ -319,10 +490,11 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error { "agent/client_magicsock.html": string(src.Agent.ClientMagicsockHTML), "agent/startup_logs.txt": humanizeAgentLogs(src.Agent.StartupLogs), "agent/prometheus.txt": string(src.Agent.Prometheus), + "deployment/prometheus.txt": string(src.Deployment.Prometheus), "cli_logs.txt": string(src.CLILogs), "logs.txt": strings.Join(src.Logs, "\n"), "network/coordinator_debug.html": src.Network.CoordinatorDebug, - "network/tailnet_debug.html": src.Network.TailnetDebug, + "network/tailnet_debug.html": tailnetHTML, "workspace/build_logs.txt": humanizeBuildLogs(src.Workspace.BuildLogs), "workspace/template_file.zip": string(templateVersionBytes), "license-status.txt": licenseStatus, @@ -335,12 +507,89 @@ func writeBundle(src *support.Bundle, dest *zip.Writer) error { return xerrors.Errorf("write file %q in archive: %w", k, err) } } + + // Write pprof binary data + if err := writePprofData(src.Pprof, dest); err != nil { + return xerrors.Errorf("write pprof 
data: %w", err) + } + if err := dest.Close(); err != nil { return xerrors.Errorf("close zip file: %w", err) } return nil } +func writePprofData(pprof support.Pprof, dest *zip.Writer) error { + // Write server pprof data directly to pprof directory + if pprof.Server != nil { + if err := writePprofCollection("pprof", pprof.Server, dest); err != nil { + return xerrors.Errorf("write server pprof data: %w", err) + } + } + + // Write agent pprof data + if pprof.Agent != nil { + if err := writePprofCollection("pprof/agent", pprof.Agent, dest); err != nil { + return xerrors.Errorf("write agent pprof data: %w", err) + } + } + + return nil +} + +func writePprofCollection(basePath string, collection *support.PprofCollection, dest *zip.Writer) error { + // Define the pprof files to write with their extensions + files := map[string][]byte{ + "allocs.prof.gz": collection.Allocs, + "heap.prof.gz": collection.Heap, + "profile.prof.gz": collection.Profile, + "block.prof.gz": collection.Block, + "mutex.prof.gz": collection.Mutex, + "goroutine.prof.gz": collection.Goroutine, + "threadcreate.prof.gz": collection.Threadcreate, + "trace.gz": collection.Trace, + } + + // Write binary pprof files + for filename, data := range files { + if len(data) > 0 { + filePath := basePath + "/" + filename + f, err := dest.Create(filePath) + if err != nil { + return xerrors.Errorf("create pprof file %q: %w", filePath, err) + } + if _, err := f.Write(data); err != nil { + return xerrors.Errorf("write pprof file %q: %w", filePath, err) + } + } + } + + // Write cmdline as text file + if collection.Cmdline != "" { + filePath := basePath + "/cmdline.txt" + f, err := dest.Create(filePath) + if err != nil { + return xerrors.Errorf("create cmdline file %q: %w", filePath, err) + } + if _, err := f.Write([]byte(collection.Cmdline)); err != nil { + return xerrors.Errorf("write cmdline file %q: %w", filePath, err) + } + } + + if collection.Symbol != "" { + filePath := basePath + "/symbol.txt" + f, err := 
dest.Create(filePath) + if err != nil { + return xerrors.Errorf("create symbol file %q: %w", filePath, err) + } + if _, err := f.Write([]byte(collection.Symbol)); err != nil { + return xerrors.Errorf("write symbol file %q: %w", filePath, err) + } + } + + return nil +} + func humanizeAgentLogs(ls []codersdk.WorkspaceAgentLog) string { var buf bytes.Buffer tw := tabwriter.NewWriter(&buf, 0, 2, 1, ' ', 0) diff --git a/cli/support_test.go b/cli/support_test.go index 639b9d738f..4587e52c60 100644 --- a/cli/support_test.go +++ b/cli/support_test.go @@ -46,6 +46,8 @@ func TestSupportBundle(t *testing.T) { // Support bundle tests can share a single coderdtest instance. var dc codersdk.DeploymentConfig + dc.Values = coderdtest.DeploymentValues(t) + dc.Values.Prometheus.Enable = true secretValue := uuid.NewString() seedSecretDeploymentOptions(t, &dc, secretValue) client, closer, api := coderdtest.NewWithAPI(t, &coderdtest.Options{ @@ -203,6 +205,10 @@ func assertBundleContents(t *testing.T, path string, wantWorkspace bool, wantAge var v codersdk.DeploymentConfig decodeJSONFromZip(t, f, &v) require.NotEmpty(t, v, "deployment config should not be empty") + case "deployment/entitlements.json": + var v codersdk.Entitlements + decodeJSONFromZip(t, f, &v) + require.NotNil(t, v, "entitlements should not be nil") case "deployment/experiments.json": var v codersdk.Experiments decodeJSONFromZip(t, f, &v) @@ -211,6 +217,22 @@ func assertBundleContents(t *testing.T, path string, wantWorkspace bool, wantAge var v healthsdk.HealthcheckReport decodeJSONFromZip(t, f, &v) require.NotEmpty(t, v, "health report should not be empty") + case "deployment/health_settings.json": + var v healthsdk.HealthSettings + decodeJSONFromZip(t, f, &v) + require.NotEmpty(t, v, "health settings should not be empty") + case "deployment/stats.json": + var v codersdk.DeploymentStats + decodeJSONFromZip(t, f, &v) + require.NotNil(t, v, "deployment stats should not be nil") + case "deployment/workspaces.json": + var 
v codersdk.Workspace + decodeJSONFromZip(t, f, &v) + require.NotNil(t, v, "deployment workspaces should not be nil") + case "deployment/prometheus.txt": + bs := readBytesFromZip(t, f) + require.NotEmpty(t, bs, "prometheus metrics should not be empty") + require.Contains(t, string(bs), "go_goroutines", "prometheus metrics should contain go runtime metrics") case "network/connection_info.json": var v workspacesdk.AgentConnectionInfo decodeJSONFromZip(t, f, &v) diff --git a/cli/testdata/coder_support_bundle_--help.golden b/cli/testdata/coder_support_bundle_--help.golden index fd40548817..ed0973aa42 100644 --- a/cli/testdata/coder_support_bundle_--help.golden +++ b/cli/testdata/coder_support_bundle_--help.golden @@ -14,10 +14,23 @@ OPTIONS: File path for writing the generated support bundle. Defaults to coder-support-$(date +%s).zip. + --pprof bool, $CODER_SUPPORT_BUNDLE_PPROF + Collect pprof profiling data from the Coder server and agent. Requires + Coder server version 2.28.0 or newer. + + --template string, $CODER_SUPPORT_BUNDLE_TEMPLATE + Template name to include in the support bundle. Use + org_name/template_name if template name is reused across multiple + organizations. + --url-override string, $CODER_SUPPORT_BUNDLE_URL_OVERRIDE Override the URL to your Coder deployment. This may be useful, for example, if you need to troubleshoot a specific Coder replica. + --workspaces-total-cap int, $CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP + Maximum number of workspaces to include in the support bundle. Set to + 0 or negative value to disable the cap. Defaults to 10. + -y, --yes bool Bypass confirmation prompts. 
diff --git a/coderd/coderd.go b/coderd/coderd.go index 2be576ea80..645830b7b5 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -27,6 +27,7 @@ import ( "github.com/google/uuid" "github.com/klauspost/compress/zstd" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" "github.com/prometheus/client_golang/prometheus/promhttp" httpSwagger "github.com/swaggo/http-swagger/v2" "go.opentelemetry.io/otel/trace" @@ -334,6 +335,7 @@ func New(options *Options) *API { if options.PrometheusRegistry == nil { options.PrometheusRegistry = prometheus.NewRegistry() + options.PrometheusRegistry.MustRegister(collectors.NewGoCollector()) } if options.Authorizer == nil { options.Authorizer = rbac.NewCachingAuthorizer(options.PrometheusRegistry) diff --git a/docs/reference/cli/support_bundle.md b/docs/reference/cli/support_bundle.md index b6cf5ea6ac..40744f819b 100644 --- a/docs/reference/cli/support_bundle.md +++ b/docs/reference/cli/support_bundle.md @@ -42,3 +42,30 @@ File path for writing the generated support bundle. Defaults to coder-support-$( | Environment | $CODER_SUPPORT_BUNDLE_URL_OVERRIDE | Override the URL to your Coder deployment. This may be useful, for example, if you need to troubleshoot a specific Coder replica. + +### --workspaces-total-cap + +| | | +|-------------|---------------------------------------------------------| +| Type | int | +| Environment | $CODER_SUPPORT_BUNDLE_WORKSPACES_TOTAL_CAP | + +Maximum number of workspaces to include in the support bundle. Set to 0 or negative value to disable the cap. Defaults to 10. + +### --template + +| | | +|-------------|---------------------------------------------| +| Type | string | +| Environment | $CODER_SUPPORT_BUNDLE_TEMPLATE | + +Template name to include in the support bundle. Use org_name/template_name if template name is reused across multiple organizations. 
+ +### --pprof + +| | | +|-------------|------------------------------------------| +| Type | bool | +| Environment | $CODER_SUPPORT_BUNDLE_PPROF | + +Collect pprof profiling data from the Coder server and agent. Requires Coder server version 2.28.0 or newer. diff --git a/support/support.go b/support/support.go index ae9149c725..4b216228ce 100644 --- a/support/support.go +++ b/support/support.go @@ -2,15 +2,19 @@ package support import ( "bytes" + "compress/gzip" "context" "encoding/base64" "encoding/json" "io" + "net" "net/http" "net/http/httptest" "strings" + "time" "github.com/google/uuid" + "golang.org/x/mod/semver" "golang.org/x/sync/errgroup" "golang.org/x/xerrors" "tailscale.com/ipn/ipnstate" @@ -30,20 +34,27 @@ import ( // Even though we do attempt to sanitize data, it may still contain // sensitive information and should thus be treated as secret. type Bundle struct { - Deployment Deployment `json:"deployment"` - Network Network `json:"network"` - Workspace Workspace `json:"workspace"` - Agent Agent `json:"agent"` - Logs []string `json:"logs"` - CLILogs []byte `json:"cli_logs"` + Deployment Deployment `json:"deployment"` + Network Network `json:"network"` + Workspace Workspace `json:"workspace"` + Agent Agent `json:"agent"` + Logs []string `json:"logs"` + CLILogs []byte `json:"cli_logs"` + NamedTemplate TemplateDump `json:"named_template"` + Pprof Pprof `json:"pprof"` } type Deployment struct { - BuildInfo *codersdk.BuildInfoResponse `json:"build"` - Config *codersdk.DeploymentConfig `json:"config"` - Experiments codersdk.Experiments `json:"experiments"` - HealthReport *healthsdk.HealthcheckReport `json:"health_report"` - Licenses []codersdk.License `json:"licenses"` + BuildInfo *codersdk.BuildInfoResponse `json:"build"` + Config *codersdk.DeploymentConfig `json:"config"` + Experiments codersdk.Experiments `json:"experiments"` + HealthReport *healthsdk.HealthcheckReport `json:"health_report"` + Licenses []codersdk.License `json:"licenses"` + Stats 
*codersdk.DeploymentStats `json:"stats"` + Entitlements *codersdk.Entitlements `json:"entitlements"` + HealthSettings *healthsdk.HealthSettings `json:"health_settings"` + Workspaces *codersdk.WorkspacesResponse `json:"workspaces"` + Prometheus []byte `json:"prometheus"` } type Network struct { @@ -83,6 +94,32 @@ type Agent struct { StartupLogs []codersdk.WorkspaceAgentLog `json:"startup_logs"` } +type TemplateDump struct { + Template codersdk.Template `json:"template"` + TemplateVersion codersdk.TemplateVersion `json:"template_version"` + TemplateFileBase64 string `json:"template_file_base64"` +} + +type Pprof struct { + Server *PprofCollection `json:"server,omitempty"` + Agent *PprofCollection `json:"agent,omitempty"` +} + +type PprofCollection struct { + Heap []byte `json:"heap,omitempty"` + Allocs []byte `json:"allocs,omitempty"` + Profile []byte `json:"profile,omitempty"` + Block []byte `json:"block,omitempty"` + Mutex []byte `json:"mutex,omitempty"` + Goroutine []byte `json:"goroutine,omitempty"` + Threadcreate []byte `json:"threadcreate,omitempty"` + Trace []byte `json:"trace,omitempty"` + Cmdline string `json:"cmdline,omitempty"` + Symbol string `json:"symbol,omitempty"` + CollectedAt time.Time `json:"collected_at"` + EndpointURL string `json:"endpoint_url"` +} + // Deps is a set of dependencies for discovering information type Deps struct { // Source from which to obtain information. @@ -94,9 +131,17 @@ type Deps struct { // AgentID is the optional agent ID against which to run connection tests. // Defaults to the first agent of the workspace, if not specified. AgentID uuid.UUID + // WorkspacesTotalCap limits the TOTAL number of workspaces aggregated into the bundle. + // > 0 => cap at this number (the CLI flag --workspaces-total-cap defaults to 10). + // <= 0 => no cap (fetch/keep all available workspaces). + WorkspacesTotalCap int + // TemplateID optionally specifies a template to capture (active version).
+ TemplateID uuid.UUID + // CollectPprof toggles server and agent pprof collection. + CollectPprof bool } -func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) Deployment { +func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logger, workspacesCap int) Deployment { // Note: each goroutine assigns to a different struct field, hence no mutex. var ( d Deployment @@ -154,13 +199,157 @@ func DeploymentInfo(ctx context.Context, client *codersdk.Client, log slog.Logge return nil }) + // Deployment stats + eg.Go(func() error { + stats, err := client.DeploymentStats(ctx) + if err != nil { + // If unauthorized or forbidden, log and continue + if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized || cerr.StatusCode() == http.StatusBadRequest) { + log.Warn(ctx, "unable to fetch deployment stats") + return nil + } + return xerrors.Errorf("fetch deployment stats: %w", err) + } + d.Stats = &stats + return nil + }) + + // Entitlements + eg.Go(func() error { + ents, err := client.Entitlements(ctx) + if err != nil { + // Ignore 404 or enterprise-not-enabled + if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusNotFound || cerr.StatusCode() == http.StatusForbidden) { + log.Warn(ctx, "unable to fetch entitlements") + return nil + } + return xerrors.Errorf("fetch entitlements: %w", err) + } + d.Entitlements = &ents + return nil + }) + + // Health settings + eg.Go(func() error { + settings, err := healthsdk.New(client).HealthSettings(ctx) + if err != nil { + // If not accessible, log and continue + if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized) { + log.Warn(ctx, "unable to fetch health settings") + return nil + } + return xerrors.Errorf("fetch health settings: %w", err) + } + d.HealthSettings = &settings + return nil + }) + + // List 
workspaces (paginated) + eg.Go(func() error { + var ( + offset int + limit = 200 + all []codersdk.Workspace + count int + ) + capTotal := workspacesCap + for { + resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{Offset: offset, Limit: limit}) + if err != nil { + // Log and continue if forbidden; otherwise return error + if cerr, ok := codersdk.AsError(err); ok && (cerr.StatusCode() == http.StatusForbidden || cerr.StatusCode() == http.StatusUnauthorized) { + log.Warn(ctx, "unable to list workspaces") + break + } + return xerrors.Errorf("list workspaces: %w", err) + } + if d.Workspaces == nil { + d.Workspaces = &resp + } + // sanitize env vars on agents in each workspace before appending + for i := range resp.Workspaces { + ws := &resp.Workspaces[i] + for _, res := range ws.LatestBuild.Resources { + for _, agt := range res.Agents { + // safe to call even if map is nil (range in sanitizeEnv would be empty) + sanitizeEnv(agt.EnvironmentVariables) + } + } + } + all = append(all, resp.Workspaces...) + count = resp.Count + // Stop early once we've reached the cap; trim any overflow from the last page. + if capTotal > 0 && len(all) >= capTotal { + if len(all) > capTotal { + all = all[:capTotal] + } + break + } + if offset+len(resp.Workspaces) >= count || len(resp.Workspaces) == 0 { + break + } + offset += len(resp.Workspaces) + } + if d.Workspaces != nil { + // Replace with aggregated list + d.Workspaces.Workspaces = all + // Preserve server-reported total so Run() can log accurate truncation. 
+ d.Workspaces.Count = count + } + return nil + }) + if err := eg.Wait(); err != nil { log.Error(ctx, "fetch deployment information", slog.Error(err)) } + if d.Config != nil && d.Config.Values != nil { + prometheusCfg := d.Config.Values.Prometheus + if prometheusCfg.Enable.Value() { + metrics, err := fetchPrometheusMetrics(ctx, client, log) + if err != nil { + log.Warn(ctx, "fetch coderd prometheus metrics", slog.Error(err)) + } else { + d.Prometheus = metrics + } + } + } + return d } +func fetchPrometheusMetrics(ctx context.Context, client *codersdk.Client, log slog.Logger) ([]byte, error) { + if client == nil { + return nil, xerrors.New("nil client") + } + + reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + resp, err := client.Request(reqCtx, http.MethodGet, "/api/v2/debug/metrics", nil) + if err != nil { + return nil, xerrors.Errorf("request metrics: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, xerrors.Errorf("read metrics body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + log.Debug(ctx, "coderd prometheus metrics fetch non-200", + slog.F("status", resp.StatusCode), slog.F("body_len", len(body))) + return nil, xerrors.Errorf("unexpected status code %d", resp.StatusCode) + } + + trimmed := bytes.TrimSpace(body) + if len(trimmed) == 0 { + return nil, xerrors.New("empty prometheus metrics response") + } + return append([]byte(nil), trimmed...), nil +} + func NetworkInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) Network { var ( n Network @@ -471,6 +660,234 @@ func connectedAgentInfo(ctx context.Context, client *codersdk.Client, log slog.L return closer } +func PprofInfo(ctx context.Context, client *codersdk.Client, log slog.Logger) *PprofCollection { + if client == nil { + return nil + } + + var ( + p PprofCollection + eg errgroup.Group + ) + + if client.URL != nil { + if u, err := client.URL.Parse("/api/v2/debug/pprof"); err == nil { + 
p.EndpointURL = u.String() + } + } + if p.EndpointURL == "" { + p.EndpointURL = "/api/v2/debug/pprof" + } + p.CollectedAt = time.Now() + + const basePath = "/api/v2/debug/pprof" + endpoints := map[string]func([]byte){ + "/allocs": func(data []byte) { + p.Allocs = compressData(data) + }, + "/heap": func(data []byte) { + p.Heap = compressData(data) + }, + "/profile?seconds=30": func(data []byte) { + p.Profile = compressData(data) + }, + "/block": func(data []byte) { + p.Block = compressData(data) + }, + "/mutex": func(data []byte) { + p.Mutex = compressData(data) + }, + "/goroutine": func(data []byte) { + p.Goroutine = compressData(data) + }, + "/threadcreate": func(data []byte) { + p.Threadcreate = compressData(data) + }, + "/trace?seconds=30": func(data []byte) { + p.Trace = compressData(data) + }, + "/cmdline": func(data []byte) { + p.Cmdline = string(data) + }, + "/symbol": func(data []byte) { + p.Symbol = string(data) + }, + } + + for endpoint, setter := range endpoints { + endpoint, setter := endpoint, setter + eg.Go(func() error { + timeout := 10 * time.Second + if strings.Contains(endpoint, "seconds=30") { + timeout = 45 * time.Second + } + + reqCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + resp, err := client.Request(reqCtx, http.MethodGet, basePath+endpoint, nil) + if err != nil { + log.Warn(reqCtx, "failed to fetch pprof data", slog.F("endpoint", endpoint), slog.Error(err)) + return nil + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + log.Warn(reqCtx, "pprof endpoint returned non-200 status", + slog.F("endpoint", endpoint), slog.F("status", resp.StatusCode)) + return nil + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + log.Warn(reqCtx, "failed to read pprof response", slog.F("endpoint", endpoint), slog.Error(err)) + return nil + } + + setter(data) + return nil + }) + } + + if err := eg.Wait(); err != nil { + log.Error(ctx, "failed to collect some pprof data", slog.Error(err)) + } + + return &p 
+} + +func compressData(data []byte) []byte { + if len(data) == 0 { + return data + } + + var buf bytes.Buffer + gz := gzip.NewWriter(&buf) + if _, err := gz.Write(data); err != nil { + return data // Return uncompressed if compression fails + } + if err := gz.Close(); err != nil { + return data + } + + return buf.Bytes() +} + +func PprofInfoFromAgent(ctx context.Context, conn workspacesdk.AgentConn, log slog.Logger) *PprofCollection { + if conn == nil { + return nil + } + + var ( + p PprofCollection + eg errgroup.Group + ) + + p.EndpointURL = "agent" + p.CollectedAt = time.Now() + + // Define agent pprof endpoints - these go through the agent connection + endpoints := map[string]func([]byte){ + "/debug/pprof/allocs": func(data []byte) { + p.Allocs = compressData(data) + }, + "/debug/pprof/heap": func(data []byte) { + p.Heap = compressData(data) + }, + "/debug/pprof/profile?seconds=30": func(data []byte) { + p.Profile = compressData(data) + }, + "/debug/pprof/block": func(data []byte) { + p.Block = compressData(data) + }, + "/debug/pprof/mutex": func(data []byte) { + p.Mutex = compressData(data) + }, + "/debug/pprof/goroutine": func(data []byte) { + p.Goroutine = compressData(data) + }, + "/debug/pprof/threadcreate": func(data []byte) { + p.Threadcreate = compressData(data) + }, + "/debug/pprof/trace?seconds=30": func(data []byte) { + p.Trace = compressData(data) + }, + "/debug/pprof/cmdline": func(data []byte) { + p.Cmdline = string(data) + }, + "/debug/pprof/symbol": func(data []byte) { + p.Symbol = string(data) + }, + } + + // Collect each endpoint in parallel + for endpoint, setter := range endpoints { + endpoint, setter := endpoint, setter // capture loop variables + eg.Go(func() error { + // Set longer timeout for profile and trace endpoints (they take 30 seconds) + timeout := 10 * time.Second + if strings.Contains(endpoint, "seconds=30") { + timeout = 45 * time.Second + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + // Use the 
agent's direct HTTP capability + // Agent pprof server runs on 127.0.0.1:6060 by default + netConn, err := conn.DialContext(ctx, "tcp", "127.0.0.1:6060") + if err != nil { + log.Warn(ctx, "failed to dial agent pprof endpoint", slog.F("endpoint", endpoint), slog.Error(err)) + return nil + } + defer netConn.Close() + + // Create HTTP client using the connection + client := &http.Client{ + Transport: &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + return netConn, nil + }, + }, + Timeout: timeout, + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://127.0.0.1:6060"+endpoint, nil) + if err != nil { + log.Warn(ctx, "failed to create agent pprof request", slog.F("endpoint", endpoint), slog.Error(err)) + return nil + } + + resp, err := client.Do(req) + if err != nil { + log.Warn(ctx, "failed to fetch agent pprof data", slog.F("endpoint", endpoint), slog.Error(err)) + return nil + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + log.Warn(ctx, "agent pprof endpoint returned non-200 status", slog.F("endpoint", endpoint), slog.F("status", resp.StatusCode)) + return nil + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + log.Warn(ctx, "failed to read agent pprof response", slog.F("endpoint", endpoint), slog.Error(err)) + return nil + } + + setter(data) + return nil + }) + } + + if err := eg.Wait(); err != nil { + log.Error(ctx, "failed to collect some agent pprof data", slog.Error(err)) + } + + return &p +} + // Run generates a support bundle with the given dependencies. 
func Run(ctx context.Context, d *Deps) (*Bundle, error) { var b Bundle @@ -505,9 +922,28 @@ func Run(ctx context.Context, d *Deps) (*Bundle, error) { } } + totalCap := d.WorkspacesTotalCap + var eg errgroup.Group eg.Go(func() error { - di := DeploymentInfo(ctx, d.Client, d.Log) + di := DeploymentInfo(ctx, d.Client, d.Log, totalCap) + + if di.Workspaces != nil && totalCap > 0 { + origTotal := di.Workspaces.Count // server-reported total + + // Ensure at most 'totalCap' are returned (covers non-early-exit path). + if len(di.Workspaces.Workspaces) > totalCap { + di.Workspaces.Workspaces = di.Workspaces.Workspaces[:totalCap] + } + // If we returned fewer than the original total, log a truncation. + if origTotal > len(di.Workspaces.Workspaces) { + di.Workspaces.Count = len(di.Workspaces.Workspaces) + d.Log.Warn(ctx, "workspace list truncated", + slog.F("cap", totalCap), + slog.F("original_total", origTotal), + ) + } + } b.Deployment = di return nil }) @@ -527,11 +963,129 @@ func Run(ctx context.Context, d *Deps) (*Bundle, error) { return nil }) + // Optional: capture a template's active version and file if TemplateID is set. 
+ eg.Go(func() error { + if d.TemplateID == uuid.Nil { + return nil + } + var td TemplateDump + tpl, err := d.Client.Template(ctx, d.TemplateID) + if err != nil { + d.Log.Error(ctx, "fetch template", slog.Error(err), slog.F("template_id", d.TemplateID)) + return nil + } + td.Template = tpl + if tpl.ActiveVersionID == uuid.Nil { + d.Log.Error(ctx, "template has nil active version id", slog.F("template_id", tpl.ID)) + b.NamedTemplate = td + return nil + } + tv, err := d.Client.TemplateVersion(ctx, tpl.ActiveVersionID) + if err != nil { + d.Log.Error(ctx, "fetch active template version", slog.Error(err), slog.F("active_version_id", tpl.ActiveVersionID)) + b.NamedTemplate = td + return nil + } + td.TemplateVersion = tv + if tv.Job.FileID == uuid.Nil { + d.Log.Error(ctx, "template file id is nil", slog.F("template_version_id", tv.ID)) + b.NamedTemplate = td + return nil + } + raw, ctype, err := d.Client.DownloadWithFormat(ctx, tv.Job.FileID, codersdk.FormatZip) + if err != nil || ctype != codersdk.ContentTypeZip { + d.Log.Error(ctx, "download template file", slog.Error(err), slog.F("content_type", ctype)) + b.NamedTemplate = td + return nil + } + td.TemplateFileBase64 = base64.StdEncoding.EncodeToString(raw) + b.NamedTemplate = td + return nil + }) + _ = eg.Wait() + // Collect pprof data after deployment info is available (need version check). + // Pprof endpoints require Coder server version 2.28.0 or newer. + if d.CollectPprof { + b.Pprof = collectPprof(ctx, d, &b) + } + return &b, nil } +// minPprofVersion is the minimum Coder server version that supports +// the /api/v2/debug/pprof endpoints. +const minPprofVersion = "v2.28.0" + +// VersionSupportsPprof checks if the given version supports pprof endpoints. 
+func VersionSupportsPprof(version string) bool { + if version == "" { + return false + } + if version[0] != 'v' { + version = "v" + version + } + // For prerelease versions like "v2.28.0-devel+abc123", we compare + // the major.minor.patch portion since prereleases of 2.28.0 should + // have the pprof feature. + canonical := semver.Canonical(version) + if idx := strings.Index(canonical, "-"); idx != -1 { + canonical = canonical[:idx] + } + return semver.Compare(canonical, minPprofVersion) >= 0 +} + +func collectPprof(ctx context.Context, d *Deps, b *Bundle) Pprof { + var pprof Pprof + + // Check server version before attempting pprof collection. + if b.Deployment.BuildInfo == nil { + d.Log.Warn(ctx, "skipping pprof collection: build info not available") + return pprof + } + if !VersionSupportsPprof(b.Deployment.BuildInfo.Version) { + d.Log.Warn(ctx, "skipping pprof collection: server version too old", + slog.F("version", b.Deployment.BuildInfo.Version), + slog.F("min_version", minPprofVersion)) + return pprof + } + + serverPprof := PprofInfo(ctx, d.Client, d.Log) + if serverPprof != nil { + pprof.Server = serverPprof + } + + if d.AgentID != uuid.Nil { + conn, err := workspacesdk.New(d.Client). 
+ DialAgent(ctx, d.AgentID, &workspacesdk.DialAgentOptions{ + Logger: d.Log.Named("dial-agent-pprof"), + BlockEndpoints: false, + }) + if err != nil { + d.Log.Warn(ctx, "failed to dial agent for pprof collection", slog.Error(err)) + } else { + defer func() { + if err := conn.Close(); err != nil { + d.Log.Error(ctx, "failed to close agent pprof connection", slog.Error(err)) + } + <-conn.TailnetConn().Closed() + }() + + if conn.AwaitReachable(ctx) { + agentPprof := PprofInfoFromAgent(ctx, conn, d.Log) + if agentPprof != nil { + pprof.Agent = agentPprof + } + } else { + d.Log.Warn(ctx, "agent not reachable for pprof collection") + } + } + } + + return pprof +} + // sanitizeEnv modifies kvs in place and replaces the values all non-empty keys // with the string ***REDACTED*** func sanitizeEnv(kvs map[string]string) { diff --git a/support/support_test.go b/support/support_test.go index f3c374347c..cbb4783c36 100644 --- a/support/support_test.go +++ b/support/support_test.go @@ -3,6 +3,7 @@ package support_test import ( "bytes" "context" + "fmt" "io" "net/http" "os" @@ -27,6 +28,7 @@ import ( "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/support" "github.com/coder/coder/v2/testutil" + "github.com/coder/serpent" ) func TestMain(m *testing.M) { @@ -39,6 +41,10 @@ func TestRun(t *testing.T) { t.Run("OK", func(t *testing.T) { t.Parallel() cfg := coderdtest.DeploymentValues(t) + promPort := testutil.RandomPort(t) + cfg.Prometheus.Enable = serpent.Bool(true) + cfg.Prometheus.Address.Host = "127.0.0.1" + cfg.Prometheus.Address.Port = fmt.Sprintf("%d", promPort) cfg.Experiments = []string{"foo"} ctx := testutil.Context(t, testutil.WaitLong) client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{ @@ -86,8 +92,24 @@ func TestRun(t *testing.T) { assertNotNilNotEmpty(t, bun.Agent.PeerDiagnostics, "agent peer diagnostics should be present") assertNotNilNotEmpty(t, bun.Agent.PingResult, "agent ping result should be present") assertNotNilNotEmpty(t, 
bun.Agent.Prometheus, "agent prometheus metrics should be present") + assertNotNilNotEmpty(t, bun.Deployment.Prometheus, "deployment prometheus metrics should be present") assertNotNilNotEmpty(t, bun.Agent.StartupLogs, "agent startup logs should be present") assertNotNilNotEmpty(t, bun.Logs, "bundle logs should be present") + assert.Nil(t, bun.Pprof.Server, "server pprof should not be collected without CollectPprof") + assert.Nil(t, bun.Pprof.Agent, "agent pprof should not be collected without CollectPprof") + + // New: deployment health settings should be present + assertNotNilNotEmpty(t, bun.Deployment.HealthSettings, "deployment health settings should be present") + // New: aggregated workspaces should be present and include created workspace + assert.NotNil(t, bun.Deployment.Workspaces, "deployment workspaces should be present") + assert.GreaterOrEqual(t, bun.Deployment.Workspaces.Count, 1) + for _, aws := range bun.Deployment.Workspaces.Workspaces { + for _, res := range aws.LatestBuild.Resources { + for _, a := range res.Agents { + assertSanitizedEnv(t, a.EnvironmentVariables) + } + } + } }) t.Run("OK_NoWorkspace", func(t *testing.T) { @@ -120,6 +142,13 @@ func TestRun(t *testing.T) { assert.Empty(t, bun.Workspace.Workspace, "did not expect workspace to be present") assert.Empty(t, bun.Agent, "did not expect agent to be present") assertNotNilNotEmpty(t, bun.Logs, "bundle logs should be present") + assert.Nil(t, bun.Pprof.Server, "server pprof should not be collected without CollectPprof") + assert.Nil(t, bun.Pprof.Agent, "agent pprof should not be collected without CollectPprof") + + // New: health settings should be present even without workspace context + assertNotNilNotEmpty(t, bun.Deployment.HealthSettings, "deployment health settings should be present") + // New: aggregated workspaces struct should exist (may be empty) + assert.NotNil(t, bun.Deployment.Workspaces) }) t.Run("NoAuth", func(t *testing.T) { diff --git a/support/version_test.go 
b/support/version_test.go new file mode 100644 index 0000000000..33e185fbce --- /dev/null +++ b/support/version_test.go @@ -0,0 +1,36 @@ +package support_test + +import ( + "testing" + + "github.com/coder/coder/v2/support" +) + +func TestVersionSupportsPprof(t *testing.T) { + t.Parallel() + tests := []struct { + version string + want bool + }{ + {"", false}, + {"v2.27.0", false}, + {"v2.27.9", false}, + {"v2.28.0", true}, + {"v2.28.1", true}, + {"v2.29.0", true}, + {"v3.0.0", true}, + {"2.28.0", true}, // without v prefix + {"2.27.0", false}, // without v prefix + {"v2.28.0-devel+abc123", true}, // dev version + {"v2.27.0-devel+abc123", false}, + } + for _, tt := range tests { + t.Run(tt.version, func(t *testing.T) { + t.Parallel() + got := support.VersionSupportsPprof(tt.version) + if got != tt.want { + t.Errorf("versionSupportsPprof(%q) = %v, want %v", tt.version, got, tt.want) + } + }) + } +}