diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index 4f9a7c96bc..6c62a51fc7 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -869,6 +869,28 @@ const docTemplate = `{ } } }, + "/debug/profile": { + "post": { + "security": [ + { + "CoderSessionToken": [] + } + ], + "tags": [ + "Debug" + ], + "summary": "Collect debug profiles", + "operationId": "collect-debug-profiles", + "responses": { + "200": { + "description": "OK" + } + }, + "x-apidocgen": { + "skip": true + } + } + }, "/debug/tailnet": { "get": { "security": [ diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index 9280851f02..17f2a705de 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -752,6 +752,26 @@ } } }, + "/debug/profile": { + "post": { + "security": [ + { + "CoderSessionToken": [] + } + ], + "tags": ["Debug"], + "summary": "Collect debug profiles", + "operationId": "collect-debug-profiles", + "responses": { + "200": { + "description": "OK" + } + }, + "x-apidocgen": { + "skip": true + } + } + }, "/debug/tailnet": { "get": { "security": [ diff --git a/coderd/coderd.go b/coderd/coderd.go index ced5b3968b..19caea0445 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -627,7 +627,8 @@ func New(options *Options) *API { options.Database, options.Pubsub, ), - dbRolluper: options.DatabaseRolluper, + dbRolluper: options.DatabaseRolluper, + ProfileCollector: defaultProfileCollector{}, } api.WorkspaceAppsProvider = workspaceapps.NewDBTokenProvider( ctx, @@ -1732,6 +1733,8 @@ func New(options *Options) *API { } r.Method("GET", "/expvar", expvar.Handler()) // contains DERP metrics as well as cmdline and memstats + r.Post("/profile", api.debugCollectProfile) + r.Route("/pprof", func(r chi.Router) { r.Use(func(next http.Handler) http.Handler { // Some of the pprof handlers strip the `/debug/pprof` @@ -2019,6 +2022,15 @@ type API struct { // gitSyncWorker refreshes stale chat diff statuses in the // background. 
gitSyncWorker *gitsync.Worker + + // ProfileCollector abstracts the runtime/pprof and runtime/trace + // calls used by the /debug/profile endpoint. Tests override this + // with a stub to avoid process-global side-effects. + ProfileCollector ProfileCollector + // ProfileCollecting is used as a concurrency guard so that only one + // profile collection (via /debug/profile) can run at a time. The CPU + // profiler is process-global, so concurrent collections would fail. + ProfileCollecting atomic.Bool } // Close waits for all WebSocket connections to drain before returning. diff --git a/coderd/debug.go b/coderd/debug.go index cd07fde235..0887485aaa 100644 --- a/coderd/debug.go +++ b/coderd/debug.go @@ -1,13 +1,20 @@ package coderd import ( + "archive/tar" "bytes" + "compress/gzip" "context" "database/sql" "encoding/json" "fmt" + "io" "net/http" + "runtime" + "runtime/pprof" + "runtime/trace" "slices" + "strings" "time" "github.com/google/uuid" @@ -330,6 +337,298 @@ func loadDismissedHealthchecks(ctx context.Context, db database.Store, logger sl return dismissedHealthchecks } +// ProfileCollector abstracts the mechanics of collecting pprof/trace +// data from the Go runtime. Production code uses defaultProfileCollector; +// tests can substitute a stub to avoid process-global side-effects. +type ProfileCollector interface { + // StartCPUProfile begins CPU profiling, writing to w. It returns + // a stop function that must be called to finish profiling. + StartCPUProfile(w io.Writer) (stop func(), err error) + // StartTrace begins execution tracing, writing to w. It returns + // a stop function that must be called to finish tracing. + StartTrace(w io.Writer) (stop func(), err error) + // LookupProfile writes the named snapshot profile to w. + LookupProfile(name string, w io.Writer) error + // SetBlockProfileRate enables/disables block profiling. + SetBlockProfileRate(rate int) + // SetMutexProfileFraction enables/disables mutex profiling. 
// defaultProfileCollector is the production profile collector. Each method
// forwards directly to runtime, runtime/pprof, or runtime/trace, so every
// call acts on process-wide profiler state.
type defaultProfileCollector struct{}

// StartCPUProfile turns on the runtime CPU profiler with w as its sink.
// On success the returned stop function is pprof.StopCPUProfile; on
// failure the stop function is nil and the profiler's error is returned
// unchanged.
func (defaultProfileCollector) StartCPUProfile(w io.Writer) (func(), error) {
	err := pprof.StartCPUProfile(w)
	if err != nil {
		return nil, err
	}
	return pprof.StopCPUProfile, nil
}

// StartTrace turns on the runtime execution tracer with w as its sink.
// On success the returned stop function is trace.Stop; on failure the
// stop function is nil and the tracer's error is returned unchanged.
func (defaultProfileCollector) StartTrace(w io.Writer) (func(), error) {
	err := trace.Start(w)
	if err != nil {
		return nil, err
	}
	return trace.Stop, nil
}

// LookupProfile writes the named pprof profile to w in the binary
// (debug=0) format. A name that pprof.Lookup does not know is not treated
// as an error: nothing is written and nil is returned.
func (defaultProfileCollector) LookupProfile(name string, w io.Writer) error {
	if profile := pprof.Lookup(name); profile != nil {
		return profile.WriteTo(w, 0)
	}
	return nil
}

// SetBlockProfileRate forwards rate to runtime.SetBlockProfileRate.
func (defaultProfileCollector) SetBlockProfileRate(rate int) {
	runtime.SetBlockProfileRate(rate)
}

// SetMutexProfileFraction forwards rate to
// runtime.SetMutexProfileFraction and returns the fraction that was in
// effect before the call.
func (defaultProfileCollector) SetMutexProfileFraction(rate int) int {
	previous := runtime.SetMutexProfileFraction(rate)
	return previous
}
+ profileDurationMax = 60 * time.Second +) + +// @Summary Collect debug profiles +// @ID collect-debug-profiles +// @Security CoderSessionToken +// @Tags Debug +// @Success 200 +// @Router /debug/profile [post] +// @x-apidocgen {"skip": true} +func (api *API) debugCollectProfile(rw http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + // Parse duration. + duration := profileDurationDefault + if v := r.URL.Query().Get("duration"); v != "" { + d, err := time.ParseDuration(v) + if err != nil { + httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ + Message: "Invalid duration parameter.", + Detail: err.Error(), + }) + return + } + if d <= 0 { + httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ + Message: "Duration must be positive.", + }) + return + } + if d > profileDurationMax { + httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ + Message: fmt.Sprintf("Duration cannot exceed %s.", profileDurationMax), + }) + return + } + duration = d + } + + // Parse requested profiles. + profiles := defaultProfiles + if v := r.URL.Query().Get("profiles"); v != "" { + profiles = strings.Split(v, ",") + for _, p := range profiles { + if !allValidProfiles[p] { + httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ + Message: fmt.Sprintf("Unknown profile type: %q.", p), + Detail: "Valid types: cpu, heap, allocs, block, mutex, goroutine, threadcreate, trace", + }) + return + } + } + } + + // Only one profile collection can run at a time because the CPU + // profiler is process-global. + if !api.ProfileCollecting.CompareAndSwap(false, true) { + httpapi.Write(ctx, rw, http.StatusConflict, codersdk.Response{ + Message: "A profile collection is already in progress. Try again later.", + }) + return + } + defer api.ProfileCollecting.Store(false) + + // Temporarily enable block and mutex profiling so those profiles are + // actually populated. Restore previous values when we are done. 
+ // SetBlockProfileRate does not return the previous value, so we + // simply disable it again after collection (the default is 0). + pc := api.ProfileCollector + pc.SetBlockProfileRate(1) + prevMutexFraction := pc.SetMutexProfileFraction(1) + defer pc.SetBlockProfileRate(0) + defer pc.SetMutexProfileFraction(prevMutexFraction) + + // Determine which profiles need the timed collection (cpu, trace) vs + // instant snapshots. + wantCPU := false + wantTrace := false + for _, p := range profiles { + switch p { + case "cpu": + wantCPU = true + case "trace": + wantTrace = true + } + } + + // Collect timed profiles (cpu and/or trace) for the requested + // duration. StartCPUProfile and StartTrace each return a stop + // function that must be called to finish collection. + var cpuBuf, traceBuf bytes.Buffer + var stopCPU, stopTrace func() + if wantCPU { + var err error + stopCPU, err = pc.StartCPUProfile(&cpuBuf) + if err != nil { + httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ + Message: "Failed to start CPU profile.", + Detail: err.Error(), + }) + return + } + } + if wantTrace { + var err error + stopTrace, err = pc.StartTrace(&traceBuf) + if err != nil { + if stopCPU != nil { + stopCPU() + } + httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ + Message: "Failed to start trace.", + Detail: err.Error(), + }) + return + } + } + + if wantCPU || wantTrace { + timer := api.Clock.NewTimer(duration, "debugCollectProfile") + defer timer.Stop() + select { + case <-ctx.Done(): + if stopCPU != nil { + stopCPU() + } + if stopTrace != nil { + stopTrace() + } + // Client disconnected; nothing to write. + return + case <-timer.C: + } + if stopCPU != nil { + stopCPU() + } + if stopTrace != nil { + stopTrace() + } + } + + // Build the tar.gz archive. 
+ var archive bytes.Buffer + gzw := gzip.NewWriter(&archive) + tw := tar.NewWriter(gzw) + + addFile := func(name string, data []byte) error { + hdr := &tar.Header{ + Name: name, + Mode: 0o644, + Size: int64(len(data)), + } + if err := tw.WriteHeader(hdr); err != nil { + return xerrors.Errorf("write tar header for %s: %w", name, err) + } + if _, err := tw.Write(data); err != nil { + return xerrors.Errorf("write tar data for %s: %w", name, err) + } + return nil + } + + for _, p := range profiles { + switch p { + case "cpu": + if err := addFile("cpu.prof", cpuBuf.Bytes()); err != nil { + httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ + Message: "Failed to write CPU profile to archive.", + Detail: err.Error(), + }) + return + } + case "trace": + if err := addFile("trace.out", traceBuf.Bytes()); err != nil { + httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ + Message: "Failed to write trace to archive.", + Detail: err.Error(), + }) + return + } + default: + // Snapshot profiles: heap, allocs, block, mutex, goroutine, + // threadcreate. 
+ var buf bytes.Buffer + if err := pc.LookupProfile(p, &buf); err != nil { + httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ + Message: fmt.Sprintf("Failed to collect %s profile.", p), + Detail: err.Error(), + }) + return + } + if err := addFile(p+".prof", buf.Bytes()); err != nil { + httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ + Message: fmt.Sprintf("Failed to write %s profile to archive.", p), + Detail: err.Error(), + }) + return + } + } + } + + if err := tw.Close(); err != nil { + httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ + Message: "Failed to finalize tar archive.", + Detail: err.Error(), + }) + return + } + if err := gzw.Close(); err != nil { + httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ + Message: "Failed to finalize gzip archive.", + Detail: err.Error(), + }) + return + } + + filename := fmt.Sprintf("coderd-profile-%d.tar.gz", time.Now().Unix()) + rw.Header().Set("Content-Type", "application/gzip") + rw.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%q", filename)) + rw.WriteHeader(http.StatusOK) + _, _ = rw.Write(archive.Bytes()) +} + // @Summary Debug pprof index // @ID debug-pprof-index // @Security CoderSessionToken diff --git a/coderd/debug_test.go b/coderd/debug_test.go index c24f84923f..a2e888a631 100644 --- a/coderd/debug_test.go +++ b/coderd/debug_test.go @@ -1,6 +1,9 @@ package coderd_test import ( + "archive/tar" + "bytes" + "compress/gzip" "context" "encoding/json" "io" @@ -13,8 +16,11 @@ import ( "github.com/stretchr/testify/require" "cdr.dev/slog/v3/sloggers/slogtest" + "github.com/coder/coder/v2/coderd" "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/healthcheck" + "github.com/coder/coder/v2/coderd/rbac" + "github.com/coder/coder/v2/coderd/rbac/policy" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/healthsdk" "github.com/coder/coder/v2/testutil" @@ 
// noopProfileCollector is a profile collector stub whose methods do
// nothing. It never touches the process-global CPU profiler or tracer,
// so tests that use it can run in parallel safely.
type noopProfileCollector struct{}

// StartCPUProfile ignores its writer and returns a stop function that
// does nothing.
func (noopProfileCollector) StartCPUProfile(io.Writer) (func(), error) {
	stop := func() {}
	return stop, nil
}

// StartTrace ignores its writer and returns a stop function that does
// nothing.
func (noopProfileCollector) StartTrace(io.Writer) (func(), error) {
	stop := func() {}
	return stop, nil
}

// LookupProfile writes nothing and always succeeds.
func (noopProfileCollector) LookupProfile(string, io.Writer) error {
	return nil
}

// SetBlockProfileRate discards the requested rate.
func (noopProfileCollector) SetBlockProfileRate(int) {
}

// SetMutexProfileFraction discards the requested fraction and always
// reports a previous fraction of zero.
func (noopProfileCollector) SetMutexProfileFraction(int) int {
	return 0
}
+ Duration: 100 * time.Millisecond, + }) + require.NoError(t, err) + defer body.Close() + + data, err := io.ReadAll(body) + require.NoError(t, err) + require.NotEmpty(t, data, "archive should not be empty") + + // Verify that the response is a valid tar.gz archive containing + // the expected profile files. + files := extractTarGzFiles(t, data) + require.Contains(t, files, "cpu.prof") + require.Contains(t, files, "heap.prof") + require.Contains(t, files, "allocs.prof") + require.Contains(t, files, "block.prof") + require.Contains(t, files, "mutex.prof") + require.Contains(t, files, "goroutine.prof") + + // Verify the endpoint checks the correct RBAC permission. + asserter.AssertChecked(t, policy.ActionRead, rbac.ResourceDebugInfo) + }) + + t.Run("CustomProfiles", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitLong) + + client, closer, _ := newTestAPI(t) + defer closer.Close() + _ = coderdtest.CreateFirstUser(t, client) + + body, err := client.DebugCollectProfile(ctx, codersdk.DebugProfileOptions{ + Duration: 100 * time.Millisecond, + Profiles: []string{"heap", "goroutine"}, + }) + require.NoError(t, err) + defer body.Close() + + data, err := io.ReadAll(body) + require.NoError(t, err) + + files := extractTarGzFiles(t, data) + require.Contains(t, files, "heap.prof") + require.Contains(t, files, "goroutine.prof") + // Should NOT contain profiles we didn't ask for. 
+ require.NotContains(t, files, "cpu.prof") + require.NotContains(t, files, "allocs.prof") + }) + + t.Run("WithTraceAndCPU", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitLong) + + client, closer, _ := newTestAPI(t) + defer closer.Close() + _ = coderdtest.CreateFirstUser(t, client) + + body, err := client.DebugCollectProfile(ctx, codersdk.DebugProfileOptions{ + Duration: 100 * time.Millisecond, + Profiles: []string{"cpu", "trace"}, + }) + require.NoError(t, err) + defer body.Close() + + data, err := io.ReadAll(body) + require.NoError(t, err) + + files := extractTarGzFiles(t, data) + require.Contains(t, files, "cpu.prof") + require.Contains(t, files, "trace.out") + }) + + t.Run("DurationTooLong", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitShort) + + client := coderdtest.New(t, nil) + _ = coderdtest.CreateFirstUser(t, client) + + res, err := client.Request(ctx, "POST", "/api/v2/debug/profile?duration=5m", nil) + require.NoError(t, err) + defer res.Body.Close() + require.Equal(t, http.StatusBadRequest, res.StatusCode) + }) + + t.Run("InvalidDuration", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitShort) + + client := coderdtest.New(t, nil) + _ = coderdtest.CreateFirstUser(t, client) + + res, err := client.Request(ctx, "POST", "/api/v2/debug/profile?duration=notaduration", nil) + require.NoError(t, err) + defer res.Body.Close() + require.Equal(t, http.StatusBadRequest, res.StatusCode) + }) + + t.Run("InvalidProfile", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitShort) + + client := coderdtest.New(t, nil) + _ = coderdtest.CreateFirstUser(t, client) + + res, err := client.Request(ctx, "POST", "/api/v2/debug/profile?profiles=nonexistent", nil) + require.NoError(t, err) + defer res.Body.Close() + require.Equal(t, http.StatusBadRequest, res.StatusCode) + }) + + t.Run("Unauthorized", func(t *testing.T) { + t.Parallel() + + ctx := 
testutil.Context(t, testutil.WaitShort) + + client := coderdtest.New(t, nil) + firstUser := coderdtest.CreateFirstUser(t, client) + + // Create a non-admin user. + memberClient, _ := coderdtest.CreateAnotherUser(t, client, firstUser.OrganizationID) + + res, err := memberClient.Request(ctx, "POST", "/api/v2/debug/profile", nil) + require.NoError(t, err) + defer res.Body.Close() + require.Equal(t, http.StatusForbidden, res.StatusCode) + }) + + t.Run("Conflict", func(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitLong) + + blocker := &blockingProfileCollector{ + started: make(chan struct{}), + block: make(chan struct{}), + } + + client, closer, api := coderdtest.NewWithAPI(t, nil) + defer closer.Close() + api.ProfileCollector = blocker + _ = coderdtest.CreateFirstUser(t, client) + + // Start a profile collection that will block inside + // StartCPUProfile until we explicitly unblock it. + done := make(chan struct{}) + go func() { + defer close(done) + body, err := client.DebugCollectProfile(ctx, codersdk.DebugProfileOptions{ + Duration: 1 * time.Second, + }) + if err == nil { + body.Close() + } + }() + + // Wait deterministically for the first request to enter the + // collector — no time.Sleep needed. + testutil.TryReceive(ctx, t, blocker.started) + + // The second request should get 409 Conflict. + res, err := client.Request(ctx, "POST", "/api/v2/debug/profile?duration=1s", nil) + require.NoError(t, err) + defer res.Body.Close() + require.Equal(t, http.StatusConflict, res.StatusCode) + + // Unblock the first request and wait for it to finish. + close(blocker.block) + testutil.TryReceive(ctx, t, done) + }) +} + +// extractTarGzFiles extracts file names from a tar.gz archive. 
+func extractTarGzFiles(t *testing.T, data []byte) map[string]bool { + t.Helper() + + gr, err := gzip.NewReader(bytes.NewReader(data)) + require.NoError(t, err) + defer gr.Close() + + tr := tar.NewReader(gr) + files := make(map[string]bool) + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + files[hdr.Name] = true + } + return files +} diff --git a/codersdk/debug.go b/codersdk/debug.go new file mode 100644 index 0000000000..fbdaf44bc6 --- /dev/null +++ b/codersdk/debug.go @@ -0,0 +1,56 @@ +package codersdk + +import ( + "context" + "io" + "net/http" + "net/url" + "strings" + "time" + + "golang.org/x/xerrors" +) + +// DebugProfileDurationMax is the maximum duration the server will accept +// for a profile collection. Callers should ensure their context deadline +// exceeds this to avoid premature cancellation. +const DebugProfileDurationMax = 60 * time.Second + +// DebugProfileOptions are options for collecting debug profiles from the +// server via the consolidated /debug/profile endpoint. +type DebugProfileOptions struct { + // Duration controls how long time-based profiles (cpu, trace) run. + // Zero uses the server default (10s). + Duration time.Duration + // Profiles is the list of profile types to collect. Nil or empty uses + // the server default (cpu, heap, allocs, block, mutex, goroutine). + Profiles []string +} + +// DebugCollectProfile fetches a tar.gz archive of pprof profiles from the +// server. The caller is responsible for closing the returned ReadCloser. +func (c *Client) DebugCollectProfile(ctx context.Context, opts DebugProfileOptions) (io.ReadCloser, error) { + qp := url.Values{} + if opts.Duration > 0 { + qp.Set("duration", opts.Duration.String()) + } + if len(opts.Profiles) > 0 { + qp.Set("profiles", strings.Join(opts.Profiles, ",")) + } + + reqPath := "/api/v2/debug/profile" + if len(qp) > 0 { + reqPath += "?" 
+ qp.Encode() + } + + resp, err := c.Request(ctx, http.MethodPost, reqPath, nil) + if err != nil { + return nil, xerrors.Errorf("request debug profile: %w", err) + } + if resp.StatusCode != http.StatusOK { + defer resp.Body.Close() + return nil, ReadBodyAsError(resp) + } + + return resp.Body, nil +} diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 681d77e8e0..fd9181b607 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -2369,6 +2369,24 @@ export interface DatabaseReport extends BaseReport { readonly threshold_ms: number; } +// From codersdk/debug.go +/** + * DebugProfileOptions are options for collecting debug profiles from the + * server via the consolidated /debug/profile endpoint. + */ +export interface DebugProfileOptions { + /** + * Duration controls how long time-based profiles (cpu, trace) run. + * Zero uses the server default (10s). + */ + readonly Duration: number; + /** + * Profiles is the list of profile types to collect. Nil or empty uses + * the server default (cpu, heap, allocs, block, mutex, goroutine). + */ + readonly Profiles: readonly string[]; +} + // From codersdk/externalauth.go export interface DeleteExternalAuthByIDResponse { /** diff --git a/support/support.go b/support/support.go index cb7b2e1c36..de490741b8 100644 --- a/support/support.go +++ b/support/support.go @@ -1,15 +1,18 @@ package support import ( + "archive/tar" "bytes" "compress/gzip" "context" "encoding/base64" "encoding/json" + "errors" "io" "net" "net/http" "net/http/httptest" + "path" "strings" "time" @@ -772,6 +775,93 @@ func compressData(data []byte) []byte { return buf.Bytes() } +// PprofInfoFromArchive uses the consolidated /api/v2/debug/profile endpoint +// to collect pprof data in a single request. The server temporarily enables +// block/mutex profiling, runs time-based profiles for the given duration, +// takes snapshots, and returns a tar.gz archive. 
+func PprofInfoFromArchive(ctx context.Context, client *codersdk.Client, log slog.Logger, duration time.Duration) (*PprofCollection, error) { + if client == nil { + return nil, xerrors.New("client is nil") + } + + body, err := client.DebugCollectProfile(ctx, codersdk.DebugProfileOptions{ + Duration: duration, + // Use the server defaults plus trace. + Profiles: []string{"cpu", "heap", "allocs", "block", "mutex", "goroutine", "threadcreate", "trace"}, + }) + if err != nil { + return nil, xerrors.Errorf("fetch consolidated profile: %w", err) + } + defer body.Close() + + data, err := io.ReadAll(body) + if err != nil { + return nil, xerrors.Errorf("read profile archive: %w", err) + } + + var p PprofCollection + if client.URL != nil { + if u, err := client.URL.Parse("/api/v2/debug/profile"); err == nil { + p.EndpointURL = u.String() + } + } + if p.EndpointURL == "" { + p.EndpointURL = "/api/v2/debug/profile" + } + p.CollectedAt = time.Now() + + // Parse the tar.gz archive and populate the PprofCollection. + gr, err := gzip.NewReader(bytes.NewReader(data)) + if err != nil { + return nil, xerrors.Errorf("open gzip reader: %w", err) + } + defer gr.Close() + + tr := tar.NewReader(gr) + for { + hdr, err := tr.Next() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return nil, xerrors.Errorf("read tar entry %q: %w", hdr.Name, err) + } + + content, err := io.ReadAll(tr) + if err != nil { + log.Warn(ctx, "failed to read tar entry", slog.F("name", hdr.Name), slog.Error(err)) + continue + } + + // Files in the archive are named like "cpu.prof", "heap.prof", + // "trace.out", etc. Compress binary profile data for storage in + // the bundle, matching what PprofInfo() does. 
+ base := path.Base(hdr.Name) + switch base { + case "cpu.prof": + p.Profile = compressData(content) + case "heap.prof": + p.Heap = compressData(content) + case "allocs.prof": + p.Allocs = compressData(content) + case "block.prof": + p.Block = compressData(content) + case "mutex.prof": + p.Mutex = compressData(content) + case "goroutine.prof": + p.Goroutine = compressData(content) + case "threadcreate.prof": + p.Threadcreate = compressData(content) + case "trace.out": + p.Trace = compressData(content) + default: + log.Debug(ctx, "unknown profile in archive", slog.F("name", hdr.Name)) + } + } + + return &p, nil +} + func PprofInfoFromAgent(ctx context.Context, conn workspacesdk.AgentConn, log slog.Logger) *PprofCollection { if conn == nil { return nil @@ -1049,7 +1139,16 @@ func collectPprof(ctx context.Context, d *Deps, b *Bundle) Pprof { return pprof } - serverPprof := PprofInfo(ctx, d.Client, d.Log) + // Try the consolidated /debug/profile endpoint first. It + // temporarily enables block/mutex profiling on the server and + // returns a single tar.gz archive. + serverPprof, err := PprofInfoFromArchive(ctx, d.Client, d.Log, 30*time.Second) + if err != nil { + d.Log.Warn(ctx, "consolidated profile endpoint unavailable, falling back to individual endpoints", + slog.Error(err)) + // Fall back to the legacy per-profile endpoint approach. + serverPprof = PprofInfo(ctx, d.Client, d.Log) + } if serverPprof != nil { pprof.Server = serverPprof }