mirror of
https://github.com/coder/coder.git
synced 2026-06-04 21:48:22 +00:00
3a62a8e70e
Relates to https://github.com/coder/internal/issues/272 This flake has been persisting for a while, and unfortunately there's no detail on which healthcheck in particular is holding things up. This PR adds a concurrency-safe `healthcheck.Progress` and wires it through `healthcheck.Run`. If the healthcheck times out, it will provide information on which healthchecks are completed / running, and how long they took / are still taking. 🤖 Claude Opus 4.5 completed the first round of this implementation, which I then refactored.
326 lines
9.4 KiB
Go
326 lines
9.4 KiB
Go
package healthcheck
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"slices"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/coder/coder/v2/buildinfo"
|
|
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
|
|
"github.com/coder/coder/v2/coderd/healthcheck/health"
|
|
"github.com/coder/coder/v2/coderd/util/ptr"
|
|
"github.com/coder/coder/v2/codersdk/healthsdk"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
// Progress tracks the progress of healthcheck components for timeout
// diagnostics. It records which checks have started and completed, along with
// their durations, to provide useful information when a healthcheck times out.
// The zero value is usable.
//
// Every method takes mu before touching checks, so a single Progress can be
// shared by the goroutines that Run starts.
type Progress struct {
	// Clock supplies the timestamps recorded for each check. When it is nil,
	// Start and Complete replace it with quartz.NewReal() on first use.
	Clock quartz.Clock
	// mu guards checks and the lazy initialization of Clock.
	mu sync.Mutex
	// checks maps a check name to its recorded start/completion times. It is
	// allocated lazily by Start and Complete and replaced by Reset.
	checks map[string]*checkStatus
}
|
|
|
|
// checkStatus holds the timestamps recorded for a single named check.
type checkStatus struct {
	// startedAt is when the check began. completedAt is when it finished and
	// remains the zero time while the check is still running.
	startedAt, completedAt time.Time
}
|
|
|
|
// Start records that a check has started.
|
|
func (p *Progress) Start(name string) {
|
|
p.mu.Lock()
|
|
defer p.mu.Unlock()
|
|
if p.Clock == nil {
|
|
p.Clock = quartz.NewReal()
|
|
}
|
|
if p.checks == nil {
|
|
p.checks = make(map[string]*checkStatus)
|
|
}
|
|
p.checks[name] = &checkStatus{startedAt: p.Clock.Now()}
|
|
}
|
|
|
|
// Complete records that a check has finished.
|
|
func (p *Progress) Complete(name string) {
|
|
p.mu.Lock()
|
|
defer p.mu.Unlock()
|
|
if p.Clock == nil {
|
|
p.Clock = quartz.NewReal()
|
|
}
|
|
if p.checks == nil {
|
|
p.checks = make(map[string]*checkStatus)
|
|
}
|
|
if p.checks[name] == nil {
|
|
p.checks[name] = &checkStatus{startedAt: p.Clock.Now()}
|
|
}
|
|
p.checks[name].completedAt = p.Clock.Now()
|
|
}
|
|
|
|
// Reset clears all recorded check statuses.
|
|
func (p *Progress) Reset() {
|
|
p.mu.Lock()
|
|
defer p.mu.Unlock()
|
|
p.checks = make(map[string]*checkStatus)
|
|
}
|
|
|
|
// Summary returns a human-readable summary of check progress.
|
|
// Example: "Completed: AccessURL (95ms), Database (120ms). Still running: DERP, Websocket"
|
|
func (p *Progress) Summary() string {
|
|
p.mu.Lock()
|
|
defer p.mu.Unlock()
|
|
|
|
var completed, running []string
|
|
for name, status := range p.checks {
|
|
if status.completedAt.IsZero() {
|
|
elapsed := p.Clock.Now().Sub(status.startedAt).Round(time.Millisecond)
|
|
running = append(running, fmt.Sprintf("%s (elapsed: %dms)", name, elapsed.Milliseconds()))
|
|
continue
|
|
}
|
|
duration := status.completedAt.Sub(status.startedAt).Round(time.Millisecond)
|
|
completed = append(completed, fmt.Sprintf("%s (%dms)", name, duration.Milliseconds()))
|
|
}
|
|
|
|
// Sort for consistent output.
|
|
slices.Sort(completed)
|
|
slices.Sort(running)
|
|
|
|
var parts []string
|
|
if len(completed) > 0 {
|
|
parts = append(parts, "Completed: "+strings.Join(completed, ", "))
|
|
}
|
|
if len(running) > 0 {
|
|
parts = append(parts, "Still running: "+strings.Join(running, ", "))
|
|
}
|
|
return strings.Join(parts, ". ")
|
|
}
|
|
|
|
// Checker produces each section of the health report. Run calls every method
// once, each in its own goroutine, and falls back to defaultChecker when
// ReportOptions.Checker is nil.
type Checker interface {
	// DERP produces the DERP section of the report.
	DERP(ctx context.Context, opts *derphealth.ReportOptions) healthsdk.DERPHealthReport
	// AccessURL produces the access URL section of the report.
	AccessURL(ctx context.Context, opts *AccessURLReportOptions) healthsdk.AccessURLReport
	// Websocket produces the websocket section of the report.
	Websocket(ctx context.Context, opts *WebsocketReportOptions) healthsdk.WebsocketReport
	// Database produces the database section of the report.
	Database(ctx context.Context, opts *DatabaseReportOptions) healthsdk.DatabaseReport
	// WorkspaceProxy produces the workspace proxy section of the report.
	WorkspaceProxy(ctx context.Context, opts *WorkspaceProxyReportOptions) healthsdk.WorkspaceProxyReport
	// ProvisionerDaemons produces the provisioner daemons section of the report.
	ProvisionerDaemons(ctx context.Context, opts *ProvisionerDaemonsReportDeps) healthsdk.ProvisionerDaemonsReport
}
|
|
|
|
// ReportOptions bundles the per-section options consumed by Run. Run passes a
// pointer to each section's options to the matching Checker method.
type ReportOptions struct {
	// AccessURL is handed to Checker.AccessURL.
	AccessURL AccessURLReportOptions
	// Database is handed to Checker.Database.
	Database DatabaseReportOptions
	// DerpHealth is handed to Checker.DERP.
	DerpHealth derphealth.ReportOptions
	// Websocket is handed to Checker.Websocket.
	Websocket WebsocketReportOptions
	// WorkspaceProxy is handed to Checker.WorkspaceProxy.
	WorkspaceProxy WorkspaceProxyReportOptions
	// ProvisionerDaemons is handed to Checker.ProvisionerDaemons.
	ProvisionerDaemons ProvisionerDaemonsReportDeps

	// Checker performs the individual checks. When nil, Run assigns
	// defaultChecker{} to this field before starting.
	Checker Checker

	// Progress tracks healthcheck progress for timeout diagnostics.
	// If set, each check will record its start and completion time.
	Progress *Progress
}
|
|
|
|
// defaultChecker is the Checker that Run uses when ReportOptions.Checker is
// nil. Each method runs the corresponding report type and converts the result
// to its healthsdk form.
type defaultChecker struct{}
|
|
|
|
func (defaultChecker) DERP(ctx context.Context, opts *derphealth.ReportOptions) healthsdk.DERPHealthReport {
|
|
var report derphealth.Report
|
|
report.Run(ctx, opts)
|
|
return healthsdk.DERPHealthReport(report)
|
|
}
|
|
|
|
func (defaultChecker) AccessURL(ctx context.Context, opts *AccessURLReportOptions) healthsdk.AccessURLReport {
|
|
var report AccessURLReport
|
|
report.Run(ctx, opts)
|
|
return healthsdk.AccessURLReport(report)
|
|
}
|
|
|
|
func (defaultChecker) Websocket(ctx context.Context, opts *WebsocketReportOptions) healthsdk.WebsocketReport {
|
|
var report WebsocketReport
|
|
report.Run(ctx, opts)
|
|
return healthsdk.WebsocketReport(report)
|
|
}
|
|
|
|
func (defaultChecker) Database(ctx context.Context, opts *DatabaseReportOptions) healthsdk.DatabaseReport {
|
|
var report DatabaseReport
|
|
report.Run(ctx, opts)
|
|
return healthsdk.DatabaseReport(report)
|
|
}
|
|
|
|
func (defaultChecker) WorkspaceProxy(ctx context.Context, opts *WorkspaceProxyReportOptions) healthsdk.WorkspaceProxyReport {
|
|
var report WorkspaceProxyReport
|
|
report.Run(ctx, opts)
|
|
return healthsdk.WorkspaceProxyReport(report)
|
|
}
|
|
|
|
func (defaultChecker) ProvisionerDaemons(ctx context.Context, opts *ProvisionerDaemonsReportDeps) healthsdk.ProvisionerDaemonsReport {
|
|
var report ProvisionerDaemonsReport
|
|
report.Run(ctx, opts)
|
|
return healthsdk.ProvisionerDaemonsReport(report)
|
|
}
|
|
|
|
func Run(ctx context.Context, opts *ReportOptions) *healthsdk.HealthcheckReport {
|
|
var (
|
|
wg sync.WaitGroup
|
|
report healthsdk.HealthcheckReport
|
|
)
|
|
|
|
if opts.Checker == nil {
|
|
opts.Checker = defaultChecker{}
|
|
}
|
|
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
report.DERP.Error = health.Errorf(health.CodeUnknown, "derp report panic: %s", err)
|
|
}
|
|
}()
|
|
|
|
if opts.Progress != nil {
|
|
opts.Progress.Start("DERP")
|
|
defer opts.Progress.Complete("DERP")
|
|
}
|
|
report.DERP = opts.Checker.DERP(ctx, &opts.DerpHealth)
|
|
}()
|
|
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
report.AccessURL.Error = health.Errorf(health.CodeUnknown, "access url report panic: %s", err)
|
|
}
|
|
}()
|
|
|
|
if opts.Progress != nil {
|
|
opts.Progress.Start("AccessURL")
|
|
defer opts.Progress.Complete("AccessURL")
|
|
}
|
|
report.AccessURL = opts.Checker.AccessURL(ctx, &opts.AccessURL)
|
|
}()
|
|
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
report.Websocket.Error = health.Errorf(health.CodeUnknown, "websocket report panic: %s", err)
|
|
}
|
|
}()
|
|
|
|
if opts.Progress != nil {
|
|
opts.Progress.Start("Websocket")
|
|
defer opts.Progress.Complete("Websocket")
|
|
}
|
|
report.Websocket = opts.Checker.Websocket(ctx, &opts.Websocket)
|
|
}()
|
|
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
report.Database.Error = health.Errorf(health.CodeUnknown, "database report panic: %s", err)
|
|
}
|
|
}()
|
|
|
|
if opts.Progress != nil {
|
|
opts.Progress.Start("Database")
|
|
defer opts.Progress.Complete("Database")
|
|
}
|
|
report.Database = opts.Checker.Database(ctx, &opts.Database)
|
|
}()
|
|
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
report.WorkspaceProxy.Error = health.Errorf(health.CodeUnknown, "proxy report panic: %s", err)
|
|
}
|
|
}()
|
|
|
|
if opts.Progress != nil {
|
|
opts.Progress.Start("WorkspaceProxy")
|
|
defer opts.Progress.Complete("WorkspaceProxy")
|
|
}
|
|
report.WorkspaceProxy = opts.Checker.WorkspaceProxy(ctx, &opts.WorkspaceProxy)
|
|
}()
|
|
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
report.ProvisionerDaemons.Error = health.Errorf(health.CodeUnknown, "provisioner daemon report panic: %s", err)
|
|
}
|
|
}()
|
|
|
|
if opts.Progress != nil {
|
|
opts.Progress.Start("ProvisionerDaemons")
|
|
defer opts.Progress.Complete("ProvisionerDaemons")
|
|
}
|
|
report.ProvisionerDaemons = opts.Checker.ProvisionerDaemons(ctx, &opts.ProvisionerDaemons)
|
|
}()
|
|
|
|
report.CoderVersion = buildinfo.Version()
|
|
wg.Wait()
|
|
|
|
report.Time = time.Now()
|
|
failingSections := []healthsdk.HealthSection{}
|
|
if report.DERP.Severity.Value() > health.SeverityWarning.Value() {
|
|
failingSections = append(failingSections, healthsdk.HealthSectionDERP)
|
|
}
|
|
if report.AccessURL.Severity.Value() > health.SeverityOK.Value() {
|
|
failingSections = append(failingSections, healthsdk.HealthSectionAccessURL)
|
|
}
|
|
if report.Websocket.Severity.Value() > health.SeverityWarning.Value() {
|
|
failingSections = append(failingSections, healthsdk.HealthSectionWebsocket)
|
|
}
|
|
if report.Database.Severity.Value() > health.SeverityWarning.Value() {
|
|
failingSections = append(failingSections, healthsdk.HealthSectionDatabase)
|
|
}
|
|
if report.WorkspaceProxy.Severity.Value() > health.SeverityWarning.Value() {
|
|
failingSections = append(failingSections, healthsdk.HealthSectionWorkspaceProxy)
|
|
}
|
|
if report.ProvisionerDaemons.Severity.Value() > health.SeverityWarning.Value() {
|
|
failingSections = append(failingSections, healthsdk.HealthSectionProvisionerDaemons)
|
|
}
|
|
|
|
report.Healthy = len(failingSections) == 0
|
|
|
|
// Review healthcheck sub-reports.
|
|
report.Severity = health.SeverityOK
|
|
|
|
if report.DERP.Severity.Value() > report.Severity.Value() {
|
|
report.Severity = report.DERP.Severity
|
|
}
|
|
if report.AccessURL.Severity.Value() > report.Severity.Value() {
|
|
report.Severity = report.AccessURL.Severity
|
|
}
|
|
if report.Websocket.Severity.Value() > report.Severity.Value() {
|
|
report.Severity = report.Websocket.Severity
|
|
}
|
|
if report.Database.Severity.Value() > report.Severity.Value() {
|
|
report.Severity = report.Database.Severity
|
|
}
|
|
if report.WorkspaceProxy.Severity.Value() > report.Severity.Value() {
|
|
report.Severity = report.WorkspaceProxy.Severity
|
|
}
|
|
if report.ProvisionerDaemons.Severity.Value() > report.Severity.Value() {
|
|
report.Severity = report.ProvisionerDaemons.Severity
|
|
}
|
|
return &report
|
|
}
|
|
|
|
func convertError(err error) *string {
|
|
if err != nil {
|
|
return ptr.Ref(err.Error())
|
|
}
|
|
|
|
return nil
|
|
}
|