feat(coderd/healthcheck): allow configuring database hc threshold (#10623)

* feat(coderd/healthcheck): allow configuring database hc threshold
* feat(coderd): add database hc latency, plumb through
* feat(coderd): allow configuring healthcheck refresh interval
This commit is contained in:
Cian Johnston
2023-11-13 14:14:43 +00:00
committed by GitHub
parent e4211ccb40
commit b69c237b8a
17 changed files with 288 additions and 55 deletions
+9
View File
@@ -80,6 +80,15 @@ Use a YAML configuration file when your server launch becomes unwieldy.
Write out the current server config as YAML to stdout.
INTROSPECTION / HEALTH CHECK OPTIONS:
--health-check-refresh duration, $CODER_HEALTH_CHECK_REFRESH (default: 10m0s)
Refresh interval for healthchecks.
--health-check-threshold-database duration, $CODER_HEALTH_CHECK_THRESHOLD_DATABASE (default: 15ms)
The threshold for the database health check. If the median latency of
the database exceeds this threshold over 5 attempts, the database is
considered unhealthy. The default value is 15ms.
INTROSPECTION / LOGGING OPTIONS:
--enable-terraform-debug-mode bool, $CODER_ENABLE_TERRAFORM_DEBUG_MODE (default: false)
Allow administrators to enable Terraform debug output.
+9
View File
@@ -232,6 +232,15 @@ introspection:
# Allow administrators to enable Terraform debug output.
# (default: false, type: bool)
enableTerraformDebugMode: false
healthcheck:
# Refresh interval for healthchecks.
# (default: 10m0s, type: duration)
refresh: 10m0s
# The threshold for the database health check. If the median latency of the
# database exceeds this threshold over 5 attempts, the database is considered
# unhealthy. The default value is 15ms.
# (default: 15ms, type: duration)
thresholdDatabase: 15ms
oauth2:
github:
# Client ID for Login with GitHub.
+17
View File
@@ -8380,6 +8380,9 @@ const docTemplate = `{
"type": "string"
}
},
"healthcheck": {
"$ref": "#/definitions/codersdk.HealthcheckConfig"
},
"http_address": {
"description": "HTTPAddress is a string because it may be set to zero to disable.",
"type": "string"
@@ -8859,6 +8862,17 @@ const docTemplate = `{
}
}
},
"codersdk.HealthcheckConfig": {
"type": "object",
"properties": {
"refresh": {
"type": "integer"
},
"threshold_database": {
"type": "integer"
}
}
},
"codersdk.InsightsReportInterval": {
"type": "string",
"enum": [
@@ -12177,6 +12191,9 @@ const docTemplate = `{
},
"reachable": {
"type": "boolean"
},
"threshold_ms": {
"type": "integer"
}
}
},
+17
View File
@@ -7492,6 +7492,9 @@
"type": "string"
}
},
"healthcheck": {
"$ref": "#/definitions/codersdk.HealthcheckConfig"
},
"http_address": {
"description": "HTTPAddress is a string because it may be set to zero to disable.",
"type": "string"
@@ -7961,6 +7964,17 @@
}
}
},
"codersdk.HealthcheckConfig": {
"type": "object",
"properties": {
"refresh": {
"type": "integer"
},
"threshold_database": {
"type": "integer"
}
}
},
"codersdk.InsightsReportInterval": {
"type": "string",
"enum": ["day", "week"],
@@ -11102,6 +11116,9 @@
},
"reachable": {
"type": "boolean"
},
"threshold_ms": {
"type": "integer"
}
}
},
+16 -5
View File
@@ -38,6 +38,7 @@ import (
// Used for swagger docs.
_ "github.com/coder/coder/v2/coderd/apidoc"
"github.com/coder/coder/v2/coderd/externalauth"
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
"cdr.dev/slog"
"github.com/coder/coder/v2/buildinfo"
@@ -398,10 +399,20 @@ func New(options *Options) *API {
if options.HealthcheckFunc == nil {
options.HealthcheckFunc = func(ctx context.Context, apiKey string) *healthcheck.Report {
return healthcheck.Run(ctx, &healthcheck.ReportOptions{
DB: options.Database,
AccessURL: options.AccessURL,
DERPMap: api.DERPMap(),
APIKey: apiKey,
Database: healthcheck.DatabaseReportOptions{
DB: options.Database,
Threshold: options.DeploymentValues.Healthcheck.ThresholdDatabase.Value(),
},
Websocket: healthcheck.WebsocketReportOptions{
AccessURL: options.AccessURL,
APIKey: apiKey,
},
AccessURL: healthcheck.AccessURLReportOptions{
AccessURL: options.AccessURL,
},
DerpHealth: derphealth.ReportOptions{
DERPMap: api.DERPMap(),
},
})
}
}
@@ -409,7 +420,7 @@ func New(options *Options) *API {
options.HealthcheckTimeout = 30 * time.Second
}
if options.HealthcheckRefresh == 0 {
options.HealthcheckRefresh = 10 * time.Minute
options.HealthcheckRefresh = options.DeploymentValues.Healthcheck.Refresh.Value()
}
var oidcAuthURLParams map[string]string
+3 -3
View File
@@ -32,12 +32,12 @@ func (api *API) debugCoordinator(rw http.ResponseWriter, r *http.Request) {
// @Router /debug/health [get]
func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
apiKey := httpmw.APITokenFromRequest(r)
ctx, cancel := context.WithTimeout(r.Context(), api.HealthcheckTimeout)
ctx, cancel := context.WithTimeout(r.Context(), api.Options.HealthcheckTimeout)
defer cancel()
// Get cached report if it exists.
if report := api.healthCheckCache.Load(); report != nil {
if time.Since(report.Time) < api.HealthcheckRefresh {
if time.Since(report.Time) < api.Options.HealthcheckRefresh {
formatHealthcheck(ctx, rw, r, report)
return
}
@@ -45,7 +45,7 @@ func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
resChan := api.healthCheckGroup.DoChan("", func() (*healthcheck.Report, error) {
// Create a new context not tied to the request.
ctx, cancel := context.WithTimeout(context.Background(), api.HealthcheckTimeout)
ctx, cancel := context.WithTimeout(context.Background(), api.Options.HealthcheckTimeout)
defer cancel()
report := api.HealthcheckFunc(ctx, apiKey)
+45
View File
@@ -72,6 +72,51 @@ func TestDebugHealth(t *testing.T) {
require.Equal(t, http.StatusNotFound, res.StatusCode)
})
// Refresh issues two consecutive requests to /api/v2/debug/health with a
// HealthcheckRefresh of one microsecond and expects the healthcheck function
// to be invoked once per request (two invocations in total) before the test
// context expires.
t.Run("Refresh", func(t *testing.T) {
t.Parallel()
var (
// calls receives one value per HealthcheckFunc invocation. It is
// unbuffered, so each invocation blocks until the goroutine below
// receives from it.
calls = make(chan struct{})
// callsDone is closed once two invocations have been received.
callsDone = make(chan struct{})
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
client = coderdtest.New(t, &coderdtest.Options{
// NOTE(review): presumably small enough that any cached report is
// already older than the refresh interval by the time the second
// request arrives — confirm against the handler's cache check.
HealthcheckRefresh: time.Microsecond,
HealthcheckFunc: func(context.Context, string) *healthcheck.Report {
calls <- struct{}{}
return &healthcheck.Report{}
},
})
_ = coderdtest.CreateFirstUser(t, client)
)
defer cancel()
// Receive the first invocation, wait testutil.IntervalFast, then receive
// the second one; callsDone is closed when this goroutine returns.
go func() {
defer close(callsDone)
<-calls
<-time.After(testutil.IntervalFast)
<-calls
}()
// First request. The body is read and discarded; only the status code is
// asserted.
res, err := client.Request(ctx, "GET", "/api/v2/debug/health", nil)
require.NoError(t, err)
defer res.Body.Close()
_, _ = io.ReadAll(res.Body)
require.Equal(t, http.StatusOK, res.StatusCode)
// Second request, handled the same way as the first.
res, err = client.Request(ctx, "GET", "/api/v2/debug/health", nil)
require.NoError(t, err)
defer res.Body.Close()
_, _ = io.ReadAll(res.Body)
require.Equal(t, http.StatusOK, res.StatusCode)
// Fail if two invocations were not observed before ctx was done.
select {
case <-callsDone:
case <-ctx.Done():
t.Fatal("timed out waiting for calls to finish")
}
})
t.Run("Deduplicated", func(t *testing.T) {
t.Parallel()
+18 -10
View File
@@ -10,20 +10,30 @@ import (
"github.com/coder/coder/v2/coderd/database"
)
// DatabaseDefaultThreshold is the latency threshold the database health
// check falls back to when the supplied threshold amounts to zero
// milliseconds.
const DatabaseDefaultThreshold = 15 * time.Millisecond
// @typescript-generate DatabaseReport
type DatabaseReport struct {
Healthy bool `json:"healthy"`
Reachable bool `json:"reachable"`
Latency string `json:"latency"`
LatencyMs int `json:"latency_ms"`
Error *string `json:"error"`
Healthy bool `json:"healthy"`
Reachable bool `json:"reachable"`
Latency string `json:"latency"`
LatencyMS int64 `json:"latency_ms"`
ThresholdMS int64 `json:"threshold_ms"`
Error *string `json:"error"`
}
type DatabaseReportOptions struct {
DB database.Store
DB database.Store
Threshold time.Duration
}
func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
r.ThresholdMS = opts.Threshold.Milliseconds()
if r.ThresholdMS == 0 {
r.ThresholdMS = DatabaseDefaultThreshold.Milliseconds()
}
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
@@ -43,10 +53,8 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
// Take the median ping.
latency := pings[pingCount/2]
r.Latency = latency.String()
r.LatencyMs = int(latency.Milliseconds())
// Somewhat arbitrary, but if the latency is over 15ms, we consider it
// unhealthy.
if latency < 15*time.Millisecond {
r.LatencyMS = latency.Milliseconds()
if r.LatencyMS < r.ThresholdMS {
r.Healthy = true
}
r.Reachable = true
+31 -2
View File
@@ -36,7 +36,8 @@ func TestDatabase(t *testing.T) {
assert.True(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, ping.String(), report.Latency)
assert.Equal(t, int(ping.Milliseconds()), report.LatencyMs)
assert.Equal(t, ping.Milliseconds(), report.LatencyMS)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})
@@ -59,6 +60,7 @@ func TestDatabase(t *testing.T) {
assert.False(t, report.Reachable)
assert.Zero(t, report.Latency)
require.NotNil(t, report.Error)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Contains(t, *report.Error, err.Error())
})
@@ -83,7 +85,34 @@ func TestDatabase(t *testing.T) {
assert.True(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, time.Millisecond.String(), report.Latency)
assert.Equal(t, 1, report.LatencyMs)
assert.EqualValues(t, 1, report.LatencyMS)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})
// Threshold runs the database report with an explicit one-second threshold
// against a mocked store and checks the reported latency, threshold and
// health flags.
t.Run("Threshold", func(t *testing.T) {
t.Parallel()
var (
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
report = healthcheck.DatabaseReport{}
db = dbmock.NewMockStore(gomock.NewController(t))
)
defer cancel()
// Five pings are expected: three returning 1s and two returning 1ms.
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
report.Run(ctx, &healthcheck.DatabaseReportOptions{DB: db, Threshold: time.Second})
// The reported latency (1s) equals the configured threshold (1s); the
// report is expected to be reachable but not healthy in that case.
assert.False(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, time.Second.String(), report.Latency)
assert.EqualValues(t, 1000, report.LatencyMS)
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})
}
+8 -25
View File
@@ -3,15 +3,10 @@ package healthcheck
import (
"context"
"fmt"
"net/http"
"net/url"
"sync"
"time"
"tailscale.com/tailcfg"
"github.com/coder/coder/v2/buildinfo"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
"github.com/coder/coder/v2/coderd/util/ptr"
)
@@ -49,12 +44,10 @@ type Report struct {
}
type ReportOptions struct {
DB database.Store
// TODO: support getting this over HTTP?
DERPMap *tailcfg.DERPMap
AccessURL *url.URL
Client *http.Client
APIKey string
AccessURL AccessURLReportOptions
Database DatabaseReportOptions
DerpHealth derphealth.ReportOptions
Websocket WebsocketReportOptions
Checker Checker
}
@@ -100,9 +93,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
}
}()
report.DERP = opts.Checker.DERP(ctx, &derphealth.ReportOptions{
DERPMap: opts.DERPMap,
})
report.DERP = opts.Checker.DERP(ctx, &opts.DerpHealth)
}()
wg.Add(1)
@@ -114,10 +105,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
}
}()
report.AccessURL = opts.Checker.AccessURL(ctx, &AccessURLReportOptions{
AccessURL: opts.AccessURL,
Client: opts.Client,
})
report.AccessURL = opts.Checker.AccessURL(ctx, &opts.AccessURL)
}()
wg.Add(1)
@@ -129,10 +117,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
}
}()
report.Websocket = opts.Checker.Websocket(ctx, &WebsocketReportOptions{
APIKey: opts.APIKey,
AccessURL: opts.AccessURL,
})
report.Websocket = opts.Checker.Websocket(ctx, &opts.Websocket)
}()
wg.Add(1)
@@ -144,9 +129,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
}
}()
report.Database = opts.Checker.Database(ctx, &DatabaseReportOptions{
DB: opts.DB,
})
report.Database = opts.Checker.Database(ctx, &opts.Database)
}()
report.CoderVersion = buildinfo.Version()
+33
View File
@@ -183,6 +183,7 @@ type DeploymentValues struct {
EnableTerraformDebugMode clibase.Bool `json:"enable_terraform_debug_mode,omitempty" typescript:",notnull"`
UserQuietHoursSchedule UserQuietHoursScheduleConfig `json:"user_quiet_hours_schedule,omitempty" typescript:",notnull"`
WebTerminalRenderer clibase.String `json:"web_terminal_renderer,omitempty" typescript:",notnull"`
Healthcheck HealthcheckConfig `json:"healthcheck,omitempty" typescript:",notnull"`
Config clibase.YAMLConfigPath `json:"config,omitempty" typescript:",notnull"`
WriteConfig clibase.Bool `json:"write_config,omitempty" typescript:",notnull"`
@@ -395,6 +396,12 @@ type UserQuietHoursScheduleConfig struct {
// WindowDuration clibase.Duration `json:"window_duration" typescript:",notnull"`
}
// HealthcheckConfig contains configuration for healthchecks.
// It is serialized under the "healthcheck" key of the deployment values.
type HealthcheckConfig struct {
// Refresh is the refresh interval for healthchecks
// (--health-check-refresh, default 10m0s).
Refresh clibase.Duration `json:"refresh" typescript:",notnull"`
// ThresholdDatabase is the latency threshold for the database health
// check (--health-check-threshold-database, default 15ms).
ThresholdDatabase clibase.Duration `json:"threshold_database" typescript:",notnull"`
}
const (
annotationEnterpriseKey = "enterprise"
annotationSecretKey = "secret"
@@ -489,6 +496,11 @@ func (c *DeploymentValues) Options() clibase.OptionSet {
Name: "Logging",
YAML: "logging",
}
deploymentGroupIntrospectionHealthcheck = clibase.Group{
Parent: &deploymentGroupIntrospection,
Name: "Health Check",
YAML: "healthcheck",
}
deploymentGroupOAuth2 = clibase.Group{
Name: "OAuth2",
Description: `Configure login and user-provisioning with GitHub via oAuth2.`,
@@ -1799,6 +1811,27 @@ Write out the current server config as YAML to stdout.`,
Group: &deploymentGroupClient,
YAML: "webTerminalRenderer",
},
// Healthcheck Options
{
Name: "Health Check Refresh",
Description: "Refresh interval for healthchecks.",
Flag: "health-check-refresh",
Env: "CODER_HEALTH_CHECK_REFRESH",
Default: (10 * time.Minute).String(),
Value: &c.Healthcheck.Refresh,
Group: &deploymentGroupIntrospectionHealthcheck,
YAML: "refresh",
},
{
Name: "Health Check Threshold: Database",
Description: "The threshold for the database health check. If the median latency of the database exceeds this threshold over 5 attempts, the database is considered unhealthy. The default value is 15ms.",
Flag: "health-check-threshold-database",
Env: "CODER_HEALTH_CHECK_THRESHOLD_DATABASE",
Default: (15 * time.Millisecond).String(),
Value: &c.Healthcheck.ThresholdDatabase,
Group: &deploymentGroupIntrospectionHealthcheck,
YAML: "thresholdDatabase",
},
}
return opts
+2 -1
View File
@@ -53,7 +53,8 @@ curl -X GET http://coder-server:8080/api/v2/debug/health \
"healthy": true,
"latency": "string",
"latency_ms": 0,
"reachable": true
"reachable": true,
"threshold_ms": 0
},
"derp": {
"error": "string",
+4
View File
@@ -235,6 +235,10 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \
]
},
"external_token_encryption_keys": ["string"],
"healthcheck": {
"refresh": 0,
"threshold_database": 0
},
"http_address": "string",
"in_memory_database": true,
"job_hang_detector_interval": 0,
+37 -9
View File
@@ -2156,6 +2156,10 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
]
},
"external_token_encryption_keys": ["string"],
"healthcheck": {
"refresh": 0,
"threshold_database": 0
},
"http_address": "string",
"in_memory_database": true,
"job_hang_detector_interval": 0,
@@ -2527,6 +2531,10 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
]
},
"external_token_encryption_keys": ["string"],
"healthcheck": {
"refresh": 0,
"threshold_database": 0
},
"http_address": "string",
"in_memory_database": true,
"job_hang_detector_interval": 0,
@@ -2726,6 +2734,7 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
| `experiments` | array of string | false | | |
| `external_auth` | [clibase.Struct-array_codersdk_ExternalAuthConfig](#clibasestruct-array_codersdk_externalauthconfig) | false | | |
| `external_token_encryption_keys` | array of string | false | | |
| `healthcheck` | [codersdk.HealthcheckConfig](#codersdkhealthcheckconfig) | false | | |
| `http_address` | string | false | | Http address is a string because it may be set to zero to disable. |
| `in_memory_database` | boolean | false | | |
| `job_hang_detector_interval` | integer | false | | |
@@ -3176,6 +3185,22 @@ AuthorizationObject can represent a "set" of objects, such as: all workspaces in
| `threshold` | integer | false | | Threshold specifies the number of consecutive failed health checks before returning "unhealthy". |
| `url` | string | false | | URL specifies the endpoint to check for the app health. |
## codersdk.HealthcheckConfig
```json
{
"refresh": 0,
"threshold_database": 0
}
```
### Properties
| Name | Type | Required | Restrictions | Description |
| -------------------- | ------- | -------- | ------------ | ----------- |
| `refresh` | integer | false | | |
| `threshold_database` | integer | false | | |
## codersdk.InsightsReportInterval
```json
@@ -7444,19 +7469,21 @@ If the schedule is empty, the user will be updated to use the default schedule.|
"healthy": true,
"latency": "string",
"latency_ms": 0,
"reachable": true
"reachable": true,
"threshold_ms": 0
}
```
### Properties
| Name | Type | Required | Restrictions | Description |
| ------------ | ------- | -------- | ------------ | ----------- |
| `error` | string | false | | |
| `healthy` | boolean | false | | |
| `latency` | string | false | | |
| `latency_ms` | integer | false | | |
| `reachable` | boolean | false | | |
| Name | Type | Required | Restrictions | Description |
| -------------- | ------- | -------- | ------------ | ----------- |
| `error` | string | false | | |
| `healthy` | boolean | false | | |
| `latency` | string | false | | |
| `latency_ms` | integer | false | | |
| `reachable` | boolean | false | | |
| `threshold_ms` | integer | false | | |
## healthcheck.Report
@@ -7476,7 +7503,8 @@ If the schedule is empty, the user will be updated to use the default schedule.|
"healthy": true,
"latency": "string",
"latency_ms": 0,
"reachable": true
"reachable": true,
"threshold_ms": 0
},
"derp": {
"error": "string",
+22
View File
@@ -305,6 +305,28 @@ Time to force cancel provisioning tasks that are stuck.
HTTP bind address of the server. Unset to disable the HTTP endpoint.
### --health-check-refresh
| | |
| ----------- | ---------------------------------------------- |
| Type | <code>duration</code> |
| Environment | <code>$CODER_HEALTH_CHECK_REFRESH</code> |
| YAML | <code>introspection.healthcheck.refresh</code> |
| Default | <code>10m0s</code> |
Refresh interval for healthchecks.
### --health-check-threshold-database
| | |
| ----------- | -------------------------------------------------------- |
| Type | <code>duration</code> |
| Environment | <code>$CODER_HEALTH_CHECK_THRESHOLD_DATABASE</code> |
| YAML | <code>introspection.healthcheck.thresholdDatabase</code> |
| Default | <code>15ms</code> |
The threshold for the database health check. If the median latency of the database exceeds this threshold over 5 attempts, the database is considered unhealthy. The default value is 15ms.
### --log-human
| | |
+9
View File
@@ -81,6 +81,15 @@ Use a YAML configuration file when your server launch becomes unwieldy.
Write out the current server config as YAML to stdout.
INTROSPECTION / HEALTH CHECK OPTIONS:
--health-check-refresh duration, $CODER_HEALTH_CHECK_REFRESH (default: 10m0s)
Refresh interval for healthchecks.
--health-check-threshold-database duration, $CODER_HEALTH_CHECK_THRESHOLD_DATABASE (default: 15ms)
The threshold for the database health check. If the median latency of
the database exceeds this threshold over 5 attempts, the database is
considered unhealthy. The default value is 15ms.
INTROSPECTION / LOGGING OPTIONS:
--enable-terraform-debug-mode bool, $CODER_ENABLE_TERRAFORM_DEBUG_MODE (default: false)
Allow administrators to enable Terraform debug output.
+8
View File
@@ -423,6 +423,7 @@ export interface DeploymentValues {
readonly enable_terraform_debug_mode?: boolean;
readonly user_quiet_hours_schedule?: UserQuietHoursScheduleConfig;
readonly web_terminal_renderer?: string;
readonly healthcheck?: HealthcheckConfig;
readonly config?: string;
readonly write_config?: boolean;
readonly address?: string;
@@ -548,6 +549,12 @@ export interface Healthcheck {
readonly threshold: number;
}
// From codersdk/deployment.go
export interface HealthcheckConfig {
readonly refresh: number;
readonly threshold_database: number;
}
// From codersdk/workspaceagents.go
export interface IssueReconnectingPTYSignedTokenRequest {
readonly url: string;
@@ -2088,6 +2095,7 @@ export interface HealthcheckDatabaseReport {
readonly reachable: boolean;
readonly latency: string;
readonly latency_ms: number;
readonly threshold_ms: number;
readonly error?: string;
}