feat(scripts/develop): add --prometheus-server flag to run Prometheus UI (#24408)

Builds on #24389. Adds `--prometheus-server` flag that starts a
`prom/prometheus:v3.11.2` container alongside the dev environment.

> 🤖
This commit is contained in:
Cian Johnston
2026-04-20 15:08:13 +01:00
committed by GitHub
parent 81bd78d1c0
commit d99949df43
2 changed files with 391 additions and 49 deletions
+225 -19
View File
@@ -42,9 +42,18 @@ const (
defaultAPIPort = "3000"
defaultWebPort = "8080"
defaultProxyPort = "3010"
// prometheusServerPort is an int64 (not a string like the
// user-facing defaults) because it has no corresponding CLI
// flag; the Prometheus UI port is fixed at 9090.
prometheusServerPort int64 = 9090
// prometheusContainerName is the Docker container name for
// the embedded Prometheus server, used for reuse detection
// and explicit cleanup on shutdown.
prometheusContainerName = "coder-prometheus"
// defaultPrometheusPort avoids 2112 (agent prometheus) and
// 2113 (agent debug) already bound inside Coder workspaces.
defaultPrometheusPort = "2114"
prometheusImage = "prom/prometheus:v3.11.2"
defaultAccessURL = "http://127.0.0.1:%d"
defaultPassword = "SomeSecurePassword!"
defaultStarterTemplate = "docker"
@@ -85,7 +94,13 @@ func main() {
Env: "CODER_DEV_PROMETHEUS_PORT",
Default: defaultPrometheusPort,
Description: "Prometheus metrics port. Set to 0 to disable.",
Value: serpent.Int64Of(&cfg.prometheusPort),
Value: serpent.Int64Of(&cfg.coderMetricsPort),
},
{
Flag: "prometheus-server",
Env: "CODER_DEV_PROMETHEUS_SERVER",
Description: "Run a Prometheus server to scrape and visualize metrics. Requires Docker. Linux only.",
Value: serpent.BoolOf(&cfg.prometheusServer),
},
{
Flag: "agpl",
@@ -173,7 +188,8 @@ type devConfig struct {
apiPort int64
webPort int64
proxyPort int64
prometheusPort int64
coderMetricsPort int64
prometheusServer bool
agpl bool
accessURL string
password string
@@ -217,7 +233,7 @@ func (c *devConfig) validate() error {
return xerrors.Errorf("%s must be between 1 and 65535", p.name)
}
}
if c.prometheusPort < 0 || c.prometheusPort > 65535 {
if c.coderMetricsPort < 0 || c.coderMetricsPort > 65535 {
return xerrors.Errorf("--prometheus-port must be 0 (disabled) or between 1 and 65535")
}
if c.apiPort == c.webPort {
@@ -229,15 +245,39 @@ func (c *devConfig) validate() error {
if c.useProxy && c.webPort == c.proxyPort {
return xerrors.Errorf("--web-port %d conflicts with --proxy-port", c.webPort)
}
if c.prometheusPort != 0 {
if c.prometheusPort == c.apiPort {
return xerrors.Errorf("--prometheus-port %d conflicts with API server", c.prometheusPort)
if c.coderMetricsPort != 0 {
if c.coderMetricsPort == c.apiPort {
return xerrors.Errorf("--prometheus-port %d conflicts with API server", c.coderMetricsPort)
}
if c.prometheusPort == c.webPort {
return xerrors.Errorf("--prometheus-port %d conflicts with frontend dev server", c.prometheusPort)
if c.coderMetricsPort == c.webPort {
return xerrors.Errorf("--prometheus-port %d conflicts with frontend dev server", c.coderMetricsPort)
}
if c.useProxy && c.coderMetricsPort == c.proxyPort {
return xerrors.Errorf("--prometheus-port %d conflicts with workspace proxy", c.coderMetricsPort)
}
}
if c.prometheusServer && c.coderMetricsPort == 0 {
return xerrors.New("--prometheus-server requires prometheus to be enabled (--prometheus-port != 0)")
}
if c.prometheusServer {
conflicts := []struct {
flag string
val int64
}{
{"--port", c.apiPort},
{"--web-port", c.webPort},
{"--prometheus-port", c.coderMetricsPort},
}
if c.useProxy {
conflicts = append(conflicts, struct {
flag string
val int64
}{"--proxy-port", c.proxyPort})
}
for _, conflict := range conflicts {
if prometheusServerPort == conflict.val {
return xerrors.Errorf("%s %d conflicts with prometheus server", conflict.flag, conflict.val)
}
if c.useProxy && c.prometheusPort == c.proxyPort {
return xerrors.Errorf("--prometheus-port %d conflicts with workspace proxy", c.prometheusPort)
}
}
return nil
@@ -462,7 +502,17 @@ func develop(ctx context.Context, logger slog.Logger, cfg *devConfig) error {
}
}
printBanner(ctx, logger, cfg)
var prometheusServerStarted bool
if cfg.prometheusServer {
started, err := startPrometheusServer(ctx, logger, cfg)
if err != nil {
logger.Warn(ctx, "prometheus server setup failed, continuing",
slog.Error(err))
}
prometheusServerStarted = started
}
printBanner(ctx, logger, cfg, prometheusServerStarted)
// Block until a signal fires or a child process exits.
<-ctx.Done()
@@ -506,8 +556,8 @@ func preflight(ctx context.Context, logger slog.Logger, cfg *devConfig) error {
if cfg.useProxy && isPortBusy(ctx, cfg.proxyPort) {
return xerrors.Errorf("port %d is already in use (proxy)", cfg.proxyPort)
}
if cfg.prometheusPort != 0 && isPortBusy(ctx, cfg.prometheusPort) {
return xerrors.Errorf("port %d is already in use (prometheus)", cfg.prometheusPort)
if cfg.coderMetricsPort != 0 && isPortBusy(ctx, cfg.coderMetricsPort) {
return xerrors.Errorf("port %d is already in use (prometheus)", cfg.coderMetricsPort)
}
return nil
}
@@ -541,10 +591,10 @@ func startServer(cfg *devConfig, group *procGroup) error {
"--dangerous-allow-cors-requests=true",
"--enable-terraform-debug-mode",
}
if cfg.prometheusPort != 0 {
if cfg.coderMetricsPort != 0 {
serverArgs = append(serverArgs,
"--prometheus-enable",
"--prometheus-address", fmt.Sprintf("0.0.0.0:%d", cfg.prometheusPort),
"--prometheus-address", fmt.Sprintf("0.0.0.0:%d", cfg.coderMetricsPort),
"--prometheus-collect-agent-stats",
"--prometheus-collect-db-metrics",
)
@@ -896,6 +946,147 @@ func createTemplateInOrg(ctx context.Context, logger slog.Logger, client *coders
return nil
}
// startPrometheusServer runs the official Prometheus Docker image
// with a generated config that scrapes the local Coder metrics
// endpoint. It uses --net=host so the container can reach the
// host-bound metrics port directly. Only supported on Linux;
// returns false without error on other platforms.
// Returns true if the server was started or is already running.
func startPrometheusServer(ctx context.Context, logger slog.Logger, cfg *devConfig) (bool, error) {
if runtime.GOOS != "linux" {
logger.Warn(ctx, "prometheus server is only supported on Linux, skipping",
slog.F("os", runtime.GOOS))
return false, nil
}
// Verify Docker is available before attempting anything.
if err := exec.CommandContext(ctx, "docker", "info").Run(); err != nil {
logger.Warn(ctx, "docker not available, skipping prometheus server",
slog.Error(err))
return false, nil
}
// If the port is already in use, check whether it's our
// container from a previous run. If so, reuse it.
if isPortBusy(ctx, prometheusServerPort) {
out, err := exec.CommandContext(ctx, "docker", "inspect",
"-f", "{{.State.Running}}",
prometheusContainerName).Output()
if err == nil && strings.TrimSpace(string(out)) == "true" {
logger.Info(ctx, "reusing existing prometheus server",
slog.F("ui", fmt.Sprintf("http://localhost:%d", prometheusServerPort)),
slog.F("note", fmt.Sprintf("scrape target may differ from current --prometheus-port %d; restart to apply", cfg.coderMetricsPort)))
return true, nil
}
logger.Info(ctx, "prometheus server port already in use, skipping",
slog.F("port", prometheusServerPort))
return false, nil
}
// Remove any stopped leftover container from a previous run.
// Failure is fine; it just means the container doesn't exist.
rmCmd := exec.CommandContext(ctx, "docker", "rm", "-f", prometheusContainerName) //nolint:gosec
rmCmd.Stdout = nil
rmCmd.Stderr = nil
_ = rmCmd.Run()
// Persist TSDB data across dev environment restarts. The
// container runs as nobody (UID 65534), so the directory must
// be world-writable. os.MkdirAll applies the umask, so we
// chmod explicitly after creation.
prometheusDataDir := filepath.Join(cfg.configDir, "prometheus")
if err := os.MkdirAll(prometheusDataDir, 0o777); err != nil {
return false, xerrors.Errorf("creating prometheus data directory: %w", err)
}
if err := os.Chmod(prometheusDataDir, 0o777); err != nil {
return false, xerrors.Errorf("chmod prometheus data directory: %w", err)
}
// Write a minimal scrape config to a temp file.
promCfg := fmt.Sprintf(`global:
scrape_interval: 15s
scrape_configs:
- job_name: coder
scheme: http
static_configs:
- targets: ["127.0.0.1:%d"]
`, cfg.coderMetricsPort)
tmpFile, err := os.CreateTemp("", "coder-prometheus-*.yml")
if err != nil {
return false, xerrors.Errorf("creating prometheus config: %w", err)
}
// Stop the container and remove the temp file when the context is
// done. The stop must happen before the file removal so Prometheus
// is not holding the bind mount open when we delete the source.
// Registering this cleanup immediately after CreateTemp means every
// later failure path can simply return without its own cleanup call.
context.AfterFunc(ctx, func() {
stopCmd := exec.Command("docker", "stop", "-t", "5", prometheusContainerName) //nolint:gosec
stopCmd.Stdout = nil
stopCmd.Stderr = nil
_ = stopCmd.Run()
_ = os.Remove(tmpFile.Name())
})
if _, err := tmpFile.WriteString(promCfg); err != nil {
_ = tmpFile.Close()
return false, xerrors.Errorf("writing prometheus config: %w", err)
}
_ = tmpFile.Close()
// The Prometheus container runs as nobody, so the file must be
// world-readable. os.CreateTemp creates files with mode 0600.
if err := os.Chmod(tmpFile.Name(), 0o644); err != nil {
return false, xerrors.Errorf("chmod prometheus config: %w", err)
}
cmd := exec.CommandContext(ctx, "docker", "run", //nolint:gosec // args are all controlled constants or our own temp file path
"--rm",
"--name", prometheusContainerName,
"--net=host",
"-v", tmpFile.Name()+":/etc/prometheus/prometheus.yml:ro",
"-v", prometheusDataDir+":/prometheus",
prometheusImage,
"--config.file=/etc/prometheus/prometheus.yml",
fmt.Sprintf("--web.listen-address=0.0.0.0:%d", prometheusServerPort),
)
named := logger.Named("prometheus")
w := &logWriter{logger: named}
cmd.Stdout = w
cmd.Stderr = w
named.Info(ctx, "starting prometheus server",
slog.F("image", prometheusImage),
slog.F("scrape_target", fmt.Sprintf("127.0.0.1:%d", cfg.coderMetricsPort)),
slog.F("ui", fmt.Sprintf("http://localhost:%d", prometheusServerPort)),
)
if err := cmd.Start(); err != nil {
return false, xerrors.Errorf("starting prometheus container: %w", err)
}
// Wait for the container in a separate goroutine. Prometheus is
// optional, so if it dies we just log a warning rather than
// tearing down the entire dev environment.
go func() {
if err := cmd.Wait(); err != nil {
if ctx.Err() != nil {
// Normal shutdown: context was canceled.
named.Info(ctx, "prometheus server stopped")
return
}
named.Warn(ctx, "prometheus server exited", slog.Error(err))
} else {
named.Warn(ctx, "prometheus server exited unexpectedly")
}
}()
return true, nil
}
func pnpmCmd(ctx context.Context, cfg *devConfig) *exec.Cmd {
cmd := cfg.cmd(ctx, "pnpm", "--dir", "./site", "dev", "--host")
cmd.Env = append(cmd.Env,
@@ -905,7 +1096,22 @@ func pnpmCmd(ctx context.Context, cfg *devConfig) *exec.Cmd {
return cmd
}
func printBanner(ctx context.Context, logger slog.Logger, cfg *devConfig) {
// prometheusBannerEntry decides which (if any) prometheus-related URL
// the dev banner should advertise. When the embedded Prometheus server
// is running we prefer its UI; otherwise fall back to the raw metrics
// endpoint. Returns an empty label when metrics are disabled entirely.
func prometheusBannerEntry(cfg *devConfig, prometheusServerStarted bool) (label string, port int64) {
switch {
case prometheusServerStarted:
return "Prometheus UI:", prometheusServerPort
case cfg.coderMetricsPort != 0:
return "Metrics:", cfg.coderMetricsPort
default:
return "", 0
}
}
func printBanner(ctx context.Context, logger slog.Logger, cfg *devConfig, prometheusServerStarted bool) {
ifaces := []string{"localhost"}
if addrs, err := net.InterfaceAddrs(); err == nil {
for _, addr := range addrs {
@@ -960,13 +1166,13 @@ func printBanner(ctx context.Context, logger slog.Logger, cfg *devConfig) {
line(indent(fmt.Sprintf("http://%s:%d", h, cfg.proxyPort)))
}
}
if cfg.prometheusPort != 0 {
if label, port := prometheusBannerEntry(cfg, prometheusServerStarted); label != "" {
line(
"",
"Metrics:",
label,
)
for _, h := range ifaces {
line(indent(fmt.Sprintf("http://%s:%d", h, cfg.prometheusPort)))
line(indent(fmt.Sprintf("http://%s:%d", h, port)))
}
}
line(
+145 -9
View File
@@ -174,7 +174,7 @@ func TestDevConfigValidate(t *testing.T) {
apiPort: 3000,
webPort: 8080,
proxyPort: 3010,
prometheusPort: 2114,
coderMetricsPort: 2114,
password: defaultPassword,
}
}
@@ -288,7 +288,7 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortConflictWithAPI", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 3000
cfg.coderMetricsPort = 3000
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port 3000 conflicts with")
@@ -297,7 +297,7 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortConflictWithWeb", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 8080
cfg.coderMetricsPort = 8080
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port 8080 conflicts with")
@@ -306,7 +306,7 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortConflictWithProxy", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 3010
cfg.coderMetricsPort = 3010
cfg.useProxy = true
err := cfg.validate()
require.Error(t, err)
@@ -316,21 +316,21 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortZeroDisabled", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 0
cfg.coderMetricsPort = 0
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusPortValid", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 9090
cfg.coderMetricsPort = 9090
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusPortTooHigh", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 70000
cfg.coderMetricsPort = 70000
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port must be 0 (disabled) or between 1 and 65535")
@@ -339,7 +339,7 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortNegative", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = -1
cfg.coderMetricsPort = -1
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port must be 0 (disabled) or between 1 and 65535")
@@ -348,9 +348,83 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusProxyProxyConflictIgnoredWithoutProxy", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 3010
cfg.coderMetricsPort = 3010
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusServerRequiresMetrics", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.coderMetricsPort = 0
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-server requires prometheus to be enabled")
})
t.Run("PrometheusServerValid", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.coderMetricsPort = 2114
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusServerPortConflictWithAPI", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.apiPort = prometheusServerPort
cfg.coderMetricsPort = 2114
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--port")
assert.Contains(t, err.Error(), "conflicts with prometheus server")
})
t.Run("PrometheusServerPortConflictWithWeb", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.webPort = prometheusServerPort
cfg.coderMetricsPort = 2114
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--web-port")
assert.Contains(t, err.Error(), "conflicts with prometheus server")
})
t.Run("PrometheusServerPortConflictWithProxy", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.useProxy = true
cfg.proxyPort = prometheusServerPort
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--proxy-port")
assert.Contains(t, err.Error(), "conflicts with prometheus server")
})
t.Run("PrometheusServerPortNoProxyConflictWithoutFlag", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.proxyPort = prometheusServerPort
// useProxy is false, so no conflict.
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusServerPortConflictWithMetrics", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.coderMetricsPort = prometheusServerPort
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port")
assert.Contains(t, err.Error(), "conflicts with prometheus server")
})
}
func TestDevConfigResolveEnv(t *testing.T) {
@@ -515,3 +589,65 @@ func TestPoll(t *testing.T) {
assert.Equal(t, 2, calls)
})
}
func TestStartPrometheusServerDockerMissing(t *testing.T) {
// Not t.Parallel(): mutates PATH via t.Setenv.
t.Setenv("PATH", "")
logger := slog.Make(sloghuman.Sink(&bytes.Buffer{}))
cfg := &devConfig{prometheusServer: true, coderMetricsPort: 2114}
started, err := startPrometheusServer(t.Context(), logger, cfg)
require.NoError(t, err)
assert.False(t, started)
}
func TestPrometheusBannerEntry(t *testing.T) {
t.Parallel()
cases := []struct {
name string
cfg *devConfig
started bool
wantLabel string
wantPort int64
}{
{
name: "MetricsDisabled",
cfg: &devConfig{coderMetricsPort: 0},
started: false,
wantLabel: "",
wantPort: 0,
},
{
name: "MetricsOnlyDefault",
cfg: &devConfig{coderMetricsPort: 2114},
started: false,
wantLabel: "Metrics:",
wantPort: 2114,
},
{
name: "PrometheusServerUp",
cfg: &devConfig{coderMetricsPort: 2114, prometheusServer: true},
started: true,
wantLabel: "Prometheus UI:",
wantPort: prometheusServerPort,
},
{
name: "ServerRequestedButDown",
cfg: &devConfig{coderMetricsPort: 2114, prometheusServer: true},
started: false,
wantLabel: "Metrics:",
wantPort: 2114,
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
label, port := prometheusBannerEntry(tc.cfg, tc.started)
assert.Equal(t, tc.wantLabel, label)
assert.Equal(t, tc.wantPort, port)
})
}
}