feat(scripts/develop): add --prometheus-server flag to run Prometheus UI (#24408)

Builds on #24389. Adds `--prometheus-server` flag that starts a
`prom/prometheus:v3.11.2` container alongside the dev environment.

> 🤖
This commit is contained in:
Cian Johnston
2026-04-20 15:08:13 +01:00
committed by GitHub
parent 81bd78d1c0
commit d99949df43
2 changed files with 391 additions and 49 deletions
+242 -36
View File
@@ -42,9 +42,18 @@ const (
defaultAPIPort = "3000"
defaultWebPort = "8080"
defaultProxyPort = "3010"
// prometheusServerPort is an int64 (not a string like the
// user-facing defaults) because it has no corresponding CLI
// flag; the Prometheus UI port is fixed at 9090.
prometheusServerPort int64 = 9090
// prometheusContainerName is the Docker container name for
// the embedded Prometheus server, used for reuse detection
// and explicit cleanup on shutdown.
prometheusContainerName = "coder-prometheus"
// defaultPrometheusPort avoids 2112 (agent prometheus) and
// 2113 (agent debug) already bound inside Coder workspaces.
defaultPrometheusPort = "2114"
prometheusImage = "prom/prometheus:v3.11.2"
defaultAccessURL = "http://127.0.0.1:%d"
defaultPassword = "SomeSecurePassword!"
defaultStarterTemplate = "docker"
@@ -85,7 +94,13 @@ func main() {
Env: "CODER_DEV_PROMETHEUS_PORT",
Default: defaultPrometheusPort,
Description: "Prometheus metrics port. Set to 0 to disable.",
Value: serpent.Int64Of(&cfg.prometheusPort),
Value: serpent.Int64Of(&cfg.coderMetricsPort),
},
{
Flag: "prometheus-server",
Env: "CODER_DEV_PROMETHEUS_SERVER",
Description: "Run a Prometheus server to scrape and visualize metrics. Requires Docker. Linux only.",
Value: serpent.BoolOf(&cfg.prometheusServer),
},
{
Flag: "agpl",
@@ -170,24 +185,25 @@ func main() {
}
type devConfig struct {
apiPort int64
webPort int64
proxyPort int64
prometheusPort int64
agpl bool
accessURL string
password string
useProxy bool
multiOrg bool
debug bool
starterTemplate string
dbRollback bool
dbReset bool
dbContinue bool
projectRoot string
binaryPath string
configDir string
childEnv []string
apiPort int64
webPort int64
proxyPort int64
coderMetricsPort int64
prometheusServer bool
agpl bool
accessURL string
password string
useProxy bool
multiOrg bool
debug bool
starterTemplate string
dbRollback bool
dbReset bool
dbContinue bool
projectRoot string
binaryPath string
configDir string
childEnv []string
// Extra args after flags forwarded to "coder server".
serverExtraArgs []string
}
@@ -217,7 +233,7 @@ func (c *devConfig) validate() error {
return xerrors.Errorf("%s must be between 1 and 65535", p.name)
}
}
if c.prometheusPort < 0 || c.prometheusPort > 65535 {
if c.coderMetricsPort < 0 || c.coderMetricsPort > 65535 {
return xerrors.Errorf("--prometheus-port must be 0 (disabled) or between 1 and 65535")
}
if c.apiPort == c.webPort {
@@ -229,15 +245,39 @@ func (c *devConfig) validate() error {
if c.useProxy && c.webPort == c.proxyPort {
return xerrors.Errorf("--web-port %d conflicts with --proxy-port", c.webPort)
}
if c.prometheusPort != 0 {
if c.prometheusPort == c.apiPort {
return xerrors.Errorf("--prometheus-port %d conflicts with API server", c.prometheusPort)
if c.coderMetricsPort != 0 {
if c.coderMetricsPort == c.apiPort {
return xerrors.Errorf("--prometheus-port %d conflicts with API server", c.coderMetricsPort)
}
if c.prometheusPort == c.webPort {
return xerrors.Errorf("--prometheus-port %d conflicts with frontend dev server", c.prometheusPort)
if c.coderMetricsPort == c.webPort {
return xerrors.Errorf("--prometheus-port %d conflicts with frontend dev server", c.coderMetricsPort)
}
if c.useProxy && c.prometheusPort == c.proxyPort {
return xerrors.Errorf("--prometheus-port %d conflicts with workspace proxy", c.prometheusPort)
if c.useProxy && c.coderMetricsPort == c.proxyPort {
return xerrors.Errorf("--prometheus-port %d conflicts with workspace proxy", c.coderMetricsPort)
}
}
if c.prometheusServer && c.coderMetricsPort == 0 {
return xerrors.New("--prometheus-server requires prometheus to be enabled (--prometheus-port != 0)")
}
if c.prometheusServer {
conflicts := []struct {
flag string
val int64
}{
{"--port", c.apiPort},
{"--web-port", c.webPort},
{"--prometheus-port", c.coderMetricsPort},
}
if c.useProxy {
conflicts = append(conflicts, struct {
flag string
val int64
}{"--proxy-port", c.proxyPort})
}
for _, conflict := range conflicts {
if prometheusServerPort == conflict.val {
return xerrors.Errorf("%s %d conflicts with prometheus server", conflict.flag, conflict.val)
}
}
}
return nil
@@ -462,7 +502,17 @@ func develop(ctx context.Context, logger slog.Logger, cfg *devConfig) error {
}
}
printBanner(ctx, logger, cfg)
var prometheusServerStarted bool
if cfg.prometheusServer {
started, err := startPrometheusServer(ctx, logger, cfg)
if err != nil {
logger.Warn(ctx, "prometheus server setup failed, continuing",
slog.Error(err))
}
prometheusServerStarted = started
}
printBanner(ctx, logger, cfg, prometheusServerStarted)
// Block until a signal fires or a child process exits.
<-ctx.Done()
@@ -506,8 +556,8 @@ func preflight(ctx context.Context, logger slog.Logger, cfg *devConfig) error {
if cfg.useProxy && isPortBusy(ctx, cfg.proxyPort) {
return xerrors.Errorf("port %d is already in use (proxy)", cfg.proxyPort)
}
if cfg.prometheusPort != 0 && isPortBusy(ctx, cfg.prometheusPort) {
return xerrors.Errorf("port %d is already in use (prometheus)", cfg.prometheusPort)
if cfg.coderMetricsPort != 0 && isPortBusy(ctx, cfg.coderMetricsPort) {
return xerrors.Errorf("port %d is already in use (prometheus)", cfg.coderMetricsPort)
}
return nil
}
@@ -541,10 +591,10 @@ func startServer(cfg *devConfig, group *procGroup) error {
"--dangerous-allow-cors-requests=true",
"--enable-terraform-debug-mode",
}
if cfg.prometheusPort != 0 {
if cfg.coderMetricsPort != 0 {
serverArgs = append(serverArgs,
"--prometheus-enable",
"--prometheus-address", fmt.Sprintf("0.0.0.0:%d", cfg.prometheusPort),
"--prometheus-address", fmt.Sprintf("0.0.0.0:%d", cfg.coderMetricsPort),
"--prometheus-collect-agent-stats",
"--prometheus-collect-db-metrics",
)
@@ -896,6 +946,147 @@ func createTemplateInOrg(ctx context.Context, logger slog.Logger, client *coders
return nil
}
// startPrometheusServer runs the official Prometheus Docker image
// with a generated config that scrapes the local Coder metrics
// endpoint. It uses --net=host so the container can reach the
// host-bound metrics port directly. Only supported on Linux;
// returns false without error on other platforms.
// Returns true if the server was started or is already running.
func startPrometheusServer(ctx context.Context, logger slog.Logger, cfg *devConfig) (bool, error) {
if runtime.GOOS != "linux" {
logger.Warn(ctx, "prometheus server is only supported on Linux, skipping",
slog.F("os", runtime.GOOS))
return false, nil
}
// Verify Docker is available before attempting anything.
if err := exec.CommandContext(ctx, "docker", "info").Run(); err != nil {
logger.Warn(ctx, "docker not available, skipping prometheus server",
slog.Error(err))
return false, nil
}
// If the port is already in use, check whether it's our
// container from a previous run. If so, reuse it.
if isPortBusy(ctx, prometheusServerPort) {
out, err := exec.CommandContext(ctx, "docker", "inspect",
"-f", "{{.State.Running}}",
prometheusContainerName).Output()
if err == nil && strings.TrimSpace(string(out)) == "true" {
logger.Info(ctx, "reusing existing prometheus server",
slog.F("ui", fmt.Sprintf("http://localhost:%d", prometheusServerPort)),
slog.F("note", fmt.Sprintf("scrape target may differ from current --prometheus-port %d; restart to apply", cfg.coderMetricsPort)))
return true, nil
}
logger.Info(ctx, "prometheus server port already in use, skipping",
slog.F("port", prometheusServerPort))
return false, nil
}
// Remove any stopped leftover container from a previous run.
// Failure is fine; it just means the container doesn't exist.
rmCmd := exec.CommandContext(ctx, "docker", "rm", "-f", prometheusContainerName) //nolint:gosec
rmCmd.Stdout = nil
rmCmd.Stderr = nil
_ = rmCmd.Run()
// Persist TSDB data across dev environment restarts. The
// container runs as nobody (UID 65534), so the directory must
// be world-writable. os.MkdirAll applies the umask, so we
// chmod explicitly after creation.
prometheusDataDir := filepath.Join(cfg.configDir, "prometheus")
if err := os.MkdirAll(prometheusDataDir, 0o777); err != nil {
return false, xerrors.Errorf("creating prometheus data directory: %w", err)
}
if err := os.Chmod(prometheusDataDir, 0o777); err != nil {
return false, xerrors.Errorf("chmod prometheus data directory: %w", err)
}
// Write a minimal scrape config to a temp file.
promCfg := fmt.Sprintf(`global:
scrape_interval: 15s
scrape_configs:
- job_name: coder
scheme: http
static_configs:
- targets: ["127.0.0.1:%d"]
`, cfg.coderMetricsPort)
tmpFile, err := os.CreateTemp("", "coder-prometheus-*.yml")
if err != nil {
return false, xerrors.Errorf("creating prometheus config: %w", err)
}
// Stop the container and remove the temp file when the context is
// done. The stop must happen before the file removal so Prometheus
// is not holding the bind mount open when we delete the source.
// Registering this cleanup immediately after CreateTemp means every
// later failure path can simply return without its own cleanup call.
context.AfterFunc(ctx, func() {
stopCmd := exec.Command("docker", "stop", "-t", "5", prometheusContainerName) //nolint:gosec
stopCmd.Stdout = nil
stopCmd.Stderr = nil
_ = stopCmd.Run()
_ = os.Remove(tmpFile.Name())
})
if _, err := tmpFile.WriteString(promCfg); err != nil {
_ = tmpFile.Close()
return false, xerrors.Errorf("writing prometheus config: %w", err)
}
_ = tmpFile.Close()
// The Prometheus container runs as nobody, so the file must be
// world-readable. os.CreateTemp creates files with mode 0600.
if err := os.Chmod(tmpFile.Name(), 0o644); err != nil {
return false, xerrors.Errorf("chmod prometheus config: %w", err)
}
cmd := exec.CommandContext(ctx, "docker", "run", //nolint:gosec // args are all controlled constants or our own temp file path
"--rm",
"--name", prometheusContainerName,
"--net=host",
"-v", tmpFile.Name()+":/etc/prometheus/prometheus.yml:ro",
"-v", prometheusDataDir+":/prometheus",
prometheusImage,
"--config.file=/etc/prometheus/prometheus.yml",
fmt.Sprintf("--web.listen-address=0.0.0.0:%d", prometheusServerPort),
)
named := logger.Named("prometheus")
w := &logWriter{logger: named}
cmd.Stdout = w
cmd.Stderr = w
named.Info(ctx, "starting prometheus server",
slog.F("image", prometheusImage),
slog.F("scrape_target", fmt.Sprintf("127.0.0.1:%d", cfg.coderMetricsPort)),
slog.F("ui", fmt.Sprintf("http://localhost:%d", prometheusServerPort)),
)
if err := cmd.Start(); err != nil {
return false, xerrors.Errorf("starting prometheus container: %w", err)
}
// Wait for the container in a separate goroutine. Prometheus is
// optional, so if it dies we just log a warning rather than
// tearing down the entire dev environment.
go func() {
if err := cmd.Wait(); err != nil {
if ctx.Err() != nil {
// Normal shutdown: context was canceled.
named.Info(ctx, "prometheus server stopped")
return
}
named.Warn(ctx, "prometheus server exited", slog.Error(err))
} else {
named.Warn(ctx, "prometheus server exited unexpectedly")
}
}()
return true, nil
}
func pnpmCmd(ctx context.Context, cfg *devConfig) *exec.Cmd {
cmd := cfg.cmd(ctx, "pnpm", "--dir", "./site", "dev", "--host")
cmd.Env = append(cmd.Env,
@@ -905,7 +1096,22 @@ func pnpmCmd(ctx context.Context, cfg *devConfig) *exec.Cmd {
return cmd
}
func printBanner(ctx context.Context, logger slog.Logger, cfg *devConfig) {
// prometheusBannerEntry decides which (if any) prometheus-related URL
// the dev banner should advertise. When the embedded Prometheus server
// is running we prefer its UI; otherwise fall back to the raw metrics
// endpoint. Returns an empty label when metrics are disabled entirely.
func prometheusBannerEntry(cfg *devConfig, prometheusServerStarted bool) (label string, port int64) {
switch {
case prometheusServerStarted:
return "Prometheus UI:", prometheusServerPort
case cfg.coderMetricsPort != 0:
return "Metrics:", cfg.coderMetricsPort
default:
return "", 0
}
}
func printBanner(ctx context.Context, logger slog.Logger, cfg *devConfig, prometheusServerStarted bool) {
ifaces := []string{"localhost"}
if addrs, err := net.InterfaceAddrs(); err == nil {
for _, addr := range addrs {
@@ -960,13 +1166,13 @@ func printBanner(ctx context.Context, logger slog.Logger, cfg *devConfig) {
line(indent(fmt.Sprintf("http://%s:%d", h, cfg.proxyPort)))
}
}
if cfg.prometheusPort != 0 {
if label, port := prometheusBannerEntry(cfg, prometheusServerStarted); label != "" {
line(
"",
"Metrics:",
label,
)
for _, h := range ifaces {
line(indent(fmt.Sprintf("http://%s:%d", h, cfg.prometheusPort)))
line(indent(fmt.Sprintf("http://%s:%d", h, port)))
}
}
line(
+149 -13
View File
@@ -171,11 +171,11 @@ func TestDevConfigValidate(t *testing.T) {
base := func() *devConfig {
return &devConfig{
apiPort: 3000,
webPort: 8080,
proxyPort: 3010,
prometheusPort: 2114,
password: defaultPassword,
apiPort: 3000,
webPort: 8080,
proxyPort: 3010,
coderMetricsPort: 2114,
password: defaultPassword,
}
}
@@ -288,7 +288,7 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortConflictWithAPI", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 3000
cfg.coderMetricsPort = 3000
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port 3000 conflicts with")
@@ -297,7 +297,7 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortConflictWithWeb", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 8080
cfg.coderMetricsPort = 8080
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port 8080 conflicts with")
@@ -306,7 +306,7 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortConflictWithProxy", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 3010
cfg.coderMetricsPort = 3010
cfg.useProxy = true
err := cfg.validate()
require.Error(t, err)
@@ -316,21 +316,21 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortZeroDisabled", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 0
cfg.coderMetricsPort = 0
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusPortValid", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 9090
cfg.coderMetricsPort = 9090
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusPortTooHigh", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 70000
cfg.coderMetricsPort = 70000
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port must be 0 (disabled) or between 1 and 65535")
@@ -339,7 +339,7 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusPortNegative", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = -1
cfg.coderMetricsPort = -1
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port must be 0 (disabled) or between 1 and 65535")
@@ -348,9 +348,83 @@ func TestDevConfigValidate(t *testing.T) {
t.Run("PrometheusProxyProxyConflictIgnoredWithoutProxy", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusPort = 3010
cfg.coderMetricsPort = 3010
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusServerRequiresMetrics", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.coderMetricsPort = 0
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-server requires prometheus to be enabled")
})
t.Run("PrometheusServerValid", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.coderMetricsPort = 2114
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusServerPortConflictWithAPI", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.apiPort = prometheusServerPort
cfg.coderMetricsPort = 2114
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--port")
assert.Contains(t, err.Error(), "conflicts with prometheus server")
})
t.Run("PrometheusServerPortConflictWithWeb", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.webPort = prometheusServerPort
cfg.coderMetricsPort = 2114
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--web-port")
assert.Contains(t, err.Error(), "conflicts with prometheus server")
})
t.Run("PrometheusServerPortConflictWithProxy", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.useProxy = true
cfg.proxyPort = prometheusServerPort
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--proxy-port")
assert.Contains(t, err.Error(), "conflicts with prometheus server")
})
t.Run("PrometheusServerPortNoProxyConflictWithoutFlag", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.proxyPort = prometheusServerPort
// useProxy is false, so no conflict.
assert.NoError(t, cfg.validate())
})
t.Run("PrometheusServerPortConflictWithMetrics", func(t *testing.T) {
t.Parallel()
cfg := base()
cfg.prometheusServer = true
cfg.coderMetricsPort = prometheusServerPort
err := cfg.validate()
require.Error(t, err)
assert.Contains(t, err.Error(), "--prometheus-port")
assert.Contains(t, err.Error(), "conflicts with prometheus server")
})
}
func TestDevConfigResolveEnv(t *testing.T) {
@@ -515,3 +589,65 @@ func TestPoll(t *testing.T) {
assert.Equal(t, 2, calls)
})
}
func TestStartPrometheusServerDockerMissing(t *testing.T) {
// Not t.Parallel(): mutates PATH via t.Setenv.
t.Setenv("PATH", "")
logger := slog.Make(sloghuman.Sink(&bytes.Buffer{}))
cfg := &devConfig{prometheusServer: true, coderMetricsPort: 2114}
started, err := startPrometheusServer(t.Context(), logger, cfg)
require.NoError(t, err)
assert.False(t, started)
}
func TestPrometheusBannerEntry(t *testing.T) {
t.Parallel()
cases := []struct {
name string
cfg *devConfig
started bool
wantLabel string
wantPort int64
}{
{
name: "MetricsDisabled",
cfg: &devConfig{coderMetricsPort: 0},
started: false,
wantLabel: "",
wantPort: 0,
},
{
name: "MetricsOnlyDefault",
cfg: &devConfig{coderMetricsPort: 2114},
started: false,
wantLabel: "Metrics:",
wantPort: 2114,
},
{
name: "PrometheusServerUp",
cfg: &devConfig{coderMetricsPort: 2114, prometheusServer: true},
started: true,
wantLabel: "Prometheus UI:",
wantPort: prometheusServerPort,
},
{
name: "ServerRequestedButDown",
cfg: &devConfig{coderMetricsPort: 2114, prometheusServer: true},
started: false,
wantLabel: "Metrics:",
wantPort: 2114,
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
label, port := prometheusBannerEntry(tc.cfg, tc.started)
assert.Equal(t, tc.wantLabel, label)
assert.Equal(t, tc.wantPort, port)
})
}
}