From 1460293de4deb63a5125260fe7cc66d06d864161 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 13 Mar 2026 15:16:38 +0100 Subject: [PATCH] feat(coder/mux): add restart retries for mux exits (#800) ## Summary - add optional mux auto-restarts with delay, lock cleanup, and restart-attempt caps - restart mux after any exit when enabled, including intentional exits and signals - require `max_restart_attempts` to be a non-negative whole number and update docs/tests for the new restart semantics ## Validation - `bash -n registry/coder/modules/mux/run.sh` - `cd registry/coder/modules/mux && terraform validate` - `cd registry/coder/modules/mux && terraform test -verbose` - `cd registry/coder/modules/mux && bun test main.test.ts` Generated with OpenAI using Mux --- registry/coder/modules/mux/README.md | 40 +++++-- registry/coder/modules/mux/main.test.ts | 137 ++++++++++++++++++++++ registry/coder/modules/mux/main.tf | 31 +++++ registry/coder/modules/mux/mux.tftest.hcl | 105 +++++++++++++++++ registry/coder/modules/mux/run.sh | 91 ++++++++++++-- 5 files changed, 384 insertions(+), 20 deletions(-) diff --git a/registry/coder/modules/mux/README.md b/registry/coder/modules/mux/README.md index 46bf295b..fb26d381 100644 --- a/registry/coder/modules/mux/README.md +++ b/registry/coder/modules/mux/README.md @@ -8,13 +8,13 @@ tags: [ai, agents, development, multiplexer] # Mux -Automatically install and run [Mux](https://github.com/coder/mux) in a Coder workspace. By default, the module auto-detects an available package manager (`npm`, `pnpm`, or `bun`) to install `mux@next` (with a fallback to downloading the npm tarball if none is found). You can also force a specific package manager via `package_manager` and point to a custom registry with `registry_url`. The launcher now keeps watching the mux process after startup and appends signal/exit-code diagnostics to the mux log when the server is killed outside the Node runtime. 
Mux is a desktop application for parallel agentic development that enables developers to run multiple AI agents simultaneously across isolated workspaces. +Automatically install and run [Mux](https://github.com/coder/mux) in a Coder workspace. By default, the module auto-detects an available package manager (`npm`, `pnpm`, or `bun`) to install `mux@next` (with a fallback to downloading the npm tarball if none is found). You can also force a specific package manager via `package_manager` and point to a custom registry with `registry_url`. The launcher keeps watching the mux process after startup and appends signal/exit-code diagnostics to the mux log when the server is killed outside the Node runtime. When `restart_on_kill` is enabled, the launcher also waits `restart_delay_seconds`, removes the stale server lock (`~/.mux/server.lock`), and restarts Mux after any exit, until the optional `max_restart_attempts` cap is reached. Mux is a desktop application for parallel agentic development that enables developers to run multiple AI agents simultaneously across isolated workspaces. 
```tf module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id } ``` @@ -37,7 +37,7 @@ module "mux" { module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id } ``` @@ -48,7 +48,7 @@ module "mux" { module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id # Default is "latest"; set to a specific version to pin install_version = "0.4.0" @@ -63,7 +63,7 @@ Start Mux with `mux server --add-project /path/to/project`: module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id add_project = "/path/to/project" } @@ -78,19 +78,35 @@ The module parses quoted values, so grouped arguments remain intact. module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id additional_arguments = "--open-mode pinned --add-project '/workspaces/my repo'" } ``` +### Restart After Mux Exits + +Enable automatic restarts after Mux exits, including clean exits and intentional shutdown signals such as `SIGTERM`. The launcher waits for `restart_delay_seconds`, removes `~/.mux/server.lock`, and starts Mux again. Set `max_restart_attempts` to a whole number to stop retrying after a fixed number of restarts, or leave it at `0` for unlimited retries. 
+ +```tf +module "mux" { + count = data.coder_workspace.me.start_count + source = "registry.coder.com/coder/mux/coder" + version = "1.4.3" + agent_id = coder_agent.main.id + restart_on_kill = true + restart_delay_seconds = 3 + max_restart_attempts = 5 +} +``` + ### Custom Port ```tf module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id port = 8080 } @@ -104,7 +120,7 @@ Force a specific package manager instead of auto-detection: module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id package_manager = "pnpm" # or "npm", "bun" } @@ -118,7 +134,7 @@ Use a private or mirrored npm registry: module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id registry_url = "https://npm.pkg.github.com" } @@ -132,7 +148,7 @@ Run an existing copy of Mux if found, otherwise install from npm: module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id use_cached = true } @@ -146,7 +162,7 @@ Run without installing from the network (requires Mux to be pre-installed): module "mux" { count = data.coder_workspace.me.start_count source = "registry.coder.com/coder/mux/coder" - version = "1.4.0" + version = "1.4.3" agent_id = coder_agent.main.id install = false } @@ -164,3 +180,5 @@ module "mux" { - Installs `mux@next` from the npm registry by default; set `registry_url` to use a private or mirrored registry - Falls back to a direct tarball download when no package manager is found - Appends best-effort signal and external-kill diagnostics to `log_path` if the mux process dies after startup +- Set `restart_on_kill = true` to 
wait `restart_delay_seconds`, remove `~/.mux/server.lock`, and restart Mux after it exits +- Set `max_restart_attempts` to a whole-number cap on restart attempts, or leave it at `0` for unlimited retries diff --git a/registry/coder/modules/mux/main.test.ts b/registry/coder/modules/mux/main.test.ts index 9537e9de..a8944dee 100644 --- a/registry/coder/modules/mux/main.test.ts +++ b/registry/coder/modules/mux/main.test.ts @@ -145,6 +145,143 @@ chmod +x /tmp/mux/mux`, } }, 60000); + it("restarts after a clean exit when enabled", async () => { + const state = await runTerraformApply(import.meta.dir, { + agent_id: "foo", + install: false, + log_path: "/tmp/mux.log", + restart_on_kill: true, + restart_delay_seconds: 1, + max_restart_attempts: 1, + }); + + const instance = findResourceInstance(state, "coder_script"); + const id = await runContainer("alpine/curl"); + + try { + const setup = await execContainer(id, [ + "sh", + "-c", + `apk add --no-cache bash >/dev/null +mkdir -p /tmp/mux +cat <<'EOF' > /tmp/mux/mux +#!/usr/bin/env sh +run_count_file="/tmp/mux-run-count" +run_count=0 +if [ -f "$run_count_file" ]; then + run_count=$(cat "$run_count_file") +fi +run_count=$((run_count + 1)) +printf '%s' "$run_count" > "$run_count_file" +echo "run=$run_count" +if [ "$run_count" -eq 1 ]; then + mkdir -p "$HOME/.mux" + touch "$HOME/.mux/server.lock" + exit 0 +fi +if [ -f "$HOME/.mux/server.lock" ]; then + echo "lock=present" +else + echo "lock=cleaned" +fi +exit 0 +EOF +chmod +x /tmp/mux/mux`, + ]); + expect(setup.exitCode).toBe(0); + + const output = await execContainer(id, ["sh", "-c", instance.script]); + if (output.exitCode !== 0) { + console.log("STDOUT:\n" + output.stdout); + console.log("STDERR:\n" + output.stderr); + } + expect(output.exitCode).toBe(0); + + await execContainer(id, ["sh", "-c", "sleep 4"]); + const log = await readFileContainer(id, "/tmp/mux.log"); + const runCount = await readFileContainer(id, "/tmp/mux-run-count"); + expect(log).toContain("run=1"); + 
expect(log).toContain("mux server exited cleanly."); + expect(log).toContain( + "Waiting 1 seconds before restarting mux after it exited.", + ); + expect(log).toContain( + "Removing /root/.mux/server.lock before restarting mux.", + ); + expect(log).toContain("run=2"); + expect(log).toContain("lock=cleaned"); + expect(log).toContain( + "Reached the max restart attempts limit (1); not restarting mux again.", + ); + expect(runCount.trim()).toBe("2"); + } finally { + await removeContainer(id); + } + }, 60000); + + it("restarts after SIGTERM when enabled", async () => { + const state = await runTerraformApply(import.meta.dir, { + agent_id: "foo", + install: false, + log_path: "/tmp/mux.log", + restart_on_kill: true, + restart_delay_seconds: 1, + max_restart_attempts: 1, + }); + + const instance = findResourceInstance(state, "coder_script"); + const id = await runContainer("alpine/curl"); + + try { + const setup = await execContainer(id, [ + "sh", + "-c", + `apk add --no-cache bash >/dev/null +mkdir -p /tmp/mux +cat <<'EOF' > /tmp/mux/mux +#!/usr/bin/env sh +run_count_file="/tmp/mux-run-count" +run_count=0 +if [ -f "$run_count_file" ]; then + run_count=$(cat "$run_count_file") +fi +run_count=$((run_count + 1)) +printf '%s' "$run_count" > "$run_count_file" +echo "run=$run_count" +if [ "$run_count" -eq 1 ]; then + kill -TERM $$ +fi +exit 0 +EOF +chmod +x /tmp/mux/mux`, + ]); + expect(setup.exitCode).toBe(0); + + const output = await execContainer(id, ["sh", "-c", instance.script]); + if (output.exitCode !== 0) { + console.log("STDOUT:\n" + output.stdout); + console.log("STDERR:\n" + output.stderr); + } + expect(output.exitCode).toBe(0); + + await execContainer(id, ["sh", "-c", "sleep 4"]); + const log = await readFileContainer(id, "/tmp/mux.log"); + const runCount = await readFileContainer(id, "/tmp/mux-run-count"); + expect(log).toContain("run=1"); + expect(log).toContain("signal TERM (15); shell exit code 143."); + expect(log).toContain( + "Waiting 1 seconds before 
restarting mux after it exited.", + ); + expect(log).toContain("run=2"); + expect(log).toContain( + "Reached the max restart attempts limit (1); not restarting mux again.", + ); + expect(runCount.trim()).toBe("2"); + } finally { + await removeContainer(id); + } + }, 60000); + it("runs with npm present", async () => { const state = await runTerraformApply(import.meta.dir, { agent_id: "foo", diff --git a/registry/coder/modules/mux/main.tf b/registry/coder/modules/mux/main.tf index ba475b0c..f80b8b3f 100644 --- a/registry/coder/modules/mux/main.tf +++ b/registry/coder/modules/mux/main.tf @@ -49,6 +49,34 @@ variable "log_path" { default = "/tmp/mux.log" } +variable "restart_on_kill" { + type = bool + description = "Restart Mux after it exits by waiting briefly, removing the server lock, and launching it again." + default = false +} + +variable "restart_delay_seconds" { + type = number + description = "How long to wait before restarting Mux after it exits when restart_on_kill is enabled." + default = 5 + + validation { + condition = var.restart_delay_seconds >= 0 + error_message = "The 'restart_delay_seconds' variable must be greater than or equal to 0." + } +} + +variable "max_restart_attempts" { + type = number + description = "Maximum whole-number restart attempts before giving up. Set to 0 for unlimited restarts when restart_on_kill is enabled." + default = 0 + + validation { + condition = var.max_restart_attempts >= 0 && floor(var.max_restart_attempts) == var.max_restart_attempts + error_message = "The 'max_restart_attempts' variable must be a whole number greater than or equal to 0." + } +} + variable "add_project" { type = string description = "Optional path to add/open as a project in Mux on startup." 
@@ -171,6 +199,9 @@ resource "coder_script" "mux" { OFFLINE : !var.install, USE_CACHED : var.use_cached, AUTH_TOKEN : local.mux_auth_token, + RESTART_ON_KILL : var.restart_on_kill, + RESTART_DELAY_SECONDS : var.restart_delay_seconds, + MAX_RESTART_ATTEMPTS : var.max_restart_attempts, PACKAGE_MANAGER : var.package_manager, REGISTRY_URL : local.registry_url, }) diff --git a/registry/coder/modules/mux/mux.tftest.hcl b/registry/coder/modules/mux/mux.tftest.hcl index e7816de8..af4cbfe2 100644 --- a/registry/coder/modules/mux/mux.tftest.hcl +++ b/registry/coder/modules/mux/mux.tftest.hcl @@ -111,6 +111,111 @@ run "launcher_logs_external_kills" { } } +run "restart_on_kill_enabled" { + command = plan + + variables { + agent_id = "foo" + restart_on_kill = true + restart_delay_seconds = 7 + } + + assert { + condition = strcontains(resource.coder_script.mux.script, "restart_on_kill_value=\"true\"") + error_message = "mux launcher must receive the restart_on_kill setting" + } + + assert { + condition = strcontains(resource.coder_script.mux.script, "restart_delay_seconds_value=\"7\"") + error_message = "mux launcher must receive the configured restart delay" + } + + assert { + condition = strcontains(resource.coder_script.mux.script, "Waiting $${RESTART_DELAY_SECONDS_VALUE} seconds before restarting mux after it exited.") + error_message = "mux launcher must log the restart delay before relaunching" + } + + assert { + condition = strcontains(resource.coder_script.mux.script, "Removing $HOME/.mux/server.lock before restarting mux.") + error_message = "mux launcher must clean up the server lock before relaunching" + } + + assert { + condition = !strcontains(resource.coder_script.mux.script, "\"$exit_code\" -le 128") + error_message = "mux launcher must no longer exclude non-signal exits from restart handling" + } + + assert { + condition = !strcontains(resource.coder_script.mux.script, "1|2|15)") + error_message = "mux launcher must no longer exclude intentional signals from 
restart handling" + } +} + +run "restart_on_kill_with_restart_cap" { + command = plan + + variables { + agent_id = "foo" + restart_on_kill = true + restart_delay_seconds = 7 + max_restart_attempts = 2 + } + + assert { + condition = strcontains(resource.coder_script.mux.script, "max_restart_attempts_value=\"2\"") + error_message = "mux launcher must receive the configured restart cap" + } + + assert { + condition = strcontains(resource.coder_script.mux.script, "Mux will stop restarting after $${max_restart_attempts_value} restart attempts.") + error_message = "mux launcher must describe the configured restart cap" + } + + assert { + condition = strcontains(resource.coder_script.mux.script, "Reached the max restart attempts limit ($MAX_RESTART_ATTEMPTS_VALUE); not restarting mux again.") + error_message = "mux launcher must log when it hits the restart cap" + } +} + +run "invalid_max_restart_attempts" { + command = plan + + variables { + agent_id = "foo" + max_restart_attempts = -1 + } + + expect_failures = [ + var.max_restart_attempts + ] +} + +run "fractional_max_restart_attempts" { + command = plan + + variables { + agent_id = "foo" + max_restart_attempts = 0.5 + } + + expect_failures = [ + var.max_restart_attempts + ] +} + +run "invalid_restart_delay_seconds" { + command = plan + + variables { + agent_id = "foo" + restart_delay_seconds = -1 + } + + expect_failures = [ + var.restart_delay_seconds + ] +} + run "custom_version" { command = plan diff --git a/registry/coder/modules/mux/run.sh b/registry/coder/modules/mux/run.sh index fb583480..bd2bb811 100644 --- a/registry/coder/modules/mux/run.sh +++ b/registry/coder/modules/mux/run.sh @@ -5,17 +5,30 @@ RESET='\033[0m' MUX_BINARY="${INSTALL_PREFIX}/mux" function run_mux() { - # Remove stale server lock if present - rm -f "$HOME/.mux/server.lock" - local port_value local auth_token_value + local restart_on_kill_value + local restart_delay_seconds_value + local max_restart_attempts_value + port_value="${PORT}" 
auth_token_value="${AUTH_TOKEN}" + restart_on_kill_value="${RESTART_ON_KILL}" + restart_delay_seconds_value="${RESTART_DELAY_SECONDS}" + max_restart_attempts_value="${MAX_RESTART_ATTEMPTS}" + if [ -z "$port_value" ]; then port_value="4000" fi + if [ -z "$restart_delay_seconds_value" ]; then + restart_delay_seconds_value="5" + fi + + if [ -z "$max_restart_attempts_value" ]; then + max_restart_attempts_value="0" + fi + mkdir -p "$(dirname "${LOG_PATH}")" # Build args for mux (POSIX-compatible, avoid bash arrays) @@ -41,13 +54,24 @@ EOF_ARGS echo "🚀 Starting mux server on port $port_value..." echo "Check logs at ${LOG_PATH}!" - echo "ℹ️ Unexpected exits will be appended to ${LOG_PATH} by the launcher." + echo "ℹ️ Mux exit details will be appended to ${LOG_PATH} by the launcher." + if [ "$restart_on_kill_value" = true ]; then + echo "ℹ️ Auto-restart after mux exits is enabled with a $${restart_delay_seconds_value}-second delay." + if [ "$max_restart_attempts_value" = "0" ]; then + echo "ℹ️ Automatic restarts are unlimited for every mux exit." + else + echo "ℹ️ Mux will stop restarting after $${max_restart_attempts_value} restart attempts." + fi + fi nohup env \ LOG_PATH="${LOG_PATH}" \ MUX_BINARY="$MUX_BINARY" \ AUTH_TOKEN="$auth_token_value" \ PORT_VALUE="$port_value" \ + RESTART_ON_KILL_VALUE="$restart_on_kill_value" \ + RESTART_DELAY_SECONDS_VALUE="$restart_delay_seconds_value" \ + MAX_RESTART_ATTEMPTS_VALUE="$max_restart_attempts_value" \ bash -s -- "$@" > /dev/null 2>&1 << 'EOF_LAUNCHER' & signal_name() { local signal_number="$1" @@ -82,6 +106,14 @@ append_kernel_kill_context() { fi } +cleanup_mux_lock() { + rm -f "$HOME/.mux/server.lock" +} + +should_restart_mux() { + [ "$RESTART_ON_KILL_VALUE" = "true" ] +} + log_mux_exit() { local mux_pid="$1" local exit_code="$2" @@ -114,11 +146,52 @@ log_mux_exit() { echo "[$timestamp] Check the earlier mux log lines for any in-process crash breadcrumbs from mux itself." 
} -MUX_SERVER_AUTH_TOKEN="$AUTH_TOKEN" PORT="$PORT_VALUE" "$MUX_BINARY" "$@" >> "$LOG_PATH" 2>&1 & -mux_pid=$! -wait "$mux_pid" -exit_code=$? -log_mux_exit "$mux_pid" "$exit_code" >> "$LOG_PATH" 2>&1 +log_mux_restart_wait() { + local timestamp + + timestamp="$(date -Iseconds 2> /dev/null || date)" + echo "[$timestamp] Waiting $${RESTART_DELAY_SECONDS_VALUE} seconds before restarting mux after it exited." +} + +log_mux_restart_cleanup() { + local timestamp + + timestamp="$(date -Iseconds 2> /dev/null || date)" + echo "[$timestamp] Removing $HOME/.mux/server.lock before restarting mux." +} + +log_mux_restart_cap_reached() { + local timestamp + + timestamp="$(date -Iseconds 2> /dev/null || date)" + echo "[$timestamp] Reached the max restart attempts limit ($MAX_RESTART_ATTEMPTS_VALUE); not restarting mux again." +} + +restart_attempt_count=0 +while true; do + cleanup_mux_lock + MUX_SERVER_AUTH_TOKEN="$AUTH_TOKEN" PORT="$PORT_VALUE" "$MUX_BINARY" "$@" >> "$LOG_PATH" 2>&1 & + mux_pid=$! + wait "$mux_pid" + exit_code=$? + log_mux_exit "$mux_pid" "$exit_code" >> "$LOG_PATH" 2>&1 + + if should_restart_mux; then + if [ "$MAX_RESTART_ATTEMPTS_VALUE" -gt 0 ] && [ "$restart_attempt_count" -ge "$MAX_RESTART_ATTEMPTS_VALUE" ]; then + log_mux_restart_cap_reached >> "$LOG_PATH" 2>&1 + break + fi + + restart_attempt_count=$((restart_attempt_count + 1)) + log_mux_restart_wait >> "$LOG_PATH" 2>&1 + sleep "$RESTART_DELAY_SECONDS_VALUE" + cleanup_mux_lock + log_mux_restart_cleanup >> "$LOG_PATH" 2>&1 + continue + fi + + break +done EOF_LAUNCHER } # Check if mux is already installed for offline mode