feat(coder/mux): add restart retries for mux exits (#800)

## Summary
- add optional mux auto-restarts with delay, lock cleanup, and
restart-attempt caps
- restart mux after any exit when enabled, including intentional exits
and signals
- require `max_restart_attempts` to be a non-negative whole number and
update docs/tests for the new restart semantics

## Validation
- `bash -n registry/coder/modules/mux/run.sh`
- `cd registry/coder/modules/mux && terraform validate`
- `cd registry/coder/modules/mux && terraform test -verbose`
- `cd registry/coder/modules/mux && bun test main.test.ts`

Generated with OpenAI using Mux
This commit is contained in:
Michael Suchacz
2026-03-13 15:16:38 +01:00
committed by GitHub
parent 9606297620
commit 1460293de4
5 changed files with 384 additions and 20 deletions
+29 -11
View File
@@ -8,13 +8,13 @@ tags: [ai, agents, development, multiplexer]
# Mux
Automatically install and run [Mux](https://github.com/coder/mux) in a Coder workspace. By default, the module auto-detects an available package manager (`npm`, `pnpm`, or `bun`) to install `mux@next` (with a fallback to downloading the npm tarball if none is found). You can also force a specific package manager via `package_manager` and point to a custom registry with `registry_url`. The launcher now keeps watching the mux process after startup and appends signal/exit-code diagnostics to the mux log when the server is killed outside the Node runtime. Mux is a desktop application for parallel agentic development that enables developers to run multiple AI agents simultaneously across isolated workspaces.
Automatically install and run [Mux](https://github.com/coder/mux) in a Coder workspace. By default, the module auto-detects an available package manager (`npm`, `pnpm`, or `bun`) to install `mux@next` (with a fallback to downloading the npm tarball if none is found). You can also force a specific package manager via `package_manager` and point to a custom registry with `registry_url`. The launcher keeps watching the mux process after startup, appends signal/exit-code diagnostics to the mux log when the server is killed outside the Node runtime, and can optionally wait a few seconds, remove the stale server lock, and restart Mux after any exit until an optional restart-attempt cap is reached. Mux is a desktop application for parallel agentic development that enables developers to run multiple AI agents simultaneously across isolated workspaces.
```tf
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
}
```
@@ -37,7 +37,7 @@ module "mux" {
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
}
```
@@ -48,7 +48,7 @@ module "mux" {
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
# Default is "latest"; set to a specific version to pin
install_version = "0.4.0"
@@ -63,7 +63,7 @@ Start Mux with `mux server --add-project /path/to/project`:
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
add_project = "/path/to/project"
}
@@ -78,19 +78,35 @@ The module parses quoted values, so grouped arguments remain intact.
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
additional_arguments = "--open-mode pinned --add-project '/workspaces/my repo'"
}
```
### Restart After Mux Exits
Enable automatic restarts after Mux exits, including clean exits and intentional shutdown signals such as `SIGTERM`. The launcher waits for `restart_delay_seconds`, removes `~/.mux/server.lock`, and starts Mux again. Set `max_restart_attempts` to a whole number to stop retrying after a fixed number of restarts, or leave it at `0` for unlimited retries.
```tf
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.3"
agent_id = coder_agent.main.id
restart_on_kill = true
restart_delay_seconds = 3
max_restart_attempts = 5
}
```
### Custom Port
```tf
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
port = 8080
}
@@ -104,7 +120,7 @@ Force a specific package manager instead of auto-detection:
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
package_manager = "pnpm" # or "npm", "bun"
}
@@ -118,7 +134,7 @@ Use a private or mirrored npm registry:
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
registry_url = "https://npm.pkg.github.com"
}
@@ -132,7 +148,7 @@ Run an existing copy of Mux if found, otherwise install from npm:
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
use_cached = true
}
@@ -146,7 +162,7 @@ Run without installing from the network (requires Mux to be pre-installed):
module "mux" {
count = data.coder_workspace.me.start_count
source = "registry.coder.com/coder/mux/coder"
version = "1.4.0"
version = "1.4.3"
agent_id = coder_agent.main.id
install = false
}
@@ -164,3 +180,5 @@ module "mux" {
- Installs `mux@next` from the npm registry by default; set `registry_url` to use a private or mirrored registry
- Falls back to a direct tarball download when no package manager is found
- Appends best-effort signal and external-kill diagnostics to `log_path` if the mux process dies after startup
- Set `restart_on_kill = true` to wait `restart_delay_seconds`, remove `~/.mux/server.lock`, and restart Mux after it exits
- Set `max_restart_attempts` to a whole-number cap on restart attempts, or leave it at `0` for unlimited retries
+137
View File
@@ -145,6 +145,143 @@ chmod +x /tmp/mux/mux`,
}
}, 60000);
it("restarts after a clean exit when enabled", async () => {
const state = await runTerraformApply(import.meta.dir, {
agent_id: "foo",
install: false,
log_path: "/tmp/mux.log",
restart_on_kill: true,
restart_delay_seconds: 1,
max_restart_attempts: 1,
});
const instance = findResourceInstance(state, "coder_script");
const id = await runContainer("alpine/curl");
try {
const setup = await execContainer(id, [
"sh",
"-c",
`apk add --no-cache bash >/dev/null
mkdir -p /tmp/mux
cat <<'EOF' > /tmp/mux/mux
#!/usr/bin/env sh
run_count_file="/tmp/mux-run-count"
run_count=0
if [ -f "$run_count_file" ]; then
run_count=$(cat "$run_count_file")
fi
run_count=$((run_count + 1))
printf '%s' "$run_count" > "$run_count_file"
echo "run=$run_count"
if [ "$run_count" -eq 1 ]; then
mkdir -p "$HOME/.mux"
touch "$HOME/.mux/server.lock"
exit 0
fi
if [ -f "$HOME/.mux/server.lock" ]; then
echo "lock=present"
else
echo "lock=cleaned"
fi
exit 0
EOF
chmod +x /tmp/mux/mux`,
]);
expect(setup.exitCode).toBe(0);
const output = await execContainer(id, ["sh", "-c", instance.script]);
if (output.exitCode !== 0) {
console.log("STDOUT:\n" + output.stdout);
console.log("STDERR:\n" + output.stderr);
}
expect(output.exitCode).toBe(0);
await execContainer(id, ["sh", "-c", "sleep 4"]);
const log = await readFileContainer(id, "/tmp/mux.log");
const runCount = await readFileContainer(id, "/tmp/mux-run-count");
expect(log).toContain("run=1");
expect(log).toContain("mux server exited cleanly.");
expect(log).toContain(
"Waiting 1 seconds before restarting mux after it exited.",
);
expect(log).toContain(
"Removing /root/.mux/server.lock before restarting mux.",
);
expect(log).toContain("run=2");
expect(log).toContain("lock=cleaned");
expect(log).toContain(
"Reached the max restart attempts limit (1); not restarting mux again.",
);
expect(runCount.trim()).toBe("2");
} finally {
await removeContainer(id);
}
}, 60000);
it("restarts after SIGTERM when enabled", async () => {
const state = await runTerraformApply(import.meta.dir, {
agent_id: "foo",
install: false,
log_path: "/tmp/mux.log",
restart_on_kill: true,
restart_delay_seconds: 1,
max_restart_attempts: 1,
});
const instance = findResourceInstance(state, "coder_script");
const id = await runContainer("alpine/curl");
try {
const setup = await execContainer(id, [
"sh",
"-c",
`apk add --no-cache bash >/dev/null
mkdir -p /tmp/mux
cat <<'EOF' > /tmp/mux/mux
#!/usr/bin/env sh
run_count_file="/tmp/mux-run-count"
run_count=0
if [ -f "$run_count_file" ]; then
run_count=$(cat "$run_count_file")
fi
run_count=$((run_count + 1))
printf '%s' "$run_count" > "$run_count_file"
echo "run=$run_count"
if [ "$run_count" -eq 1 ]; then
kill -TERM $$
fi
exit 0
EOF
chmod +x /tmp/mux/mux`,
]);
expect(setup.exitCode).toBe(0);
const output = await execContainer(id, ["sh", "-c", instance.script]);
if (output.exitCode !== 0) {
console.log("STDOUT:\n" + output.stdout);
console.log("STDERR:\n" + output.stderr);
}
expect(output.exitCode).toBe(0);
await execContainer(id, ["sh", "-c", "sleep 4"]);
const log = await readFileContainer(id, "/tmp/mux.log");
const runCount = await readFileContainer(id, "/tmp/mux-run-count");
expect(log).toContain("run=1");
expect(log).toContain("signal TERM (15); shell exit code 143.");
expect(log).toContain(
"Waiting 1 seconds before restarting mux after it exited.",
);
expect(log).toContain("run=2");
expect(log).toContain(
"Reached the max restart attempts limit (1); not restarting mux again.",
);
expect(runCount.trim()).toBe("2");
} finally {
await removeContainer(id);
}
}, 60000);
it("runs with npm present", async () => {
const state = await runTerraformApply(import.meta.dir, {
agent_id: "foo",
+31
View File
@@ -49,6 +49,34 @@ variable "log_path" {
default = "/tmp/mux.log"
}
variable "restart_on_kill" {
type = bool
description = "Restart Mux after it exits by waiting briefly, removing the server lock, and launching it again."
default = false
}
variable "restart_delay_seconds" {
type = number
description = "How long to wait before restarting Mux after it exits when restart_on_kill is enabled."
default = 5
validation {
condition = var.restart_delay_seconds >= 0
error_message = "The 'restart_delay_seconds' variable must be greater than or equal to 0."
}
}
variable "max_restart_attempts" {
type = number
description = "Maximum whole-number restart attempts before giving up. Set to 0 for unlimited restarts when restart_on_kill is enabled."
default = 0
validation {
condition = var.max_restart_attempts >= 0 && floor(var.max_restart_attempts) == var.max_restart_attempts
error_message = "The 'max_restart_attempts' variable must be a whole number greater than or equal to 0."
}
}
variable "add_project" {
type = string
description = "Optional path to add/open as a project in Mux on startup."
@@ -171,6 +199,9 @@ resource "coder_script" "mux" {
OFFLINE : !var.install,
USE_CACHED : var.use_cached,
AUTH_TOKEN : local.mux_auth_token,
RESTART_ON_KILL : var.restart_on_kill,
RESTART_DELAY_SECONDS : var.restart_delay_seconds,
MAX_RESTART_ATTEMPTS : var.max_restart_attempts,
PACKAGE_MANAGER : var.package_manager,
REGISTRY_URL : local.registry_url,
})
+105
View File
@@ -111,6 +111,111 @@ run "launcher_logs_external_kills" {
}
}
run "restart_on_kill_enabled" {
command = plan
variables {
agent_id = "foo"
restart_on_kill = true
restart_delay_seconds = 7
}
assert {
condition = strcontains(resource.coder_script.mux.script, "restart_on_kill_value=\"true\"")
error_message = "mux launcher must receive the restart_on_kill setting"
}
assert {
condition = strcontains(resource.coder_script.mux.script, "restart_delay_seconds_value=\"7\"")
error_message = "mux launcher must receive the configured restart delay"
}
assert {
condition = strcontains(resource.coder_script.mux.script, "Waiting $${RESTART_DELAY_SECONDS_VALUE} seconds before restarting mux after it exited.")
error_message = "mux launcher must log the restart delay before relaunching"
}
assert {
condition = strcontains(resource.coder_script.mux.script, "Removing $HOME/.mux/server.lock before restarting mux.")
error_message = "mux launcher must clean up the server lock before relaunching"
}
assert {
condition = !strcontains(resource.coder_script.mux.script, "\"$exit_code\" -le 128")
error_message = "mux launcher must no longer exclude non-signal exits from restart handling"
}
assert {
condition = !strcontains(resource.coder_script.mux.script, "1|2|15)")
error_message = "mux launcher must no longer exclude intentional signals from restart handling"
}
}
run "restart_on_kill_with_restart_cap" {
command = plan
variables {
agent_id = "foo"
restart_on_kill = true
restart_delay_seconds = 7
max_restart_attempts = 2
}
assert {
condition = strcontains(resource.coder_script.mux.script, "max_restart_attempts_value=\"2\"")
error_message = "mux launcher must receive the configured restart cap"
}
assert {
condition = strcontains(resource.coder_script.mux.script, "Mux will stop restarting after $${max_restart_attempts_value} restart attempts.")
error_message = "mux launcher must describe the configured restart cap"
}
assert {
condition = strcontains(resource.coder_script.mux.script, "Reached the max restart attempts limit ($MAX_RESTART_ATTEMPTS_VALUE); not restarting mux again.")
error_message = "mux launcher must log when it hits the restart cap"
}
}
run "invalid_max_restart_attempts" {
command = plan
variables {
agent_id = "foo"
max_restart_attempts = -1
}
expect_failures = [
var.max_restart_attempts
]
}
run "fractional_max_restart_attempts" {
command = plan
variables {
agent_id = "foo"
max_restart_attempts = 0.5
}
expect_failures = [
var.max_restart_attempts
]
}
run "invalid_restart_delay_seconds" {
command = plan
variables {
agent_id = "foo"
restart_delay_seconds = -1
}
expect_failures = [
var.restart_delay_seconds
]
}
run "custom_version" {
command = plan
+82 -9
View File
@@ -5,17 +5,30 @@ RESET='\033[0m'
MUX_BINARY="${INSTALL_PREFIX}/mux"
function run_mux() {
# Remove stale server lock if present
rm -f "$HOME/.mux/server.lock"
local port_value
local auth_token_value
local restart_on_kill_value
local restart_delay_seconds_value
local max_restart_attempts_value
port_value="${PORT}"
auth_token_value="${AUTH_TOKEN}"
restart_on_kill_value="${RESTART_ON_KILL}"
restart_delay_seconds_value="${RESTART_DELAY_SECONDS}"
max_restart_attempts_value="${MAX_RESTART_ATTEMPTS}"
if [ -z "$port_value" ]; then
port_value="4000"
fi
if [ -z "$restart_delay_seconds_value" ]; then
restart_delay_seconds_value="5"
fi
if [ -z "$max_restart_attempts_value" ]; then
max_restart_attempts_value="0"
fi
mkdir -p "$(dirname "${LOG_PATH}")"
# Build args for mux (POSIX-compatible, avoid bash arrays)
@@ -41,13 +54,24 @@ EOF_ARGS
echo "🚀 Starting mux server on port $port_value..."
echo "Check logs at ${LOG_PATH}!"
echo "Unexpected exits will be appended to ${LOG_PATH} by the launcher."
echo "Mux exit details will be appended to ${LOG_PATH} by the launcher."
if [ "$restart_on_kill_value" = true ]; then
echo "️ Auto-restart after mux exits is enabled with a $${restart_delay_seconds_value}-second delay."
if [ "$max_restart_attempts_value" = "0" ]; then
echo "️ Automatic restarts are unlimited for every mux exit."
else
echo "️ Mux will stop restarting after $${max_restart_attempts_value} restart attempts."
fi
fi
nohup env \
LOG_PATH="${LOG_PATH}" \
MUX_BINARY="$MUX_BINARY" \
AUTH_TOKEN="$auth_token_value" \
PORT_VALUE="$port_value" \
RESTART_ON_KILL_VALUE="$restart_on_kill_value" \
RESTART_DELAY_SECONDS_VALUE="$restart_delay_seconds_value" \
MAX_RESTART_ATTEMPTS_VALUE="$max_restart_attempts_value" \
bash -s -- "$@" > /dev/null 2>&1 << 'EOF_LAUNCHER' &
signal_name() {
local signal_number="$1"
@@ -82,6 +106,14 @@ append_kernel_kill_context() {
fi
}
cleanup_mux_lock() {
rm -f "$HOME/.mux/server.lock"
}
should_restart_mux() {
[ "$RESTART_ON_KILL_VALUE" = "true" ]
}
log_mux_exit() {
local mux_pid="$1"
local exit_code="$2"
@@ -114,11 +146,52 @@ log_mux_exit() {
echo "[$timestamp] Check the earlier mux log lines for any in-process crash breadcrumbs from mux itself."
}
MUX_SERVER_AUTH_TOKEN="$AUTH_TOKEN" PORT="$PORT_VALUE" "$MUX_BINARY" "$@" >> "$LOG_PATH" 2>&1 &
mux_pid=$!
wait "$mux_pid"
exit_code=$?
log_mux_exit "$mux_pid" "$exit_code" >> "$LOG_PATH" 2>&1
log_mux_restart_wait() {
local timestamp
timestamp="$(date -Iseconds 2> /dev/null || date)"
echo "[$timestamp] Waiting $${RESTART_DELAY_SECONDS_VALUE} seconds before restarting mux after it exited."
}
log_mux_restart_cleanup() {
local timestamp
timestamp="$(date -Iseconds 2> /dev/null || date)"
echo "[$timestamp] Removing $HOME/.mux/server.lock before restarting mux."
}
log_mux_restart_cap_reached() {
local timestamp
timestamp="$(date -Iseconds 2> /dev/null || date)"
echo "[$timestamp] Reached the max restart attempts limit ($MAX_RESTART_ATTEMPTS_VALUE); not restarting mux again."
}
restart_attempt_count=0
while true; do
cleanup_mux_lock
MUX_SERVER_AUTH_TOKEN="$AUTH_TOKEN" PORT="$PORT_VALUE" "$MUX_BINARY" "$@" >> "$LOG_PATH" 2>&1 &
mux_pid=$!
wait "$mux_pid"
exit_code=$?
log_mux_exit "$mux_pid" "$exit_code" >> "$LOG_PATH" 2>&1
if should_restart_mux; then
if [ "$MAX_RESTART_ATTEMPTS_VALUE" -gt 0 ] && [ "$restart_attempt_count" -ge "$MAX_RESTART_ATTEMPTS_VALUE" ]; then
log_mux_restart_cap_reached >> "$LOG_PATH" 2>&1
break
fi
restart_attempt_count=$((restart_attempt_count + 1))
log_mux_restart_wait >> "$LOG_PATH" 2>&1
sleep "$RESTART_DELAY_SECONDS_VALUE"
cleanup_mux_lock
log_mux_restart_cleanup >> "$LOG_PATH" 2>&1
continue
fi
break
done
EOF_LAUNCHER
}
# Check if mux is already installed for offline mode