mirror of
https://github.com/coder/coder.git
synced 2026-06-03 13:08:25 +00:00
3a9080fff6
Workspace-agent logs emitted while serving chatd-driven requests were not correlated with the originating chat, which made them hard to attribute to that chat. This change adds agent-side chat context middleware that parses `Coder-Chat-Id` once, enriches agent access logs and structured handler/background logs, and adds a chatd bridge log when chat headers are attached to an agent connection. Closes CODAGT-324.
387 lines
9.6 KiB
Go
387 lines
9.6 KiB
Go
package agentproc
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"sync"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"github.com/coder/coder/v2/agent/agentexec"
|
|
"github.com/coder/coder/v2/codersdk/workspacesdk"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
var (
|
|
errProcessNotFound = xerrors.New("process not found")
|
|
errProcessNotRunning = xerrors.New("process is not running")
|
|
|
|
// exitedProcessReapAge is how long an exited process is
|
|
// kept before being automatically removed from the map.
|
|
exitedProcessReapAge = 5 * time.Minute
|
|
)
|
|
|
|
// process represents a running or completed process.
|
|
type process struct {
|
|
mu sync.Mutex
|
|
id string
|
|
command string
|
|
workDir string
|
|
background bool
|
|
chatID string
|
|
cmd *exec.Cmd
|
|
cancel context.CancelFunc
|
|
buf *HeadTailBuffer
|
|
logger slog.Logger
|
|
running bool
|
|
exitCode *int
|
|
startedAt int64
|
|
exitedAt *int64
|
|
done chan struct{} // closed when process exits
|
|
}
|
|
|
|
// info returns a snapshot of the process state.
|
|
func (p *process) info() workspacesdk.ProcessInfo {
|
|
p.mu.Lock()
|
|
defer p.mu.Unlock()
|
|
|
|
return workspacesdk.ProcessInfo{
|
|
ID: p.id,
|
|
Command: p.command,
|
|
WorkDir: p.workDir,
|
|
Background: p.background,
|
|
Running: p.running,
|
|
ExitCode: p.exitCode,
|
|
StartedAt: p.startedAt,
|
|
ExitedAt: p.exitedAt,
|
|
}
|
|
}
|
|
|
|
// output returns the truncated output from the process buffer
|
|
// along with optional truncation metadata.
|
|
func (p *process) output() (string, *workspacesdk.ProcessTruncation) {
|
|
return p.buf.Output()
|
|
}
|
|
|
|
// manager tracks processes spawned by the agent.
|
|
type manager struct {
|
|
mu sync.Mutex
|
|
logger slog.Logger
|
|
execer agentexec.Execer
|
|
clock quartz.Clock
|
|
procs map[string]*process
|
|
closed bool
|
|
updateEnv func(current []string) (updated []string, err error)
|
|
workingDir func() string
|
|
}
|
|
|
|
// newManager creates a new process manager.
|
|
func newManager(logger slog.Logger, execer agentexec.Execer, updateEnv func(current []string) (updated []string, err error), workingDir func() string) *manager {
|
|
return &manager{
|
|
logger: logger,
|
|
execer: execer,
|
|
clock: quartz.NewReal(),
|
|
procs: make(map[string]*process),
|
|
updateEnv: updateEnv,
|
|
workingDir: workingDir,
|
|
}
|
|
}
|
|
|
|
// start spawns a new process. Both foreground and background
|
|
// processes use a long-lived context so the process survives
|
|
// the HTTP request lifecycle. The background flag only affects
|
|
// client-side polling behavior.
|
|
func (m *manager) start(req workspacesdk.StartProcessRequest, chatID string) (*process, error) {
|
|
m.mu.Lock()
|
|
if m.closed {
|
|
m.mu.Unlock()
|
|
return nil, xerrors.New("manager is closed")
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
id := uuid.New().String()
|
|
logger := m.logger
|
|
if chatID != "" {
|
|
logger = logger.With(slog.F("chat_id", chatID))
|
|
}
|
|
|
|
// Use a cancellable context so Close() can terminate
|
|
// all processes. context.Background() is the parent so
|
|
// the process is not tied to any HTTP request.
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
cmd := m.execer.CommandContext(ctx, "sh", "-c", req.Command)
|
|
cmd.Dir = m.resolveWorkDir(req.WorkDir)
|
|
cmd.Stdin = nil
|
|
cmd.SysProcAttr = procSysProcAttr()
|
|
|
|
// WaitDelay ensures cmd.Wait returns promptly after
|
|
// the process is killed, even if child processes are
|
|
// still holding the stdout/stderr pipes open.
|
|
cmd.WaitDelay = 5 * time.Second
|
|
|
|
buf := NewHeadTailBuffer()
|
|
cmd.Stdout = buf
|
|
cmd.Stderr = buf
|
|
|
|
// Build the process environment. If the manager has an
|
|
// updateEnv hook (provided by the agent), use it to get the
|
|
// full agent environment including GIT_ASKPASS, CODER_* vars,
|
|
// etc. Otherwise fall back to the current process env.
|
|
baseEnv := os.Environ()
|
|
if m.updateEnv != nil {
|
|
updated, err := m.updateEnv(baseEnv)
|
|
if err != nil {
|
|
logger.Warn(
|
|
context.Background(),
|
|
"failed to update command environment, falling back to os env",
|
|
slog.Error(err),
|
|
)
|
|
} else {
|
|
baseEnv = updated
|
|
}
|
|
}
|
|
|
|
// Always set cmd.Env explicitly so that req.Env overrides
|
|
// are applied on top of the full agent environment.
|
|
cmd.Env = baseEnv
|
|
for k, v := range req.Env {
|
|
cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
|
|
}
|
|
// Propagate the chat ID so child processes (e.g.
|
|
// GIT_ASKPASS) can send it back to the server.
|
|
if chatID != "" {
|
|
cmd.Env = append(cmd.Env, fmt.Sprintf("CODER_CHAT_ID=%s", chatID))
|
|
}
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
cancel()
|
|
return nil, xerrors.Errorf("start process: %w", err)
|
|
}
|
|
|
|
now := m.clock.Now().Unix()
|
|
proc := &process{
|
|
id: id,
|
|
command: req.Command,
|
|
workDir: cmd.Dir,
|
|
background: req.Background,
|
|
chatID: chatID,
|
|
cmd: cmd,
|
|
cancel: cancel,
|
|
buf: buf,
|
|
logger: logger,
|
|
running: true,
|
|
startedAt: now,
|
|
done: make(chan struct{}),
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if m.closed {
|
|
m.mu.Unlock()
|
|
// Manager closed between our check and now. Kill the
|
|
// process we just started.
|
|
cancel()
|
|
_ = cmd.Wait()
|
|
return nil, xerrors.New("manager is closed")
|
|
}
|
|
m.procs[id] = proc
|
|
m.mu.Unlock()
|
|
|
|
go func() {
|
|
err := cmd.Wait()
|
|
exitedAt := m.clock.Now().Unix()
|
|
|
|
proc.mu.Lock()
|
|
proc.running = false
|
|
proc.exitedAt = &exitedAt
|
|
code := 0
|
|
if err != nil {
|
|
// Extract the exit code from the error.
|
|
var exitErr *exec.ExitError
|
|
if xerrors.As(err, &exitErr) {
|
|
code = exitErr.ExitCode()
|
|
} else {
|
|
// Unknown error; use -1 as a sentinel.
|
|
code = -1
|
|
proc.logger.Warn(
|
|
context.Background(),
|
|
"process wait returned non-exit error",
|
|
slog.F("id", id),
|
|
slog.Error(err),
|
|
)
|
|
}
|
|
}
|
|
proc.exitCode = &code
|
|
proc.mu.Unlock()
|
|
|
|
// Wake any waiters blocked on new output or
|
|
// process exit before closing the done channel.
|
|
proc.buf.Close()
|
|
close(proc.done)
|
|
}()
|
|
|
|
return proc, nil
|
|
}
|
|
|
|
// get returns a process by ID.
|
|
func (m *manager) get(id string) (*process, bool) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
proc, ok := m.procs[id]
|
|
return proc, ok
|
|
}
|
|
|
|
// list returns info about all tracked processes. Exited
|
|
// processes older than exitedProcessReapAge are removed.
|
|
// If chatID is non-empty, only processes belonging to that
|
|
// chat are returned.
|
|
func (m *manager) list(chatID string) []workspacesdk.ProcessInfo {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
now := m.clock.Now()
|
|
infos := make([]workspacesdk.ProcessInfo, 0, len(m.procs))
|
|
for id, proc := range m.procs {
|
|
info := proc.info()
|
|
// Reap processes that exited more than 5 minutes ago
|
|
// to prevent unbounded map growth.
|
|
if !info.Running && info.ExitedAt != nil {
|
|
exitedAt := time.Unix(*info.ExitedAt, 0)
|
|
if now.Sub(exitedAt) > exitedProcessReapAge {
|
|
delete(m.procs, id)
|
|
continue
|
|
}
|
|
}
|
|
// Filter by chatID if provided.
|
|
if chatID != "" && proc.chatID != chatID {
|
|
continue
|
|
}
|
|
infos = append(infos, info)
|
|
}
|
|
return infos
|
|
}
|
|
|
|
// signal sends a signal to a running process. It returns
|
|
// sentinel errors errProcessNotFound and errProcessNotRunning
|
|
// so callers can distinguish failure modes.
|
|
func (m *manager) signal(id string, sig string) error {
|
|
m.mu.Lock()
|
|
proc, ok := m.procs[id]
|
|
m.mu.Unlock()
|
|
|
|
if !ok {
|
|
return errProcessNotFound
|
|
}
|
|
|
|
proc.mu.Lock()
|
|
defer proc.mu.Unlock()
|
|
|
|
if !proc.running {
|
|
return errProcessNotRunning
|
|
}
|
|
|
|
switch sig {
|
|
case "kill":
|
|
// Use process group kill to ensure child processes
|
|
// (e.g. from shell pipelines) are also killed.
|
|
if err := signalProcess(proc.cmd.Process, syscall.SIGKILL); err != nil {
|
|
return xerrors.Errorf("kill process: %w", err)
|
|
}
|
|
case "terminate":
|
|
// Use process group signal to ensure child processes
|
|
// are also terminated.
|
|
if err := signalProcess(proc.cmd.Process, syscall.SIGTERM); err != nil {
|
|
return xerrors.Errorf("terminate process: %w", err)
|
|
}
|
|
default:
|
|
return xerrors.Errorf("unsupported signal %q", sig)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Close kills all running processes and prevents new ones from
|
|
// starting. It cancels each process's context, which causes
|
|
// CommandContext to kill the process and its pipe goroutines to
|
|
// drain.
|
|
func (m *manager) Close() error {
|
|
m.mu.Lock()
|
|
if m.closed {
|
|
m.mu.Unlock()
|
|
return nil
|
|
}
|
|
m.closed = true
|
|
procs := make([]*process, 0, len(m.procs))
|
|
for _, p := range m.procs {
|
|
procs = append(procs, p)
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
for _, p := range procs {
|
|
p.cancel()
|
|
}
|
|
|
|
// Wait for all processes to exit.
|
|
for _, p := range procs {
|
|
<-p.done
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// waitForOutput blocks until the buffer is closed (process
|
|
// exited) or the context is canceled. Returns nil when the
|
|
// buffer closed, ctx.Err() when the context expired.
|
|
func (p *process) waitForOutput(ctx context.Context) error {
|
|
p.buf.cond.L.Lock()
|
|
defer p.buf.cond.L.Unlock()
|
|
|
|
nevermind := make(chan struct{})
|
|
defer close(nevermind)
|
|
go func() {
|
|
select {
|
|
case <-ctx.Done():
|
|
// Acquire the lock before broadcasting to
|
|
// guarantee the waiter has entered cond.Wait()
|
|
// (which atomically releases the lock).
|
|
// Without this, a Broadcast between the loop
|
|
// predicate check and cond.Wait() is lost.
|
|
p.buf.cond.L.Lock()
|
|
defer p.buf.cond.L.Unlock()
|
|
p.buf.cond.Broadcast()
|
|
case <-nevermind:
|
|
}
|
|
}()
|
|
|
|
for ctx.Err() == nil && !p.buf.closed {
|
|
p.buf.cond.Wait()
|
|
}
|
|
return ctx.Err()
|
|
}
|
|
|
|
// resolveWorkDir returns the directory a process should start in.
|
|
// Priority: explicit request dir > agent configured dir > $HOME.
|
|
// Falls through when a candidate is empty or does not exist on
|
|
// disk, matching the behavior of SSH sessions.
|
|
func (m *manager) resolveWorkDir(requested string) string {
|
|
if requested != "" {
|
|
return requested
|
|
}
|
|
if m.workingDir != nil {
|
|
if dir := m.workingDir(); dir != "" {
|
|
if info, err := os.Stat(dir); err == nil && info.IsDir() {
|
|
return dir
|
|
}
|
|
}
|
|
}
|
|
if home, err := os.UserHomeDir(); err == nil {
|
|
return home
|
|
}
|
|
return ""
|
|
}
|