Files
coder/agent/reconnectingpty/reconnectingpty.go
T
Spike Curtis bddb808b25 chore: arrange imports in a standard way (#21452)
Fixes all our Go file imports to match the preferred spec that we've _mostly_ been using. For example:

```
import (
	"context"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"golang.org/x/xerrors"
	"gopkg.in/natefinch/lumberjack.v2"

	"cdr.dev/slog/v3"
	"github.com/coder/coder/v2/codersdk/agentsdk"
	"github.com/coder/serpent"
)
```

3 groups: standard library, 3rd-party libs, Coder libs.

This PR makes the change across the codebase. The PR in the stack above modifies our formatting to maintain this state of affairs, and is a separate PR so it's possible to review that one in detail.
2026-01-08 15:24:11 +04:00

236 lines
7.2 KiB
Go

package reconnectingpty
import (
"context"
"encoding/json"
"io"
"net"
"os/exec"
"runtime"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/agent/agentexec"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/pty"
)
// attachTimeout is the initial timeout for attaching. It will probably be far
// shorter than the reconnect timeout (Options.Timeout) in most cases, although
// in tests it might be longer. It should be at least long enough for the first
// screen attach to be able to start up the daemon and for the buffered pty to
// start.
// NOTE(review): nothing in this part of the file reads attachTimeout;
// presumably the backend implementations apply it — confirm.
const attachTimeout = 30 * time.Second
// Options allows configuring the reconnecting pty. New may modify the value it
// is given (it fills in a default Timeout).
type Options struct {
// Timeout describes how long to keep the pty alive without any connections.
// Once elapsed the pty will be killed. New replaces a zero value with five
// minutes.
Timeout time.Duration
// Metrics tracks various error counters.
Metrics *prometheus.CounterVec
// BackendType specifies the ReconnectingPTY backend to use. New uses the
// screen backend for "screen" and the buffered backend for any other
// non-empty value. When empty, New picks a backend automatically.
BackendType string
}
// ReconnectingPTY is a pty that can be reconnected within a timeout and to
// simultaneous connections. The reconnecting pty can be backed by screen if
// installed or a (buggy) buffer replay fallback. Use New to obtain an
// implementation.
type ReconnectingPTY interface {
// Attach pipes the connection and pty, spawning it if necessary, replays
// history, then blocks until EOF, an error, or the context's end. The
// connection is expected to send JSON-encoded messages and accept raw output
// from the pty. If the context ends or the process dies the connection will
// be detached.
// NOTE(review): height and width are presumably the size to apply to the pty
// for this attachment, and connID an identifier for conn — confirm against
// the backend implementations.
Attach(ctx context.Context, connID string, conn net.Conn, height, width uint16, logger slog.Logger) error
// Wait waits for the reconnecting pty to close. The underlying process might
// still be exiting.
Wait()
// Close kills the reconnecting pty process.
// NOTE(review): err is presumably recorded as the reason for closing —
// confirm against the backend implementations.
Close(err error)
}
// New builds a reconnecting pty around the provided command. Failures to start
// are not reported here; they surface from Attach(). The reconnecting pty
// closes itself (and every connection to it) when nothing has been attached
// for the duration of the timeout, when the context ends, or when the process
// exits (buffered backend only).
//
// A zero options.Timeout is replaced in place with five minutes. An empty
// options.BackendType selects a backend automatically; "screen" selects the
// screen backend and any other value selects the buffered backend.
func New(ctx context.Context, logger slog.Logger, execer agentexec.Execer, cmd *pty.Cmd, options *Options) ReconnectingPTY {
	if options.Timeout == 0 {
		options.Timeout = 5 * time.Minute
	}

	// Screen seems flaky on Darwin. Locally the tests pass 100% of the time (100
	// runs) but in CI screen often incorrectly claims the session name does not
	// exist even though screen -list shows it. For now, restrict screen to
	// Linux.
	detected := "buffered"
	if runtime.GOOS == "linux" {
		if _, lookErr := exec.LookPath("screen"); lookErr == nil {
			detected = "screen"
		}
	}

	// An explicitly requested backend takes precedence over the detected one.
	backendType := options.BackendType
	if backendType == "" {
		backendType = detected
	}

	logger.Info(ctx, "start reconnecting pty", slog.F("backend_type", backendType))

	if backendType == "screen" {
		return newScreen(ctx, logger, execer, cmd, options)
	}
	return newBuffered(ctx, logger, execer, cmd, options)
}
// heartbeat keeps timer from expiring for as long as ctx is alive by resetting
// it to timeout twice per timeout period. It blocks until ctx ends and resets
// the timer one final time on the way out.
func heartbeat(ctx context.Context, timer *time.Timer, timeout time.Duration) {
	// The timer may already be close to firing, so push it out right away.
	timer.Reset(timeout)
	// Reset once more when returning so the pty stays up for the full timeout
	// after the context ends.
	defer timer.Reset(timeout)

	ticker := time.NewTicker(timeout / 2)
	defer ticker.Stop()

	done := ctx.Done()
	for {
		select {
		case <-ticker.C:
			timer.Reset(timeout)
		case <-done:
			return
		}
	}
}
// State represents the current state of the reconnecting pty. States are
// sequential and will only move forward; ptyState.setState ignores any attempt
// to move to an equal or lower state.
type State int
// The values below are ordered: each state compares greater than the one
// before it, which setState and the waitForState helpers rely on.
// NOTE(review): iota is assigned without naming the State type, so these are
// untyped integer constants rather than State values. Writing
// `StateStarting State = iota` would type them; confirm no caller depends on
// using them as plain integers before changing it.
const (
// StateStarting is the default/start state. Attaching will block until the
// reconnecting pty becomes ready.
StateStarting = iota
// StateReady means the reconnecting pty is ready to be attached.
StateReady
// StateClosing means the reconnecting pty has begun closing. The underlying
// process may still be exiting. Attaching will result in an error.
StateClosing
// StateDone means the reconnecting pty has completely shut down and the
// process has exited. Attaching will result in an error.
StateDone
)
// ptyState is a helper for tracking the reconnecting PTY's state. Create one
// with newState; state changes go through setState and are observed with
// waitForState or waitForStateOrContext.
type ptyState struct {
// cond broadcasts state changes and any accompanying errors. Its locker
// (cond.L) guards the error and state fields.
cond *sync.Cond
// error describes the error that caused the state change, if there was one.
// setState replaces it every time the state advances. It is not safe to
// access outside of cond.L.
error error
// state holds the current reconnecting pty state. It is not safe to access
// this outside of cond.L.
state State
}
// newState returns a ptyState in the starting state with no error and a
// condition variable backed by a fresh mutex.
func newState() *ptyState {
	s := new(ptyState)
	s.cond = sync.NewCond(new(sync.Mutex))
	s.state = StateStarting
	return s
}
// setState advances to the provided state and wakes every waiter, but only
// when that state is strictly greater than the current one. On an advance err
// replaces whatever error was stored before (including with nil); otherwise
// neither the state nor the stored error changes and nobody is woken.
func (s *ptyState) setState(state State, err error) {
	s.cond.L.Lock()
	defer s.cond.L.Unlock()

	// States never regress. For example, closing after the process is already
	// done must leave us in the done state rather than the closing state.
	if state > s.state {
		s.state, s.error = state, err
		s.cond.Broadcast()
	}
}
// waitForState blocks until the current state is at least the provided state,
// then returns the current state (which may be later than the one requested)
// together with the stored error.
func (s *ptyState) waitForState(state State) (State, error) {
	s.cond.L.Lock()
	defer s.cond.L.Unlock()

	// Wait releases the lock while parked and reacquires it before returning,
	// so the condition is always re-evaluated under the lock.
	for s.state < state {
		s.cond.Wait()
	}
	reached, reason := s.state, s.error
	return reached, reason
}
// waitForStateOrContext blocks until the state or a greater one is reached or
// the provided context ends.
//
// If the context has ended by the time the wait finishes, the current state is
// returned together with the context's error, even when the requested state
// was also reached. Otherwise the current state and the stored error are
// returned.
func (s *ptyState) waitForStateOrContext(ctx context.Context, state State) (State, error) {
	s.cond.L.Lock()
	defer s.cond.L.Unlock()

	// A sync.Cond cannot wait on a context, so a helper goroutine turns
	// cancellation into a broadcast. Closing nevermind releases the helper when
	// this call returns for any other reason.
	nevermind := make(chan struct{})
	defer close(nevermind)
	go func() {
		select {
		case <-ctx.Done():
			// Wake up when the context ends. The lock must be held while
			// broadcasting: the waiter keeps it from its ctx.Err() check until
			// it is registered inside Wait, so taking it here guarantees the
			// broadcast cannot land in that gap and be lost, which would leave
			// the waiter blocked until some unrelated state change.
			s.cond.L.Lock()
			defer s.cond.L.Unlock()
			s.cond.Broadcast()
		case <-nevermind:
		}
	}()

	for ctx.Err() == nil && state > s.state {
		s.cond.Wait()
	}
	if err := ctx.Err(); err != nil {
		return s.state, err
	}
	return s.state, s.error
}
// readConnLoop decodes a stream of JSON requests from conn. For each request
// it writes the request's data to the ptty's input and then applies the
// requested size when both dimensions are non-zero. It blocks until conn
// reaches EOF, a request fails to decode, or writing to ptty fails. A failed
// resize is logged and counted but does not stop the loop.
func readConnLoop(ctx context.Context, conn net.Conn, ptty pty.PTYCmd, metrics *prometheus.CounterVec, logger slog.Logger) {
	dec := json.NewDecoder(conn)
	for {
		var msg workspacesdk.ReconnectingPTYRequest
		if decodeErr := dec.Decode(&msg); decodeErr != nil {
			// EOF is the normal way for the stream to end; anything else is
			// worth a warning.
			if !xerrors.Is(decodeErr, io.EOF) {
				logger.Warn(ctx, "reconnecting pty failed with read error", slog.Error(decodeErr))
			}
			return
		}

		if _, writeErr := ptty.InputWriter().Write([]byte(msg.Data)); writeErr != nil {
			logger.Warn(ctx, "reconnecting pty failed with write error", slog.Error(writeErr))
			metrics.WithLabelValues("input_writer").Add(1)
			return
		}

		// Only resize when the request carries both dimensions.
		if msg.Height != 0 && msg.Width != 0 {
			if resizeErr := ptty.Resize(msg.Height, msg.Width); resizeErr != nil {
				// Not fatal: keep serving input after a failed resize.
				logger.Warn(ctx, "reconnecting pty resize failed, but will continue", slog.Error(resizeErr))
				metrics.WithLabelValues("resize").Add(1)
			}
		}
	}
}