Files
coder/aibridge/keypool/keypool.go
T
github-actions[bot] 663f1ee834 fix: track credential hint across key failover attempts in aibridge (#25735) (#25847)
Cherry-pick of https://github.com/coder/coder/pull/25735

Original PR: #25735 — fix: track credential hint across key failover
attempts in aibridge
Merge commit: 7b903cad73
Requested by: @ssncferreira

Co-authored-by: Susana Ferreira <susana@coder.com>
2026-05-29 12:59:39 -04:00

279 lines
7.0 KiB
Go

package keypool
import (
"fmt"
"sync"
"time"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/aibridge/utils"
"github.com/coder/quartz"
)
// Configuration validation type errors. These surface when the
// pool is built from invalid input.
var (
// ErrNoKeys is returned when the input is empty.
ErrNoKeys = xerrors.New("no keys provided")
// ErrDuplicateKey is returned when the input contains
// duplicate key values.
ErrDuplicateKey = xerrors.New("duplicate key")
)
// ErrorKind classifies a runtime key-pool failure.
type ErrorKind int
const (
// ErrorKindRateLimited means no key is currently available
// but at least one key will recover after a cooldown.
ErrorKindRateLimited ErrorKind = iota
// ErrorKindPermanent means every key is permanently marked
// and no key can satisfy the request.
ErrorKindPermanent
)
// Error is returned when no key is available for the
// current attempt. RetryAfter is the soonest remaining
// cooldown across the pool.
type Error struct {
Kind ErrorKind
RetryAfter time.Duration
}
func (e *Error) Error() string {
switch e.Kind {
case ErrorKindPermanent:
return "all configured keys failed authentication"
case ErrorKindRateLimited:
return fmt.Sprintf("all configured keys are rate-limited (retry after %s)", e.RetryAfter)
default:
return "key pool error"
}
}
// KeyState represents the current state of a key in the pool.
type KeyState int
const (
// KeyStateValid means the key is available for use.
KeyStateValid KeyState = iota
// KeyStateTemporary means the key is temporarily unavailable
// (e.g. rate-limited) and will recover after a cooldown.
KeyStateTemporary
// KeyStatePermanent means the key is permanently unavailable
// (e.g. revoked or unauthorized) until process restart.
KeyStatePermanent
)
// defaultCooldown is applied when a key is marked temporary
// with a zero or negative cooldown duration.
const defaultCooldown = 60 * time.Second
// Key holds a key value and its runtime state.
type Key struct {
value string
permanent bool
cooldownUntil time.Time
mu sync.RWMutex
clock quartz.Clock
}
// Pool manages a set of keys with state tracking and
// cooldown expiry. It is safe for concurrent use.
type Pool struct {
keys []Key
}
// New creates a pool from the given keys. All keys start in
// the valid state. Returns ErrNoKeys if keys is empty and
// ErrDuplicateKey if any key appears more than once.
func New(keys []string, clk quartz.Clock) (*Pool, error) {
if len(keys) == 0 {
return nil, ErrNoKeys
}
pool := &Pool{
keys: make([]Key, len(keys)),
}
seen := make(map[string]struct{}, len(keys))
for i, val := range keys {
if _, exists := seen[val]; exists {
return nil, ErrDuplicateKey
}
seen[val] = struct{}{}
pool.keys[i] = Key{
clock: clk,
value: val,
}
}
return pool, nil
}
// Value returns the key string.
func (k *Key) Value() string {
return k.value
}
// Hint returns a masked, identifiable fragment of the key, suitable
// for logs and persisted records.
func (k *Key) Hint() string {
return utils.MaskSecret(k.value)
}
// State returns the current state of the key, derived from its
// permanent flag and cooldown deadline.
func (k *Key) State() KeyState {
k.mu.RLock()
defer k.mu.RUnlock()
if k.permanent {
return KeyStatePermanent
}
// Cooldown still active: key is temporarily unavailable.
if k.clock.Now().Before(k.cooldownUntil) {
return KeyStateTemporary
}
return KeyStateValid
}
// stateAndCooldown returns the key's state and remaining
// cooldown as a single atomic snapshot.
func (k *Key) stateAndCooldown() (KeyState, time.Duration) {
k.mu.RLock()
defer k.mu.RUnlock()
if k.permanent {
return KeyStatePermanent, 0
}
now := k.clock.Now()
if now.Before(k.cooldownUntil) {
return KeyStateTemporary, k.cooldownUntil.Sub(now)
}
return KeyStateValid, 0
}
// MarkTemporary marks the key as temporarily unavailable with
// the specified cooldown duration. Returns true if this call
// transitions the key to temporary.
func (k *Key) MarkTemporary(cooldown time.Duration) bool {
k.mu.Lock()
defer k.mu.Unlock()
// Permanent is irreversible.
if k.permanent {
return false
}
if cooldown <= 0 {
cooldown = defaultCooldown
}
now := k.clock.Now()
// Used to detect the valid -> temporary transition.
inCooldown := k.cooldownUntil.After(now)
newDeadline := now.Add(cooldown)
// In case the key has a later expiry, keep it.
if k.cooldownUntil.After(newDeadline) {
return false
}
k.cooldownUntil = newDeadline
return !inCooldown
}
// MarkPermanent marks the key as permanently unavailable. This
// is a terminal state. Returns true if this call transitions
// the key to permanent.
func (k *Key) MarkPermanent() bool {
k.mu.Lock()
defer k.mu.Unlock()
if k.permanent {
return false
}
k.permanent = true
return true
}
// keyPoolError returns an Error summarizing why no
// key is currently available. When at least one key is
// temporary, the smallest remaining cooldown is used as the
// retry-after.
func (p *Pool) keyPoolError() *Error {
var retryAfter time.Duration
var hasCooldown bool
for i := range p.keys {
state, cooldown := p.keys[i].stateAndCooldown()
switch state {
// Recoverable now: a key's cooldown expired between the walker's
// check and this scan. Return Retry-After: 0 to indicate that
// an immediate retry will succeed.
case KeyStateValid:
return &Error{Kind: ErrorKindRateLimited}
// Recoverable later: track soonest remaining cooldown.
case KeyStateTemporary:
if !hasCooldown || cooldown < retryAfter {
retryAfter = cooldown
hasCooldown = true
}
// Permanent: keep walking to confirm error type.
default:
}
}
if hasCooldown {
return &Error{Kind: ErrorKindRateLimited, RetryAfter: retryAfter}
}
return &Error{Kind: ErrorKindPermanent}
}
// PoolState returns a snapshot of each key's state in the pool's
// original order, used by tests and other diagnostic callers. Use
// Walker for the failover iteration path.
func (p *Pool) PoolState() []KeyState {
states := make([]KeyState, len(p.keys))
for i := range p.keys {
states[i] = p.keys[i].State()
}
return states
}
// Walker traverses a Pool for a single request. Each request
// creates its own walker so that it can independently iterate
// through keys without interfering with other requests.
type Walker struct {
pool *Pool
pos int // Next index to consider.
}
// Walker creates a new Walker that follows a primary-with-fallback
// strategy, starting from the first key in the pool. The walker
// is not safe for concurrent use. It is intended for a single
// request's failover loop.
func (p *Pool) Walker() *Walker {
return &Walker{pool: p, pos: 0}
}
// Next returns a Key handle for the next available key without
// modifying the pool state.
//
// Returns *Error when no more keys are available.
func (w *Walker) Next() (*Key, *Error) {
for i := w.pos; i < len(w.pool.keys); i++ {
key := &w.pool.keys[i]
if key.State() != KeyStateValid {
continue
}
// Key is available.
w.pos = i + 1
return key, nil
}
// No keys available.
return nil, w.pool.keyPoolError()
}