mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
fix(agent/agentcontext): address coder-agents-review round 4
CRF-40: ensure runDoneCh is closed even if NewWatcher fails so Close does not deadlock waiting on a goroutine that already exited. CRF-41: gate snapshot publishes with a monotonic resolveEpoch counter. Resync and resolveAndBroadcast drop m.mu around the filesystem walk; under concurrency a stale walk could overwrite a fresher one at a higher version number. Each pass now captures its epoch under the lock and skips the publish if a newer pass has started. Resync returns the currently published Snapshot in the stale case, which is guaranteed to be at least as fresh as the discarded result. CRF-42: hash the (capped) prefix of oversize SKILL.md files so an edit that keeps the file oversize still shifts the aggregate hash, matching readFileResource and preserving the change-detection contract.
This commit is contained in:
@@ -78,6 +78,14 @@ type Manager struct {
|
||||
snapshot Snapshot
|
||||
// version monotonically increases per resolve pass.
|
||||
version uint64
|
||||
// resolveEpoch increments at the start of every resolver
|
||||
// pass that drops m.mu around the filesystem walk. Each
|
||||
// pass captures the epoch it claimed; at publish time it
|
||||
// compares its captured epoch against the current epoch and
|
||||
// skips the publish if a newer pass has started, preventing
|
||||
// an old walk's stale result from overwriting a newer one's
|
||||
// fresh result at a higher version number.
|
||||
resolveEpoch uint64
|
||||
|
||||
// subscribers receive a non-blocking signal whenever the
|
||||
// snapshot changes. Subscribers must drain their channel
|
||||
@@ -182,6 +190,10 @@ func (m *Manager) Run(ctx context.Context) error {
|
||||
m.running = true
|
||||
close(m.runStartedCh)
|
||||
m.mu.Unlock()
|
||||
// Close any early-exit path so Close does not block on
|
||||
// runDoneCh after Run already set running=true. The deferred
|
||||
// close runs even when NewWatcher fails.
|
||||
defer close(m.runDoneCh)
|
||||
|
||||
watcher, err := NewWatcher(WatcherOptions{
|
||||
Logger: m.logger.Named("watcher"),
|
||||
@@ -202,7 +214,6 @@ func (m *Manager) Run(ctx context.Context) error {
|
||||
m.mu.Unlock()
|
||||
watcher.Sync(ctx, roots)
|
||||
|
||||
defer close(m.runDoneCh)
|
||||
defer watcher.Close()
|
||||
|
||||
for {
|
||||
@@ -386,6 +397,8 @@ func (m *Manager) Resync(ctx context.Context) (Snapshot, error) {
|
||||
resolver := m.resolver
|
||||
watcher := m.watcher
|
||||
schemaVersion := m.schemaVersion
|
||||
m.resolveEpoch++
|
||||
myEpoch := m.resolveEpoch
|
||||
m.mu.Unlock()
|
||||
|
||||
if ctxErr := ctx.Err(); ctxErr != nil {
|
||||
@@ -404,6 +417,20 @@ func (m *Manager) Resync(ctx context.Context) (Snapshot, error) {
|
||||
m.mu.Unlock()
|
||||
return m.Snapshot(), ErrManagerClosed
|
||||
}
|
||||
if m.resolveEpoch != myEpoch {
|
||||
// A newer resolve pass started while this one was
|
||||
// walking the filesystem. The newer pass's data
|
||||
// strictly supersedes ours, so skip the publish to
|
||||
// avoid overwriting a fresher Snapshot at a higher
|
||||
// version. Return the currently published Snapshot,
|
||||
// which is at least as fresh as ours.
|
||||
published := m.snapshot
|
||||
m.mu.Unlock()
|
||||
if watcher != nil {
|
||||
watcher.Sync(ctx, roots)
|
||||
}
|
||||
return published, nil
|
||||
}
|
||||
m.version++
|
||||
snap.Version = m.version
|
||||
m.snapshot = snap
|
||||
@@ -510,6 +537,8 @@ func (m *Manager) resolveAndBroadcast(ctx context.Context) {
|
||||
resolver := m.resolver
|
||||
watcher := m.watcher
|
||||
schemaVersion := m.schemaVersion
|
||||
m.resolveEpoch++
|
||||
myEpoch := m.resolveEpoch
|
||||
m.mu.Unlock()
|
||||
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -526,6 +555,15 @@ func (m *Manager) resolveAndBroadcast(ctx context.Context) {
|
||||
snap.SchemaVersion = schemaVersion
|
||||
|
||||
m.mu.Lock()
|
||||
if m.resolveEpoch != myEpoch {
|
||||
// A newer resolve pass started while this one was
|
||||
// walking the filesystem. Skip the publish so a
|
||||
// stale-epoch result does not overwrite a fresher
|
||||
// Snapshot at a higher version number. The newer
|
||||
// pass will broadcast its own result.
|
||||
m.mu.Unlock()
|
||||
return
|
||||
}
|
||||
m.version++
|
||||
snap.Version = m.version
|
||||
m.snapshot = snap
|
||||
|
||||
@@ -443,6 +443,13 @@ func (r *Resolver) readSkillMeta(path string, info fs.FileInfo, userSource strin
|
||||
if safeUint64(info.Size()) > r.MaxResourceBytes {
|
||||
res.Status = StatusOversize
|
||||
res.Error = fmt.Sprintf("file size %d exceeds per-resource cap of %d bytes", info.Size(), r.MaxResourceBytes)
|
||||
// Hash the (capped) prefix so an edit that keeps
|
||||
// the file oversize still shifts the aggregate
|
||||
// hash and triggers a re-broadcast. Mirrors the
|
||||
// behavior in readFileResource.
|
||||
if data, err := readFileCapped(path, safeInt64(r.MaxResourceBytes)); err == nil {
|
||||
res.ContentHash = sha256.Sum256(data)
|
||||
}
|
||||
return res, true
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
|
||||
Reference in New Issue
Block a user