From eb613a42a1e1f77dfcde0fc2536eadea8d60fb39 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Tue, 2 Jun 2026 18:41:04 +0000 Subject: [PATCH] fix(agent/agentcontext): address coder-agents-review round 4 CRF-40: ensure runDoneCh is closed even if NewWatcher fails so Close does not deadlock waiting on a goroutine that already exited. CRF-41: gate snapshot publishes with a monotonic resolveEpoch counter. Resync and resolveAndBroadcast drop m.mu around the filesystem walk; under concurrency a stale walk could overwrite a fresher one at a higher version number. Each pass now captures its epoch under the lock and skips the publish if a newer pass has started. Resync returns the currently published Snapshot in the stale case, which is guaranteed to be at least as fresh as the discarded result. CRF-42: hash the (capped) prefix of oversize SKILL.md files so an edit that keeps the file oversize still shifts the aggregate hash, matching readFileResource and preserving the change-detection contract. --- agent/agentcontext/manager.go | 40 ++++++++++++++++++++++++++++++++++- agent/agentcontext/resolve.go | 7 ++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/agent/agentcontext/manager.go b/agent/agentcontext/manager.go index 743cf3eb4d..55ed98a1c2 100644 --- a/agent/agentcontext/manager.go +++ b/agent/agentcontext/manager.go @@ -78,6 +78,14 @@ type Manager struct { snapshot Snapshot // version monotonically increases per resolve pass. version uint64 + // resolveEpoch increments at the start of every resolver + // pass that drops m.mu around the filesystem walk. Each + // pass captures the epoch it claimed; at publish time it + // compares its captured epoch against the current epoch and + // skips the publish if a newer pass has started, preventing + // an old walk's stale result from overwriting a newer one's + // fresh result at a higher version number. + resolveEpoch uint64 // subscribers receive a non-blocking signal whenever the // snapshot changes. Subscribers must drain their channel @@ -182,6 +190,10 @@ func (m *Manager) Run(ctx context.Context) error { m.running = true close(m.runStartedCh) m.mu.Unlock() + // Close any early-exit path so Close does not block on + // runDoneCh after Run already set running=true. The deferred + // close runs even when NewWatcher fails. + defer close(m.runDoneCh) watcher, err := NewWatcher(WatcherOptions{ Logger: m.logger.Named("watcher"), @@ -202,7 +214,6 @@ func (m *Manager) Run(ctx context.Context) error { m.mu.Unlock() watcher.Sync(ctx, roots) - defer close(m.runDoneCh) defer watcher.Close() for { @@ -386,6 +397,8 @@ func (m *Manager) Resync(ctx context.Context) (Snapshot, error) { resolver := m.resolver watcher := m.watcher schemaVersion := m.schemaVersion + m.resolveEpoch++ + myEpoch := m.resolveEpoch m.mu.Unlock() if ctxErr := ctx.Err(); ctxErr != nil { @@ -404,6 +417,20 @@ func (m *Manager) Resync(ctx context.Context) (Snapshot, error) { m.mu.Unlock() return m.Snapshot(), ErrManagerClosed } + if m.resolveEpoch != myEpoch { + // A newer resolve pass started while this one was + // walking the filesystem. The newer pass's data + // strictly supersedes ours, so skip the publish to + // avoid overwriting a fresher Snapshot at a higher + // version. Return the currently published Snapshot, + // which is at least as fresh as ours. + published := m.snapshot + m.mu.Unlock() + if watcher != nil { + watcher.Sync(ctx, roots) + } + return published, nil + } m.version++ snap.Version = m.version m.snapshot = snap @@ -510,6 +537,8 @@ func (m *Manager) resolveAndBroadcast(ctx context.Context) { resolver := m.resolver watcher := m.watcher schemaVersion := m.schemaVersion + m.resolveEpoch++ + myEpoch := m.resolveEpoch m.mu.Unlock() if err := ctx.Err(); err != nil { @@ -526,6 +555,15 @@ func (m *Manager) resolveAndBroadcast(ctx context.Context) { snap.SchemaVersion = schemaVersion m.mu.Lock() + if m.resolveEpoch != myEpoch { + // A newer resolve pass started while this one was + // walking the filesystem. Skip the publish so a + // stale-epoch result does not overwrite a fresher + // Snapshot at a higher version number. The newer + // pass will broadcast its own result. + m.mu.Unlock() + return + } m.version++ snap.Version = m.version m.snapshot = snap diff --git a/agent/agentcontext/resolve.go b/agent/agentcontext/resolve.go index 2e40bb8c87..c78769eb7e 100644 --- a/agent/agentcontext/resolve.go +++ b/agent/agentcontext/resolve.go @@ -443,6 +443,13 @@ func (r *Resolver) readSkillMeta(path string, info fs.FileInfo, userSource strin if safeUint64(info.Size()) > r.MaxResourceBytes { res.Status = StatusOversize res.Error = fmt.Sprintf("file size %d exceeds per-resource cap of %d bytes", info.Size(), r.MaxResourceBytes) + // Hash the (capped) prefix so an edit that keeps + // the file oversize still shifts the aggregate + // hash and triggers a re-broadcast. Mirrors the + // behavior in readFileResource. + if data, err := readFileCapped(path, safeInt64(r.MaxResourceBytes)); err == nil { + res.ContentHash = sha256.Sum256(data) + } return res, true } data, err := os.ReadFile(path)