fix(agent): make manifest-driven resync asynchronous and ctx-aware

The handleManifest Resync call was synchronous; on slow filesystems
(CI macOS and Windows runners walking a real $HOME) the resolver
walk could outlive the agent's graceful shutdown window, leaking a
goroutine inside filepath.WalkDir and tripping goleak in the agent
test suite.

Two changes break the leak:

- New Manager.Trigger() queues an asynchronous re-resolve on the
  trigger channel. handleManifest now calls Trigger instead of
  Resync, so manifest handling returns immediately and the
  background walk is owned by the Manager.Run goroutine that
  already respects gracefulCtx cancellation.
- Resolver.walkDir threads ctx into the filepath.WalkDir callback,
  so when ctx is canceled, the walk aborts after the current
  directory read instead of running to completion.
This commit is contained in:
Kyle Carberry
2026-06-02 18:08:58 +00:00
parent 5f48ef872e
commit ef1e235804
3 changed files with 25 additions and 9 deletions
+6 -6
View File
@@ -1384,13 +1384,13 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
// Manifest just landed; the agentcontext manager now has
// a working directory to scan and a known set of scan
// roots. Trigger a resync so the snapshot reflects the
// workspace immediately instead of waiting for the next
// filesystem event.
// roots. Queue an asynchronous re-resolve so the snapshot
// reflects the workspace immediately instead of waiting
// for the next filesystem event. The result is handled
// by the Manager.Run loop, which respects gracefulCtx
// cancellation during shutdown.
if a.contextManager != nil {
if _, resyncErr := a.contextManager.Resync(ctx); resyncErr != nil {
a.logger.Debug(ctx, "agentcontext resync after manifest failed", slog.Error(resyncErr))
}
a.contextManager.Trigger()
}
// Write secret files after signaling manifest readiness so that network
+11
View File
@@ -439,6 +439,17 @@ func (m *Manager) signal() {
}
}
// Trigger requests an asynchronous re-resolve and returns without
// waiting for it. The method only signals the Manager; the
// filesystem walk itself is carried out in the background by the
// Run goroutine, which broadcasts once the walk has finished.
//
// Reach for Trigger when the watcher should pick up a changed
// working directory or scan-root set and the caller has no need
// for the new Snapshot synchronously.
//
// When Run has not started, or the Manager is closed, Trigger
// does nothing.
func (m *Manager) Trigger() {
	m.signal()
}
// scanRootsLocked returns the list of ScanRoots to feed the
// resolver and watcher. The Manager's mutex must be held.
func (m *Manager) scanRootsLocked() []ScanRoot {
+8 -3
View File
@@ -242,7 +242,7 @@ func (r *Resolver) walk(ctx context.Context, roots []ScanRoot) (resources []Reso
}
continue
}
walkErr := r.walkDir(root, &resources, seenID)
walkErr := r.walkDir(ctx, root, &resources, seenID)
if walkErr != nil {
snapErrs = append(snapErrs, fmt.Sprintf("walk %q: %s", root.Path, walkErr))
}
@@ -251,12 +251,17 @@ func (r *Resolver) walk(ctx context.Context, roots []ScanRoot) (resources []Reso
}
// walkDir performs the recursive descent for a single scan
// directory. It honors r.MaxDepth and skipDirNames.
func (r *Resolver) walkDir(root ScanRoot, out *[]Resource, seenID map[string]struct{}) error {
// directory. It honors r.MaxDepth and skipDirNames. The ctx is
// checked inside the WalkDir callback so cancellation
// terminates the walk even mid-root.
func (r *Resolver) walkDir(ctx context.Context, root ScanRoot, out *[]Resource, seenID map[string]struct{}) error {
rootDepth := strings.Count(filepath.Clean(root.Path), string(os.PathSeparator))
maxDepth := rootDepth + r.MaxDepth
return filepath.WalkDir(root.Path, func(path string, d fs.DirEntry, err error) error {
if ctxErr := ctx.Err(); ctxErr != nil {
return ctxErr
}
if err != nil {
// Surface the error as Unreadable when we can
// associate it with a single recognized file;