Files
Kyle Carberry 68e4155fed feat(agent/filefinder): add plocate-lite file finder package (#22453)
Adds an in-memory trigram-indexed file finder package at
`agent/filefinder`, designed to power a future `FindFiles` HTTP handler
on the WorkspaceAgent.

## What it does

Fast fuzzy file search with VS Code-quality matching across millions of
files. Sub-millisecond search latency at 100K files.

## Architecture

- **Index**: append-only docs slice with trigram + prefix posting lists
- **Snapshot**: lock-free reader view via frozen slice headers +
shallow-copied deleted set
- **Search pipeline**: trigram intersection → fuzzy fallback (prefix
bucket + subsequence) → brute-force scan (capped at 5K docs)
- **Scoring**: subsequence match, basename prefix, boundary hits,
contiguous runs, depth/length penalties
- **Engine**: multi-root with fsnotify watcher (50ms batch coalescing),
atomic snapshot publishing

## Benchmarks (10K files)

| Query Type | Latency |
|---|---|
| exact_basename (`handler.go`) | ~43µs |
| short_query (`ha`) | ~7µs |
| fuzzy_basename (`hndlr`) | ~50µs |
| path_structured (`internal/handler`) | ~29µs |
| multi_token (`api handler`) | ~15µs |

## File inventory (11 files, 3273 lines)

| File | Lines | Purpose |
|---|---|---|
| `text.go` | 264 | Normalization, trigram extraction, scoring |
| `delta.go` | 128 | Index, Snapshot, CRUD operations |
| `query.go` | 272 | Query planning, search strategies, top-K merge |
| `engine.go` | 323 | Multi-root engine, watcher integration |
| `watcher_fs.go` | 201 | fsnotify wrapper with batch coalescing |
| `*_test.go` | 2085 | Unit tests, integration tests, benchmarks |

---------

Co-authored-by: Coder <coder@users.noreply.github.com>
2026-02-28 23:37:07 -05:00

365 lines
7.9 KiB
Go

// Package filefinder provides an in-memory file index with trigram
// matching, fuzzy search, and filesystem watching. It is designed
// to power file-finding features on workspace agents.
package filefinder
import (
"context"
"os"
"path/filepath"
"slices"
"strings"
"sync"
"sync/atomic"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
)
// SearchOptions controls search behavior.
type SearchOptions struct {
Limit int
MaxCandidates int
}
// DefaultSearchOptions returns sensible default search options.
func DefaultSearchOptions() SearchOptions {
return SearchOptions{Limit: 100, MaxCandidates: 10000}
}
type rootSnapshot struct {
root string
snap *Snapshot
}
// Engine is the main file finder. Safe for concurrent use.
type Engine struct {
snap atomic.Pointer[[]*rootSnapshot]
logger slog.Logger
mu sync.Mutex
roots map[string]*rootState
eventCh chan rootEvent
closeCh chan struct{}
closed atomic.Bool
wg sync.WaitGroup
}
type rootState struct {
root string
index *Index
watcher *fsWatcher
cancel context.CancelFunc
}
type rootEvent struct {
root string
events []FSEvent
}
// walkRoot performs a full filesystem walk of absRoot and returns
// a populated Index containing all discovered files and directories.
func walkRoot(absRoot string) (*Index, error) {
idx := NewIndex()
err := filepath.Walk(absRoot, func(path string, info os.FileInfo, walkErr error) error {
if walkErr != nil {
return nil //nolint:nilerr
}
base := filepath.Base(path)
if _, skip := skipDirs[base]; skip && info.IsDir() {
return filepath.SkipDir
}
if path == absRoot {
return nil
}
relPath, relErr := filepath.Rel(absRoot, path)
if relErr != nil {
return nil //nolint:nilerr
}
relPath = filepath.ToSlash(relPath)
var flags uint16
if info.IsDir() {
flags = uint16(FlagDir)
} else if info.Mode()&os.ModeSymlink != 0 {
flags = uint16(FlagSymlink)
}
idx.Add(relPath, flags)
return nil
})
return idx, err
}
// NewEngine creates a new Engine.
func NewEngine(logger slog.Logger) *Engine {
e := &Engine{
logger: logger,
roots: make(map[string]*rootState),
eventCh: make(chan rootEvent, 256),
closeCh: make(chan struct{}),
}
empty := make([]*rootSnapshot, 0)
e.snap.Store(&empty)
e.wg.Add(1)
go e.start()
return e
}
// ErrClosed is returned when operations are attempted on a
// closed engine.
var ErrClosed = xerrors.New("engine is closed")
// AddRoot adds a directory root to the engine.
func (e *Engine) AddRoot(ctx context.Context, root string) error {
absRoot, err := filepath.Abs(root)
if err != nil {
return xerrors.Errorf("resolve root: %w", err)
}
e.mu.Lock()
if e.closed.Load() {
e.mu.Unlock()
return ErrClosed
}
if _, exists := e.roots[absRoot]; exists {
e.mu.Unlock()
return nil
}
e.mu.Unlock()
// Walk and create the watcher outside the lock to avoid
// blocking the event pipeline on filesystem I/O.
idx, walkErr := walkRoot(absRoot)
if walkErr != nil {
return xerrors.Errorf("walk root: %w", walkErr)
}
wCtx, wCancel := context.WithCancel(context.Background())
w, wErr := newFSWatcher(absRoot, e.logger)
if wErr != nil {
wCancel()
return xerrors.Errorf("create watcher: %w", wErr)
}
e.mu.Lock()
// Re-check after re-acquiring the lock: another goroutine
// may have added this root or closed the engine while we
// were walking.
if e.closed.Load() {
e.mu.Unlock()
wCancel()
_ = w.Close()
return ErrClosed
}
if _, exists := e.roots[absRoot]; exists {
e.mu.Unlock()
wCancel()
_ = w.Close()
return nil
}
rs := &rootState{root: absRoot, index: idx, watcher: w, cancel: wCancel}
e.roots[absRoot] = rs
w.Start(wCtx)
e.wg.Add(1)
go e.forwardEvents(wCtx, absRoot, w)
e.publishSnapshot()
fileCount := idx.Len()
e.mu.Unlock()
e.logger.Info(ctx, "added root to engine",
slog.F("root", absRoot),
slog.F("files", fileCount),
)
return nil
}
// RemoveRoot stops watching a root and removes it.
func (e *Engine) RemoveRoot(root string) error {
absRoot, err := filepath.Abs(root)
if err != nil {
return xerrors.Errorf("resolve root: %w", err)
}
e.mu.Lock()
defer e.mu.Unlock()
rs, exists := e.roots[absRoot]
if !exists {
return xerrors.Errorf("root %q not found", absRoot)
}
rs.cancel()
_ = rs.watcher.Close()
delete(e.roots, absRoot)
e.publishSnapshot()
return nil
}
// Search performs a fuzzy file search across all roots.
func (e *Engine) Search(_ context.Context, query string, opts SearchOptions) ([]Result, error) {
if e.closed.Load() {
return nil, ErrClosed
}
snapPtr := e.snap.Load()
if snapPtr == nil || len(*snapPtr) == 0 {
return nil, nil
}
roots := *snapPtr
plan := newQueryPlan(query)
if len(plan.Normalized) == 0 {
return nil, nil
}
if opts.Limit <= 0 {
opts.Limit = 100
}
if opts.MaxCandidates <= 0 {
opts.MaxCandidates = 10000
}
params := defaultScoreParams()
var allCands []candidate
for _, rs := range roots {
allCands = append(allCands, searchSnapshot(plan, rs.snap, opts.MaxCandidates)...)
}
results := mergeAndScore(allCands, plan, params, opts.Limit)
return results, nil
}
// Close shuts down the engine.
func (e *Engine) Close() error {
if e.closed.Swap(true) {
return nil
}
close(e.closeCh)
e.mu.Lock()
for _, rs := range e.roots {
rs.cancel()
_ = rs.watcher.Close()
}
e.roots = make(map[string]*rootState)
e.mu.Unlock()
e.wg.Wait()
return nil
}
// Rebuild forces a complete re-walk and re-index of a root.
func (e *Engine) Rebuild(ctx context.Context, root string) error {
absRoot, err := filepath.Abs(root)
if err != nil {
return xerrors.Errorf("resolve root: %w", err)
}
// Walk outside the lock to avoid blocking the event
// pipeline on potentially slow filesystem I/O.
idx, walkErr := walkRoot(absRoot)
if walkErr != nil {
return xerrors.Errorf("rebuild walk: %w", walkErr)
}
e.mu.Lock()
rs, exists := e.roots[absRoot]
if !exists {
e.mu.Unlock()
return xerrors.Errorf("root %q not found", absRoot)
}
rs.index = idx
e.publishSnapshot()
fileCount := idx.Len()
e.mu.Unlock()
e.logger.Info(ctx, "rebuilt root in engine",
slog.F("root", absRoot),
slog.F("files", fileCount),
)
return nil
}
func (e *Engine) start() {
defer e.wg.Done()
for {
select {
case <-e.closeCh:
return
case re, ok := <-e.eventCh:
if !ok {
return
}
e.applyEvents(re)
}
}
}
func (e *Engine) forwardEvents(ctx context.Context, root string, w *fsWatcher) {
defer e.wg.Done()
for {
select {
case <-ctx.Done():
return
case <-e.closeCh:
return
case evts, ok := <-w.Events():
if !ok {
return
}
select {
case e.eventCh <- rootEvent{root: root, events: evts}:
case <-ctx.Done():
return
case <-e.closeCh:
return
}
}
}
}
func (e *Engine) applyEvents(re rootEvent) {
e.mu.Lock()
defer e.mu.Unlock()
rs, exists := e.roots[re.root]
if !exists {
return
}
changed := false
for _, ev := range re.events {
relPath, err := filepath.Rel(rs.root, ev.Path)
if err != nil {
continue
}
relPath = filepath.ToSlash(relPath)
switch ev.Op {
case OpCreate:
if rs.index.Has(relPath) {
continue
}
var flags uint16
if ev.IsDir {
flags = uint16(FlagDir)
}
rs.index.Add(relPath, flags)
changed = true
case OpRemove, OpRename:
if rs.index.Remove(relPath) {
changed = true
}
if ev.IsDir || ev.Op == OpRename {
prefix := strings.ToLower(filepath.ToSlash(relPath)) + "/"
for path := range rs.index.byPath {
if strings.HasPrefix(path, prefix) {
rs.index.Remove(path)
changed = true
}
}
}
case OpModify:
}
}
if changed {
e.publishSnapshot()
}
}
// publishSnapshot builds and atomically publishes a new snapshot.
// Must be called with e.mu held.
func (e *Engine) publishSnapshot() {
roots := make([]*rootSnapshot, 0, len(e.roots))
for _, rs := range e.roots {
roots = append(roots, &rootSnapshot{
root: rs.root,
snap: rs.index.Snapshot(),
})
}
slices.SortFunc(roots, func(a, b *rootSnapshot) int {
return strings.Compare(a.root, b.root)
})
e.snap.Store(&roots)
}