mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
68e4155fed
Adds an in-memory trigram-indexed file finder package at `agent/filefinder`, designed to power a future `FindFiles` HTTP handler on the WorkspaceAgent. ## What it does Fast fuzzy file search with VS Code-quality matching across millions of files. Sub-millisecond search latency at 100K files. ## Architecture - **Index**: append-only docs slice with trigram + prefix posting lists - **Snapshot**: lock-free reader view via frozen slice headers + shallow-copied deleted set - **Search pipeline**: trigram intersection → fuzzy fallback (prefix bucket + subsequence) → brute-force scan (capped at 5K docs) - **Scoring**: subsequence match, basename prefix, boundary hits, contiguous runs, depth/length penalties - **Engine**: multi-root with fsnotify watcher (50ms batch coalescing), atomic snapshot publishing ## Benchmarks (10K files) | Query Type | Latency | |---|---| | exact_basename (`handler.go`) | ~43µs | | short_query (`ha`) | ~7µs | | fuzzy_basename (`hndlr`) | ~50µs | | path_structured (`internal/handler`) | ~29µs | | multi_token (`api handler`) | ~15µs | ## File inventory (11 files, 3273 lines) | File | Lines | Purpose | |---|---|---| | `text.go` | 264 | Normalization, trigram extraction, scoring | | `delta.go` | 128 | Index, Snapshot, CRUD operations | | `query.go` | 272 | Query planning, search strategies, top-K merge | | `engine.go` | 323 | Multi-root engine, watcher integration | | `watcher_fs.go` | 201 | fsnotify wrapper with batch coalescing | | `*_test.go` | 2085 | Unit tests, integration tests, benchmarks | --------- Co-authored-by: Coder <coder@users.noreply.github.com>
365 lines
7.9 KiB
Go
365 lines
7.9 KiB
Go
// Package filefinder provides an in-memory file index with trigram
|
|
// matching, fuzzy search, and filesystem watching. It is designed
|
|
// to power file-finding features on workspace agents.
|
|
package filefinder
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"path/filepath"
|
|
"slices"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog/v3"
|
|
)
|
|
|
|
// SearchOptions controls search behavior.
|
|
type SearchOptions struct {
|
|
Limit int
|
|
MaxCandidates int
|
|
}
|
|
|
|
// DefaultSearchOptions returns sensible default search options.
|
|
func DefaultSearchOptions() SearchOptions {
|
|
return SearchOptions{Limit: 100, MaxCandidates: 10000}
|
|
}
|
|
|
|
type rootSnapshot struct {
|
|
root string
|
|
snap *Snapshot
|
|
}
|
|
|
|
// Engine is the main file finder. Safe for concurrent use.
|
|
type Engine struct {
|
|
snap atomic.Pointer[[]*rootSnapshot]
|
|
logger slog.Logger
|
|
mu sync.Mutex
|
|
roots map[string]*rootState
|
|
eventCh chan rootEvent
|
|
closeCh chan struct{}
|
|
closed atomic.Bool
|
|
wg sync.WaitGroup
|
|
}
|
|
type rootState struct {
|
|
root string
|
|
index *Index
|
|
watcher *fsWatcher
|
|
cancel context.CancelFunc
|
|
}
|
|
type rootEvent struct {
|
|
root string
|
|
events []FSEvent
|
|
}
|
|
|
|
// walkRoot performs a full filesystem walk of absRoot and returns
|
|
// a populated Index containing all discovered files and directories.
|
|
func walkRoot(absRoot string) (*Index, error) {
|
|
idx := NewIndex()
|
|
err := filepath.Walk(absRoot, func(path string, info os.FileInfo, walkErr error) error {
|
|
if walkErr != nil {
|
|
return nil //nolint:nilerr
|
|
}
|
|
base := filepath.Base(path)
|
|
if _, skip := skipDirs[base]; skip && info.IsDir() {
|
|
return filepath.SkipDir
|
|
}
|
|
if path == absRoot {
|
|
return nil
|
|
}
|
|
relPath, relErr := filepath.Rel(absRoot, path)
|
|
if relErr != nil {
|
|
return nil //nolint:nilerr
|
|
}
|
|
relPath = filepath.ToSlash(relPath)
|
|
var flags uint16
|
|
if info.IsDir() {
|
|
flags = uint16(FlagDir)
|
|
} else if info.Mode()&os.ModeSymlink != 0 {
|
|
flags = uint16(FlagSymlink)
|
|
}
|
|
idx.Add(relPath, flags)
|
|
return nil
|
|
})
|
|
return idx, err
|
|
}
|
|
|
|
// NewEngine creates a new Engine.
|
|
func NewEngine(logger slog.Logger) *Engine {
|
|
e := &Engine{
|
|
logger: logger,
|
|
roots: make(map[string]*rootState),
|
|
eventCh: make(chan rootEvent, 256),
|
|
closeCh: make(chan struct{}),
|
|
}
|
|
empty := make([]*rootSnapshot, 0)
|
|
e.snap.Store(&empty)
|
|
e.wg.Add(1)
|
|
go e.start()
|
|
return e
|
|
}
|
|
|
|
// ErrClosed is returned when operations are attempted on a
|
|
// closed engine.
|
|
var ErrClosed = xerrors.New("engine is closed")
|
|
|
|
// AddRoot adds a directory root to the engine.
|
|
func (e *Engine) AddRoot(ctx context.Context, root string) error {
|
|
absRoot, err := filepath.Abs(root)
|
|
if err != nil {
|
|
return xerrors.Errorf("resolve root: %w", err)
|
|
}
|
|
e.mu.Lock()
|
|
if e.closed.Load() {
|
|
e.mu.Unlock()
|
|
return ErrClosed
|
|
}
|
|
if _, exists := e.roots[absRoot]; exists {
|
|
e.mu.Unlock()
|
|
return nil
|
|
}
|
|
e.mu.Unlock()
|
|
|
|
// Walk and create the watcher outside the lock to avoid
|
|
// blocking the event pipeline on filesystem I/O.
|
|
idx, walkErr := walkRoot(absRoot)
|
|
if walkErr != nil {
|
|
return xerrors.Errorf("walk root: %w", walkErr)
|
|
}
|
|
wCtx, wCancel := context.WithCancel(context.Background())
|
|
w, wErr := newFSWatcher(absRoot, e.logger)
|
|
if wErr != nil {
|
|
wCancel()
|
|
return xerrors.Errorf("create watcher: %w", wErr)
|
|
}
|
|
|
|
e.mu.Lock()
|
|
// Re-check after re-acquiring the lock: another goroutine
|
|
// may have added this root or closed the engine while we
|
|
// were walking.
|
|
if e.closed.Load() {
|
|
e.mu.Unlock()
|
|
wCancel()
|
|
_ = w.Close()
|
|
return ErrClosed
|
|
}
|
|
if _, exists := e.roots[absRoot]; exists {
|
|
e.mu.Unlock()
|
|
wCancel()
|
|
_ = w.Close()
|
|
return nil
|
|
}
|
|
rs := &rootState{root: absRoot, index: idx, watcher: w, cancel: wCancel}
|
|
e.roots[absRoot] = rs
|
|
w.Start(wCtx)
|
|
e.wg.Add(1)
|
|
go e.forwardEvents(wCtx, absRoot, w)
|
|
e.publishSnapshot()
|
|
fileCount := idx.Len()
|
|
e.mu.Unlock()
|
|
e.logger.Info(ctx, "added root to engine",
|
|
slog.F("root", absRoot),
|
|
slog.F("files", fileCount),
|
|
)
|
|
return nil
|
|
}
|
|
|
|
// RemoveRoot stops watching a root and removes it.
|
|
func (e *Engine) RemoveRoot(root string) error {
|
|
absRoot, err := filepath.Abs(root)
|
|
if err != nil {
|
|
return xerrors.Errorf("resolve root: %w", err)
|
|
}
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
rs, exists := e.roots[absRoot]
|
|
if !exists {
|
|
return xerrors.Errorf("root %q not found", absRoot)
|
|
}
|
|
rs.cancel()
|
|
_ = rs.watcher.Close()
|
|
delete(e.roots, absRoot)
|
|
e.publishSnapshot()
|
|
return nil
|
|
}
|
|
|
|
// Search performs a fuzzy file search across all roots.
|
|
func (e *Engine) Search(_ context.Context, query string, opts SearchOptions) ([]Result, error) {
|
|
if e.closed.Load() {
|
|
return nil, ErrClosed
|
|
}
|
|
snapPtr := e.snap.Load()
|
|
if snapPtr == nil || len(*snapPtr) == 0 {
|
|
return nil, nil
|
|
}
|
|
roots := *snapPtr
|
|
plan := newQueryPlan(query)
|
|
if len(plan.Normalized) == 0 {
|
|
return nil, nil
|
|
}
|
|
if opts.Limit <= 0 {
|
|
opts.Limit = 100
|
|
}
|
|
if opts.MaxCandidates <= 0 {
|
|
opts.MaxCandidates = 10000
|
|
}
|
|
params := defaultScoreParams()
|
|
var allCands []candidate
|
|
for _, rs := range roots {
|
|
allCands = append(allCands, searchSnapshot(plan, rs.snap, opts.MaxCandidates)...)
|
|
}
|
|
results := mergeAndScore(allCands, plan, params, opts.Limit)
|
|
return results, nil
|
|
}
|
|
|
|
// Close shuts down the engine.
|
|
func (e *Engine) Close() error {
|
|
if e.closed.Swap(true) {
|
|
return nil
|
|
}
|
|
close(e.closeCh)
|
|
e.mu.Lock()
|
|
for _, rs := range e.roots {
|
|
rs.cancel()
|
|
_ = rs.watcher.Close()
|
|
}
|
|
e.roots = make(map[string]*rootState)
|
|
e.mu.Unlock()
|
|
e.wg.Wait()
|
|
return nil
|
|
}
|
|
|
|
// Rebuild forces a complete re-walk and re-index of a root.
|
|
func (e *Engine) Rebuild(ctx context.Context, root string) error {
|
|
absRoot, err := filepath.Abs(root)
|
|
if err != nil {
|
|
return xerrors.Errorf("resolve root: %w", err)
|
|
}
|
|
|
|
// Walk outside the lock to avoid blocking the event
|
|
// pipeline on potentially slow filesystem I/O.
|
|
idx, walkErr := walkRoot(absRoot)
|
|
if walkErr != nil {
|
|
return xerrors.Errorf("rebuild walk: %w", walkErr)
|
|
}
|
|
|
|
e.mu.Lock()
|
|
rs, exists := e.roots[absRoot]
|
|
if !exists {
|
|
e.mu.Unlock()
|
|
return xerrors.Errorf("root %q not found", absRoot)
|
|
}
|
|
rs.index = idx
|
|
e.publishSnapshot()
|
|
fileCount := idx.Len()
|
|
e.mu.Unlock()
|
|
e.logger.Info(ctx, "rebuilt root in engine",
|
|
slog.F("root", absRoot),
|
|
slog.F("files", fileCount),
|
|
)
|
|
return nil
|
|
}
|
|
|
|
func (e *Engine) start() {
|
|
defer e.wg.Done()
|
|
for {
|
|
select {
|
|
case <-e.closeCh:
|
|
return
|
|
case re, ok := <-e.eventCh:
|
|
if !ok {
|
|
return
|
|
}
|
|
e.applyEvents(re)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *Engine) forwardEvents(ctx context.Context, root string, w *fsWatcher) {
|
|
defer e.wg.Done()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-e.closeCh:
|
|
return
|
|
case evts, ok := <-w.Events():
|
|
if !ok {
|
|
return
|
|
}
|
|
select {
|
|
case e.eventCh <- rootEvent{root: root, events: evts}:
|
|
case <-ctx.Done():
|
|
return
|
|
case <-e.closeCh:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *Engine) applyEvents(re rootEvent) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
rs, exists := e.roots[re.root]
|
|
if !exists {
|
|
return
|
|
}
|
|
changed := false
|
|
for _, ev := range re.events {
|
|
relPath, err := filepath.Rel(rs.root, ev.Path)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
relPath = filepath.ToSlash(relPath)
|
|
switch ev.Op {
|
|
case OpCreate:
|
|
if rs.index.Has(relPath) {
|
|
continue
|
|
}
|
|
var flags uint16
|
|
if ev.IsDir {
|
|
flags = uint16(FlagDir)
|
|
}
|
|
rs.index.Add(relPath, flags)
|
|
changed = true
|
|
case OpRemove, OpRename:
|
|
if rs.index.Remove(relPath) {
|
|
changed = true
|
|
}
|
|
if ev.IsDir || ev.Op == OpRename {
|
|
prefix := strings.ToLower(filepath.ToSlash(relPath)) + "/"
|
|
for path := range rs.index.byPath {
|
|
if strings.HasPrefix(path, prefix) {
|
|
rs.index.Remove(path)
|
|
changed = true
|
|
}
|
|
}
|
|
}
|
|
case OpModify:
|
|
}
|
|
}
|
|
if changed {
|
|
e.publishSnapshot()
|
|
}
|
|
}
|
|
|
|
// publishSnapshot builds and atomically publishes a new snapshot.
|
|
// Must be called with e.mu held.
|
|
func (e *Engine) publishSnapshot() {
|
|
roots := make([]*rootSnapshot, 0, len(e.roots))
|
|
for _, rs := range e.roots {
|
|
roots = append(roots, &rootSnapshot{
|
|
root: rs.root,
|
|
snap: rs.index.Snapshot(),
|
|
})
|
|
}
|
|
slices.SortFunc(roots, func(a, b *rootSnapshot) int {
|
|
return strings.Compare(a.root, b.root)
|
|
})
|
|
e.snap.Store(&roots)
|
|
}
|