Files
Kyle Carberry 68e4155fed feat(agent/filefinder): add plocate-lite file finder package (#22453)
Adds an in-memory trigram-indexed file finder package at
`agent/filefinder`, designed to power a future `FindFiles` HTTP handler
on the WorkspaceAgent.

## What it does

Fast fuzzy file search with VS Code-quality matching across millions of
files. Sub-millisecond search latency at 100K files.

## Architecture

- **Index**: append-only docs slice with trigram + prefix posting lists
- **Snapshot**: lock-free reader view via frozen slice headers +
shallow-copied deleted set
- **Search pipeline**: trigram intersection → fuzzy fallback (prefix
bucket + subsequence) → brute-force scan (capped at 5K docs)
- **Scoring**: subsequence match, basename prefix, boundary hits,
contiguous runs, depth/length penalties
- **Engine**: multi-root with fsnotify watcher (50ms batch coalescing),
atomic snapshot publishing

## Benchmarks (10K files)

| Query Type | Latency |
|---|---|
| exact_basename (`handler.go`) | ~43µs |
| short_query (`ha`) | ~7µs |
| fuzzy_basename (`hndlr`) | ~50µs |
| path_structured (`internal/handler`) | ~29µs |
| multi_token (`api handler`) | ~15µs |

## File inventory (11 files, 3273 lines)

| File | Lines | Purpose |
|---|---|---|
| `text.go` | 264 | Normalization, trigram extraction, scoring |
| `delta.go` | 128 | Index, Snapshot, CRUD operations |
| `query.go` | 272 | Query planning, search strategies, top-K merge |
| `engine.go` | 323 | Multi-root engine, watcher integration |
| `watcher_fs.go` | 201 | fsnotify wrapper with batch coalescing |
| `*_test.go` | 2085 | Unit tests, integration tests, benchmarks |

---------

Co-authored-by: Coder <coder@users.noreply.github.com>
2026-02-28 23:37:07 -05:00

317 lines
7.7 KiB
Go

package filefinder_test
import (
"context"
"fmt"
"math/rand"
"os"
"path/filepath"
"runtime"
"sync"
"testing"
"github.com/stretchr/testify/require"
"cdr.dev/slog/v3"
"cdr.dev/slog/v3/sloggers/slogtest"
"github.com/coder/coder/v2/agent/filefinder"
)
var (
dirNames = []string{
"cmd", "internal", "pkg", "api", "auth", "database", "server", "client", "middleware",
"handler", "config", "utils", "models", "service", "worker", "scheduler", "notification",
"provisioner", "template", "workspace", "agent", "proxy", "crypto", "telemetry", "billing",
}
fileExts = []string{
".go", ".ts", ".tsx", ".js", ".py", ".sql", ".yaml", ".json", ".md", ".proto", ".sh",
}
fileStems = []string{
"main", "handler", "middleware", "service", "model", "query", "config", "utils", "helpers",
"types", "interface", "test", "mock", "factory", "builder", "adapter", "observer", "provider",
"resolver", "schema", "migration", "fixture", "snapshot", "checkpoint",
}
)
// generateFileTree creates n files under root in a realistic nested directory structure.
func generateFileTree(t testing.TB, root string, n int, seed int64) {
t.Helper()
rng := rand.New(rand.NewSource(seed)) //nolint:gosec // deterministic benchmarks
numDirs := n / 5
if numDirs < 10 {
numDirs = 10
}
dirs := make([]string, 0, numDirs)
for i := 0; i < numDirs; i++ {
depth := rng.Intn(6) + 1
parts := make([]string, depth)
for d := 0; d < depth; d++ {
parts[d] = dirNames[rng.Intn(len(dirNames))]
}
dirs = append(dirs, filepath.Join(parts...))
}
created := make(map[string]struct{})
for _, d := range dirs {
full := filepath.Join(root, d)
if _, ok := created[full]; ok {
continue
}
require.NoError(t, os.MkdirAll(full, 0o755))
created[full] = struct{}{}
}
for i := 0; i < n; i++ {
dir := dirs[rng.Intn(len(dirs))]
stem := fileStems[rng.Intn(len(fileStems))]
ext := fileExts[rng.Intn(len(fileExts))]
name := fmt.Sprintf("%s_%d%s", stem, i, ext)
full := filepath.Join(root, dir, name)
f, err := os.Create(full)
require.NoError(t, err)
_ = f.Close()
}
}
// buildIndex walks root and returns a populated Index, the same
// way Engine.AddRoot does but without starting a watcher.
func buildIndex(t testing.TB, root string) *filefinder.Index {
t.Helper()
absRoot, err := filepath.Abs(root)
require.NoError(t, err)
idx, err := filefinder.BuildTestIndex(absRoot)
require.NoError(t, err)
return idx
}
func BenchmarkBuildIndex(b *testing.B) {
scales := []struct {
name string
n int
}{
{"1K", 1_000},
{"10K", 10_000},
{"100K", 100_000},
}
for _, sc := range scales {
b.Run(sc.name, func(b *testing.B) {
if sc.n >= 100_000 && testing.Short() {
b.Skip("skipping large-scale benchmark")
}
dir := b.TempDir()
generateFileTree(b, dir, sc.n, 42)
b.ResetTimer()
for i := 0; i < b.N; i++ {
idx := buildIndex(b, dir)
if idx.Len() == 0 {
b.Fatal("expected non-empty index")
}
}
b.StopTimer()
idx := buildIndex(b, dir)
b.ReportMetric(float64(idx.Len())/b.Elapsed().Seconds(), "files/sec")
})
}
}
func BenchmarkSearch_ByScale(b *testing.B) {
queries := []struct {
name string
query string
}{
{"exact_basename", "handler.go"},
{"short_query", "ha"},
{"fuzzy_basename", "hndlr"},
{"path_structured", "internal/handler"},
{"multi_token", "api handler"},
}
scales := []struct {
name string
n int
}{
{"1K", 1_000},
{"10K", 10_000},
{"100K", 100_000},
}
for _, sc := range scales {
b.Run(sc.name, func(b *testing.B) {
if sc.n >= 100_000 && testing.Short() {
b.Skip("skipping large-scale benchmark")
}
dir := b.TempDir()
generateFileTree(b, dir, sc.n, 42)
idx := buildIndex(b, dir)
snap := idx.Snapshot()
opts := filefinder.DefaultSearchOptions()
for _, q := range queries {
b.Run(q.name, func(b *testing.B) {
p := filefinder.NewQueryPlanForTest(q.query)
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = filefinder.SearchSnapshotForTest(p, snap, opts.MaxCandidates)
}
})
}
})
}
}
func BenchmarkSearch_ConcurrentReads(b *testing.B) {
dir := b.TempDir()
generateFileTree(b, dir, 10_000, 42)
logger := slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelError)
ctx := context.Background()
eng := filefinder.NewEngine(logger)
require.NoError(b, eng.AddRoot(ctx, dir))
b.Cleanup(func() { _ = eng.Close() })
opts := filefinder.DefaultSearchOptions()
goroutines := []int{1, 4, 16, 64}
for _, g := range goroutines {
b.Run(fmt.Sprintf("goroutines_%d", g), func(b *testing.B) {
b.SetParallelism(g)
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
results, err := eng.Search(ctx, "handler", opts)
if err != nil {
b.Fatal(err)
}
_ = results
}
})
})
}
}
func BenchmarkDeltaUpdate(b *testing.B) {
dir := b.TempDir()
generateFileTree(b, dir, 10_000, 42)
addCounts := []int{1, 10, 100}
for _, count := range addCounts {
b.Run(fmt.Sprintf("add_%d_files", count), func(b *testing.B) {
paths := make([]string, count)
for i := range paths {
paths[i] = fmt.Sprintf("injected/dir_%d/newfile_%d.go", i%10, i)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
idx := buildIndex(b, dir)
b.StartTimer()
for _, p := range paths {
idx.Add(p, 0)
}
}
b.ReportMetric(float64(count), "files_added/op")
})
}
b.Run("search_after_100_additions", func(b *testing.B) {
idx := buildIndex(b, dir)
for i := 0; i < 100; i++ {
idx.Add(fmt.Sprintf("injected/extra/file_%d.go", i), 0)
}
snap := idx.Snapshot()
plan := filefinder.NewQueryPlanForTest("handler")
opts := filefinder.DefaultSearchOptions()
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = filefinder.SearchSnapshotForTest(plan, snap, opts.MaxCandidates)
}
})
}
func BenchmarkMemoryProfile(b *testing.B) {
scales := []struct {
name string
n int
}{
{"10K", 10_000},
{"100K", 100_000},
}
for _, sc := range scales {
b.Run(sc.name, func(b *testing.B) {
if sc.n >= 100_000 && testing.Short() {
b.Skip("skipping large-scale memory profile")
}
dir := b.TempDir()
generateFileTree(b, dir, sc.n, 42)
b.ResetTimer()
for i := 0; i < b.N; i++ {
idx := buildIndex(b, dir)
_ = idx.Snapshot()
}
b.StopTimer()
// Report memory stats on the last iteration.
runtime.GC()
var before runtime.MemStats
runtime.ReadMemStats(&before)
idx := buildIndex(b, dir)
var after runtime.MemStats
runtime.ReadMemStats(&after)
allocDelta := after.TotalAlloc - before.TotalAlloc
b.ReportMetric(float64(allocDelta)/float64(idx.Len()), "bytes/file")
runtime.GC()
runtime.ReadMemStats(&before)
snap := idx.Snapshot()
_ = snap
runtime.GC()
runtime.ReadMemStats(&after)
snapAlloc := after.TotalAlloc - before.TotalAlloc
b.ReportMetric(float64(snapAlloc)/float64(idx.Len()), "snap-bytes/file")
})
}
}
func BenchmarkSearch_ConcurrentReads_Throughput(b *testing.B) {
dir := b.TempDir()
generateFileTree(b, dir, 10_000, 42)
idx := buildIndex(b, dir)
snap := idx.Snapshot()
goroutines := []int{1, 4, 16, 64}
plan := filefinder.NewQueryPlanForTest("handler.go")
maxCands := filefinder.DefaultSearchOptions().MaxCandidates
for _, g := range goroutines {
b.Run(fmt.Sprintf("goroutines_%d", g), func(b *testing.B) {
b.ResetTimer()
var wg sync.WaitGroup
perGoroutine := b.N / g
if perGoroutine < 1 {
perGoroutine = 1
}
for gi := 0; gi < g; gi++ {
wg.Add(1)
go func() {
defer wg.Done()
for j := 0; j < perGoroutine; j++ {
_ = filefinder.SearchSnapshotForTest(plan, snap, maxCands)
}
}()
}
wg.Wait()
totalOps := float64(g * perGoroutine)
b.ReportMetric(totalOps/b.Elapsed().Seconds(), "searches/sec")
})
}
}