mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
68e4155fed
Adds an in-memory trigram-indexed file finder package at `agent/filefinder`, designed to power a future `FindFiles` HTTP handler on the WorkspaceAgent. ## What it does Fast fuzzy file search with VS Code-quality matching across millions of files. Sub-millisecond search latency at 100K files. ## Architecture - **Index**: append-only docs slice with trigram + prefix posting lists - **Snapshot**: lock-free reader view via frozen slice headers + shallow-copied deleted set - **Search pipeline**: trigram intersection → fuzzy fallback (prefix bucket + subsequence) → brute-force scan (capped at 5K docs) - **Scoring**: subsequence match, basename prefix, boundary hits, contiguous runs, depth/length penalties - **Engine**: multi-root with fsnotify watcher (50ms batch coalescing), atomic snapshot publishing ## Benchmarks (10K files) | Query Type | Latency | |---|---| | exact_basename (`handler.go`) | ~43µs | | short_query (`ha`) | ~7µs | | fuzzy_basename (`hndlr`) | ~50µs | | path_structured (`internal/handler`) | ~29µs | | multi_token (`api handler`) | ~15µs | ## File inventory (11 files, 3273 lines) | File | Lines | Purpose | |---|---|---| | `text.go` | 264 | Normalization, trigram extraction, scoring | | `delta.go` | 128 | Index, Snapshot, CRUD operations | | `query.go` | 272 | Query planning, search strategies, top-K merge | | `engine.go` | 323 | Multi-root engine, watcher integration | | `watcher_fs.go` | 201 | fsnotify wrapper with batch coalescing | | `*_test.go` | 2085 | Unit tests, integration tests, benchmarks | --------- Co-authored-by: Coder <coder@users.noreply.github.com>
317 lines
7.7 KiB
Go
317 lines
7.7 KiB
Go
package filefinder_test
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math/rand"
|
|
"os"
|
|
"path/filepath"
|
|
"runtime"
|
|
"sync"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/require"
|
|
|
|
"cdr.dev/slog/v3"
|
|
"cdr.dev/slog/v3/sloggers/slogtest"
|
|
"github.com/coder/coder/v2/agent/filefinder"
|
|
)
|
|
|
|
var (
|
|
dirNames = []string{
|
|
"cmd", "internal", "pkg", "api", "auth", "database", "server", "client", "middleware",
|
|
"handler", "config", "utils", "models", "service", "worker", "scheduler", "notification",
|
|
"provisioner", "template", "workspace", "agent", "proxy", "crypto", "telemetry", "billing",
|
|
}
|
|
fileExts = []string{
|
|
".go", ".ts", ".tsx", ".js", ".py", ".sql", ".yaml", ".json", ".md", ".proto", ".sh",
|
|
}
|
|
fileStems = []string{
|
|
"main", "handler", "middleware", "service", "model", "query", "config", "utils", "helpers",
|
|
"types", "interface", "test", "mock", "factory", "builder", "adapter", "observer", "provider",
|
|
"resolver", "schema", "migration", "fixture", "snapshot", "checkpoint",
|
|
}
|
|
)
|
|
|
|
// generateFileTree creates n files under root in a realistic nested directory structure.
|
|
func generateFileTree(t testing.TB, root string, n int, seed int64) {
|
|
t.Helper()
|
|
rng := rand.New(rand.NewSource(seed)) //nolint:gosec // deterministic benchmarks
|
|
|
|
numDirs := n / 5
|
|
if numDirs < 10 {
|
|
numDirs = 10
|
|
}
|
|
dirs := make([]string, 0, numDirs)
|
|
for i := 0; i < numDirs; i++ {
|
|
depth := rng.Intn(6) + 1
|
|
parts := make([]string, depth)
|
|
for d := 0; d < depth; d++ {
|
|
parts[d] = dirNames[rng.Intn(len(dirNames))]
|
|
}
|
|
dirs = append(dirs, filepath.Join(parts...))
|
|
}
|
|
|
|
created := make(map[string]struct{})
|
|
for _, d := range dirs {
|
|
full := filepath.Join(root, d)
|
|
if _, ok := created[full]; ok {
|
|
continue
|
|
}
|
|
require.NoError(t, os.MkdirAll(full, 0o755))
|
|
created[full] = struct{}{}
|
|
}
|
|
|
|
for i := 0; i < n; i++ {
|
|
dir := dirs[rng.Intn(len(dirs))]
|
|
stem := fileStems[rng.Intn(len(fileStems))]
|
|
ext := fileExts[rng.Intn(len(fileExts))]
|
|
name := fmt.Sprintf("%s_%d%s", stem, i, ext)
|
|
full := filepath.Join(root, dir, name)
|
|
f, err := os.Create(full)
|
|
require.NoError(t, err)
|
|
_ = f.Close()
|
|
}
|
|
}
|
|
|
|
// buildIndex walks root and returns a populated Index, the same
|
|
// way Engine.AddRoot does but without starting a watcher.
|
|
func buildIndex(t testing.TB, root string) *filefinder.Index {
|
|
t.Helper()
|
|
absRoot, err := filepath.Abs(root)
|
|
require.NoError(t, err)
|
|
idx, err := filefinder.BuildTestIndex(absRoot)
|
|
require.NoError(t, err)
|
|
return idx
|
|
}
|
|
|
|
func BenchmarkBuildIndex(b *testing.B) {
|
|
scales := []struct {
|
|
name string
|
|
n int
|
|
}{
|
|
{"1K", 1_000},
|
|
{"10K", 10_000},
|
|
{"100K", 100_000},
|
|
}
|
|
|
|
for _, sc := range scales {
|
|
b.Run(sc.name, func(b *testing.B) {
|
|
if sc.n >= 100_000 && testing.Short() {
|
|
b.Skip("skipping large-scale benchmark")
|
|
}
|
|
dir := b.TempDir()
|
|
generateFileTree(b, dir, sc.n, 42)
|
|
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
idx := buildIndex(b, dir)
|
|
if idx.Len() == 0 {
|
|
b.Fatal("expected non-empty index")
|
|
}
|
|
}
|
|
b.StopTimer()
|
|
|
|
idx := buildIndex(b, dir)
|
|
b.ReportMetric(float64(idx.Len())/b.Elapsed().Seconds(), "files/sec")
|
|
})
|
|
}
|
|
}
|
|
|
|
func BenchmarkSearch_ByScale(b *testing.B) {
|
|
queries := []struct {
|
|
name string
|
|
query string
|
|
}{
|
|
{"exact_basename", "handler.go"},
|
|
{"short_query", "ha"},
|
|
{"fuzzy_basename", "hndlr"},
|
|
{"path_structured", "internal/handler"},
|
|
{"multi_token", "api handler"},
|
|
}
|
|
scales := []struct {
|
|
name string
|
|
n int
|
|
}{
|
|
{"1K", 1_000},
|
|
{"10K", 10_000},
|
|
{"100K", 100_000},
|
|
}
|
|
|
|
for _, sc := range scales {
|
|
b.Run(sc.name, func(b *testing.B) {
|
|
if sc.n >= 100_000 && testing.Short() {
|
|
b.Skip("skipping large-scale benchmark")
|
|
}
|
|
dir := b.TempDir()
|
|
generateFileTree(b, dir, sc.n, 42)
|
|
idx := buildIndex(b, dir)
|
|
snap := idx.Snapshot()
|
|
opts := filefinder.DefaultSearchOptions()
|
|
|
|
for _, q := range queries {
|
|
b.Run(q.name, func(b *testing.B) {
|
|
p := filefinder.NewQueryPlanForTest(q.query)
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
_ = filefinder.SearchSnapshotForTest(p, snap, opts.MaxCandidates)
|
|
}
|
|
})
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func BenchmarkSearch_ConcurrentReads(b *testing.B) {
|
|
dir := b.TempDir()
|
|
generateFileTree(b, dir, 10_000, 42)
|
|
|
|
logger := slogtest.Make(b, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelError)
|
|
ctx := context.Background()
|
|
eng := filefinder.NewEngine(logger)
|
|
require.NoError(b, eng.AddRoot(ctx, dir))
|
|
b.Cleanup(func() { _ = eng.Close() })
|
|
|
|
opts := filefinder.DefaultSearchOptions()
|
|
goroutines := []int{1, 4, 16, 64}
|
|
|
|
for _, g := range goroutines {
|
|
b.Run(fmt.Sprintf("goroutines_%d", g), func(b *testing.B) {
|
|
b.SetParallelism(g)
|
|
b.ResetTimer()
|
|
b.RunParallel(func(pb *testing.PB) {
|
|
for pb.Next() {
|
|
results, err := eng.Search(ctx, "handler", opts)
|
|
if err != nil {
|
|
b.Fatal(err)
|
|
}
|
|
_ = results
|
|
}
|
|
})
|
|
})
|
|
}
|
|
}
|
|
|
|
func BenchmarkDeltaUpdate(b *testing.B) {
|
|
dir := b.TempDir()
|
|
generateFileTree(b, dir, 10_000, 42)
|
|
|
|
addCounts := []int{1, 10, 100}
|
|
|
|
for _, count := range addCounts {
|
|
b.Run(fmt.Sprintf("add_%d_files", count), func(b *testing.B) {
|
|
paths := make([]string, count)
|
|
for i := range paths {
|
|
paths[i] = fmt.Sprintf("injected/dir_%d/newfile_%d.go", i%10, i)
|
|
}
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
b.StopTimer()
|
|
idx := buildIndex(b, dir)
|
|
b.StartTimer()
|
|
for _, p := range paths {
|
|
idx.Add(p, 0)
|
|
}
|
|
}
|
|
b.ReportMetric(float64(count), "files_added/op")
|
|
})
|
|
}
|
|
|
|
b.Run("search_after_100_additions", func(b *testing.B) {
|
|
idx := buildIndex(b, dir)
|
|
for i := 0; i < 100; i++ {
|
|
idx.Add(fmt.Sprintf("injected/extra/file_%d.go", i), 0)
|
|
}
|
|
snap := idx.Snapshot()
|
|
plan := filefinder.NewQueryPlanForTest("handler")
|
|
opts := filefinder.DefaultSearchOptions()
|
|
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
_ = filefinder.SearchSnapshotForTest(plan, snap, opts.MaxCandidates)
|
|
}
|
|
})
|
|
}
|
|
|
|
func BenchmarkMemoryProfile(b *testing.B) {
|
|
scales := []struct {
|
|
name string
|
|
n int
|
|
}{
|
|
{"10K", 10_000},
|
|
{"100K", 100_000},
|
|
}
|
|
|
|
for _, sc := range scales {
|
|
b.Run(sc.name, func(b *testing.B) {
|
|
if sc.n >= 100_000 && testing.Short() {
|
|
b.Skip("skipping large-scale memory profile")
|
|
}
|
|
dir := b.TempDir()
|
|
generateFileTree(b, dir, sc.n, 42)
|
|
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
idx := buildIndex(b, dir)
|
|
_ = idx.Snapshot()
|
|
}
|
|
b.StopTimer()
|
|
|
|
// Report memory stats on the last iteration.
|
|
runtime.GC()
|
|
var before runtime.MemStats
|
|
runtime.ReadMemStats(&before)
|
|
idx := buildIndex(b, dir)
|
|
var after runtime.MemStats
|
|
runtime.ReadMemStats(&after)
|
|
|
|
allocDelta := after.TotalAlloc - before.TotalAlloc
|
|
b.ReportMetric(float64(allocDelta)/float64(idx.Len()), "bytes/file")
|
|
|
|
runtime.GC()
|
|
runtime.ReadMemStats(&before)
|
|
snap := idx.Snapshot()
|
|
_ = snap
|
|
runtime.GC()
|
|
runtime.ReadMemStats(&after)
|
|
|
|
snapAlloc := after.TotalAlloc - before.TotalAlloc
|
|
b.ReportMetric(float64(snapAlloc)/float64(idx.Len()), "snap-bytes/file")
|
|
})
|
|
}
|
|
}
|
|
|
|
func BenchmarkSearch_ConcurrentReads_Throughput(b *testing.B) {
|
|
dir := b.TempDir()
|
|
generateFileTree(b, dir, 10_000, 42)
|
|
idx := buildIndex(b, dir)
|
|
snap := idx.Snapshot()
|
|
|
|
goroutines := []int{1, 4, 16, 64}
|
|
plan := filefinder.NewQueryPlanForTest("handler.go")
|
|
maxCands := filefinder.DefaultSearchOptions().MaxCandidates
|
|
|
|
for _, g := range goroutines {
|
|
b.Run(fmt.Sprintf("goroutines_%d", g), func(b *testing.B) {
|
|
b.ResetTimer()
|
|
var wg sync.WaitGroup
|
|
perGoroutine := b.N / g
|
|
if perGoroutine < 1 {
|
|
perGoroutine = 1
|
|
}
|
|
for gi := 0; gi < g; gi++ {
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
for j := 0; j < perGoroutine; j++ {
|
|
_ = filefinder.SearchSnapshotForTest(plan, snap, maxCands)
|
|
}
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
totalOps := float64(g * perGoroutine)
|
|
b.ReportMetric(totalOps/b.Elapsed().Seconds(), "searches/sec")
|
|
})
|
|
}
|
|
}
|