Files
Kyle Carberry 68e4155fed feat(agent/filefinder): add plocate-lite file finder package (#22453)
Adds an in-memory trigram-indexed file finder package at
`agent/filefinder`, designed to power a future `FindFiles` HTTP handler
on the WorkspaceAgent.

## What it does

Fast fuzzy file search with VS Code-quality matching across millions of
files. Sub-millisecond search latency at 100K files.

## Architecture

- **Index**: append-only docs slice with trigram + prefix posting lists
- **Snapshot**: lock-free reader view via frozen slice headers +
shallow-copied deleted set
- **Search pipeline**: trigram intersection → fuzzy fallback (prefix
bucket + subsequence) → brute-force scan (capped at 5K docs)
- **Scoring**: subsequence match, basename prefix, boundary hits,
contiguous runs, depth/length penalties
- **Engine**: multi-root with fsnotify watcher (50ms batch coalescing),
atomic snapshot publishing

## Benchmarks (10K files)

| Query Type | Latency |
|---|---|
| exact_basename (`handler.go`) | ~43µs |
| short_query (`ha`) | ~7µs |
| fuzzy_basename (`hndlr`) | ~50µs |
| path_structured (`internal/handler`) | ~29µs |
| multi_token (`api handler`) | ~15µs |

## File inventory (11 files, 3273 lines)

| File | Lines | Purpose |
|---|---|---|
| `text.go` | 264 | Normalization, trigram extraction, scoring |
| `delta.go` | 128 | Index, Snapshot, CRUD operations |
| `query.go` | 272 | Query planning, search strategies, top-K merge |
| `engine.go` | 323 | Multi-root engine, watcher integration |
| `watcher_fs.go` | 201 | fsnotify wrapper with batch coalescing |
| `*_test.go` | 2085 | Unit tests, integration tests, benchmarks |

---------

Co-authored-by: Coder <coder@users.noreply.github.com>
2026-02-28 23:37:07 -05:00

389 lines
11 KiB
Go

package filefinder_test
import (
"slices"
"testing"
"github.com/coder/coder/v2/agent/filefinder"
)
func TestNormalizeQuery(t *testing.T) {
t.Parallel()
tests := []struct {
name string
input string
want string
}{
{"empty", "", ""},
{"leading and trailing spaces", " hello ", "hello"},
{"multiple internal spaces", "foo bar baz", "foo bar baz"},
{"uppercase to lower", "FooBar", "foobar"},
{"backslash to slash", `foo\bar\baz`, "foo/bar/baz"},
{"mixed case and spaces", " Hello World ", "hello world"},
{"unicode passthrough", "héllo wörld", "héllo wörld"},
{"only spaces", " ", ""},
{"single char", "A", "a"},
{"slashes preserved", "/foo/bar/", "/foo/bar/"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := filefinder.NormalizeQueryForTest(tt.input)
if got != tt.want {
t.Errorf("normalizeQuery(%q) = %q, want %q", tt.input, got, tt.want)
}
})
}
}
func TestExtractTrigrams(t *testing.T) {
t.Parallel()
tests := []struct {
name string
input string
want []uint32
}{
{"too short", "ab", nil},
{"exactly three bytes", "abc", []uint32{uint32('a')<<16 | uint32('b')<<8 | uint32('c')}},
{"case insensitive", "ABC", []uint32{uint32('a')<<16 | uint32('b')<<8 | uint32('c')}},
{"deduplication", "aaaa", []uint32{uint32('a')<<16 | uint32('a')<<8 | uint32('a')}},
{"four bytes produces two trigrams", "abcd", []uint32{
uint32('a')<<16 | uint32('b')<<8 | uint32('c'),
uint32('b')<<16 | uint32('c')<<8 | uint32('d'),
}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := filefinder.ExtractTrigramsForTest([]byte(tt.input))
if !slices.Equal(got, tt.want) {
t.Errorf("extractTrigrams(%q) = %v, want %v", tt.input, got, tt.want)
}
})
}
}
func TestExtractBasename(t *testing.T) {
t.Parallel()
tests := []struct {
name string
path string
wantOff int
wantName string
}{
{"full path", "/foo/bar/baz.go", 9, "baz.go"},
{"bare filename", "baz.go", 0, "baz.go"},
{"trailing slash", "/a/b/", 3, "b"},
{"root slash", "/", 0, ""},
{"empty", "", 0, ""},
{"single dir with slash", "/foo", 1, "foo"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
off, length := filefinder.ExtractBasenameForTest([]byte(tt.path))
if off != tt.wantOff {
t.Errorf("extractBasename(%q) offset = %d, want %d", tt.path, off, tt.wantOff)
}
gotName := string([]byte(tt.path)[off : off+length])
if gotName != tt.wantName {
t.Errorf("extractBasename(%q) name = %q, want %q", tt.path, gotName, tt.wantName)
}
})
}
}
func TestExtractSegments(t *testing.T) {
t.Parallel()
tests := []struct {
name string
path string
want []string
}{
{"absolute path", "/foo/bar/baz", []string{"foo", "bar", "baz"}},
{"relative path", "foo/bar", []string{"foo", "bar"}},
{"trailing slash", "/a/b/", []string{"a", "b"}},
{"multiple slashes", "//a///b//", []string{"a", "b"}},
{"empty", "", nil},
{"single segment", "foo", []string{"foo"}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := filefinder.ExtractSegmentsForTest([]byte(tt.path))
if len(got) != len(tt.want) {
t.Fatalf("extractSegments(%q) got %d segments, want %d", tt.path, len(got), len(tt.want))
}
for i := range got {
if string(got[i]) != tt.want[i] {
t.Errorf("extractSegments(%q)[%d] = %q, want %q", tt.path, i, got[i], tt.want[i])
}
}
})
}
}
func TestPrefix1(t *testing.T) {
t.Parallel()
tests := []struct {
name string
in string
want byte
}{
{"lowercase", "foo", 'f'},
{"uppercase", "Foo", 'f'},
{"empty", "", 0},
{"digit", "1abc", '1'},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := filefinder.Prefix1ForTest([]byte(tt.in))
if got != tt.want {
t.Errorf("prefix1(%q) = %d (%c), want %d (%c)", tt.in, got, got, tt.want, tt.want)
}
})
}
}
func TestPrefix2(t *testing.T) {
t.Parallel()
tests := []struct {
name string
in string
want uint16
}{
{"two chars", "ab", uint16('a')<<8 | uint16('b')},
{"uppercase", "AB", uint16('a')<<8 | uint16('b')},
{"single char", "A", uint16('a') << 8},
{"empty", "", 0},
{"longer string", "Hello", uint16('h')<<8 | uint16('e')},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := filefinder.Prefix2ForTest([]byte(tt.in))
if got != tt.want {
t.Errorf("prefix2(%q) = %d, want %d", tt.in, got, tt.want)
}
})
}
}
func TestNormalizePathBytes(t *testing.T) {
t.Parallel()
tests := []struct {
name string
input string
want string
}{
{"backslash to slash", `C:\Users\test`, "c:/users/test"},
{"collapse slashes", "//foo///bar//", "/foo/bar/"},
{"lowercase", "FooBar", "foobar"},
{"empty", "", ""},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
buf := []byte(tt.input)
got := string(filefinder.NormalizePathBytesForTest(buf))
if got != tt.want {
t.Errorf("normalizePathBytes(%q) = %q, want %q", tt.input, got, tt.want)
}
})
}
}
func TestIsSubsequence(t *testing.T) {
t.Parallel()
tests := []struct {
name string
haystack string
needle string
want bool
}{
{"empty needle", "anything", "", true},
{"empty both", "", "", true},
{"empty haystack", "", "a", false},
{"exact match", "abc", "abc", true},
{"scattered", "axbycz", "abc", true},
{"prefix", "abcdef", "abc", true},
{"suffix", "xyzabc", "abc", true},
{"case insensitive", "AbCdEf", "ace", true},
{"case insensitive reverse", "abcdef", "ACE", true},
{"no match", "abcdef", "xyz", false},
{"partial match", "abcdef", "abz", false},
{"longer needle", "ab", "abc", false},
{"single char match", "hello", "l", true},
{"single char no match", "hello", "z", false},
{"path like", "src/internal/foo.go", "sif", true},
{"path like no match", "src/internal/foo.go", "zzz", false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := filefinder.IsSubsequenceForTest([]byte(tt.haystack), []byte(tt.needle))
if got != tt.want {
t.Errorf("isSubsequence(%q, %q) = %v, want %v", tt.haystack, tt.needle, got, tt.want)
}
})
}
}
func TestLongestContiguousMatch(t *testing.T) {
t.Parallel()
tests := []struct {
name string
haystack string
needle string
want int
}{
{"empty needle", "abc", "", 0},
{"empty haystack", "", "abc", 0},
{"full match", "abc", "abc", 3},
{"prefix match", "abcdef", "abc", 3},
{"middle match", "xxabcyy", "abc", 3},
{"suffix match", "xxabc", "abc", 3},
{"partial", "axbc", "abc", 1},
{"scattered no contiguous", "axbxcx", "abc", 1},
{"case insensitive", "ABCdef", "abc", 3},
{"no match", "xyz", "abc", 0},
{"single char", "abc", "b", 1},
{"repeated", "aababc", "abc", 3},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := filefinder.LongestContiguousMatchForTest([]byte(tt.haystack), []byte(tt.needle))
if got != tt.want {
t.Errorf("longestContiguousMatch(%q, %q) = %d, want %d", tt.haystack, tt.needle, got, tt.want)
}
})
}
}
func TestIsBoundary(t *testing.T) {
t.Parallel()
for _, b := range []byte{'/', '.', '_', '-'} {
if !filefinder.IsBoundaryForTest(b) {
t.Errorf("isBoundary(%q) = false, want true", b)
}
}
for _, b := range []byte{'a', 'Z', '0', ' ', '('} {
if filefinder.IsBoundaryForTest(b) {
t.Errorf("isBoundary(%q) = true, want false", b)
}
}
}
func TestCountBoundaryHits(t *testing.T) {
t.Parallel()
tests := []struct {
name string
path string
query string
want int
}{
{"start of string", "foo/bar", "f", 1},
{"after slash", "foo/bar", "fb", 2},
{"after dot", "foo.bar", "fb", 2},
{"after underscore", "foo_bar", "fb", 2},
{"no hits", "xxxx", "y", 0},
{"empty query", "foo", "", 0},
{"empty path", "", "f", 0},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
got := filefinder.CountBoundaryHitsForTest([]byte(tt.path), []byte(tt.query))
if got != tt.want {
t.Errorf("countBoundaryHits(%q, %q) = %d, want %d", tt.path, tt.query, got, tt.want)
}
})
}
}
func TestScorePath_NoSubsequenceReturnsZero(t *testing.T) {
t.Parallel()
path := []byte("src/internal/handler.go")
query := []byte("zzz")
tokens := [][]byte{[]byte("zzz")}
params := filefinder.DefaultScoreParamsForTest()
s := filefinder.ScorePathForTest(path, 13, 10, 2, query, tokens, params)
if s != 0 {
t.Errorf("expected 0 for no subsequence match, got %f", s)
}
}
func TestScorePath_ExactBasenameOverPartial(t *testing.T) {
t.Parallel()
params := filefinder.DefaultScoreParamsForTest()
query := []byte("main")
tokens := [][]byte{query}
pathExact := []byte("src/main")
scoreExact := filefinder.ScorePathForTest(pathExact, 4, 4, 1, query, tokens, params)
pathPartial := []byte("module/amazing")
scorePartial := filefinder.ScorePathForTest(pathPartial, 7, 7, 1, query, tokens, params)
if scoreExact <= scorePartial {
t.Errorf("exact basename (%f) should score higher than partial (%f)", scoreExact, scorePartial)
}
}
func TestScorePath_BasenamePrefixOverScattered(t *testing.T) {
t.Parallel()
params := filefinder.DefaultScoreParamsForTest()
query := []byte("han")
tokens := [][]byte{query}
pathPrefix := []byte("src/handler.go")
scorePrefix := filefinder.ScorePathForTest(pathPrefix, 4, 10, 1, query, tokens, params)
pathScattered := []byte("has/another/thing")
scoreScattered := filefinder.ScorePathForTest(pathScattered, 12, 5, 2, query, tokens, params)
if scorePrefix <= scoreScattered {
t.Errorf("basename prefix (%f) should score higher than scattered (%f)", scorePrefix, scoreScattered)
}
}
func TestScorePath_ShallowOverDeep(t *testing.T) {
t.Parallel()
params := filefinder.DefaultScoreParamsForTest()
query := []byte("foo")
tokens := [][]byte{query}
pathShallow := []byte("src/foo.go")
scoreShallow := filefinder.ScorePathForTest(pathShallow, 4, 6, 1, query, tokens, params)
pathDeep := []byte("a/b/c/d/e/foo.go")
scoreDeep := filefinder.ScorePathForTest(pathDeep, 10, 6, 5, query, tokens, params)
if scoreShallow <= scoreDeep {
t.Errorf("shallow path (%f) should score higher than deep (%f)", scoreShallow, scoreDeep)
}
}
func TestScorePath_ShorterOverLongerSameMatch(t *testing.T) {
t.Parallel()
params := filefinder.DefaultScoreParamsForTest()
query := []byte("foo")
tokens := [][]byte{query}
pathShort := []byte("x/foo")
scoreShort := filefinder.ScorePathForTest(pathShort, 2, 3, 1, query, tokens, params)
pathLong := []byte("x/foo_extremely_long_suffix_name")
scoreLong := filefinder.ScorePathForTest(pathLong, 2, 29, 1, query, tokens, params)
if scoreShort <= scoreLong {
t.Errorf("shorter path (%f) should score higher than longer (%f)", scoreShort, scoreLong)
}
}
func BenchmarkScorePath(b *testing.B) {
path := []byte("src/internal/coderd/database/queries/workspaces.sql")
query := []byte("workspace")
tokens := [][]byte{query}
params := filefinder.DefaultScoreParamsForTest()
baseOff, baseLen := filefinder.ExtractBasenameForTest(path)
s := filefinder.ScorePathForTest(path, baseOff, baseLen, 4, query, tokens, params)
if s == 0 {
b.Fatal("expected non-zero score for benchmark path")
}
b.ResetTimer()
for b.Loop() {
filefinder.ScorePathForTest(path, baseOff, baseLen, 4, query, tokens, params)
}
}