Files
Kyle Carberry 68e4155fed feat(agent/filefinder): add plocate-lite file finder package (#22453)
Adds an in-memory trigram-indexed file finder package at
`agent/filefinder`, designed to power a future `FindFiles` HTTP handler
on the WorkspaceAgent.

## What it does

Fast fuzzy file search with VS Code-quality matching across millions of
files. Sub-millisecond search latency at 100K files.

## Architecture

- **Index**: append-only docs slice with trigram + prefix posting lists
- **Snapshot**: lock-free reader view via frozen slice headers +
shallow-copied deleted set
- **Search pipeline**: trigram intersection → fuzzy fallback (prefix
bucket + subsequence) → brute-force scan (capped at 5K docs)
- **Scoring**: subsequence match, basename prefix, boundary hits,
contiguous runs, depth/length penalties
- **Engine**: multi-root with fsnotify watcher (50ms batch coalescing),
atomic snapshot publishing

## Benchmarks (10K files)

| Query Type | Latency |
|---|---|
| exact_basename (`handler.go`) | ~43µs |
| short_query (`ha`) | ~7µs |
| fuzzy_basename (`hndlr`) | ~50µs |
| path_structured (`internal/handler`) | ~29µs |
| multi_token (`api handler`) | ~15µs |

## File inventory (11 files, 3273 lines)

| File | Lines | Purpose |
|---|---|---|
| `text.go` | 264 | Normalization, trigram extraction, scoring |
| `delta.go` | 128 | Index, Snapshot, CRUD operations |
| `query.go` | 272 | Query planning, search strategies, top-K merge |
| `engine.go` | 323 | Multi-root engine, watcher integration |
| `watcher_fs.go` | 201 | fsnotify wrapper with batch coalescing |
| `*_test.go` | 2085 | Unit tests, integration tests, benchmarks |

---------

Co-authored-by: Coder <coder@users.noreply.github.com>
2026-02-28 23:37:07 -05:00

300 lines
6.6 KiB
Go

package filefinder
import (
"container/heap"
"slices"
"strings"
)
type candidate struct {
DocID uint32
Path string
BaseOff int
BaseLen int
Depth int
Flags uint16
}
// Result is a scored search result returned to callers.
type Result struct {
Path string
Score float32
IsDir bool
}
type queryPlan struct {
Original string
Normalized string
Tokens [][]byte
Trigrams []uint32
IsShort bool
HasSlash bool
BasenameQ []byte
DirTokens [][]byte
}
func newQueryPlan(q string) *queryPlan {
norm := normalizeQuery(q)
p := &queryPlan{Original: q, Normalized: norm}
if len(norm) == 0 {
p.IsShort = true
return p
}
raw := strings.ReplaceAll(norm, "/", " ")
parts := strings.Fields(raw)
p.HasSlash = strings.ContainsRune(norm, '/')
for _, part := range parts {
p.Tokens = append(p.Tokens, []byte(part))
}
if len(p.Tokens) > 0 {
p.BasenameQ = p.Tokens[len(p.Tokens)-1]
if len(p.Tokens) > 1 {
p.DirTokens = p.Tokens[:len(p.Tokens)-1]
}
}
p.IsShort = true
for _, tok := range p.Tokens {
if len(tok) >= 3 {
p.IsShort = false
break
}
}
if !p.IsShort {
p.Trigrams = extractQueryTrigrams(p.Tokens)
}
return p
}
func extractQueryTrigrams(tokens [][]byte) []uint32 {
seen := make(map[uint32]struct{})
for _, tok := range tokens {
if len(tok) < 3 {
continue
}
for i := 0; i <= len(tok)-3; i++ {
seen[packTrigram(tok[i], tok[i+1], tok[i+2])] = struct{}{}
}
}
if len(seen) == 0 {
return nil
}
result := make([]uint32, 0, len(seen))
for g := range seen {
result = append(result, g)
}
return result
}
func packTrigram(a, b, c byte) uint32 {
return uint32(toLowerASCII(a))<<16 | uint32(toLowerASCII(b))<<8 | uint32(toLowerASCII(c))
}
// searchSnapshot runs the full search pipeline against a single
// root snapshot: it selects a strategy (prefix, trigram, or
// fuzzy fallback) based on query length, retrieves candidate
// doc IDs, and converts them into candidate structs.
func searchSnapshot(plan *queryPlan, snap *Snapshot, limit int) []candidate {
if snap == nil || len(snap.docs) == 0 || len(plan.Normalized) == 0 {
return nil
}
var ids []uint32
if plan.IsShort {
ids = searchShort(plan, snap)
} else {
ids = searchTrigrams(plan, snap)
if len(ids) == 0 && len(plan.BasenameQ) > 0 {
ids = searchFuzzyFallback(plan, snap)
}
}
if len(ids) == 0 {
return nil
}
cands := make([]candidate, 0, min(len(ids), limit))
for _, id := range ids {
if snap.deleted[id] || int(id) >= len(snap.docs) {
continue
}
d := snap.docs[id]
cands = append(cands, candidate{
DocID: id, Path: d.path, BaseOff: d.baseOff,
BaseLen: d.baseLen, Depth: d.depth, Flags: d.flags,
})
if len(cands) >= limit {
break
}
}
return cands
}
func searchShort(plan *queryPlan, snap *Snapshot) []uint32 {
if len(plan.BasenameQ) == 0 {
return nil
}
if len(plan.BasenameQ) >= 2 {
if ids := snap.byPrefix2[prefix2(plan.BasenameQ)]; len(ids) > 0 {
return ids
}
}
return snap.byPrefix1[prefix1(plan.BasenameQ)]
}
func searchTrigrams(plan *queryPlan, snap *Snapshot) []uint32 {
if len(plan.Trigrams) == 0 {
return nil
}
lists := make([][]uint32, 0, len(plan.Trigrams))
for _, g := range plan.Trigrams {
ids, ok := snap.byGram[g]
if !ok || len(ids) == 0 {
return nil
}
lists = append(lists, ids)
}
return intersectAll(lists)
}
func searchFuzzyFallback(plan *queryPlan, snap *Snapshot) []uint32 {
if len(plan.BasenameQ) == 0 {
return nil
}
bucket := snap.byPrefix1[prefix1(plan.BasenameQ)]
if len(bucket) == 0 {
return searchSubsequenceScan(plan, snap, 5000)
}
var ids []uint32
for _, id := range bucket {
if snap.deleted[id] || int(id) >= len(snap.docs) {
continue
}
if isSubsequence([]byte(snap.docs[id].path), plan.BasenameQ) {
ids = append(ids, id)
}
}
if len(ids) == 0 {
return searchSubsequenceScan(plan, snap, 5000)
}
return ids
}
func searchSubsequenceScan(plan *queryPlan, snap *Snapshot, maxCheck int) []uint32 {
if len(plan.BasenameQ) == 0 {
return nil
}
var ids []uint32
checked := 0
for id := 0; id < len(snap.docs) && checked < maxCheck; id++ {
uid := uint32(id) //nolint:gosec // Snapshot count is bounded well below 2^32.
if snap.deleted[uid] {
continue
}
checked++
if isSubsequence([]byte(snap.docs[id].path), plan.BasenameQ) {
ids = append(ids, uid)
}
}
return ids
}
func intersectSorted(a, b []uint32) []uint32 {
if len(a) == 0 || len(b) == 0 {
return nil
}
var result []uint32
ai, bi := 0, 0
for ai < len(a) && bi < len(b) {
switch {
case a[ai] < b[bi]:
ai++
case a[ai] > b[bi]:
bi++
default:
result = append(result, a[ai])
ai++
bi++
}
}
return result
}
func intersectAll(lists [][]uint32) []uint32 {
if len(lists) == 0 {
return nil
}
if len(lists) == 1 {
return lists[0]
}
slices.SortFunc(lists, func(a, b []uint32) int { return len(a) - len(b) })
result := lists[0]
for i := 1; i < len(lists) && len(result) > 0; i++ {
result = intersectSorted(result, lists[i])
}
return result
}
func mergeAndScore(cands []candidate, plan *queryPlan, params scoreParams, topK int) []Result {
if topK <= 0 || len(cands) == 0 {
return nil
}
query := []byte(plan.Normalized)
h := &resultHeap{}
heap.Init(h)
for i := range cands {
c := &cands[i]
s := scorePath([]byte(c.Path), c.BaseOff, c.BaseLen, c.Depth, query, plan.Tokens, params)
if s <= 0 {
continue
}
// DirTokenHit is applied here rather than in scorePath because
// it depends on the query plan's directory tokens, which are
// split from the full query during planning. scorePath operates
// on raw query bytes without knowledge of token boundaries.
if len(plan.DirTokens) > 0 {
segments := extractSegments([]byte(c.Path))
for _, dt := range plan.DirTokens {
for _, seg := range segments {
if equalFoldASCII(seg, dt) {
s += params.DirTokenHit
break
}
}
}
}
r := Result{Path: c.Path, Score: s, IsDir: c.Flags == uint16(FlagDir)}
if h.Len() < topK {
heap.Push(h, r)
} else if s > (*h)[0].Score {
(*h)[0] = r
heap.Fix(h, 0)
}
}
n := h.Len()
results := make([]Result, n)
for i := n - 1; i >= 0; i-- {
v := heap.Pop(h)
if r, ok := v.(Result); ok {
results[i] = r
}
}
return results
}
type resultHeap []Result
func (h resultHeap) Len() int { return len(h) }
func (h resultHeap) Less(i, j int) bool { return h[i].Score < h[j].Score }
func (h resultHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *resultHeap) Push(x interface{}) {
r, ok := x.(Result)
if ok {
*h = append(*h, r)
}
}
func (h *resultHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[:n-1]
return x
}