Skip to content

Commit fe8f2a3

Browse files
authored
Avoid overlapping trigrams in distanceHitIterator (#779)
We select the two least frequent trigrams to create the candidate match iterator. It's common for these trigrams to overlap. This change shifts the first and last trigrams to avoid overlap, which is guaranteed to result in a smaller intersection. For frequent terms, this can substantially reduce the number of candidate matches we consider.
1 parent 4e674a4 commit fe8f2a3

4 files changed

Lines changed: 68 additions & 16 deletions

File tree

bits.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ func (n ngram) String() string {
110110
type runeNgramOff struct {
111111
ngram ngram
112112
// index is the original index inside of the returned array of splitNGrams
113-
index uint32
113+
index int
114114
}
115115

116116
func (a runeNgramOff) Compare(b runeNgramOff) int {
@@ -149,7 +149,7 @@ func splitNGrams(str []byte) []runeNgramOff {
149149
ng := runesToNGram(runeGram)
150150
result = append(result, runeNgramOff{
151151
ngram: ng,
152-
index: uint32(len(result)),
152+
index: len(result),
153153
})
154154
}
155155
return result

index_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ func TestSearchStats(t *testing.T) {
441441
Want: Stats{
442442
FilesLoaded: 1,
443443
ContentBytesLoaded: 22,
444-
IndexBytesLoaded: 8,
444+
IndexBytesLoaded: 10,
445445
NgramMatches: 3, // we look at doc 1, because it's max(0,1) due to AND
446446
NgramLookups: 104,
447447
MatchCount: 2,
@@ -556,7 +556,7 @@ func TestSearchStats(t *testing.T) {
556556
}},
557557
Want: Stats{
558558
ContentBytesLoaded: 33, // we still have to run regex since "app" matches two documents
559-
IndexBytesLoaded: 8,
559+
IndexBytesLoaded: 10,
560560
FilesConsidered: 2, // important that we don't check 3 to ensure we are using the index
561561
FilesLoaded: 2,
562562
MatchCount: 0, // even though there is a match it doesn't align with a symbol

indexdata.go

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,31 @@ func min2Index(xs []uint32) (idx0, idx1 int) {
336336
return
337337
}
338338

339-
// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to
340-
// the distance iterator. If they have the same frequency, we maximise the
341-
// distance between them. first will always have a smaller index than last.
339+
// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
340+
// produce a small file intersection. It finds the two lowest frequency ngrams, making
341+
// sure to maximize the distance between them in case of ties. It avoids overlapping
342+
// trigrams to keep their intersection as small as possible.
343+
//
344+
// Invariant: first will always have a smaller index than last.
345+
func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
346+
first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)
347+
348+
// If the trigrams are overlapping, then try to shift one to reduce overlap.
349+
// This is guaranteed to produce a smaller intersection.
350+
if last.index-first.index < ngramSize {
351+
newFirstIndex := max(last.index-ngramSize, 0)
352+
if newFirstIndex != first.index {
353+
first = ngramOffs[indexMap[newFirstIndex]]
354+
}
355+
356+
newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
357+
if newLastIndex != last.index {
358+
last = ngramOffs[indexMap[newLastIndex]]
359+
}
360+
}
361+
return
362+
}
363+
342364
func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
343365
firstI, lastI := min2Index(frequencies)
344366
// If the frequencies are equal lets maximise distance in the query
@@ -357,13 +379,15 @@ func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (f
357379
}
358380
}
359381
}
382+
360383
first = ngramOffs[firstI]
361384
last = ngramOffs[lastI]
362-
// Ensure first appears before last to make distance logic below clean.
385+
386+
// Ensure first appears before last as a helpful invariant.
363387
if first.index > last.index {
364388
last, first = first, last
365389
}
366-
return first, last
390+
return
367391
}
368392

369393
func (data *indexData) ngrams(filename bool) btreeIndex {
@@ -412,9 +436,10 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
412436
// bucket (which can cause disk IO).
413437
slices.SortFunc(ngramOffs, runeNgramOff.Compare)
414438
frequencies := make([]uint32, 0, len(ngramOffs))
439+
indexMap := make([]int, len(ngramOffs))
415440
ngramLookups := 0
416441
ngrams := d.ngrams(query.FileName)
417-
for _, o := range ngramOffs {
442+
for i, o := range ngramOffs {
418443
var freq uint32
419444
if query.CaseSensitive {
420445
freq = ngrams.Get(o.ngram).sz
@@ -438,15 +463,14 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
438463
}
439464

440465
frequencies = append(frequencies, freq)
466+
indexMap[o.index] = i
441467
}
442468

443-
// first and last are now the smallest trigram posting lists to iterate
444-
// through.
445-
first, last := minFrequencyNgramOffsets(ngramOffs, frequencies)
469+
first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
446470

447471
iter := &ngramDocIterator{
448-
leftPad: first.index,
449-
rightPad: uint32(utf8.RuneCountInString(str)) - first.index,
472+
leftPad: uint32(first.index),
473+
rightPad: uint32(utf8.RuneCountInString(str) - first.index),
450474
ngramLookups: ngramLookups,
451475
}
452476
if query.FileName {
@@ -456,7 +480,7 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
456480
}
457481

458482
if first != last {
459-
runeDist := last.index - first.index
483+
runeDist := uint32(last.index - first.index)
460484
i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
461485
if err != nil {
462486
return nil, err

indexdata_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,31 @@ func TestMinFrequencyNgramOffsets(t *testing.T) {
7272
t.Fatal(err)
7373
}
7474
}
75+
76+
func TestFindSelectiveNGrams(t *testing.T) {
77+
if err := quick.Check(func(s string, maxFreq uint16) bool {
78+
ngramOffs := splitNGrams([]byte(s))
79+
if len(ngramOffs) == 0 {
80+
return true
81+
}
82+
83+
slices.SortFunc(ngramOffs, runeNgramOff.Compare)
84+
indexMap := make([]int, len(ngramOffs))
85+
for i, n := range ngramOffs {
86+
indexMap[n.index] = i
87+
}
88+
89+
frequencies := genFrequencies(ngramOffs, int(maxFreq))
90+
x0, x1 := findSelectiveNgrams(ngramOffs, indexMap, frequencies)
91+
92+
if len(ngramOffs) <= 1 {
93+
return true
94+
}
95+
96+
// Just assert the invariant that x0 is before x1. This test mostly checks
97+
// for out-of-bounds errors.
98+
return x0.index < x1.index
99+
}, nil); err != nil {
100+
t.Fatal(err)
101+
}
102+
}

0 commit comments

Comments
 (0)