@@ -336,9 +336,31 @@ func min2Index(xs []uint32) (idx0, idx1 int) {
336336 return
337337}
338338
339- // minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to
340- // the distance iterator. If they have the same frequency, we maximise the
341- // distance between them. first will always have a smaller index than last.
339+ // findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
340+ // produce a small file intersection. It finds the two lowest frequency ngrams, making
341+ // sure to maximize the distance between them in case of ties. It avoids overlapping
342+ // trigrams to keep their intersection as small as possible.
343+ //
344+ // Invariant: first will always have a smaller index than last.
345+ func findSelectiveNgrams (ngramOffs []runeNgramOff , indexMap []int , frequencies []uint32 ) (first , last runeNgramOff ) {
346+ first , last = minFrequencyNgramOffsets (ngramOffs , frequencies )
347+
348+ // If the trigrams are overlapping, then try to shift one to reduce overlap.
349+ // This is guaranteed to produce a smaller intersection.
350+ if last .index - first .index < ngramSize {
351+ newFirstIndex := max (last .index - ngramSize , 0 )
352+ if newFirstIndex != first .index {
353+ first = ngramOffs [indexMap [newFirstIndex ]]
354+ }
355+
356+ newLastIndex := min (first .index + ngramSize , len (ngramOffs )- 1 )
357+ if newLastIndex != last .index {
358+ last = ngramOffs [indexMap [newLastIndex ]]
359+ }
360+ }
361+ return
362+ }
363+
342364func minFrequencyNgramOffsets (ngramOffs []runeNgramOff , frequencies []uint32 ) (first , last runeNgramOff ) {
343365 firstI , lastI := min2Index (frequencies )
344366 // If the frequencies are equal lets maximise distance in the query
@@ -357,13 +379,15 @@ func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (f
357379 }
358380 }
359381 }
382+
360383 first = ngramOffs [firstI ]
361384 last = ngramOffs [lastI ]
362- // Ensure first appears before last to make distance logic below clean.
385+
386+ // Ensure first appears before last as a helpful invariant.
363387 if first .index > last .index {
364388 last , first = first , last
365389 }
366- return first , last
390+ return
367391}
368392
369393func (data * indexData ) ngrams (filename bool ) btreeIndex {
@@ -412,9 +436,10 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
412436 // bucket (which can cause disk IO).
413437 slices .SortFunc (ngramOffs , runeNgramOff .Compare )
414438 frequencies := make ([]uint32 , 0 , len (ngramOffs ))
439+ indexMap := make ([]int , len (ngramOffs ))
415440 ngramLookups := 0
416441 ngrams := d .ngrams (query .FileName )
417- for _ , o := range ngramOffs {
442+ for i , o := range ngramOffs {
418443 var freq uint32
419444 if query .CaseSensitive {
420445 freq = ngrams .Get (o .ngram ).sz
@@ -438,15 +463,14 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
438463 }
439464
440465 frequencies = append (frequencies , freq )
466+ indexMap [o .index ] = i
441467 }
442468
443- // first and last are now the smallest trigram posting lists to iterate
444- // through.
445- first , last := minFrequencyNgramOffsets (ngramOffs , frequencies )
469+ first , last := findSelectiveNgrams (ngramOffs , indexMap , frequencies )
446470
447471 iter := & ngramDocIterator {
448- leftPad : first .index ,
449- rightPad : uint32 (utf8 .RuneCountInString (str )) - first .index ,
472+ leftPad : uint32 ( first .index ) ,
473+ rightPad : uint32 (utf8 .RuneCountInString (str ) - first .index ) ,
450474 ngramLookups : ngramLookups ,
451475 }
452476 if query .FileName {
@@ -456,7 +480,7 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
456480 }
457481
458482 if first != last {
459- runeDist := last .index - first .index
483+ runeDist := uint32 ( last .index - first .index )
460484 i , err := d .newDistanceTrigramIter (first .ngram , last .ngram , runeDist , query .CaseSensitive , query .FileName )
461485 if err != nil {
462486 return nil , err
0 commit comments