Skip to content

Commit df7a7e7

Browse files
authored
Simplify trigram selection in distanceHitIterator (#782)
Follow up to #779. This PR removes the logic for trigrams with the same frequency, because it will no longer have a big effect.
1 parent fe8f2a3 commit df7a7e7

1 file changed

Lines changed: 16 additions & 36 deletions

File tree

indexdata.go

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -320,26 +320,9 @@ func (d *indexData) memoryUse() int {
320320
return sz
321321
}
322322

323-
const maxUInt32 = 0xffffffff
324-
325-
func min2Index(xs []uint32) (idx0, idx1 int) {
326-
min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
327-
for i, x := range xs {
328-
if x <= min0 {
329-
idx0, idx1 = i, idx0
330-
min0, min1 = x, min0
331-
} else if x <= min1 {
332-
idx1 = i
333-
min1 = x
334-
}
335-
}
336-
return
337-
}
338-
339323
// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
340-
// produce a small file intersection. It finds the two lowest frequency ngrams, making
341-
// sure to maximize the distance between them in case of ties. It avoids overlapping
342-
// trigrams to keep their intersection as small as possible.
324+
// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
325+
// overlapping trigrams to keep their intersection as small as possible.
343326
//
344327
// Invariant: first will always have a smaller index than last.
345328
func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
@@ -361,27 +344,24 @@ func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies [
361344
return
362345
}
363346

347+
const maxUInt32 = 0xffffffff
348+
364349
func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
365-
firstI, lastI := min2Index(frequencies)
366-
// If the frequencies are equal lets maximise distance in the query
367-
// string. This optimization normally triggers for long repeated trigrams
368-
// in a string, eg a query like "AAAAA..."
369-
if frequencies[firstI] == frequencies[lastI] {
370-
for i, freq := range frequencies {
371-
if freq != frequencies[firstI] {
372-
continue
373-
}
374-
if ngramOffs[i].index < ngramOffs[firstI].index {
375-
firstI = i
376-
}
377-
if ngramOffs[i].index > ngramOffs[lastI].index {
378-
lastI = i
379-
}
350+
// Find the two lowest frequency ngrams.
351+
idx0, idx1 := 0, 0
352+
min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
353+
for i, x := range frequencies {
354+
if x <= min0 {
355+
idx0, idx1 = i, idx0
356+
min0, min1 = x, min0
357+
} else if x <= min1 {
358+
idx1 = i
359+
min1 = x
380360
}
381361
}
382362

383-
first = ngramOffs[firstI]
384-
last = ngramOffs[lastI]
363+
first = ngramOffs[idx0]
364+
last = ngramOffs[idx1]
385365

386366
// Ensure first appears before last as a helpful invariant.
387367
if first.index > last.index {

0 commit comments

Comments
 (0)