index: use a random sample of ngrams when limiting (#797)

keegancsmith · web-flow · commit ebb3ca24249d · 2024-07-29T14:53:13.000+02:00
The first bit of data I am getting back indicates this strategy of
limiting the number of ngrams we lookup isn't working. I am still
experimenting with different limits, but in the meantime it is easy to
implement a strategy which picks a random subset. This is so that the
first N ngrams of a query aren't the only ones being consulted.

Test Plan: ran all tests with the envvar set to 2. I expected tests that
assert on stats to fail, but everything else to pass. This was the case.

  SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT=2 go test ./...
diff --git a/bits.go b/bits.go
@@ -18,6 +18,8 @@ import (
 	"cmp"
 	"encoding/binary"
 	"math"
+	"math/rand/v2"
+	"slices"
 	"sort"
 	"unicode"
 	"unicode/utf8"
@@ -136,7 +138,7 @@ func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
 	result := make([]runeNgramOff, 0, len(str))
 	var i uint32
 
-	for len(str) > 0 && len(result) < maxNgrams {
+	for len(str) > 0 {
 		r, sz := utf8.DecodeRune(str)
 		str = str[sz:]
 		runeGram[0] = runeGram[1]
@@ -157,6 +159,22 @@ func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
 			index: len(result),
 		})
 	}
+
+	// We return a random subset of size maxNgrams. This is to prevent the start
+	// of the string biasing ngram selection.
+	if maxNgrams < len(result) {
+		//  Deterministic seed for tests. Additionally makes comparing repeated
+		//  queries performance easier.
+		r := rand.New(rand.NewPCG(uint64(maxNgrams), 0))
+
+		// Pick random subset via a shuffle
+		r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] })
+		result = result[:maxNgrams]
+
+		// Caller expects ngrams in order of appearance.
+		slices.SortFunc(result, runeNgramOff.Compare)
+	}
+
 	return result
 }