index: experiment to limit ngram lookups for large snippets (#795)

keegancsmith · web-flow · commit 12ce07a298ae · 2024-07-26T18:01:08.000+02:00
This introduces an experiment where we can stop looking up ngrams once a
certain limit is reached. The insight here is that for large substrings we
spend more time finding the smallest ngram frequency than the time a normal
search takes. So instead we can try to find a good balance between
looking for two good ngrams and actually searching the corpus.

The plan is to set different values for
SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT in sourcegraph production and
see how it affects the performance of the attribution search service.

Test Plan: ran all tests with the envvar set to 2. I expected tests that
assert on stats to fail, but everything else to pass. This was the case.

  SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT=2 go test ./...
diff --git a/bits.go b/bits.go
@@ -124,14 +124,19 @@ func (a runeNgramOff) Compare(b runeNgramOff) int {
 }
 
 func splitNGrams(str []byte) []runeNgramOff {
+	// len(str) >= the number of ngrams in str => no limit
+	return splitNGramsLimit(str, len(str))
+}
+
+func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
 	var runeGram [3]rune
 	var off [3]uint32
 	var runeCount int
 
 	result := make([]runeNgramOff, 0, len(str))
 	var i uint32
 
-	for len(str) > 0 {
+	for len(str) > 0 && len(result) < maxNgrams {
 		r, sz := utf8.DecodeRune(str)
 		str = str[sz:]
 		runeGram[0] = runeGram[1]
diff --git a/indexdata.go b/indexdata.go
@@ -21,7 +21,9 @@ import (
 	"hash/crc64"
 	"log"
 	"math/bits"
+	"os"
 	"slices"
+	"strconv"
 	"unicode/utf8"
 
 	"github.com/sourcegraph/zoekt/query"
@@ -401,11 +403,37 @@ func (r *ngramIterationResults) candidates() []*candidateMatch {
 	return cs
 }
 
+// experimentIterateNgramLookupLimit when positive will only look up this many
+// ngrams from a query string. Note that if the query is case-insensitive,
+// this only limits the input, so we will still look up the case foldings.
+//
+// This experiment is targeting lookups of large snippets. If it is
+// successful, we will likely hardcode the value we use in production.
+//
+// Future note: if we find cases where this works badly, we can consider only
+// searching a random subset of the query string to avoid bad strings.
+var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
+
+func getEnvInt(k string) int {
+	v, _ := strconv.Atoi(os.Getenv(k))
+	if v != 0 {
+		log.Printf("%s = %d\n", k, v)
+	}
+	return v
+}
+
 func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
 	str := query.Pattern
 
 	// Find the 2 least common ngrams from the string.
-	ngramOffs := splitNGrams([]byte(query.Pattern))
+	var ngramOffs []runeNgramOff
+	if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
+		// Note: we can't just do str = str[:ngramLimit] because of utf-8, and
+		// because len(str) is used later on for other optimizations.
+		ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
+	} else {
+		ngramOffs = splitNGrams([]byte(str))
+	}
 
 	// protect against accidental searching of empty strings
 	if len(ngramOffs) == 0 {