Skip to content

Commit 12ce07a

Browse files
authored
index: experiment to limit ngram lookups for large snippets (#795)
This introduces an experiment where we can stop looking up ngrams at a certain limit. The insight here is that for large substrings we spend more time finding the smallest ngram frequency than the time a normal search takes. So instead we can try and find a good balance between looking for a good (two) ngrams and actually searching the corpus. The plan is to set different values for SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT in sourcegraph production and see how it affects performance of attribution search service. Test Plan: ran all tests with the envvar set to 2. I expected tests that assert on stats to fail, but everything else to pass. This was the case. SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT=2 go test ./...
1 parent 5ac92b1 commit 12ce07a

2 files changed

Lines changed: 35 additions & 2 deletions

File tree

bits.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,14 +124,19 @@ func (a runeNgramOff) Compare(b runeNgramOff) int {
124124
}
125125

126126
func splitNGrams(str []byte) []runeNgramOff {
127+
// len(maxNgrams) >= the number of ngrams in str => no limit
128+
return splitNGramsLimit(str, len(str))
129+
}
130+
131+
func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
127132
var runeGram [3]rune
128133
var off [3]uint32
129134
var runeCount int
130135

131136
result := make([]runeNgramOff, 0, len(str))
132137
var i uint32
133138

134-
for len(str) > 0 {
139+
for len(str) > 0 && len(result) < maxNgrams {
135140
r, sz := utf8.DecodeRune(str)
136141
str = str[sz:]
137142
runeGram[0] = runeGram[1]

indexdata.go

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ import (
2121
"hash/crc64"
2222
"log"
2323
"math/bits"
24+
"os"
2425
"slices"
26+
"strconv"
2527
"unicode/utf8"
2628

2729
"github.com/sourcegraph/zoekt/query"
@@ -401,11 +403,37 @@ func (r *ngramIterationResults) candidates() []*candidateMatch {
401403
return cs
402404
}
403405

406+
// experimentIterateNgramLookupLimit when non-zero will only lookup this many
407+
// ngrams from a query string. Note: that if case-insensitive, this only
408+
// limits the input. So we will still lookup the case folding.
409+
//
410+
// This experiment is targetting looking up large snippets. If it is
411+
// successful, we will likely hardcode the value we use in production.
412+
//
413+
// Future note: if we find cases where this works badly, we can consider only
414+
// searching a random subset of the query string to avoid bad strings.
415+
var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
416+
417+
func getEnvInt(k string) int {
418+
v, _ := strconv.Atoi(os.Getenv(k))
419+
if v != 0 {
420+
log.Printf("%s = %d\n", k, v)
421+
}
422+
return v
423+
}
424+
404425
func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
405426
str := query.Pattern
406427

407428
// Find the 2 least common ngrams from the string.
408-
ngramOffs := splitNGrams([]byte(query.Pattern))
429+
var ngramOffs []runeNgramOff
430+
if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
431+
// Note: we can't just do str = str[:ngramLimit] due to utf-8 and str
432+
// length is asked later on for other optimizations.
433+
ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
434+
} else {
435+
ngramOffs = splitNGrams([]byte(str))
436+
}
409437

410438
// protect against accidental searching of empty strings
411439
if len(ngramOffs) == 0 {

0 commit comments

Comments
 (0)