Skip to content

Commit c01b6c7

Browse files
authored
remove SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT (#800)
After defaulting to shard merging for all inactive repos, this experiment in fact makes searches slightly slower, so we can remove it.

Test Plan: go test
1 parent ebb3ca2 commit c01b6c7

2 files changed

Lines changed: 1 addition & 51 deletions

File tree

bits.go

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@ import (
1818
"cmp"
1919
"encoding/binary"
2020
"math"
21-
"math/rand/v2"
22-
"slices"
2321
"sort"
2422
"unicode"
2523
"unicode/utf8"
@@ -126,11 +124,6 @@ func (a runeNgramOff) Compare(b runeNgramOff) int {
126124
}
127125

128126
func splitNGrams(str []byte) []runeNgramOff {
129-
// len(str) >= the number of ngrams in str => no limit
130-
return splitNGramsLimit(str, len(str))
131-
}
132-
133-
func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
134127
var runeGram [3]rune
135128
var off [3]uint32
136129
var runeCount int
@@ -160,21 +153,6 @@ func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
160153
})
161154
}
162155

163-
// We return a random subset of size maxNgrams. This is to prevent the start
164-
// of the string biasing ngram selection.
165-
if maxNgrams < len(result) {
166-
// Deterministic seed for tests. Additionally makes comparing repeated
167-
// queries performance easier.
168-
r := rand.New(rand.NewPCG(uint64(maxNgrams), 0))
169-
170-
// Pick random subset via a shuffle
171-
r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] })
172-
result = result[:maxNgrams]
173-
174-
// Caller expects ngrams in order of appearance.
175-
slices.SortFunc(result, runeNgramOff.Compare)
176-
}
177-
178156
return result
179157
}
180158

indexdata.go

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ import (
2121
"hash/crc64"
2222
"log"
2323
"math/bits"
24-
"os"
2524
"slices"
26-
"strconv"
2725
"unicode/utf8"
2826

2927
"github.com/sourcegraph/zoekt/query"
@@ -403,37 +401,11 @@ func (r *ngramIterationResults) candidates() []*candidateMatch {
403401
return cs
404402
}
405403

406-
// experimentIterateNgramLookupLimit when non-zero will only lookup this many
407-
// ngrams from a query string. Note that if case-insensitive, this only
408-
// limits the input. So we will still lookup the case folding.
409-
//
410-
// This experiment is targeting looking up large snippets. If it is
411-
// successful, we will likely hardcode the value we use in production.
412-
//
413-
// Future note: if we find cases where this works badly, we can consider only
414-
// searching a random subset of the query string to avoid bad strings.
415-
var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
416-
417-
func getEnvInt(k string) int {
418-
v, _ := strconv.Atoi(os.Getenv(k))
419-
if v != 0 {
420-
log.Printf("%s = %d\n", k, v)
421-
}
422-
return v
423-
}
424-
425404
func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
426405
str := query.Pattern
427406

428407
// Find the 2 least common ngrams from the string.
429-
var ngramOffs []runeNgramOff
430-
if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
431-
// Note: we can't just do str = str[:ngramLimit] due to utf-8 and str
432-
// length is asked later on for other optimizations.
433-
ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
434-
} else {
435-
ngramOffs = splitNGrams([]byte(str))
436-
}
408+
ngramOffs := splitNGrams([]byte(str))
437409

438410
// protect against accidental searching of empty strings
439411
if len(ngramOffs) == 0 {

0 commit comments

Comments
 (0)