Skip to content

Commit 45f608f

Browse files
sort ngrams before looking them up (#617)
We believe this will improve performance of the btree lookups. We are investigating this to make it faster to rule out a shard (when freq==0). Testing locally on a large corpus we halved the time spent in IO. Locally Sort shows up in the profiles significantly, but there are two facts mitigating that: - Locally my file page cache is primed so IO rarely is going to disk. - We likely will implement an IR for Zoekt which will amortize the Sort to once per search rather than once per shard. Test Plan: go test ./... and performance profiling via via ./cmd/zoekt. Co-authored-by: Stefan Hengl <stefan@sourcegraph.com>
1 parent 3d0bdd5 commit 45f608f

6 files changed

Lines changed: 34 additions & 24 deletions

File tree

BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ go_library(
5555
"@com_github_rs_xid//:xid",
5656
"@org_golang_google_protobuf//types/known/durationpb",
5757
"@org_golang_google_protobuf//types/known/timestamppb",
58+
"@org_golang_x_exp//slices",
5859
] + select({
5960
"@io_bazel_rules_go//go/platform:aix": [
6061
"@org_golang_x_sys//unix",

bits.go

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,9 @@ func (n ngram) String() string {
106106
}
107107

108108
type runeNgramOff struct {
109-
ngram ngram
110-
byteSize uint32 // size of ngram
111-
byteOff uint32
112-
runeOff uint32
109+
ngram ngram
110+
// index is the original index inside of the returned array of splitNGrams
111+
index uint32
113112
}
114113

115114
func splitNGrams(str []byte) []runeNgramOff {
@@ -120,9 +119,7 @@ func splitNGrams(str []byte) []runeNgramOff {
120119
result := make([]runeNgramOff, 0, len(str))
121120
var i uint32
122121

123-
chars := -1
124122
for len(str) > 0 {
125-
chars++
126123
r, sz := utf8.DecodeRune(str)
127124
str = str[sz:]
128125
runeGram[0] = runeGram[1]
@@ -139,10 +136,8 @@ func splitNGrams(str []byte) []runeNgramOff {
139136

140137
ng := runesToNGram(runeGram)
141138
result = append(result, runeNgramOff{
142-
ngram: ng,
143-
byteSize: i - off[0],
144-
byteOff: off[0],
145-
runeOff: uint32(chars),
139+
ngram: ng,
140+
index: uint32(len(result)),
146141
})
147142
}
148143
return result

deps.bzl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2655,8 +2655,8 @@ def go_dependencies():
26552655
name = "org_golang_x_exp",
26562656
build_file_proto_mode = "disable_global",
26572657
importpath = "golang.org/x/exp",
2658-
sum = "h1:A1gGSx58LAGVHUUsOf7IiR0u8Xb6W51gRwfDBhkdcaw=",
2659-
version = "v0.0.0-20191030013958-a1ab85dbe136",
2658+
sum = "h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw=",
2659+
version = "v0.0.0-20230713183714-613f0c0eb8a1",
26602660
)
26612661
go_repository(
26622662
name = "org_golang_x_image",

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ require (
4646
go.opentelemetry.io/otel/trace v1.16.0
4747
go.uber.org/atomic v1.11.0
4848
go.uber.org/automaxprocs v1.5.2
49+
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1
4950
golang.org/x/net v0.11.0
5051
golang.org/x/oauth2 v0.9.0
5152
golang.org/x/sync v0.3.0

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,8 @@ golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL
368368
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
369369
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
370370
golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY=
371+
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw=
372+
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
371373
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
372374
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
373375
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=

indexdata.go

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"unicode/utf8"
2424

2525
"github.com/sourcegraph/zoekt/query"
26+
"golang.org/x/exp/slices"
2627
)
2728

2829
// indexData holds the pattern-independent data that we have to have
@@ -381,6 +382,11 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
381382

382383
// Find the 2 least common ngrams from the string.
383384
ngramOffs := splitNGrams([]byte(query.Pattern))
385+
// PERF: Sort to increase the chances adjacent checks are in the same btree
386+
// bucket (which can cause disk IO).
387+
slices.SortFunc(ngramOffs, func(a, b runeNgramOff) bool {
388+
return a.ngram < b.ngram
389+
})
384390
frequencies := make([]uint32, 0, len(ngramOffs))
385391
ngramLookups := 0
386392
for _, o := range ngramOffs {
@@ -408,18 +414,22 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
408414

409415
frequencies = append(frequencies, freq)
410416
}
411-
firstI := firstMinarg(frequencies)
412-
frequencies[firstI] = maxUInt32
413-
lastI := lastMinarg(frequencies)
414-
if firstI > lastI {
415-
lastI, firstI = firstI, lastI
417+
418+
var first, last runeNgramOff
419+
{
420+
firstI := firstMinarg(frequencies)
421+
frequencies[firstI] = maxUInt32
422+
lastI := lastMinarg(frequencies)
423+
first = ngramOffs[firstI]
424+
last = ngramOffs[lastI]
425+
if first.index > last.index {
426+
last, first = first, last
427+
}
416428
}
417429

418-
firstNG := ngramOffs[firstI].ngram
419-
lastNG := ngramOffs[lastI].ngram
420430
iter := &ngramDocIterator{
421-
leftPad: firstI,
422-
rightPad: uint32(utf8.RuneCountInString(str)) - firstI,
431+
leftPad: first.index,
432+
rightPad: uint32(utf8.RuneCountInString(str)) - first.index,
423433
ngramLookups: ngramLookups,
424434
}
425435
if query.FileName {
@@ -428,15 +438,16 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
428438
iter.ends = d.fileEndRunes
429439
}
430440

431-
if firstI != lastI {
432-
i, err := d.newDistanceTrigramIter(firstNG, lastNG, lastI-firstI, query.CaseSensitive, query.FileName)
441+
if first != last {
442+
runeDist := last.index - first.index
443+
i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
433444
if err != nil {
434445
return nil, err
435446
}
436447

437448
iter.iter = i
438449
} else {
439-
hitIter, err := d.trigramHitIterator(lastNG, query.CaseSensitive, query.FileName)
450+
hitIter, err := d.trigramHitIterator(last.ngram, query.CaseSensitive, query.FileName)
440451
if err != nil {
441452
return nil, err
442453
}

0 commit comments

Comments
 (0)