Skip to content

Commit 3d0bdd5

Browse files
authored
remove ngram offset code (#616)
We have been running btree as the default for many months. We worried about a performance hit, but it never happened. After some recent local testing, I noticed the btree actually interacted with the disk more efficiently. So the old code both uses more memory and is slower; let's just remove it. Test Plan: go test ./...
1 parent f9d3a0e commit 3d0bdd5

6 files changed

Lines changed: 20 additions & 647 deletions

File tree

btree.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ type btreeIndex struct {
289289
postingIndex simpleSection
290290
}
291291

292+
// SizeBytes returns how much memory this structure uses in the heap.
292293
func (b btreeIndex) SizeBytes() (sz int) {
293294
// btree
294295
if b.bt != nil {
@@ -401,6 +402,9 @@ func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
401402
return
402403
}
403404

405+
// DumpMap is a debug method which returns the btree as an in-memory
406+
// representation. This is how zoekt represents the ngram index in
407+
// google/zoekt.
404408
func (b btreeIndex) DumpMap() map[ngram]simpleSection {
405409
if b.bt == nil {
406410
return nil
@@ -427,3 +431,11 @@ func (b btreeIndex) DumpMap() map[ngram]simpleSection {
427431

428432
return m
429433
}
434+
435+
// GetBlob returns the raw encoded offset list for ng.
436+
//
437+
// Note: the returned byte slice is mmap backed normally.
438+
func (b btreeIndex) GetBlob(ng ngram) ([]byte, error) {
439+
sec := b.Get(ng)
440+
return b.file.Read(sec.off, sec.sz)
441+
}

hititer.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,6 @@ func (d *indexData) newDistanceTrigramIter(ng1, ng2 ngram, dist uint32, caseSens
110110
}
111111

112112
func (d *indexData) trigramHitIterator(ng ngram, caseSensitive, fileName bool) (hitIterator, error) {
113-
if d.ngrams == nil {
114-
return nil, fmt.Errorf("trigramHitIterator: ngrams=nil")
115-
}
116-
117113
variants := []ngram{ng}
118114
if !caseSensitive {
119115
variants = generateCaseNgrams(ng)

indexdata.go

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ type indexData struct {
3333

3434
file IndexFile
3535

36-
ngrams ngramIndex
36+
ngrams btreeIndex
3737

3838
newlinesStart uint32
3939
newlinesIndex []uint32
@@ -56,7 +56,7 @@ type indexData struct {
5656

5757
fileNameContent []byte
5858
fileNameIndex []uint32
59-
fileNameNgrams fileNameNgrams
59+
fileNameNgrams btreeIndex
6060

6161
// fileEndSymbol[i] is the index of the first symbol for document i.
6262
fileEndSymbol []uint32
@@ -314,9 +314,7 @@ func (d *indexData) memoryUse() int {
314314
}
315315
sz += 8 * len(d.runeDocSections)
316316
sz += 8 * len(d.fileBranchMasks)
317-
if d.ngrams != nil {
318-
sz += d.ngrams.SizeBytes()
319-
}
317+
sz += d.ngrams.SizeBytes()
320318
sz += d.fileNameNgrams.SizeBytes()
321319
return sz
322320
}
@@ -349,13 +347,8 @@ func lastMinarg(xs []uint32) uint32 {
349347

350348
func (data *indexData) ngramFrequency(ng ngram, filename bool) uint32 {
351349
if filename {
352-
return data.fileNameNgrams.Frequency(ng)
353-
}
354-
355-
if data.ngrams == nil {
356-
return 0
350+
return data.fileNameNgrams.Get(ng).sz
357351
}
358-
359352
return data.ngrams.Get(ng).sz
360353
}
361354

0 commit comments

Comments
 (0)