Skip to content

Commit 4e674a4

Browse files
authored
Rename UseKeywordScoring to mention BM25 (#778)
It's confusing to call this `UseKeywordScoring`, since we do not use it for the `keyword` patterntype in Sourcegraph. This commit clarifies the name to mention BM25.
1 parent 9f35cb1 commit 4e674a4

8 files changed

Lines changed: 406 additions & 407 deletions

File tree

api.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -946,10 +946,10 @@ type SearchOptions struct {
946946
// will be used. This option is temporary and is only exposed for testing/tuning purposes.
947947
DocumentRanksWeight float64
948948

949-
// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
950-
// Currently, this treats each match in a file as a term and computes an approximation to BM25.
951-
// When enabled, all other scoring signals are ignored, including document ranks.
952-
UseKeywordScoring bool
949+
// EXPERIMENTAL. If true, use text-search style scoring instead of the default scoring formula.
950+
// The scoring algorithm treats each match in a file as a term and computes an approximation to
951+
// BM25. When enabled, all other scoring signals are ignored, including document ranks.
952+
UseBM25Scoring bool
953953

954954
// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
955955
// a command-line flag
@@ -1015,7 +1015,7 @@ func (s *SearchOptions) String() string {
10151015
addBool("Whole", s.Whole)
10161016
addBool("ChunkMatches", s.ChunkMatches)
10171017
addBool("UseDocumentRanks", s.UseDocumentRanks)
1018-
addBool("UseKeywordScoring", s.UseKeywordScoring)
1018+
addBool("UseBM25Scoring", s.UseBM25Scoring)
10191019
addBool("Trace", s.Trace)
10201020
addBool("DebugScore", s.DebugScore)
10211021

api_proto.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,7 @@ func SearchOptionsFromProto(p *proto.SearchOptions) *SearchOptions {
700700
DocumentRanksWeight: p.GetDocumentRanksWeight(),
701701
Trace: p.GetTrace(),
702702
DebugScore: p.GetDebugScore(),
703-
UseKeywordScoring: p.GetUseKeywordScoring(),
703+
UseBM25Scoring: p.GetUseBm25Scoring(),
704704
}
705705
}
706706

@@ -725,6 +725,6 @@ func (s *SearchOptions) ToProto() *proto.SearchOptions {
725725
DocumentRanksWeight: s.DocumentRanksWeight,
726726
Trace: s.Trace,
727727
DebugScore: s.DebugScore,
728-
UseKeywordScoring: s.UseKeywordScoring,
728+
UseBm25Scoring: s.UseBM25Scoring,
729729
}
730730
}

build/scoring_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func TestBM25(t *testing.T) {
7777
query: &query.Substring{Pattern: "example"},
7878
content: exampleJava,
7979
language: "Java",
80-
// keyword-score:1.69 (sum-tf: 7.00, length-ratio: 2.00)
80+
// bm25-score:1.69 (sum-tf: 7.00, length-ratio: 2.00)
8181
wantScore: 1.69,
8282
}, {
8383
// Matches only on content
@@ -89,7 +89,7 @@ func TestBM25(t *testing.T) {
8989
}},
9090
content: exampleJava,
9191
language: "Java",
92-
// keyword-score:5.75 (sum-tf: 56.00, length-ratio: 2.00)
92+
// bm25-score:5.75 (sum-tf: 56.00, length-ratio: 2.00)
9393
wantScore: 5.75,
9494
},
9595
{
@@ -98,15 +98,15 @@ func TestBM25(t *testing.T) {
9898
query: &query.Substring{Pattern: "java"},
9999
content: exampleJava,
100100
language: "Java",
101-
// keyword-score:1.07 (sum-tf: 2.00, length-ratio: 2.00)
101+
// bm25-score:1.07 (sum-tf: 2.00, length-ratio: 2.00)
102102
wantScore: 1.07,
103103
},
104104
{
105105
// Matches only on filename, and content is missing
106106
fileName: "a/b/c/config.go",
107107
query: &query.Substring{Pattern: "config.go"},
108108
language: "Go",
109-
// keyword-score:1.91 (sum-tf: 2.00, length-ratio: 0.00)
109+
// bm25-score:1.91 (sum-tf: 2.00, length-ratio: 0.00)
110110
wantScore: 1.91,
111111
},
112112
}
@@ -584,7 +584,7 @@ func skipIfCTagsUnavailable(t *testing.T, parserType ctags.CTagsParserType) {
584584
}
585585
}
586586

587-
func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType ctags.CTagsParserType) {
587+
func checkScoring(t *testing.T, c scoreCase, useBM25 bool, parserType ctags.CTagsParserType) {
588588
skipIfCTagsUnavailable(t, parserType)
589589

590590
name := c.language
@@ -625,9 +625,9 @@ func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType cta
625625
defer ss.Close()
626626

627627
srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{
628-
UseKeywordScoring: keywordScoring,
629-
ChunkMatches: true,
630-
DebugScore: true})
628+
UseBM25Scoring: useBM25,
629+
ChunkMatches: true,
630+
DebugScore: true})
631631
if err != nil {
632632
t.Fatal(err)
633633
}

eval.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ nextFileMatch:
317317
fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
318318
}
319319

320-
if opts.UseKeywordScoring {
320+
if opts.UseBM25Scoring {
321321
d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
322322
} else {
323323
// Use the standard, non-experimental scoring method by default

grpc/protos/zoekt/webserver/v1/webserver.pb.go

Lines changed: 382 additions & 383 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

grpc/protos/zoekt/webserver/v1/webserver.proto

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,10 @@ message SearchOptions {
107107
// If set, the search results will contain debug information for scoring.
108108
bool debug_score = 14;
109109

110-
// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
110+
// EXPERIMENTAL. If true, use text search scoring instead of the default scoring formula.
111111
// Currently, this treats each match in a file as a term and computes an approximation to BM25.
112112
// When enabled, all other scoring signals are ignored, including document ranks.
113-
bool use_keyword_scoring = 15;
113+
bool use_bm25_scoring = 15;
114114
}
115115

116116
message ListRequest {

score.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ func (m *FileMatch) addScore(what string, computed float64, raw float64, debugSc
3939
m.Score += computed
4040
}
4141

42-
func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
42+
func (m *FileMatch) addBM25Score(score float64, sumTf float64, L float64, debugScore bool) {
4343
if debugScore {
44-
m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
44+
m.Debug += fmt.Sprintf("bm25-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
4545
}
4646
m.Score += score
4747
}
@@ -116,7 +116,7 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
116116
}
117117

118118
// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
119-
// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
119+
// algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
120120
// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
121121
//
122122
// Filename matches count twice as much as content matches. This mimics a common text search strategy where you
@@ -160,5 +160,5 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [
160160
score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
161161
}
162162

163-
fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
163+
fileMatch.addBM25Score(score, sumTf, L, opts.DebugScore)
164164
}

shards/shards_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,7 +1087,7 @@ func TestAtomCountScore(t *testing.T) {
10871087
}
10881088
}
10891089

1090-
func TestUseKeywordScoring(t *testing.T) {
1090+
func TestUseBM25Scoring(t *testing.T) {
10911091
b := testIndexBuilder(t,
10921092
&zoekt.Repository{},
10931093
zoekt.Document{Name: "f1", Content: []byte("one two two three")},
@@ -1103,7 +1103,7 @@ func TestUseKeywordScoring(t *testing.T) {
11031103
&query.Substring{Pattern: "three"})
11041104

11051105
opts := zoekt.SearchOptions{
1106-
UseKeywordScoring: true,
1106+
UseBM25Scoring: true,
11071107
}
11081108

11091109
results, err := ss.Search(context.Background(), q, &opts)

0 commit comments

Comments
 (0)