Rename UseKeywordScoring to mention BM25 (#778)

jtibshirani · web-flow · commit 4e674a49795c · 2024-05-07T10:57:42.000-07:00
It's confusing to call this `UseKeywordScoring`, since we do not use it for the
`keyword` patterntype in Sourcegraph. This commit clarifies the name to mention
BM25.
diff --git a/api.go b/api.go
@@ -946,10 +946,10 @@ type SearchOptions struct {
 	// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
 	DocumentRanksWeight float64
 
-	// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
-	// Currently, this treats each match in a file as a term and computes an approximation to BM25.
-	// When enabled, all other scoring signals are ignored, including document ranks.
-	UseKeywordScoring bool
+	// EXPERIMENTAL. If true, use text-search style scoring instead of the default scoring formula.
+	// The scoring algorithm treats each match in a file as a term and computes an approximation to
+	// BM25. When enabled, all other scoring signals are ignored, including document ranks.
+	UseBM25Scoring bool
 
 	// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
 	// a command-line flag
@@ -1015,7 +1015,7 @@ func (s *SearchOptions) String() string {
 	addBool("Whole", s.Whole)
 	addBool("ChunkMatches", s.ChunkMatches)
 	addBool("UseDocumentRanks", s.UseDocumentRanks)
-	addBool("UseKeywordScoring", s.UseKeywordScoring)
+	addBool("UseBM25Scoring", s.UseBM25Scoring)
 	addBool("Trace", s.Trace)
 	addBool("DebugScore", s.DebugScore)
 
diff --git a/api_proto.go b/api_proto.go
@@ -700,7 +700,7 @@ func SearchOptionsFromProto(p *proto.SearchOptions) *SearchOptions {
 		DocumentRanksWeight:    p.GetDocumentRanksWeight(),
 		Trace:                  p.GetTrace(),
 		DebugScore:             p.GetDebugScore(),
-		UseKeywordScoring:      p.GetUseKeywordScoring(),
+		UseBM25Scoring:         p.GetUseBm25Scoring(),
 	}
 }
 
@@ -725,6 +725,6 @@ func (s *SearchOptions) ToProto() *proto.SearchOptions {
 		DocumentRanksWeight:    s.DocumentRanksWeight,
 		Trace:                  s.Trace,
 		DebugScore:             s.DebugScore,
-		UseKeywordScoring:      s.UseKeywordScoring,
+		UseBm25Scoring:         s.UseBM25Scoring,
 	}
 }
diff --git a/build/scoring_test.go b/build/scoring_test.go
@@ -77,7 +77,7 @@ func TestBM25(t *testing.T) {
 			query:    &query.Substring{Pattern: "example"},
 			content:  exampleJava,
 			language: "Java",
-			// keyword-score:1.69 (sum-tf: 7.00, length-ratio: 2.00)
+			// bm25-score:1.69 (sum-tf: 7.00, length-ratio: 2.00)
 			wantScore: 1.69,
 		}, {
 			// Matches only on content
@@ -89,7 +89,7 @@ func TestBM25(t *testing.T) {
 			}},
 			content:  exampleJava,
 			language: "Java",
-			// keyword-score:5.75 (sum-tf: 56.00, length-ratio: 2.00)
+			// bm25-score:5.75 (sum-tf: 56.00, length-ratio: 2.00)
 			wantScore: 5.75,
 		},
 		{
@@ -98,15 +98,15 @@ func TestBM25(t *testing.T) {
 			query:    &query.Substring{Pattern: "java"},
 			content:  exampleJava,
 			language: "Java",
-			// keyword-score:1.07 (sum-tf: 2.00, length-ratio: 2.00)
+			// bm25-score:1.07 (sum-tf: 2.00, length-ratio: 2.00)
 			wantScore: 1.07,
 		},
 		{
 			// Matches only on filename, and content is missing
 			fileName: "a/b/c/config.go",
 			query:    &query.Substring{Pattern: "config.go"},
 			language: "Go",
-			// keyword-score:1.91 (sum-tf: 2.00, length-ratio: 0.00)
+			// bm25-score:1.91 (sum-tf: 2.00, length-ratio: 0.00)
 			wantScore: 1.91,
 		},
 	}
@@ -584,7 +584,7 @@ func skipIfCTagsUnavailable(t *testing.T, parserType ctags.CTagsParserType) {
 	}
 }
 
-func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType ctags.CTagsParserType) {
+func checkScoring(t *testing.T, c scoreCase, useBM25 bool, parserType ctags.CTagsParserType) {
 	skipIfCTagsUnavailable(t, parserType)
 
 	name := c.language
@@ -625,9 +625,9 @@ func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType cta
 		defer ss.Close()
 
 		srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{
-			UseKeywordScoring: keywordScoring,
-			ChunkMatches:      true,
-			DebugScore:        true})
+			UseBM25Scoring: useBM25,
+			ChunkMatches:   true,
+			DebugScore:     true})
 		if err != nil {
 			t.Fatal(err)
 		}
diff --git a/eval.go b/eval.go
@@ -317,7 +317,7 @@ nextFileMatch:
 			fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
 		}
 
-		if opts.UseKeywordScoring {
+		if opts.UseBM25Scoring {
 			d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
 		} else {
 			// Use the standard, non-experimental scoring method by default
diff --git a/grpc/protos/zoekt/webserver/v1/webserver.pb.go b/grpc/protos/zoekt/webserver/v1/webserver.pb.go
diff --git a/grpc/protos/zoekt/webserver/v1/webserver.proto b/grpc/protos/zoekt/webserver/v1/webserver.proto
@@ -107,10 +107,10 @@ message SearchOptions {
   // If set, the search results will contain debug information for scoring.
   bool debug_score = 14;
 
-  // EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
+  // EXPERIMENTAL. If true, use text search scoring instead of the default scoring formula.
   // Currently, this treats each match in a file as a term and computes an approximation to BM25.
   // When enabled, all other scoring signals are ignored, including document ranks.
-  bool use_keyword_scoring = 15;
+  bool use_bm25_scoring = 15;
 }
 
 message ListRequest {
diff --git a/score.go b/score.go
@@ -39,9 +39,9 @@ func (m *FileMatch) addScore(what string, computed float64, raw float64, debugSc
 	m.Score += computed
 }
 
-func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
+func (m *FileMatch) addBM25Score(score float64, sumTf float64, L float64, debugScore bool) {
 	if debugScore {
-		m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
+		m.Debug += fmt.Sprintf("bm25-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
 	}
 	m.Score += score
 }
@@ -116,7 +116,7 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
 }
 
 // scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
-// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
+// algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
 // except inverse document frequency (idf), since we don't have access to global term frequency statistics.
 //
 // Filename matches count twice as much as content matches. This mimics a common text search strategy where you
@@ -160,5 +160,5 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [
 		score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
 	}
 
-	fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
+	fileMatch.addBM25Score(score, sumTf, L, opts.DebugScore)
 }
diff --git a/shards/shards_test.go b/shards/shards_test.go
@@ -1087,7 +1087,7 @@ func TestAtomCountScore(t *testing.T) {
 	}
 }
 
-func TestUseKeywordScoring(t *testing.T) {
+func TestUseBM25Scoring(t *testing.T) {
 	b := testIndexBuilder(t,
 		&zoekt.Repository{},
 		zoekt.Document{Name: "f1", Content: []byte("one two two three")},
@@ -1103,7 +1103,7 @@ func TestUseKeywordScoring(t *testing.T) {
 		&query.Substring{Pattern: "three"})
 
 	opts := zoekt.SearchOptions{
-		UseKeywordScoring: true,
+		UseBM25Scoring: true,
 	}
 
 	results, err := ss.Search(context.Background(), q, &opts)

Original file line number	Diff line number	Diff line change
`@@ -700,7 +700,7 @@ func SearchOptionsFromProto(p *proto.SearchOptions) *SearchOptions {`
`700`	`700`	`DocumentRanksWeight: p.GetDocumentRanksWeight(),`
`701`	`701`	`Trace: p.GetTrace(),`
`702`	`702`	`DebugScore: p.GetDebugScore(),`
`703`		`- UseKeywordScoring: p.GetUseKeywordScoring(),`
	`703`	`+ UseBM25Scoring: p.GetUseBm25Scoring(),`
`704`	`704`	`}`
`705`	`705`	`}`
`706`	`706`
`@@ -725,6 +725,6 @@ func (s *SearchOptions) ToProto() *proto.SearchOptions {`
`725`	`725`	`DocumentRanksWeight: s.DocumentRanksWeight,`
`726`	`726`	`Trace: s.Trace,`
`727`	`727`	`DebugScore: s.DebugScore,`
`728`		`- UseKeywordScoring: s.UseKeywordScoring,`
	`728`	`+ UseBm25Scoring: s.UseBM25Scoring,`
`729`	`729`	`}`
`730`	`730`	`}`
Original file line number	Diff line number	Diff line change
`@@ -317,7 +317,7 @@ nextFileMatch:`
`317`	`317`	`fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)`
`318`	`318`	`}`
`319`	`319`
`320`		`- if opts.UseKeywordScoring {`
	`320`	`+ if opts.UseBM25Scoring {`
`321`	`321`	`d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)`
`322`	`322`	`} else {`
`323`	`323`	`// Use the standard, non-experimental scoring method by default`
Original file line number	Diff line number	Diff line change
`@@ -39,9 +39,9 @@ func (m *FileMatch) addScore(what string, computed float64, raw float64, debugSc`
`39`	`39`	`m.Score += computed`
`40`	`40`	`}`
`41`	`41`
`42`		`-func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {`
	`42`	`+func (m *FileMatch) addBM25Score(score float64, sumTf float64, L float64, debugScore bool) {`
`43`	`43`	`if debugScore {`
`44`		`- m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)`
	`44`	`+ m.Debug += fmt.Sprintf("bm25-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)`
`45`	`45`	`}`
`46`	`46`	`m.Score += score`
`47`	`47`	`}`
`@@ -116,7 +116,7 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn`
`116`	`116`	`}`
`117`	`117`
`118`	`118`	`// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring`
`119`		`-// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula`
	`119`	`+// algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula`
`120`	`120`	`// except inverse document frequency (idf), since we don't have access to global term frequency statistics.`
`121`	`121`	`//`
`122`	`122`	`// Filename matches count twice as much as content matches. This mimics a common text search strategy where you`
`@@ -160,5 +160,5 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [`
`160`	`160`	`score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)`
`161`	`161`	`}`
`162`	`162`
`163`		`- fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)`
	`163`	`+ fileMatch.addBM25Score(score, sumTf, L, opts.DebugScore)`
`164`	`164`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1087,7 +1087,7 @@ func TestAtomCountScore(t *testing.T) {`
`1087`	`1087`	`}`
`1088`	`1088`	`}`
`1089`	`1089`
`1090`		`-func TestUseKeywordScoring(t *testing.T) {`
	`1090`	`+func TestUseBM25Scoring(t *testing.T) {`
`1091`	`1091`	`b := testIndexBuilder(t,`
`1092`	`1092`	`&zoekt.Repository{},`
`1093`	`1093`	`zoekt.Document{Name: "f1", Content: []byte("one two two three")},`
`@@ -1103,7 +1103,7 @@ func TestUseKeywordScoring(t *testing.T) {`
`1103`	`1103`	`&query.Substring{Pattern: "three"})`
`1104`	`1104`
`1105`	`1105`	`opts := zoekt.SearchOptions{`
`1106`		`- UseKeywordScoring: true,`
	`1106`	`+ UseBM25Scoring: true,`
`1107`	`1107`	`}`
`1108`	`1108`
`1109`	`1109`	`results, err := ss.Search(context.Background(), q, &opts)`