From d15aa285519974429c449a4ccd849b55eb02501f Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Tue, 1 Oct 2024 08:33:26 +0200 Subject: [PATCH] scoring: use repo freshness as tiebreaker (#832) We ignore priority and instead use the latest commit date as repo rank. This has a big impact for Sourcegraph because it means we switch from star count to repo freshness as tiebreaker. As a minor tweak, we also separate query based scores from tiebreakers. To achieve this we reserve the last 7 digits of a score for tiebreakers: - 5 digits (maxUint16) for repo rank - 2 digits ([0,10]) for file order (2 digits). Example: Before: score: 8775.35 <- atom(2):200, fragment:8550.00, repo-rank: 19, doc-order:6.35 After: score: 8750_00019_06.35 <- atom(2):200, fragment:8550.00, repo-rank: 19, doc-order:6.35 --- api.go | 29 ++- api_test.go | 26 +++ build/scoring_test.go | 202 +++++++++--------- cmd/zoekt-sourcegraph-indexserver/index.go | 2 + .../index_test.go | 9 +- contentprovider.go | 11 +- internal/e2e/e2e_rank_test.go | 5 + internal/e2e/testdata/WaitGroup.txt | 26 +-- internal/e2e/testdata/rank_stats.txt | 4 +- score.go | 26 ++- shards/shards_test.go | 14 +- .../TestReadSearch/ctagsrepo_v16.00000.golden | 8 +- .../TestReadSearch/ctagsrepo_v17.00000.golden | 8 +- .../TestReadSearch/repo17_v17.00000.golden | 4 +- .../TestReadSearch/repo2_v16.00000.golden | 4 +- .../TestReadSearch/repo_v16.00000.golden | 4 +- 16 files changed, 238 insertions(+), 144 deletions(-) diff --git a/api.go b/api.go index 1e478dfa..95b0b422 100644 --- a/api.go +++ b/api.go @@ -635,7 +635,16 @@ func (r *Repository) UnmarshalJSON(data []byte) error { r.ID = uint32(id) } - if v, ok := repo.RawConfig["priority"]; ok { + // Sourcegraph indexserver doesn't set repo.Rank, so we set it here. Setting it + // on read instead of during indexing allows us to avoid a complete reindex. + // + // Prefer "latest_commit_date" over "priority" for ranking. We keep priority for + // backwards compatibility. + if _, ok := repo.RawConfig["latest_commit_date"]; ok { + // We use the number of months since 1970 as a simple measure of repo freshness. + // It is monotonically increasing and stable across re-indexes and restarts. + r.Rank = monthsSince1970(repo.LatestCommitDate) + } else if v, ok := repo.RawConfig["priority"]; ok { r.priority, err = strconv.ParseFloat(v, 64) if err != nil { r.priority = 0 @@ -645,14 +654,28 @@ func (r *Repository) UnmarshalJSON(data []byte) error { // based on priority. Setting it on read instead of during indexing // allows us to avoid a complete reindex. if r.Rank == 0 && r.priority > 0 { - // Normalize the repo score within [0, 1), with the midpoint at 5,000. This means popular - // repos (roughly ones with over 5,000 stars) see diminishing returns from more stars. + // Normalize the repo score within [0, maxUint16), with the midpoint at 5,000. + // This means popular repos (roughly ones with over 5,000 stars) see diminishing + // returns from more stars. r.Rank = uint16(r.priority / (5000.0 + r.priority) * maxUInt16) } } + return nil } +// monthsSince1970 returns the number of months since 1970. It returns values in +// the range [0, maxUInt16]. The upper bound is reached in the year 7431, the +// lower bound for all dates before 1970. +func monthsSince1970(t time.Time) uint16 { + base := time.Unix(0, 0) + if t.Before(base) { + return 0 + } + months := int(t.Year()-1970)*12 + int(t.Month()-1) + return uint16(min(months, maxUInt16)) +} + // MergeMutable will merge x into r. mutated will be true if it made any // changes. err is non-nil if we needed to mutate an immutable field. // diff --git a/api_test.go b/api_test.go index 87ad4167..b8e0c9d9 100644 --- a/api_test.go +++ b/api_test.go @@ -368,3 +368,29 @@ func TestRepositoryMergeMutable(t *testing.T) { } }) } + +func TestMonthsSince1970(t *testing.T) { + tests := []struct { + name string + input time.Time + expected uint16 + }{ + {"Before 1970", time.Date(1950, 12, 31, 0, 0, 0, 0, time.UTC), 0}, + {"Unix 0", time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC), 0}, + {"Feb 1970", time.Date(1970, 2, 1, 0, 0, 0, 0, time.UTC), 1}, + {"Year 1989", time.Date(1989, 12, 13, 0, 0, 0, 0, time.UTC), 239}, + {"Sep 2024", time.Date(2024, 9, 20, 0, 0, 0, 0, time.UTC), 656}, + {"Oct 2024", time.Date(2024, 10, 20, 0, 0, 0, 0, time.UTC), 657}, + {"Apr 7431", time.Date(7431, 4, 1, 0, 0, 0, 0, time.UTC), 65535}, + {"9999", time.Date(9999, 0, 0, 0, 0, 0, 0, time.UTC), 65535}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := monthsSince1970(tt.input) + if result != tt.expected { + t.Errorf("expected %d, got %d", tt.expected, result) + } + }) + } +} diff --git a/build/scoring_test.go b/build/scoring_test.go index 8dc2f2f9..02260c4c 100644 --- a/build/scoring_test.go +++ b/build/scoring_test.go @@ -40,22 +40,22 @@ func TestFileNameMatch(t *testing.T) { fileName: "a/b/c/config.go", query: &query.Substring{FileName: true, Pattern: "config"}, language: "Go", - // 5500 (partial base at boundary) + 500 (word) + 10 (file order) - wantScore: 6010, + // 5500 (partial base at boundary) + 500 (word) + wantScore: 6000, }, { fileName: "a/b/c/config.go", query: &query.Substring{FileName: true, Pattern: "config.go"}, language: "Go", - // 7000 (full base match) + 500 (word) + 10 (file order) - wantScore: 7510, + // 7000 (full base match) + 500 (word) + wantScore: 7500, }, { fileName: "a/config/c/d.go", query: &query.Substring{FileName: true, Pattern: "config"}, language: "Go", - // 500 (word) + 10 (file order) - wantScore: 510, + // 500 (word) + wantScore: 500, }, } @@ -128,56 +128,56 @@ func TestJava(t *testing.T) { content: exampleJava, query: &query.Substring{Content: true, Pattern: "nerClass"}, language: "Java", - // 5500 (partial symbol at boundary) + 1000 (Java class) + 50 (partial word) + 10 (file order) - wantScore: 6560, + // 5500 (partial symbol at boundary) + 1000 (Java class) + 50 (partial word) + wantScore: 6550, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "StaticClass"}, language: "Java", - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word) + 10 (file order) - wantScore: 7010, + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word) + wantScore: 7000, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "innerEnum"}, language: "Java", - // 7000 (symbol) + 900 (Java enum) + 500 (word) + 10 (file order) - wantScore: 8410, + // 7000 (symbol) + 900 (Java enum) + 500 (word) + wantScore: 8400, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "innerInterface"}, language: "Java", - // 7000 (symbol) + 800 (Java interface) + 500 (word) + 10 (file order) - wantScore: 8310, + // 7000 (symbol) + 800 (Java interface) + 500 (word) + wantScore: 8300, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "innerMethod"}, language: "Java", - // 7000 (symbol) + 700 (Java method) + 500 (word) + 10 (file order) - wantScore: 8210, + // 7000 (symbol) + 700 (Java method) + 500 (word) + wantScore: 8200, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "field"}, language: "Java", - // 7000 (symbol) + 600 (Java field) + 500 (word) + 10 (file order) - wantScore: 8110, + // 7000 (symbol) + 600 (Java field) + 500 (word) + wantScore: 8100, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "B"}, language: "Java", - // 7000 (symbol) + 500 (Java enum constant) + 500 (word) + 10 (file order) - wantScore: 8010, + // 7000 (symbol) + 500 (Java enum constant) + 500 (word) + wantScore: 8000, }, // 2 Atoms (1x content and 1x filename) { @@ -185,8 +185,8 @@ func TestJava(t *testing.T) { content: exampleJava, query: &query.Substring{Pattern: "example"}, // matches filename and a Java field language: "Java", - // 5500 (edge symbol) + 600 (Java field) + 500 (word) + 200 (atom) + 10 (file order) - wantScore: 6810, + // 5500 (edge symbol) + 600 (Java field) + 500 (word) + 200 (atom) + wantScore: 6800, }, // 3 Atoms (2x content, 1x filename) { @@ -197,8 +197,8 @@ func TestJava(t *testing.T) { &query.Substring{Content: true, Pattern: "runInnerInterface"}, // matches a Java method }}, language: "Java", - // 7000 (symbol) + 700 (Java method) + 500 (word) + 266.67 (atom) + 10 (file order) - wantScore: 8476.667, + // 7000 (symbol) + 700 (Java method) + 500 (word) + 266.67 (atom) + wantScore: 8466, }, // 4 Atoms (4x content) { @@ -211,40 +211,40 @@ func TestJava(t *testing.T) { &query.Substring{Content: true, Pattern: "app"}, }}, language: "Java", - // 7000 (symbol) + 900 (Java enum) + 500 (word) + 300 (atom) + 10 (file order) - wantScore: 8710, + // 7000 (symbol) + 900 (Java enum) + 500 (word) + 300 (atom) + wantScore: 8700, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "unInnerInterface("}, language: "Java", - // 4000 (overlap Symbol) + 700 (Java method) + 50 (partial word) + 10 (file order) - wantScore: 4760, + // 4000 (overlap Symbol) + 700 (Java method) + 50 (partial word) + wantScore: 4750, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "InnerEnum"}, language: "Java", - // 7000 (Symbol) + 900 (Java enum) + 500 (word) + 10 (file order) - wantScore: 8410, + // 7000 (Symbol) + 900 (Java enum) + 500 (word) + wantScore: 8400, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "enum InnerEnum"}, language: "Java", - // 5500 (edge Symbol) + 900 (Java enum) + 500 (word) + 10 (file order) - wantScore: 6910, + // 5500 (edge Symbol) + 900 (Java enum) + 500 (word) + wantScore: 6900, }, { fileName: "example.java", content: exampleJava, query: &query.Substring{Content: true, Pattern: "public enum InnerEnum {"}, language: "Java", - // 4000 (overlap Symbol) + 900 (Java enum) + 500 (word) + 10 (file order) - wantScore: 5410, + // 4000 (overlap Symbol) + 900 (Java enum) + 500 (word) + wantScore: 5400, }, } @@ -265,48 +265,48 @@ func TestKotlin(t *testing.T) { content: exampleKotlin, query: &query.Substring{Content: true, Pattern: "oxyPreloader"}, language: "Kotlin", - // 5500 (partial symbol at boundary) + 1000 (Kotlin class) + 50 (partial word) + 10 (file order) - wantScore: 6560, + // 5500 (partial symbol at boundary) + 1000 (Kotlin class) + 50 (partial word) + wantScore: 6550, }, { fileName: "example.kt", content: exampleKotlin, query: &query.Substring{Content: true, Pattern: "ViewMetadata"}, language: "Kotlin", - // 7000 (symbol) + 900 (Kotlin interface) + 500 (word) + 10 (file order) - wantScore: 8410, + // 7000 (symbol) + 900 (Kotlin interface) + 500 (word) + wantScore: 8400, }, { fileName: "example.kt", content: exampleKotlin, query: &query.Substring{Content: true, Pattern: "onScrolled"}, language: "Kotlin", - // 7000 (symbol) + 800 (Kotlin method) + 500 (word) + 10 (file order) - wantScore: 8310, + // 7000 (symbol) + 800 (Kotlin method) + 500 (word) + wantScore: 8300, }, { fileName: "example.kt", content: exampleKotlin, query: &query.Substring{Content: true, Pattern: "PreloadErrorHandler"}, language: "Kotlin", - // 7000 (symbol) + 700 (Kotlin typealias) + 500 (word) + 10 (file order) - wantScore: 8210, + // 7000 (symbol) + 700 (Kotlin typealias) + 500 (word) + wantScore: 8200, }, { fileName: "example.kt", content: exampleKotlin, query: &query.Substring{Content: true, Pattern: "FLING_THRESHOLD_PX"}, language: "Kotlin", - // 7000 (symbol) + 600 (Kotlin constant) + 500 (word) + 10 (file order) - wantScore: 8110, + // 7000 (symbol) + 600 (Kotlin constant) + 500 (word) + wantScore: 8100, }, { fileName: "example.kt", content: exampleKotlin, query: &query.Substring{Content: true, Pattern: "scrollState"}, language: "Kotlin", - // 7000 (symbol) + 500 (Kotlin variable) + 500 (word) + 10 (file order) - wantScore: 8010, + // 7000 (symbol) + 500 (Kotlin variable) + 500 (word) + wantScore: 8000, }, } @@ -330,40 +330,40 @@ func TestCpp(t *testing.T) { content: exampleCpp, query: &query.Substring{Content: true, Pattern: "FooClass"}, language: "C++", - // 7000 (Symbol) + 1000 (C++ class) + 500 (full word) + 10 (file order) - wantScore: 8510, + // 7000 (Symbol) + 1000 (C++ class) + 500 (full word) + wantScore: 8500, }, { fileName: "example.cc", content: exampleCpp, query: &query.Substring{Content: true, Pattern: "NestedEnum"}, language: "C++", - // 7000 (Symbol) + 900 (C++ enum) + 500 (full word) + 10 (file order) - wantScore: 8410, + // 7000 (Symbol) + 900 (C++ enum) + 500 (full word) + wantScore: 8400, }, { fileName: "example.cc", content: exampleCpp, query: &query.Substring{Content: true, Pattern: "main"}, language: "C++", - // 7000 (Symbol) + 800 (C++ function) + 500 (full word) + 10 (file order) - wantScore: 8310, + // 7000 (Symbol) + 800 (C++ function) + 500 (full word) + wantScore: 8300, }, { fileName: "example.cc", content: exampleCpp, query: &query.Substring{Content: true, Pattern: "FooStruct"}, language: "C++", - // 7000 (Symbol) + 700 (C++ struct) + 500 (full word) + 10 (file order) - wantScore: 8210, + // 7000 (Symbol) + 700 (C++ struct) + 500 (full word) + wantScore: 8200, }, { fileName: "example.cc", content: exampleCpp, query: &query.Substring{Content: true, Pattern: "TheUnion"}, language: "C++", - // 7000 (Symbol) + 600 (C++ union) + 500 (full word) + 10 (file order) - wantScore: 8110, + // 7000 (Symbol) + 600 (C++ union) + 500 (full word) + wantScore: 8100, }, } @@ -387,16 +387,16 @@ func TestPython(t *testing.T) { content: examplePython, query: &query.Substring{Content: true, Pattern: "C1"}, language: "Python", - // 7000 (symbol) + 1000 (Python class) + 500 (word) + 10 (file order) - wantScore: 8510, + // 7000 (symbol) + 1000 (Python class) + 500 (word) + wantScore: 8500, }, { fileName: "example.py", content: examplePython, query: &query.Substring{Content: true, Pattern: "g"}, language: "Python", - // 7000 (symbol) + 800 (Python function) + 500 (word) + 10 (file order) - wantScore: 8310, + // 7000 (symbol) + 800 (Python function) + 500 (word) + wantScore: 8300, }, } @@ -412,8 +412,8 @@ func TestPython(t *testing.T) { content: examplePython, query: &query.Substring{Content: true, Pattern: "__init__"}, language: "Python", - // 7000 (symbol) + 800 (Python method) + 50 (partial word) + 10 (file order) - wantScore: 7860, + // 7000 (symbol) + 800 (Python method) + 50 (partial word) + wantScore: 7850, } checkScoring(t, scipOnlyCase, false, ctags.ScipCTags) @@ -431,24 +431,24 @@ func TestRuby(t *testing.T) { content: exampleRuby, query: &query.Substring{Content: true, Pattern: "Parental"}, language: "Ruby", - // 7000 (symbol) + 1000 (Ruby class) + 500 (word) + 10 (file order) - wantScore: 8510, + // 7000 (symbol) + 1000 (Ruby class) + 500 (word) + wantScore: 8500, }, { fileName: "example.rb", content: exampleRuby, query: &query.Substring{Content: true, Pattern: "parental_func"}, language: "Ruby", - // 7000 (symbol) + 900 (Ruby method) + 500 (word) + 10 (file order) - wantScore: 8410, + // 7000 (symbol) + 900 (Ruby method) + 500 (word) + wantScore: 8400, }, { fileName: "example.rb", content: exampleRuby, query: &query.Substring{Content: true, Pattern: "MyModule"}, language: "Ruby", - // 7000 (symbol) + 500 (Ruby module) + 500 (word) + 10 (file order) - wantScore: 8210, + // 7000 (symbol) + 500 (Ruby module) + 500 (word) + wantScore: 8200, }, } @@ -471,32 +471,32 @@ func TestScala(t *testing.T) { content: exampleScala, query: &query.Substring{Content: true, Pattern: "SymbolIndexBucket"}, language: "Scala", - // 7000 (symbol) + 1000 (Scala class) + 500 (word) + 10 (file order) - wantScore: 8510, + // 7000 (symbol) + 1000 (Scala class) + 500 (word) + wantScore: 8500, }, { fileName: "example.scala", content: exampleScala, query: &query.Substring{Content: true, Pattern: "stdLibPatches"}, language: "Scala", - // 7000 (symbol) + 800 (Scala object) + 500 (word) + 10 (file order) - wantScore: 8310, + // 7000 (symbol) + 800 (Scala object) + 500 (word) + wantScore: 8300, }, { fileName: "example.scala", content: exampleScala, query: &query.Substring{Content: true, Pattern: "close"}, language: "Scala", - // 7000 (symbol) + 700 (Scala method) + 500 (word) + 10 (file order) - wantScore: 8210, + // 7000 (symbol) + 700 (Scala method) + 500 (word) + wantScore: 8200, }, { fileName: "example.scala", content: exampleScala, query: &query.Substring{Content: true, Pattern: "javaSymbol"}, language: "Scala", - // 7000 (symbol) + 500 (Scala method) + 500 (word) + 10 (file order) - wantScore: 8010, + // 7000 (symbol) + 500 (Scala method) + 500 (word) + wantScore: 8000, }, } @@ -516,8 +516,8 @@ type aInterface interface {} `), query: &query.Substring{Content: true, Pattern: "aInterface"}, language: "Go", - // 7000 (full base match) + 1000 (Go interface) + 500 (word) + 10 (file order) - wantScore: 8510, + // 7000 (full base match) + 1000 (Go interface) + 500 (word) + wantScore: 8500, }, { fileName: "src/net/http/client.go", @@ -527,8 +527,8 @@ type aStruct struct {} `), query: &query.Substring{Content: true, Pattern: "aStruct"}, language: "Go", - // 7000 (full base match) + 900 (Go struct) + 500 (word) + 10 (file order) - wantScore: 8410, + // 7000 (full base match) + 900 (Go struct) + 500 (word) + wantScore: 8400, }, { fileName: "src/net/http/client.go", @@ -538,8 +538,8 @@ func aFunc() bool {} `), query: &query.Substring{Content: true, Pattern: "aFunc"}, language: "Go", - // 7000 (full base match) + 800 (Go function) + 500 (word) + 10 (file order) - wantScore: 8310, + // 7000 (full base match) + 800 (Go function) + 500 (word) + wantScore: 8300, }, { fileName: "src/net/http/client.go", @@ -554,8 +554,8 @@ func Get() { &query.Symbol{Expr: &query.Substring{Pattern: "Get", Content: true}}, }}, language: "Go", - // 7000 (full base match) + 800 (Go func) + 50 (Exported Go) + 500 (word) + 200 (atom) + 10 (file order) - wantScore: 8560, + // 7000 (full base match) + 800 (Go func) + 50 (Exported Go) + 500 (word) + 200 (atom) + wantScore: 8550, }, } @@ -636,7 +636,7 @@ func checkScoring(t *testing.T, c scoreCase, useBM25 bool, parserType ctags.CTag t.Fatalf("file matches: want %d, got %d", want, got) } - if got := srs.Files[0].Score; math.Abs(got-c.wantScore) > epsilon { + if got := withoutTiebreaker(srs.Files[0].Score, useBM25); math.Abs(got-c.wantScore) > epsilon { t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].ChunkMatches[0].DebugScore) } @@ -646,6 +646,14 @@ func checkScoring(t *testing.T, c scoreCase, useBM25 bool, parserType ctags.CTag }) } +// helper to remove the tiebreaker from the score for easier comparison +func withoutTiebreaker(fullScore float64, useBM25 bool) float64 { + if useBM25 { + return fullScore + } + return math.Trunc(fullScore / zoekt.ScoreOffset) +} + func TestDocumentRanks(t *testing.T) { requireCTags(t) dir := t.TempDir() @@ -672,21 +680,21 @@ func TestDocumentRanks(t *testing.T) { }{ { name: "score with no document ranks", - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 10 (file order) - wantScore: 7010.00, + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + wantScore: 7000.00, }, { name: "score with document ranks", documentRank: 0.8, - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 225 (file rank) + 10 (file order) - wantScore: 7235.00, + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 225 (file rank) + wantScore: 7225.00, }, { name: "score with custom document ranks weight", documentRank: 0.8, documentRanksWeight: 1000.0, - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 25.00 (file rank) + 10 (file order) - wantScore: 7035.00, + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 25.00 (file rank) + wantScore: 7025.00, }, } @@ -725,7 +733,7 @@ func TestDocumentRanks(t *testing.T) { t.Fatalf("file matches: want %d, got %d", want, got) } - if got := srs.Files[0].Score; got != c.wantScore { + if got := withoutTiebreaker(srs.Files[0].Score, false); got != c.wantScore { t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore) } }) @@ -758,19 +766,19 @@ func TestRepoRanks(t *testing.T) { { name: "no shard rank", // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 10 (file order) - wantScore: 7010.00, + wantScore: 7000_00000_10.00, }, { name: "medium shard rank", repoRank: 30000, - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 10 (file order) + 9.16 (repo rank) - wantScore: 7019.16, + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 30000 (repo rank) + 10 (file order) + wantScore: 7000_30000_10.00, }, { name: "high shard rank", repoRank: 60000, - // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 10 (file order) + 18.31 (repo rank) - wantScore: 7028.31, + // 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 60000 (repo rank) + 10 (file order) + wantScore: 7000_60000_10.00, }, } diff --git a/cmd/zoekt-sourcegraph-indexserver/index.go b/cmd/zoekt-sourcegraph-indexserver/index.go index be395276..7ffc0188 100644 --- a/cmd/zoekt-sourcegraph-indexserver/index.go +++ b/cmd/zoekt-sourcegraph-indexserver/index.go @@ -121,6 +121,8 @@ func (o *indexArgs) BuildOptions() *build.Options { "public": marshalBool(o.Public), "fork": marshalBool(o.Fork), "archived": marshalBool(o.Archived), + // Calculate repo rank based on the latest commit date. + "latest_commit_date": "1", }, }, IndexDir: o.IndexDir, diff --git a/cmd/zoekt-sourcegraph-indexserver/index_test.go b/cmd/zoekt-sourcegraph-indexserver/index_test.go index 7ff7fcd0..c1eeb1ac 100644 --- a/cmd/zoekt-sourcegraph-indexserver/index_test.go +++ b/cmd/zoekt-sourcegraph-indexserver/index_test.go @@ -14,12 +14,13 @@ import ( "time" "github.com/sourcegraph/log/logtest" - proto "github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/protos/sourcegraph/zoekt/configuration/v1" - "github.com/sourcegraph/zoekt/ctags" "google.golang.org/grpc" "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/timestamppb" + proto "github.com/sourcegraph/zoekt/cmd/zoekt-sourcegraph-indexserver/protos/sourcegraph/zoekt/configuration/v1" + "github.com/sourcegraph/zoekt/ctags" + "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -493,6 +494,7 @@ func TestIndex(t *testing.T) { "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0", + "git -C $TMPDIR/test%2Frepo.git config zoekt.latest_commit_date 1", "git -C $TMPDIR/test%2Frepo.git config zoekt.name test/repo", "git -C $TMPDIR/test%2Frepo.git config zoekt.priority 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.public 0", @@ -515,6 +517,7 @@ func TestIndex(t *testing.T) { "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0", + "git -C $TMPDIR/test%2Frepo.git config zoekt.latest_commit_date 1", "git -C $TMPDIR/test%2Frepo.git config zoekt.name test/repo", "git -C $TMPDIR/test%2Frepo.git config zoekt.priority 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.public 0", @@ -546,6 +549,7 @@ func TestIndex(t *testing.T) { "git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/dev feebdaed", "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0", + "git -C $TMPDIR/test%2Frepo.git config zoekt.latest_commit_date 1", "git -C $TMPDIR/test%2Frepo.git config zoekt.name test/repo", "git -C $TMPDIR/test%2Frepo.git config zoekt.priority 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.public 0", @@ -593,6 +597,7 @@ func TestIndex(t *testing.T) { "git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/release 12345678", "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0", + "git -C $TMPDIR/test%2Frepo.git config zoekt.latest_commit_date 1", "git -C $TMPDIR/test%2Frepo.git config zoekt.name test/repo", "git -C $TMPDIR/test%2Frepo.git config zoekt.priority 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.public 0", diff --git a/contentprovider.go b/contentprovider.go index e4a1ce1d..44863752 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -530,12 +530,17 @@ const ( // File-only scoring signals. For now these are also bounded ~9000 to give them // equal weight with the query-dependent signals. - scoreFileRankFactor = 9000.0 - scoreFileOrderFactor = 10.0 - scoreRepoRankFactor = 20.0 + scoreFileRankFactor = 9000.0 // Used for ordering line and chunk matches within a file. scoreLineOrderFactor = 1.0 + + // Used for tiebreakers. The scores are not combined with the main score, but + // are used to break ties between matches with the same score. The factors are + // chosen to separate the tiebreakers from the main score and from each other. + // If you make changes here, make sure to update indexData.scoreFile too. + scoreRepoRankFactor = 100.0 + scoreFileOrderFactor = 10.0 ) // findMaxOverlappingSection returns the index of the section in secs that diff --git a/internal/e2e/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go index 860194b9..ae70b882 100644 --- a/internal/e2e/e2e_rank_test.go +++ b/internal/e2e/e2e_rank_test.go @@ -251,6 +251,11 @@ func indexURL(indexDir, u string) error { err := archive.Index(opts, build.Options{ IndexDir: indexDir, CTagsMustSucceed: true, + RepositoryDescription: zoekt.Repository{ + // Use the latest commit date to calculate the repo rank when loading the shard. + // This is the same setting we use in production. + RawConfig: map[string]string{"latest_commit_date": "1"}, + }, }) if err != nil { return fmt.Errorf("failed to index %s: %w", opts.Archive, err) diff --git a/internal/e2e/testdata/WaitGroup.txt b/internal/e2e/testdata/WaitGroup.txt index fc215576..cddc9d4c 100644 --- a/internal/e2e/testdata/WaitGroup.txt +++ b/internal/e2e/testdata/WaitGroup.txt @@ -1,12 +1,6 @@ queryString: WaitGroup query: case_substr:"WaitGroup" -targetRank: 2 - -github.com/golang/go/src/sync/waitgroup.go -23:type WaitGroup struct { -91:func (wg *WaitGroup) Wait() { -13:// A WaitGroup waits for a collection of goroutines to finish. -hidden 13 more line matches +targetRank: 1 **github.com/sourcegraph/conc/waitgroup.go** 22:type WaitGroup struct { @@ -14,6 +8,12 @@ hidden 13 more line matches 38:func (h *WaitGroup) Wait() { hidden 10 more line matches +github.com/golang/go/src/sync/waitgroup.go +23:type WaitGroup struct { +91:func (wg *WaitGroup) Wait() { +13:// A WaitGroup waits for a collection of goroutines to finish. +hidden 13 more line matches + github.com/golang/go/test/fixedbugs/issue19467.dir/mysync.go 9:type WaitGroup struct { 13:func (wg *WaitGroup) Add(x int) { @@ -25,16 +25,16 @@ github.com/golang/go/test/fixedbugs/issue44370.dir/a.go 7:// A StoppableWaitGroup waits for a collection of goroutines to finish. hidden 3 more line matches -github.com/golang/go/src/sync/example_test.go -20:func ExampleWaitGroup() { -19:// using a WaitGroup to block until all the fetches are complete. -21: var wg sync.WaitGroup -hidden 1 more line matches - github.com/sourcegraph/conc/waitgroup_test.go 13:func ExampleWaitGroup() { 42:func TestWaitGroup(t *testing.T) { 29:func ExampleWaitGroup_WaitAndRecover() { hidden 12 more line matches +github.com/golang/go/src/sync/example_test.go +20:func ExampleWaitGroup() { +19:// using a WaitGroup to block until all the fetches are complete. +21: var wg sync.WaitGroup +hidden 1 more line matches + hidden 227 more file matches diff --git a/internal/e2e/testdata/rank_stats.txt b/internal/e2e/testdata/rank_stats.txt index 528e444c..94fb3e15 100644 --- a/internal/e2e/testdata/rank_stats.txt +++ b/internal/e2e/testdata/rank_stats.txt @@ -1,4 +1,4 @@ queries: 16 -recall@1: 8 (50%) +recall@1: 9 (56%) recall@5: 11 (69%) -mrr: 0.600787 +mrr: 0.632037 diff --git a/score.go b/score.go index a2579df2..37e01979 100644 --- a/score.go +++ b/score.go @@ -21,7 +21,10 @@ import ( "strings" ) -const maxUInt16 = 0xffff +const ( + maxUInt16 = 0xffff + ScoreOffset = 10_000_000 +) // addScore increments the score of the FileMatch by the computed score. If // debugScore is true, it also adds a debug string to the FileMatch. If raw is @@ -99,12 +102,29 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn } } + // Add tiebreakers + // + // ScoreOffset shifts the score 7 digits to the left. + fileMatch.Score = math.Trunc(fileMatch.Score) * ScoreOffset + md := d.repoMetaData[d.repos[doc]] + + // md.Rank lies in the range [0, 65535]. Hence, we have to allocate 5 digits for + // the rank. The scoreRepoRankFactor shifts the rank score 2 digits to the left, + // reserving digits 3-7 for the repo rank. + addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)) + + // digits 1-2 and the decimals are reserved for the doc order. Doc order + // (without the scaling factor) lies in the range [0, 1]. The upper bound is + // achieved for matches in the first document of a shard. addScore("doc-order", scoreFileOrderFactor*(1.0-float64(doc)/float64(len(d.boundaries)))) - addScore("repo-rank", scoreRepoRankFactor*float64(md.Rank)/maxUInt16) if opts.DebugScore { - fileMatch.Debug = fmt.Sprintf("score: %.2f <- %s", fileMatch.Score, strings.TrimSuffix(fileMatch.Debug, ", ")) + // To make the debug output easier to read, we split the score into the query + // dependent score and the tiebreaker + score := math.Trunc(fileMatch.Score / ScoreOffset) + tiebreaker := fileMatch.Score - score*ScoreOffset + fileMatch.Debug = fmt.Sprintf("score: %d (%.2f) <- %s", int(score), tiebreaker, strings.TrimSuffix(fileMatch.Debug, ", ")) } } diff --git a/shards/shards_test.go b/shards/shards_test.go index 4a9a3865..7c4ae3d6 100644 --- a/shards/shards_test.go +++ b/shards/shards_test.go @@ -229,12 +229,12 @@ func TestShardedSearcher_DocumentRanking(t *testing.T) { ss := newShardedSearcher(1) var nextShardNum int - addShard := func(repo string, priority float64, docs ...zoekt.Document) { + addShard := func(repo string, rank uint16, docs ...zoekt.Document) { r := &zoekt.Repository{ID: hash(repo), Name: repo} r.RawConfig = map[string]string{ - "public": "1", - "priority": strconv.FormatFloat(priority, 'f', 2, 64), + "public": "1", } + r.Rank = rank b := testIndexBuilder(t, r, docs...) shard := searcherForTest(t, b) ss.replace(map[string]zoekt.Searcher{ @@ -243,10 +243,10 @@ func TestShardedSearcher_DocumentRanking(t *testing.T) { nextShardNum++ } - addShard("weekend-project", 20, zoekt.Document{Name: "f1", Content: []byte("foobar")}) - addShard("moderately-popular", 500, zoekt.Document{Name: "f2", Content: []byte("foobaz")}) - addShard("weekend-project-2", 20, zoekt.Document{Name: "f3", Content: []byte("foo bar")}) - addShard("super-star", 5000, zoekt.Document{Name: "f4", Content: []byte("foo baz")}, + addShard("old-project", 1, zoekt.Document{Name: "f1", Content: []byte("foobar")}) + addShard("recent", 2, zoekt.Document{Name: "f2", Content: []byte("foobaz")}) + addShard("old-project-2", 1, zoekt.Document{Name: "f3", Content: []byte("foo bar")}) + addShard("new", 3, zoekt.Document{Name: "f4", Content: []byte("foo baz")}, zoekt.Document{Name: "f5", Content: []byte("fooooo")}) // Run a stream search and gather the results diff --git a/testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden b/testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden index 6c5faaac..e86b63f0 100644 --- a/testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden +++ b/testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden @@ -29,7 +29,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 6810 + "Score": 68000000010 } ], [ @@ -59,7 +59,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 510 + "Score": 5000000010 } ], [ @@ -94,7 +94,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 8010 + "Score": 80000000010 } ], [ @@ -129,7 +129,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 6060 + "Score": 60500000010 } ] ] diff --git a/testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden b/testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden index d8054cc3..dacd0702 100644 --- a/testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden +++ b/testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden @@ -29,7 +29,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 6810 + "Score": 68000000010 } ], [ @@ -59,7 +59,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 510 + "Score": 5000000010 } ], [ @@ -94,7 +94,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 8010 + "Score": 80000000010 } ], [ @@ -129,7 +129,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 6060 + "Score": 60500000010 } ] ] diff --git a/testdata/golden/TestReadSearch/repo17_v17.00000.golden b/testdata/golden/TestReadSearch/repo17_v17.00000.golden index 452429bc..6182ba83 100644 --- a/testdata/golden/TestReadSearch/repo17_v17.00000.golden +++ b/testdata/golden/TestReadSearch/repo17_v17.00000.golden @@ -29,7 +29,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 510 + "Score": 5000000010 } ], [ @@ -59,7 +59,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 510 + "Score": 5000000010 } ], null, diff --git a/testdata/golden/TestReadSearch/repo2_v16.00000.golden b/testdata/golden/TestReadSearch/repo2_v16.00000.golden index cf70c4e0..92d97e1b 100644 --- a/testdata/golden/TestReadSearch/repo2_v16.00000.golden +++ b/testdata/golden/TestReadSearch/repo2_v16.00000.golden @@ -29,7 +29,7 @@ } ], "Checksum": "Ju1TnQKZ6mE=", - "Score": 6810 + "Score": 68000000010 } ], [ @@ -59,7 +59,7 @@ } ], "Checksum": "Ju1TnQKZ6mE=", - "Score": 510 + "Score": 5000000010 } ], null, diff --git a/testdata/golden/TestReadSearch/repo_v16.00000.golden b/testdata/golden/TestReadSearch/repo_v16.00000.golden index 7f5f3c41..13cb6680 100644 --- a/testdata/golden/TestReadSearch/repo_v16.00000.golden +++ b/testdata/golden/TestReadSearch/repo_v16.00000.golden @@ -29,7 +29,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 510 + "Score": 5000000010 } ], [ @@ -59,7 +59,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 510 + "Score": 5000000010 } ], null,