Skip to content

Commit

Permalink
chore: remove unused feature "document ranks"
Browse files Browse the repository at this point in the history
Document ranks was an experimental feature that allowed Zoekt to query
Sourcegraph for additional ranking signals for files at index-time. We
have decided to remove this feature because it was never used in
production.

Note:
Most of the changes are Sourcegraph specific. However there are some
exported fields in `SearchOptions` and `build.Options` as well as flags
for zoekt-git-index which I simply removed.

Test plan:
updated tests
  • Loading branch information
stefanhengl committed Oct 1, 2024
1 parent 6755eca commit f4436af
Show file tree
Hide file tree
Showing 25 changed files with 564 additions and 1,325 deletions.
14 changes: 0 additions & 14 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -960,15 +960,6 @@ type SearchOptions struct {
// EXPERIMENTAL: the behavior of this flag may be changed in future versions.
ChunkMatches bool

// EXPERIMENTAL. If true, document ranks are used as additional input for
// sorting matches.
UseDocumentRanks bool

// EXPERIMENTAL. When UseDocumentRanks is enabled, this can be optionally set to adjust
// their weight in the file match score. If the value is <= 0.0, the default weight value
// will be used. This option is temporary and is only exposed for testing/ tuning purposes.
DocumentRanksWeight float64

// EXPERIMENTAL. If true, use text-search style scoring instead of the default
// scoring formula. The scoring algorithm treats each match in a file as a term
// and computes an approximation to BM25.
Expand Down Expand Up @@ -1036,14 +1027,9 @@ func (s *SearchOptions) String() string {
addDuration("MaxWallTime", s.MaxWallTime)
addDuration("FlushWallTime", s.FlushWallTime)

if s.DocumentRanksWeight > 0 {
add("DocumentRanksWeight", strconv.FormatFloat(s.DocumentRanksWeight, 'g', -1, 64))
}

addBool("EstimateDocCount", s.EstimateDocCount)
addBool("Whole", s.Whole)
addBool("ChunkMatches", s.ChunkMatches)
addBool("UseDocumentRanks", s.UseDocumentRanks)
addBool("UseBM25Scoring", s.UseBM25Scoring)
addBool("Trace", s.Trace)
addBool("DebugScore", s.DebugScore)
Expand Down
7 changes: 2 additions & 5 deletions api_proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ import (
"math/rand"
"reflect"

proto "github.com/sourcegraph/zoekt/grpc/protos/zoekt/webserver/v1"
"google.golang.org/protobuf/types/known/durationpb"
"google.golang.org/protobuf/types/known/timestamppb"

proto "github.com/sourcegraph/zoekt/grpc/protos/zoekt/webserver/v1"
)

func FileMatchFromProto(p *proto.FileMatch) FileMatch {
Expand Down Expand Up @@ -696,8 +697,6 @@ func SearchOptionsFromProto(p *proto.SearchOptions) *SearchOptions {
MaxMatchDisplayCount: int(p.GetMaxMatchDisplayCount()),
NumContextLines: int(p.GetNumContextLines()),
ChunkMatches: p.GetChunkMatches(),
UseDocumentRanks: p.GetUseDocumentRanks(),
DocumentRanksWeight: p.GetDocumentRanksWeight(),
Trace: p.GetTrace(),
DebugScore: p.GetDebugScore(),
UseBM25Scoring: p.GetUseBm25Scoring(),
Expand All @@ -721,8 +720,6 @@ func (s *SearchOptions) ToProto() *proto.SearchOptions {
MaxMatchDisplayCount: int64(s.MaxMatchDisplayCount),
NumContextLines: int64(s.NumContextLines),
ChunkMatches: s.ChunkMatches,
UseDocumentRanks: s.UseDocumentRanks,
DocumentRanksWeight: s.DocumentRanksWeight,
Trace: s.Trace,
DebugScore: s.DebugScore,
UseBm25Scoring: s.UseBM25Scoring,
Expand Down
29 changes: 5 additions & 24 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,6 @@ type Options struct {
// last run.
IsDelta bool

// DocumentRanksPath is the path to the file with document ranks. If empty,
// ranks will be computed on-the-fly.
DocumentRanksPath string

// DocumentRanksVersion is a string which when changed will cause us to
// reindex a shard. This field is used so that when the contents of
// DocumentRanksPath changes, we can reindex.
DocumentRanksVersion string

// changedOrRemovedFiles is a list of file paths that have been changed or removed
// since the last indexing job for this repository. These files will be tombstoned
// in the older shards for this repository.
Expand All @@ -133,20 +124,15 @@ type HashOptions struct {
ctagsPath string
cTagsMustSucceed bool
largeFiles []string

// documentRankVersion is an experimental field which will change when the
// DocumentRanksPath content changes. If empty we ignore it.
documentRankVersion string
}

func (o *Options) HashOptions() HashOptions {
return HashOptions{
sizeMax: o.SizeMax,
disableCTags: o.DisableCTags,
ctagsPath: o.CTagsPath,
cTagsMustSucceed: o.CTagsMustSucceed,
largeFiles: o.LargeFiles,
documentRankVersion: o.DocumentRanksVersion,
sizeMax: o.SizeMax,
disableCTags: o.DisableCTags,
ctagsPath: o.CTagsPath,
cTagsMustSucceed: o.CTagsMustSucceed,
largeFiles: o.LargeFiles,
}
}

Expand All @@ -160,11 +146,6 @@ func (o *Options) GetHash() string {
hasher.Write([]byte(fmt.Sprintf("%q", h.largeFiles)))
hasher.Write([]byte(fmt.Sprintf("%t", h.disableCTags)))

if h.documentRankVersion != "" {
hasher.Write([]byte{0})
io.WriteString(hasher, h.documentRankVersion)
}

return fmt.Sprintf("%x", hasher.Sum(nil))
}

Expand Down
90 changes: 1 addition & 89 deletions build/scoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -654,92 +654,6 @@ func withoutTiebreaker(fullScore float64, useBM25 bool) float64 {
return math.Trunc(fullScore / zoekt.ScoreOffset)
}

func TestDocumentRanks(t *testing.T) {
requireCTags(t)
dir := t.TempDir()

opts := Options{
IndexDir: dir,
RepositoryDescription: zoekt.Repository{
Name: "repo",
},
DocumentRanksVersion: "ranking",
}

searchQuery := &query.Substring{Content: true, Pattern: "Inner"}
exampleJava, err := os.ReadFile("./testdata/example.java")
if err != nil {
t.Fatal(err)
}

cases := []struct {
name string
documentRank float64
documentRanksWeight float64
wantScore float64
}{
{
name: "score with no document ranks",
// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match)
wantScore: 7000.00,
},
{
name: "score with document ranks",
documentRank: 0.8,
// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 225 (file rank)
wantScore: 7225.00,
},
{
name: "score with custom document ranks weight",
documentRank: 0.8,
documentRanksWeight: 1000.0,
// 5500 (partial symbol at boundary) + 1000 (Java class) + 500 (word match) + 25.00 (file rank)
wantScore: 7025.00,
},
}

for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
b, err := NewBuilder(opts)
if err != nil {
t.Fatalf("NewBuilder: %v", err)
}

err = b.Add(zoekt.Document{Name: "example.java", Content: exampleJava, Ranks: []float64{c.documentRank}})
if err != nil {
t.Fatal(err)
}

if err := b.Finish(); err != nil {
t.Fatalf("Finish: %v", err)
}

ss, err := shards.NewDirectorySearcher(dir)
if err != nil {
t.Fatalf("NewDirectorySearcher(%s): %v", dir, err)
}
defer ss.Close()

srs, err := ss.Search(context.Background(), searchQuery, &zoekt.SearchOptions{
UseDocumentRanks: true,
DocumentRanksWeight: c.documentRanksWeight,
DebugScore: true,
})
if err != nil {
t.Fatal(err)
}

if got, want := len(srs.Files), 1; got != want {
t.Fatalf("file matches: want %d, got %d", want, got)
}

if got := withoutTiebreaker(srs.Files[0].Score, false); got != c.wantScore {
t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore)
}
})
}
}

func TestRepoRanks(t *testing.T) {
requireCTags(t)
dir := t.TempDir()
Expand All @@ -749,7 +663,6 @@ func TestRepoRanks(t *testing.T) {
RepositoryDescription: zoekt.Repository{
Name: "repo",
},
DocumentRanksVersion: "ranking",
}

searchQuery := &query.Substring{Content: true, Pattern: "Inner"}
Expand Down Expand Up @@ -810,8 +723,7 @@ func TestRepoRanks(t *testing.T) {
defer ss.Close()

srs, err := ss.Search(context.Background(), searchQuery, &zoekt.SearchOptions{
UseDocumentRanks: true,
DebugScore: true,
DebugScore: true,
})
if err != nil {
t.Fatal(err)
Expand Down
6 changes: 1 addition & 5 deletions cmd/zoekt-git-index/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ import (
"strings"

"github.com/dustin/go-humanize"
"github.com/sourcegraph/zoekt/internal/profiler"
"go.uber.org/automaxprocs/maxprocs"

"github.com/sourcegraph/zoekt/cmd"
"github.com/sourcegraph/zoekt/ctags"
"github.com/sourcegraph/zoekt/gitindex"
"github.com/sourcegraph/zoekt/internal/profiler"
)

func run() int {
Expand All @@ -43,8 +43,6 @@ func run() int {
"It also affects name if the indexed repository is under this directory.")
isDelta := flag.Bool("delta", false, "whether we should use delta build")
deltaShardNumberFallbackThreshold := flag.Uint64("delta_threshold", 0, "upper limit on the number of preexisting shards that can exist before attempting a delta build (0 to disable fallback behavior)")
offlineRanking := flag.String("offline_ranking", "", "the name of the file that contains the ranking info.")
offlineRankingVersion := flag.String("offline_ranking_version", "", "a version string identifying the contents in offline_ranking.")
languageMap := flag.String("language_map", "", "a mapping between a language and its ctags processor (a:0,b:3).")

cpuProfile := flag.String("cpuprofile", "", "write cpu profile to `file`")
Expand Down Expand Up @@ -76,8 +74,6 @@ func run() int {

opts := cmd.OptionsFromFlags()
opts.IsDelta = *isDelta
opts.DocumentRanksPath = *offlineRanking
opts.DocumentRanksVersion = *offlineRankingVersion

var branches []string
if *branchesStr != "" {
Expand Down
46 changes: 0 additions & 46 deletions cmd/zoekt-sourcegraph-indexserver/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bytes"
"context"
"crypto/sha1"
"encoding/json"
"errors"
"fmt"
"io"
Expand Down Expand Up @@ -53,11 +52,6 @@ type IndexOptions struct {
// Priority indicates ranking in results, higher first.
Priority float64

// DocumentRanksVersion when non-empty will lead to indexing using offline
// ranking. When the string changes this will also cause us to re-index with
// new ranks.
DocumentRanksVersion string

// Public is true if the repository is public.
Public bool

Expand Down Expand Up @@ -133,8 +127,6 @@ func (o *indexArgs) BuildOptions() *build.Options {
DisableCTags: !o.Symbols,
IsDelta: o.UseDelta,

DocumentRanksVersion: o.DocumentRanksVersion,

LanguageMap: o.LanguageMap,

ShardMerging: o.ShardMerging,
Expand Down Expand Up @@ -381,44 +373,6 @@ func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *i
"-submodules=false",
}

if o.DocumentRanksVersion != "" {
// We store the document ranks as JSON in gitDir and tell zoekt-git-index where
// to find the file.
documentsRankFile := filepath.Join(gitDir, "documents.rank")

saveDocumentRanks := func() error {
r, err := sourcegraph.GetDocumentRanks(context.Background(), o.Name)
if err != nil {
return fmt.Errorf("GetDocumentRanks: %w", err)
}

b, err := json.Marshal(r)
if err != nil {
return err
}

if err := os.WriteFile(documentsRankFile, b, 0o600); err != nil {
return fmt.Errorf("failed to write %s to disk: %w", documentsRankFile, err)
}

return nil
}

if err := saveDocumentRanks(); err != nil {
// log and fall back to online ranking
logger.Warn(
"error saving document ranks. Falling back to online ranking",
sglog.Error(err),
sglog.String("repo", o.Name),
sglog.Uint32("id", o.RepoID),
)
} else {
args = append(args,
"-offline_ranking", documentsRankFile,
"-offline_ranking_version", o.DocumentRanksVersion)
}
}

// Even though we check for incremental in this process, we still pass it
// in just in case we regress in how we check in process. We will still
// notice thanks to metrics and increased load on gitserver.
Expand Down
9 changes: 0 additions & 9 deletions cmd/zoekt-sourcegraph-indexserver/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,6 @@ var splitargs = cmpopts.AcyclicTransformer("splitargs", func(cmd string) []strin
type mockGRPCClient struct {
mockSearchConfiguration func(context.Context, *proto.SearchConfigurationRequest, ...grpc.CallOption) (*proto.SearchConfigurationResponse, error)
mockList func(context.Context, *proto.ListRequest, ...grpc.CallOption) (*proto.ListResponse, error)
mockDocumentRanks func(context.Context, *proto.DocumentRanksRequest, ...grpc.CallOption) (*proto.DocumentRanksResponse, error)
mockUpdateIndexStatus func(context.Context, *proto.UpdateIndexStatusRequest, ...grpc.CallOption) (*proto.UpdateIndexStatusResponse, error)
}

Expand All @@ -669,14 +668,6 @@ func (m *mockGRPCClient) List(ctx context.Context, in *proto.ListRequest, opts .
return nil, fmt.Errorf("mock RPC List not implemented")
}

func (m *mockGRPCClient) DocumentRanks(ctx context.Context, in *proto.DocumentRanksRequest, opts ...grpc.CallOption) (*proto.DocumentRanksResponse, error) {
if m.mockDocumentRanks != nil {
return m.mockDocumentRanks(ctx, in, opts...)
}

return nil, fmt.Errorf("mock RPC DocumentRanks not implemented")
}

func (m *mockGRPCClient) UpdateIndexStatus(ctx context.Context, in *proto.UpdateIndexStatusRequest, opts ...grpc.CallOption) (*proto.UpdateIndexStatusResponse, error) {
if m.mockUpdateIndexStatus != nil {
return m.mockUpdateIndexStatus(ctx, in, opts...)
Expand Down
Loading

0 comments on commit f4436af

Please sign in to comment.