Skip to content

Commit

Permalink
Indexing: improve skipped doc handling (#687)
Browse files Browse the repository at this point in the history
This change makes a couple of small improvements to how we handle skipped docs:
* Immediately skip ctags parsing if the content is empty (`nil` or zero-length)
* Always sort skipped docs to the end of the shard. This seems like a nice
invariant. And generally it's good for performance to group data that is
expected to be accessed together and has similar content.
  • Loading branch information
jtibshirani authored Nov 13, 2023
1 parent 2355607 commit e068116
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 1 deletion.
8 changes: 8 additions & 0 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,11 @@ type rankedDoc struct {
// at query time, because earlier documents receive a boost at query time and
// have a higher chance of being searched before limits kick in.
func rank(d *zoekt.Document, origIdx int) []float64 {
skipped := 0.0
if d.SkipReason != "" {
skipped = 1.0
}

generated := 0.0
if isGenerated(d.Name) {
generated = 1.0
Expand All @@ -968,6 +973,9 @@ func rank(d *zoekt.Document, origIdx int) []float64 {

// Smaller is earlier (=better).
return []float64{
// Always place skipped docs last
skipped,

// Prefer docs that are not generated
generated,

Expand Down
2 changes: 1 addition & 1 deletion build/ctags.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func ctagsAddSymbolsParserMap(todo []*zoekt.Document, languageMap ctags.Language
var tagsToSections tagsToSections

for _, doc := range todo {
if doc.Symbols != nil {
if len(doc.Content) == 0 || doc.Symbols != nil {
continue
}

Expand Down
21 changes: 21 additions & 0 deletions build/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,27 @@ func TestFileRank(t *testing.T) {
},
},
want: []int{0, 2, 1},
}, {
name: "skipped docs",
docs: []*zoekt.Document{
{
Name: "binary_file",
SkipReason: "binary file",
},
{
Name: "some_test.go",
Content: []byte("bla"),
},
{
Name: "large_file.go",
SkipReason: "too large",
},
{
Name: "file.go",
Content: []byte("blabla"),
},
},
want: []int{3, 1, 0, 2},
}} {
t.Run(c.name, func(t *testing.T) {
testFileRankAspect(t, c)
Expand Down

0 comments on commit e068116

Please sign in to comment.