diff --git a/indexdata.go b/indexdata.go index 3e8095d3..6cc11da5 100644 --- a/indexdata.go +++ b/indexdata.go @@ -76,7 +76,12 @@ type indexData struct { // name => mask (power of 2) branchIDs []map[string]uint - metaData IndexMetadata + metaData IndexMetadata + + // Repository metadata for all repositories contained in this shard. + // + // Invariant: Repositories in repoMetaData are sorted by ID in increasing order, + // see zoekt.merge. repoMetaData []Repository subRepos []uint32 diff --git a/merge.go b/merge.go index 0473663e..f574f5e0 100644 --- a/merge.go +++ b/merge.go @@ -93,9 +93,19 @@ func merge(ds ...*indexData) (*IndexBuilder, error) { if len(ds) == 0 { return nil, fmt.Errorf("need 1 or more indexData to merge") } - + // We sort shards in compound shards by repository ID so that we can use binary + // search instead of looping through the entire list of repos when calling + // `build.findShard`. + // + // We have seen that for very large compound shards (5k+ repos) it can take a + // long time to find a specific repo which can slow down initial indexing even + // for no-op indexes after a restart of indexserver. + // + // The sort order will impact ranking of non-bm25 search, because doc-order acts + // as a tie-breaker. However, the actual impact is very small, because + // "LastCommitDate" is a stronger signal. sort.Slice(ds, func(i, j int) bool { - return ds[i].repoMetaData[0].priority > ds[j].repoMetaData[0].priority + return ds[i].repoMetaData[0].ID < ds[j].repoMetaData[0].ID }) ib := newIndexBuilder() diff --git a/merge_test.go b/merge_test.go index 07abf1ca..27a4bc78 100644 --- a/merge_test.go +++ b/merge_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/require" ) // We compare 2 simple shards before and after the transformation @@ -124,3 +125,44 @@ func checkSameShards(t *testing.T, shard1, shard2 string) { t.Fatalf("-%s\n+%s:\n%s", shard1, shard2, d) } + +func TestRepoOrder(t *testing.T) { + idNames := []struct { + id uint32 + name string + }{ + {3, "foo"}, + {1, "bar"}, + {4, "baz"}, + {2, "bas"}, + } + + ds := make([]*indexData, 0, len(idNames)) + for _, repo := range idNames { + ib := newIndexBuilder() + ib.indexFormatVersion = NextIndexFormatVersion + + err := ib.setRepository(&Repository{Name: repo.name, ID: repo.id}) + require.NoError(t, err) + + // Add some documents to the index. + for _, doc := range []Document{ + {Name: repo.name + ".txt", Content: []byte(repo.name + " content")}, + {Name: repo.name + ".2.txt", Content: []byte(repo.name + " content 2")}, + } { + err := ib.Add(doc) + require.NoError(t, err) + } + + d := searcherForTest(t, ib) + ds = append(ds, d.(*indexData)) + } + + ib, err := merge(ds...) + require.NoError(t, err) + + require.Len(t, ib.repoList, len(idNames)) + for i := 1; i < len(ib.repoList); i++ { + require.Less(t, ib.repoList[i-1].ID, ib.repoList[i].ID) + } +}