sourcegraph · stefanhengl · Jan 13, 2025 · Jan 14, 2025 · jtibshirani · Jan 13, 2025
diff --git a/indexdata.go b/indexdata.go
@@ -76,7 +76,12 @@ type indexData struct {
 	// name => mask (power of 2)
 	branchIDs []map[string]uint
 
-	metaData     IndexMetadata
+	metaData IndexMetadata
+
+	// Repository metadata for all repositories contained in this shard.
+	//
+	// Invariant: Repositories in repoMetaData are sorted by ID in increasing order,
+	// see zoekt.merge.
 	repoMetaData []Repository
 
 	subRepos     []uint32

diff --git a/merge.go b/merge.go
@@ -93,9 +93,19 @@ func merge(ds ...*indexData) (*IndexBuilder, error) {
 	if len(ds) == 0 {
 		return nil, fmt.Errorf("need 1 or more indexData to merge")
 	}
-
+	// We sort shards in compound shards by repository ID so that we can use binary
+	// search instead of looping through the entire list of repos when calling
+	// `build.findShard`.
+	//
+	// We have seen that for very large compound shards (5k+ repos) it can take a
+	// long time to find a specific repo which can slow down initial indexing even
+	// for no-op indexes after a restart of indexserver.
+	//
+	// The sort order will impact ranking of non-bm25 search, because doc-order acts
+	// as a tie-breaker. However, the actual impact is very small, because
+	// "LastCommitDate" is a stronger signal.
 	sort.Slice(ds, func(i, j int) bool {
-		return ds[i].repoMetaData[0].priority > ds[j].repoMetaData[0].priority
+		return ds[i].repoMetaData[0].ID < ds[j].repoMetaData[0].ID
 	})
 
 	ib := newIndexBuilder()

diff --git a/merge_test.go b/merge_test.go
@@ -6,6 +6,7 @@ import (
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
+	"github.com/stretchr/testify/require"
 )
 
 // We compare 2 simple shards before and after the transformation
@@ -124,3 +125,44 @@ func checkSameShards(t *testing.T, shard1, shard2 string) {
 
 	t.Fatalf("-%s\n+%s:\n%s", shard1, shard2, d)
 }
+
+func TestRepoOrder(t *testing.T) {
+	idNames := []struct {
+		id   uint32
+		name string
+	}{
+		{3, "foo"},
+		{1, "bar"},
+		{4, "baz"},
+		{2, "bas"},
+	}
+
+	ds := make([]*indexData, 0, len(idNames))
+	for _, repo := range idNames {
+		ib := newIndexBuilder()
+		ib.indexFormatVersion = NextIndexFormatVersion
+
+		err := ib.setRepository(&Repository{Name: repo.name, ID: repo.id})
+		require.NoError(t, err)
+
+		// Add some documents to the index.
+		for _, doc := range []Document{
+			{Name: repo.name + ".txt", Content: []byte(repo.name + " content")},
+			{Name: repo.name + ".2.txt", Content: []byte(repo.name + " content 2")},
+		} {
+			err := ib.Add(doc)
+			require.NoError(t, err)
+		}
+
+		d := searcherForTest(t, ib)
+		ds = append(ds, d.(*indexData))
+	}
+
+	ib, err := merge(ds...)
+	require.NoError(t, err)
+
+	require.Len(t, ib.repoList, len(idNames))
+	for i := 1; i < len(ib.repoList); i++ {
+		require.Less(t, ib.repoList[i-1].ID, ib.repoList[i].ID)
+	}
+}