Skip to content

Commit

Permalink
Avoid reopening packfile on every object access (#852)
Browse files Browse the repository at this point in the history
By default, the `go-git` library will open the packfile on every call to `Repository.BlobObject`, then close it. During indexing, we collect the list of files to index, then iterate through each one calling `Repository.BlobObject`. So on every object access the packfile is reopened, and `go-git` reallocates some in-memory buffers.

This PR bypasses `git.PlainOpen` to allow us to enable the `KeepDescriptors` option. This option keeps packfile files open, and caches wrappers for them. The files then need to be explicitly closed when done with the repo.

Benefits:
* Avoid reallocating the memory buffers on every object access (see benchmark results below)
* (Highly speculative) I suspect this could improve OS decisions around when to cache portions of the packfile. Maybe constantly reopening and seeking within the file makes it harder for the OS to determine the true access pattern, which is roughly random access. This can affect decisions like readahead and whether to consider pages 'active'.
  • Loading branch information
jtibshirani authored Oct 30, 2024
1 parent 7caa174 commit 6a4b615
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 3 deletions.
54 changes: 51 additions & 3 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ import (
"strconv"
"strings"

"github.com/go-git/go-billy/v5/osfs"
"github.com/go-git/go-git/v5/plumbing/cache"
"github.com/go-git/go-git/v5/storage/filesystem"
"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/build"
"github.com/sourcegraph/zoekt/ignore"
Expand Down Expand Up @@ -404,9 +407,23 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
}

opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir
repo, err := git.PlainOpen(opts.RepoDir)
if err != nil {
return false, fmt.Errorf("git.PlainOpen: %w", err)

var repo *git.Repository

// TODO: remove this feature flag once we test this on a large-scale instance.
optimizeRepoOpen := os.Getenv("ZOEKT_ENABLE_GOGIT_OPTIMIZATION")
if b, err := strconv.ParseBool(optimizeRepoOpen); b && err == nil {
var repoCloser io.Closer
repo, repoCloser, err = openRepo(opts.RepoDir)
if err != nil {
return false, fmt.Errorf("openRepo: %w", err)
}
defer repoCloser.Close()
} else {
repo, err = git.PlainOpen(opts.RepoDir)
if err != nil {
return false, fmt.Errorf("git.PlainOpen: %w", err)
}
}

if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil {
Expand Down Expand Up @@ -572,6 +589,37 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
return true, builder.Finish()
}

// openRepo opens the git repository at repoDir in a way that's optimized for indexing.
//
// It copies the relevant logic from git.PlainOpen, and enables the filesystem
// KeepDescriptors option. This caches the packfile handles, preventing the
// packfile from being opened then closed on every object access.
//
// On success, the returned io.Closer is the underlying storage; because
// descriptors are kept open, the caller must close it when done with the
// repository. On failure, both the repository and the closer are nil and the
// storage (if one was created) has already been closed.
//
// It returns git.ErrRepositoryNotExists when repoDir does not exist.
func openRepo(repoDir string) (*git.Repository, io.Closer, error) {
	fs := osfs.New(repoDir)

	// Check if the root directory exists.
	if _, err := fs.Stat(""); err != nil {
		if os.IsNotExist(err) {
			return nil, nil, git.ErrRepositoryNotExists
		}
		return nil, nil, err
	}

	// If there's a .git directory, use that as the new root.
	if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() {
		if fs, err = fs.Chroot(git.GitDirName); err != nil {
			return nil, nil, fmt.Errorf("fs.Chroot: %w", err)
		}
	}

	s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{
		KeepDescriptors: true,
	})

	repo, err := git.Open(s, fs)
	if err != nil {
		// Callers only close the storage after a successful open, so release
		// it here. The close error is deliberately dropped: the open error is
		// the one worth reporting.
		_ = s.Close()
		return nil, nil, err
	}

	// Because we're keeping descriptors open, the caller needs to close the
	// storage object when it's done.
	return repo, s, nil
}

type repoPathRanks struct {
MeanRank float64 `json:"mean_reference_count"`
Paths map[string]float64 `json:"paths"`
Expand Down
78 changes: 78 additions & 0 deletions gitindex/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package gitindex
import (
"bytes"
"context"
"errors"
"fmt"
"net/url"
"os"
Expand Down Expand Up @@ -64,6 +65,83 @@ func TestIndexEmptyRepo(t *testing.T) {
}
}

func TestIndexNonexistentRepo(t *testing.T) {
dir := t.TempDir()
desc := zoekt.Repository{
Name: "nonexistent",
}
opts := Options{
RepoDir: "does/not/exist",
Branches: []string{"main"},
BuildOptions: build.Options{
RepositoryDescription: desc,
IndexDir: dir,
},
}

if _, err := IndexGitRepo(opts); err == nil {
t.Fatal("expected error, got none")
} else if !errors.Is(err, git.ErrRepositoryNotExists) {
t.Fatalf("expected git.ErrRepositoryNotExists, got %v", err)
}
}

// TestIndexTinyRepo builds a repository with a single committed file and
// verifies that it can be indexed and searched, whether RepoDir points at the
// repository directory or at its .git subdirectory.
func TestIndexTinyRepo(t *testing.T) {
	// Set up a repo holding exactly one file.
	root := t.TempDir()
	executeCommand(t, root, exec.Command("git", "init", "-b", "main", "repo"))

	repoDir := filepath.Join(root, "repo")
	identity := [][]string{
		{"config", "user.name", "Thomas"},
		{"config", "user.email", "[email protected]"},
	}
	for _, args := range identity {
		executeCommand(t, repoDir, exec.Command("git", args...))
	}

	contents := []byte("package main\n\nfunc main() {}\n")
	if err := os.WriteFile(filepath.Join(repoDir, "file1.go"), contents, 0644); err != nil {
		t.Fatalf("WriteFile: %v", err)
	}
	executeCommand(t, repoDir, exec.Command("git", "add", "."))
	executeCommand(t, repoDir, exec.Command("git", "commit", "-m", "initial commit"))

	// Indexing has to accept the repo directory as well as the .git subdirectory.
	candidates := []string{"repo", "repo/.git"}
	for i := range candidates {
		opts := Options{
			RepoDir:  filepath.Join(root, candidates[i]),
			Branches: []string{"main"},
			BuildOptions: build.Options{
				RepositoryDescription: zoekt.Repository{Name: "repo"},
				IndexDir:              root,
			},
		}

		_, indexErr := IndexGitRepo(opts)
		if indexErr != nil {
			t.Fatalf("unexpected error %v", indexErr)
		}

		searcher, err := shards.NewDirectorySearcher(root)
		if err != nil {
			t.Fatal("NewDirectorySearcher", err)
		}

		// Match everything, then release the searcher before inspecting the outcome.
		matchAll := &query.Const{Value: true}
		results, searchErr := searcher.Search(context.Background(), matchAll, &zoekt.SearchOptions{})
		searcher.Close()

		if searchErr != nil {
			t.Fatal("search failed", searchErr)
		}

		if found := len(results.Files); found != 1 {
			t.Fatalf("got search result %v, want 1 file", results.Files)
		}
	}
}

// executeCommand runs cmd with its working directory set to dir and fails the
// test immediately if the command cannot be started or exits unsuccessfully.
// It returns the same cmd so callers can inspect it after it has run.
func executeCommand(t *testing.T, dir string, cmd *exec.Cmd) *exec.Cmd {
	// Mark this function as a helper so a failure is reported at the caller's
	// line, which identifies the command that actually broke.
	t.Helper()

	cmd.Dir = dir
	if err := cmd.Run(); err != nil {
		t.Fatalf("cmd.Run: running %q in %q: %v", cmd.Args, dir, err)
	}
	return cmd
}

func TestIndexDeltaBasic(t *testing.T) {
type branchToDocumentMap map[string][]zoekt.Document

Expand Down

0 comments on commit 6a4b615

Please sign in to comment.