From aaeeb646e45790ea1bb77ca06a25e3c900bc0029 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Mon, 28 Oct 2024 12:16:40 -0700 Subject: [PATCH 1/4] Avoid reopening packfile on every object access --- gitindex/index.go | 40 ++++++++++++++++++++-- gitindex/index_test.go | 78 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/gitindex/index.go b/gitindex/index.go index 183c3249..b6105ef8 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -32,6 +32,9 @@ import ( "strconv" "strings" + "github.com/go-git/go-billy/v5/osfs" + "github.com/go-git/go-git/v5/plumbing/cache" + "github.com/go-git/go-git/v5/storage/filesystem" "github.com/sourcegraph/zoekt" "github.com/sourcegraph/zoekt/build" "github.com/sourcegraph/zoekt/ignore" @@ -404,10 +407,12 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { } opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir - repo, err := git.PlainOpen(opts.RepoDir) + + repo, repoCloser, err := openRepo(opts.RepoDir) if err != nil { - return false, fmt.Errorf("git.PlainOpen: %w", err) + return false, fmt.Errorf("openRepo: %w", err) } + defer repoCloser.Close() if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) @@ -572,6 +577,37 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { return true, builder.Finish() } +// openRepo opens a git repository in a way that's optimized for indexing. +// +// It copies the relevant logic from git.PlainOpen, and enables the filesystem KeepDescriptors option. This +// caches the packfile handles, preventing the packfile from being opened then closed on every object access. +func openRepo(repoDir string) (*git.Repository, io.Closer, error) { + fs := osfs.New(repoDir) + + // Check if the root directory exists. + if _, err := fs.Stat(""); err != nil { + if os.IsNotExist(err) { + return nil, nil, git.ErrRepositoryNotExists + } + return nil, nil, err + } + + // If there's a .git directory, use that as the new root. + if _, err := fs.Stat(git.GitDirName); err == nil { + if fs, err = fs.Chroot(git.GitDirName); err != nil { + return nil, nil, fmt.Errorf("fs.Chroot: %w", err) + } + } + + s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{ + KeepDescriptors: true, + }) + + // Because we're keeping descriptors open, we need to close the storage object when we're done. + repo, err := git.Open(s, fs) + return repo, s, err +} + type repoPathRanks struct { MeanRank float64 `json:"mean_reference_count"` Paths map[string]float64 `json:"paths"` diff --git a/gitindex/index_test.go b/gitindex/index_test.go index d38b49f0..f655c960 100644 --- a/gitindex/index_test.go +++ b/gitindex/index_test.go @@ -17,6 +17,7 @@ package gitindex import ( "bytes" "context" + "errors" "fmt" "net/url" "os" @@ -64,6 +65,83 @@ func TestIndexEmptyRepo(t *testing.T) { } } +func TestIndexNonexistentRepo(t *testing.T) { + dir := t.TempDir() + desc := zoekt.Repository{ + Name: "nonexistent", + } + opts := Options{ + RepoDir: "does/not/exist", + Branches: []string{"main"}, + BuildOptions: build.Options{ + RepositoryDescription: desc, + IndexDir: dir, + }, + } + + if _, err := IndexGitRepo(opts); err == nil { + t.Fatal("expected error, got none") + } else if !errors.Is(err, git.ErrRepositoryNotExists) { + t.Fatalf("expected git.ErrRepositoryNotExists, got %v", err) + } +} + +func TestIndexTinyRepo(t *testing.T) { + // Create a repo with one file in it. + dir := t.TempDir() + executeCommand(t, dir, exec.Command("git", "init", "-b", "main", "repo")) + executeCommand(t, dir, exec.Command("git", "config", "--global", "user.name", "Thomas")) + executeCommand(t, dir, exec.Command("git", "config", "--global", "user.email", "thomas@google.com")) + + if err := os.WriteFile(filepath.Join(dir, "repo", "file1.go"), []byte("package main\n\nfunc main() {}\n"), 0644); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + repoDir := filepath.Join(dir, "repo") + executeCommand(t, repoDir, exec.Command("git", "add", ".")) + executeCommand(t, repoDir, exec.Command("git", "commit", "-m", "initial commit")) + + // Test that indexing accepts both the repo directory, and the .git subdirectory. + for _, testDir := range []string{"repo", "repo/.git"} { + opts := Options{ + RepoDir: filepath.Join(dir, testDir), + Branches: []string{"main"}, + BuildOptions: build.Options{ + RepositoryDescription: zoekt.Repository{Name: "repo"}, + IndexDir: dir, + }, + } + + if _, err := IndexGitRepo(opts); err != nil { + t.Fatalf("unexpected error %v", err) + } + + searcher, err := shards.NewDirectorySearcher(dir) + if err != nil { + t.Fatal("NewDirectorySearcher", err) + } + + results, err := searcher.Search(context.Background(), &query.Const{Value: true}, &zoekt.SearchOptions{}) + searcher.Close() + + if err != nil { + t.Fatal("search failed", err) + } + + if len(results.Files) != 1 { + t.Fatalf("got search result %v, want 1 file", results.Files) + } + } +} + +func executeCommand(t *testing.T, dir string, cmd *exec.Cmd) *exec.Cmd { + cmd.Dir = dir + if err := cmd.Run(); err != nil { + t.Fatalf("cmd.Run: %v", err) + } + return cmd +} + func TestIndexDeltaBasic(t *testing.T) { type branchToDocumentMap map[string][]zoekt.Document From 5da54b9a82e919e995ad94a21d4f54622aa9e5b6 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Wed, 30 Oct 2024 10:40:43 -0700 Subject: [PATCH 2/4] Avoid setting global git config --- gitindex/index_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gitindex/index_test.go b/gitindex/index_test.go index f655c960..34098362 100644 --- a/gitindex/index_test.go +++ b/gitindex/index_test.go @@ -90,14 +90,14 @@ func TestIndexTinyRepo(t *testing.T) { // Create a repo with one file in it. dir := t.TempDir() executeCommand(t, dir, exec.Command("git", "init", "-b", "main", "repo")) - executeCommand(t, dir, exec.Command("git", "config", "--global", "user.name", "Thomas")) - executeCommand(t, dir, exec.Command("git", "config", "--global", "user.email", "thomas@google.com")) - if err := os.WriteFile(filepath.Join(dir, "repo", "file1.go"), []byte("package main\n\nfunc main() {}\n"), 0644); err != nil { + repoDir := filepath.Join(dir, "repo") + executeCommand(t, repoDir, exec.Command("git", "config", "user.name", "Thomas")) + executeCommand(t, repoDir, exec.Command("git", "config", "user.email", "thomas@google.com")) + + if err := os.WriteFile(filepath.Join(repoDir, "file1.go"), []byte("package main\n\nfunc main() {}\n"), 0644); err != nil { t.Fatalf("WriteFile: %v", err) } - - repoDir := filepath.Join(dir, "repo") executeCommand(t, repoDir, exec.Command("git", "add", ".")) executeCommand(t, repoDir, exec.Command("git", "commit", "-m", "initial commit")) From 7af6e78f32ff660092f9e21c6f503a08e38c71a3 Mon Sep 17 00:00:00 2001 From: thomas Date: Wed, 30 Oct 2024 11:01:09 -0700 Subject: [PATCH 3/4] Put optimization behind temporary feature flag --- gitindex/index.go | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/gitindex/index.go b/gitindex/index.go index b6105ef8..70d6eefa 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -408,11 +408,23 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { opts.BuildOptions.RepositoryDescription.Source = opts.RepoDir - repo, repoCloser, err := openRepo(opts.RepoDir) - if err != nil { - return false, fmt.Errorf("openRepo: %w", err) + var repo *git.Repository + + // TODO: remove this feature flag once we test this on a large-scale instance. + optimizeRepoOpen := os.Getenv("ZOEKT_ENABLE_GOGIT_OPTIMIZATION") + if b, err := strconv.ParseBool(optimizeRepoOpen); b && err == nil { + var repoCloser io.Closer + repo, repoCloser, err = openRepo(opts.RepoDir) + if err != nil { + return false, fmt.Errorf("openRepo: %w", err) + } + defer repoCloser.Close() + } else { + repo, err = git.PlainOpen(opts.RepoDir) + if err != nil { + return false, fmt.Errorf("git.PlainOpen: %w", err) + } } - defer repoCloser.Close() if err := setTemplatesFromConfig(&opts.BuildOptions.RepositoryDescription, opts.RepoDir); err != nil { log.Printf("setTemplatesFromConfig(%s): %s", opts.RepoDir, err) From 9b029298c648bf073897b295f97bfc67718dfd14 Mon Sep 17 00:00:00 2001 From: thomas Date: Wed, 30 Oct 2024 11:04:54 -0700 Subject: [PATCH 4/4] Only change root if .git is directory --- gitindex/index.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitindex/index.go b/gitindex/index.go index 70d6eefa..d0682f7e 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -605,7 +605,7 @@ func openRepo(repoDir string) (*git.Repository, io.Closer, error) { } // If there's a .git directory, use that as the new root. - if _, err := fs.Stat(git.GitDirName); err == nil { + if fi, err := fs.Stat(git.GitDirName); err == nil && fi.IsDir() { if fs, err = fs.Chroot(git.GitDirName); err != nil { return nil, nil, fmt.Errorf("fs.Chroot: %w", err) }