From 77547cc97e63bfc8e8a045e0ed99000bb7cc9fc1 Mon Sep 17 00:00:00 2001 From: Nick Date: Sun, 27 Nov 2022 00:28:55 -0500 Subject: [PATCH] git-annex: create modules/annex (#21) This moves the `annexObjectPath()` helper out of the tests and into a dedicated sub-package as `annex.ContentLocation()`, and expands it with `.Pointer()` (which validates using `git annex examinekey`), `.IsAnnexed()` and `.Content()` to make it a more useful module. The tests retain their own wrapper version of `ContentLocation()` because I tried to closely follow the API that modules/lfs uses, which works in terms of abstract `git.Blob` and `git.TreeEntry` objects, not in terms of `repoPath string`s, which are more convenient for the tests. --- modules/annex/annex.go | 154 ++++++++++++++++++++++++++++ modules/git/blob.go | 4 + modules/git/blob_gogit.go | 3 +- modules/git/repo_blob_gogit.go | 1 + modules/git/tree_entry_gogit.go | 1 + tests/integration/git_annex_test.go | 39 ++++--- 6 files changed, 184 insertions(+), 18 deletions(-) create mode 100644 modules/annex/annex.go diff --git a/modules/annex/annex.go b/modules/annex/annex.go new file mode 100644 index 0000000000000..bb049d77ed686 --- /dev/null +++ b/modules/annex/annex.go @@ -0,0 +1,154 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +// Unlike modules/lfs, which operates mainly on git.Blobs, this operates on git.TreeEntrys. +// The motivation for this is that TreeEntrys have an easy pointer to the on-disk repo path, +// while blobs do not (in fact, if building with TAGS=gogit, blobs might exist only in a mock +// filesystem, living only in process RAM). We must have the on-disk path to do anything +// useful with git-annex because all of its interesting data is on-disk under .git/annex/. 
+ +package annex + +import ( + "errors" + "fmt" + "os" + "path" + "strings" + + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/util" +) + +const ( + // > The maximum size of a pointer file is 32 kb. + // - https://git-annex.branchable.com/internals/pointer_file/ + // It's unclear if that's kilobytes or kibibytes; assuming kibibytes: + blobSizeCutoff = 32 * 1024 +) + +// ErrInvalidPointer occurs if the pointer's value doesn't parse +var ErrInvalidPointer = errors.New("Not a git-annex pointer") + +// getBlobContent returns the content of the blob as raw text, up to n bytes, or the error that interrupted the read. +// (the pre-existing blob.GetBlobContent() has a hardcoded 1024-byte limit) +func getBlobContent(b *git.Blob, n int) (string, error) { + dataRc, err := b.DataAsync() + if err != nil { + return "", err + } + defer dataRc.Close() + buf := make([]byte, n) + n, err = util.ReadAtMost(dataRc, buf) + if err != nil { + return "", err + } + return string(buf[:n]), nil +} + +// Pointer returns the git-annex key the blob points to, or ErrInvalidPointer if the blob does not look like a valid annex pointer. +func Pointer(blob *git.Blob) (string, error) { + // git-annex doesn't seem to fully specify what its pointers are, but + // the fullest description is here: + // https://git-annex.branchable.com/internals/pointer_file/ + + // a pointer can be: + // the original format, generated by `git annex add`: a symlink to '.git/annex/objects/$HASHDIR/$HASHDIR2/$KEY/$KEY' + // the newer, git-lfs influenced, format, generated by `git annex smudge`: a text file containing '/annex/objects/$KEY' + // + // in either case we can extract the $KEY the same way, and we need not actually know if it's a symlink or not because + // git.Blob.DataAsync() works like open() + readlink(), handling both cases in one. + + if blob.Size() > blobSizeCutoff { + // > The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file. + // https://git-annex.branchable.com/internals/pointer_file/ + + // It's unclear to me whether the same size limit applies to symlink-pointers, but it seems sensible to limit them too. 
+ return "", ErrInvalidPointer + } + + pointer, err := getBlobContent(blob, blobSizeCutoff) + if err != nil { + return "", fmt.Errorf("error reading %s: %w", blob.Name(), err) + } + + // the spec says a pointer file can contain multiple lines each with a pointer in them + // but that makes no sense to me, so I'm just ignoring all but the first (strings.Split always yields at least one element) + lines := strings.Split(pointer, "\n") + pointer = lines[0] + + // in both the symlink and pointer-file formats, the pointer must have "/annex/" somewhere in it + if !strings.Contains(pointer, "/annex/") { + return "", ErrInvalidPointer + } + + // extract $KEY + pointer = path.Base(strings.TrimSpace(pointer)) + + // ask git-annex's opinion on $KEY + // XXX: this is probably a bit slow, especially if this operation gets run often + // and examinekey is not that strict: + // - it doesn't enforce that the "BACKEND" tag is one it knows, + // - it doesn't enforce that the fields and their format fit the "BACKEND" tag + // so maybe this is a wasteful step + _, examineStderr, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "examinekey").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path}) + if err != nil { + // TODO: make ErrInvalidPointer into a type capable of wrapping err + if strings.TrimSpace(examineStderr) == "git-annex: bad key" { + return "", ErrInvalidPointer + } + return "", err + } + + return pointer, nil +} + +// ContentLocation returns the absolute path of the content pointed to by the annex pointer stored in the git object. +// It errors if the content is not found in this repo. +func ContentLocation(blob *git.Blob) (string, error) { + pointer, err := Pointer(blob) + if err != nil { + return "", err + } + + contentLocation, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "contentlocation").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path}) + if err != nil { + return "", fmt.Errorf("in %s: 
%s does not seem to be a valid annexed file: %w", blob.Repo().Path, pointer, err) + } + contentLocation = strings.TrimSpace(contentLocation) + contentLocation = path.Clean("/" + contentLocation)[1:] // prevent directory traversals + contentLocation = path.Join(blob.Repo().Path, contentLocation) + + return contentLocation, nil +} + +// Content returns a stream open to the annex content; the caller is responsible for closing it. +func Content(blob *git.Blob) (*os.File, error) { + contentLocation, err := ContentLocation(blob) + if err != nil { + return nil, err + } + + return os.Open(contentLocation) +} + +// IsAnnexed reports whether the object appears to be a valid annex pointer; it is always false when setting.Annex.Enabled is off. +// It does *not* verify if the content is actually in this repo; +// for that, use ContentLocation(). +func IsAnnexed(blob *git.Blob) (bool, error) { + if !setting.Annex.Enabled { + return false, nil + } + + // Pointer() is written to only return well-formed pointers + // so the test is just to see if it errors + _, err := Pointer(blob) + if err != nil { + if errors.Is(err, ErrInvalidPointer) { + return false, nil + } + return false, err + } + return true, nil +} diff --git a/modules/git/blob.go b/modules/git/blob.go index bcecb42e16ebb..34224f6c085b0 100644 --- a/modules/git/blob.go +++ b/modules/git/blob.go @@ -15,6 +15,10 @@ import ( // This file contains common functions between the gogit and !gogit variants for git Blobs +func (b *Blob) Repo() *Repository { + return b.repo +} + // Name returns name of the tree entry this blob object was created from (or empty string) func (b *Blob) Name() string { return b.name diff --git a/modules/git/blob_gogit.go b/modules/git/blob_gogit.go index aa206409d0b6f..f98a9d9084f99 100644 --- a/modules/git/blob_gogit.go +++ b/modules/git/blob_gogit.go @@ -14,7 +14,8 @@ import ( // Blob represents a Git object. 
type Blob struct { - ID SHA1 + ID SHA1 + repo *Repository gogitEncodedObj plumbing.EncodedObject name string diff --git a/modules/git/repo_blob_gogit.go b/modules/git/repo_blob_gogit.go index 7f0892f6f5e91..605c05072b771 100644 --- a/modules/git/repo_blob_gogit.go +++ b/modules/git/repo_blob_gogit.go @@ -17,6 +17,7 @@ func (repo *Repository) getBlob(id SHA1) (*Blob, error) { return &Blob{ ID: id, + repo: repo, gogitEncodedObj: encodedObj, }, nil } diff --git a/modules/git/tree_entry_gogit.go b/modules/git/tree_entry_gogit.go index 194dd12f7dbb1..0c08a766d8229 100644 --- a/modules/git/tree_entry_gogit.go +++ b/modules/git/tree_entry_gogit.go @@ -89,6 +89,7 @@ func (te *TreeEntry) Blob() *Blob { return &Blob{ ID: te.gogitTreeEntry.Hash, + repo: te.ptree.repo, gogitEncodedObj: encodedObj, name: te.Name(), } diff --git a/tests/integration/git_annex_test.go b/tests/integration/git_annex_test.go index 634c3dfa0ecb8..2e1d0eef7df7f 100644 --- a/tests/integration/git_annex_test.go +++ b/tests/integration/git_annex_test.go @@ -19,6 +19,7 @@ import ( "code.gitea.io/gitea/models/db" "code.gitea.io/gitea/models/perm" repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/modules/annex" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/setting" api "code.gitea.io/gitea/modules/structs" @@ -788,13 +789,13 @@ func doAnnexDownloadTest(remoteRepoPath, repoPath string) (err error) { } // verify the file was downloaded - localObjectPath, err := annexObjectPath(repoPath, "large.bin") + localObjectPath, err := contentLocation(repoPath, "large.bin") if err != nil { return err } // localObjectPath := path.Join(repoPath, "large.bin") // or, just compare against the checked-out file - remoteObjectPath, err := annexObjectPath(remoteRepoPath, "large.bin") + remoteObjectPath, err := contentLocation(remoteRepoPath, "large.bin") if err != nil { return err } @@ -841,13 +842,13 @@ func doAnnexUploadTest(remoteRepoPath, repoPath string) (err error) { } // verify the file 
was uploaded - localObjectPath, err := annexObjectPath(repoPath, "contribution.bin") + localObjectPath, err := contentLocation(repoPath, "contribution.bin") if err != nil { return err } // localObjectPath := path.Join(repoPath, "contribution.bin") // or, just compare against the checked-out file - remoteObjectPath, err := annexObjectPath(remoteRepoPath, "contribution.bin") + remoteObjectPath, err := contentLocation(remoteRepoPath, "contribution.bin") if err != nil { return err } @@ -1001,26 +1002,30 @@ Find the path in .git/annex/objects/ of the contents for a given annexed file. TODO: pass a parameter to allow examining non-HEAD branches */ -func annexObjectPath(repoPath, file string) (string, error) { - // NB: `git annex lookupkey` is more reliable, but doesn't work in bare repos. - annexKey, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "show").AddDynamicArguments("HEAD:" + file).RunStdString(&git.RunOpts{Dir: repoPath}) +func contentLocation(repoPath, file string) (path string, err error) { + repo, err := git.OpenRepository(git.DefaultContext, repoPath) + if err != nil { + return "", err + } + defer repo.Close() // release the repository handle on every return path + + // NB: to examine a *branch*, prefix with "refs/heads/", or call repo.GetBranchCommitID(); ditto for tags + commitID, err := repo.GetRefCommitID("HEAD") if err != nil { - return "", fmt.Errorf("in %s: %w", repoPath, err) // the error from git prints the filename but not repo + return "", err } - // There are two formats an annexed file pointer might be: - // * a symlink to .git/annex/objects/$HASHDIR/$ANNEX_KEY/$ANNEX_KEY - used by files created with 'git annex add' - // * a text file containing /annex/objects/$ANNEX_KEY - used by files for which 'git add' was configured to run git-annex-smudge - // This recovers $ANNEX_KEY from either case: - annexKey = path.Base(strings.TrimSpace(annexKey)) + commit, err := repo.GetCommit(commitID) + if err != nil { + return "", err + } - contentPath, _, err := 
git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "contentlocation").AddDynamicArguments(annexKey).RunStdString(&git.RunOpts{Dir: repoPath}) + treeEntry, err := commit.GetTreeEntryByPath(file) if err != nil { - return "", fmt.Errorf("in %s: %s does not seem to be annexed: %w", repoPath, file, err) + return "", err } - contentPath = strings.TrimSpace(contentPath) - return path.Join(repoPath, contentPath), nil + return annex.ContentLocation(treeEntry.Blob()) } /* like withKeyFile(), but automatically sets it the account given in ctx for use by git-annex */