diff --git a/internal/tool/file.go b/internal/tool/file.go index 6d1e3ad7c..f62df7ae0 100644 --- a/internal/tool/file.go +++ b/internal/tool/file.go @@ -8,6 +8,7 @@ import ( "fmt" "math" "net/http" + "regexp" "strings" ) @@ -27,16 +28,80 @@ func IsTextFile(data []byte) bool { return strings.Contains(http.DetectContentType(data), "text/") } +/* +* An annex object can be checked into git in 2 ways: +* 1. As a "pointer file" (structure described here: https://git-annex.branchable.com/internals/pointer_file/ ) +* 2. As a symbolic link pointing to a file in the git-annex directory (located in the .git dir at the base of the repository). + */ + +//A pointer file starts with "/annex/objects/", which is followed by the key +var RE_ANNEXPOINTERFILE = regexp.MustCompile(`^(/annex/objects/([A-Z][\-\._0-9A-Za-z]+)(?:\n|\r|\z))`) + +//The symbolic target is a relative path pointing to a file under the .git/annex/objects/ dir +var RE_SYMLINKPOINTATANNEX = regexp.MustCompile(`^(?:\.\./)*.git/annex/objects/.+`) + func IsAnnexedFile(data []byte) bool { - const ANNEXSNIFFSIZE = 5000 - if !(len(data) < ANNEXSNIFFSIZE) { - data = data[:ANNEXSNIFFSIZE] + + const ANNEXPOINTERFILE_MAXSIZE = 32 * 1024 + const ANNEXSNIFFSIZE = 512 + + var dataLen = len(data) + + //The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file. + //The maximum size of a symlink target is SYMLINK_MAX (which is filesystem dependent) but typically way smaller than 32kb. + if dataLen > ANNEXPOINTERFILE_MAXSIZE { + return false + } + + var sniffData []byte + if !(dataLen < ANNEXSNIFFSIZE) { + sniffData = data[:ANNEXSNIFFSIZE] + } else { + sniffData = data } - if strings.Contains(http.DetectContentType(data), "text/") { - return strings.Contains(string(data), "/annex/objects") + + //Annex pointer file/symlink target content is text type + if strings.Contains(http.DetectContentType(sniffData), "text/") { + + sniffStr := string(sniffData) + //Check if it's a symbolic link pointing to git-annex subdir + matchSymlinkTarget := RE_SYMLINKPOINTATANNEX.FindStringSubmatch(sniffStr) + + if len(matchSymlinkTarget) > 0 { + return true + } else { + //Check if it's a valid pointer file + + matchAnnexPointer := RE_ANNEXPOINTERFILE.FindStringSubmatch(sniffStr) + + if len(matchAnnexPointer) > 0 { + //var annexKey = matchAnnexPointer[2] + + //git-annex does support pointer files with additional text on subsequent lines. + var hasAdditionalText = len(sniffData) > len(matchAnnexPointer[1]) || dataLen > ANNEXSNIFFSIZE + + if hasAdditionalText { + //every such subsequent line must contain "/annex/" somewhere in it, and end with a newline. + var extraLines = strings.SplitAfter(string(data), "\n")[1:] + + if extraLines[len(extraLines)-1] != "" { + //if last line isn't empty, it means it was missing required newline character + return false + } else { + for _, line := range extraLines[0 : len(extraLines)-1] { + if !strings.Contains(line, "/annex/") { + return false + } + } + } + } + return true + } + } } return false } + func IsImageFile(data []byte) bool { return strings.Contains(http.DetectContentType(data), "image/") } diff --git a/internal/tool/file_test.go b/internal/tool/file_test.go new file mode 100644 index 000000000..1a2d3dacb --- /dev/null +++ b/internal/tool/file_test.go @@ -0,0 +1,64 @@ +package tool + +import ( + "strings" + "testing" + + . "github.com/smartystreets/goconvey/convey" +) + +func Test_IsValidAnnexPointerFile(t *testing.T) { + Convey("Check if a (file) content is a valid annex pointer file", t, func() { + testCases := []struct { + expect bool + content string + }{ + // valid key and EOF + {true, "/annex/objects/MD0-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c"}, + {true, "/annex/objects/SHA256E-s31390--f50d7ac4c6b9031379986bc362fcefb65f1e52621ce1708d537e740fefc59cc0.mp3"}, + {true, "/annex/objects/MD5E-s33142576--02b5f38377b5d268384633b3f1154d4e.nii.gz"}, + + // not a key pattern + {false, "foo/bar"}, + + // key pattern doesn't start at the beginning of content + {false, " /annex/objects/MD1-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c"}, + // key contain invalid character + {false, "/annex/objects/M+D2-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c"}, + // newline after key (and no more content) + {true, "/annex/objects/MD3-f4d0aaf2b2ac-7a4cf00fbae9158a1b7c\n"}, + // key can contains underscore (depending on backend) + {true, "/annex/objects/SHA4_384-232439cf00fbae9158a1b7c"}, + + // empty additional line + {false, "/annex/objects/MD5-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n\n"}, + + // valid additional line + {true, "/annex/objects/MD6-f4d0aaf2ba4cf00fbae9158a1b7c\n/annex/\n"}, + // empty additional line + {false, "/annex/objects/MD7-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n/annex/\n\n"}, + // additional line not terminated by new line + {false, "/annex/objects/MD8-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n/annex/"}, + + // valid additional lines + {true, "/annex/objects/MD9-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\r /annex/\n /annex/\n/annex/ \n"}, + // many valid additional lines, within the 32kb max file size + {true, "/annex/objects/MD10-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n" + strings.Repeat("/annex/89\n", 31*1024/10)}, + // many valid additional lines, over the 32kb max file size + {false, "/annex/objects/MD11-s232439--f4d0aaf2b2ac7a4cf00fbae9158a1b7c\n" + strings.Repeat("/annex/89\n", 32*1024/10)}, + + // valid symlink target + {true, ".git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, + // invalid symlink target + {false, "git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, + {false, ".git/annex/objects/"}, + // valid symlink target for files in sub-directory + {true, "../.git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, + {true, "../../.git/annex/objects/Z9/qQ/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png/MD5E-s886791--49e415b10841cacff2d8fb8456ca1e67.png"}, + } + + for _, tc := range testCases { + So(IsAnnexedFile([]byte(tc.content)), ShouldEqual, tc.expect) + } + }) +}