Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download files in fetch.txt #119

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@ bench-data
build
dist
MANIFEST
bagit.egg-info
.idea
75 changes: 62 additions & 13 deletions bagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,20 @@
import tempfile
import unicodedata
import warnings
from fnmatch import fnmatch
from collections import defaultdict
from datetime import date
from functools import partial
from os.path import abspath, isdir, isfile, join

from pkg_resources import DistributionNotFound, get_distribution

try:
# pylint: disable=no-name-in-module, import-error, wrong-import-position
if sys.version_info >= (3,):
from urllib.parse import urlparse
except ImportError:
from urllib.request import ProxyHandler, Request, build_opener
else:
from urllib2 import ProxyHandler, Request, build_opener
from urlparse import urlparse


Expand Down Expand Up @@ -124,6 +128,7 @@ def find_locale_dir():

CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
DEFAULT_CHECKSUMS = ["sha256", "sha512"]
DEFAULT_FETCH_URL_WHITELIST = ["https://*", "http://*", "ftp://*", "sftp://"]

#: Block size used when reading files for hashing:
HASH_BLOCK_SIZE = 512 * 1024
Expand All @@ -137,7 +142,7 @@ def find_locale_dir():


def make_bag(
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8", fetch_url_whitelist=None
):
"""
Convert a given directory into a bag. You can pass in arbitrary
Expand Down Expand Up @@ -275,7 +280,7 @@ class Bag(object):
valid_files = ["bagit.txt", "fetch.txt"]
valid_directories = ["data"]

def __init__(self, path=None):
def __init__(self, path=None, fetch_url_whitelist=None):
super(Bag, self).__init__()
self.tags = {}
self.info = {}
Expand All @@ -296,6 +301,7 @@ def __init__(self, path=None):
self.normalized_manifest_names = {}

self.algorithms = []
self.fetch_url_whitelist = DEFAULT_FETCH_URL_WHITELIST if fetch_url_whitelist is None else fetch_url_whitelist
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be simplified:

Suggested change
self.fetch_url_whitelist = DEFAULT_FETCH_URL_WHITELIST if fetch_url_whitelist is None else fetch_url_whitelist
self.fetch_url_whitelist = fetch_url_whitelist or DEFAULT_FETCH_URL_WHITELIST

self.tag_file_name = None
self.path = abspath(path)
if path:
Expand Down Expand Up @@ -579,9 +585,50 @@ def files_to_be_fetched(self):
local filename
"""

for url, file_size, filename in self.fetch_entries():
for _, _, filename in self.fetch_entries():
yield filename

def fetch(self, force=False):
    """
    Download the files listed in fetch.txt into the bag's payload.

    Every URL must match at least one pattern in self.fetch_url_whitelist;
    anything else raises BagError before any network access happens.  Files
    already present in the payload are skipped unless *force* is true.

    Arguments:
        force (boolean): fetch files even when they are present in the
            data directory

    Raises:
        BagError: for non-whitelisted URLs, or when the size recorded in
            fetch.txt disagrees with the Content-Length header or with the
            number of bytes actually received.
    """
    # A ProxyHandler built with no arguments honours the *_proxy
    # environment variables.
    proxy_handler = ProxyHandler()
    opener = build_opener(proxy_handler)
    user_agent = "bagit.py/%s (Python/%s)" % (VERSION, sys.version_info)
    for url, expected_size, filename in self.fetch_entries():
        if not fnmatch_any(url, self.fetch_url_whitelist):
            raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist))
        # fetch.txt records "-" when the file size is unknown.
        expected_size = None if expected_size == '-' else int(expected_size)
        if filename in self.payload_files() and not force:
            LOGGER.info(_("File already fetched: %s"), filename)
            continue
        req = Request(url)
        req.add_header('User-Agent', user_agent)
        resp = opener.open(req)
        try:
            headers = resp.info()
            if expected_size is not None:
                # Only HTTP(S) responses carry Content-Length; for other
                # protocols a missing header is just worth a warning.
                if "content-length" not in headers:
                    LOGGER.warning(_("Server sent no content-length for <%s>"), url)
                else:
                    content_length = int(headers['content-length'])
                    if content_length != expected_size:
                        raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length))
            # NOTE(review): fetch_entries() is assumed to reject unsafe
            # filenames, so joining onto self.path stays inside the bag —
            # confirm before relying on this for untrusted fetch.txt files.
            with open(join(self.path, filename), 'wb') as out:
                read = 0
                while True:
                    block = resp.read(1024 * 8)
                    if not block:
                        break
                    read += len(block)
                    out.write(block)
        finally:
            # The opener's response is not a context manager on Python 2,
            # so close it explicitly to avoid leaking the connection.
            resp.close()
        if expected_size is not None and read != expected_size:
            raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, read))
        LOGGER.info(_("Fetched %s from %s"), filename, url)


def has_oxum(self):
return "Payload-Oxum" in self.info

Expand Down Expand Up @@ -761,14 +808,10 @@ def validate_fetch(self):
Raises `BagError` for errors and otherwise returns no value
"""

for url, file_size, filename in self.fetch_entries():
# fetch_entries will raise a BagError for unsafe filenames
# so at this point we will check only that the URL is minimally
# well formed:
parsed_url = urlparse(url)

if not all((parsed_url.scheme, parsed_url.netloc)):
raise BagError(_("Malformed URL in fetch.txt: %s") % url)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should still have this validation step since it reports some malformed URLs earlier. This would also allow us to remove both of the fnmatch checks by changing the whitelist to be a list of URL schemes and simply adding a second if parsed_url.scheme not in self.fetch_url_schema_whitelist: … check here.

for url, expected_size, filename in self.fetch_entries():
# ensure url matches one of the allowed patterns
if not fnmatch_any(url, self.fetch_url_whitelist):
raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist))

def _validate_contents(self, processes=1, fast=False, completeness_only=False):
if fast and not self.has_oxum():
Expand Down Expand Up @@ -1411,6 +1454,12 @@ def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):

return results

def fnmatch_any(s, pats):
    """Return True if *s* matches at least one of the fnmatch patterns in *pats*."""
    return any(fnmatch(s, pat) for pat in pats)

def _encode_filename(s):
s = s.replace("\r", "%0D")
Expand Down
30 changes: 25 additions & 5 deletions locale/bagit-python.pot
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2018-06-26 10:28-0400\n"
"POT-Creation-Date: 2018-12-03 12:06+0100\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <[email protected]>\n"
Expand Down Expand Up @@ -168,6 +168,30 @@ msgstr ""
msgid "Path \"%(payload_file)s\" in \"%(source_file)s\" is unsafe"
msgstr ""

#, python-format
msgid "Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s"
msgstr ""

#, python-format
msgid "File already fetched: %s"
msgstr ""

#, python-format
msgid "Server sent no content-length for <%s>"
msgstr ""

#, python-format
msgid "Inconsistent size of %s: Expected %s but Content-Length is %s"
msgstr ""

#, python-format
msgid "Inconsistent size of %s: Expected %s but received %s"
msgstr ""

#, python-format
msgid "Fetched %s from %s"
msgstr ""

#, python-format
msgid ""
"%s is encoded using UTF-8 but contains an unnecessary byte-order mark, which "
Expand Down Expand Up @@ -205,10 +229,6 @@ msgstr ""
msgid "Expected %s to contain \"bagit.txt\""
msgstr ""

#, python-format
msgid "Malformed URL in fetch.txt: %s"
msgstr ""

msgid "Fast validation requires bag-info.txt to include Payload-Oxum"
msgstr ""

Expand Down
88 changes: 73 additions & 15 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1081,24 +1081,82 @@ def test_fetch_unsafe_payloads(self):

self.assertEqual(expected_msg, str(cm.exception))

def test_fetch_malformed_url(self):
with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
print(
"//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg - data/nasa/PIA21390.jpg",
file=fetch_txt,
)

self.bag.save(manifests=True)

expected_msg = (
"Malformed URL in fetch.txt: //photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg"
)

with self.assertRaises(bagit.BagError) as cm:
def test_invalid_urls(self):
    """Malformed or non-whitelisted URLs in fetch.txt must fail validate_fetch()."""
    import re

    invalid_urls = [
        # Scheme-relative URL (no scheme at all):
        '//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg',
        # Local file access must not be allowed by the default whitelist:
        'file://%s' % j(self.tmpdir, "mock_data"),
        # Path traversal attempt:
        '../../../../../etc/passwd',
    ]
    for url in invalid_urls:
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print("%s - data/mock_data" % url, file=fetch_txt)
        # re.escape: the URL (including the tmpdir path) may contain regex
        # metacharacters, which would otherwise weaken or break the match.
        with self.assertRaisesRegexp(bagit.BagError, "^Malformed URL in fetch.txt: %s" % re.escape(url)):
            self.bag.validate_fetch()

def test_invalid_urls_whitelist(self):
    """Only URLs matching the bag's fetch_url_whitelist patterns pass validate_fetch()."""
    import re

    self.bag.fetch_url_whitelist = [
        'https://my.inst.edu/data/*.png',
    ]
    valid_urls = [
        'https://my.inst.edu/data/foo.png',
    ]
    invalid_urls = [
        # Note: each entry needs its own trailing comma — a missing comma
        # would silently concatenate adjacent string literals and drop a case.
        'https://my.inst.edu/data/foo',
        'https://my.inst.edu/robots.txt',
        'http://my.inst.edu/data/foo',
        'https://example.org',
    ]
    for url in invalid_urls:
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print("%s - data/mock_data" % url, file=fetch_txt)
        # re.escape: URLs contain regex metacharacters ("." etc.).
        with self.assertRaisesRegexp(bagit.BagError, "^Malformed URL in fetch.txt: %s" % re.escape(url)):
            self.bag.validate_fetch()
    for url in valid_urls:
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print("%s - data/mock_data" % url, file=fetch_txt)
        self.bag.validate_fetch()

self.assertEqual(expected_msg, str(cm.exception))
def test_fetching_payload_file(self):
    """Deleting a fetch.txt-listed payload invalidates the bag; fetch() restores it."""
    payload = 'loc/2478433644_2839c5e8b8_o_d.jpg'
    fetch_line = "https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % (
        payload, 139367, payload)
    with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
        print(fetch_line, file=fetch_txt)
    self.bag.save(manifests=True)
    # A freshly saved bag is valid.
    self.bag.validate()
    # Removing the payload file leaves the bag incomplete ...
    os.unlink(j(self.tmpdir, "data", payload))
    self.assertEqual(len(self.bag.compare_fetch_with_fs()), 1, '1 file to fetch')
    with self.assertRaises(bagit.BagError):
        self.bag.validate()
    # ... and fetch() downloads it again, restoring validity.
    self.bag.fetch()
    self.bag.validate()
    self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete')

def test_force_fetching(self):
    """fetch(force=True) re-downloads files that exist locally but are corrupt."""
    payload = 'loc/2478433644_2839c5e8b8_o_d.jpg'
    fetch_line = "https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % (
        payload, 139367, payload)
    with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
        print(fetch_line, file=fetch_txt)
    self.bag.save(manifests=True)
    # Truncate the payload file so the Payload-Oxum no longer matches.
    with open(j(self.tmpdir, "data", payload), 'w') as payload_file:
        payload_file.write('')
    with self.assertRaisesRegexp(bagit.BagError, "^Payload-Oxum validation failed."):
        self.bag.validate()
    # A plain fetch() skips files that are already present ...
    self.bag.fetch()
    with self.assertRaisesRegexp(bagit.BagError, "^Payload-Oxum validation failed."):
        self.bag.validate()
    # ... whereas force=True overwrites them.
    self.bag.fetch(force=True)
    self.bag.validate()
    self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete')

class TestUtils(unittest.TestCase):
def setUp(self):
Expand Down