Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download files in fetch.txt #119

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@ bench-data
build
dist
MANIFEST
bagit.egg-info
.idea
75 changes: 62 additions & 13 deletions bagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,20 @@
import tempfile
import unicodedata
import warnings
from fnmatch import fnmatch
from collections import defaultdict
from datetime import date
from functools import partial
from os.path import abspath, isdir, isfile, join

from pkg_resources import DistributionNotFound, get_distribution

try:
# pylint: disable=no-name-in-module, import-error, wrong-import-position
if sys.version_info >= (3,):
from urllib.parse import urlparse
except ImportError:
from urllib.request import ProxyHandler, Request, build_opener
else:
from urllib2 import ProxyHandler, Request, build_opener
from urlparse import urlparse


Expand Down Expand Up @@ -124,6 +128,7 @@ def find_locale_dir():

CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
DEFAULT_CHECKSUMS = ["sha256", "sha512"]
DEFAULT_FETCH_URL_WHITELIST = ["https://*", "http://*", "ftp://*", "sftp://"]

#: Block size used when reading files for hashing:
HASH_BLOCK_SIZE = 512 * 1024
Expand All @@ -137,7 +142,7 @@ def find_locale_dir():


def make_bag(
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8", fetch_url_whitelist=None
):
"""
Convert a given directory into a bag. You can pass in arbitrary
Expand Down Expand Up @@ -275,7 +280,7 @@ class Bag(object):
valid_files = ["bagit.txt", "fetch.txt"]
valid_directories = ["data"]

def __init__(self, path=None):
def __init__(self, path=None, fetch_url_whitelist=None):
super(Bag, self).__init__()
self.tags = {}
self.info = {}
Expand All @@ -296,6 +301,7 @@ def __init__(self, path=None):
self.normalized_manifest_names = {}

self.algorithms = []
self.fetch_url_whitelist = DEFAULT_FETCH_URL_WHITELIST if fetch_url_whitelist is None else fetch_url_whitelist
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be simplified:

Suggested change
self.fetch_url_whitelist = DEFAULT_FETCH_URL_WHITELIST if fetch_url_whitelist is None else fetch_url_whitelist
self.fetch_url_whitelist = fetch_url_whitelist or DEFAULT_FETCH_URL_WHITELIST

self.tag_file_name = None
self.path = abspath(path)
if path:
Expand Down Expand Up @@ -579,9 +585,50 @@ def files_to_be_fetched(self):
local filename
"""

for url, file_size, filename in self.fetch_entries():
for _, _, filename in self.fetch_entries():
yield filename

def fetch(self, force=False):
    """
    Download the files listed in fetch.txt into the bag's payload.

    Every URL must match at least one pattern in self.fetch_url_whitelist;
    anything else raises BagError before any network access happens.  Files
    already present in the payload are skipped unless *force* is true.

    Arguments:
        force (boolean): fetch files even when they are present in the
            data directory

    Raises:
        BagError: for non-whitelisted URLs, or when the size recorded in
            fetch.txt disagrees with the Content-Length header or with the
            number of bytes actually received.
    """
    # A ProxyHandler built with no arguments honours the *_proxy
    # environment variables.
    proxy_handler = ProxyHandler()
    opener = build_opener(proxy_handler)
    user_agent = "bagit.py/%s (Python/%s)" % (VERSION, sys.version_info)
    for url, expected_size, filename in self.fetch_entries():
        if not fnmatch_any(url, self.fetch_url_whitelist):
            raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist))
        # fetch.txt records "-" when the file size is unknown.
        expected_size = None if expected_size == '-' else int(expected_size)
        if filename in self.payload_files() and not force:
            LOGGER.info(_("File already fetched: %s"), filename)
            continue
        req = Request(url)
        req.add_header('User-Agent', user_agent)
        resp = opener.open(req)
        try:
            headers = resp.info()
            if expected_size is not None:
                # Only HTTP(S) responses carry Content-Length; for other
                # protocols a missing header is just worth a warning.
                if "content-length" not in headers:
                    LOGGER.warning(_("Server sent no content-length for <%s>"), url)
                else:
                    content_length = int(headers['content-length'])
                    if content_length != expected_size:
                        raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length))
            # NOTE(review): fetch_entries() is assumed to reject unsafe
            # filenames, so joining onto self.path stays inside the bag —
            # confirm before relying on this for untrusted fetch.txt files.
            with open(join(self.path, filename), 'wb') as out:
                read = 0
                while True:
                    block = resp.read(1024 * 8)
                    if not block:
                        break
                    read += len(block)
                    out.write(block)
        finally:
            # The opener's response is not a context manager on Python 2,
            # so close it explicitly to avoid leaking the connection.
            resp.close()
        if expected_size is not None and read != expected_size:
            raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, read))
        LOGGER.info(_("Fetched %s from %s"), filename, url)


def has_oxum(self):
return "Payload-Oxum" in self.info

Expand Down Expand Up @@ -761,14 +808,10 @@ def validate_fetch(self):
Raises `BagError` for errors and otherwise returns no value
"""

for url, file_size, filename in self.fetch_entries():
# fetch_entries will raise a BagError for unsafe filenames
# so at this point we will check only that the URL is minimally
# well formed:
parsed_url = urlparse(url)

if not all((parsed_url.scheme, parsed_url.netloc)):
raise BagError(_("Malformed URL in fetch.txt: %s") % url)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should still have this validation step since it reports some malformed URLs earlier. This would also allow us to remove both of the fnmatch checks by changing the whitelist to be a list of URL schemes and simply adding a second if parsed_url.scheme not in self.fetch_url_schema_whitelist: … check here.

for url, expected_size, filename in self.fetch_entries():
# ensure url matches one of the allowed patterns
if not fnmatch_any(url, self.fetch_url_whitelist):
raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist))

def _validate_contents(self, processes=1, fast=False, completeness_only=False):
if fast and not self.has_oxum():
Expand Down Expand Up @@ -1411,6 +1454,12 @@ def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):

return results

def fnmatch_any(s, pats):
    """Return True if *s* matches at least one of the fnmatch patterns in *pats*."""
    return any(fnmatch(s, pat) for pat in pats)

def _encode_filename(s):
s = s.replace("\r", "%0D")
Expand Down
30 changes: 25 additions & 5 deletions locale/bagit-python.pot
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2018-06-26 10:28-0400\n"
"POT-Creation-Date: 2018-12-03 12:06+0100\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <[email protected]>\n"
Expand Down Expand Up @@ -168,6 +168,30 @@ msgstr ""
msgid "Path \"%(payload_file)s\" in \"%(source_file)s\" is unsafe"
msgstr ""

#, python-format
msgid "Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s"
msgstr ""

#, python-format
msgid "File already fetched: %s"
msgstr ""

#, python-format
msgid "Server sent no content-length for <%s>"
msgstr ""

#, python-format
msgid "Inconsistent size of %s: Expected %s but Content-Length is %s"
msgstr ""

#, python-format
msgid "Inconsistent size of %s: Expected %s but received %s"
msgstr ""

#, python-format
msgid "Fetched %s from %s"
msgstr ""

#, python-format
msgid ""
"%s is encoded using UTF-8 but contains an unnecessary byte-order mark, which "
Expand Down Expand Up @@ -205,10 +229,6 @@ msgstr ""
msgid "Expected %s to contain \"bagit.txt\""
msgstr ""

#, python-format
msgid "Malformed URL in fetch.txt: %s"
msgstr ""

msgid "Fast validation requires bag-info.txt to include Payload-Oxum"
msgstr ""

Expand Down
88 changes: 73 additions & 15 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1081,24 +1081,82 @@ def test_fetch_unsafe_payloads(self):

self.assertEqual(expected_msg, str(cm.exception))

def test_fetch_malformed_url(self):
with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
print(
"//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg - data/nasa/PIA21390.jpg",
file=fetch_txt,
)

self.bag.save(manifests=True)

expected_msg = (
"Malformed URL in fetch.txt: //photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg"
)

with self.assertRaises(bagit.BagError) as cm:
def test_invalid_urls(self):
    """Malformed or non-whitelisted URLs in fetch.txt must fail validate_fetch()."""
    import re

    invalid_urls = [
        # Scheme-relative URL (no scheme at all):
        '//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg',
        # Local file access must not be allowed by the default whitelist:
        'file://%s' % j(self.tmpdir, "mock_data"),
        # Path traversal attempt:
        '../../../../../etc/passwd',
    ]
    for url in invalid_urls:
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print("%s - data/mock_data" % url, file=fetch_txt)
        # re.escape: the URL (including the tmpdir path) may contain regex
        # metacharacters, which would otherwise weaken or break the match.
        with self.assertRaisesRegexp(bagit.BagError, "^Malformed URL in fetch.txt: %s" % re.escape(url)):
            self.bag.validate_fetch()

def test_invalid_urls_whitelist(self):
    """Only URLs matching the bag's fetch_url_whitelist patterns pass validate_fetch()."""
    import re

    self.bag.fetch_url_whitelist = [
        'https://my.inst.edu/data/*.png',
    ]
    valid_urls = [
        'https://my.inst.edu/data/foo.png',
    ]
    invalid_urls = [
        # Note: each entry needs its own trailing comma — a missing comma
        # would silently concatenate adjacent string literals and drop a case.
        'https://my.inst.edu/data/foo',
        'https://my.inst.edu/robots.txt',
        'http://my.inst.edu/data/foo',
        'https://example.org',
    ]
    for url in invalid_urls:
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print("%s - data/mock_data" % url, file=fetch_txt)
        # re.escape: URLs contain regex metacharacters ("." etc.).
        with self.assertRaisesRegexp(bagit.BagError, "^Malformed URL in fetch.txt: %s" % re.escape(url)):
            self.bag.validate_fetch()
    for url in valid_urls:
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print("%s - data/mock_data" % url, file=fetch_txt)
        self.bag.validate_fetch()

self.assertEqual(expected_msg, str(cm.exception))
def test_fetching_payload_file(self):
    """Deleting a fetch.txt-listed payload invalidates the bag; fetch() restores it."""
    payload = 'loc/2478433644_2839c5e8b8_o_d.jpg'
    fetch_line = "https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % (
        payload, 139367, payload)
    with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
        print(fetch_line, file=fetch_txt)
    self.bag.save(manifests=True)
    # A freshly saved bag is valid.
    self.bag.validate()
    # Removing the payload file leaves the bag incomplete ...
    os.unlink(j(self.tmpdir, "data", payload))
    self.assertEqual(len(self.bag.compare_fetch_with_fs()), 1, '1 file to fetch')
    with self.assertRaises(bagit.BagError):
        self.bag.validate()
    # ... and fetch() downloads it again, restoring validity.
    self.bag.fetch()
    self.bag.validate()
    self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete')

def test_force_fetching(self):
    """fetch(force=True) re-downloads files that exist locally but are corrupt."""
    payload = 'loc/2478433644_2839c5e8b8_o_d.jpg'
    fetch_line = "https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % (
        payload, 139367, payload)
    with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
        print(fetch_line, file=fetch_txt)
    self.bag.save(manifests=True)
    # Truncate the payload file so the Payload-Oxum no longer matches.
    with open(j(self.tmpdir, "data", payload), 'w') as payload_file:
        payload_file.write('')
    with self.assertRaisesRegexp(bagit.BagError, "^Payload-Oxum validation failed."):
        self.bag.validate()
    # A plain fetch() skips files that are already present ...
    self.bag.fetch()
    with self.assertRaisesRegexp(bagit.BagError, "^Payload-Oxum validation failed."):
        self.bag.validate()
    # ... whereas force=True overwrites them.
    self.bag.fetch(force=True)
    self.bag.validate()
    self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete')

class TestUtils(unittest.TestCase):
def setUp(self):
Expand Down