-
Notifications
You must be signed in to change notification settings - Fork 84
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Download files in fetch.txt #119
base: master
Are you sure you want to change the base?
Changes from all commits
dae7b40
aae8cc3
6f93786
ac5ea43
91f5f4d
cacf9d9
6fda25c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,5 +3,4 @@ bench-data | |
build | ||
dist | ||
MANIFEST | ||
bagit.egg-info | ||
.idea |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -16,16 +16,20 @@ | |||||
import tempfile | ||||||
import unicodedata | ||||||
import warnings | ||||||
from fnmatch import fnmatch | ||||||
from collections import defaultdict | ||||||
from datetime import date | ||||||
from functools import partial | ||||||
from os.path import abspath, isdir, isfile, join | ||||||
|
||||||
from pkg_resources import DistributionNotFound, get_distribution | ||||||
|
||||||
try: | ||||||
# pylint: disable=no-name-in-module, import-error, wrong-import-position | ||||||
if sys.version_info >= (3,): | ||||||
from urllib.parse import urlparse | ||||||
except ImportError: | ||||||
from urllib.request import ProxyHandler, Request, build_opener | ||||||
else: | ||||||
from urllib2 import ProxyHandler, Request, build_opener | ||||||
from urlparse import urlparse | ||||||
|
||||||
|
||||||
|
@@ -124,6 +128,7 @@ def find_locale_dir(): | |||||
|
||||||
CHECKSUM_ALGOS = hashlib.algorithms_guaranteed | ||||||
DEFAULT_CHECKSUMS = ["sha256", "sha512"] | ||||||
DEFAULT_FETCH_URL_WHITELIST = ["https://*", "http://*", "ftp://*", "sftp://"] | ||||||
|
||||||
#: Block size used when reading files for hashing: | ||||||
HASH_BLOCK_SIZE = 512 * 1024 | ||||||
|
@@ -137,7 +142,7 @@ def find_locale_dir(): | |||||
|
||||||
|
||||||
def make_bag( | ||||||
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8" | ||||||
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8", fetch_url_whitelist=None | ||||||
): | ||||||
""" | ||||||
Convert a given directory into a bag. You can pass in arbitrary | ||||||
|
@@ -275,7 +280,7 @@ class Bag(object): | |||||
valid_files = ["bagit.txt", "fetch.txt"] | ||||||
valid_directories = ["data"] | ||||||
|
||||||
def __init__(self, path=None): | ||||||
def __init__(self, path=None, fetch_url_whitelist=None): | ||||||
super(Bag, self).__init__() | ||||||
self.tags = {} | ||||||
self.info = {} | ||||||
|
@@ -296,6 +301,7 @@ def __init__(self, path=None): | |||||
self.normalized_manifest_names = {} | ||||||
|
||||||
self.algorithms = [] | ||||||
self.fetch_url_whitelist = DEFAULT_FETCH_URL_WHITELIST if fetch_url_whitelist is None else fetch_url_whitelist | ||||||
self.tag_file_name = None | ||||||
self.path = abspath(path) | ||||||
if path: | ||||||
|
@@ -579,9 +585,50 @@ def files_to_be_fetched(self): | |||||
local filename | ||||||
""" | ||||||
|
||||||
for url, file_size, filename in self.fetch_entries(): | ||||||
for _, _, filename in self.fetch_entries(): | ||||||
yield filename | ||||||
|
||||||
def fetch(self, force=False): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wondering whether we should support |
||||||
""" | ||||||
Fetches files from the fetch.txt | ||||||
|
||||||
Arguments: | ||||||
force (boolean): Fetch files even when they are present in the data directory | ||||||
""" | ||||||
proxy_handler = ProxyHandler() # will default to adhere to *_proxy env vars | ||||||
opener = build_opener(proxy_handler) | ||||||
user_agent = "bagit.py/%s (Python/%s)" % (VERSION, sys.version_info) | ||||||
for url, expected_size, filename in self.fetch_entries(): | ||||||
if not fnmatch_any(url, self.fetch_url_whitelist): | ||||||
raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist)) | ||||||
expected_size = -1 if expected_size == '-' else int(expected_size) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm thinking this would be more idiomatic as:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also wanted to check whether |
||||||
if filename in self.payload_files() and not force: | ||||||
LOGGER.info(_("File already fetched: %s"), filename) | ||||||
continue | ||||||
req = Request(url) | ||||||
req.add_header('User-Agent', user_agent) | ||||||
resp = opener.open(req) | ||||||
headers = resp.info() | ||||||
if expected_size >= 0: | ||||||
if "content-length" not in headers: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would only apply to HTTP requests but the intention of this code is to support other protocols, which suggests that this would need to be a conditional warning since I don't believe the stdlib sets a header like that for other protocols. |
||||||
LOGGER.warning(_("Server sent no content-length for <%s>"), url) | ||||||
else: | ||||||
content_length = int(headers['content-length']) | ||||||
if content_length != expected_size: | ||||||
raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length)) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was wondering whether this should have the URL as well:
Suggested change
|
||||||
with open(join(self.path, filename), 'wb') as out: | ||||||
read = 0 | ||||||
while True: | ||||||
block = resp.read(1024 * 8) | ||||||
if not block: | ||||||
break | ||||||
read += len(block) | ||||||
out.write(block) | ||||||
if expected_size >= 0 and read != expected_size: | ||||||
raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, read)) | ||||||
LOGGER.info(_("Fetched %s from %s"), filename, url) | ||||||
|
||||||
|
||||||
def has_oxum(self): | ||||||
return "Payload-Oxum" in self.info | ||||||
|
||||||
|
@@ -761,14 +808,10 @@ def validate_fetch(self): | |||||
Raises `BagError` for errors and otherwise returns no value | ||||||
""" | ||||||
|
||||||
for url, file_size, filename in self.fetch_entries(): | ||||||
# fetch_entries will raise a BagError for unsafe filenames | ||||||
# so at this point we will check only that the URL is minimally | ||||||
# well formed: | ||||||
parsed_url = urlparse(url) | ||||||
|
||||||
if not all((parsed_url.scheme, parsed_url.netloc)): | ||||||
raise BagError(_("Malformed URL in fetch.txt: %s") % url) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should still have this validation step since it reports some malformed URLs earlier. This would also allow us to remove both of the |
||||||
for url, expected_size, filename in self.fetch_entries(): | ||||||
# ensure url matches one of the allowed patterns | ||||||
if not fnmatch_any(url, self.fetch_url_whitelist): | ||||||
raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist)) | ||||||
|
||||||
def _validate_contents(self, processes=1, fast=False, completeness_only=False): | ||||||
if fast and not self.has_oxum(): | ||||||
|
@@ -1411,6 +1454,12 @@ def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS): | |||||
|
||||||
return results | ||||||
|
||||||
# Return true if any of the pattern fnmatches a string | ||||||
def fnmatch_any(s, pats): | ||||||
for pat in pats: | ||||||
if fnmatch(s, pat): | ||||||
return True | ||||||
return False | ||||||
|
||||||
def _encode_filename(s): | ||||||
s = s.replace("\r", "%0D") | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,7 @@ msgid "" | |
msgstr "" | ||
"Project-Id-Version: PACKAGE VERSION\n" | ||
"Report-Msgid-Bugs-To: \n" | ||
"POT-Creation-Date: 2018-06-26 10:28-0400\n" | ||
"POT-Creation-Date: 2018-12-03 12:06+0100\n" | ||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" | ||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" | ||
"Language-Team: LANGUAGE <[email protected]>\n" | ||
|
@@ -168,6 +168,30 @@ msgstr "" | |
msgid "Path \"%(payload_file)s\" in \"%(source_file)s\" is unsafe" | ||
msgstr "" | ||
|
||
#, python-format | ||
msgid "Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s" | ||
msgstr "" | ||
|
||
#, python-format | ||
msgid "File already fetched: %s" | ||
msgstr "" | ||
|
||
#, python-format | ||
msgid "Server sent no content-length for <%s>" | ||
msgstr "" | ||
|
||
#, python-format | ||
msgid "Inconsistent size of %s: Expected %s but Content-Length is %s" | ||
msgstr "" | ||
|
||
#, python-format | ||
msgid "Inconsistent size of %s: Expected %s but received %s" | ||
msgstr "" | ||
|
||
#, python-format | ||
msgid "Fetched %s from %s" | ||
msgstr "" | ||
|
||
#, python-format | ||
msgid "" | ||
"%s is encoded using UTF-8 but contains an unnecessary byte-order mark, which " | ||
|
@@ -205,10 +229,6 @@ msgstr "" | |
msgid "Expected %s to contain \"bagit.txt\"" | ||
msgstr "" | ||
|
||
#, python-format | ||
msgid "Malformed URL in fetch.txt: %s" | ||
msgstr "" | ||
|
||
msgid "Fast validation requires bag-info.txt to include Payload-Oxum" | ||
msgstr "" | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could be simplified: