Skip to content
This repository has been archived by the owner on Sep 5, 2024. It is now read-only.

Commit

Permalink
Merge pull request #31 from Sarunas-Girdenas/master
Browse files Browse the repository at this point in the history
Update download.py to handle error when decoding downloaded data
  • Loading branch information
michaelharms authored Jul 26, 2020
2 parents 1519272 + ddd763e commit a5f3db0
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 1 deletion.
6 changes: 5 additions & 1 deletion comcrawl/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,11 @@ def download_single_result(result: Result) -> Result:
unzipped_file = gzip.GzipFile(fileobj=zipped_file)

raw_data: bytes = unzipped_file.read()
data: str = raw_data.decode("utf-8")
try:
data: str = raw_data.decode("utf-8")
except UnicodeDecodeError:
print(f"Warning: Could not extract file downloaded from {url}")
data = ''

result["html"] = ""

Expand Down
21 changes: 21 additions & 0 deletions tests/comcrawl/utils/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,22 @@
'status': '301',
'mime': 'text/html'}

# Index record for a JPEG resource (mime "image/jpeg"), passed to
# download_single_result by the error-handling test in this module.
# NOTE(review): presumably chosen because its payload is not valid UTF-8
# text - confirm against the live crawl data.
KNOWN_RESULT_NO_HTML_ERROR_HANDLING = {
    "urlkey": (
        "com,publicstorage)/blog/seasonal/-/media/website/"
        "blog/photos/2014/01/red-fabric-christmas-ornament-storage-box.ashx"
    ),
    "timestamp": "20200531041432",
    "digest": "4R4CS4CNOPMA7H6ITWLHTTCSIXQMMYZ3",
    "redirect": "",
    "mime-detected": "image/jpeg",
    "offset": "886432941",
    "length": "37721",
    "filename": (
        "crawl-data/CC-MAIN-2020-24/segments/1590347410745.37/"
        "warc/CC-MAIN-20200531023023-20200531053023-00208.warc.gz"
    ),
    "url": (
        "https://www.publicstorage.com/blog/seasonal/-/media/"
        "Website/Blog/Photos/2014/01/"
        "red-fabric-christmas-ornament-storage-box.ashx"
    ),
    "status": "200",
    "mime": "image/jpeg",
}


def test_download_single_result(snapshot):
result = download_single_result(KNOWN_RESULT)
Expand All @@ -42,6 +58,11 @@ def test_download_single_result_without_html():
assert result["html"] == ""


def test_download_single_result_without_html_error_handling():
    """Check that the JPEG index record yields an empty "html" field."""
    downloaded = download_single_result(KNOWN_RESULT_NO_HTML_ERROR_HANDLING)
    html = downloaded["html"]
    assert html == ""


KNOWN_RESULTS = [{'charset': 'UTF-8',
'digest': '745JGUNVPWB4L3TWJIGUQRQFTFSREJ5J',
'filename': ('crawl-data/CC-MAIN-2019-51/segments/1575540500637.40/'
Expand Down

0 comments on commit a5f3db0

Please sign in to comment.