From 3ceb2c95a68589578899ede9534fbd37d1f5df7a Mon Sep 17 00:00:00 2001 From: Sarunas Date: Wed, 15 Jul 2020 16:24:19 +0100 Subject: [PATCH 1/2] Update download.py --- comcrawl/utils/download.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/comcrawl/utils/download.py b/comcrawl/utils/download.py index 6d05774..623f20e 100644 --- a/comcrawl/utils/download.py +++ b/comcrawl/utils/download.py @@ -40,7 +40,11 @@ def download_single_result(result: Result) -> Result: unzipped_file = gzip.GzipFile(fileobj=zipped_file) raw_data: bytes = unzipped_file.read() - data: str = raw_data.decode("utf-8") + try: + data: str = raw_data.decode("utf-8") + except UnicodeDecodeError: + print(f"Warning: Could not extract file downloaded from {url}") + data = '' result["html"] = "" From ddd763e605e5af141fb5f674be3874b93abac5de Mon Sep 17 00:00:00 2001 From: Sarunas Date: Mon, 20 Jul 2020 21:21:46 +0100 Subject: [PATCH 2/2] Update test_download.py --- tests/comcrawl/utils/test_download.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/comcrawl/utils/test_download.py b/tests/comcrawl/utils/test_download.py index fec2a2c..7098502 100644 --- a/tests/comcrawl/utils/test_download.py +++ b/tests/comcrawl/utils/test_download.py @@ -31,6 +31,22 @@ 'status': '301', 'mime': 'text/html'} +KNOWN_RESULT_NO_HTML_ERROR_HANDLING = { + 'urlkey': ('com,publicstorage)/blog/seasonal/-/media/website/' + 'blog/photos/2014/01/red-fabric-christmas-ornament-storage-box.ashx'), + 'timestamp': '20200531041432', + 'digest': '4R4CS4CNOPMA7H6ITWLHTTCSIXQMMYZ3', + 'redirect': '', + 'mime-detected': 'image/jpeg', + 'offset': '886432941', + 'length': '37721', + 'filename': ('crawl-data/CC-MAIN-2020-24/segments/1590347410745.37/' + 'warc/CC-MAIN-20200531023023-20200531053023-00208.warc.gz'), + 'url': ('https://www.publicstorage.com/blog/seasonal/-/media/' + 'Website/Blog/Photos/2014/01/red-fabric-christmas-ornament-storage-box.ashx'), + 'status': '200', + 'mime': 'image/jpeg'} + def test_download_single_result(snapshot): result = download_single_result(KNOWN_RESULT) @@ -42,6 +58,11 @@ def test_download_single_result_without_html(): assert result["html"] == "" +def test_download_single_result_without_html_error_handling(): + result = download_single_result(KNOWN_RESULT_NO_HTML_ERROR_HANDLING) + assert result["html"] == "" + + KNOWN_RESULTS = [{'charset': 'UTF-8', 'digest': '745JGUNVPWB4L3TWJIGUQRQFTFSREJ5J', 'filename': ('crawl-data/CC-MAIN-2019-51/segments/1575540500637.40/'