Merge pull request #31 from Sarunas-Girdenas/master

Update download.py to handle error when decoding downloaded data
michaelharms · Jul 26, 2020 · a5f3db0
2 parents 1519272 + ddd763e
commit a5f3db0
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 1 deletion.
diff --git a/comcrawl/utils/download.py b/comcrawl/utils/download.py
@@ -40,7 +40,11 @@ def download_single_result(result: Result) -> Result:
     unzipped_file = gzip.GzipFile(fileobj=zipped_file)
 
     raw_data: bytes = unzipped_file.read()
-    data: str = raw_data.decode("utf-8")
+    try:
+        data: str = raw_data.decode("utf-8")
+    except UnicodeDecodeError:
+        print(f"Warning: Could not extract file downloaded from {url}")
+        data = ''
 
     result["html"] = ""
 

diff --git a/tests/comcrawl/utils/test_download.py b/tests/comcrawl/utils/test_download.py
@@ -31,6 +31,22 @@
     'status': '301',
     'mime': 'text/html'}
 
+KNOWN_RESULT_NO_HTML_ERROR_HANDLING = {
+    'urlkey': ('com,publicstorage)/blog/seasonal/-/media/website/'
+               'blog/photos/2014/01/red-fabric-christmas-ornament-storage-box.ashx'),
+    'timestamp': '20200531041432',
+    'digest': '4R4CS4CNOPMA7H6ITWLHTTCSIXQMMYZ3',
+    'redirect': '',
+    'mime-detected': 'image/jpeg',
+    'offset': '886432941',
+    'length': '37721',
+    'filename': ('crawl-data/CC-MAIN-2020-24/segments/1590347410745.37/'
+                 'warc/CC-MAIN-20200531023023-20200531053023-00208.warc.gz'),
+    'url': ('https://www.publicstorage.com/blog/seasonal/-/media/'
+            'Website/Blog/Photos/2014/01/red-fabric-christmas-ornament-storage-box.ashx'),
+    'status': '200',
+    'mime': 'image/jpeg'}
+
 
 def test_download_single_result(snapshot):
     result = download_single_result(KNOWN_RESULT)
@@ -42,6 +58,11 @@ def test_download_single_result_without_html():
     assert result["html"] == ""
 
 
+def test_download_single_result_without_html_error_handling():
+    result = download_single_result(KNOWN_RESULT_NO_HTML_ERROR_HANDLING)
+    assert result["html"] == ""
+
+
 KNOWN_RESULTS = [{'charset': 'UTF-8',
                   'digest': '745JGUNVPWB4L3TWJIGUQRQFTFSREJ5J',
                   'filename': ('crawl-data/CC-MAIN-2019-51/segments/1575540500637.40/'