From cd9911c9f2d5e880ec87fed904b16a6f926a0bd8 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Fri, 20 Dec 2024 16:43:14 -0800 Subject: [PATCH] Cleanup hydroshare provider - Cleanup how the provider is detected, as we were simply doing a domain check but with many extra steps - Move the tests to be real integration tests - Test detection, not content_id --- repo2docker/contentproviders/hydroshare.py | 69 ++++--- tests/contentproviders/test_hydroshare.py | 59 ++++++ .../unit/contentproviders/test_hydroshare.py | 188 ------------------ 3 files changed, 100 insertions(+), 216 deletions(-) create mode 100755 tests/contentproviders/test_hydroshare.py delete mode 100755 tests/unit/contentproviders/test_hydroshare.py diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 90c41914..ce870b59 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -2,20 +2,29 @@ import os import shutil import time +import tempfile import zipfile from datetime import datetime, timedelta, timezone from urllib.request import urlretrieve +from urllib.parse import urlparse, urlunparse from .base import ContentProviderException from .doi import DoiProvider +from ..utils import is_doi class Hydroshare(DoiProvider): """Provide contents of a Hydroshare resource.""" - def _fetch_version(self, host): - """Fetch resource modified date and convert to epoch""" - json_response = self.session.get(host["version"].format(self.resource_id)).json() + HYDROSHARE_DOMAINS = ["www.hydroshare.org"] + + def get_version(self, resource_id: str) -> str: + """ + Get current version of given resource_id + """ + api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements" + + json_response = self.session.get(api_url).json() date = next( item for item in json_response["dates"] if item["type"] == "modified" )["start_date"] @@ -26,7 +35,7 @@ def _fetch_version(self, host): # truncate the timestamp return str(int(epoch)) - def detect(self, doi, ref=None, extra_args=None): + def detect(self, spec, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Hydroshare resource""" hosts = [ { @@ -35,30 +44,33 @@ def detect(self, doi, ref=None, extra_args=None): "http://www.hydroshare.org/resource/", ], "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", + "version": "", } ] - url = self.doi2url(doi) - - for host in hosts: - if any([url.startswith(s) for s in host["hostname"]]): - self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] - self.version = self._fetch_version(host) - return { - "resource": self.resource_id, - "host": host, - "version": self.version, - } + + # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL + if is_doi(spec): + url = self.doi2url(spec) + else: + url = spec + + parsed = urlparse(url) + + print(url) + if parsed.netloc in self.HYDROSHARE_DOMAINS: + return url def _urlretrieve(self, bag_url): return urlretrieve(bag_url) def fetch(self, spec, output_dir, yield_output=False, timeout=120): """Fetch and unpack a Hydroshare resource""" - resource_id = spec["resource"] - host = spec["host"] + url = spec + print(url) + parts = urlparse(url) + self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1] - bag_url = f'{host["django_irods"]}{resource_id}' + bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}")) yield f"Downloading {bag_url}.\n" @@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120): filehandle, _ = self._urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, "r") yield "Downloaded, unpacking contents.\n" - zip_file_object.extractall("temp") - # resources store the contents in the data/contents directory, which is all we want to keep - contents_dir = os.path.join("temp", self.resource_id, "data", "contents") - files = os.listdir(contents_dir) - for f in files: - shutil.move(os.path.join(contents_dir, f), output_dir) - yield "Finished, cleaning up.\n" - shutil.rmtree("temp") + + with tempfile.TemporaryDirectory() as d: + zip_file_object.extractall(d) + # resources store the contents in the data/contents directory, which is all we want to keep + contents_dir = os.path.join(d, self.resource_id, "data", "contents") + files = os.listdir(contents_dir) + for f in files: + shutil.move(os.path.join(contents_dir, f), output_dir) + yield "Finished, cleaning up.\n" @property def content_id(self): """The HydroShare resource ID""" - return f"{self.resource_id}.v{self.version}" + return f"{self.resource_id}" diff --git a/tests/contentproviders/test_hydroshare.py b/tests/contentproviders/test_hydroshare.py new file mode 100755 index 00000000..abd485f3 --- /dev/null +++ b/tests/contentproviders/test_hydroshare.py @@ -0,0 +1,59 @@ +import os +import hashlib +from tempfile import TemporaryDirectory + +import pytest + +from repo2docker.contentproviders import Hydroshare + + +@pytest.mark.parametrize( + ("spec", "url"), + [ + # Test a hydroshare DOI + ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + # Hydroshare DOI in a different form + ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + # Test a non-hydroshare DOI + ("doi:10.7910/DVN/TJCLKP", None), + # Test a hydroshare URL + ("http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + # Test a random URL + ("https://www.eff.org/cyberspace-independence", None) + ] +) +def test_detect(spec, url): + assert Hydroshare().detect(spec) == url + + +@pytest.mark.parametrize( + ("specs", "md5tree"), + [ + ( + ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/", ), + { + "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77", + "README.md": "88ac547c3a5f616f6d26e0689d63a113", + "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6" + }, + ), + ], +) +def test_fetch(specs: list[str], md5tree): + dv = Hydroshare() + + for spec in specs: + with TemporaryDirectory() as d: + output = [] + for l in dv.fetch(dv.detect(spec), d): + output.append(l) + + # Verify md5 sum of the files we expect to find + # We are using md5 instead of something more secure because that is what + # dataverse itself uses + for subpath, expected_sha in md5tree.items(): + with open(os.path.join(d, subpath), "rb") as f: + h = hashlib.md5() + h.update(f.read()) + assert h.hexdigest() == expected_sha + diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py deleted file mode 100755 index 41e234cd..00000000 --- a/tests/unit/contentproviders/test_hydroshare.py +++ /dev/null @@ -1,188 +0,0 @@ -import os -import re -from contextlib import contextmanager -from tempfile import NamedTemporaryFile, TemporaryDirectory -from unittest.mock import patch -from zipfile import ZipFile - -import pytest - -from repo2docker.contentproviders import Hydroshare -from repo2docker.contentproviders.base import ContentProviderException - -doi_responses = { - "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": ( - "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - ), - "https://doi.org/10.21105/joss.01277": ( - "https://joss.theoj.org/papers/10.21105/joss.01277" - ), -} - - -def doi_resolver(req, context): - resp = doi_responses.get(req.url) - # doi responses are redirects - if resp is not None: - context.status_code = 302 - context.headers["Location"] = resp - return resp - - -hydroshare_data = { - "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}] -} - - -def test_content_id(): - hydro = Hydroshare() - - hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") - assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408" - - -def test_detect_hydroshare(): - # valid Hydroshare DOIs trigger this content provider - expected = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", - }, - "resource": "b8f6eae9d89241cf8b5904033460af61", - "version": "1585005408", - } - - assert ( - Hydroshare().detect( - "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - ) - == expected - ) - - assert ( - Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected - ) - - assert ( - Hydroshare().detect( - "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61" - ) - == expected - ) - - # Don't trigger the Hydroshare content provider - assert Hydroshare().detect("/some/path/here") is None - assert Hydroshare().detect("https://example.com/path/here") is None - - # don't handle DOIs that aren't from Hydroshare - assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None - - -@contextmanager -def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"): - with NamedTemporaryFile(suffix=".zip") as zfile: - with ZipFile(zfile.name, mode="w") as zip: - zip.writestr(f"{prefix}/some-file.txt", "some content") - zip.writestr(f"{prefix}/some-other-file.txt", "some more content") - - yield zfile - - -class MockResponse: - def __init__(self, content_type, status_code): - self.status_code = status_code - self.headers = dict() - self.headers["content-type"] = content_type - - -def test_fetch_bag(): - # we "fetch" a local ZIP file to simulate a Hydroshare resource - with hydroshare_archive() as hydro_path: - with patch.object( - Hydroshare, - "urlopen", - side_effect=[ - MockResponse("application/html", 200), - MockResponse("application/zip", 200), - ], - ): - with patch.object( - Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)] - ): - hydro = Hydroshare() - hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61" - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - - with TemporaryDirectory() as d: - output = [] - for l in hydro.fetch(spec, d): - output.append(l) - - unpacked_files = set(os.listdir(d)) - expected = {"some-other-file.txt", "some-file.txt"} - assert expected == unpacked_files - - -def test_fetch_bag_failure(): - with hydroshare_archive(): - with patch.object( - Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)] - ): - hydro = Hydroshare() - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - with TemporaryDirectory() as d: - with pytest.raises( - ContentProviderException, - match=r"Failed to download bag\. status code 500\.", - ): - # loop for yield statements - for l in hydro.fetch(spec, d): - pass - - -def test_fetch_bag_timeout(): - with hydroshare_archive(): - with patch.object( - Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)] - ): - hydro = Hydroshare() - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - with TemporaryDirectory() as d: - with pytest.raises( - ContentProviderException, - match=r"Bag taking too long to prepare, exiting now, try again later\.", - ): - # loop for yield statements - for l in hydro.fetch(spec, d, timeout=0): - pass