From cd9911c9f2d5e880ec87fed904b16a6f926a0bd8 Mon Sep 17 00:00:00 2001
From: YuviPanda <yuvipanda@gmail.com>
Date: Fri, 20 Dec 2024 16:43:14 -0800
Subject: [PATCH] Cleanup hydroshare provider

- Cleanup how the provider is detected, as we were simply doing
  a domain check but with many extra steps
- Move the tests to be real integration tests
- Test detection, not content_id
---
 repo2docker/contentproviders/hydroshare.py    |  69 ++++---
 tests/contentproviders/test_hydroshare.py     |  59 ++++++
 .../unit/contentproviders/test_hydroshare.py  | 188 ------------------
 3 files changed, 100 insertions(+), 216 deletions(-)
 create mode 100755 tests/contentproviders/test_hydroshare.py
 delete mode 100755 tests/unit/contentproviders/test_hydroshare.py

diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py
index 90c41914..ce870b59 100755
--- a/repo2docker/contentproviders/hydroshare.py
+++ b/repo2docker/contentproviders/hydroshare.py
@@ -2,20 +2,29 @@
 import os
 import shutil
 import time
+import tempfile
 import zipfile
 from datetime import datetime, timedelta, timezone
 from urllib.request import urlretrieve
+from urllib.parse import urlparse, urlunparse
 
 from .base import ContentProviderException
 from .doi import DoiProvider
+from ..utils import is_doi
 
 
 class Hydroshare(DoiProvider):
     """Provide contents of a Hydroshare resource."""
 
-    def _fetch_version(self, host):
-        """Fetch resource modified date and convert to epoch"""
-        json_response = self.session.get(host["version"].format(self.resource_id)).json()
+    HYDROSHARE_DOMAINS = ["www.hydroshare.org"]
+
+    def get_version(self, resource_id: str) -> str:
+        """
+        Get current version of given resource_id
+        """
+        api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements"
+
+        json_response = self.session.get(api_url).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]
@@ -26,7 +35,7 @@ def _fetch_version(self, host):
         # truncate the timestamp
         return str(int(epoch))
 
-    def detect(self, doi, ref=None, extra_args=None):
+    def detect(self, spec, ref=None, extra_args=None):
         """Trigger this provider for things that resolve to a Hydroshare resource"""
         hosts = [
             {
@@ -35,30 +44,33 @@ def detect(self, doi, ref=None, extra_args=None):
                     "http://www.hydroshare.org/resource/",
                 ],
                 "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
+                "version": "",
             }
         ]
-        url = self.doi2url(doi)
-
-        for host in hosts:
-            if any([url.startswith(s) for s in host["hostname"]]):
-                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
-                self.version = self._fetch_version(host)
-                return {
-                    "resource": self.resource_id,
-                    "host": host,
-                    "version": self.version,
-                }
+
+        # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
+
+        parsed = urlparse(url)
+
+        print(url)
+        if parsed.netloc in self.HYDROSHARE_DOMAINS:
+            return url
 
     def _urlretrieve(self, bag_url):
         return urlretrieve(bag_url)
 
     def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         """Fetch and unpack a Hydroshare resource"""
-        resource_id = spec["resource"]
-        host = spec["host"]
+        url = spec
+        print(url)
+        parts = urlparse(url)
+        self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1]
 
-        bag_url = f'{host["django_irods"]}{resource_id}'
+        bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}"))
 
         yield f"Downloading {bag_url}.\n"
 
@@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         filehandle, _ = self._urlretrieve(bag_url)
         zip_file_object = zipfile.ZipFile(filehandle, "r")
         yield "Downloaded, unpacking contents.\n"
-        zip_file_object.extractall("temp")
-        # resources store the contents in the data/contents directory, which is all we want to keep
-        contents_dir = os.path.join("temp", self.resource_id, "data", "contents")
-        files = os.listdir(contents_dir)
-        for f in files:
-            shutil.move(os.path.join(contents_dir, f), output_dir)
-        yield "Finished, cleaning up.\n"
-        shutil.rmtree("temp")
+
+        with tempfile.TemporaryDirectory() as d:
+            zip_file_object.extractall(d)
+            # resources store the contents in the data/contents directory, which is all we want to keep
+            contents_dir = os.path.join(d, self.resource_id, "data", "contents")
+            files = os.listdir(contents_dir)
+            for f in files:
+                shutil.move(os.path.join(contents_dir, f), output_dir)
+            yield "Finished, cleaning up.\n"
 
     @property
     def content_id(self):
         """The HydroShare resource ID"""
-        return f"{self.resource_id}.v{self.version}"
+        return f"{self.resource_id}"
diff --git a/tests/contentproviders/test_hydroshare.py b/tests/contentproviders/test_hydroshare.py
new file mode 100755
index 00000000..abd485f3
--- /dev/null
+++ b/tests/contentproviders/test_hydroshare.py
@@ -0,0 +1,59 @@
+import os
+import hashlib
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from repo2docker.contentproviders import Hydroshare
+
+
+@pytest.mark.parametrize(
+    ("spec", "url"),
+    [
+        # Test a hydroshare DOI
+        ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Hydroshare DOI in a different form
+        ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Test a non-hydroshare DOI
+        ("doi:10.7910/DVN/TJCLKP", None),
+        # Test a hydroshare URL
+        ("http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Test a random URL
+        ("https://www.eff.org/cyberspace-independence", None)
+    ]
+)
+def test_detect(spec, url):
+    assert Hydroshare().detect(spec) == url
+
+
+@pytest.mark.parametrize(
+    ("specs", "md5tree"),
+    [
+        (
+            ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/", ),
+            {
+                "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77",
+                "README.md": "88ac547c3a5f616f6d26e0689d63a113",
+                "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6"
+            },
+        ),
+    ],
+)
+def test_fetch(specs: list[str], md5tree):
+    dv = Hydroshare()
+
+    for spec in specs:
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(dv.detect(spec), d):
+                output.append(l)
+
+            # Verify md5 sum of the files we expect to find
+            # We are using md5 instead of something more secure because that is what
+            # dataverse itself uses
+            for subpath, expected_sha in md5tree.items():
+                with open(os.path.join(d, subpath), "rb") as f:
+                    h = hashlib.md5()
+                    h.update(f.read())
+                    assert h.hexdigest() == expected_sha
+
diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py
deleted file mode 100755
index 41e234cd..00000000
--- a/tests/unit/contentproviders/test_hydroshare.py
+++ /dev/null
@@ -1,188 +0,0 @@
-import os
-import re
-from contextlib import contextmanager
-from tempfile import NamedTemporaryFile, TemporaryDirectory
-from unittest.mock import patch
-from zipfile import ZipFile
-
-import pytest
-
-from repo2docker.contentproviders import Hydroshare
-from repo2docker.contentproviders.base import ContentProviderException
-
-doi_responses = {
-    "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": (
-        "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-    ),
-    "https://doi.org/10.21105/joss.01277": (
-        "https://joss.theoj.org/papers/10.21105/joss.01277"
-    ),
-}
-
-
-def doi_resolver(req, context):
-    resp = doi_responses.get(req.url)
-    # doi responses are redirects
-    if resp is not None:
-        context.status_code = 302
-        context.headers["Location"] = resp
-    return resp
-
-
-hydroshare_data = {
-    "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]
-}
-
-
-def test_content_id():
-    hydro = Hydroshare()
-
-    hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408"
-
-
-def test_detect_hydroshare():
-    # valid Hydroshare DOIs trigger this content provider
-    expected = {
-        "host": {
-            "hostname": [
-                "https://www.hydroshare.org/resource/",
-                "http://www.hydroshare.org/resource/",
-            ],
-            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-            "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
-        },
-        "resource": "b8f6eae9d89241cf8b5904033460af61",
-        "version": "1585005408",
-    }
-
-    assert (
-        Hydroshare().detect(
-            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-        )
-        == expected
-    )
-
-    assert (
-        Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
-    )
-
-    assert (
-        Hydroshare().detect(
-            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
-        )
-        == expected
-    )
-
-    # Don't trigger the Hydroshare content provider
-    assert Hydroshare().detect("/some/path/here") is None
-    assert Hydroshare().detect("https://example.com/path/here") is None
-
-    # don't handle DOIs that aren't from Hydroshare
-    assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
-
-
-@contextmanager
-def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
-    with NamedTemporaryFile(suffix=".zip") as zfile:
-        with ZipFile(zfile.name, mode="w") as zip:
-            zip.writestr(f"{prefix}/some-file.txt", "some content")
-            zip.writestr(f"{prefix}/some-other-file.txt", "some more content")
-
-        yield zfile
-
-
-class MockResponse:
-    def __init__(self, content_type, status_code):
-        self.status_code = status_code
-        self.headers = dict()
-        self.headers["content-type"] = content_type
-
-
-def test_fetch_bag():
-    # we "fetch" a local ZIP file to simulate a Hydroshare resource
-    with hydroshare_archive() as hydro_path:
-        with patch.object(
-            Hydroshare,
-            "urlopen",
-            side_effect=[
-                MockResponse("application/html", 200),
-                MockResponse("application/zip", 200),
-            ],
-        ):
-            with patch.object(
-                Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]
-            ):
-                hydro = Hydroshare()
-                hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"
-                spec = {
-                    "host": {
-                        "hostname": [
-                            "https://www.hydroshare.org/resource/",
-                            "http://www.hydroshare.org/resource/",
-                        ],
-                        "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                    },
-                    "resource": "123456789",
-                }
-
-                with TemporaryDirectory() as d:
-                    output = []
-                    for l in hydro.fetch(spec, d):
-                        output.append(l)
-
-                    unpacked_files = set(os.listdir(d))
-                    expected = {"some-other-file.txt", "some-file.txt"}
-                    assert expected == unpacked_files
-
-
-def test_fetch_bag_failure():
-    with hydroshare_archive():
-        with patch.object(
-            Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]
-        ):
-            hydro = Hydroshare()
-            spec = {
-                "host": {
-                    "hostname": [
-                        "https://www.hydroshare.org/resource/",
-                        "http://www.hydroshare.org/resource/",
-                    ],
-                    "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                },
-                "resource": "123456789",
-            }
-            with TemporaryDirectory() as d:
-                with pytest.raises(
-                    ContentProviderException,
-                    match=r"Failed to download bag\. status code 500\.",
-                ):
-                    # loop for yield statements
-                    for l in hydro.fetch(spec, d):
-                        pass
-
-
-def test_fetch_bag_timeout():
-    with hydroshare_archive():
-        with patch.object(
-            Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]
-        ):
-            hydro = Hydroshare()
-            spec = {
-                "host": {
-                    "hostname": [
-                        "https://www.hydroshare.org/resource/",
-                        "http://www.hydroshare.org/resource/",
-                    ],
-                    "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                },
-                "resource": "123456789",
-            }
-            with TemporaryDirectory() as d:
-                with pytest.raises(
-                    ContentProviderException,
-                    match=r"Bag taking too long to prepare, exiting now, try again later\.",
-                ):
-                    # loop for yield statements
-                    for l in hydro.fetch(spec, d, timeout=0):
-                        pass