From 91242e5150bf0d2c4cd64bf3a0c447fdd16b5b2b Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Mon, 16 Dec 2024 12:53:34 -0800
Subject: [PATCH 01/19] Use the doi.org API to resolve URLs

---
 repo2docker/contentproviders/doi.py | 35 +++++++++++++++++------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py
index 64b93202..130368ba 100644
--- a/repo2docker/contentproviders/doi.py
+++ b/repo2docker/contentproviders/doi.py
@@ -46,21 +46,28 @@ def doi2url(self, doi):
         # Transform a DOI to a URL
         # If not a doi, assume we have a URL and return
         if is_doi(doi):
-            doi = normalize_doi(doi)
-
-            try:
-                resp = self._request(f"https://doi.org/{doi}")
-                resp.raise_for_status()
-            except HTTPError as e:
-                # If the DOI doesn't exist, just return URL
-                if e.response.status_code == 404:
-                    return doi
-                # Reraise any other errors because if the DOI service is down (or
-                # we hit a rate limit) we don't want to silently continue to the
-                # default Git provider as this leads to a misleading error.
-                self.log.error(f"DOI {doi} does not resolve: {e}")
+            normalized_doi = normalize_doi(doi)
+
+            # Use the doi.org resolver API
+            # documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
+            req_url = f"https://doi.org/api/handles/{normalize_doi}"
+            resp = self._request(req_url)
+            if resp.status_code == 404:
+                # Not a doi, return what we were passed in
+                return doi
+            elif resp.status_code == 200:
+                data = resp.json()
+                # Pick the first URL we find from the doi response
+                for v in data["values"]:
+                    if v["type"] == "URL":
+                        return v["data"]["string"]
+
+                # No URLs found for this doi, what do we do?
+                self.log.error(f"DOI {normalized_doi} doesn't point to any URLs")
+                return doi
+            else:
+                # If we get any other status codes, raise error
                 raise
-            return resp.url
         else:
             # Just return what is actually just a URL
             return doi

From 52eeb8f16126ab6d36b91988c71197ddafaa54d9 Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Mon, 16 Dec 2024 14:51:39 -0800
Subject: [PATCH 02/19] Stop mocking dataverse contentprovider test

As I was debugging https://github.com/jupyterhub/repo2docker/pull/1388, I
realized that PR actually broke the dataverse provider, but the existing
test was mocking so much that we didn't actually catch it!

IMO, since the point of contentproviders is to integrate with external
content providers, their tests should be integration tests, so we can catch
issues with those providers more easily. Integration tests would have caught
https://jupyter.zulipchat.com/#narrow/channel/103349-ask-anything/topic/Binder.20Dataverse.20error
more cleanly than how it actually surfaced, for example.

This PR removes all mocks from the dataverse test, and we immediately
benefit - it shows us that the dataverse provider *only* actually handles
DOIs, and not direct URLs! So even though we technically had tests earlier
that showed our dataverse provider supporting direct dataverse URLs, that
support simply did not exist. Now we actually catch the failure.

I will try to see if we can use a demo or test instance for the fetch test
though, so we don't screw up download stats even more for the existing test
doi we use.
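For reference, a minimal standalone sketch (not part of this commit) of the
kind of unmocked check this direction implies: resolve a known test DOI
through the real doi.org handle API that the previous patch switched to, and
verify it lands on Harvard Dataverse. The helper name `resolve_doi` and the
direct use of `requests` here are illustrative assumptions, not repo2docker
code:

    import requests

    def resolve_doi(doi: str) -> str:
        # The doi.org proxy REST API returns JSON with a "values" list;
        # entries of type "URL" carry the resolved location.
        resp = requests.get(f"https://doi.org/api/handles/{doi}")
        resp.raise_for_status()
        for v in resp.json()["values"]:
            if v["type"] == "URL":
                return v["data"]["value"]
        raise ValueError(f"{doi} does not resolve to a URL")

    # Hits the real service - exactly the kind of dependency an
    # integration test accepts in exchange for catching real breakage.
    url = resolve_doi("10.7910/DVN/TJCLKP")
    assert url.startswith("https://dataverse.harvard.edu/")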
--- .github/workflows/test.yml | 1 + tests/contentproviders/test_dataverse.py | 54 ++++++ tests/unit/contentproviders/test_dataverse.py | 160 ------------------ 3 files changed, 55 insertions(+), 160 deletions(-) create mode 100644 tests/contentproviders/test_dataverse.py delete mode 100644 tests/unit/contentproviders/test_dataverse.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6f7001d2..30045851 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -60,6 +60,7 @@ jobs: - r - unit - venv + - contentproviders include: # The actions/setup-python action with Python version 3.6 isn't # possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04 diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py new file mode 100644 index 00000000..16ee2151 --- /dev/null +++ b/tests/contentproviders/test_dataverse.py @@ -0,0 +1,54 @@ +import hashlib +import os +from tempfile import TemporaryDirectory + +import pytest + +from repo2docker.contentproviders import Dataverse + +test_dv = Dataverse() +harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse") +cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data") + +@pytest.mark.parametrize( + ("doi", "resolved"), + [ + ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), + ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), + ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), + ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), + ("/some/random/string", None), + ("https://example.com/path/here", None), + # Non dataverse DOIs + ("https://doi.org/10.21105/joss.01277", None) + ] +) +def test_detect(doi, resolved): + assert Dataverse().detect(doi) == resolved + + +def test_dataverse_fetch(): + spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/TJCLKP"} + + dv = Dataverse() + + with TemporaryDirectory() as d: + output = [] + for l in dv.fetch(spec, d): + output.append(l) + + # Verify two directories + assert set(os.listdir(d)) == {"data", "code"} + + # Verify sha256sum of three files + expected_sha = { + 'data/primary/primary-data.zip': '880f99a1e1d54a2553be61301f92e06b29236785b8d4d1b7ad0b4595d9d7512b', + 'data/2023-01-03.tsv': 'cc9759e8e6bc076dd7c1a8eb53a7ea3d38e8697fa9f544d15768db308516cc5f', + 'code/language.py': '1ffb3b3cdc9de01279779f3fc88824672c8ec3ab1c41ecdd5c1b59a9b0202215' + } + + for subpath, expected_sha in expected_sha.items(): + with open(os.path.join(d, subpath), 'rb') as f: + h = hashlib.sha256() + h.update(f.read()) + assert h.hexdigest() == expected_sha \ No newline at end of file diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py deleted file mode 100644 index abdfe456..00000000 --- a/tests/unit/contentproviders/test_dataverse.py +++ /dev/null @@ -1,160 +0,0 @@ -import json -import os -import re -from io import BytesIO -from tempfile import TemporaryDirectory -from unittest.mock import patch -from urllib.parse import urlsplit -from urllib.request import Request, urlopen - -import pytest - -from repo2docker.contentproviders import Dataverse - -test_dv = Dataverse() -harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse") -cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research 
Data") -test_hosts = [ - ( - [ - "doi:10.7910/DVN/6ZXAGT/3YRRYJ", - "10.7910/DVN/6ZXAGT", - "https://dataverse.harvard.edu/api/access/datafile/3323458", - "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", - ], - [ - {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}, - {"host": cimmyt_dv, "record": "hdl:11529/10016"}, - ], - ) -] -doi_responses = { - "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": ( - "https://dataverse.harvard.edu/file.xhtml" - "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ" - ), - "https://doi.org/10.7910/DVN/6ZXAGT": ( - "https://dataverse.harvard.edu/dataset.xhtml" - "?persistentId=doi:10.7910/DVN/6ZXAGT" - ), - "https://dataverse.harvard.edu/api/access/datafile/3323458": ( - "https://dataverse.harvard.edu/api/access/datafile/3323458" - ), - "https://doi.org/10.21105/joss.01277": ( - "https://joss.theoj.org/papers/10.21105/joss.01277" - ), -} - - -@pytest.mark.parametrize("test_input, expected", test_hosts) -def test_detect_dataverse(test_input, expected, requests_mock): - def doi_resolver(req, context): - resp = doi_responses.get(req.url) - # doi responses are redirects - if resp is not None: - context.status_code = 302 - context.headers["Location"] = resp - return resp - - requests_mock.get(re.compile("https://"), json=doi_resolver) - requests_mock.get( - "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file", - json={ - "data": { - "count_in_response": 1, - "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}], - } - }, - ) - - assert requests_mock.call_count == 0 - # valid Dataverse DOIs trigger this content provider - assert Dataverse().detect(test_input[0]) == expected[0] - # 4: doi resolution (302), File, doi resolution (302), then dataset - assert requests_mock.call_count == 4 - requests_mock.reset_mock() - - assert Dataverse().detect(test_input[1]) == expected[0] - # 2: doi (302), dataset - assert requests_mock.call_count == 2 - requests_mock.reset_mock() - - assert Dataverse().detect(test_input[2]) == expected[0] - # 1: datafile (search dataverse for the file id) - assert requests_mock.call_count == 1 - requests_mock.reset_mock() - - assert Dataverse().detect(test_input[3]) == expected[1] - requests_mock.reset_mock() - - # Don't trigger the Dataverse content provider - assert Dataverse().detect("/some/path/here") is None - assert Dataverse().detect("https://example.com/path/here") is None - # don't handle DOIs that aren't from Dataverse - assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None - - -@pytest.fixture -def dv_files(tmpdir): - f1 = tmpdir.join("some-file.txt") - f1.write("some content") - - f2 = tmpdir.mkdir("directory").join("some-other-file.txt") - f2.write("some other content") - - f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt") - f3.write("yet another content") - - return [f1, f2, f3] - - -def test_dataverse_fetch(dv_files, requests_mock): - mock_response = { - "data": { - "latestVersion": { - "files": [ - {"dataFile": {"id": 1}, "label": "some-file.txt"}, - { - "dataFile": {"id": 2}, - "label": "some-other-file.txt", - "directoryLabel": "directory", - }, - { - "dataFile": {"id": 3}, - "label": "the-other-file.txt", - "directoryLabel": "directory/subdirectory", - }, - ] - } - } - } - - spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"} - - def mock_filecontent(req, context): - parts = urlsplit(req.url) - file_no = int(parts.path.split("/")[-1]) - 1 - return open(dv_files[file_no], "rb").read() - - requests_mock.get( - 
"https://dataverse.harvard.edu/api/datasets/" - ":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT", - json=mock_response, - ) - requests_mock.get( - re.compile("https://dataverse.harvard.edu/api/access/datafile"), - content=mock_filecontent, - ) - - dv = Dataverse() - - with TemporaryDirectory() as d: - output = [] - for l in dv.fetch(spec, d): - output.append(l) - unpacked_files = set(os.listdir(d)) - expected = {"directory", "some-file.txt"} - assert expected == unpacked_files - assert os.path.isfile( - os.path.join(d, "directory", "subdirectory", "the-other-file.txt") - ) From bf40856c8315ab1fa693fdc4f2c1225ef09035ac Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 16 Dec 2024 15:02:00 -0800 Subject: [PATCH 03/19] Fix text fixtures --- tests/contentproviders/test_dataverse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py index 16ee2151..b5457c69 100644 --- a/tests/contentproviders/test_dataverse.py +++ b/tests/contentproviders/test_dataverse.py @@ -15,8 +15,8 @@ [ ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), - ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), - ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), + ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "record": "doi:10.7910/DVN/3MJ7IR"}), + ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "record": "hdl:11529/10016"}), ("/some/random/string", None), ("https://example.com/path/here", None), # Non dataverse DOIs From 172f8b017d82b4bcad55448882804e4a24a737af Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 16 Dec 2024 16:05:30 -0800 Subject: [PATCH 04/19] [WIP] Cleanup dataverse contentprovider --- repo2docker/contentproviders/dataverse.py | 111 ++++++++++++++-------- repo2docker/contentproviders/doi.py | 4 +- tests/contentproviders/test_dataverse.py | 23 ++++- 3 files changed, 92 insertions(+), 46 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 9054f53c..a8ee69fc 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -3,7 +3,7 @@ import shutil from urllib.parse import parse_qs, urlparse, urlunparse -from ..utils import copytree, deep_get +from ..utils import copytree, deep_get, is_doi from .doi import DoiProvider @@ -23,10 +23,11 @@ def __init__(self): self.hosts = json.load(fp)["installations"] super().__init__() - def detect(self, doi, ref=None, extra_args=None): - """Trigger this provider for things that resolve to a Dataverse dataset. + def detect(self, spec, ref=None, extra_args=None): + """ + Detect if given spec is hosted on dataverse - Handles: + The spec can be: - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId} - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&... 
- URL {siteURL}/api/access/datafile/{fileId} @@ -35,9 +36,11 @@ def detect(self, doi, ref=None, extra_args=None): - https://dataverse.harvard.edu/api/access/datafile/3323458 - doi:10.7910/DVN/6ZXAGT - doi:10.7910/DVN/6ZXAGT/3YRRYJ - """ - url = self.doi2url(doi) + if is_doi(spec): + url = self.doi2url(spec) + else: + url = spec # Parse the url, to get the base for later API calls parsed_url = urlparse(url) @@ -53,51 +56,77 @@ def detect(self, doi, ref=None, extra_args=None): if host is None: return - query_args = parse_qs(parsed_url.query) - # Corner case handling - if parsed_url.path.startswith("/file.xhtml"): - # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi - # is structured as "doi:/" and try to handle dataset that way. - new_doi = doi.rsplit("/", 1)[0] - if new_doi == doi: - # tough luck :( Avoid inifite recursion and exit. - return - return self.detect(new_doi) - elif parsed_url.path.startswith("/api/access/datafile"): - # Raw url pointing to a datafile is a typical output from an External Tool integration + # At this point, we *know* this is a dataverse URL, because: + # 1. The DOI resolved to a particular host (if using DOI) + # 2. The host is in the list of known dataverse installations + # + # We don't know exactly what kind of dataverse object this is, but + # that can be figured out during fetch as needed + return {"host": host, "url": url} + + def get_persistent_id_from_url(self, url: str) -> str: + """ + Return the persistentId for given dataverse URL. + + Supports the following *dataset* URL styles: + - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP + - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP + + Supports the following *file* URL styles: + - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458 + + Supports a subset of the following *file* URL styles: + - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ + + If a URL can not be parsed, throw an exception + """ + parsed_url = urlparse(url) + path = parsed_url.path + qs = parse_qs(parsed_url.query) + + # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP + # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP + if path.startswith("/citation") or path.startswith("/dataset.xhtml"): + return qs["persistentId"][0] + # https://dataverse.harvard.edu/api/access/datafile/3323458 + elif path.startswith("/api/access/datafile"): + # What we have here is an entity id, which we can use to get a persistentId entity_id = os.path.basename(parsed_url.path) - search_query = "q=entityId:" + entity_id + "&type=file" - # Knowing the file identifier query search api to get parent dataset - search_url = urlunparse( + # FIXME: Should we be URL Encoding something here to protect from path traversal + # or similar attacks? 
+ search_query = f"q=entityId:{entity_id}&type=file" + search_api_url = urlunparse( parsed_url._replace(path="/api/search", query=search_query) ) - self.log.debug("Querying Dataverse: " + search_url) - data = self.urlopen(search_url).json()["data"] + self.log.debug("Querying Dataverse: " + search_api_url) + data = self.urlopen(search_api_url).json()["data"] if data["count_in_response"] != 1: - self.log.debug( - f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n" + raise ValueError( + f"Dataverse search query failed!\n - url: {url}\n - resp: {json.dumps(data)}\n" ) - return - - self.record_id = deep_get(data, "items.0.dataset_persistent_id") - elif ( - parsed_url.path.startswith("/dataset.xhtml") - and "persistentId" in query_args - ): - self.record_id = deep_get(query_args, "persistentId.0") + return data["items"][0]["dataset_persistent_id"] + elif parsed_url.path.startswith("/file.xhtml"): + file_persistent_id = qs['persistentId'][0] + dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0] + if file_persistent_id == dataset_persistent_id: + # We can't figure this one out, throw an error + raise ValueError(f"Could not find dataset id for {url}") + return dataset_persistent_id - if hasattr(self, "record_id"): - return {"record": self.record_id, "host": host} + raise ValueError(f"Could not determine persistent id for dataverse URL {url}") def fetch(self, spec, output_dir, yield_output=False): """Fetch and unpack a Dataverse dataset.""" - record_id = spec["record"] + url = spec["url"] host = spec["host"] - yield f"Fetching Dataverse record {record_id}.\n" - url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}' + persistent_id = self.get_persistent_id_from_url(url) + + yield f"Fetching Dataverse record {persistent_id}.\n" + url = f'{host["url"]}/api/datasets/:persistentId?persistentId={persistent_id}' resp = self.urlopen(url, headers={"accept": "application/json"}) + print(resp.json()) record = resp.json()["data"] for fobj in deep_get(record, "latestVersion.files"): @@ -126,7 +155,11 @@ def fetch(self, spec, output_dir, yield_output=False): copytree(os.path.join(output_dir, d), output_dir) shutil.rmtree(os.path.join(output_dir, d)) + + # Save persistent id + self.persitent_id = persistent_id + @property def content_id(self): """The Dataverse persistent identifier.""" - return self.record_id + return self.persistent_id diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py index 130368ba..cf43a157 100644 --- a/repo2docker/contentproviders/doi.py +++ b/repo2docker/contentproviders/doi.py @@ -50,7 +50,7 @@ def doi2url(self, doi): # Use the doi.org resolver API # documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api - req_url = f"https://doi.org/api/handles/{normalize_doi}" + req_url = f"https://doi.org/api/handles/{normalized_doi}" resp = self._request(req_url) if resp.status_code == 404: # Not a doi, return what we were passed in @@ -60,7 +60,7 @@ def doi2url(self, doi): # Pick the first URL we find from the doi response for v in data["values"]: if v["type"] == "URL": - return v["data"]["string"] + return v["data"]["value"] # No URLs found for this doi, what do we do? 
self.log.error("DOI {normalized_doi} doesn't point to any URLs") diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py index b5457c69..3fcefd32 100644 --- a/tests/contentproviders/test_dataverse.py +++ b/tests/contentproviders/test_dataverse.py @@ -13,10 +13,11 @@ @pytest.mark.parametrize( ("doi", "resolved"), [ - ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), - ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}), - ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "record": "doi:10.7910/DVN/3MJ7IR"}), - ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "record": "hdl:11529/10016"}), + ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}), + ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}), + ("10.7910/DVN/TJCLKP", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"}), + ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/api/access/datafile/3323458"}), + ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"}), ("/some/random/string", None), ("https://example.com/path/here", None), # Non dataverse DOIs @@ -27,10 +28,22 @@ def test_detect(doi, resolved): assert Dataverse().detect(doi) == resolved +@pytest.mark.parametrize( + ("url", "persistent_id"), + [ + ("https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT"), + ("https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP"), + ("https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR"), + ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016"), + ] +) +def test_get_persistent_id(url, persistent_id): + assert Dataverse().get_persistent_id_from_url(url) == persistent_id + def test_dataverse_fetch(): - spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/TJCLKP"} dv = Dataverse() + spec = dv.detect("doi:10.7910/DVN/TJCLKP") with TemporaryDirectory() as d: output = [] From 1260a5a394665d8de7ca8de09b18ddecb0ebfa3b Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 16 Dec 2024 17:02:25 -0800 Subject: [PATCH 05/19] Support fetcing single files in dataverse --- repo2docker/contentproviders/dataverse.py | 42 ++++++-- tests/contentproviders/test_dataverse.py | 113 ++++++++++++++++------ 2 files changed, 118 insertions(+), 37 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index a8ee69fc..9b42b07f 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -106,7 +106,7 @@ def get_persistent_id_from_url(self, url: str) -> str: ) return data["items"][0]["dataset_persistent_id"] elif parsed_url.path.startswith("/file.xhtml"): - file_persistent_id = qs['persistentId'][0] + file_persistent_id = qs["persistentId"][0] dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0] if file_persistent_id == dataset_persistent_id: # We can't figure this one out, throw an error 
@@ -115,6 +115,38 @@ def get_persistent_id_from_url(self, url: str) -> str: raise ValueError(f"Could not determine persistent id for dataverse URL {url}") + def get_datafiles(self, host: str, persistent_id: str) -> list[dict]: + """ + Return a list of dataFiles for given persistent_id + """ + dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}" + + resp = self._request(dataset_url, headers={"accept": "application/json"}) + # Assume it's a dataset + is_dataset = True + if resp.status_code == 404: + # It's possible this is a *file* persistent_id, not a dataset one + file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}" + resp = self._request(file_url, headers={"accept": "application/json"}) + + if resp.status_code == 404: + # This persistent id is just not here + raise ValueError(f"{persistent_id} on {host} is not found") + + # It's not a dataset, it's a file! + is_dataset = False + + # We already handled 404, raise error for everything else + resp.raise_for_status() + + data = resp.json()["data"] + + if is_dataset: + return data["latestVersion"]["files"] + else: + # Only one file object + return [data] + def fetch(self, spec, output_dir, yield_output=False): """Fetch and unpack a Dataverse dataset.""" url = spec["url"] @@ -123,13 +155,8 @@ def fetch(self, spec, output_dir, yield_output=False): persistent_id = self.get_persistent_id_from_url(url) yield f"Fetching Dataverse record {persistent_id}.\n" - url = f'{host["url"]}/api/datasets/:persistentId?persistentId={persistent_id}' - - resp = self.urlopen(url, headers={"accept": "application/json"}) - print(resp.json()) - record = resp.json()["data"] - for fobj in deep_get(record, "latestVersion.files"): + for fobj in self.get_datafiles(host["url"], persistent_id): file_url = ( # without format=original you get the preservation format (plain text, tab separated) f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original' @@ -155,7 +182,6 @@ def fetch(self, spec, output_dir, yield_output=False): copytree(os.path.join(output_dir, d), output_dir) shutil.rmtree(os.path.join(output_dir, d)) - # Save persistent id self.persitent_id = persistent_id diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py index 3fcefd32..e0c7e178 100644 --- a/tests/contentproviders/test_dataverse.py +++ b/tests/contentproviders/test_dataverse.py @@ -10,19 +10,50 @@ harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse") cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data") + @pytest.mark.parametrize( ("doi", "resolved"), [ - ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}), - ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}), - ("10.7910/DVN/TJCLKP", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"}), - ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/api/access/datafile/3323458"}), - ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"}), + ( + "doi:10.7910/DVN/6ZXAGT/3YRRYJ", + { + "host": harvard_dv, + "url": 
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + }, + ), + ( + "10.7910/DVN/6ZXAGT/3YRRYJ", + { + "host": harvard_dv, + "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + }, + ), + ( + "10.7910/DVN/TJCLKP", + { + "host": harvard_dv, + "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", + }, + ), + ( + "https://dataverse.harvard.edu/api/access/datafile/3323458", + { + "host": harvard_dv, + "url": "https://dataverse.harvard.edu/api/access/datafile/3323458", + }, + ), + ( + "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", + { + "host": cimmyt_dv, + "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", + }, + ), ("/some/random/string", None), ("https://example.com/path/here", None), # Non dataverse DOIs - ("https://doi.org/10.21105/joss.01277", None) - ] + ("https://doi.org/10.21105/joss.01277", None), + ], ) def test_detect(doi, resolved): assert Dataverse().detect(doi) == resolved @@ -31,37 +62,61 @@ def test_detect(doi, resolved): @pytest.mark.parametrize( ("url", "persistent_id"), [ - ("https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT"), - ("https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP"), - ("https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR"), - ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016"), - ] + ( + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + "doi:10.7910/DVN/6ZXAGT", + ), + ( + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", + "doi:10.7910/DVN/TJCLKP", + ), + ( + "https://dataverse.harvard.edu/api/access/datafile/3323458", + "doi:10.7910/DVN/3MJ7IR", + ), + ( + "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", + "hdl:11529/10016", + ), + ], ) def test_get_persistent_id(url, persistent_id): assert Dataverse().get_persistent_id_from_url(url) == persistent_id -def test_dataverse_fetch(): +@pytest.mark.parametrize( + ("spec", "md5tree"), + [ + ( + "doi:10.7910/DVN/TJCLKP", + { + "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694", + "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917", + "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e", + }, + ), + ( + # A citation targeting a single file + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + { + "ARCHAEOGLOBE_CONSENSUS_ASSESSMENT.tab": "17a91888ed8e91dfb63acbbab6127ac5" + } + ) + ], +) +def test_fetch(spec, md5tree): dv = Dataverse() - spec = dv.detect("doi:10.7910/DVN/TJCLKP") with TemporaryDirectory() as d: output = [] - for l in dv.fetch(spec, d): + for l in dv.fetch(dv.detect(spec), d): output.append(l) - # Verify two directories - assert set(os.listdir(d)) == {"data", "code"} - - # Verify sha256sum of three files - expected_sha = { - 'data/primary/primary-data.zip': '880f99a1e1d54a2553be61301f92e06b29236785b8d4d1b7ad0b4595d9d7512b', - 'data/2023-01-03.tsv': 'cc9759e8e6bc076dd7c1a8eb53a7ea3d38e8697fa9f544d15768db308516cc5f', - 'code/language.py': '1ffb3b3cdc9de01279779f3fc88824672c8ec3ab1c41ecdd5c1b59a9b0202215' - } - - for subpath, expected_sha in expected_sha.items(): - with open(os.path.join(d, subpath), 'rb') as f: - h = hashlib.sha256() + # Verify md5 sum of the files we expect to find + # We are using md5 instead of something 
more secure because that is what + # dataverse itself uses + for subpath, expected_sha in md5tree.items(): + with open(os.path.join(d, subpath), "rb") as f: + h = hashlib.md5() h.update(f.read()) - assert h.hexdigest() == expected_sha \ No newline at end of file + assert h.hexdigest() == expected_sha From b7050ba096eea7f1e2ae96ba438783ea982ba4bd Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 16 Dec 2024 19:55:23 -0800 Subject: [PATCH 06/19] Always fetch entire dataset for dataverse --- repo2docker/contentproviders/dataverse.py | 94 ++++++++++++----------- tests/contentproviders/test_dataverse.py | 72 ++++++++++++----- 2 files changed, 103 insertions(+), 63 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 9b42b07f..88640a2a 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -64,6 +64,26 @@ def detect(self, spec, ref=None, extra_args=None): # that can be figured out during fetch as needed return {"host": host, "url": url} + def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str: + """ + Return the persistent_id (DOI) that a given file_id (int or doi) belongs to + """ + if file_id.isdigit(): + # the file_id is an integer, rather than a persistent id (DOI) + api_url = f"{host}/api/files/{file_id}?returnDatasetVersion=true" + else: + # the file_id is a doi itself + api_url = f"{host}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true" + + resp = self._request(api_url) + if resp.status_code == 404: + raise ValueError(f"File with id {file_id} not found in {host}") + + resp.raise_for_status() + + data = resp.json()["data"] + return data["datasetVersion"]["datasetPersistentId"] + def get_persistent_id_from_url(self, url: str) -> str: """ Return the persistentId for given dataverse URL. @@ -80,72 +100,56 @@ def get_persistent_id_from_url(self, url: str) -> str: If a URL can not be parsed, throw an exception """ + + def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]: + """ + Return a list of dataFiles for given persistent_id + """ + parsed_url = urlparse(url) path = parsed_url.path qs = parse_qs(parsed_url.query) + dataverse_host = f"{parsed_url.scheme}://{parsed_url.netloc}" + url_kind = None + persistent_id = None + is_ambiguous = False # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP + if path.startswith("/citation"): + is_ambiguous = True + persistent_id = qs["persistentId"][0] # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP - if path.startswith("/citation") or path.startswith("/dataset.xhtml"): - return qs["persistentId"][0] + elif path.startswith("/dataset.xhtml"): # https://dataverse.harvard.edu/api/access/datafile/3323458 + persistent_id = qs["persistentId"][0] elif path.startswith("/api/access/datafile"): # What we have here is an entity id, which we can use to get a persistentId - entity_id = os.path.basename(parsed_url.path) - # FIXME: Should we be URL Encoding something here to protect from path traversal - # or similar attacks? 
- search_query = f"q=entityId:{entity_id}&type=file" - search_api_url = urlunparse( - parsed_url._replace(path="/api/search", query=search_query) - ) - self.log.debug("Querying Dataverse: " + search_api_url) - data = self.urlopen(search_api_url).json()["data"] - if data["count_in_response"] != 1: - raise ValueError( - f"Dataverse search query failed!\n - url: {url}\n - resp: {json.dumps(data)}\n" - ) - return data["items"][0]["dataset_persistent_id"] + file_id = os.path.basename(parsed_url.path) + persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_id) elif parsed_url.path.startswith("/file.xhtml"): file_persistent_id = qs["persistentId"][0] - dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0] - if file_persistent_id == dataset_persistent_id: - # We can't figure this one out, throw an error - raise ValueError(f"Could not find dataset id for {url}") - return dataset_persistent_id - - raise ValueError(f"Could not determine persistent id for dataverse URL {url}") - - def get_datafiles(self, host: str, persistent_id: str) -> list[dict]: - """ - Return a list of dataFiles for given persistent_id - """ - dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}" + persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_persistent_id) + else: + raise ValueError(f"Could not determine persistent id for dataverse URL {url}") - resp = self._request(dataset_url, headers={"accept": "application/json"}) - # Assume it's a dataset - is_dataset = True - if resp.status_code == 404: + dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" + resp = self._request(dataset_api_url, headers={"accept": "application/json"}) + if resp.status_code == 404 and is_ambiguous: # It's possible this is a *file* persistent_id, not a dataset one - file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}" - resp = self._request(file_url, headers={"accept": "application/json"}) + persistent_id = self.get_dataset_id_from_file_id(dataverse_host, persistent_id) + dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" + resp = self._request(dataset_api_url, headers={"accept": "application/json"}) if resp.status_code == 404: # This persistent id is just not here - raise ValueError(f"{persistent_id} on {host} is not found") - - # It's not a dataset, it's a file! 
- is_dataset = False + raise ValueError(f"{persistent_id} on {dataverse_host} is not found") # We already handled 404, raise error for everything else resp.raise_for_status() data = resp.json()["data"] - if is_dataset: - return data["latestVersion"]["files"] - else: - # Only one file object - return [data] + return data["latestVersion"]["files"] def fetch(self, spec, output_dir, yield_output=False): """Fetch and unpack a Dataverse dataset.""" @@ -156,7 +160,7 @@ def fetch(self, spec, output_dir, yield_output=False): yield f"Fetching Dataverse record {persistent_id}.\n" - for fobj in self.get_datafiles(host["url"], persistent_id): + for fobj in self.get_datafiles(host["url"], url): file_url = ( # without format=original you get the preservation format (plain text, tab separated) f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original' diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py index e0c7e178..2ab9ec69 100644 --- a/tests/contentproviders/test_dataverse.py +++ b/tests/contentproviders/test_dataverse.py @@ -85,10 +85,13 @@ def test_get_persistent_id(url, persistent_id): @pytest.mark.parametrize( - ("spec", "md5tree"), + ("specs", "md5tree"), [ ( - "doi:10.7910/DVN/TJCLKP", + ( + "doi:10.7910/DVN/TJCLKP", + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", + ), { "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694", "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917", @@ -96,27 +99,60 @@ def test_get_persistent_id(url, persistent_id): }, ), ( - # A citation targeting a single file - "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + ( + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + "doi:10.7910/DVN/6ZXAGT/3YRRYJ" + ), { - "ARCHAEOGLOBE_CONSENSUS_ASSESSMENT.tab": "17a91888ed8e91dfb63acbbab6127ac5" + 'ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png': '243c6a3dd66bc3c84102829b277ef333', + 'ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png': '2ace6ae9d470dda6cf2f9f9a6588171a', + 'ArchaeoGLOBE-master/analysis/figures/3_trends_global.png': '63ccd0a7b2d20440cd8f418d4ee88c4d', + 'ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png': 'facfaedabeac77c4496d4b9e962a917f', + 'ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png': '8e002e4d50f179fc1808f562b1353588', + 'ArchaeoGLOBE-master/apt.txt': 'b4224032da6c71d48f46c9b78fc6ed77', + 'ArchaeoGLOBE-master/analysis/archaeoglobe.pdf': 'f575be4790efc963ef1bd40d097cc06d', + 'ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd': 'f37d5f7993fde9ebd64d16b20fc22905', + 'ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj': 'd0250e7918993bab1e707358fe5633e0', + 'ArchaeoGLOBE-master/CONDUCT.md': 'f87ef290340322089c32b4e573d8f1e8', + 'ArchaeoGLOBE-master/.circleci/config.yml': '6eaa54073a682b3195d8fab3a9dd8344', + 'ArchaeoGLOBE-master/CONTRIBUTING.md': 'b3a6abfc749dd155a3049f94a855bf9f', + 'ArchaeoGLOBE-master/DESCRIPTION': '745ef979494999e483987de72c0adfbd', + 'ArchaeoGLOBE-master/dockerfile': 'aedce68e5a7d6e79cbb24c9cffeae593', + 'ArchaeoGLOBE-master/.binder/Dockerfile': '7564a41246ba99b60144afb1d3b6d7de', + 'ArchaeoGLOBE-master/.gitignore': '62c1482e4febbd35dc02fb7e2a31246b', + 'ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS': '2aea7748b5586923b0de9d13af58e59d', + 
'ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS': '145a9e5dd2c95625626a720b52178b70', + 'ArchaeoGLOBE-master/LICENSE.md': '3aa9d41a92a57944bd4590e004898445', + 'ArchaeoGLOBE-master/analysis/data/derived-data/placeholder': 'd41d8cd98f00b204e9800998ecf8427e', + 'ArchaeoGLOBE-master/.Rbuildignore': 'df15e4fed49abd685b536fef4472b01f', + 'ArchaeoGLOBE-master/README.md': '0b0faabe580c4d76a0e0d64a4f54bca4', + 'ArchaeoGLOBE-master/analysis/data/derived-data/README.md': '547fd1a6e874f6178b1cf525b5b9ae72', + 'ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png': 'd2584352e5442b33e4b23e361ca70fe1', + 'ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png': '513eddfdad01fd01a20263a55ca6dbe3', + 'ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png': 'b16ba0ecd21b326f873209a7e55a8deb', + 'ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png': '05695f9412337a00c1cb6d1757d0ec5c', + 'ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png': '10119f7495d3b8e7ad7f8a0770574f15', + 'ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png': 'b1db7c97f39ccfc3a9e094c3e6307af0', + 'ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png': '30341748324f5f66acadb34c114c3e9d', } ) ], ) -def test_fetch(spec, md5tree): +def test_fetch(specs: list[str], md5tree): dv = Dataverse() - with TemporaryDirectory() as d: - output = [] - for l in dv.fetch(dv.detect(spec), d): - output.append(l) + for spec in specs: + with TemporaryDirectory() as d: + output = [] + for l in dv.fetch(dv.detect(spec), d): + output.append(l) - # Verify md5 sum of the files we expect to find - # We are using md5 instead of something more secure because that is what - # dataverse itself uses - for subpath, expected_sha in md5tree.items(): - with open(os.path.join(d, subpath), "rb") as f: - h = hashlib.md5() - h.update(f.read()) - assert h.hexdigest() == expected_sha + # Verify md5 sum of the files we expect to find + # We are using md5 instead of something more secure because that is what + # dataverse itself uses + for subpath, expected_sha in md5tree.items(): + with open(os.path.join(d, subpath), "rb") as f: + h = hashlib.md5() + h.update(f.read()) + assert h.hexdigest() == expected_sha From fde74efc2e1c3881d0dd41e95d41826c642e15fc Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 17 Dec 2024 09:14:32 -0800 Subject: [PATCH 07/19] Fix content_id for dataverse URLs --- repo2docker/contentproviders/dataverse.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 88640a2a..90215748 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -1,6 +1,7 @@ import json import os import shutil +import hashlib from urllib.parse import parse_qs, urlparse, urlunparse from ..utils import copytree, deep_get, is_doi @@ -56,6 +57,9 @@ def detect(self, spec, ref=None, extra_args=None): if host is None: return + # Used only for content_id + self.url = url + # At this point, we *know* this is a dataverse URL, because: # 1. The DOI resolved to a particular host (if using DOI) # 2. 
The host is in the list of known dataverse installations @@ -84,9 +88,9 @@ def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str: data = resp.json()["data"] return data["datasetVersion"]["datasetPersistentId"] - def get_persistent_id_from_url(self, url: str) -> str: + def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]: """ - Return the persistentId for given dataverse URL. + Return a list of dataFiles for given persistent_id Supports the following *dataset* URL styles: - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP @@ -101,11 +105,6 @@ def get_persistent_id_from_url(self, url: str) -> str: If a URL can not be parsed, throw an exception """ - def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]: - """ - Return a list of dataFiles for given persistent_id - """ - parsed_url = urlparse(url) path = parsed_url.path qs = parse_qs(parsed_url.query) @@ -156,9 +155,7 @@ def fetch(self, spec, output_dir, yield_output=False): url = spec["url"] host = spec["host"] - persistent_id = self.get_persistent_id_from_url(url) - - yield f"Fetching Dataverse record {persistent_id}.\n" + yield f"Fetching Dataverse record {url}.\n" for fobj in self.get_datafiles(host["url"], url): file_url = ( @@ -186,10 +183,7 @@ def fetch(self, spec, output_dir, yield_output=False): copytree(os.path.join(output_dir, d), output_dir) shutil.rmtree(os.path.join(output_dir, d)) - # Save persistent id - self.persitent_id = persistent_id - @property def content_id(self): """The Dataverse persistent identifier.""" - return self.persistent_id + return self.url From 96057f98216ba364d671f615fab34ad5462bd25d Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 17 Dec 2024 09:30:53 -0800 Subject: [PATCH 08/19] Use List from typing --- repo2docker/contentproviders/dataverse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 90215748..5797aacc 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -2,7 +2,8 @@ import os import shutil import hashlib -from urllib.parse import parse_qs, urlparse, urlunparse +from urllib.parse import parse_qs, urlparse +from typing import List from ..utils import copytree, deep_get, is_doi from .doi import DoiProvider @@ -88,7 +89,7 @@ def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str: data = resp.json()["data"] return data["datasetVersion"]["datasetPersistentId"] - def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]: + def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]: """ Return a list of dataFiles for given persistent_id From fda5339bd4026bbb6b736ea90f53e5b25d7bd6d3 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 17 Dec 2024 09:30:59 -0800 Subject: [PATCH 09/19] Use hash for content_id --- repo2docker/contentproviders/dataverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 5797aacc..220f147e 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -187,4 +187,4 @@ def fetch(self, spec, output_dir, yield_output=False): @property def content_id(self): """The Dataverse persistent identifier.""" - return self.url + return hashlib.sha256(self.url.encode()).hexdigest() From f6037cab744c85722150de4c232b3aaec64978c2 Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:31:18 +0000 Subject: [PATCH 10/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- repo2docker/contentproviders/dataverse.py | 26 ++++++--- tests/contentproviders/test_dataverse.py | 66 +++++++++++------------ 2 files changed, 51 insertions(+), 41 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 220f147e..125af7a5 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -1,9 +1,9 @@ +import hashlib import json import os import shutil -import hashlib -from urllib.parse import parse_qs, urlparse from typing import List +from urllib.parse import parse_qs, urlparse from ..utils import copytree, deep_get, is_doi from .doi import DoiProvider @@ -120,7 +120,7 @@ def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]: persistent_id = qs["persistentId"][0] # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP elif path.startswith("/dataset.xhtml"): - # https://dataverse.harvard.edu/api/access/datafile/3323458 + # https://dataverse.harvard.edu/api/access/datafile/3323458 persistent_id = qs["persistentId"][0] elif path.startswith("/api/access/datafile"): # What we have here is an entity id, which we can use to get a persistentId @@ -128,17 +128,27 @@ def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]: persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_id) elif parsed_url.path.startswith("/file.xhtml"): file_persistent_id = qs["persistentId"][0] - persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_persistent_id) + persistent_id = self.get_dataset_id_from_file_id( + dataverse_host, file_persistent_id + ) else: - raise ValueError(f"Could not determine persistent id for dataverse URL {url}") + raise ValueError( + f"Could not determine persistent id for dataverse URL {url}" + ) - dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" + dataset_api_url = ( + f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" + ) resp = self._request(dataset_api_url, headers={"accept": "application/json"}) if resp.status_code == 404 and is_ambiguous: # It's possible this is a *file* persistent_id, not a dataset one - persistent_id = self.get_dataset_id_from_file_id(dataverse_host, persistent_id) + persistent_id = self.get_dataset_id_from_file_id( + dataverse_host, persistent_id + ) dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" - resp = self._request(dataset_api_url, headers={"accept": "application/json"}) + resp = self._request( + dataset_api_url, headers={"accept": "application/json"} + ) if resp.status_code == 404: # This persistent id is just not here diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py index 2ab9ec69..cb5e06f3 100644 --- a/tests/contentproviders/test_dataverse.py +++ b/tests/contentproviders/test_dataverse.py @@ -102,41 +102,41 @@ def test_get_persistent_id(url, persistent_id): ( "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", - "doi:10.7910/DVN/6ZXAGT/3YRRYJ" + "doi:10.7910/DVN/6ZXAGT/3YRRYJ", ), { - 
'ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png': '243c6a3dd66bc3c84102829b277ef333', - 'ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png': '2ace6ae9d470dda6cf2f9f9a6588171a', - 'ArchaeoGLOBE-master/analysis/figures/3_trends_global.png': '63ccd0a7b2d20440cd8f418d4ee88c4d', - 'ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png': 'facfaedabeac77c4496d4b9e962a917f', - 'ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png': '8e002e4d50f179fc1808f562b1353588', - 'ArchaeoGLOBE-master/apt.txt': 'b4224032da6c71d48f46c9b78fc6ed77', - 'ArchaeoGLOBE-master/analysis/archaeoglobe.pdf': 'f575be4790efc963ef1bd40d097cc06d', - 'ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd': 'f37d5f7993fde9ebd64d16b20fc22905', - 'ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj': 'd0250e7918993bab1e707358fe5633e0', - 'ArchaeoGLOBE-master/CONDUCT.md': 'f87ef290340322089c32b4e573d8f1e8', - 'ArchaeoGLOBE-master/.circleci/config.yml': '6eaa54073a682b3195d8fab3a9dd8344', - 'ArchaeoGLOBE-master/CONTRIBUTING.md': 'b3a6abfc749dd155a3049f94a855bf9f', - 'ArchaeoGLOBE-master/DESCRIPTION': '745ef979494999e483987de72c0adfbd', - 'ArchaeoGLOBE-master/dockerfile': 'aedce68e5a7d6e79cbb24c9cffeae593', - 'ArchaeoGLOBE-master/.binder/Dockerfile': '7564a41246ba99b60144afb1d3b6d7de', - 'ArchaeoGLOBE-master/.gitignore': '62c1482e4febbd35dc02fb7e2a31246b', - 'ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS': '2aea7748b5586923b0de9d13af58e59d', - 'ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS': '145a9e5dd2c95625626a720b52178b70', - 'ArchaeoGLOBE-master/LICENSE.md': '3aa9d41a92a57944bd4590e004898445', - 'ArchaeoGLOBE-master/analysis/data/derived-data/placeholder': 'd41d8cd98f00b204e9800998ecf8427e', - 'ArchaeoGLOBE-master/.Rbuildignore': 'df15e4fed49abd685b536fef4472b01f', - 'ArchaeoGLOBE-master/README.md': '0b0faabe580c4d76a0e0d64a4f54bca4', - 'ArchaeoGLOBE-master/analysis/data/derived-data/README.md': '547fd1a6e874f6178b1cf525b5b9ae72', - 'ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png': 'd2584352e5442b33e4b23e361ca70fe1', - 'ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png': '513eddfdad01fd01a20263a55ca6dbe3', - 'ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png': 'b16ba0ecd21b326f873209a7e55a8deb', - 'ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png': '05695f9412337a00c1cb6d1757d0ec5c', - 'ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png': '10119f7495d3b8e7ad7f8a0770574f15', - 'ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png': 'b1db7c97f39ccfc3a9e094c3e6307af0', - 'ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png': '30341748324f5f66acadb34c114c3e9d', - } - ) + "ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png": "243c6a3dd66bc3c84102829b277ef333", + "ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png": "2ace6ae9d470dda6cf2f9f9a6588171a", + "ArchaeoGLOBE-master/analysis/figures/3_trends_global.png": "63ccd0a7b2d20440cd8f418d4ee88c4d", + "ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png": "facfaedabeac77c4496d4b9e962a917f", + "ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png": "8e002e4d50f179fc1808f562b1353588", + "ArchaeoGLOBE-master/apt.txt": "b4224032da6c71d48f46c9b78fc6ed77", + "ArchaeoGLOBE-master/analysis/archaeoglobe.pdf": "f575be4790efc963ef1bd40d097cc06d", + "ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd": "f37d5f7993fde9ebd64d16b20fc22905", + "ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj": 
"d0250e7918993bab1e707358fe5633e0", + "ArchaeoGLOBE-master/CONDUCT.md": "f87ef290340322089c32b4e573d8f1e8", + "ArchaeoGLOBE-master/.circleci/config.yml": "6eaa54073a682b3195d8fab3a9dd8344", + "ArchaeoGLOBE-master/CONTRIBUTING.md": "b3a6abfc749dd155a3049f94a855bf9f", + "ArchaeoGLOBE-master/DESCRIPTION": "745ef979494999e483987de72c0adfbd", + "ArchaeoGLOBE-master/dockerfile": "aedce68e5a7d6e79cbb24c9cffeae593", + "ArchaeoGLOBE-master/.binder/Dockerfile": "7564a41246ba99b60144afb1d3b6d7de", + "ArchaeoGLOBE-master/.gitignore": "62c1482e4febbd35dc02fb7e2a31246b", + "ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS": "2aea7748b5586923b0de9d13af58e59d", + "ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS": "145a9e5dd2c95625626a720b52178b70", + "ArchaeoGLOBE-master/LICENSE.md": "3aa9d41a92a57944bd4590e004898445", + "ArchaeoGLOBE-master/analysis/data/derived-data/placeholder": "d41d8cd98f00b204e9800998ecf8427e", + "ArchaeoGLOBE-master/.Rbuildignore": "df15e4fed49abd685b536fef4472b01f", + "ArchaeoGLOBE-master/README.md": "0b0faabe580c4d76a0e0d64a4f54bca4", + "ArchaeoGLOBE-master/analysis/data/derived-data/README.md": "547fd1a6e874f6178b1cf525b5b9ae72", + "ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png": "d2584352e5442b33e4b23e361ca70fe1", + "ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png": "513eddfdad01fd01a20263a55ca6dbe3", + "ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png": "b16ba0ecd21b326f873209a7e55a8deb", + "ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png": "05695f9412337a00c1cb6d1757d0ec5c", + "ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png": "10119f7495d3b8e7ad7f8a0770574f15", + "ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png": "b1db7c97f39ccfc3a9e094c3e6307af0", + "ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png": "30341748324f5f66acadb34c114c3e9d", + }, + ), ], ) def test_fetch(specs: list[str], md5tree): From b854b77be55c4ff20f3b8b1b21e896f405a80b8c Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 17 Dec 2024 11:50:10 -0800 Subject: [PATCH 11/19] Fix tests --- repo2docker/contentproviders/dataverse.py | 87 ++++++++++++++--------- tests/contentproviders/test_dataverse.py | 40 ++++------- 2 files changed, 64 insertions(+), 63 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 125af7a5..3cf8cb20 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -2,7 +2,7 @@ import json import os import shutil -from typing import List +from typing import List, Tuple from urllib.parse import parse_qs, urlparse from ..utils import copytree, deep_get, is_doi @@ -67,53 +67,44 @@ def detect(self, spec, ref=None, extra_args=None): # # We don't know exactly what kind of dataverse object this is, but # that can be figured out during fetch as needed - return {"host": host, "url": url} + return url - def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str: + def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str: """ Return the persistent_id (DOI) that a given file_id (int or doi) belongs to """ if file_id.isdigit(): # the file_id is an integer, rather than a persistent id (DOI) - api_url = f"{host}/api/files/{file_id}?returnDatasetVersion=true" + api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true" else: # the file_id is a doi itself - api_url = 
f"{host}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true" + api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true" resp = self._request(api_url) if resp.status_code == 404: - raise ValueError(f"File with id {file_id} not found in {host}") + raise ValueError(f"File with id {file_id} not found in {base_url}") resp.raise_for_status() data = resp.json()["data"] return data["datasetVersion"]["datasetPersistentId"] - def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]: + def parse_dataverse_url(self, url: str) -> Tuple[str, bool]: """ - Return a list of dataFiles for given persistent_id - - Supports the following *dataset* URL styles: - - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP - - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP + Parse the persistent id out of a dataverse URL - Supports the following *file* URL styles: - - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458 - - Supports a subset of the following *file* URL styles: - - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ + persistent_id can point to either a dataset or a file. The second return + value is False if we know that the persistent id is a file or a dataset, + and True if it is ambiguous. - If a URL can not be parsed, throw an exception + Raises a ValueError if we can not parse the url """ - - parsed_url = urlparse(url) + parsed_url= urlparse(url) path = parsed_url.path qs = parse_qs(parsed_url.query) - dataverse_host = f"{parsed_url.scheme}://{parsed_url.netloc}" - url_kind = None - persistent_id = None - is_ambiguous = False + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + is_ambiguous = False # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP if path.startswith("/citation"): is_ambiguous = True @@ -124,35 +115,59 @@ def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]: persistent_id = qs["persistentId"][0] elif path.startswith("/api/access/datafile"): # What we have here is an entity id, which we can use to get a persistentId - file_id = os.path.basename(parsed_url.path) - persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_id) + file_id = os.path.basename(path) + persistent_id = self.get_dataset_id_from_file_id(base_url, file_id) elif parsed_url.path.startswith("/file.xhtml"): file_persistent_id = qs["persistentId"][0] persistent_id = self.get_dataset_id_from_file_id( - dataverse_host, file_persistent_id + base_url, file_persistent_id ) else: raise ValueError( f"Could not determine persistent id for dataverse URL {url}" ) + return persistent_id, is_ambiguous + + def get_datafiles(self, url: str) -> List[dict]: + """ + Return a list of dataFiles for given persistent_id + + Supports the following *dataset* URL styles: + - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP + - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP + + Supports the following *file* URL styles: + - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458 + + Supports a subset of the following *file* URL styles: + - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ + + If a URL can not be parsed, throw an exception + """ + + parsed_url = urlparse(url) + base_url = 
f"{parsed_url.scheme}://{parsed_url.netloc}" + + persistent_id, is_ambiguous = self.parse_dataverse_url(url) + dataset_api_url = ( - f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" + f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}" ) resp = self._request(dataset_api_url, headers={"accept": "application/json"}) if resp.status_code == 404 and is_ambiguous: # It's possible this is a *file* persistent_id, not a dataset one persistent_id = self.get_dataset_id_from_file_id( - dataverse_host, persistent_id + base_url, persistent_id ) - dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" + dataset_api_url = f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}" resp = self._request( dataset_api_url, headers={"accept": "application/json"} ) if resp.status_code == 404: # This persistent id is just not here - raise ValueError(f"{persistent_id} on {dataverse_host} is not found") + raise ValueError(f"{persistent_id} on {base_url} is not found") # We already handled 404, raise error for everything else resp.raise_for_status() @@ -163,15 +178,17 @@ def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]: def fetch(self, spec, output_dir, yield_output=False): """Fetch and unpack a Dataverse dataset.""" - url = spec["url"] - host = spec["host"] + url = spec + parsed_url = urlparse(url) + # FIXME: Support determining API URL better + base_url = f'{parsed_url.scheme}://{parsed_url.netloc}' yield f"Fetching Dataverse record {url}.\n" - for fobj in self.get_datafiles(host["url"], url): + for fobj in self.get_datafiles(url): file_url = ( # without format=original you get the preservation format (plain text, tab separated) - f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original' + f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original' ) filename = fobj["label"] original_filename = fobj["dataFile"].get("originalFileName", None) diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py index cb5e06f3..7b6d1f60 100644 --- a/tests/contentproviders/test_dataverse.py +++ b/tests/contentproviders/test_dataverse.py @@ -6,48 +6,28 @@ from repo2docker.contentproviders import Dataverse -test_dv = Dataverse() -harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse") -cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data") - - @pytest.mark.parametrize( ("doi", "resolved"), [ ( "doi:10.7910/DVN/6ZXAGT/3YRRYJ", - { - "host": harvard_dv, - "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", - }, + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", ), ( "10.7910/DVN/6ZXAGT/3YRRYJ", - { - "host": harvard_dv, - "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", - }, + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", ), ( "10.7910/DVN/TJCLKP", - { - "host": harvard_dv, - "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", - }, + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", ), ( "https://dataverse.harvard.edu/api/access/datafile/3323458", - { - "host": harvard_dv, - "url": "https://dataverse.harvard.edu/api/access/datafile/3323458", - }, + "https://dataverse.harvard.edu/api/access/datafile/3323458", ), ( 
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", - { - "host": cimmyt_dv, - "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", - }, + "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", ), ("/some/random/string", None), ("https://example.com/path/here", None), @@ -60,28 +40,32 @@ def test_detect(doi, resolved): @pytest.mark.parametrize( - ("url", "persistent_id"), + ("url", "persistent_id", "is_ambiguous"), [ ( "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT", + False ), ( "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP", + True ), ( "https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR", + False ), ( "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016", + False ), ], ) -def test_get_persistent_id(url, persistent_id): - assert Dataverse().get_persistent_id_from_url(url) == persistent_id +def test_get_persistent_id(url, persistent_id, is_ambiguous): + assert Dataverse().parse_dataverse_url(url) == (persistent_id, is_ambiguous) @pytest.mark.parametrize( From f9e3d706dbb36df7d43c91e9d67ec74200ea37b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 19:51:31 +0000 Subject: [PATCH 12/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- repo2docker/contentproviders/dataverse.py | 10 +++++----- tests/contentproviders/test_dataverse.py | 9 +++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 3cf8cb20..e786fe24 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -99,7 +99,7 @@ def parse_dataverse_url(self, url: str) -> Tuple[str, bool]: Raises a ValueError if we can not parse the url """ - parsed_url= urlparse(url) + parsed_url = urlparse(url) path = parsed_url.path qs = parse_qs(parsed_url.query) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" @@ -157,10 +157,10 @@ def get_datafiles(self, url: str) -> List[dict]: resp = self._request(dataset_api_url, headers={"accept": "application/json"}) if resp.status_code == 404 and is_ambiguous: # It's possible this is a *file* persistent_id, not a dataset one - persistent_id = self.get_dataset_id_from_file_id( - base_url, persistent_id + persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id) + dataset_api_url = ( + f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}" ) - dataset_api_url = f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}" resp = self._request( dataset_api_url, headers={"accept": "application/json"} ) @@ -181,7 +181,7 @@ def fetch(self, spec, output_dir, yield_output=False): url = spec parsed_url = urlparse(url) # FIXME: Support determining API URL better - base_url = f'{parsed_url.scheme}://{parsed_url.netloc}' + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" yield f"Fetching Dataverse record {url}.\n" diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py index 7b6d1f60..9ca25852 100644 --- a/tests/contentproviders/test_dataverse.py +++ b/tests/contentproviders/test_dataverse.py @@ -6,6 +6,7 @@ from repo2docker.contentproviders import Dataverse + @pytest.mark.parametrize( ("doi", 
"resolved"), [ @@ -45,22 +46,22 @@ def test_detect(doi, resolved): ( "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT", - False + False, ), ( "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP", - True + True, ), ( "https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR", - False + False, ), ( "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016", - False + False, ), ], ) From 53fba84e18d1af7edadbceb01462c9f916c283e7 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 17 Dec 2024 11:52:45 -0800 Subject: [PATCH 13/19] Add note about supporting /citation --- repo2docker/contentproviders/dataverse.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index e786fe24..fb050800 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -137,11 +137,10 @@ def get_datafiles(self, url: str) -> List[dict]: - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP - Supports the following *file* URL styles: + Supports the following *file* URL styles (entire dataset file belongs to will be fetched): - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458 - - Supports a subset of the following *file* URL styles: - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ + - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ If a URL can not be parsed, throw an exception """ From f4d58dcd2fee7a1d4e9976133b717d3cd61765db Mon Sep 17 00:00:00 2001 From: Yuvi Panda Date: Tue, 17 Dec 2024 11:54:10 -0800 Subject: [PATCH 14/19] Describe what kind of DOI is being returned Co-authored-by: Philip Durbin --- repo2docker/contentproviders/dataverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index fb050800..0d652c08 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -71,7 +71,7 @@ def detect(self, spec, ref=None, extra_args=None): def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str: """ - Return the persistent_id (DOI) that a given file_id (int or doi) belongs to + Return the persistent_id (DOI) of a dataset that a given file_id (int or doi) belongs to """ if file_id.isdigit(): # the file_id is an integer, rather than a persistent id (DOI) From d71efb892db240759d5a09749c9b0e2808c370f0 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 17 Dec 2024 12:52:47 -0800 Subject: [PATCH 15/19] Fix figshare unit test --- tests/unit/contentproviders/test_figshare.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index b4f42de3..760c0e48 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -21,16 +21,9 @@ @pytest.mark.parametrize("link,expected", test_content_ids) -def test_content_id(link, expected, requests_mock): - def mocked_get(req, context): - if req.url.startswith("https://doi.org"): - 
From f7dfff18762337b355cb7e911ee6b73c4e806cab Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Tue, 17 Dec 2024 12:56:36 -0800
Subject: [PATCH 16/19] Fix hydroshare tests

---
 tests/unit/contentproviders/test_hydroshare.py | 23 ++++---------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py
index 0c8acfd1..41e234cd 100755
--- a/tests/unit/contentproviders/test_hydroshare.py
+++ b/tests/unit/contentproviders/test_hydroshare.py
@@ -34,20 +34,14 @@ def doi_resolver(req, context):
 }
 
 
-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=hydroshare_data)
-    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
-
+def test_content_id():
     hydro = Hydroshare()
 
     hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
-
+    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408"
 
-def test_detect_hydroshare(requests_mock):
-    requests_mock.get(re.compile("https://"), json=hydroshare_data)
-    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
 
+def test_detect_hydroshare():
     # valid Hydroshare DOIs trigger this content provider
     expected = {
         "host": {
@@ -59,7 +53,7 @@ def test_detect_hydroshare(requests_mock):
             "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
         },
         "resource": "b8f6eae9d89241cf8b5904033460af61",
-        "version": "1569427757",
+        "version": "1585005408",
     }
 
     assert (
@@ -68,16 +62,10 @@ def test_detect_hydroshare(requests_mock):
         )
         == expected
     )
-    # assert a call to urlopen was called to fetch version
-    assert requests_mock.call_count == 1
-    requests_mock.reset_mock()
 
     assert (
         Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
     )
-    # assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()
 
     assert (
         Hydroshare().detect(
@@ -85,9 +73,6 @@ def test_detect_hydroshare(requests_mock):
         )
         == expected
     )
-    # assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()
 
     # Don't trigger the Hydroshare content provider
     assert Hydroshare().detect("/some/path/here") is None

From 3be6ca99c21473bb7a80a0186a7c8f54c1e520ab Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Tue, 17 Dec 2024 12:58:01 -0800
Subject: [PATCH 17/19] Fix zenodo tests

---
 tests/unit/contentproviders/test_zenodo.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py
index 38737ae1..a1e1226f 100644
--- a/tests/unit/contentproviders/test_zenodo.py
+++ b/tests/unit/contentproviders/test_zenodo.py
@@ -21,18 +21,7 @@
 }
 
 
-def doi_resolver(req, context):
-    resp = doi_responses.get(req.url)
-    # doi responses are redirects
-    if resp is not None:
-        context.status_code = 302
-        context.headers["Location"] = resp
-    return resp
-
-
-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
-
+def test_content_id():
     zen = Zenodo()
     zen.detect("10.5281/zenodo.3232985")
     assert zen.content_id == "3232985"
@@ -60,15 +49,11 @@
 
 
 @pytest.mark.parametrize("test_input,expected", test_hosts)
-def test_detect_zenodo(test_input, expected, requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
+def test_detect_zenodo(test_input, expected):
     # valid Zenodo DOIs trigger this content provider
     assert Zenodo().detect(test_input[0]) == expected
     assert Zenodo().detect(test_input[1]) == expected
     assert Zenodo().detect(test_input[2]) == expected
-    # only two of the three calls above have to resolve a DOI (2 req per doi resolution)
-    assert requests_mock.call_count == 4
-    requests_mock.reset_mock()
 
     # Don't trigger the Zenodo content provider
     assert Zenodo().detect("/some/path/here") is None
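With the requests_mock plumbing gone, these unit tests now exercise the real resolution chain end to end. Roughly what the Zenodo case does under the hood (a sketch only, assuming network access to doi.org and zenodo.org; the DOI and expected id are taken from the test above):

    from repo2docker.contentproviders import Zenodo

    zen = Zenodo()
    # detect() resolves the DOI against the live doi.org API and matches
    # the resulting URL against the known Zenodo hosts
    zen.detect("10.5281/zenodo.3232985")
    print(zen.content_id)  # "3232985"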
From 60c0d70f373fc1ec1ee9542709757b681717e8ac Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Tue, 17 Dec 2024 13:10:54 -0800
Subject: [PATCH 18/19] Fix doi provider test

We no longer follow redirects, so this is the canonical URL
---
 tests/unit/contentproviders/test_doi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/contentproviders/test_doi.py b/tests/unit/contentproviders/test_doi.py
index 301b50be..58cae5f5 100644
--- a/tests/unit/contentproviders/test_doi.py
+++ b/tests/unit/contentproviders/test_doi.py
@@ -33,7 +33,7 @@ def test_url_headers(requests_mock):
 @pytest.mark.parametrize(
     "requested_doi, expected",
     [
-        ("10.5281/zenodo.3242074", "https://zenodo.org/records/3242074"),
+        ("10.5281/zenodo.3242074", "https://zenodo.org/record/3242074"),
         # Unresolving DOI:
         ("10.1/1234", "10.1/1234"),
     ],

From e48f5b773e3687faf9b8fb3ebb5846bd8a3686cd Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Thu, 19 Dec 2024 12:08:10 -0800
Subject: [PATCH 19/19] Switch back to using DOI as persistent_id

---
 repo2docker/contentproviders/dataverse.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 0d652c08..248a3f3a 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -58,9 +58,6 @@ def detect(self, spec, ref=None, extra_args=None):
         if host is None:
             return
 
-        # Used only for content_id
-        self.url = url
-
         # At this point, we *know* this is a dataverse URL, because:
         # 1. The DOI resolved to a particular host (if using DOI)
         # 2. The host is in the list of known dataverse installations
@@ -171,6 +168,10 @@ def get_datafiles(self, url: str) -> List[dict]:
         # We already handled 404, raise error for everything else
         resp.raise_for_status()
 
+        # We know the exact persistent_id of the dataset we fetched now
+        # Save it for use as content_id
+        self.persistent_id = persistent_id
+
         data = resp.json()["data"]
         return data["latestVersion"]["files"]
 
@@ -212,5 +213,9 @@ def fetch(self, spec, output_dir, yield_output=False):
 
     @property
     def content_id(self):
-        """The Dataverse persistent identifier."""
-        return hashlib.sha256(self.url.encode()).hexdigest()
+        """
+        The Dataverse persistent identifier.
+
+        Only valid if called after a successful fetch
+        """
+        return self.persistent_id
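The net effect of this last patch is an ordering requirement on callers: content_id is only meaningful after fetch() has run, because get_datafiles() records the dataset's persistent_id along the way. A minimal sketch of the full lifecycle, assuming network access (the DOI is the one used in the tests above; illustrative, not part of the patch):

    from tempfile import TemporaryDirectory

    from repo2docker.contentproviders import Dataverse

    dv = Dataverse()
    url = dv.detect("10.7910/DVN/TJCLKP")  # DOI resolves to a dataverse URL

    with TemporaryDirectory() as d:
        for msg in dv.fetch(url, d):
            pass

    # Safe now: fetch() populated self.persistent_id via get_datafiles()
    print(dv.content_id)  # e.g. "doi:10.7910/DVN/TJCLKP"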