diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6f7001d2c..300458514 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -60,6 +60,7 @@ jobs:
           - r
           - unit
           - venv
+          - contentproviders
         include:
           # The actions/setup-python action with Python version 3.6 isn't
           # possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04
diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 9054f53ce..248a3f3a2 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -1,9 +1,11 @@
+import hashlib
 import json
 import os
 import shutil
-from urllib.parse import parse_qs, urlparse, urlunparse
+from typing import List, Tuple
+from urllib.parse import parse_qs, urlparse
 
-from ..utils import copytree, deep_get
+from ..utils import copytree, deep_get, is_doi
 from .doi import DoiProvider
 
@@ -23,10 +25,11 @@ def __init__(self):
             self.hosts = json.load(fp)["installations"]
         super().__init__()
 
-    def detect(self, doi, ref=None, extra_args=None):
-        """Trigger this provider for things that resolve to a Dataverse dataset.
+    def detect(self, spec, ref=None, extra_args=None):
+        """
+        Detect if the given spec is hosted on Dataverse
 
-        Handles:
+        The spec can be:
         - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
         - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
         - URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ def detect(self, doi, ref=None, extra_args=None):
         - https://dataverse.harvard.edu/api/access/datafile/3323458
         - doi:10.7910/DVN/6ZXAGT
         - doi:10.7910/DVN/6ZXAGT/3YRRYJ
         """
-        url = self.doi2url(doi)
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
 
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)
@@ -53,57 +58,137 @@ def detect(self, doi, ref=None, extra_args=None):
         if host is None:
             return
 
-        query_args = parse_qs(parsed_url.query)
-        # Corner case handling
-        if parsed_url.path.startswith("/file.xhtml"):
-            # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
-            # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
-            new_doi = doi.rsplit("/", 1)[0]
-            if new_doi == doi:
-                # tough luck :( Avoid inifite recursion and exit.
-                return
-            return self.detect(new_doi)
-        elif parsed_url.path.startswith("/api/access/datafile"):
-            # Raw url pointing to a datafile is a typical output from an External Tool integration
-            entity_id = os.path.basename(parsed_url.path)
-            search_query = "q=entityId:" + entity_id + "&type=file"
-            # Knowing the file identifier query search api to get parent dataset
-            search_url = urlunparse(
-                parsed_url._replace(path="/api/search", query=search_query)
+        # At this point, we *know* this is a dataverse URL, because:
+        # 1. The DOI resolved to a particular host (if using DOI)
+        # 2. The host is in the list of known dataverse installations
+        #
+        # We don't know exactly what kind of dataverse object this is, but
+        # that can be figured out during fetch as needed
+        return url
+
+    def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
+        """
+        Return the persistent_id (DOI) of the dataset that a given file_id (int or doi) belongs to
+        """
+        if file_id.isdigit():
+            # the file_id is an integer, rather than a persistent id (DOI)
+            api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
+        else:
+            # the file_id is a doi itself
+            api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
+
+        resp = self._request(api_url)
+        if resp.status_code == 404:
+            raise ValueError(f"File with id {file_id} not found in {base_url}")
+
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+        return data["datasetVersion"]["datasetPersistentId"]
+
+    def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
+        """
+        Parse the persistent id out of a dataverse URL
+
+        persistent_id can point to either a dataset or a file. The second return
+        value is True if it is ambiguous whether the persistent id refers to a
+        file or a dataset, and False if we know which one it is.
+
+        Raises a ValueError if we cannot parse the URL
+        """
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        qs = parse_qs(parsed_url.query)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        is_ambiguous = False
+        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        if path.startswith("/citation"):
+            is_ambiguous = True
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+        elif path.startswith("/dataset.xhtml"):
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/api/access/datafile/3323458
+        elif path.startswith("/api/access/datafile"):
+            # What we have here is an entity id, which we can use to get a persistentId
+            file_id = os.path.basename(path)
+            persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
+        elif parsed_url.path.startswith("/file.xhtml"):
+            file_persistent_id = qs["persistentId"][0]
+            persistent_id = self.get_dataset_id_from_file_id(
+                base_url, file_persistent_id
+            )
+        else:
+            raise ValueError(
+                f"Could not determine persistent id for dataverse URL {url}"
             )
-            self.log.debug("Querying Dataverse: " + search_url)
-            data = self.urlopen(search_url).json()["data"]
-            if data["count_in_response"] != 1:
-                self.log.debug(
-                    f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
-                )
-                return
-
-            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
-        elif (
-            parsed_url.path.startswith("/dataset.xhtml")
-            and "persistentId" in query_args
-        ):
-            self.record_id = deep_get(query_args, "persistentId.0")
-
-        if hasattr(self, "record_id"):
-            return {"record": self.record_id, "host": host}
+
+        return persistent_id, is_ambiguous
+
+    def get_datafiles(self, url: str) -> List[dict]:
+        """
+        Return a list of dataFiles for a given persistent_id
+
+        Supports the following *dataset* URL styles:
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+
+        Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
+        - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
+        - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        If a URL cannot be parsed, raise a ValueError
+        """
+
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        persistent_id, is_ambiguous = self.parse_dataverse_url(url)
+
+        dataset_api_url = (
+            f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+        )
+        resp = self._request(dataset_api_url, headers={"accept": "application/json"})
+        if resp.status_code == 404 and is_ambiguous:
+            # It's possible this is a *file* persistent_id, not a dataset one
+            persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
+            dataset_api_url = (
+                f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+            )
+            resp = self._request(
+                dataset_api_url, headers={"accept": "application/json"}
+            )
+
+        if resp.status_code == 404:
+            # This persistent id is just not here
+            raise ValueError(f"{persistent_id} on {base_url} is not found")
+
+        # We already handled 404, raise error for everything else
+        resp.raise_for_status()
+
+        # We now know the exact persistent_id of the dataset we fetched
+        # Save it for use as content_id
+        self.persistent_id = persistent_id
+
+        data = resp.json()["data"]
+
+        return data["latestVersion"]["files"]
 
     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
-        record_id = spec["record"]
-        host = spec["host"]
-
-        yield f"Fetching Dataverse record {record_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
+        url = spec
+        parsed_url = urlparse(url)
+        # FIXME: Support determining API URL better
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-        resp = self.urlopen(url, headers={"accept": "application/json"})
-        record = resp.json()["data"]
+        yield f"Fetching Dataverse record {url}.\n"
 
-        for fobj in deep_get(record, "latestVersion.files"):
+        for fobj in self.get_datafiles(url):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
-                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
+                f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
             )
             filename = fobj["label"]
             original_filename = fobj["dataFile"].get("originalFileName", None)
@@ -128,5 +213,9 @@ def fetch(self, spec, output_dir, yield_output=False):
 
     @property
     def content_id(self):
-        """The Dataverse persistent identifier."""
-        return self.record_id
+        """
+        The Dataverse persistent identifier.
+
+        Only valid if called after a successful fetch
+        """
+        return self.persistent_id
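
Note: the reworked provider splits detection from fetching. detect() only confirms
that a spec resolves to a known Dataverse installation and returns the resolved URL
unchanged; fetch() later turns that URL into a dataset persistent_id and file list
via get_datafiles(). A minimal sketch of the resulting flow, using only the public
API shown in this diff (the DOI and the output path are illustrative):

    from repo2docker.contentproviders import Dataverse

    dv = Dataverse()
    url = dv.detect("doi:10.7910/DVN/TJCLKP")  # resolved Dataverse URL, or None
    if url is not None:
        # fetch() is a generator; iterate it to drive the download
        for msg in dv.fetch(url, "/tmp/dataset"):
            print(msg, end="")
        # content_id exposes the persistent_id recorded by the successful fetch
        print(dv.content_id)
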
- self.log.error(f"DOI {doi} does not resolve: {e}") + normalized_doi = normalize_doi(doi) + + # Use the doi.org resolver API + # documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api + req_url = f"https://doi.org/api/handles/{normalized_doi}" + resp = self._request(req_url) + if resp.status_code == 404: + # Not a doi, return what we were passed in + return doi + elif resp.status_code == 200: + data = resp.json() + # Pick the first URL we find from the doi response + for v in data["values"]: + if v["type"] == "URL": + return v["data"]["value"] + + # No URLs found for this doi, what do we do? + self.log.error("DOI {normalized_doi} doesn't point to any URLs") + return doi + else: + # If we get any other status codes, raise error raise - return resp.url else: # Just return what is actulally just a URL return doi diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py new file mode 100644 index 000000000..9ca25852f --- /dev/null +++ b/tests/contentproviders/test_dataverse.py @@ -0,0 +1,143 @@ +import hashlib +import os +from tempfile import TemporaryDirectory + +import pytest + +from repo2docker.contentproviders import Dataverse + + +@pytest.mark.parametrize( + ("doi", "resolved"), + [ + ( + "doi:10.7910/DVN/6ZXAGT/3YRRYJ", + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + ), + ( + "10.7910/DVN/6ZXAGT/3YRRYJ", + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + ), + ( + "10.7910/DVN/TJCLKP", + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", + ), + ( + "https://dataverse.harvard.edu/api/access/datafile/3323458", + "https://dataverse.harvard.edu/api/access/datafile/3323458", + ), + ( + "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", + "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", + ), + ("/some/random/string", None), + ("https://example.com/path/here", None), + # Non dataverse DOIs + ("https://doi.org/10.21105/joss.01277", None), + ], +) +def test_detect(doi, resolved): + assert Dataverse().detect(doi) == resolved + + +@pytest.mark.parametrize( + ("url", "persistent_id", "is_ambiguous"), + [ + ( + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + "doi:10.7910/DVN/6ZXAGT", + False, + ), + ( + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", + "doi:10.7910/DVN/TJCLKP", + True, + ), + ( + "https://dataverse.harvard.edu/api/access/datafile/3323458", + "doi:10.7910/DVN/3MJ7IR", + False, + ), + ( + "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", + "hdl:11529/10016", + False, + ), + ], +) +def test_get_persistent_id(url, persistent_id, is_ambiguous): + assert Dataverse().parse_dataverse_url(url) == (persistent_id, is_ambiguous) + + +@pytest.mark.parametrize( + ("specs", "md5tree"), + [ + ( + ( + "doi:10.7910/DVN/TJCLKP", + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", + ), + { + "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694", + "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917", + "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e", + }, + ), + ( + ( + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", + "doi:10.7910/DVN/6ZXAGT/3YRRYJ", + ), + { + 
"ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png": "243c6a3dd66bc3c84102829b277ef333", + "ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png": "2ace6ae9d470dda6cf2f9f9a6588171a", + "ArchaeoGLOBE-master/analysis/figures/3_trends_global.png": "63ccd0a7b2d20440cd8f418d4ee88c4d", + "ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png": "facfaedabeac77c4496d4b9e962a917f", + "ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png": "8e002e4d50f179fc1808f562b1353588", + "ArchaeoGLOBE-master/apt.txt": "b4224032da6c71d48f46c9b78fc6ed77", + "ArchaeoGLOBE-master/analysis/archaeoglobe.pdf": "f575be4790efc963ef1bd40d097cc06d", + "ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd": "f37d5f7993fde9ebd64d16b20fc22905", + "ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj": "d0250e7918993bab1e707358fe5633e0", + "ArchaeoGLOBE-master/CONDUCT.md": "f87ef290340322089c32b4e573d8f1e8", + "ArchaeoGLOBE-master/.circleci/config.yml": "6eaa54073a682b3195d8fab3a9dd8344", + "ArchaeoGLOBE-master/CONTRIBUTING.md": "b3a6abfc749dd155a3049f94a855bf9f", + "ArchaeoGLOBE-master/DESCRIPTION": "745ef979494999e483987de72c0adfbd", + "ArchaeoGLOBE-master/dockerfile": "aedce68e5a7d6e79cbb24c9cffeae593", + "ArchaeoGLOBE-master/.binder/Dockerfile": "7564a41246ba99b60144afb1d3b6d7de", + "ArchaeoGLOBE-master/.gitignore": "62c1482e4febbd35dc02fb7e2a31246b", + "ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS": "2aea7748b5586923b0de9d13af58e59d", + "ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS": "145a9e5dd2c95625626a720b52178b70", + "ArchaeoGLOBE-master/LICENSE.md": "3aa9d41a92a57944bd4590e004898445", + "ArchaeoGLOBE-master/analysis/data/derived-data/placeholder": "d41d8cd98f00b204e9800998ecf8427e", + "ArchaeoGLOBE-master/.Rbuildignore": "df15e4fed49abd685b536fef4472b01f", + "ArchaeoGLOBE-master/README.md": "0b0faabe580c4d76a0e0d64a4f54bca4", + "ArchaeoGLOBE-master/analysis/data/derived-data/README.md": "547fd1a6e874f6178b1cf525b5b9ae72", + "ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png": "d2584352e5442b33e4b23e361ca70fe1", + "ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png": "513eddfdad01fd01a20263a55ca6dbe3", + "ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png": "b16ba0ecd21b326f873209a7e55a8deb", + "ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png": "05695f9412337a00c1cb6d1757d0ec5c", + "ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png": "10119f7495d3b8e7ad7f8a0770574f15", + "ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png": "b1db7c97f39ccfc3a9e094c3e6307af0", + "ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png": "30341748324f5f66acadb34c114c3e9d", + }, + ), + ], +) +def test_fetch(specs: list[str], md5tree): + dv = Dataverse() + + for spec in specs: + with TemporaryDirectory() as d: + output = [] + for l in dv.fetch(dv.detect(spec), d): + output.append(l) + + # Verify md5 sum of the files we expect to find + # We are using md5 instead of something more secure because that is what + # dataverse itself uses + for subpath, expected_sha in md5tree.items(): + with open(os.path.join(d, subpath), "rb") as f: + h = hashlib.md5() + h.update(f.read()) + assert h.hexdigest() == expected_sha diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py deleted file mode 100644 index abdfe456d..000000000 --- a/tests/unit/contentproviders/test_dataverse.py +++ /dev/null @@ -1,160 +0,0 @@ -import json -import os -import re -from io 
-from io import BytesIO
-from tempfile import TemporaryDirectory
-from unittest.mock import patch
-from urllib.parse import urlsplit
-from urllib.request import Request, urlopen
-
-import pytest
-
-from repo2docker.contentproviders import Dataverse
-
-test_dv = Dataverse()
-harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
-cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
-test_hosts = [
-    (
-        [
-            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
-            "10.7910/DVN/6ZXAGT",
-            "https://dataverse.harvard.edu/api/access/datafile/3323458",
-            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
-        ],
-        [
-            {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
-            {"host": cimmyt_dv, "record": "hdl:11529/10016"},
-        ],
-    )
-]
-doi_responses = {
-    "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
-        "https://dataverse.harvard.edu/file.xhtml"
-        "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
-    ),
-    "https://doi.org/10.7910/DVN/6ZXAGT": (
-        "https://dataverse.harvard.edu/dataset.xhtml"
-        "?persistentId=doi:10.7910/DVN/6ZXAGT"
-    ),
-    "https://dataverse.harvard.edu/api/access/datafile/3323458": (
-        "https://dataverse.harvard.edu/api/access/datafile/3323458"
-    ),
-    "https://doi.org/10.21105/joss.01277": (
-        "https://joss.theoj.org/papers/10.21105/joss.01277"
-    ),
-}
-
-
-@pytest.mark.parametrize("test_input, expected", test_hosts)
-def test_detect_dataverse(test_input, expected, requests_mock):
-    def doi_resolver(req, context):
-        resp = doi_responses.get(req.url)
-        # doi responses are redirects
-        if resp is not None:
-            context.status_code = 302
-            context.headers["Location"] = resp
-        return resp
-
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
-    requests_mock.get(
-        "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
-        json={
-            "data": {
-                "count_in_response": 1,
-                "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
-            }
-        },
-    )
-
-    assert requests_mock.call_count == 0
-    # valid Dataverse DOIs trigger this content provider
-    assert Dataverse().detect(test_input[0]) == expected[0]
-    # 4: doi resolution (302), File, doi resolution (302), then dataset
-    assert requests_mock.call_count == 4
-    requests_mock.reset_mock()
-
-    assert Dataverse().detect(test_input[1]) == expected[0]
-    # 2: doi (302), dataset
-    assert requests_mock.call_count == 2
-    requests_mock.reset_mock()
-
-    assert Dataverse().detect(test_input[2]) == expected[0]
-    # 1: datafile (search dataverse for the file id)
-    assert requests_mock.call_count == 1
-    requests_mock.reset_mock()
-
-    assert Dataverse().detect(test_input[3]) == expected[1]
-    requests_mock.reset_mock()
-
-    # Don't trigger the Dataverse content provider
-    assert Dataverse().detect("/some/path/here") is None
-    assert Dataverse().detect("https://example.com/path/here") is None
-    # don't handle DOIs that aren't from Dataverse
-    assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
-
-
-@pytest.fixture
-def dv_files(tmpdir):
-    f1 = tmpdir.join("some-file.txt")
-    f1.write("some content")
-
-    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
-    f2.write("some other content")
-
-    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
-    f3.write("yet another content")
-
-    return [f1, f2, f3]
-
-
-def test_dataverse_fetch(dv_files, requests_mock):
-    mock_response = {
-        "data": {
-            "latestVersion": {
-                "files": [
-                    {"dataFile": {"id": 1}, "label": "some-file.txt"},
-                    {
-                        "dataFile": {"id": 2},
-                        "label": "some-other-file.txt",
"directoryLabel": "directory", - }, - { - "dataFile": {"id": 3}, - "label": "the-other-file.txt", - "directoryLabel": "directory/subdirectory", - }, - ] - } - } - } - - spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"} - - def mock_filecontent(req, context): - parts = urlsplit(req.url) - file_no = int(parts.path.split("/")[-1]) - 1 - return open(dv_files[file_no], "rb").read() - - requests_mock.get( - "https://dataverse.harvard.edu/api/datasets/" - ":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT", - json=mock_response, - ) - requests_mock.get( - re.compile("https://dataverse.harvard.edu/api/access/datafile"), - content=mock_filecontent, - ) - - dv = Dataverse() - - with TemporaryDirectory() as d: - output = [] - for l in dv.fetch(spec, d): - output.append(l) - unpacked_files = set(os.listdir(d)) - expected = {"directory", "some-file.txt"} - assert expected == unpacked_files - assert os.path.isfile( - os.path.join(d, "directory", "subdirectory", "the-other-file.txt") - ) diff --git a/tests/unit/contentproviders/test_doi.py b/tests/unit/contentproviders/test_doi.py index 301b50beb..58cae5f5b 100644 --- a/tests/unit/contentproviders/test_doi.py +++ b/tests/unit/contentproviders/test_doi.py @@ -33,7 +33,7 @@ def test_url_headers(requests_mock): @pytest.mark.parametrize( "requested_doi, expected", [ - ("10.5281/zenodo.3242074", "https://zenodo.org/records/3242074"), + ("10.5281/zenodo.3242074", "https://zenodo.org/record/3242074"), # Unresolving DOI: ("10.1/1234", "10.1/1234"), ], diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index b4f42de31..760c0e487 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -21,16 +21,9 @@ @pytest.mark.parametrize("link,expected", test_content_ids) -def test_content_id(link, expected, requests_mock): - def mocked_get(req, context): - if req.url.startswith("https://doi.org"): - context.status_code = 302 - context.headers["Location"] = link - return link - - requests_mock.get(re.compile("https://"), text=mocked_get) +def test_content_id(link, expected): fig = Figshare() - fig.detect("10.6084/m9.figshare.9782777") + fig.detect(link) assert fig.content_id == expected diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py index 0c8acfd14..41e234cd1 100755 --- a/tests/unit/contentproviders/test_hydroshare.py +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -34,20 +34,14 @@ def doi_resolver(req, context): } -def test_content_id(requests_mock): - requests_mock.get(re.compile("https://"), json=hydroshare_data) - requests_mock.get(re.compile("https://doi.org"), json=doi_resolver) - +def test_content_id(): hydro = Hydroshare() hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") - assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757" - + assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408" -def test_detect_hydroshare(requests_mock): - requests_mock.get(re.compile("https://"), json=hydroshare_data) - requests_mock.get(re.compile("https://doi.org"), json=doi_resolver) +def test_detect_hydroshare(): # valid Hydroshare DOIs trigger this content provider expected = { "host": { @@ -59,7 +53,7 @@ def test_detect_hydroshare(requests_mock): "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", }, "resource": "b8f6eae9d89241cf8b5904033460af61", - "version": "1569427757", + "version": "1585005408", } assert ( @@ -68,16 
@@ -68,16 +62,10 @@
         )
         == expected
     )
-    # assert a call to urlopen was called to fetch version
-    assert requests_mock.call_count == 1
-    requests_mock.reset_mock()
 
     assert (
         Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
     )
-    # assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()
 
     assert (
         Hydroshare().detect(
@@ -85,9 +73,6 @@
         )
         == expected
     )
-    # assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()
 
     # Don't trigger the Hydroshare content provider
     assert Hydroshare().detect("/some/path/here") is None
diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py
index 38737ae1a..a1e1226fb 100644
--- a/tests/unit/contentproviders/test_zenodo.py
+++ b/tests/unit/contentproviders/test_zenodo.py
@@ -21,18 +21,7 @@
 }
 
-def doi_resolver(req, context):
-    resp = doi_responses.get(req.url)
-    # doi responses are redirects
-    if resp is not None:
-        context.status_code = 302
-        context.headers["Location"] = resp
-    return resp
-
-
-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
-
+def test_content_id():
     zen = Zenodo()
     zen.detect("10.5281/zenodo.3232985")
     assert zen.content_id == "3232985"
@@ -60,15 +49,11 @@
 
 @pytest.mark.parametrize("test_input,expected", test_hosts)
-def test_detect_zenodo(test_input, expected, requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
+def test_detect_zenodo(test_input, expected):
     # valid Zenodo DOIs trigger this content provider
     assert Zenodo().detect(test_input[0]) == expected
     assert Zenodo().detect(test_input[1]) == expected
     assert Zenodo().detect(test_input[2]) == expected
-    # only two of the three calls above have to resolve a DOI (2 req per doi resolution)
-    assert requests_mock.call_count == 4
-    requests_mock.reset_mock()
 
     # Don't trigger the Zenodo content provider
     assert Zenodo().detect("/some/path/here") is None