From b7050ba096eea7f1e2ae96ba438783ea982ba4bd Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Mon, 16 Dec 2024 19:55:23 -0800
Subject: [PATCH] Always fetch entire dataset for dataverse

---
 repo2docker/contentproviders/dataverse.py | 94 ++++++++++++-----------
 tests/contentproviders/test_dataverse.py  | 72 ++++++++++++-----
 2 files changed, 103 insertions(+), 63 deletions(-)

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 9b42b07f..88640a2a 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -64,6 +64,26 @@ def detect(self, spec, ref=None, extra_args=None):
         # that can be figured out during fetch as needed
         return {"host": host, "url": url}
 
+    def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str:
+        """
+        Return the persistent_id (DOI) that a given file_id (int or doi) belongs to
+        """
+        if file_id.isdigit():
+            # the file_id is an integer, rather than a persistent id (DOI)
+            api_url = f"{host}/api/files/{file_id}?returnDatasetVersion=true"
+        else:
+            # the file_id is a doi itself
+            api_url = f"{host}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
+
+        resp = self._request(api_url)
+        if resp.status_code == 404:
+            raise ValueError(f"File with id {file_id} not found in {host}")
+
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+        return data["datasetVersion"]["datasetPersistentId"]
+
     def get_persistent_id_from_url(self, url: str) -> str:
         """
         Return the persistentId for given dataverse URL.
@@ -80,72 +100,56 @@ def get_persistent_id_from_url(self, url: str) -> str:
 
         If a URL can not be parsed, throw an exception
         """
+
+    def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]:
+        """
+        Return a list of dataFiles for given persistent_id
+        """
+
         parsed_url = urlparse(url)
         path = parsed_url.path
         qs = parse_qs(parsed_url.query)
+        dataverse_host = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        url_kind = None
+        persistent_id = None
+        is_ambiguous = False
 
         # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        if path.startswith("/citation"):
+            is_ambiguous = True
+            persistent_id = qs["persistentId"][0]
         # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
-        if path.startswith("/citation") or path.startswith("/dataset.xhtml"):
-            return qs["persistentId"][0]
+        elif path.startswith("/dataset.xhtml"):
+            persistent_id = qs["persistentId"][0]
         # https://dataverse.harvard.edu/api/access/datafile/3323458
         elif path.startswith("/api/access/datafile"):
             # What we have here is an entity id, which we can use to get a persistentId
-            entity_id = os.path.basename(parsed_url.path)
-            # FIXME: Should we be URL Encoding something here to protect from path traversal
-            # or similar attacks?
-            search_query = f"q=entityId:{entity_id}&type=file"
-            search_api_url = urlunparse(
-                parsed_url._replace(path="/api/search", query=search_query)
-            )
-            self.log.debug("Querying Dataverse: " + search_api_url)
-            data = self.urlopen(search_api_url).json()["data"]
-            if data["count_in_response"] != 1:
-                raise ValueError(
-                    f"Dataverse search query failed!\n - url: {url}\n - resp: {json.dumps(data)}\n"
-                )
-            return data["items"][0]["dataset_persistent_id"]
+            file_id = os.path.basename(parsed_url.path)
+            persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_id)
         elif parsed_url.path.startswith("/file.xhtml"):
             file_persistent_id = qs["persistentId"][0]
-            dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0]
-            if file_persistent_id == dataset_persistent_id:
-                # We can't figure this one out, throw an error
-                raise ValueError(f"Could not find dataset id for {url}")
-            return dataset_persistent_id
-
-        raise ValueError(f"Could not determine persistent id for dataverse URL {url}")
-
-    def get_datafiles(self, host: str, persistent_id: str) -> list[dict]:
-        """
-        Return a list of dataFiles for given persistent_id
-        """
-        dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}"
+            persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_persistent_id)
+        else:
+            raise ValueError(f"Could not determine persistent id for dataverse URL {url}")
 
-        resp = self._request(dataset_url, headers={"accept": "application/json"})
-        # Assume it's a dataset
-        is_dataset = True
-        if resp.status_code == 404:
+        dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}"
+        resp = self._request(dataset_api_url, headers={"accept": "application/json"})
+        if resp.status_code == 404 and is_ambiguous:
             # It's possible this is a *file* persistent_id, not a dataset one
-            file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}"
-            resp = self._request(file_url, headers={"accept": "application/json"})
+            persistent_id = self.get_dataset_id_from_file_id(dataverse_host, persistent_id)
+            dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}"
+            resp = self._request(dataset_api_url, headers={"accept": "application/json"})
 
             if resp.status_code == 404:
                 # This persistent id is just not here
-                raise ValueError(f"{persistent_id} on {host} is not found")
-
-            # It's not a dataset, it's a file!
-            is_dataset = False
+                raise ValueError(f"{persistent_id} on {dataverse_host} is not found")
 
         # We already handled 404, raise error for everything else
         resp.raise_for_status()
 
         data = resp.json()["data"]
 
-        if is_dataset:
-            return data["latestVersion"]["files"]
-        else:
-            # Only one file object
-            return [data]
+        return data["latestVersion"]["files"]
 
     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
@@ -156,7 +160,7 @@ def fetch(self, spec, output_dir, yield_output=False):
 
         yield f"Fetching Dataverse record {persistent_id}.\n"
 
-        for fobj in self.get_datafiles(host["url"], persistent_id):
+        for fobj in self.get_datafiles(host["url"], url):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
                 f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py
index e0c7e178..2ab9ec69 100644
--- a/tests/contentproviders/test_dataverse.py
+++ b/tests/contentproviders/test_dataverse.py
@@ -85,10 +85,13 @@ def test_get_persistent_id(url, persistent_id):
 
 
 @pytest.mark.parametrize(
-    ("spec", "md5tree"),
+    ("specs", "md5tree"),
     [
         (
-            "doi:10.7910/DVN/TJCLKP",
+            (
+                "doi:10.7910/DVN/TJCLKP",
+                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
+            ),
             {
                 "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
                 "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
@@ -96,27 +99,60 @@ def test_get_persistent_id(url, persistent_id):
             },
         ),
         (
-            # A citation targeting a single file
-            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            (
+                "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+                "doi:10.7910/DVN/6ZXAGT/3YRRYJ"
+            ),
             {
-                "ARCHAEOGLOBE_CONSENSUS_ASSESSMENT.tab": "17a91888ed8e91dfb63acbbab6127ac5"
+                'ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png': '243c6a3dd66bc3c84102829b277ef333',
+                'ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png': '2ace6ae9d470dda6cf2f9f9a6588171a',
+                'ArchaeoGLOBE-master/analysis/figures/3_trends_global.png': '63ccd0a7b2d20440cd8f418d4ee88c4d',
+                'ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png': 'facfaedabeac77c4496d4b9e962a917f',
+                'ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png': '8e002e4d50f179fc1808f562b1353588',
+                'ArchaeoGLOBE-master/apt.txt': 'b4224032da6c71d48f46c9b78fc6ed77',
+                'ArchaeoGLOBE-master/analysis/archaeoglobe.pdf': 'f575be4790efc963ef1bd40d097cc06d',
+                'ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd': 'f37d5f7993fde9ebd64d16b20fc22905',
+                'ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj': 'd0250e7918993bab1e707358fe5633e0',
+                'ArchaeoGLOBE-master/CONDUCT.md': 'f87ef290340322089c32b4e573d8f1e8',
+                'ArchaeoGLOBE-master/.circleci/config.yml': '6eaa54073a682b3195d8fab3a9dd8344',
+                'ArchaeoGLOBE-master/CONTRIBUTING.md': 'b3a6abfc749dd155a3049f94a855bf9f',
+                'ArchaeoGLOBE-master/DESCRIPTION': '745ef979494999e483987de72c0adfbd',
+                'ArchaeoGLOBE-master/dockerfile': 'aedce68e5a7d6e79cbb24c9cffeae593',
+                'ArchaeoGLOBE-master/.binder/Dockerfile': '7564a41246ba99b60144afb1d3b6d7de',
+                'ArchaeoGLOBE-master/.gitignore': '62c1482e4febbd35dc02fb7e2a31246b',
+                'ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS': '2aea7748b5586923b0de9d13af58e59d',
+                'ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS': '145a9e5dd2c95625626a720b52178b70',
+                'ArchaeoGLOBE-master/LICENSE.md': '3aa9d41a92a57944bd4590e004898445',
+                'ArchaeoGLOBE-master/analysis/data/derived-data/placeholder': 'd41d8cd98f00b204e9800998ecf8427e',
+                'ArchaeoGLOBE-master/.Rbuildignore': 'df15e4fed49abd685b536fef4472b01f',
+                'ArchaeoGLOBE-master/README.md': '0b0faabe580c4d76a0e0d64a4f54bca4',
+                'ArchaeoGLOBE-master/analysis/data/derived-data/README.md': '547fd1a6e874f6178b1cf525b5b9ae72',
+                'ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png': 'd2584352e5442b33e4b23e361ca70fe1',
+                'ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png': '513eddfdad01fd01a20263a55ca6dbe3',
+                'ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png': 'b16ba0ecd21b326f873209a7e55a8deb',
+                'ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png': '05695f9412337a00c1cb6d1757d0ec5c',
+                'ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png': '10119f7495d3b8e7ad7f8a0770574f15',
+                'ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png': 'b1db7c97f39ccfc3a9e094c3e6307af0',
+                'ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png': '30341748324f5f66acadb34c114c3e9d',
             }
         )
     ],
 )
-def test_fetch(spec, md5tree):
+def test_fetch(specs: list[str], md5tree):
     dv = Dataverse()
-    with TemporaryDirectory() as d:
-        output = []
-        for l in dv.fetch(dv.detect(spec), d):
-            output.append(l)
+    for spec in specs:
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(dv.detect(spec), d):
+                output.append(l)
 
-        # Verify md5 sum of the files we expect to find
-        # We are using md5 instead of something more secure because that is what
-        # dataverse itself uses
-        for subpath, expected_sha in md5tree.items():
-            with open(os.path.join(d, subpath), "rb") as f:
-                h = hashlib.md5()
-                h.update(f.read())
-                assert h.hexdigest() == expected_sha
+            # Verify md5 sum of the files we expect to find
+            # We are using md5 instead of something more secure because that is what
+            # dataverse itself uses
+            for subpath, expected_sha in md5tree.items():
+                with open(os.path.join(d, subpath), "rb") as f:
+                    h = hashlib.md5()
+                    h.update(f.read())
+                    assert h.hexdigest() == expected_sha