From fde74efc2e1c3881d0dd41e95d41826c642e15fc Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 17 Dec 2024 09:14:32 -0800 Subject: [PATCH] Fix content_id for dataverse URLs --- repo2docker/contentproviders/dataverse.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 88640a2a..90215748 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -1,6 +1,7 @@ import json import os import shutil +import hashlib from urllib.parse import parse_qs, urlparse, urlunparse from ..utils import copytree, deep_get, is_doi @@ -56,6 +57,9 @@ def detect(self, spec, ref=None, extra_args=None): if host is None: return + # Used only for content_id + self.url = url + # At this point, we *know* this is a dataverse URL, because: # 1. The DOI resolved to a particular host (if using DOI) # 2. The host is in the list of known dataverse installations @@ -84,9 +88,9 @@ def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str: data = resp.json()["data"] return data["datasetVersion"]["datasetPersistentId"] - def get_persistent_id_from_url(self, url: str) -> str: + def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]: """ - Return the persistentId for given dataverse URL. 
+ Return a list of dataFiles for the given dataverse URL. Supports the following *dataset* URL styles: - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP @@ -101,11 +105,6 @@ def get_persistent_id_from_url(self, url: str) -> str: If a URL can not be parsed, throw an exception """ - def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]: - """ - Return a list of dataFiles for given persistent_id - """ - parsed_url = urlparse(url) path = parsed_url.path qs = parse_qs(parsed_url.query) @@ -156,9 +155,7 @@ def fetch(self, spec, output_dir, yield_output=False): url = spec["url"] host = spec["host"] - persistent_id = self.get_persistent_id_from_url(url) - - yield f"Fetching Dataverse record {persistent_id}.\n" + yield f"Fetching Dataverse record {url}.\n" for fobj in self.get_datafiles(host["url"], url): file_url = ( @@ -186,10 +183,7 @@ def fetch(self, spec, output_dir, yield_output=False): copytree(os.path.join(output_dir, d), output_dir) shutil.rmtree(os.path.join(output_dir, d)) - # Save persistent id - self.persitent_id = persistent_id - @property def content_id(self): """The Dataverse persistent identifier.""" - return self.persistent_id + return self.url