Skip to content

Commit

Permalink
Fix content_id for dataverse URLs
Browse files: browse the repository at this point in the history
  • Loading branch information
yuvipanda committed Dec 17, 2024
1 parent b7050ba commit fde74ef
Showing 1 changed file with 8 additions and 14 deletions.
22 changes: 8 additions & 14 deletions repo2docker/contentproviders/dataverse.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
import json
import os
import shutil
import hashlib
from urllib.parse import parse_qs, urlparse, urlunparse

from ..utils import copytree, deep_get, is_doi
Expand Down Expand Up @@ -56,6 +57,9 @@ def detect(self, spec, ref=None, extra_args=None):
if host is None:
return

# Used only for content_id
self.url = url

# At this point, we *know* this is a dataverse URL, because:
# 1. The DOI resolved to a particular host (if using DOI)
# 2. The host is in the list of known dataverse installations
Expand Down Expand Up @@ -84,9 +88,9 @@ def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str:
data = resp.json()["data"]
return data["datasetVersion"]["datasetPersistentId"]

def get_persistent_id_from_url(self, url: str) -> str:
def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]:
"""
Return the persistentId for given dataverse URL.
Return a list of dataFiles for given persistent_id
Supports the following *dataset* URL styles:
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
Expand All @@ -101,11 +105,6 @@ def get_persistent_id_from_url(self, url: str) -> str:
If a URL can not be parsed, throw an exception
"""

def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]:
"""
Return a list of dataFiles for given persistent_id
"""

parsed_url = urlparse(url)
path = parsed_url.path
qs = parse_qs(parsed_url.query)
Expand Down Expand Up @@ -156,9 +155,7 @@ def fetch(self, spec, output_dir, yield_output=False):
url = spec["url"]
host = spec["host"]

persistent_id = self.get_persistent_id_from_url(url)

yield f"Fetching Dataverse record {persistent_id}.\n"
yield f"Fetching Dataverse record {url}.\n"

for fobj in self.get_datafiles(host["url"], url):
file_url = (
Expand Down Expand Up @@ -186,10 +183,7 @@ def fetch(self, spec, output_dir, yield_output=False):
copytree(os.path.join(output_dir, d), output_dir)
shutil.rmtree(os.path.join(output_dir, d))

# Save persistent id
self.persitent_id = persistent_id

@property
def content_id(self):
"""The Dataverse persistent identifier."""
return self.persistent_id
return self.url

0 comments on commit fde74ef

Please sign in to comment.