Merge pull request #1390 from yuvipanda/use-api
Use REST APIs to resolve DOIs + cleanup dataverse provider
minrk authored Dec 20, 2024
2 parents f0b1c0c + e48f5b7 commit b7c1515
Showing 9 changed files with 315 additions and 272 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test.yml
@@ -60,6 +60,7 @@ jobs:
- r
- unit
- venv
- contentproviders
include:
# The actions/setup-python action with Python version 3.6 isn't
# possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04
193 changes: 141 additions & 52 deletions repo2docker/contentproviders/dataverse.py
@@ -1,9 +1,11 @@
import hashlib
import json
import os
import shutil
from urllib.parse import parse_qs, urlparse, urlunparse
from typing import List, Tuple
from urllib.parse import parse_qs, urlparse

from ..utils import copytree, deep_get
from ..utils import copytree, deep_get, is_doi
from .doi import DoiProvider


@@ -23,10 +25,11 @@ def __init__(self):
self.hosts = json.load(fp)["installations"]
super().__init__()

def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Dataverse dataset.
def detect(self, spec, ref=None, extra_args=None):
"""
Detect if given spec is hosted on dataverse
Handles:
The spec can be:
- DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
- DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
- URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ def detect(self, doi, ref=None, extra_args=None):
- https://dataverse.harvard.edu/api/access/datafile/3323458
- doi:10.7910/DVN/6ZXAGT
- doi:10.7910/DVN/6ZXAGT/3YRRYJ
"""
url = self.doi2url(doi)
if is_doi(spec):
url = self.doi2url(spec)
else:
url = spec
# Parse the url, to get the base for later API calls
parsed_url = urlparse(url)

@@ -53,57 +58,137 @@ def detect(self, doi, ref=None, extra_args=None):
if host is None:
return

query_args = parse_qs(parsed_url.query)
# Corner case handling
if parsed_url.path.startswith("/file.xhtml"):
# There's no way of getting file information using its persistentId; the only thing we can do is assume that the doi
# is structured as "doi:<dataset_doi>/<file_doi>" and try to handle the dataset that way.
new_doi = doi.rsplit("/", 1)[0]
if new_doi == doi:
# tough luck :( Avoid infinite recursion and exit.
return
return self.detect(new_doi)
elif parsed_url.path.startswith("/api/access/datafile"):
# Raw url pointing to a datafile is a typical output from an External Tool integration
entity_id = os.path.basename(parsed_url.path)
search_query = "q=entityId:" + entity_id + "&type=file"
# Knowing the file identifier query search api to get parent dataset
search_url = urlunparse(
parsed_url._replace(path="/api/search", query=search_query)
# At this point, we *know* this is a dataverse URL, because:
# 1. The DOI resolved to a particular host (if using DOI)
# 2. The host is in the list of known dataverse installations
#
# We don't know exactly what kind of dataverse object this is, but
# that can be figured out during fetch as needed
return url

def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
"""
Return the persistent_id (DOI) of a dataset that a given file_id (int or doi) belongs to
"""
if file_id.isdigit():
# the file_id is an integer, rather than a persistent id (DOI)
api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
else:
# the file_id is a doi itself
api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"

resp = self._request(api_url)
if resp.status_code == 404:
raise ValueError(f"File with id {file_id} not found in {base_url}")

resp.raise_for_status()

data = resp.json()["data"]
return data["datasetVersion"]["datasetPersistentId"]

def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
"""
Parse the persistent id out of a dataverse URL
persistent_id can point to either a dataset or a file. The second return
value is True if it is ambiguous whether the persistent_id refers to a
dataset or a file, and False if its type is known.
Raises a ValueError if the URL cannot be parsed.
"""
parsed_url = urlparse(url)
path = parsed_url.path
qs = parse_qs(parsed_url.query)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

is_ambiguous = False
# https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
if path.startswith("/citation"):
is_ambiguous = True
persistent_id = qs["persistentId"][0]
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
elif path.startswith("/dataset.xhtml"):
# https://dataverse.harvard.edu/api/access/datafile/3323458
persistent_id = qs["persistentId"][0]
elif path.startswith("/api/access/datafile"):
# What we have here is an entity id, which we can use to get a persistentId
file_id = os.path.basename(path)
persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
elif parsed_url.path.startswith("/file.xhtml"):
file_persistent_id = qs["persistentId"][0]
persistent_id = self.get_dataset_id_from_file_id(
base_url, file_persistent_id
)
else:
raise ValueError(
f"Could not determine persistent id for dataverse URL {url}"
)
self.log.debug("Querying Dataverse: " + search_url)
data = self.urlopen(search_url).json()["data"]
if data["count_in_response"] != 1:
self.log.debug(
f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
)
return

self.record_id = deep_get(data, "items.0.dataset_persistent_id")
elif (
parsed_url.path.startswith("/dataset.xhtml")
and "persistentId" in query_args
):
self.record_id = deep_get(query_args, "persistentId.0")

if hasattr(self, "record_id"):
return {"record": self.record_id, "host": host}

return persistent_id, is_ambiguous

def get_datafiles(self, url: str) -> List[dict]:
"""
Return a list of dataFiles for the dataset at the given dataverse URL
Supports the following *dataset* URL styles:
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
- /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
- /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
- /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
If the URL cannot be parsed, raise an exception.
"""

parsed_url = urlparse(url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

persistent_id, is_ambiguous = self.parse_dataverse_url(url)

dataset_api_url = (
f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
)
resp = self._request(dataset_api_url, headers={"accept": "application/json"})
if resp.status_code == 404 and is_ambiguous:
# It's possible this is a *file* persistent_id, not a dataset one
persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
dataset_api_url = (
f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
)
resp = self._request(
dataset_api_url, headers={"accept": "application/json"}
)

if resp.status_code == 404:
# This persistent id is just not here
raise ValueError(f"{persistent_id} on {base_url} is not found")

# We already handled 404, raise error for everything else
resp.raise_for_status()

# We know the exact persistent_id of the dataset we fetched now
# Save it for use as content_id
self.persistent_id = persistent_id

data = resp.json()["data"]

return data["latestVersion"]["files"]

def fetch(self, spec, output_dir, yield_output=False):
"""Fetch and unpack a Dataverse dataset."""
record_id = spec["record"]
host = spec["host"]

yield f"Fetching Dataverse record {record_id}.\n"
url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
url = spec
parsed_url = urlparse(url)
# FIXME: Support determining API URL better
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

resp = self.urlopen(url, headers={"accept": "application/json"})
record = resp.json()["data"]
yield f"Fetching Dataverse record {url}.\n"

for fobj in deep_get(record, "latestVersion.files"):
for fobj in self.get_datafiles(url):
file_url = (
# without format=original you get the preservation format (plain text, tab separated)
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
)
filename = fobj["label"]
original_filename = fobj["dataFile"].get("originalFileName", None)
@@ -128,5 +213,9 @@ def fetch(self, spec, output_dir, yield_output=False):

@property
def content_id(self):
"""The Dataverse persistent identifier."""
return self.record_id
"""
The Dataverse persistent identifier.
Only valid if called after a successful fetch.
"""
return self.persistent_id
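
For context, the new provider flow above boils down to two REST calls. A minimal standalone sketch, assuming the requests library and the example identifiers from the docstrings (the provider itself routes these calls through self._request):

import requests

base_url = "https://dataverse.harvard.edu"

# A raw datafile URL such as {base_url}/api/access/datafile/3323458 only
# carries a numeric file id, so first ask the files API which dataset the
# file belongs to.
file_id = "3323458"
resp = requests.get(f"{base_url}/api/files/{file_id}?returnDatasetVersion=true")
resp.raise_for_status()
persistent_id = resp.json()["data"]["datasetVersion"]["datasetPersistentId"]

# With the dataset's persistent id, list the files of its latest version.
resp = requests.get(
    f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}",
    headers={"accept": "application/json"},
)
resp.raise_for_status()
for fobj in resp.json()["data"]["latestVersion"]["files"]:
    print(fobj["label"], fobj["dataFile"]["id"])
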
35 changes: 21 additions & 14 deletions repo2docker/contentproviders/doi.py
@@ -46,21 +46,28 @@ def doi2url(self, doi):
# Transform a DOI to a URL
# If not a doi, assume we have a URL and return
if is_doi(doi):
doi = normalize_doi(doi)

try:
resp = self._request(f"https://doi.org/{doi}")
resp.raise_for_status()
except HTTPError as e:
# If the DOI doesn't exist, just return URL
if e.response.status_code == 404:
return doi
# Reraise any other errors because if the DOI service is down (or
# we hit a rate limit) we don't want to silently continue to the
# default Git provider as this leads to a misleading error.
self.log.error(f"DOI {doi} does not resolve: {e}")
normalized_doi = normalize_doi(doi)

# Use the doi.org resolver API
# documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
req_url = f"https://doi.org/api/handles/{normalized_doi}"
resp = self._request(req_url)
if resp.status_code == 404:
# Not a doi, return what we were passed in
return doi
elif resp.status_code == 200:
data = resp.json()
# Pick the first URL we find from the doi response
for v in data["values"]:
if v["type"] == "URL":
return v["data"]["value"]

# No URLs found for this DOI, so fall back to returning it as-is
self.log.error("DOI {normalized_doi} doesn't point to any URLs")
return doi
else:
# If we get any other status codes, raise error
raise
return resp.url
else:
# Just return what is actually just a URL
return doi
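
As an illustration of the resolver change above: instead of following redirects from https://doi.org/{doi}, the provider now queries the doi.org proxy server REST API and picks the first value of type "URL" from the handle record. A minimal sketch, assuming the requests library and a DOI that resolves:

import requests

doi = "10.7910/DVN/TJCLKP"
resp = requests.get(f"https://doi.org/api/handles/{doi}")
resp.raise_for_status()

# The handle record is a list of typed values; entries with type "URL"
# carry the location(s) the DOI resolves to.
for v in resp.json()["values"]:
    if v["type"] == "URL":
        print(v["data"]["value"])
        break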