Harvard Dataverse file retriever implementation for Heal SDK #11

Merged
merged 27 commits into master from piotr/harvard on Jan 30, 2025

Changes from all commits (27 commits)
25d80c1  black formatting (piotrsenkow, Oct 2, 2024)
e24fc88  black formatting (piotrsenkow, Oct 22, 2024)
55c1a9f  adding unit testing and modified logic for harvard dataverse file ret… (piotrsenkow, Dec 5, 2024)
d9bb4a1  missing variables breaking unit test (piotrsenkow, Dec 5, 2024)
62ac89f  fixing breaking unit test (piotrsenkow, Dec 5, 2024)
393f158  fixing breaking tests (piotrsenkow, Dec 11, 2024)
df99f92  fixing breaking tests (piotrsenkow, Dec 11, 2024)
93b1364  fixing breaking tests (piotrsenkow, Dec 11, 2024)
4a4f2d2  fixing breaking tests (piotrsenkow, Dec 11, 2024)
cbe00cd  fixing breaking tests (piotrsenkow, Dec 11, 2024)
dcf78b8  fixing breaking tests (piotrsenkow, Dec 11, 2024)
4b21499  fixing breaking tests (piotrsenkow, Dec 11, 2024)
f4aa6a5  fixing breaking tests (piotrsenkow, Dec 12, 2024)
17529f3  fixing breaking tests (piotrsenkow, Dec 12, 2024)
b1533f5  fixing breaking tests (piotrsenkow, Dec 12, 2024)
14b9e53  Merge branch 'master' into piotr/harvard (piotrsenkow, Jan 21, 2025)
92255f4  refactor and adding utils (piotrsenkow, Jan 28, 2025)
d67c2f3  black (piotrsenkow, Jan 28, 2025)
009f18e  broken import due to refactoring (piotrsenkow, Jan 28, 2025)
c9fe359  changing parameter names in unit tests as functions have been slightl… (piotrsenkow, Jan 28, 2025)
1dca4cb  refactoring (piotrsenkow, Jan 28, 2025)
1b41e11  refactoring (piotrsenkow, Jan 28, 2025)
9466551  refactoring (piotrsenkow, Jan 28, 2025)
b4f00a7  refactor (piotrsenkow, Jan 29, 2025)
5a8ce80  Merge branch 'master' into piotr/harvard (piotrsenkow, Jan 30, 2025)
f5fcf7e  Adding TODO disclaimer about not having to use WTS token to access ha… (piotrsenkow, Jan 30, 2025)
b89b3e3  Merge branch 'piotr/harvard' of https://github.com/uc-cdis/heal-platf… (piotrsenkow, Jan 30, 2025)
130 changes: 130 additions & 0 deletions heal/harvard_downloads.py
@@ -0,0 +1,130 @@
"""
This module includes an external file retriever function intended to be called
by the external_files_download module in the Gen3-SDK.

The retriever function sends requests to the Harvard Dataverse for downloading studies or files.
TODO: QDR and Harvard Dataverse use the same Dataverse API, however, we do NOT need to use WTS token to access with Harvard
The Dataverse documentation describes how to download studies
https://guides.dataverse.org/en/latest/api/dataaccess.html#basic-download-by-dataset

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's add some TODO notes in here to remind us about the lack of auth parts

"""

from pathlib import Path
from typing import Dict, List
from utils import unpackage_object, get_id, download_from_url

from cdislogging import get_logger
from gen3.tools.download.drs_download import DownloadStatus

logger = get_logger("__name__", log_level="debug")


def get_harvard_dataverse_files(
    file_metadata_list: List, download_path: str = "."
) -> Dict:
    """
    Retrieves external data from the Harvard Dataverse.

    Args:
        file_metadata_list (List of Dict): list of studies or files
        download_path (str): path to download files and unpack

    Returns:
        Dict of download status, or None if the download path is invalid
        or no items were processed
    """
    if not Path(download_path).exists():
        logger.critical(f"Download path does not exist: {download_path}")
        return None

    completed = {}
    logger.debug(f"Input file metadata list={file_metadata_list}")

    for file_metadata in file_metadata_list:
        id = get_id(file_metadata)
        if id is None:
            logger.warning(
                f"Could not find 'study_id' or 'file_id' in metadata {file_metadata}"
            )
            continue
        logger.info(f"ID = {id}")
        completed[id] = DownloadStatus(filename=id, status="pending")

        download_url = get_download_url_for_harvard_dataverse(file_metadata)
        if download_url is None:
            logger.critical(f"Could not get download_url for {id}")
            completed[id].status = "invalid url"
            continue

        logger.debug(f"Ready to send request to download_url: GET {download_url}")
        downloaded_file = download_from_url(
            api_url=download_url,
            headers=None,
            download_path=download_path,
        )
        if downloaded_file is None:
            completed[id].status = "failed"
            continue

        if downloaded_file.endswith("zip"):
            # unpack if download is zip file
            try:
                logger.debug(f"Ready to unpack {downloaded_file}.")
                unpackage_object(filepath=downloaded_file)
            except Exception as e:
                logger.critical(f"{id} had an issue while being unpackaged: {e}")
                completed[id].status = "failed"
                # skip to the next item so a failed unpack is not marked "downloaded"
                continue

            completed[id].status = "downloaded"
            # remove the zip file
            Path(downloaded_file).unlink()
        else:
            completed[id].status = "downloaded"

    if not completed:
        return None
    return completed


def get_download_url_for_harvard_dataverse(file_metadata: Dict) -> str:
    """
    Get the download url for Harvard Dataverse.

    Args:
        file_metadata (Dict)

    Returns:
        url, or None if there are errors
    """
    base_url = "https://dataverse.harvard.edu/api/access"
    if "use_harvard_staging" in file_metadata and bool(
        file_metadata["use_harvard_staging"]
    ):
        base_url = "https://demo.dataverse.org/api/access"
    if "study_id" in file_metadata:
        url = f"{base_url}/dataset/:persistentId/?persistentId={file_metadata.get('study_id')}"
    else:
        url = None

    return url


def is_valid_harvard_file_metadata(file_metadata: Dict) -> bool:
    """
    Check that the file_metadata has the required key:
    'study_id'.

    Args:
        file_metadata (Dict)

    Returns:
        True if valid file_metadata object.
    """
    if not isinstance(file_metadata, dict):
        logger.critical(f"Invalid metadata - item is not a dict: {file_metadata}")
        return False
    if "study_id" not in file_metadata:
        logger.critical(
            f"Invalid metadata - missing required Harvard Dataverse keys {file_metadata}"
        )
        return False
    return True
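
For reviewers who want to exercise the new retriever, here is a minimal usage sketch. The DOIs below are made-up placeholders, and the import assumes the module is importable as heal.harvard_downloads; only the study_id and use_harvard_staging keys are actually read by the code above.

# Minimal usage sketch with hypothetical metadata values.
from heal.harvard_downloads import get_harvard_dataverse_files

file_metadata_list = [
    # "study_id" holds the Dataverse persistent identifier (a DOI); placeholder value.
    {"study_id": "doi:10.7910/DVN/EXAMPLE"},
    # "use_harvard_staging" routes the request to demo.dataverse.org instead.
    {"study_id": "doi:10.7910/DVN/EXAMPLE2", "use_harvard_staging": True},
]

statuses = get_harvard_dataverse_files(file_metadata_list, download_path=".")
if statuses is not None:
    for study_id, status in statuses.items():
        # Each value is a gen3 DownloadStatus with .filename and .status attributes.
        print(study_id, status.status)

For the first entry, get_download_url_for_harvard_dataverse builds https://dataverse.harvard.edu/api/access/dataset/:persistentId/?persistentId=doi:10.7910/DVN/EXAMPLE, the "basic download by dataset" endpoint linked in the module docstring.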
150 changes: 3 additions & 147 deletions heal/qdr_downloads.py
@@ -19,16 +19,12 @@
 tokens for the 'idp' specified in the external_file_metadata.
 """
 
-import os
 
 from pathlib import Path
-import requests
-from typing import Dict, List, Tuple
-from urllib.parse import unquote
-import zipfile
+from typing import Dict, List
+from utils import unpackage_object, get_id, download_from_url, get_idp_access_token
 
 from cdislogging import get_logger
-from gen3.auth import Gen3Auth
 from gen3.tools.download.drs_download import DownloadStatus, wts_get_token
 
 logger = get_logger("__name__", log_level="debug")
@@ -81,7 +77,7 @@ def get_syracuse_qdr_files(
 
         logger.debug(f"Ready to send request to download_url: GET {download_url}")
         downloaded_file = download_from_url(
-            qdr_url=download_url,
+            api_url=download_url,
             headers=request_headers,
             download_path=download_path,
         )
@@ -109,72 +105,6 @@ def get_syracuse_qdr_files(
     return completed
 

-def download_from_url(
-    qdr_url: str,
-    headers=None,
-    download_path: str = ".",
-) -> str:
-    """
-    Retrieve data file (study_id or file_id) from url.
-    Save the file based on the filename in the Content-Disposition response header.
-
-    Args:
-        qdr_url (str): url for QDR API
-        headers (Dict): request headers
-        download_path (str): path for saving downloaded zip file
-
-    Returns:
-        path to downloaded and renamed file.
-    """
-    try:
-        response = requests.get(url=qdr_url, headers=headers, stream=True)
-        response.raise_for_status()
-    except requests.exceptions.Timeout:
-        logger.critical(
-            f"Was unable to get the download url: {qdr_url}. Timeout Error."
-        )
-        return None
-    except requests.exceptions.HTTPError as exc:
-        logger.critical(f"HTTPError in download {exc}")
-        return None
-    except requests.exceptions.ConnectionError as exc:
-        logger.critical(f"ConnectionError in download {exc}")
-        return None
-    except Exception as exc:
-        logger.critical(f"Error in download {exc}")
-        return None
-    logger.debug(f"Status code={response.status_code}")
-    downloaded_file_name = get_filename_from_headers(response.headers)
-    if downloaded_file_name is None:
-        downloaded_file_name = qdr_url.split("/")[-1]
-        logger.info(f"Using file name from id in url {downloaded_file_name}")
-
-    if downloaded_file_name.endswith(
-        "zip"
-    ) and not "application/zip" in response.headers.get("Content-Type"):
-        logger.critical("Response headers do not show zipfile content-type")
-
-    total_downloaded = 0
-    block_size = 8092  # 8K blocks might want to tune this.
-    download_filename = f"{download_path}/{downloaded_file_name}"
-    try:
-        logger.info(f"Saving download as {download_filename}")
-        with open(download_filename, "wb") as file:
-            for data in response.iter_content(block_size):
-                total_downloaded += len(data)
-                file.write(data)
-    except IOError as ex:
-        logger.critical(f"IOError opening {download_filename} for writing: {ex}")
-        return None
-
-    if total_downloaded == 0:
-        logger.critical("content-length is 0 and it should not be")
-        return None
-    logger.debug(f"Download size = {total_downloaded}")
-
-    return download_filename
-
-
 def get_download_url_for_qdr(file_metadata: Dict) -> str:
     """
     Get the download url for Syracuse QDR.
@@ -199,56 +129,6 @@ def get_download_url_for_qdr(file_metadata: Dict) -> str:
     return url
 

-def get_filename_from_headers(headers: Dict) -> str:
-    """
-    Parse and decode downloaded file name from response headers
-
-    Args:
-        headers (dict): response headers
-
-    Returns:
-        file name as string
-    """
-    try:
-        file_name = None
-        content_response = headers.get("Content-Disposition").split(";")
-        for part in content_response:
-            # look for UTF-8 encoded file name
-            if part.strip().startswith("filename*="):
-                file_name = part.split("=", 1)[1].strip()
-                if file_name.lower().startswith("utf-8''"):
-                    file_name = file_name[7:]
-                file_name = unquote(file_name)
-                break
-            elif part.strip().startswith("filename="):
-                file_name = part.split("=", 1)[1].strip().strip('"')
-                break
-        if file_name is None:
-            logger.info("Could not parse file name from headers")
-
-    except Exception as e:
-        logger.warning("Could not get file name from headers")
-
-    return file_name
-
-
-def get_idp_access_token(wts_hostname: str, auth: Gen3Auth, file_metadata: Dict) -> str:
-    """Get an access token for QDR using a Gen3 commons WTS"""
-    try:
-        logger.debug("Ready to get auth token")
-        wts_access_token = auth.get_access_token()
-        logger.debug("Ready to get idp token")
-        idp = file_metadata.get("external_oidc_idp")
-        idp_access_token = wts_get_token(
-            hostname=wts_hostname, idp=idp, access_token=wts_access_token
-        )
-    except Exception as e:
-        logger.critical(f"Could not get token: {e}")
-        return None
-
-    return idp_access_token
-
-
 def get_request_headers(idp_access_token: str) -> Dict:
     """
     Generate the request headers.
@@ -267,24 +147,6 @@ def get_request_headers(idp_access_token: str) -> Dict:
     return headers
 

-def get_id(file_metadata: Dict) -> str:
-    """
-    Parse out the object id from the metadata.
-
-    Args:
-        file_metadata (Dict)
-
-    Returns:
-        string
-    """
-    id_types = ["study_id", "file_id"]
-    for id_type in id_types:
-        if id_type in file_metadata:
-            return file_metadata.get(id_type)
-
-    return None
-
-
 def is_valid_qdr_file_metadata(file_metadata: Dict) -> bool:
     """
     Check that the file_metadata has the required keys:
@@ -308,9 +170,3 @@ def is_valid_qdr_file_metadata(file_metadata: Dict) -> bool:
         )
         return False
     return True


-def unpackage_object(filepath: str):
-    """Unpackage the downloaded zip file"""
-    with zipfile.ZipFile(filepath, "r") as package:
-        package.extractall(os.path.dirname(filepath))
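
The helpers removed above (download_from_url, get_filename_from_headers, get_idp_access_token, get_id, unpackage_object) now come from the shared utils module imported at the top of both qdr_downloads.py and harvard_downloads.py. The diff for that module is not shown in this section, so the sketch below reconstructs only the signatures implied by the import lines and by the qdr_url -> api_url call-site rename; treat it as an assumption about utils.py, not its actual contents.

# Sketch of the shared helper signatures implied by this PR (assumed, not the real utils.py).
import os
import zipfile
from typing import Dict


def get_id(file_metadata: Dict) -> str:
    """Return 'study_id' or 'file_id' from the metadata, else None (adapted from the code removed above)."""
    for id_type in ["study_id", "file_id"]:
        if id_type in file_metadata:
            return file_metadata.get(id_type)
    return None


def unpackage_object(filepath: str):
    """Extract a downloaded zip next to where it was saved (adapted from the code removed above)."""
    with zipfile.ZipFile(filepath, "r") as package:
        package.extractall(os.path.dirname(filepath))


def download_from_url(api_url: str, headers=None, download_path: str = ".") -> str:
    """Stream a file from api_url into download_path and return the saved path.
    The first parameter is now api_url (renamed from qdr_url) so that Harvard
    Dataverse (no auth headers) and QDR (WTS-derived headers) share one helper."""
    ...  # body as removed from qdr_downloads.py above, with qdr_url renamed


def get_idp_access_token(wts_hostname: str, auth, file_metadata: Dict) -> str:
    """Get an access token for QDR via the Gen3 commons WTS (assumed moved here unchanged)."""
    ...  # body as removed from qdr_downloads.py above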