Harvard Dataverse file retriever implementation for Heal SDK #11

Merged
merged 27 commits into master from piotr/harvard on Jan 30, 2025

Changes from all commits (27 commits)
25d80c1  black formatting (piotrsenkow, Oct 2, 2024)
e24fc88  black formatting (piotrsenkow, Oct 22, 2024)
55c1a9f  adding unit testing and modified logic for harvard dataverse file ret… (piotrsenkow, Dec 5, 2024)
d9bb4a1  missing variables breaking unit test (piotrsenkow, Dec 5, 2024)
62ac89f  fixing breaking unit test (piotrsenkow, Dec 5, 2024)
393f158  fixing breaking tests (piotrsenkow, Dec 11, 2024)
df99f92  fixing breaking tests (piotrsenkow, Dec 11, 2024)
93b1364  fixing breaking tests (piotrsenkow, Dec 11, 2024)
4a4f2d2  fixing breaking tests (piotrsenkow, Dec 11, 2024)
cbe00cd  fixing breaking tests (piotrsenkow, Dec 11, 2024)
dcf78b8  fixing breaking tests (piotrsenkow, Dec 11, 2024)
4b21499  fixing breaking tests (piotrsenkow, Dec 11, 2024)
f4aa6a5  fixing breaking tests (piotrsenkow, Dec 12, 2024)
17529f3  fixing breaking tests (piotrsenkow, Dec 12, 2024)
b1533f5  fixing breaking tests (piotrsenkow, Dec 12, 2024)
14b9e53  Merge branch 'master' into piotr/harvard (piotrsenkow, Jan 21, 2025)
92255f4  refactor and adding utils (piotrsenkow, Jan 28, 2025)
d67c2f3  black (piotrsenkow, Jan 28, 2025)
009f18e  broken import due to refactoring (piotrsenkow, Jan 28, 2025)
c9fe359  changing parameter names in unit tests as functions have been slightl… (piotrsenkow, Jan 28, 2025)
1dca4cb  refactoring (piotrsenkow, Jan 28, 2025)
1b41e11  refactoring (piotrsenkow, Jan 28, 2025)
9466551  refactoring (piotrsenkow, Jan 28, 2025)
b4f00a7  refactor (piotrsenkow, Jan 29, 2025)
5a8ce80  Merge branch 'master' into piotr/harvard (piotrsenkow, Jan 30, 2025)
f5fcf7e  Adding TODO disclaimer about not having to use WTS token to access ha… (piotrsenkow, Jan 30, 2025)
b89b3e3  Merge branch 'piotr/harvard' of https://github.com/uc-cdis/heal-platf… (piotrsenkow, Jan 30, 2025)
130 changes: 130 additions & 0 deletions heal/harvard_downloads.py
@@ -0,0 +1,130 @@
"""
This module includes an external file retriever function intended to be called
by the external_files_download module in the Gen3-SDK.

The retriever function sends requests to the Harvard Dataverse for downloading studies or files.
TODO: QDR and Harvard Dataverse use the same Dataverse API, however, we do NOT need to use WTS token to access with Harvard
The Dataverse documentation describes how to download studies
https://guides.dataverse.org/en/latest/api/dataaccess.html#basic-download-by-dataset

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's add some TODO notes in here to remind us about the lack of auth parts

"""

from pathlib import Path
from typing import Dict, List
from utils import unpackage_object, get_id, download_from_url

from cdislogging import get_logger
from gen3.tools.download.drs_download import DownloadStatus

logger = get_logger("__name__", log_level="debug")


def get_harvard_dataverse_files(
    file_metadata_list: List, download_path: str = "."
) -> Dict:
    """
    Retrieves external data from the Harvard Dataverse.

    Args:
        file_metadata_list (List of Dict): list of studies or files
        download_path (str): path to download files and unpack

    Returns:
        Dict of download status, or None if the download path is invalid
        or no items were processed
    """
    if not Path(download_path).exists():
        logger.critical(f"Download path does not exist: {download_path}")
        return None

    completed = {}
    logger.debug(f"Input file metadata list={file_metadata_list}")

    for file_metadata in file_metadata_list:
        id = get_id(file_metadata)
        if id is None:
            logger.warning(
                f"Could not find 'study_id' or 'file_id' in metadata {file_metadata}"
            )
            continue
        logger.info(f"ID = {id}")
        completed[id] = DownloadStatus(filename=id, status="pending")

        download_url = get_download_url_for_harvard_dataverse(file_metadata)
        if download_url is None:
            logger.critical(f"Could not get download_url for {id}")
            completed[id].status = "invalid url"
            continue

        logger.debug(f"Ready to send request to download_url: GET {download_url}")
        downloaded_file = download_from_url(
            api_url=download_url,
            headers=None,
            download_path=download_path,
        )
        if downloaded_file is None:
            completed[id].status = "failed"
            continue

        if downloaded_file.endswith("zip"):
            # unpack if download is zip file
            try:
                logger.debug(f"Ready to unpack {downloaded_file}.")
                unpackage_object(filepath=downloaded_file)
            except Exception as e:
                logger.critical(f"{id} had an issue while being unpackaged: {e}")
                completed[id].status = "failed"
                # skip to the next item so a failed unpack is not marked "downloaded"
                continue

            completed[id].status = "downloaded"
            # remove the zip file
            Path(downloaded_file).unlink()
        else:
            completed[id].status = "downloaded"

    if not completed:
        return None
    return completed


def get_download_url_for_harvard_dataverse(file_metadata: Dict) -> str:
    """
    Get the download url for Harvard Dataverse.

    Args:
        file_metadata (Dict)

    Returns:
        url, or None if there are errors
    """
    base_url = "https://dataverse.harvard.edu/api/access"
    if "use_harvard_staging" in file_metadata and bool(
        file_metadata["use_harvard_staging"]
    ):
        base_url = "https://demo.dataverse.org/api/access"
    if "study_id" in file_metadata:
        url = f"{base_url}/dataset/:persistentId/?persistentId={file_metadata.get('study_id')}"
    else:
        url = None

    return url


def is_valid_harvard_file_metadata(file_metadata: Dict) -> bool:
    """
    Check that the file_metadata has the required key:
    'study_id'.

    Args:
        file_metadata (Dict)

    Returns:
        True if valid file_metadata object.
    """
    if not isinstance(file_metadata, dict):
        logger.critical(f"Invalid metadata - item is not a dict: {file_metadata}")
        return False
    if "study_id" not in file_metadata:
        logger.critical(
            f"Invalid metadata - missing required Harvard Dataverse keys {file_metadata}"
        )
        return False
    return True
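
For reviewers who want to exercise the new retriever, here is a minimal usage sketch. The DOIs below are made-up placeholders, and the import assumes the module is importable as heal.harvard_downloads; only the study_id and use_harvard_staging keys are actually read by the code above.

# Minimal usage sketch with hypothetical metadata values.
from heal.harvard_downloads import get_harvard_dataverse_files

file_metadata_list = [
    # "study_id" holds the Dataverse persistent identifier (a DOI); placeholder value.
    {"study_id": "doi:10.7910/DVN/EXAMPLE"},
    # "use_harvard_staging" routes the request to demo.dataverse.org instead.
    {"study_id": "doi:10.7910/DVN/EXAMPLE2", "use_harvard_staging": True},
]

statuses = get_harvard_dataverse_files(file_metadata_list, download_path=".")
if statuses is not None:
    for study_id, status in statuses.items():
        # Each value is a gen3 DownloadStatus with .filename and .status attributes.
        print(study_id, status.status)

For the first entry, get_download_url_for_harvard_dataverse builds https://dataverse.harvard.edu/api/access/dataset/:persistentId/?persistentId=doi:10.7910/DVN/EXAMPLE, the "basic download by dataset" endpoint linked in the module docstring.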
150 changes: 3 additions & 147 deletions heal/qdr_downloads.py
@@ -19,16 +19,12 @@
 tokens for the 'idp' specified in the external_file_metadata.
 """
 
-import os
 
 from pathlib import Path
-import requests
-from typing import Dict, List, Tuple
-from urllib.parse import unquote
-import zipfile
+from typing import Dict, List
+from utils import unpackage_object, get_id, download_from_url, get_idp_access_token
 
 from cdislogging import get_logger
-from gen3.auth import Gen3Auth
 from gen3.tools.download.drs_download import DownloadStatus, wts_get_token
 
 logger = get_logger("__name__", log_level="debug")
@@ -81,7 +77,7 @@ def get_syracuse_qdr_files(
 
         logger.debug(f"Ready to send request to download_url: GET {download_url}")
         downloaded_file = download_from_url(
-            qdr_url=download_url,
+            api_url=download_url,
             headers=request_headers,
             download_path=download_path,
         )
@@ -109,72 +105,6 @@ def get_syracuse_qdr_files(
     return completed
 

-def download_from_url(
-    qdr_url: str,
-    headers=None,
-    download_path: str = ".",
-) -> str:
-    """
-    Retrieve data file (study_id or file_id) from url.
-    Save the file based on the filename in the Content-Disposition response header.
-
-    Args:
-        qdr_url (str): url for QDR API
-        headers (Dict): request headers
-        download_path (str): path for saving downloaded zip file
-
-    Returns:
-        path to downloaded and renamed file.
-    """
-    try:
-        response = requests.get(url=qdr_url, headers=headers, stream=True)
-        response.raise_for_status()
-    except requests.exceptions.Timeout:
-        logger.critical(
-            f"Was unable to get the download url: {qdr_url}. Timeout Error."
-        )
-        return None
-    except requests.exceptions.HTTPError as exc:
-        logger.critical(f"HTTPError in download {exc}")
-        return None
-    except requests.exceptions.ConnectionError as exc:
-        logger.critical(f"ConnectionError in download {exc}")
-        return None
-    except Exception as exc:
-        logger.critical(f"Error in download {exc}")
-        return None
-    logger.debug(f"Status code={response.status_code}")
-    downloaded_file_name = get_filename_from_headers(response.headers)
-    if downloaded_file_name is None:
-        downloaded_file_name = qdr_url.split("/")[-1]
-        logger.info(f"Using file name from id in url {downloaded_file_name}")
-
-    if downloaded_file_name.endswith(
-        "zip"
-    ) and not "application/zip" in response.headers.get("Content-Type"):
-        logger.critical("Response headers do not show zipfile content-type")
-
-    total_downloaded = 0
-    block_size = 8092  # 8K blocks might want to tune this.
-    download_filename = f"{download_path}/{downloaded_file_name}"
-    try:
-        logger.info(f"Saving download as {download_filename}")
-        with open(download_filename, "wb") as file:
-            for data in response.iter_content(block_size):
-                total_downloaded += len(data)
-                file.write(data)
-    except IOError as ex:
-        logger.critical(f"IOError opening {download_filename} for writing: {ex}")
-        return None
-
-    if total_downloaded == 0:
-        logger.critical("content-length is 0 and it should not be")
-        return None
-    logger.debug(f"Download size = {total_downloaded}")
-
-    return download_filename
-
-
 def get_download_url_for_qdr(file_metadata: Dict) -> str:
     """
     Get the download url for Syracuse QDR.
@@ -199,56 +129,6 @@ def get_download_url_for_qdr(file_metadata: Dict) -> str:
     return url
 

-def get_filename_from_headers(headers: Dict) -> str:
-    """
-    Parse and decode downloaded file name from response headers
-
-    Args:
-        headers (dict): response headers
-
-    Returns:
-        file name as string
-    """
-    try:
-        file_name = None
-        content_response = headers.get("Content-Disposition").split(";")
-        for part in content_response:
-            # look for UTF-8 encoded file name
-            if part.strip().startswith("filename*="):
-                file_name = part.split("=", 1)[1].strip()
-                if file_name.lower().startswith("utf-8''"):
-                    file_name = file_name[7:]
-                file_name = unquote(file_name)
-                break
-            elif part.strip().startswith("filename="):
-                file_name = part.split("=", 1)[1].strip().strip('"')
-                break
-        if file_name is None:
-            logger.info("Could not parse file name from headers")
-
-    except Exception as e:
-        logger.warning("Could not get file name from headers")
-
-    return file_name
-
-
-def get_idp_access_token(wts_hostname: str, auth: Gen3Auth, file_metadata: Dict) -> str:
-    """Get an access token for QDR using a Gen3 commons WTS"""
-    try:
-        logger.debug("Ready to get auth token")
-        wts_access_token = auth.get_access_token()
-        logger.debug("Ready to get idp token")
-        idp = file_metadata.get("external_oidc_idp")
-        idp_access_token = wts_get_token(
-            hostname=wts_hostname, idp=idp, access_token=wts_access_token
-        )
-    except Exception as e:
-        logger.critical(f"Could not get token: {e}")
-        return None
-
-    return idp_access_token
-
-
 def get_request_headers(idp_access_token: str) -> Dict:
     """
     Generate the request headers.
@@ -267,24 +147,6 @@ def get_request_headers(idp_access_token: str) -> Dict:
     return headers
 

-def get_id(file_metadata: Dict) -> str:
-    """
-    Parse out the object id from the metadata.
-
-    Args:
-        file_metadata (Dict)
-
-    Returns:
-        string
-    """
-    id_types = ["study_id", "file_id"]
-    for id_type in id_types:
-        if id_type in file_metadata:
-            return file_metadata.get(id_type)
-
-    return None
-
-
 def is_valid_qdr_file_metadata(file_metadata: Dict) -> bool:
     """
     Check that the file_metadata has the required keys:
@@ -308,9 +170,3 @@ def is_valid_qdr_file_metadata(file_metadata: Dict) -> bool:
         )
         return False
     return True


-def unpackage_object(filepath: str):
-    """Unpackage the downloaded zip file"""
-    with zipfile.ZipFile(filepath, "r") as package:
-        package.extractall(os.path.dirname(filepath))
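
The helpers removed above (download_from_url, get_filename_from_headers, get_idp_access_token, get_id, unpackage_object) now come from the shared utils module imported at the top of both qdr_downloads.py and harvard_downloads.py. The diff for that module is not shown in this section, so the sketch below reconstructs only the signatures implied by the import lines and by the qdr_url -> api_url call-site rename; treat it as an assumption about utils.py, not its actual contents.

# Sketch of the shared helper signatures implied by this PR (assumed, not the real utils.py).
import os
import zipfile
from typing import Dict


def get_id(file_metadata: Dict) -> str:
    """Return 'study_id' or 'file_id' from the metadata, else None (adapted from the code removed above)."""
    for id_type in ["study_id", "file_id"]:
        if id_type in file_metadata:
            return file_metadata.get(id_type)
    return None


def unpackage_object(filepath: str):
    """Extract a downloaded zip next to where it was saved (adapted from the code removed above)."""
    with zipfile.ZipFile(filepath, "r") as package:
        package.extractall(os.path.dirname(filepath))


def download_from_url(api_url: str, headers=None, download_path: str = ".") -> str:
    """Stream a file from api_url into download_path and return the saved path.
    The first parameter is now api_url (renamed from qdr_url) so that Harvard
    Dataverse (no auth headers) and QDR (WTS-derived headers) share one helper."""
    ...  # body as removed from qdr_downloads.py above, with qdr_url renamed


def get_idp_access_token(wts_hostname: str, auth, file_metadata: Dict) -> str:
    """Get an access token for QDR via the Gen3 commons WTS (assumed moved here unchanged)."""
    ...  # body as removed from qdr_downloads.py above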