diff --git a/heal/harvard_downloads.py b/heal/harvard_downloads.py
new file mode 100644
index 0000000..b5d314c
--- /dev/null
+++ b/heal/harvard_downloads.py
@@ -0,0 +1,268 @@
+"""
+This module includes an external file retriever function intended to be called
+by the external_files_download module in the Gen3-SDK.
+
+The retriever function sends requests to the Harvard Dataverse API for downloading studies or files.
+
+The Dataverse documentation describes how to download studies
+https://guides.dataverse.org/en/latest/api/dataaccess.html#basic-download-by-dataset
+
+and how to download files
+https://guides.dataverse.org/en/latest/api/dataaccess.html#basic-file-access
+
+In order to get an API token from the WTS server, users should have already
+sent a request to
+/oauth2/authorization_url?idp=externaldata-keycloak
+and logged in to Harvard Dataverse after the redirect.
+
+The WTS-SERVER is a Gen3 commons that has been configured to return
+tokens for the 'idp' specified in the external_file_metadata.
+"""
+
+import os
+
+from pathlib import Path
+import requests
+from typing import Dict, List, Tuple
+from urllib.parse import unquote
+import zipfile
+
+from cdislogging import get_logger
+from gen3.auth import Gen3Auth
+from gen3.tools.download.drs_download import DownloadStatus, wts_get_token
+
+logger = get_logger(__name__, log_level="debug")
+
+
+def get_harvard_dataverse_files(
+    wts_hostname: str, auth, file_metadata_list: List, download_path: str = "."
+) -> Dict:
+    """
+    Retrieves external data from Harvard Dataverse.
+
+    Args:
+        wts_hostname (str): hostname for commons with wts
+        auth (Gen3Auth): auth for commons with wts
+        file_metadata_list (List of Dict): list of studies or files
+        download_path (str): path to download files and unpack
+
+    Returns:
+        Dict of download status
+    """
+    if not Path(download_path).exists():
+        logger.critical(f"Download path does not exist: {download_path}")
+        return None
+
+    completed = {}
+    logger.debug(f"Input file metadata list={file_metadata_list}")
+
+    for file_metadata in file_metadata_list:
+        id = get_id(file_metadata)
+        if id is None:
+            logger.warning(
+                f"Could not find 'study_id' in metadata {file_metadata}"
+            )
+            continue
+        logger.info(f"ID = {id}")
+        completed[id] = DownloadStatus(filename=id, status="pending")
+
+        download_url = get_download_url_for_harvard_dataverse(file_metadata)
+        if download_url is None:
+            logger.critical(f"Could not get download_url for {id}")
+            completed[id].status = "invalid url"
+            continue
+
+        logger.debug(f"Ready to send request to download_url: GET {download_url}")
+        downloaded_file = download_from_url(
+            harvard_url=download_url,
+            headers=None,
+            download_path=download_path,
+        )
+        if downloaded_file is None:
+            completed[id].status = "failed"
+            continue
+
+        if downloaded_file.endswith("zip"):
+            # unpack if download is zip file
+            try:
+                logger.debug(f"Ready to unpack {downloaded_file}.")
+                unpackage_object(filepath=downloaded_file)
+            except Exception as e:
+                logger.critical(f"{id} had an issue while being unpackaged: {e}")
+                completed[id].status = "failed"
+                continue
+
+            completed[id].status = "downloaded"
+            # remove the zip file
+            Path(downloaded_file).unlink()
+        else:
+            completed[id].status = "downloaded"
+
+    if not completed:
+        return None
+    return completed
+
+
+def download_from_url(
+    harvard_url: str,
+    headers=None,
+    download_path: str = ".",
+) -> str:
+    """
+    Retrieve data file (study_id or file_id) from url.
+    Save the file based on the filename in the Content-Disposition response header.
+
+    Args:
+        harvard_url (str): url for Harvard Dataverse API
+        headers (Dict): request headers
+        download_path (str): path for saving downloaded zip file
+
+    Returns:
+        path to downloaded and renamed file.
+    """
+    try:
+        response = requests.get(url=harvard_url, headers=headers, stream=True)
+        response.raise_for_status()
+    except requests.exceptions.Timeout:
+        logger.critical(
+            f"Was unable to get the download url: {harvard_url}. Timeout Error."
+        )
+        return None
+    except requests.exceptions.HTTPError as exc:
+        logger.critical(f"HTTPError in download {exc}")
+        return None
+    except requests.exceptions.ConnectionError as exc:
+        logger.critical(f"ConnectionError in download {exc}")
+        return None
+    except Exception as exc:
+        logger.critical(f"Error in download {exc}")
+        return None
+    logger.debug(f"Status code={response.status_code}")
+    downloaded_file_name = get_filename_from_headers(response.headers)
+    if downloaded_file_name is None:
+        downloaded_file_name = harvard_url.split("/")[-1]
+        logger.info(f"Using file name from id in url {downloaded_file_name}")
+
+    if downloaded_file_name.endswith(
+        "zip"
+    ) and "application/zip" not in response.headers.get("Content-Type", ""):
+        logger.critical("Response headers do not show zipfile content-type")
+
+    total_downloaded = 0
+    block_size = 8192  # 8K blocks, might want to tune this.
+    download_filename = f"{download_path}/{downloaded_file_name}"
+    try:
+        logger.info(f"Saving download as {download_filename}")
+        with open(download_filename, "wb") as file:
+            for data in response.iter_content(block_size):
+                total_downloaded += len(data)
+                file.write(data)
+    except IOError as ex:
+        logger.critical(f"IOError opening {download_filename} for writing: {ex}")
+        return None
+
+    if total_downloaded == 0:
+        logger.critical("content-length is 0 and it should not be")
+        return None
+    logger.debug(f"Download size = {total_downloaded}")
+
+    return download_filename
+
+
+def get_download_url_for_harvard_dataverse(file_metadata: Dict) -> str:
+    """
+    Get the download url for Harvard Dataverse.
+
+    Args:
+        file_metadata (Dict)
+
+    Returns:
+        url, None if there are errors
+    """
+    base_url = "https://dataverse.harvard.edu/api/access"
+    if "use_harvard_staging" in file_metadata and bool(
+        file_metadata["use_harvard_staging"]
+    ):
+        base_url = "https://demo.dataverse.org/api/access"
+    if "study_id" in file_metadata:
+        url = f"{base_url}/dataset/:persistentId/?persistentId={file_metadata.get('study_id')}"
+    else:
+        url = None
+
+    return url
+
+
+def get_filename_from_headers(headers: Dict) -> str:
+    """
+    Parse and decode downloaded file name from response headers
+
+    Args:
+        headers (dict): response headers
+
+    Returns:
+        file name as string
+    """
+    try:
+        file_name = None
+        content_response = headers.get("Content-Disposition").split(";")
+        for part in content_response:
+            # look for UTF-8 encoded file name
+            if part.strip().startswith("filename*="):
+                file_name = part.split("=", 1)[1].strip()
+                if file_name.lower().startswith("utf-8''"):
+                    file_name = file_name[7:]
+                file_name = unquote(file_name)
+                break
+            elif part.strip().startswith("filename="):
+                file_name = part.split("=", 1)[1].strip().strip('"')
+                break
+        if file_name is None:
+            logger.info("Could not parse file name from headers")
+
+    except Exception as e:
+        logger.warning(f"Could not get file name from headers: {e}")
+
+    return file_name
+
+
+def get_id(file_metadata: Dict) -> str:
+    """
+    Parse out the object id from the metadata.
+
+    Args:
+        file_metadata (Dict)
+
+    Returns:
+        string
+    """
+    if "study_id" in file_metadata:
+        return file_metadata.get("study_id")
+
+    return None
+
+
+def is_valid_harvard_file_metadata(file_metadata: Dict) -> bool:
+    """
+    Check that the file_metadata has the required key:
+    'study_id'.
+
+    Args:
+        file_metadata (Dict)
+
+    Returns:
+        True if valid file_metadata object.
+    """
+    if not isinstance(file_metadata, dict):
+        logger.critical(f"Invalid metadata - item is not a dict: {file_metadata}")
+        return False
+    if "study_id" not in file_metadata:
+        logger.critical(
+            f"Invalid metadata - missing required Harvard Dataverse keys {file_metadata}"
+        )
+        return False
+    return True
+
+
+def unpackage_object(filepath: str):
+    """Unpackage the downloaded zip file"""
+    with zipfile.ZipFile(filepath, "r") as package:
+        package.extractall(os.path.dirname(filepath))
diff --git a/tests/test_harvard_download.py b/tests/test_harvard_download.py
new file mode 100644
index 0000000..c389c28
--- /dev/null
+++ b/tests/test_harvard_download.py
@@ -0,0 +1,302 @@
+import os
+from pathlib import Path
+import requests
+from typing import Dict, List, Tuple
+from unittest import mock
+
+import pytest
+import requests_mock
+from unittest.mock import MagicMock
+
+from gen3.tools.download.drs_download import DownloadStatus
+from heal.harvard_downloads import (
+    is_valid_harvard_file_metadata,
+    get_id,
+    get_download_url_for_harvard_dataverse,
+    get_filename_from_headers,
+    download_from_url,
+    get_harvard_dataverse_files,
+)
+
+
+@pytest.fixture(scope="session")
+def download_dir(tmpdir_factory):
+    path = tmpdir_factory.mktemp("harvard_download_dir")
+    return path
+
+
+@pytest.mark.parametrize(
+    "file_metadata",
+    [
+        (
+            {
+                "external_oidc_idp": "test-external-idp",
+                "file_retriever": "harvard",
+                "study_id": "harvard_study_01",
+            }
+        ),
+    ],
+)
+def test_is_valid_harvard_file_metadata(file_metadata: Dict):
+    assert is_valid_harvard_file_metadata(file_metadata) == True
+
+
+def test_is_valid_harvard_file_metadata_failed():
+    # missing keys
+    file_metadata = {
+        "external_oidc_idp": "test-external-idp",
+        "file_retriever": "harvard",
+        "description": "missing study_id or file_id",
+    }
+    assert is_valid_harvard_file_metadata(file_metadata) == False
+
+    # not a dict
+    file_metadata = ("file_metadata_is_not_a_dict",)
+    assert is_valid_harvard_file_metadata(file_metadata) == False
+
+
+@pytest.mark.parametrize(
+    "file_metadata, expected",
+    [
+        (
+            {
+                "external_oidc_idp": "test-external-idp",
+                "file_retriever": "harvard",
+                "study_id": "harvard_study_01",
+            },
+            "harvard_study_01",
+        ),
+    ],
+)
+def test_get_id(file_metadata: Dict, expected: str):
+    assert get_id(file_metadata) == expected
+
+
+def test_get_id_bad_input():
+    # missing study_id
+    file_metadata = {
+        "external_oidc_idp": "test-external-idp",
+        "file_retriever": "harvard",
+    }
+    assert get_id(file_metadata) == None
+
+
+@pytest.mark.parametrize(
+    "file_metadata, expected",
+    [
+        (
+            {
+                "study_id": "harvard_study_01",
+            },
+            "https://dataverse.harvard.edu/api/access/dataset/:persistentId/?persistentId=harvard_study_01",
+        ),
+    ],
+)
+def test_get_download_url_for_harvard_dataverse(file_metadata: Dict, expected: str):
+    assert get_download_url_for_harvard_dataverse(file_metadata) == expected
+
+
+@pytest.mark.parametrize(
+    "file_metadata_harvard_staging, expected",
+    [
+        (
+            {
+                "external_oidc_idp": "test-external-idp",
+                "file_retriever": "harvard",
+                "study_id": "harvard_study_01",
+                "use_harvard_staging": True,
+            },
"https://demo.dataverse.org/api/access/dataset/:persistentId/?persistentId=harvard_study_01", + ) + ], +) +def test_get_download_url_for_harvard_staging( + file_metadata_harvard_staging: Dict, expected: str +): + assert ( + get_download_url_for_harvard_dataverse(file_metadata_harvard_staging) + == expected + ) + + +def test_get_download_url_for_harvard_failed(): + # missing file_ids or study_id + file_metadata = {} + assert get_download_url_for_harvard_dataverse(file_metadata) == None + + +def test_get_filename_from_headers(): + # zip file for study_id + mock_zip_file_name = "test.zip" + mock_response_headers = { + "Content-Type": "application/zip", + "Content-Disposition": f"application; filename={mock_zip_file_name}", + } + assert get_filename_from_headers(mock_response_headers) == mock_zip_file_name + + # utf-8 encoded file name for file_id + mock_file_name = "test.pdf" + mock_response_headers = { + "Content-Type": "application/pdf", + "Content-Disposition": f"application; filename*=UTF-8''{mock_file_name}", + } + assert get_filename_from_headers(mock_response_headers) == mock_file_name + + +def test_get_filename_from_headers_invalid(): + mock_response_headers = { + "Content-Type": "application/pdf", + "Content-Disposition": f"application; filename", + } + assert get_filename_from_headers(mock_response_headers) == None + + +def test_download_from_url(download_dir): + request_headers = {"Authorization": "Bearer some-idp-token"} + mock_data = "foo" + mock_filename = "test.zip" + + with requests_mock.Mocker() as m: + # get study_id + mock_zip_file_name = "dataverse_files.zip" + harvard_url = "https://demo.dataverse.org/api/access/:persistentId/?persistentId=harvard_study_01" + valid_response_headers = { + "Content-Type": "application/zip", + "Content-Disposition": f"application; filename={mock_zip_file_name}", + } + m.get( + harvard_url, + headers=valid_response_headers, + content=bytes(mock_data, "utf-8"), + ) + download_filename = download_from_url( + harvard_url=harvard_url, + headers=request_headers, + download_path=download_dir, + ) + assert download_filename == f"{download_dir}/{mock_zip_file_name}" + assert os.path.exists(download_filename) + with open(download_filename, "r") as f: + assert f.read() == mock_data + + # cannot get downloaded file name from header - fall back to file id + mock_file_id = "123456" + mock_data = "foo" + response_headers = { + "Content-Disposition": "application; ", + } + harvard_url = ( + f"https://dataverse.harvard.edu/api/access/datafile/{mock_file_id}" + ) + m.get( + harvard_url, + headers=response_headers, + content=bytes(mock_data, "utf-8"), + ) + download_filename = download_from_url( + harvard_url=harvard_url, + headers=request_headers, + download_path=download_dir, + ) + # filename is from file_id + assert download_filename != f"{download_dir}/{mock_filename}" + assert download_filename == f"{download_dir}/{mock_file_id}" + assert os.path.exists(download_filename) + with open(download_filename, "r") as f: + assert f.read() == mock_data + + +def test_download_from_url_failures(download_dir): + request_headers = {"Authorization": "Bearer some-idp-token"} + valid_response_headers = {"Content-Type": "application/zip"} + mock_data = "foo" + mock_zip_file_name = "dataverse_files.zip" + download_filename = f"{download_dir}/dataverse_files.zip" + if os.path.exists(download_filename): + Path(download_filename).unlink() + + # bad url + downloaded_file = download_from_url( + harvard_url="https://bad_url", + headers=request_headers, + 
+        download_path=download_dir,
+    )
+    assert downloaded_file == None
+    assert not os.path.exists(download_filename)
+
+    with requests_mock.Mocker() as m:
+        valid_response_headers = {
+            "Content-Type": "application/zip",
+            "Content-Disposition": f"application; filename={mock_zip_file_name}",
+        }
+
+        # bad download path
+        harvard_url = "https://demo.dataverse.org/api/access/datafiles/some_id"
+        m.get(
+            harvard_url,
+            headers=valid_response_headers,
+            content=bytes(mock_data, "utf-8"),
+        )
+
+        download_file = download_from_url(
+            harvard_url=harvard_url,
+            headers=request_headers,
+            download_path="/path/does/not/exist",
+        )
+        assert download_file == None
+        assert not os.path.exists(download_filename)
+
+        # zero size response
+        m.get(harvard_url, headers=valid_response_headers, content=bytes())
+        download_file = download_from_url(
+            harvard_url=harvard_url,
+            headers=request_headers,
+            download_path=download_dir,
+        )
+        assert download_file == None
+        assert (os.path.getsize(download_filename)) == 0
+        Path(download_filename).unlink()
+
+
+def test_get_harvard_dataverse_files(download_dir):
+    test_data = "foo"
+    test_study_id = "some_id"
+    file_metadata_list = [
+        {
+            "file_retriever": "harvard_dataverse",
+            "study_id": test_study_id,
+        },
+    ]
+    expected_status = {
+        test_study_id: DownloadStatus(filename=test_study_id, status="downloaded")
+    }
+
+    valid_response_headers = {
+        "Content-Disposition": "attachment; filename=test_file.txt"
+    }
+
+    with requests_mock.Mocker() as m:
+        # Mocking the Harvard Dataverse API endpoint
+        m.get(
+            f"https://dataverse.harvard.edu/api/access/dataset/:persistentId/?persistentId={test_study_id}",
+            headers=valid_response_headers,
+            content=bytes(test_data, "utf-8"),
+        )
+
+        # Call the function to test
+        result = get_harvard_dataverse_files(
+            wts_hostname=None,  # Not required for Harvard Dataverse
+            auth=None,  # Not required for Harvard Dataverse
+            file_metadata_list=file_metadata_list,
+            download_path=download_dir,
+        )
+
+        # Check if the result matches the expected status
+        assert result == expected_status
+
+        # Verify the file was saved
+        downloaded_file_path = Path(download_dir) / "test_file.txt"
+        assert downloaded_file_path.exists()
+        assert downloaded_file_path.read_text() == test_data
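
Usage note (not part of the patch): a minimal sketch of calling the retriever directly,
based only on the signature added above. In practice the Gen3 SDK's external file
download module would invoke it via a retriever mapping; the study id, hostname, and
download directory below are illustrative assumptions, not values from this change.

    from pathlib import Path

    from heal.harvard_downloads import get_harvard_dataverse_files

    download_path = "./harvard_downloads"
    Path(download_path).mkdir(exist_ok=True)  # the retriever requires an existing directory

    file_metadata_list = [
        {
            "file_retriever": "harvard_dataverse",
            "study_id": "doi:10.7910/DVN/EXAMPLE",  # hypothetical persistent identifier
        },
    ]

    # wts_hostname and auth are part of the retriever interface but are not used by this
    # retriever; requests go unauthenticated to the public Dataverse access API.
    status = get_harvard_dataverse_files(
        wts_hostname="example-commons.org",  # illustrative only
        auth=None,
        file_metadata_list=file_metadata_list,
        download_path=download_path,
    )
    print(status)  # e.g. {"doi:10.7910/DVN/EXAMPLE": DownloadStatus(...)}, or None on failure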