From 277b0a5221b0ee6861e822434eb2527895c1fb36 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 22 May 2024 17:20:13 +0100 Subject: [PATCH] adding mental health --- .../2024-05-20/gbd_mental_health.csv.dvc | 27 ++++++++ .../ihme_gbd/2024-05-20/gbd_mental_health.py | 61 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc create mode 100644 snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py diff --git a/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc b/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc new file mode 100644 index 00000000000..4c2889293a4 --- /dev/null +++ b/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + is_public: false + origin: + # Data product / Snapshot + title: Global Burden of Disease - Mental Health Prevalence + description: |- + The Global Burden of Disease (GBD) study provides a comprehensive assessment of global health trends. This dataset contains the prevalence of mental health conditions for a range of age-groups across males and females. + date_published: "2024-05-17" + # Citation + producer: Institute for Health Metrics and Evaluation, Global Burden of Disease Study + citation_full: |- + "Global Burden of Disease Collaborative Network. Global Burden of Disease Study 2021 (GBD 2021) Results. Seattle, United States: Institute for Health Metrics and Evaluation (IHME), 2022. Available from https://vizhub.healthdata.org/gbd-results/."
+ attribution_short: "IHME-GBD" + # Files + url_main: https://vizhub.healthdata.org/gbd-results/ + date_accessed: 2024-05-20 + + # License + license: + name: Free-of-Charge Non-commercial User Agreement + url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement +outs: + - md5: 7992fab7bb0ce5b0a5d4d1a5b94785ae + size: 5376380405 + path: gbd_prevalence.csv diff --git a/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py b/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py new file mode 100644 index 00000000000..65e56095103 --- /dev/null +++ b/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py @@ -0,0 +1,61 @@ +"""Script to create a snapshot of the dataset. + +To get the data, follow these steps: + +Important - You need an account to access the data. + +* Go to: https://vizhub.healthdata.org/gbd-results/ +* In 'GBD Estimate' select 'Cause of death or injury' +* In Measure select 'Prevalence' +* In Metric select 'Number', 'Percent' and 'Rate' +* In Impairment select all under 'Mental Disorders' and 'Substance Use Disorders' +* In Location select 'Global', 'Select all countries and territories', each of the regions in the following groups: 'WHO region', 'World Bank Income Level' and 'World Bank Regions' +* In Age select 'All ages', 'Age-standardized', '<5 years', '5-14 years', '15-49 years', '50-69 years', '70+ years', '10 to 14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69' +* In Sex select 'Both' +* In Year select 'Select all' + +The data will then be requested and a download link will be sent to you with a number of zip files containing the data (the number varies with the request; it was 39 for this snapshot, see NUMBER_OF_FILES). + +The script below downloads and combines these files.
+"""
+from pathlib import Path
+
+import click
+import pandas as pd
+from owid.datautils.dataframes import concatenate
+from owid.repack import repack_frame
+from shared import download_data
+from structlog import get_logger
+
+from etl.snapshot import Snapshot
+
+log = get_logger()
+# Snapshot version, taken from the name of the directory that contains this script.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+# Download link given by the IHME website, cut off before the trailing file number and '.zip' (e.g. '1.zip').
+BASE_URL = "https://dl.healthdata.org:443/gbd-api-2021-public/a92a4a6e3c03db48983c60f4bab6129b_files/IHME-GBD_2021_DATA-a92a4a6e-"
+NUMBER_OF_FILES = 39  # Files are requested by number, from 1 up to and including this value.
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Snapshot for ihme_gbd/<SNAPSHOT_VERSION>/gbd_mental_health.csv; `upload` comes from --upload/--skip-upload.
+    snap = Snapshot(f"ihme_gbd/{SNAPSHOT_VERSION}/gbd_mental_health.csv")
+    # Download every numbered file in turn and collect the resulting dataframes.
+    dfs: list[pd.DataFrame] = []
+    for file_number in range(1, NUMBER_OF_FILES + 1):
+        log.info(f"Downloading file {file_number} of {NUMBER_OF_FILES}")
+        df = download_data(file_number, base_url=BASE_URL)  # from `shared`; presumably fetches <BASE_URL><n>.zip — confirm
+        log.info(f"Download of file {file_number} finished", size=f"{df.memory_usage(deep=True).sum()/1e6:.2f} MB")
+        dfs.append(df)
+
+    # Concatenate the dataframes while keeping categorical columns to reduce memory usage, then repack the result.
+    df = repack_frame(concatenate(dfs))
+
+    log.info("Uploading final file", size=f"{df.memory_usage(deep=True).sum()/1e6:.2f} MB")
+    snap.create_snapshot(upload=upload, data=df)  # `upload` is forwarded unchanged to the snapshot.
+
+
+if __name__ == "__main__":
+    main()