From 277b0a5221b0ee6861e822434eb2527895c1fb36 Mon Sep 17 00:00:00 2001
From: spoonerf <spoonerfiona@gmail.com>
Date: Wed, 22 May 2024 17:20:13 +0100
Subject: [PATCH] adding mental health

---
 .../2024-05-20/gbd_mental_health.csv.dvc      | 27 ++++++++
 .../ihme_gbd/2024-05-20/gbd_mental_health.py  | 61 +++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc
 create mode 100644 snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py

diff --git a/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc b/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc
new file mode 100644
index 00000000000..4c2889293a4
--- /dev/null
+++ b/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc
@@ -0,0 +1,27 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  is_public: false
+  origin:
+    # Data product / Snapshot
+    title: Global Burden of Disease - Mental Health Prevalence
+    description: |-
+      The Global Burden of Disease (GBD) study provides a comprehensive assessment of global health trends. This dataset contains the prevalence of mental health conditions for a range of age-groups across males and females.
+    date_published: "2024-05-17"
+    # Citation
+    producer: Institute for Health Metrics and Evaluation, Global Burden of Disease Study
+    citation_full: |-
+      "Global Burden of Disease Collaborative Network. Global Burden of Disease Study 2021 (GBD 2021) Results. Seattle, United States: Institute for Health Metrics and Evaluation (IHME), 2022. Available from https://vizhub.healthdata.org/gbd-results/."
+    attribution_short: "IHME-GBD"
+    # Files
+    url_main: https://vizhub.healthdata.org/gbd-results/
+    date_accessed: 2024-05-20
+
+    # License
+    license:
+      name: Free-of-Charge Non-commercial User Agreement
+      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
+outs:
+  - md5: 7992fab7bb0ce5b0a5d4d1a5b94785ae
+    size: 5376380405
+    path: gbd_mental_health.csv
diff --git a/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py b/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py
new file mode 100644
index 00000000000..65e56095103
--- /dev/null
+++ b/snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py
@@ -0,0 +1,61 @@
+"""Script to create a snapshot of dataset.
+
+To get the data follow the following steps:
+
+Important - You need an account to access the data.
+
+* Go to: https://vizhub.healthdata.org/gbd-results/
+* In 'GBD Estimate' select 'Cause of death or injury'
+* In Measure select 'Prevalence'
+* In Metric select 'Number', 'Percent' and 'Rate'
+* In Impairment select all under 'Mental Disorders' and 'Substance Use Disorders'
+* In Location select 'Global', 'Select all countries and territories', each of the regions in the following groups: 'WHO region', 'World Bank Income Level' and 'World Bank Regions'
+* In Age select 'All ages', 'Age-standardized', '<5 years', '5-14 years', '15-49 years', '50-69 years', '70+ years', '10 to 14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69'
+* In Sex select 'Both'
+* In Year select 'Select all'
+
+The data will then be requested and you will be sent a download link to a number of zip files containing the data (39 files for this request; see NUMBER_OF_FILES below).
+
+We will download and combine the files in the following script.
+"""
+from pathlib import Path
+
+import click
+import pandas as pd
+from owid.datautils.dataframes import concatenate
+from owid.repack import repack_frame
+from shared import download_data
+from structlog import get_logger
+
+from etl.snapshot import Snapshot
+
+log = get_logger()
+# Version for current snapshot dataset, taken from the name of the folder containing this script.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+# Download URL given by the IHME website, with the trailing file number and '.zip' (e.g. '1.zip') removed.
+BASE_URL = "https://dl.healthdata.org:443/gbd-api-2021-public/a92a4a6e3c03db48983c60f4bab6129b_files/IHME-GBD_2021_DATA-a92a4a6e-"
+NUMBER_OF_FILES = 39  # main() requests files numbered 1 to NUMBER_OF_FILES
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Snapshot object for this version's gbd_mental_health.csv (version = SNAPSHOT_VERSION).
+    snap = Snapshot(f"ihme_gbd/{SNAPSHOT_VERSION}/gbd_mental_health.csv")
+    # Download each numbered file (1 to NUMBER_OF_FILES) via BASE_URL, keeping one dataframe per file.
+    dfs: list[pd.DataFrame] = []
+    for file_number in range(1, NUMBER_OF_FILES + 1):
+        log.info(f"Downloading file {file_number} of {NUMBER_OF_FILES}")
+        df = download_data(file_number, base_url=BASE_URL)
+        log.info(f"Download of file {file_number} finished", size=f"{df.memory_usage(deep=True).sum()/1e6:.2f} MB")
+        dfs.append(df)
+
+    # Concatenate the dataframes while keeping categorical columns to reduce memory usage.
+    df = repack_frame(concatenate(dfs))
+
+    log.info("Uploading final file", size=f"{df.memory_usage(deep=True).sum()/1e6:.2f} MB")
+    snap.create_snapshot(upload=upload, data=df)
+
+
+if __name__ == "__main__":
+    main()  # click parses the command-line options (--upload/--skip-upload)