adding mental health
spoonerf committed May 22, 2024
1 parent cea7177 commit 277b0a5
Showing 2 changed files with 88 additions and 0 deletions.
27 changes: 27 additions & 0 deletions snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc
@@ -0,0 +1,27 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
  is_public: false
  origin:
    # Data product / Snapshot
    title: Global Burden of Disease - Mental Health Prevalence
    description: |-
      The Global Burden of Disease (GBD) study provides a comprehensive assessment of global health trends. This dataset contains the prevalence of mental health conditions for a range of age groups across males and females.
    date_published: "2024-05-17"
    # Citation
    producer: Institute for Health Metrics and Evaluation, Global Burden of Disease Study
    citation_full: |-
      "Global Burden of Disease Collaborative Network. Global Burden of Disease Study 2021 (GBD 2021) Results. Seattle, United States: Institute for Health Metrics and Evaluation (IHME), 2022. Available from https://vizhub.healthdata.org/gbd-results/."
    attribution_short: "IHME-GBD"
    # Files
    url_main: https://vizhub.healthdata.org/gbd-results/
    date_accessed: 2024-05-20

    # License
    license:
      name: Free-of-Charge Non-commercial User Agreement
      url: https://www.healthdata.org/Data-tools-practices/data-practices/ihme-free-charge-non-commercial-user-agreement
outs:
  - md5: 7992fab7bb0ce5b0a5d4d1a5b94785ae
    size: 5376380405
    path: gbd_prevalence.csv
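Since the metadata block above is plain YAML, it can be inspected programmatically, which is handy for checking fields such as the recorded file size. A minimal sketch, assuming PyYAML is available and the snippet is run from the repository root:

import yaml

# Parse the snapshot metadata file committed above.
with open("snapshots/ihme_gbd/2024-05-20/gbd_mental_health.csv.dvc") as f:
    dvc_meta = yaml.safe_load(f)

print(dvc_meta["meta"]["origin"]["title"])  # Global Burden of Disease - Mental Health Prevalence
print(round(dvc_meta["outs"][0]["size"] / 1e9, 1), "GB")  # 5.4 GB, per the size recorded above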
61 changes: 61 additions & 0 deletions snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py
@@ -0,0 +1,61 @@
"""Script to create a snapshot of dataset.
To get the data follow the following steps:
Important - You need and account to access the data.
* Go to: https://vizhub.healthdata.org/gbd-results/
* In 'GBD Estimate' select 'Cause of death or injury'
* In Measure select 'Prevalence'
* In Metric select 'Number', 'Percent' and 'Rate'
* In Impairment select all under 'Mental Disorders' and 'Substance Use Disorders'
* In Location select 'Global', 'Select all countries and territories', each of the regions in the following groups: 'WHO region', 'World Bank Income Level' and 'World Bank Regions'
* In Age select 'All ages', 'Age-standardized', '<5 years', '5-14 years', '15-49 years', '50-69 years', '70+ years', 10 to 14, '15-19', '20-24', '25-29', '30-34', 35-39, 40-44, 45-49, 50-54, 55-59, 60-64, 65-69,
* In Sex select 'Both'
* In Year select 'Select all'
The data will then be requested and a download link will be sent to you with a number of zip files containing the data (approx < 10 files).
We will download and combine the files in the following script.
"""
from pathlib import Path

import click
import pandas as pd
from owid.datautils.dataframes import concatenate
from owid.repack import repack_frame
from shared import download_data
from structlog import get_logger

from etl.snapshot import Snapshot

log = get_logger()
# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name
# The base URL is the download URL provided by the IHME website, with the trailing file number and '.zip' (e.g. '1.zip') removed.
BASE_URL = "https://dl.healthdata.org:443/gbd-api-2021-public/a92a4a6e3c03db48983c60f4bab6129b_files/IHME-GBD_2021_DATA-a92a4a6e-"
NUMBER_OF_FILES = 39


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Create a new snapshot.
    snap = Snapshot(f"ihme_gbd/{SNAPSHOT_VERSION}/gbd_mental_health.csv")
    # Download data from source.
    dfs: list[pd.DataFrame] = []
    for file_number in range(1, NUMBER_OF_FILES + 1):
        log.info(f"Downloading file {file_number} of {NUMBER_OF_FILES}")
        df = download_data(file_number, base_url=BASE_URL)
        log.info(f"Download of file {file_number} finished", size=f"{df.memory_usage(deep=True).sum()/1e6:.2f} MB")
        dfs.append(df)

    # Concatenate the dataframes while keeping categorical columns to reduce memory usage.
    df = repack_frame(concatenate(dfs))

    log.info("Uploading final file", size=f"{df.memory_usage(deep=True).sum()/1e6:.2f} MB")
    snap.create_snapshot(upload=upload, data=df)


if __name__ == "__main__":
    main()

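The download_data helper is imported from shared, a module that sits alongside this script but is not among the files changed in this commit. Purely as an illustration, here is a minimal sketch of what such a helper might look like, assuming each numbered file is a zip archive served at base_url followed by the file number and '.zip' and containing a single CSV (the real helper may differ):

import io
import zipfile

import pandas as pd
import requests


def download_data(file_number: int, base_url: str) -> pd.DataFrame:
    # Build the per-file URL, e.g. f"{base_url}1.zip" for the first file.
    url = f"{base_url}{file_number}.zip"
    response = requests.get(url, timeout=600)
    response.raise_for_status()
    # The download is a zip archive; read the first CSV it contains into a dataframe.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        csv_name = next(name for name in zf.namelist() if name.endswith(".csv"))
        with zf.open(csv_name) as csv_file:
            return pd.read_csv(csv_file)

With shared.py in place, the snapshot script can be run directly, e.g. python snapshots/ihme_gbd/2024-05-20/gbd_mental_health.py --skip-upload to test the download and combination steps without uploading the result.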