From 078e189271eadcc16d7f1cc97e9520b5a4eddc30 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 27 Nov 2024 18:50:36 +0100 Subject: [PATCH 01/35] =?UTF-8?q?=F0=9F=93=8A=20hmd=20update?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From 6ab0c4257b701dbefd8de329fd41d6e059d76eee Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 27 Nov 2024 19:17:56 +0100 Subject: [PATCH 02/35] snapshot --- snapshots/hmd/2024-11-27/hmd.py | 25 ++++++++++ snapshots/hmd/2024-11-27/hmd.zip.dvc | 74 ++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 snapshots/hmd/2024-11-27/hmd.py create mode 100644 snapshots/hmd/2024-11-27/hmd.zip.dvc diff --git a/snapshots/hmd/2024-11-27/hmd.py b/snapshots/hmd/2024-11-27/hmd.py new file mode 100644 index 00000000000..c5180a4dfa6 --- /dev/null +++ b/snapshots/hmd/2024-11-27/hmd.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"hmd/{SNAPSHOT_VERSION}/hmd.zip") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/hmd/2024-11-27/hmd.zip.dvc b/snapshots/hmd/2024-11-27/hmd.zip.dvc new file mode 100644 index 00000000000..63f4dfe3a51 --- /dev/null +++ b/snapshots/hmd/2024-11-27/hmd.zip.dvc @@ -0,0 +1,74 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Human Mortality Database + description: |- + The Human Mortality Database (HMD) contains original calculations of all-cause death rates and life tables for national populations (countries or areas), as well as the input data used in constructing those tables. The input data consist of death counts from vital statistics, plus census counts, birth counts, and population estimates from various sources. + + + # Scope and basic principles + + The database is limited by design to populations where death registration and census data are virtually complete, since this type of information is required for the uniform method used to reconstruct historical data series. As a result, the countries and areas included here are relatively wealthy and for the most part highly industrialized. + + The main goal of the Human Mortality Database is to document the longevity revolution of the modern era and to facilitate research into its causes and consequences. As much as possible, the authors of the database have followed four guiding principles: comparability, flexibility, accessibility, reproducibility. + + + # Computing death rates and life tables + + Their process for computing mortality rates and life tables can be described in terms of six steps, corresponding to six data types that are available from the HMD. Here is an overview of the process: + + 1. Births. Annual counts of live births by sex are collected for each population over the longest possible time period. These counts are used mainly for making population estimates at younger ages. + 2. Deaths. Death counts are collected at the finest level of detail available. If raw data are aggregated, uniform methods are used to estimate death counts by completed age (i.e., age-last-birthday at time of death), calendar year of death, and calendar year of birth. + 3. Population size. Annual estimates of population size on January 1st are either obtained from another source or are derived from census data plus birth and death counts. + 4. Exposure-to-risk. Estimates of the population exposed to the risk of death during some age-time interval are based on annual (January 1st) population estimates, with a small correction that reflects the timing of deaths within the interval. + 5. Death rates. Death rates are always a ratio of the death count for a given age-time interval divided by an estimate of the exposure-to-risk in the same interval. + 6. Life tables. To build a life table, probabilities of death are computed from death rates. These probabilities are used to construct life tables, which include life expectancies and other useful indicators of mortality and longevity. + + + # Corrections to the data + + The data presented here have been corrected for gross errors (e.g., a processing error whereby 3,800 becomes 38,000 in a published statistical table would be obvious in most cases, and it would be corrected). However, the authors have not attempted to correct the data for systematic age misstatement (misreporting of age) or coverage errors (over- or under-enumeration of people or events). + + Some available studies assess the completeness of census coverage or death registration in the various countries, and more work is needed in this area. However, in developing the database thus far, the authors did not consider it feasible or desirable to attempt corrections of this sort, especially since it would be impossible to correct the data by a uniform method across all countries. + + + # Age misreporting + + Populations are included here if there is a well-founded belief that the coverage of their census and vital registration systems is relatively high, and thus, that fruitful analyses by both specialists and non-specialists should be possible with these data. Nevertheless, there is evidence of both age heaping (overreporting ages ending in "0" or "5") and age exaggeration in these data. + + In general, the degree of age heaping in these data varies by the time period and population considered, but it is usually no burden to scientific analysis. In most cases, it is sufficient to analyze data in five-year age groups in order to avoid the false impressions created by this particular form of age misstatement. + + Age exaggeration, on the other hand, is a more insidious problem. The authors' approach is guided by the conventional wisdom that age reporting in death registration systems is typically more reliable than in census counts or official population estimates. For this reason, the authors derive population estimates at older ages from the death counts themselves, employing extinct cohort methods. Such methods eliminate some, but certainly not all, of the biases in old-age mortality estimates due to age exaggeration. + + + # Uniform set of procedures + + A key goal of this project is to follow a uniform set of procedures for each population. This approach does not guarantee the cross-national comparability of the data. Rather, it ensures only that the authors have not introduced biases by the authors' own manipulations. The desire of the authors for uniformity had to face the challenge that raw data come in a variety of formats (for example, 1-year versus 5-year age groups). The authors' general approach to this problem is that the available raw data are used first to estimate two quantities: 1) the number of deaths by completed age, year of birth, and year of death; and 2) population estimates by single years of age on January 1 of each year. For each population, these calculations are performed separately by sex. From these two pieces of information, they compute death rates and life tables in a variety of age-time configurations. + + It is reasonable to ask whether a single procedure is the best method for treating the data from a variety of populations. Here, two points must be considered. First, the authors' uniform methodology is based on procedures that were developed separately, though following similar principles, for various countries and by different researchers. Earlier methods were synthesized by choosing what they considered the best among alternative procedures and by eliminating superficial inconsistencies. The second point is that a uniform procedure is possible only because the authors have not attempted to correct the data for reporting and coverage errors. Although some general principles could be followed, such problems would have to be addressed individually for each population. + + Although the authors adhere strictly to a uniform procedure, the data for each population also receive significant individualized attention. Each country or area is assigned to an individual researcher, who takes responsibility for assembling and checking the data for errors. In addition, the person assigned to each country/area checks the authors' data against other available sources. These procedures help to assure a high level of data quality, but assistance from database users in identifying problems is always appreciated! + date_published: "2024-11-13" + # Citation + producer: Human Mortality Database + citation_full: |- + HMD. Human Mortality Database. Max Planck Institute for Demographic Research (Germany), University of California, Berkeley (USA), and French Institute for Demographic Studies (France). Available at www.mortality.org. + + See also the methods protocol: + Wilmoth, J. R., Andreev, K., Jdanov, D., Glei, D. A., Riffe, T., Boe, C., Bubenheim, M., Philipov, D., Shkolnikov, V., Vachon, P., Winant, C., & Barbieri, M. (2021). Methods protocol for the human mortality database (v6). [Available online](https://www.mortality.org/File/GetDocument/Public/Docs/MethodsProtocolV6.pdf) (needs log in to mortality.org). + attribution_short: HMD + # Files + url_main: https://www.mortality.org/Data/ZippedDataFiles + date_accessed: 2024-11-27 + + # License + license: + name: CC BY 4.0 + url: https://www.mortality.org/Data/UserAgreement + +outs: + - md5: ceed045241a19573e6621423b582558e + size: 147314590 + path: hmd.zip From 2f703dd0d62c821b1868459f3c608ae254f21239 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 27 Nov 2024 19:48:40 +0100 Subject: [PATCH 03/35] wip --- .../garden/hmd/2024-11-27/hmd.countries.json | 2 + .../2024-11-27/hmd.excluded_countries.json | 2 + .../data/garden/hmd/2024-11-27/hmd.meta.yml | 59 +++++++++++++++++++ etl/steps/data/garden/hmd/2024-11-27/hmd.py | 37 ++++++++++++ etl/steps/data/grapher/hmd/2024-11-27/hmd.py | 28 +++++++++ etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 32 ++++++++++ 6 files changed, 160 insertions(+) create mode 100644 etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json create mode 100644 etl/steps/data/garden/hmd/2024-11-27/hmd.excluded_countries.json create mode 100644 etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml create mode 100644 etl/steps/data/garden/hmd/2024-11-27/hmd.py create mode 100644 etl/steps/data/grapher/hmd/2024-11-27/hmd.py create mode 100644 etl/steps/data/meadow/hmd/2024-11-27/hmd.py diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json b/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json @@ -0,0 +1,2 @@ +{ +} diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.excluded_countries.json b/etl/steps/data/garden/hmd/2024-11-27/hmd.excluded_countries.json new file mode 100644 index 00000000000..0d4f101c7a3 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.excluded_countries.json @@ -0,0 +1,2 @@ +[ +] diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml new file mode 100644 index 00000000000..83a6e01604e --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml @@ -0,0 +1,59 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Life Expectancy + - Population Growth + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + hmd: + variables: + # testing_variable: + # title: Testing variable title + # unit: arbitrary units + # short_unit: au + # description_short: Short description of testing variable. + # description_processing: Description of processing of testing variable. + # description_key: List of key points about the indicator. + # description_from_producer: Description of testing variable from producer. + # processing_level: minor + # type: + # sort: + # presentation: + # attribution: + # attribution_short: + # faqs: + # grapher_config: + # title_public: + # title_variant: + # topic_tags: + # display: + # name: Testing variable + # numDecimalPlaces: 0 + # tolerance: 0 + # color: + # conversionFactor: 1 + # description: + # entityAnnotationsMap: Test annotation + # includeInTable: + # isProjection: false + # unit: arbitrary units + # shortUnit: au + # tableDisplay: + # hideAbsoluteChange: + # hideRelativeChange: + # yearIsDay: false + # zeroDay: + # roundingMode: + # numSignificantFigures: + # + {} + diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py new file mode 100644 index 00000000000..f68ab560172 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -0,0 +1,37 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("hmd") + + # Read table from meadow dataset. + tb = ds_meadow.read("hmd") + + # + # Process data. + # + tb = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/grapher/hmd/2024-11-27/hmd.py b/etl/steps/data/grapher/hmd/2024-11-27/hmd.py new file mode 100644 index 00000000000..e3ee265c8c8 --- /dev/null +++ b/etl/steps/data/grapher/hmd/2024-11-27/hmd.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("hmd") + + # Read table from garden dataset. + tb = ds_garden.read("hmd", reset_index=False) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py new file mode 100644 index 00000000000..9a8a6c4ed4a --- /dev/null +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("hmd.zip") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() From 55af18284fea8a3256a5ec59e5834c68bf538345 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 28 Nov 2024 00:23:52 +0100 Subject: [PATCH 04/35] dag --- dag/demography.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dag/demography.yml b/dag/demography.yml index 14880b11731..c632fafa5b8 100644 --- a/dag/demography.yml +++ b/dag/demography.yml @@ -277,3 +277,11 @@ steps: - data://meadow/hmd/2024-11-19/hfd data://grapher/hmd/2024-11-19/hfd: - data://garden/hmd/2024-11-19/hfd + + # Human Mortality Database + data://meadow/hmd/2024-11-27/hmd: + - snapshot://hmd/2024-11-27/hmd.zip + data://garden/hmd/2024-11-27/hmd: + - data://meadow/hmd/2024-11-27/hmd + data://grapher/hmd/2024-11-27/hmd: + - data://garden/hmd/2024-11-27/hmd From 428447e57006c8b154ba0f7902adf1995f0027f9 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 28 Nov 2024 15:33:01 +0100 Subject: [PATCH 05/35] archive old hmd --- dag/archive/demography.yml | 8 +++++++- dag/main.yml | 27 ++++++++------------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/dag/archive/demography.yml b/dag/archive/demography.yml index 185500becad..252fa1205de 100644 --- a/dag/archive/demography.yml +++ b/dag/archive/demography.yml @@ -52,9 +52,15 @@ steps: data-private://grapher/un/2024-07-11/un_wpp_full: - data-private://garden/un/2024-07-11/un_wpp - # Population density + # Population density data://garden/demography/2023-06-12/population_density: - data://garden/demography/2023-03-31/population - data://garden/faostat/2024-03-14/faostat_rl data://grapher/demography/2023-06-12/population_density: - data://garden/demography/2023-06-12/population_density + + # HMD + data://meadow/hmd/2022-12-07/life_tables: + - snapshot://hmd/2022-12-07/hmd.zip + data://garden/hmd/2022-12-07/life_tables: + - data://meadow/hmd/2022-12-07/life_tables diff --git a/dag/main.yml b/dag/main.yml index 7d4f3ee84bb..e6c493fc6ca 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -100,7 +100,6 @@ steps: data://grapher/homicide/2024-10-30/unodc: - data://garden/homicide/2024-10-30/unodc - # # UN data://grapher/un/2022-07-11/un_wpp: @@ -131,11 +130,6 @@ steps: - data://garden/regions/2023-01-01/regions data://grapher/technology/2022/internet: - data://garden/technology/2022/internet - # HMD - data://meadow/hmd/2022-12-07/life_tables: - - snapshot://hmd/2022-12-07/hmd.zip - data://garden/hmd/2022-12-07/life_tables: - - data://meadow/hmd/2022-12-07/life_tables # UNDP data://meadow/un/2024-04-09/undp_hdr: @@ -321,8 +315,6 @@ steps: data://grapher/terrorism/2023-07-20/global_terrorism_database: - data://garden/terrorism/2023-07-20/global_terrorism_database - - # Colonial Dates Dataset (COLDAT) data://meadow/harvard/2023-09-18/colonial_dates_dataset: - snapshot://harvard/2023-09-18/colonial_dates_dataset.csv @@ -355,7 +347,6 @@ steps: data://garden/gapminder/2023-09-21/under_five_mortality: - data://meadow/gapminder/2023-09-21/under_five_mortality - # Older vintage UN IGME (with longer time-series) data://meadow/un/2018/igme: - snapshot://un/2018/igme.csv @@ -763,7 +754,7 @@ steps: data-private://grapher/language/2024-07-17/ethnologue: - data-private://garden/language/2024-07-17/ethnologue -# Child Mortality Estimates - UN IGME + # Child Mortality Estimates - UN IGME data://meadow/un/2024-09-11/igme: - snapshot://un/2024-09-11/igme.zip data://garden/un/2024-09-11/igme: @@ -774,8 +765,7 @@ steps: data://grapher/un/2024-09-11/igme: - data://garden/un/2024-09-11/igme - -# Long-run child mortality, Gapminder + UN IGME + # Long-run child mortality, Gapminder + UN IGME data://garden/un/2024-09-16/long_run_child_mortality: - data://garden/un/2024-09-11/igme - data://garden/gapminder/2023-09-18/under_five_mortality @@ -783,8 +773,7 @@ steps: data://grapher/un/2024-09-16/long_run_child_mortality: - data://garden/un/2024-09-16/long_run_child_mortality - -# UN SDG (2024) + # UN SDG (2024) data://meadow/un/2024-08-27/un_sdg: - snapshot://un/2024-08-27/un_sdg.feather data://garden/un/2024-08-27/un_sdg: @@ -794,7 +783,7 @@ steps: data://grapher/un/2024-08-27/un_sdg: - data://garden/un/2024-08-27/un_sdg -# OECD Official Development Assistance (ODA) + # OECD Official Development Assistance (ODA) data://meadow/oecd/2024-08-21/official_development_assistance: - snapshot://oecd/2024-08-21/official_development_assistance_dac1.zip - snapshot://oecd/2024-08-21/official_development_assistance_dac2a.zip @@ -806,7 +795,7 @@ steps: data://grapher/oecd/2024-08-21/official_development_assistance: - data://garden/oecd/2024-08-21/official_development_assistance -# Oil Spills + # Oil Spills data://meadow/itopf/2024-10-16/oil_spills: - snapshot://itopf/2024-10-16/oil_spills.pdf data://garden/itopf/2024-10-16/oil_spills: @@ -814,7 +803,7 @@ steps: data://grapher/itopf/2024-10-16/oil_spills: - data://garden/itopf/2024-10-16/oil_spills -# UN SD census data + # UN SD census data data://meadow/un/2024-10-21/census_dates: - snapshot://un/2024-10-21/census_dates.csv data://garden/un/2024-10-21/census_dates: @@ -822,7 +811,7 @@ steps: data://grapher/un/2024-10-21/census_dates: - data://garden/un/2024-10-21/census_dates -# World Peace Foundation - Famines + # World Peace Foundation - Famines data://meadow/wpf/2024-10-03/famines: - snapshot://wpf/2024-10-03/famines.xlsx data://garden/wpf/2024-10-03/famines: @@ -861,7 +850,7 @@ steps: data-private://grapher/owid/latest/ig_countries: - data-private://garden/owid/latest/ig_countries -# Migration between regions, based on UN DESA flows + # Migration between regions, based on UN DESA flows data://garden/migration/2024-11-18/migration_between_regions: - data://garden/un/2024-07-16/migrant_stock - data://garden/regions/2023-01-01/regions From f1d66b3a3bfa0e3a34c79db47e30efa1b2710044 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 28 Nov 2024 16:52:12 +0100 Subject: [PATCH 06/35] wip --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index 9a8a6c4ed4a..5f50d7617de 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -1,11 +1,23 @@ """Load a snapshot and create a meadow dataset.""" +import os + from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +TABLES_LIFE_TABLES = [ + "lt_male", + "lt_female", + "lt_both", + "c_lt_male", + "c_lt_female", + "c_lt_both", +] + + def run(dest_dir: str) -> None: # # Load inputs. @@ -14,8 +26,8 @@ def run(dest_dir: str) -> None: snap = paths.load_snapshot("hmd.zip") # Load data from snapshot. - tb = snap.read() - + with snap.extract_to_tempdir() as tmpdir: + print(os.listdir(tmpdir)) # # Process data. # From 8dfef1988ccb599b7ac595b200ea6c363936aa15 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 28 Nov 2024 18:58:13 +0100 Subject: [PATCH 07/35] wip --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 185 +++++++++++++++++++- 1 file changed, 182 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index 5f50d7617de..935db9b9040 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -1,6 +1,13 @@ """Load a snapshot and create a meadow dataset.""" import os +import re +from io import StringIO +from pathlib import Path +from typing import Callable, List + +import owid.catalog.processing as pr +from owid.catalog import Table from etl.helpers import PathFinder, create_dataset @@ -8,6 +15,7 @@ paths = PathFinder(__file__) +# Life tables TABLES_LIFE_TABLES = [ "lt_male", "lt_female", @@ -16,6 +24,39 @@ "c_lt_female", "c_lt_both", ] +REGEX_LT = ( + r"(?P[a-zA-Z\-\s,]+), Life tables \((?P[a-zA-Z]+) (?P\d+x\d+)\), (?P[a-zA-Z]+)" + r"\tLast modified: (?P\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P(?s:.)*)" +) +COLUMNS_RENAME_LT = { + "mx": "central_death_rate", + "qx": "probability_of_death", + "ax": "average_survival_length", + "lx": "number_survivors", + "dx": "number_deaths", + "Lx": "number_person_years_lived", + "Tx": "number_person_years_remaining", + "ex": "life_expectancy", +} + +# Exposures +TABLES_EXPOSURES = [ + "c_exposures", + "exposures", +] +REGEX_EXP = ( + r"(?P[a-zA-Z\-\s,]+), Exposure to risk \((?P[a-zA-Z]+) (?P\d+x\d+)\),\s\tLast modified: " + r"(?P\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P(?s:.)*)" +) + +# Mortality +TABLES_M = [ + "deaths", +] +REGEX_M = ( + r"(?P[a-zA-Z\-\s,]+), Deaths \((?P[a-zA-Z]+) (?P\d+x\d+|Lexis triangle)\),\s\tLast modified: " + r"(?P\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P(?s:.)*)" +) def run(dest_dir: str) -> None: @@ -27,18 +68,156 @@ def run(dest_dir: str) -> None: # Load data from snapshot. with snap.extract_to_tempdir() as tmpdir: - print(os.listdir(tmpdir)) + # Life tables + tb_lt = make_tb( + path=Path(tmpdir), + main_folders=TABLES_LIFE_TABLES, + regex=REGEX_LT, + ) + # Exposure + tb_exp = make_tb( + path=Path(tmpdir), + main_folders=TABLES_EXPOSURES, + regex=REGEX_EXP, + ) + # Mortality + tb_m = make_tb( + path=Path(tmpdir), + main_folders=TABLES_M, + regex=REGEX_M, + ) # # Process data. # + # Column rename + ## e.g. "Lx -> lx" and "lx -> lx". This will cause an error when setting the index. + tb_lt = tb_lt.rename(columns=COLUMNS_RENAME_LT) + + # Check missing values + def _check_missing(tb, missing_row_max, missing_countries_max): + row_nans = tb.isna().any(axis=1) + assert ( + row_nans.sum() / len(tb) < missing_row_max + ), f"Too many missing values in life tables: {row_nans.sum()/len(tb)}" + + # Countries missing + countries_missing_data = tb.loc[row_nans, "country"].unique() + assert ( + len(countries_missing_data) / len(tb) < missing_countries_max + ), f"Too many missing values in life tables: {len(countries_missing_data)}" + + _check_missing(tb_lt, 0.01, 14) + _check_missing(tb_exp, 0.23, 47) + _check_missing(tb_m, 0.001, 1) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. - tb = tb.format(["country", "year"]) + tables = [ + tb_lt.format(["country", "year", "sex", "age", "type", "format"]), + tb_exp.format(["country", "year", "sex", "age", "type", "format"]), + tb_m.format(["country", "year", "sex", "age", "type", "format"]), + ] # # Save outputs. # # Create a new meadow dataset with the same metadata as the snapshot. - ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + ds_meadow = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=snap.metadata, + ) # Save changes in the new meadow dataset. ds_meadow.save() + + +def make_tb(path: Path, main_folders: List[str], regex: str) -> Table: + """Create table from multiple category folders. + + It inspects the content in `main_folders` (should be in `path`), and looks for TXT files to parse into tables. + + The output is a table with the relevant indicators and dimensions for all the categories. + + Arguments: + path: Path where the HMD export is located. + main_folders: List of folders to consider in `path`. These should typically be categories, which + group different individual indicators + regex: Regex to extract the metadata for a set of TXTs file found in main_folders. We need this + because the structure of the header in the TXT files slightly varies depending on + the indicator. + """ + # List with all relevant tables + tbs = [] + # Iterate over each top-level folder + for category_folder in main_folders: + main_folder_path = path / category_folder + if not main_folder_path.is_dir(): + raise FileNotFoundError(f"Folder {main_folder_path} not found in {path}") + # Iterate over each indicator folder + for indicator_path in main_folder_path.iterdir(): + if "lexis" in indicator_path.name: + continue + if indicator_path.is_dir(): + # Read all TXT files in the indicator folder, and put them as a single table + paths.log.info(f"Creating list of tables from available files in {path}...") + files = list(indicator_path.glob("*.txt")) + tbs_ = [make_tb_from_txt(f, regex) for f in files] + tbs.extend(tbs_) + # Concatenate all dataframes + tb = pr.concat(tbs, ignore_index=True) + return tb + + +def make_tb_from_txt(text_path: Path, regex: str) -> Table: + """Create a table from a TXT file.""" + # Extract fields + groups = extract_fields(regex, text_path) + + # Build df + tb = parse_table(groups["data"]) + + # Optional melt + if ("Female" in tb.columns) & ("Male" in tb.columns): + tb = tb.melt(id_vars=["Age", "Year"], var_name="sex", value_name="deaths") + + # Add dimensions + tb = tb.assign( + country=groups["country"], + type=groups["type"], + format=groups["format"], + ) + + # Optional sex column + if "sex" in groups: + tb["sex"] = groups["sex"] + + return tb + + +def extract_fields(regex: str, path: Path) -> dict: + """Structure the fields in the raw TXT file.""" + # Read single file + with open(path, "r") as f: + text = f.read() + # Get relevant fields + match = re.search(regex, text) + if match is not None: + groups = match.groupdict() + else: + raise ValueError(f"No match found in {f}! Please revise that source files' content matches FILE_REGEX.") + return groups + + +def parse_table(data_raw: str): + """Given the raw data from the TXT file (as string) map it to a table.""" + tb_str = data_raw.strip() + tb_str = re.sub(r"\n\s+", "\n", tb_str) + tb_str = re.sub(r"[^\S\r\n]+", "\t", string=tb_str) + tb = pr.read_csv( + StringIO(tb_str), + sep="\t", + na_values=["."], + ) + + return tb From 2297dbb062638326bfc8b16c82a3df726101c8d0 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 28 Nov 2024 18:59:28 +0100 Subject: [PATCH 08/35] ci/cd --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index 935db9b9040..1ef96bd2a9a 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -1,10 +1,9 @@ """Load a snapshot and create a meadow dataset.""" -import os import re from io import StringIO from pathlib import Path -from typing import Callable, List +from typing import List import owid.catalog.processing as pr from owid.catalog import Table From 7fc89b62fd41be79b6a8eb0b42c83a25815f23b9 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 29 Nov 2024 12:43:36 +0100 Subject: [PATCH 09/35] wip --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 103 ++++++++++++++------ 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index 1ef96bd2a9a..cb671539e7d 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -15,7 +15,7 @@ # Life tables -TABLES_LIFE_TABLES = [ +FOLDERS_LT = [ "lt_male", "lt_female", "lt_both", @@ -39,21 +39,37 @@ } # Exposures -TABLES_EXPOSURES = [ +FOLDERS_EXPOSURES = [ "c_exposures", "exposures", ] REGEX_EXP = ( - r"(?P[a-zA-Z\-\s,]+), Exposure to risk \((?P[a-zA-Z]+) (?P\d+x\d+)\),\s\tLast modified: " + r"(?P[a-zA-Z\-\s,]+), (?PExposure) to risk \((?P[a-zA-Z]+) (?P\d+x\d+)\),\s\tLast modified: " r"(?P\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P(?s:.)*)" ) # Mortality -TABLES_M = [ +FOLDERS_MOR = [ "deaths", ] -REGEX_M = ( - r"(?P[a-zA-Z\-\s,]+), Deaths \((?P[a-zA-Z]+) (?P\d+x\d+|Lexis triangle)\),\s\tLast modified: " +REGEX_MOR = ( + r"(?P[a-zA-Z\-\s,]+), (?PDeaths) \((?P[a-zA-Z]+) (?P\d+x\d+|Lexis triangle)\),\s\tLast modified: " + r"(?P\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P(?s:.)*)" +) +# Population +FOLDERS_POP = [ + "population", +] +REGEX_POP = ( + r"(?P[a-zA-Z\-\s,]+)(,\s)?(?PPopulation) size \((?P1\-year|abridged)\)\s+Last modified: " + r"(?P\d+ [a-zA-Z]{3} \d+)(; Methods Protocol: v\d+ \(\d+\)|,MPv\d \(in development\))\n\n(?P(?s:.)*)" +) +# Births +FOLDERS_BIRTHS = [ + "births", +] +REGEX_BIRTHS = ( + r"(?P[a-zA-Z\-\s,]+),\s+(?PBirths) \((?P1\-year)\)\s+Last modified: " r"(?P\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P(?s:.)*)" ) @@ -67,23 +83,37 @@ def run(dest_dir: str) -> None: # Load data from snapshot. with snap.extract_to_tempdir() as tmpdir: + # Population + tb_pop = make_tb( + path=Path(tmpdir), + main_folders=FOLDERS_POP, + regex=REGEX_POP, + ) + # Life tables tb_lt = make_tb( path=Path(tmpdir), - main_folders=TABLES_LIFE_TABLES, + main_folders=FOLDERS_LT, regex=REGEX_LT, ) # Exposure tb_exp = make_tb( path=Path(tmpdir), - main_folders=TABLES_EXPOSURES, + main_folders=FOLDERS_EXPOSURES, regex=REGEX_EXP, ) # Mortality tb_m = make_tb( path=Path(tmpdir), - main_folders=TABLES_M, - regex=REGEX_M, + main_folders=FOLDERS_MOR, + regex=REGEX_MOR, + ) + + # Births + tb_bi = make_tb( + path=Path(tmpdir), + main_folders=FOLDERS_BIRTHS, + regex=REGEX_BIRTHS, ) # # Process data. @@ -92,28 +122,25 @@ def run(dest_dir: str) -> None: ## e.g. "Lx -> lx" and "lx -> lx". This will cause an error when setting the index. tb_lt = tb_lt.rename(columns=COLUMNS_RENAME_LT) + # Invert 'abridged' <-> '1-year' in the type column + message = "Types 'abridged' and '1-year' might not be reversed anymore!" + assert not tb_pop.loc[tb_pop["type"] == "abridged", "Age"].str.contains("-").any(), message + assert tb_pop.loc[tb_pop["type"] == "1-year", "Age"].str.contains("80-84").any(), message + tb_pop["type"] = tb_pop["type"].map(lambda x: "1-year" if x == "abridged" else "abridged" if x == "1-year" else x) + # Check missing values - def _check_missing(tb, missing_row_max, missing_countries_max): - row_nans = tb.isna().any(axis=1) - assert ( - row_nans.sum() / len(tb) < missing_row_max - ), f"Too many missing values in life tables: {row_nans.sum()/len(tb)}" - - # Countries missing - countries_missing_data = tb.loc[row_nans, "country"].unique() - assert ( - len(countries_missing_data) / len(tb) < missing_countries_max - ), f"Too many missing values in life tables: {len(countries_missing_data)}" - - _check_missing(tb_lt, 0.01, 14) - _check_missing(tb_exp, 0.23, 47) - _check_missing(tb_m, 0.001, 1) + _check_nas(tb_lt, 0.01, 14) + _check_nas(tb_exp, 0.23, 47) + _check_nas(tb_m, 0.001, 1) + _check_nas(tb_pop, 0.001, 1) # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. tables = [ tb_lt.format(["country", "year", "sex", "age", "type", "format"]), tb_exp.format(["country", "year", "sex", "age", "type", "format"]), tb_m.format(["country", "year", "sex", "age", "type", "format"]), + tb_pop.format(["country", "year", "sex", "age", "type"]), + tb_bi.format(["country", "year", "sex", "type"]), ] # @@ -177,19 +204,25 @@ def make_tb_from_txt(text_path: Path, regex: str) -> Table: tb = parse_table(groups["data"]) # Optional melt - if ("Female" in tb.columns) & ("Male" in tb.columns): - tb = tb.melt(id_vars=["Age", "Year"], var_name="sex", value_name="deaths") + if ("Female" in tb.columns) and ("Male" in tb.columns): + id_vars = [col for col in ["Age", "Year"] if col in tb.columns] + if "name" not in groups: + raise ValueError( + f"Indicator name not found in {text_path}! Please revise that source files' content matches FILE_REGEX." + ) + tb = tb.melt(id_vars=id_vars, var_name="sex", value_name=groups["name"]) # Add dimensions tb = tb.assign( country=groups["country"], type=groups["type"], - format=groups["format"], ) # Optional sex column if "sex" in groups: tb["sex"] = groups["sex"] + if "format" in groups: + tb["format"] = groups["format"] return tb @@ -220,3 +253,17 @@ def parse_table(data_raw: str): ) return tb + + +def _check_nas(tb, missing_row_max, missing_countries_max): + """Check missing values & countries in data.""" + row_nans = tb.isna().any(axis=1) + assert ( + row_nans.sum() / len(tb) < missing_row_max + ), f"Too many missing values in life tables: {row_nans.sum()/len(tb)}" + + # Countries missing + countries_missing_data = tb.loc[row_nans, "country"].unique() + assert ( + len(countries_missing_data) / len(tb) < missing_countries_max + ), f"Too many missing values in life tables: {len(countries_missing_data)}" From 9d27cd9842abd90df1c645a0e19cbdc8d318f4bc Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 29 Nov 2024 13:05:27 +0100 Subject: [PATCH 10/35] wip --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 32 ++++++++++++++------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index cb671539e7d..2e8fd24e503 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -115,14 +115,14 @@ def run(dest_dir: str) -> None: main_folders=FOLDERS_BIRTHS, regex=REGEX_BIRTHS, ) - # - # Process data. - # - # Column rename + + # Life tables + ## Column rename ## e.g. "Lx -> lx" and "lx -> lx". This will cause an error when setting the index. tb_lt = tb_lt.rename(columns=COLUMNS_RENAME_LT) - # Invert 'abridged' <-> '1-year' in the type column + # Population + ## Invert 'abridged' <-> '1-year' in the type column message = "Types 'abridged' and '1-year' might not be reversed anymore!" assert not tb_pop.loc[tb_pop["type"] == "abridged", "Age"].str.contains("-").any(), message assert tb_pop.loc[tb_pop["type"] == "1-year", "Age"].str.contains("80-84").any(), message @@ -134,13 +134,25 @@ def run(dest_dir: str) -> None: _check_nas(tb_m, 0.001, 1) _check_nas(tb_pop, 0.001, 1) + # Ensure correct year dtype + def _remove_range_years(tb): + flag = tb["Year"].astype("string").str.contains("-") + tb = tb.loc[~flag] + return tb + + tb_lt = _remove_range_years(tb_lt) + tb_exp = _remove_range_years(tb_exp) + tb_m = _remove_range_years(tb_m) + tb_pop = _remove_range_years(tb_pop) + tb_bi = _remove_range_years(tb_bi) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. tables = [ - tb_lt.format(["country", "year", "sex", "age", "type", "format"]), - tb_exp.format(["country", "year", "sex", "age", "type", "format"]), - tb_m.format(["country", "year", "sex", "age", "type", "format"]), - tb_pop.format(["country", "year", "sex", "age", "type"]), - tb_bi.format(["country", "year", "sex", "type"]), + tb_lt.format(["country", "year", "sex", "age", "type", "format"], short_name="life_tables"), + tb_exp.format(["country", "year", "sex", "age", "type", "format"], short_name="exposures"), + tb_m.format(["country", "year", "sex", "age", "type", "format"], short_name="deaths"), + tb_pop.format(["country", "year", "sex", "age", "type"], short_name="population"), + tb_bi.format(["country", "year", "sex", "type"], short_name="births"), ] # From ae71868dbc696630caf9d051a7d5f4ee4d92f93b Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 29 Nov 2024 13:12:33 +0100 Subject: [PATCH 11/35] wip --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index 2e8fd24e503..b0f39dd23dc 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -135,16 +135,18 @@ def run(dest_dir: str) -> None: _check_nas(tb_pop, 0.001, 1) # Ensure correct year dtype - def _remove_range_years(tb): + def _clean_year(tb): + # Remove year ranges, and convert to int flag = tb["Year"].astype("string").str.contains("-") tb = tb.loc[~flag] + tb["Year"] = tb["Year"].astype(int) return tb - tb_lt = _remove_range_years(tb_lt) - tb_exp = _remove_range_years(tb_exp) - tb_m = _remove_range_years(tb_m) - tb_pop = _remove_range_years(tb_pop) - tb_bi = _remove_range_years(tb_bi) + tb_lt = _clean_year(tb_lt) + tb_exp = _clean_year(tb_exp) + tb_m = _clean_year(tb_m) + tb_pop = _clean_year(tb_pop) + tb_bi = _clean_year(tb_bi) # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. tables = [ From ce2b41e2fd4162efdbbe2037f4f258bd2836f37d Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 29 Nov 2024 13:16:58 +0100 Subject: [PATCH 12/35] wip --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index b0f39dd23dc..8ac32a2986b 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -139,7 +139,7 @@ def _clean_year(tb): # Remove year ranges, and convert to int flag = tb["Year"].astype("string").str.contains("-") tb = tb.loc[~flag] - tb["Year"] = tb["Year"].astype(int) + tb["Year"] = tb["Year"].astype("string") return tb tb_lt = _clean_year(tb_lt) From 36b8f8189ea05a49edaa3baea50778ae09379a0e Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 29 Nov 2024 18:35:29 +0100 Subject: [PATCH 13/35] wip --- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 43 ++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index f68ab560172..4d6b2d5db5a 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -15,11 +15,33 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("hmd") # Read table from meadow dataset. - tb = ds_meadow.read("hmd") + paths.log.info("reading tables") + tb_lt = ds_meadow.read("life_tables") + tb_exp = ds_meadow.read("exposures") + tb_mort = ds_meadow.read("deaths") + tb_pop = ds_meadow.read("population") + tb_births = ds_meadow.read("births") # # Process data. # + + # Life tables + tb_lt = standardize_sex_cat_names(tb_lt) + tb_lt = tb_lt.sort_values("format").drop_duplicates( + subset=[col for col in tb_lt.columns if col != "format"], keep="first" + ) + + ## Check + summary = tb_lt.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") + num_dups = summary.loc[summary["size"] != 1].shape[0] + assert num_dups <= 19 + ## Final drops + tb_lt = tb_lt.loc[~(tb_lt["format"] == "5x1") & (tb_lt["age"] == "110+")] + + # Exposures + tb_exp = standardize_sex_cat_names(tb_exp, {"female", "male", "total"}) + tb = geo.harmonize_countries( df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path ) @@ -35,3 +57,22 @@ def run(dest_dir: str) -> None: # Save changes in the new garden dataset. ds_garden.save() + + +def standardize_sex_cat_names(tb, sex_expected=None): + # Define expected sex categories + if sex_expected is None: + sex_expected = {"females", "males", "total"} + else: + sex = {s.lower() for s in sex_expected} + + # Set sex categories to lowercase + tb["sex"] = tb["sex"].str.lower() + + # Sanity check categories + sex_found = set(tb["sex"].unique()) + assert sex_found == sex_expected, f"Unexpected sex categories! Found {sex_found} but expected {sex_expected}" + + # Rename + tb["sex"] = tb["sex"].replace({"females": "female", "males": "male"}) + return tb From 5f71e294703e58dd48276949974f789b8772a968 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 13:21:36 +0100 Subject: [PATCH 14/35] change column names --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index 8ac32a2986b..420de54098f 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -61,7 +61,7 @@ "population", ] REGEX_POP = ( - r"(?P[a-zA-Z\-\s,]+)(,\s)?(?PPopulation) size \((?P1\-year|abridged)\)\s+Last modified: " + r"(?P[a-zA-Z\-\s,]+?),?\s?(?PPopulation) size \((?P1\-year|abridged)\)\s+Last modified: " r"(?P\d+ [a-zA-Z]{3} \d+)(; Methods Protocol: v\d+ \(\d+\)|,MPv\d \(in development\))\n\n(?P(?s:.)*)" ) # Births @@ -69,7 +69,7 @@ "births", ] REGEX_BIRTHS = ( - r"(?P[a-zA-Z\-\s,]+),\s+(?PBirths) \((?P1\-year)\)\s+Last modified: " + r"(?P[a-zA-Z\-\s,]+),\s+(?PBirths) \((?P1\-year)\)\s+Last modified: " r"(?P\d+ [a-zA-Z]{3} \d+); Methods Protocol: v\d+ \(\d+\)\n\n(?P(?s:.)*)" ) @@ -124,9 +124,11 @@ def run(dest_dir: str) -> None: # Population ## Invert 'abridged' <-> '1-year' in the type column message = "Types 'abridged' and '1-year' might not be reversed anymore!" - assert not tb_pop.loc[tb_pop["type"] == "abridged", "Age"].str.contains("-").any(), message - assert tb_pop.loc[tb_pop["type"] == "1-year", "Age"].str.contains("80-84").any(), message - tb_pop["type"] = tb_pop["type"].map(lambda x: "1-year" if x == "abridged" else "abridged" if x == "1-year" else x) + assert not tb_pop.loc[tb_pop["format"] == "abridged", "Age"].str.contains("-").any(), message + assert tb_pop.loc[tb_pop["format"] == "1-year", "Age"].str.contains("80-84").any(), message + tb_pop["format"] = tb_pop["format"].map( + lambda x: "1-year" if x == "abridged" else "abridged" if x == "1-year" else x + ) # Check missing values _check_nas(tb_lt, 0.01, 14) @@ -153,8 +155,8 @@ def _clean_year(tb): tb_lt.format(["country", "year", "sex", "age", "type", "format"], short_name="life_tables"), tb_exp.format(["country", "year", "sex", "age", "type", "format"], short_name="exposures"), tb_m.format(["country", "year", "sex", "age", "type", "format"], short_name="deaths"), - tb_pop.format(["country", "year", "sex", "age", "type"], short_name="population"), - tb_bi.format(["country", "year", "sex", "type"], short_name="births"), + tb_pop.format(["country", "year", "sex", "age", "format"], short_name="population"), + tb_bi.format(["country", "year", "sex", "format"], short_name="births"), ] # @@ -211,6 +213,7 @@ def make_tb(path: Path, main_folders: List[str], regex: str) -> Table: def make_tb_from_txt(text_path: Path, regex: str) -> Table: """Create a table from a TXT file.""" + # print(text_path) # Extract fields groups = extract_fields(regex, text_path) @@ -229,7 +232,6 @@ def make_tb_from_txt(text_path: Path, regex: str) -> Table: # Add dimensions tb = tb.assign( country=groups["country"], - type=groups["type"], ) # Optional sex column @@ -237,7 +239,8 @@ def make_tb_from_txt(text_path: Path, regex: str) -> Table: tb["sex"] = groups["sex"] if "format" in groups: tb["format"] = groups["format"] - + if "type" in groups: + tb["type"] = groups["type"] return tb From 26d47e58c2b47ad63f2fee752d637eebe7d86ff8 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 13:27:52 +0100 Subject: [PATCH 15/35] wip --- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 74 ++++++++++++++++++--- 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index 4d6b2d5db5a..be991cdc8cf 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -26,33 +26,89 @@ def run(dest_dir: str) -> None: # Process data. # - # Life tables + # 1/ Life tables tb_lt = standardize_sex_cat_names(tb_lt) tb_lt = tb_lt.sort_values("format").drop_duplicates( subset=[col for col in tb_lt.columns if col != "format"], keep="first" ) - ## Check summary = tb_lt.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") - num_dups = summary.loc[summary["size"] != 1].shape[0] - assert num_dups <= 19 + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.shape[0] <= 19, "Found duplicated rows in life tables!" + assert (row_dups["country"].unique() == "Switzerland").all() & ( + row_dups["year"] <= 1931 + ).all(), "Unexpected duplicates in life tables!" ## Final drops tb_lt = tb_lt.loc[~(tb_lt["format"] == "5x1") & (tb_lt["age"] == "110+")] + tb_lt = tb_lt.drop(columns="format") - # Exposures + # 2/ Exposures tb_exp = standardize_sex_cat_names(tb_exp, {"female", "male", "total"}) + tb_exp = tb_exp.sort_values("format").drop_duplicates( + subset=[col for col in tb_exp.columns if col != "format"], keep="first" + ) + ## Check + summary = tb_exp.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.empty, "Found duplicated rows in life tables!" + ## Final drops + tb_exp = tb_exp.drop(columns="format") + + # 3/ Mortality + tb_mort = standardize_sex_cat_names(tb_mort, {"female", "male", "total"}) + tb_mort = tb_mort.sort_values("format").drop_duplicates( + subset=[col for col in tb_mort.columns if col != "format"], keep="first" + ) + ## Check + summary = tb_mort.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.empty, "Found duplicated rows in life tables!" + ## Final drops + tb_mort = tb_mort.drop(columns="format") + + # 4/ Population + tb_pop = standardize_sex_cat_names(tb_pop, {"female", "male", "total"}) + tb_pop = tb_pop.sort_values("format").drop_duplicates( + subset=[col for col in tb_pop.columns if col != "format"], keep="first" + ) + summary = tb_pop.groupby(["country", "year", "sex", "age"], as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.empty, "Found duplicated rows in life tables!" + ## Final drops + tb_pop = tb_pop.drop(columns="format") - tb = geo.harmonize_countries( - df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + # 4/ Population + tb_births = standardize_sex_cat_names(tb_births, {"female", "male", "total"}) + tb_births = tb_births.sort_values("format").drop_duplicates( + subset=[col for col in tb_births.columns if col != "format"], keep="first" ) - tb = tb.format(["country", "year"]) + summary = tb_births.groupby(["country", "year", "sex"], as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.empty, "Found duplicated rows in life tables!" + ## Final drops + tb_births = tb_births.drop(columns="format") + + # tb = geo.harmonize_countries( + # df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + # ) + + tables = [ + tb_lt.format(["country", "year", "sex", "age", "type"]), + tb_exp.format(["country", "year", "sex", "age", "type"]), + tb_mort.format(["country", "year", "sex", "age", "type"]), + tb_pop.format(["country", "year", "sex", "age"]), + tb_births.format(["country", "year", "sex"]), + ] # # Save outputs. # # Create a new garden dataset with the same metadata as the meadow dataset. ds_garden = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, ) # Save changes in the new garden dataset. From ac6cc3b83d24a7bebb0e8e0955933b6343284169 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 15:21:30 +0100 Subject: [PATCH 16/35] wip --- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 114 +++++++++++--------- 1 file changed, 61 insertions(+), 53 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index be991cdc8cf..ca6eb83b327 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -25,68 +25,47 @@ def run(dest_dir: str) -> None: # # Process data. # - # 1/ Life tables - tb_lt = standardize_sex_cat_names(tb_lt) - tb_lt = tb_lt.sort_values("format").drop_duplicates( - subset=[col for col in tb_lt.columns if col != "format"], keep="first" + def _sanity_check_lt(tb): + summary = tb.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.shape[0] <= 19, "Found duplicated rows in life tables!" + assert (row_dups["country"].unique() == "Switzerland").all() & ( + row_dups["year"] <= 1931 + ).all(), "Unexpected duplicates in life tables!" + tb = tb.loc[~(tb["format"] == "5x1") & (tb["age"] == "110+")] + return tb + + tb_lt = reshape_table( + tb=tb_lt, + col_index=["country", "year", "sex", "age", "type"], + sex_expected={"females", "males", "total"}, + callback_post=_sanity_check_lt, ) - ## Check - summary = tb_lt.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") - row_dups = summary.loc[summary["size"] != 1] - assert row_dups.shape[0] <= 19, "Found duplicated rows in life tables!" - assert (row_dups["country"].unique() == "Switzerland").all() & ( - row_dups["year"] <= 1931 - ).all(), "Unexpected duplicates in life tables!" - ## Final drops - tb_lt = tb_lt.loc[~(tb_lt["format"] == "5x1") & (tb_lt["age"] == "110+")] - tb_lt = tb_lt.drop(columns="format") # 2/ Exposures - tb_exp = standardize_sex_cat_names(tb_exp, {"female", "male", "total"}) - tb_exp = tb_exp.sort_values("format").drop_duplicates( - subset=[col for col in tb_exp.columns if col != "format"], keep="first" + tb_exp = reshape_table( + tb=tb_exp, + col_index=["country", "year", "sex", "age", "type"], ) - ## Check - summary = tb_exp.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") - row_dups = summary.loc[summary["size"] != 1] - assert row_dups.empty, "Found duplicated rows in life tables!" - ## Final drops - tb_exp = tb_exp.drop(columns="format") # 3/ Mortality - tb_mort = standardize_sex_cat_names(tb_mort, {"female", "male", "total"}) - tb_mort = tb_mort.sort_values("format").drop_duplicates( - subset=[col for col in tb_mort.columns if col != "format"], keep="first" + tb_mort = reshape_table( + tb=tb_mort, + col_index=["country", "year", "sex", "age", "type"], ) - ## Check - summary = tb_mort.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") - row_dups = summary.loc[summary["size"] != 1] - assert row_dups.empty, "Found duplicated rows in life tables!" - ## Final drops - tb_mort = tb_mort.drop(columns="format") # 4/ Population - tb_pop = standardize_sex_cat_names(tb_pop, {"female", "male", "total"}) - tb_pop = tb_pop.sort_values("format").drop_duplicates( - subset=[col for col in tb_pop.columns if col != "format"], keep="first" + tb_pop = reshape_table( + tb=tb_pop, + col_index=["country", "year", "sex", "age"], ) - summary = tb_pop.groupby(["country", "year", "sex", "age"], as_index=False).size().sort_values("size") - row_dups = summary.loc[summary["size"] != 1] - assert row_dups.empty, "Found duplicated rows in life tables!" - ## Final drops - tb_pop = tb_pop.drop(columns="format") - # 4/ Population - tb_births = standardize_sex_cat_names(tb_births, {"female", "male", "total"}) - tb_births = tb_births.sort_values("format").drop_duplicates( - subset=[col for col in tb_births.columns if col != "format"], keep="first" + # 5/ Births + tb_births = reshape_table( + tb=tb_births, + col_index=["country", "year", "sex"], ) - summary = tb_births.groupby(["country", "year", "sex"], as_index=False).size().sort_values("size") - row_dups = summary.loc[summary["size"] != 1] - assert row_dups.empty, "Found duplicated rows in life tables!" - ## Final drops - tb_births = tb_births.drop(columns="format") # tb = geo.harmonize_countries( # df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path @@ -115,12 +94,41 @@ def run(dest_dir: str) -> None: ds_garden.save() -def standardize_sex_cat_names(tb, sex_expected=None): - # Define expected sex categories +def reshape_table(tb, col_index, sex_expected=None, callback_post=None): + """Reshape a table. + + Input table has column `format`, which is sort-of redundant. This function ensures we can safely drop it (i.e. no duplicate rows). + + Additionally, it standardizes the dimension values. + """ if sex_expected is None: - sex_expected = {"females", "males", "total"} + sex_expected = {"female", "male", "total"} + + # Standardize dimension values + tb = standardize_sex_cat_names(tb, sex_expected) + + # Drop duplicate rows + tb = tb.sort_values("format").drop_duplicates(subset=[col for col in tb.columns if col != "format"], keep="first") + + # Check no duplicates + summary = tb.groupby(col_index, as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + if callback_post is not None: + tb = callback_post(tb) else: - sex = {s.lower() for s in sex_expected} + summary = tb.groupby(col_index, as_index=False).size().sort_values("size") + row_dups = summary.loc[summary["size"] != 1] + assert row_dups.empty, "Found duplicated rows in life tables!" + + # Final dropping o f columns + tb = tb.drop(columns="format") + + return tb + + +def standardize_sex_cat_names(tb, sex_expected): + # Define expected sex categories + sex_expected = {s.lower() for s in sex_expected} # Set sex categories to lowercase tb["sex"] = tb["sex"].str.lower() From ffedcb7cd74ad1424bb4a562fb6711839f7602a5 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 15:31:47 +0100 Subject: [PATCH 17/35] wip --- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 27 ++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index ca6eb83b327..1a47704affe 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -25,6 +25,8 @@ def run(dest_dir: str) -> None: # # Process data. # + paths.log.info("processing tables") + # 1/ Life tables def _sanity_check_lt(tb): summary = tb.groupby(["country", "year", "sex", "type", "age"], as_index=False).size().sort_values("size") @@ -36,7 +38,7 @@ def _sanity_check_lt(tb): tb = tb.loc[~(tb["format"] == "5x1") & (tb["age"] == "110+")] return tb - tb_lt = reshape_table( + tb_lt = process_table( tb=tb_lt, col_index=["country", "year", "sex", "age", "type"], sex_expected={"females", "males", "total"}, @@ -44,33 +46,30 @@ def _sanity_check_lt(tb): ) # 2/ Exposures - tb_exp = reshape_table( + tb_exp = process_table( tb=tb_exp, col_index=["country", "year", "sex", "age", "type"], ) # 3/ Mortality - tb_mort = reshape_table( + tb_mort = process_table( tb=tb_mort, col_index=["country", "year", "sex", "age", "type"], ) # 4/ Population - tb_pop = reshape_table( + tb_pop = process_table( tb=tb_pop, col_index=["country", "year", "sex", "age"], ) # 5/ Births - tb_births = reshape_table( + tb_births = process_table( tb=tb_births, col_index=["country", "year", "sex"], ) - # tb = geo.harmonize_countries( - # df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path - # ) - + # Create list with tables tables = [ tb_lt.format(["country", "year", "sex", "age", "type"]), tb_exp.format(["country", "year", "sex", "age", "type"]), @@ -87,14 +86,13 @@ def _sanity_check_lt(tb): dest_dir, tables=tables, check_variables_metadata=True, - default_metadata=ds_meadow.metadata, ) # Save changes in the new garden dataset. ds_garden.save() -def reshape_table(tb, col_index, sex_expected=None, callback_post=None): +def process_table(tb, col_index, sex_expected=None, callback_post=None): """Reshape a table. Input table has column `format`, which is sort-of redundant. This function ensures we can safely drop it (i.e. no duplicate rows). @@ -123,6 +121,12 @@ def reshape_table(tb, col_index, sex_expected=None, callback_post=None): # Final dropping o f columns tb = tb.drop(columns="format") + # Country name standardization + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + return tb @@ -139,4 +143,5 @@ def standardize_sex_cat_names(tb, sex_expected): # Rename tb["sex"] = tb["sex"].replace({"females": "female", "males": "male"}) + return tb From 398ea48667dd71ba8e87642d5ef3876d27a3fb5b Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 15:35:11 +0100 Subject: [PATCH 18/35] propagate snapshot metadata --- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index 420de54098f..bed7983a1b0 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -88,6 +88,7 @@ def run(dest_dir: str) -> None: path=Path(tmpdir), main_folders=FOLDERS_POP, regex=REGEX_POP, + snap=snap, ) # Life tables @@ -95,18 +96,21 @@ def run(dest_dir: str) -> None: path=Path(tmpdir), main_folders=FOLDERS_LT, regex=REGEX_LT, + snap=snap, ) # Exposure tb_exp = make_tb( path=Path(tmpdir), main_folders=FOLDERS_EXPOSURES, regex=REGEX_EXP, + snap=snap, ) # Mortality tb_m = make_tb( path=Path(tmpdir), main_folders=FOLDERS_MOR, regex=REGEX_MOR, + snap=snap, ) # Births @@ -114,6 +118,7 @@ def run(dest_dir: str) -> None: path=Path(tmpdir), main_folders=FOLDERS_BIRTHS, regex=REGEX_BIRTHS, + snap=snap, ) # Life tables @@ -174,7 +179,7 @@ def _clean_year(tb): ds_meadow.save() -def make_tb(path: Path, main_folders: List[str], regex: str) -> Table: +def make_tb(path: Path, main_folders: List[str], regex: str, snap) -> Table: """Create table from multiple category folders. It inspects the content in `main_folders` (should be in `path`), and looks for TXT files to parse into tables. @@ -204,21 +209,21 @@ def make_tb(path: Path, main_folders: List[str], regex: str) -> Table: # Read all TXT files in the indicator folder, and put them as a single table paths.log.info(f"Creating list of tables from available files in {path}...") files = list(indicator_path.glob("*.txt")) - tbs_ = [make_tb_from_txt(f, regex) for f in files] + tbs_ = [make_tb_from_txt(f, regex, snap) for f in files] tbs.extend(tbs_) # Concatenate all dataframes tb = pr.concat(tbs, ignore_index=True) return tb -def make_tb_from_txt(text_path: Path, regex: str) -> Table: +def make_tb_from_txt(text_path: Path, regex: str, snap) -> Table: """Create a table from a TXT file.""" # print(text_path) # Extract fields groups = extract_fields(regex, text_path) # Build df - tb = parse_table(groups["data"]) + tb = parse_table(groups["data"], snap) # Optional melt if ("Female" in tb.columns) and ("Male" in tb.columns): @@ -258,7 +263,7 @@ def extract_fields(regex: str, path: Path) -> dict: return groups -def parse_table(data_raw: str): +def parse_table(data_raw: str, snap): """Given the raw data from the TXT file (as string) map it to a table.""" tb_str = data_raw.strip() tb_str = re.sub(r"\n\s+", "\n", tb_str) @@ -267,6 +272,8 @@ def parse_table(data_raw: str): StringIO(tb_str), sep="\t", na_values=["."], + metadata=snap.to_table_metadata(), + origin=snap.m.origin, ) return tb From f1c894816b1ae0b79acd7d89d5bf28b0ce36a900 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 18:18:30 +0100 Subject: [PATCH 19/35] wip --- .../garden/hmd/2024-11-27/hmd.countries.json | 52 ++- .../data/garden/hmd/2024-11-27/hmd.meta.yml | 402 ++++++++++++++++-- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 62 ++- etl/steps/data/meadow/hmd/2024-11-27/hmd.py | 48 ++- 4 files changed, 512 insertions(+), 52 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json b/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json index 2c63c085104..c5fb3b64be0 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json @@ -1,2 +1,52 @@ { -} + "Australia": "Australia", + "Austria": "Austria", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Bulgaria": "Bulgaria", + "Canada": "Canada", + "Chile": "Chile", + "Croatia": "Croatia", + "Czechia": "Czechia", + "Denmark": "Denmark", + "East Germany": "East Germany", + "Estonia": "Estonia", + "Finland": "Finland", + "Germany": "Germany", + "Greece": "Greece", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "Ireland": "Ireland", + "Japan": "Japan", + "Latvia": "Latvia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Norway": "Norway", + "Poland": "Poland", + "Portugal": "Portugal", + "Republic of Korea": "South Korea", + "Russia": "Russia", + "Slovenia": "Slovenia", + "Spain": "Spain", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Taiwan": "Taiwan", + "Ukraine": "Ukraine", + "United Kingdom": "United Kingdom", + "West Germany": "West Germany", + "England and Wales, Civilian National Population": "England and Wales (Civilians)", + "England and Wales, Total Population": "England and Wales", + "France, Civilian Population": "France (Civilians)", + "France, Total Population": "France", + "Israel, Total Population": "Israel", + "Italy ": "Italy", + "New Zealand -- Maori": "New Zealand (Maori)", + "New Zealand -- Non-Maori": "New Zealand (Non-Maori)", + "Northern Ireland": "Northern Ireland", + "Scotland": "Scotland", + "Slovakia ": "Slovakia", + "The United States of America": "United States" +} \ No newline at end of file diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml index 83a6e01604e..bced0d0bf3e 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml @@ -2,58 +2,376 @@ definitions: common: presentation: + attribution_short: HMD + title_variant: << sex + 's, ' if sex != 'total' >><< type + ' tables'>> topic_tags: - Life Expectancy - - Population Growth + others: + display_name_dim: |- + at << 'birth' if (age == '0') else age >><< ', ' + sex + 's' if (sex != 'total') >>, << type >> + title_public_dim: |- + at << age if age != '0' else 'birth'>> + global: + life_expectancy: + point_1: |- + <%- if type == "period" -%> + Period life expectancy is a metric that summarizes death rates across all age groups in one particular year. + <%- else -%> + Cohort life expectancy is the average lifespan of a group of people, usually a birth cohort – people born in the same year. + <%- endif -%> + point_2: |- + <%- if type == "period" -%> + <%- if age == '0' -%> + For a given year, it represents the average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year. + <%- else -%> + For a given year, it represents the remaining average lifespan for a hypothetical group of people, if they experienced the same age-specific death rates throughout the rest of their lives as the age-specific death rates seen in that particular year. + <%- endif -%> + <%- else -%> + <%- if age == '0' -%> + It is calculated by tracking individuals from that cohort throughout their lives until death, and calculating their average lifespan. + <%- else -%> + It is calculated by tracking individuals from that cohort throughout the rest of their lives until death, and calculating their average remaining lifespan. + <%- endif -%> + <%- endif -%> # Learn more about the available fields: -# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ dataset: update_period_days: 365 + description: |- + The Human Mortality Database (HMD) is a collaborative project sponsored by the University of California, Berkeley (in the United States of America) and the Max Planck Institute for Demographic Research (in Germany). + It provides researchers with comprehensive data on mortality from around 40 countries around the world, which have very high coverage and quality of data at the national level, through vital registration and potentially census data. + Data is given in terms of period or cohort estimates: + + - **Period data** refers to a snapshot estimated with data at a particular interval. For period life expectancy at birth, this refers to the estimated life expectancy at birth based on a synthetic cohort created using mortality rates across age groups in a given year. + - **Cohort data** refers to estimates of a particular birth cohort. For cohort life expectancy at birth, this refers to the average number of years that people in the birth cohort survived. Cohort data may use birth cohorts that are ‘almost extinct’ rather than entirely extinct. + + 'Interval' refers to the specific age- and time- period of the estimate. An interval can be a one year period for a single-age group, or it can be wider. For example, the life expectancy of a 40 year old in 2019 corresponds to an interval of 1 single-age group in 1 year. The central death rate of 5–9 year olds in 2020 corresponds to an interval of a 5 year age group in 1 year. + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ tables: hmd: + common: + presentation: + topic_tags: + - Life Expectancy + + variables: + central_death_rate: + title: Central death rate + description_short: |- + The death rate, calculated as the number of deaths divided by the average number of people alive during the interval. + description_key: + - "The death rate is measured using the number of person-years lived during the interval." + - "Person-years refers to the combined total time that a group of people has lived. For example, if 10 people each live for 2 years, they collectively contribute 20 person-years." + - "The death rate is slightly different from the 'probability of death' during the interval, because the 'probability of death' metric uses a different denominator: the number of people alive at that age at the start of the interval, while this indicator uses the average number of people alive during the interval." + unit: deaths per 1,000 people + processing_level: minor + description_processing: |- + The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 1,000 to get a per-1,000 people rate. + display: + name: |- + {tables.hmd.variables.central_death_rate.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.central_death_rate.title} {definitions.others.title_public_dim} + topic_tags: + - Life Expectancy + - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> + + probability_of_death: + title: Probability of death + unit: "%" + description_short: |- + The probability of dying in a given interval, among people who survived to the start of that interval. + description_key: + - "For example, the probability of death for a 50 year old in a given year is found by: dividing the number of deaths in 50 year olds that year, by the number of people alive at the age of 50 at the start of the year." + processing_level: minor + description_processing: |- + The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 100 to get a percentage. + display: + name: |- + {tables.hmd.variables.probability_of_death.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.probability_of_death.title} {definitions.others.title_public_dim} + topic_tags: + - Life Expectancy + - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> + + average_survival_length: + title: Average survival length + short_unit: years + unit: years + description_short: Average length of survival between ages x and x+n for persons dying in the interval. + display: + name: |- + {tables.hmd.variables.average_survival_length.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.average_survival_length.title} {definitions.others.title_public_dim} + + number_survivors: + title: Number of survivors + unit: survivors + description_short: Number of survivors at a given age, assuming survivors at 0 years old is 100,000. + display: + name: |- + {tables.hmd.variables.number_survivors.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.number_survivors.title} {definitions.others.title_public_dim} + + number_deaths: + title: Number of deaths + short_unit: deaths + unit: deaths + description_short: Number of deaths between ages x and x+n. + display: + name: |- + {tables.hmd.variables.number_deaths.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.number_deaths.title} {definitions.others.title_public_dim} + topic_tags: + - Life Expectancy + - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> + + number_person_years_lived: + title: Number of person-years lived + unit: person-years + description_short: Number of person-years lived between ages x and x+n. + display: + name: |- + {tables.hmd.variables.number_person_years_lived.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.number_person_years_lived.title} {definitions.others.title_public_dim} + + number_person_years_remaining: + title: Number of person-years remaining + unit: person-years + description_short: Number of person-years remaining after a given age. + display: + name: |- + {tables.hmd.variables.number_person_years_remaining.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.number_person_years_remaining.title} {definitions.others.title_public_dim} + + life_expectancy: + title: Life expectancy + short_unit: years + unit: years + description_short: |- + <%- if age == '0' -%> + <%- if sex == 'total' -%> + The << type >> life expectancy at birth, in a given year. + <%- else -%> + The << type >> life expectancy at birth among << sex + 's' >>, in a given year. + <%- endif -%> + <%- else -%> + <%- if sex == 'total' -%> + The remaining << type >> life expectancy at age << age >>, in a given year. + <%- else -%> + The remaining << type >> life expectancy at age << age >> among << sex + 's' >>, in a given year. + <%- endif -%> + <%- endif -%> + description_key: + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + - |- + <%- if age != '0' -%> + <%- if type == "period" -%> + This shows the remaining period life expectancy among people who have already reached the age << age >>, using death rates from their age group and older age groups. + <%- else -%> + This shows the remaining cohort life expectancy of people who have reached the age << age >>. + <%- endif -%> + <%- endif -%> + display: + numDecimalPlaces: 1 + name: |- + {tables.hmd.variables.life_expectancy.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.life_expectancy.title} {definitions.others.title_public_dim} + + exposure: variables: - # testing_variable: - # title: Testing variable title - # unit: arbitrary units - # short_unit: au - # description_short: Short description of testing variable. - # description_processing: Description of processing of testing variable. - # description_key: List of key points about the indicator. - # description_from_producer: Description of testing variable from producer. - # processing_level: minor - # type: - # sort: - # presentation: - # attribution: - # attribution_short: - # faqs: - # grapher_config: - # title_public: - # title_variant: - # topic_tags: - # display: - # name: Testing variable - # numDecimalPlaces: 0 - # tolerance: 0 - # color: - # conversionFactor: 1 - # description: - # entityAnnotationsMap: Test annotation - # includeInTable: - # isProjection: false - # unit: arbitrary units - # shortUnit: au - # tableDisplay: - # hideAbsoluteChange: - # hideRelativeChange: - # yearIsDay: false - # zeroDay: - # roundingMode: - # numSignificantFigures: - # - {} + exposure: + title: Exposure-to-risk + unit: person-years + description_short: The total number of person-years lived within a given interval. + description_key: + - It is equivalent to the average number of people living in that age group during the period. + description_from_producer: |- + Estimates of the population exposed to the risk of death during some age-time interval are based on annual (January 1st) population estimates, with small corrections that reflect the timing of deaths during the interval. Period exposure estimations are based on assumptions of uniformity in the distribution of events except when historical monthly birth data are available. + display: + name: |- + {tables.hmd.variables.exposure.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.exposure.title} {definitions.others.title_public_dim} + + deaths: + common: + presentation: + topic_tags: + - Global Health + + variables: + deaths: + title: Number of deaths + unit: deaths + description_short: |- + <% if sex == 'total' %> + The total number of deaths at age << age >> in a given year. + <%- else %> + The total number of << sex >> deaths at age << age >> in a given year. + <%- endif %> + display: + name: |- + {tables.hmd.variables.exposure.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.exposure.title} {definitions.others.title_public_dim} + + population: + common: + presentation: + topic_tags: + - Population Growth + + variables: + population: + title: Population + unit: people + description_short: |- + <% if sex == 'total' %> + The total number of people aged << age >> living in a country. + <%- else %> + The total number of << sex + 's' >> aged << age >> living in a country. + <%- endif %> + description_processing: |- + From HMD Notes: For populations with territorial changes, two sets of population estimates are given for years in which a territorial change occurred. The first set of estimates (identified as year "19xx-") refers to the population just before the territorial change, whereas the second set (identified as year "19xx+") refers to the population just after the change. For example, in France, the data for "1914-" cover the previous territory (i.e., as of December 31, 1913), whereas the data for "1914+" reflect the territorial boundaries as of January 1, 1914. + + We have used the "19xx+" population estimates for the year of the territorial change. + display: + name: |- + {tables.hmd.variables.population.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.population.title} {definitions.others.title_public_dim} + + births: + common: + presentation: + topic_tags: + - Fertility Rate + + variables: + births: + title: Births + unit: births + description_short: |- + <% if sex == 'total' %> + The total number of births in a given year. + <%- else %> + The total number of << sex >> births in a given year. + <%- endif %> + display: + name: |- + {tables.hmd.variables.births.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.births.title} {definitions.others.title_public_dim} + + birth_rate: + title: Births + unit: births per 1,000 people + description_short: |- + <% if sex == 'total' %> + The total number of births per 1,000 people in a given year. + <%- else %> + The total number of << sex >> births per 1,000 in a given year. + <%- endif %> + display: + name: |- + {tables.hmd.variables.births.title} {definitions.others.display_name_dim} + presentation: + title_public: |- + {tables.hmd.variables.births.title} {definitions.others.title_public_dim} + + diff_ratios: + variables: + central_death_rate_mf_ratio: + title: Central death rate ratio (m/f) + unit: "" + description_short: |- + The ratio of the << type >> central death rate (males to females) at age << age >>. + processing_level: major + display: + name: |- + Central death rate (male-to-female ratio) {definitions.others.display_name_dim} + presentation: + title_public: Central death rate {definitions.others.title_public_dim} + title_variant: |- + male-to-female ratio, << type + ' tables'>> + topic_tags: + - Life Expectancy + - Gender Ratio + - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> + + life_expectancy_fm_diff: + title: Life expectancy difference (f-m) + short_unit: years + unit: years + description_short: |- + The difference in the << type >> life expectancy (females - males) at age << age >>. + processing_level: major + description_key: + - Higher values indicate longer life expectancy among females than males. + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + display: + numDecimalPlaces: 1 + name: |- + Life expectancy (female-male difference) at << 'birth' if (age == '0') else age >>, << type >> + presentation: + title_public: Life expectancy at << age if age != '0' else 'birth'>> + title_variant: female-male difference, << type + ' tables'>> + topic_tags: + - Life Expectancy + - Gender Ratio + life_expectancy_fm_ratio: + title: Life expectancy ratio (f/m) + unit: "" + short_unit: "" + description_short: |- + The ratio of the << type >> life expectancy (males to females) at age << age >>. + processing_level: major + description_key: + - Higher values indicate longer life expectancy among females than males. + - |- + {definitions.global.life_expectancy.point_1} + - |- + {definitions.global.life_expectancy.point_2} + display: + numDecimalPlaces: 1 + name: |- + Life expectancy (female-to-male ratio) at << 'birth' if (age == '0') else age >>, << type >> + presentation: + title_public: Life expectancy at << age if age != '0' else 'birth'>> + title_variant: female-to-male ratio, << type + ' tables'>> + topic_tags: + - Life Expectancy + - Gender Ratio diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index 1a47704affe..35721f8c65e 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -1,5 +1,9 @@ """Load a meadow dataset and create a garden dataset.""" +from typing import List, cast + +from owid.catalog import Table + from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -44,6 +48,9 @@ def _sanity_check_lt(tb): sex_expected={"females", "males", "total"}, callback_post=_sanity_check_lt, ) + # Scale central death rates + tb_lt["central_death_rate"] = tb_lt["central_death_rate"] * 1_000 + tb_lt["probability_of_death"] = tb_lt["probability_of_death"] * 100 # 2/ Exposures tb_exp = process_table( @@ -56,6 +63,8 @@ def _sanity_check_lt(tb): tb=tb_mort, col_index=["country", "year", "sex", "age", "type"], ) + assert set(tb_mort["type"].unique()) == {"period"}, "Unexpected values in column 'type' in mortality tables!" + tb_mort = tb_mort.drop(columns="type") # 4/ Population tb_pop = process_table( @@ -68,14 +77,22 @@ def _sanity_check_lt(tb): tb=tb_births, col_index=["country", "year", "sex"], ) + tb_pop_agg = tb_pop.groupby(["country", "year", "sex"], as_index=False)["population"].sum() + tb_births = tb_births.merge(tb_pop_agg, on=["country", "year", "sex"], how="left") + tb_births["birth_rate"] = tb_births["births"] / tb_births["population"] * 1_000 + tb_births = tb_births.drop(columns=["population"]) + + # 6/ Create table with differences and ratios + tb_ratios = make_table_diffs_ratios(tb_lt) # Create list with tables tables = [ tb_lt.format(["country", "year", "sex", "age", "type"]), tb_exp.format(["country", "year", "sex", "age", "type"]), - tb_mort.format(["country", "year", "sex", "age", "type"]), + tb_mort.format(["country", "year", "sex", "age"]), tb_pop.format(["country", "year", "sex", "age"]), tb_births.format(["country", "year", "sex"]), + tb_ratios.format(["country", "year", "age", "type"], short_name="diff_ratio"), ] # @@ -127,6 +144,9 @@ def process_table(tb, col_index, sex_expected=None, callback_post=None): countries_file=paths.country_mapping_path, ) + # Make year column integer + tb["year"] = tb["year"].astype(int) + return tb @@ -145,3 +165,43 @@ def standardize_sex_cat_names(tb, sex_expected): tb["sex"] = tb["sex"].replace({"females": "female", "males": "male"}) return tb + + +def make_table_diffs_ratios(tb: Table) -> Table: + """Create table with metric differences and ratios. + + Currently, we estimate: + + - female - male: Life expectancy + - male/female: Life Expectancy, Central Death Rate + """ + # Pivot & obtain differences and ratios + cols_index = ["country", "year", "age", "type"] + tb_new = ( + tb.pivot_table( + index=cols_index, + columns="sex", + values=["life_expectancy", "central_death_rate"], + ) + .assign( + life_expectancy_fm_diff=lambda df: df[("life_expectancy", "female")] - df[("life_expectancy", "male")], + life_expectancy_mf_ratio=lambda df: df[("life_expectancy", "male")] / df[("life_expectancy", "female")], + central_death_rate_mf_ratio=lambda df: df[("central_death_rate", "male")] + / df[("central_death_rate", "female")], + ) + .reset_index() + ) + + # Keep relevant columns + cols = [col for col in tb_new.columns if col[1] == ""] + tb_new = tb_new.loc[:, cols] + + # Rename columns + tb_new.columns = [col[0] for col in tb_new.columns] + + # Add metadata back + for col in tb_new.columns: + if col not in cols_index: + tb_new[col] = tb_new[col].copy_metadata(tb["life_expectancy"]) + + return tb_new diff --git a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py index bed7983a1b0..d4a56860adf 100644 --- a/etl/steps/data/meadow/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/meadow/hmd/2024-11-27/hmd.py @@ -142,18 +142,11 @@ def run(dest_dir: str) -> None: _check_nas(tb_pop, 0.001, 1) # Ensure correct year dtype - def _clean_year(tb): - # Remove year ranges, and convert to int - flag = tb["Year"].astype("string").str.contains("-") - tb = tb.loc[~flag] - tb["Year"] = tb["Year"].astype("string") - return tb - tb_lt = _clean_year(tb_lt) tb_exp = _clean_year(tb_exp) tb_m = _clean_year(tb_m) - tb_pop = _clean_year(tb_pop) tb_bi = _clean_year(tb_bi) + tb_pop = _clean_population_type(tb_pop) # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. tables = [ @@ -291,3 +284,42 @@ def _check_nas(tb, missing_row_max, missing_countries_max): assert ( len(countries_missing_data) / len(tb) < missing_countries_max ), f"Too many missing values in life tables: {len(countries_missing_data)}" + + +def _clean_population_type(tb): + """Data provider notes the following: + + For populations with territorial changes, two sets of population estimates are given for years in which a territorial change occurred. The first set of estimates (identified as year "19xx-") refers to the population just before the territorial change, whereas the second set (identified as year "19xx+") refers to the population just after the change. For example, in France, the data for "1914-" cover the previous territory (i.e., as of December 31, 1913), whereas the data for "1914+" reflect the territorial boundaries as of January 1, 1914. + + To avoid confusion and duplicity, whenever there are multiple entries for a year, we keep YYYY+ definition for the year (e.g. country with new territorial changes). + """ + # Crete new column with the year. + regex = r"\b\d{4}\b" + tb["year"] = tb["Year"].astype("string").str.extract(f"({regex})", expand=False) + assert tb["year"].notna().all(), "Year extraction was successful!" + tb["year"] = tb["year"].astype(int) + + # Ensure raw year is as expected + assert ( + tb.groupby(["country", "year", "Age", "sex", "format"]).Year.nunique().max() == 2 + ), "Unexpected number of years (+/-)" + + # Drop duplicate years, keeping YYYY+. + tb["Year"] = tb["Year"].astype("string") + tb = tb.sort_values("Year") + tb = tb.drop_duplicates(subset=["year", "Age", "sex", "country", "format"], keep="first").drop(columns="Year") + + tb = tb.rename(columns={"year": "Year"}) + + # Additionally, remove year periods + tb = _clean_year(tb) + + return tb + + +def _clean_year(tb): + # Remove year ranges, and convert to int + flag = tb["Year"].astype("string").str.contains("-") + tb = tb.loc[~flag] + tb["Year"] = tb["Year"].astype("int") + return tb From 18d28c26c6ca7fa7f29addd69e726a3712fa861b Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 18:21:32 +0100 Subject: [PATCH 20/35] wip --- .../data/garden/hmd/2024-11-27/hmd.excluded_countries.json | 2 -- etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) delete mode 100644 etl/steps/data/garden/hmd/2024-11-27/hmd.excluded_countries.json diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.excluded_countries.json b/etl/steps/data/garden/hmd/2024-11-27/hmd.excluded_countries.json deleted file mode 100644 index 0d4f101c7a3..00000000000 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.excluded_countries.json +++ /dev/null @@ -1,2 +0,0 @@ -[ -] diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml index bced0d0bf3e..1244223168b 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml @@ -54,7 +54,7 @@ dataset: # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ tables: - hmd: + life_tables: common: presentation: topic_tags: From d34ee68faa754876fcec1bc2814668486bfec0dd Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 18:40:55 +0100 Subject: [PATCH 21/35] garden --- .../data/garden/hmd/2024-11-27/hmd.meta.yml | 56 +++++++++---------- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 4 +- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml index 1244223168b..ee587f04a76 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml @@ -75,10 +75,10 @@ tables: The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 1,000 to get a per-1,000 people rate. display: name: |- - {tables.hmd.variables.central_death_rate.title} {definitions.others.display_name_dim} + {tables.life_tables.variables.central_death_rate.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.central_death_rate.title} {definitions.others.title_public_dim} + {tables.life_tables.variables.central_death_rate.title} {definitions.others.title_public_dim} topic_tags: - Life Expectancy - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> @@ -95,10 +95,10 @@ tables: The original metric is given as a fraction between 0 and 1 (i.e. per-capita). We multiply this by 100 to get a percentage. display: name: |- - {tables.hmd.variables.probability_of_death.title} {definitions.others.display_name_dim} + {tables.life_tables.variables.probability_of_death.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.probability_of_death.title} {definitions.others.title_public_dim} + {tables.life_tables.variables.probability_of_death.title} {definitions.others.title_public_dim} topic_tags: - Life Expectancy - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> @@ -110,10 +110,10 @@ tables: description_short: Average length of survival between ages x and x+n for persons dying in the interval. display: name: |- - {tables.hmd.variables.average_survival_length.title} {definitions.others.display_name_dim} + {tables.life_tables.variables.average_survival_length.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.average_survival_length.title} {definitions.others.title_public_dim} + {tables.life_tables.variables.average_survival_length.title} {definitions.others.title_public_dim} number_survivors: title: Number of survivors @@ -121,10 +121,10 @@ tables: description_short: Number of survivors at a given age, assuming survivors at 0 years old is 100,000. display: name: |- - {tables.hmd.variables.number_survivors.title} {definitions.others.display_name_dim} + {tables.life_tables.variables.number_survivors.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.number_survivors.title} {definitions.others.title_public_dim} + {tables.life_tables.variables.number_survivors.title} {definitions.others.title_public_dim} number_deaths: title: Number of deaths @@ -133,10 +133,10 @@ tables: description_short: Number of deaths between ages x and x+n. display: name: |- - {tables.hmd.variables.number_deaths.title} {definitions.others.display_name_dim} + {tables.life_tables.variables.number_deaths.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.number_deaths.title} {definitions.others.title_public_dim} + {tables.life_tables.variables.number_deaths.title} {definitions.others.title_public_dim} topic_tags: - Life Expectancy - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> @@ -147,10 +147,10 @@ tables: description_short: Number of person-years lived between ages x and x+n. display: name: |- - {tables.hmd.variables.number_person_years_lived.title} {definitions.others.display_name_dim} + {tables.life_tables.variables.number_person_years_lived.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.number_person_years_lived.title} {definitions.others.title_public_dim} + {tables.life_tables.variables.number_person_years_lived.title} {definitions.others.title_public_dim} number_person_years_remaining: title: Number of person-years remaining @@ -158,10 +158,10 @@ tables: description_short: Number of person-years remaining after a given age. display: name: |- - {tables.hmd.variables.number_person_years_remaining.title} {definitions.others.display_name_dim} + {tables.life_tables.variables.number_person_years_remaining.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.number_person_years_remaining.title} {definitions.others.title_public_dim} + {tables.life_tables.variables.number_person_years_remaining.title} {definitions.others.title_public_dim} life_expectancy: title: Life expectancy @@ -197,12 +197,12 @@ tables: display: numDecimalPlaces: 1 name: |- - {tables.hmd.variables.life_expectancy.title} {definitions.others.display_name_dim} + {tables.life_tables.variables.life_expectancy.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.life_expectancy.title} {definitions.others.title_public_dim} + {tables.life_tables.variables.life_expectancy.title} {definitions.others.title_public_dim} - exposure: + exposures: variables: exposure: title: Exposure-to-risk @@ -214,10 +214,10 @@ tables: Estimates of the population exposed to the risk of death during some age-time interval are based on annual (January 1st) population estimates, with small corrections that reflect the timing of deaths during the interval. Period exposure estimations are based on assumptions of uniformity in the distribution of events except when historical monthly birth data are available. display: name: |- - {tables.hmd.variables.exposure.title} {definitions.others.display_name_dim} + {tables.exposures.variables.exposure.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.exposure.title} {definitions.others.title_public_dim} + {tables.exposures.variables.exposure.title} {definitions.others.title_public_dim} deaths: common: @@ -237,10 +237,10 @@ tables: <%- endif %> display: name: |- - {tables.hmd.variables.exposure.title} {definitions.others.display_name_dim} + {tables.deaths.variables.deaths.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.exposure.title} {definitions.others.title_public_dim} + {tables.deaths.variables.deaths.title} {definitions.others.title_public_dim} population: common: @@ -264,10 +264,10 @@ tables: We have used the "19xx+" population estimates for the year of the territorial change. display: name: |- - {tables.hmd.variables.population.title} {definitions.others.display_name_dim} + {tables.population.variables.population.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.population.title} {definitions.others.title_public_dim} + {tables.population.variables.population.title} {definitions.others.title_public_dim} births: common: @@ -287,10 +287,10 @@ tables: <%- endif %> display: name: |- - {tables.hmd.variables.births.title} {definitions.others.display_name_dim} + {tables.births.variables.births.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.births.title} {definitions.others.title_public_dim} + {tables.births.variables.births.title} {definitions.others.title_public_dim} birth_rate: title: Births @@ -303,10 +303,10 @@ tables: <%- endif %> display: name: |- - {tables.hmd.variables.births.title} {definitions.others.display_name_dim} + {tables.births.variables.births.title} {definitions.others.display_name_dim} presentation: title_public: |- - {tables.hmd.variables.births.title} {definitions.others.title_public_dim} + {tables.births.variables.births.title} {definitions.others.title_public_dim} diff_ratios: variables: @@ -352,7 +352,7 @@ tables: - Life Expectancy - Gender Ratio - life_expectancy_fm_ratio: + life_expectancy_mf_ratio: title: Life expectancy ratio (f/m) unit: "" short_unit: "" diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index 35721f8c65e..9eaebe39521 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -1,7 +1,5 @@ """Load a meadow dataset and create a garden dataset.""" -from typing import List, cast - from owid.catalog import Table from etl.data_helpers import geo @@ -92,7 +90,7 @@ def _sanity_check_lt(tb): tb_mort.format(["country", "year", "sex", "age"]), tb_pop.format(["country", "year", "sex", "age"]), tb_births.format(["country", "year", "sex"]), - tb_ratios.format(["country", "year", "age", "type"], short_name="diff_ratio"), + tb_ratios.format(["country", "year", "age", "type"], short_name="diff_ratios"), ] # From 07801dc30d124e24748c1bb78074fbe84364547e Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 18:49:39 +0100 Subject: [PATCH 22/35] missing dimensions --- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index 9eaebe39521..74e99801941 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -38,6 +38,17 @@ def _sanity_check_lt(tb): row_dups["year"] <= 1931 ).all(), "Unexpected duplicates in life tables!" tb = tb.loc[~(tb["format"] == "5x1") & (tb["age"] == "110+")] + + flag = ( + (tb_lt["country"] == "Switzerland") + & (tb_lt["age"] == "110+") + & (tb_lt["type"] == "cohort") + & (tb_lt["sex"] == "Males") + & (tb_lt["year"] <= 1931) + & (tb_lt["year"] >= 1913) + ) + tb = tb.loc[~flag] + return tb tb_lt = process_table( @@ -84,6 +95,7 @@ def _sanity_check_lt(tb): tb_ratios = make_table_diffs_ratios(tb_lt) # Create list with tables + paths.log.info("saving tables") tables = [ tb_lt.format(["country", "year", "sex", "age", "type"]), tb_exp.format(["country", "year", "sex", "age", "type"]), @@ -114,6 +126,8 @@ def process_table(tb, col_index, sex_expected=None, callback_post=None): Additionally, it standardizes the dimension values. """ + paths.log.info(f"Pivoting table with columns {col_index}") + if sex_expected is None: sex_expected = {"female", "male", "total"} @@ -124,8 +138,6 @@ def process_table(tb, col_index, sex_expected=None, callback_post=None): tb = tb.sort_values("format").drop_duplicates(subset=[col for col in tb.columns if col != "format"], keep="first") # Check no duplicates - summary = tb.groupby(col_index, as_index=False).size().sort_values("size") - row_dups = summary.loc[summary["size"] != 1] if callback_post is not None: tb = callback_post(tb) else: From 9e992652b1adf227c7935fce4cf26c60fbb2433f Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 18:50:37 +0100 Subject: [PATCH 23/35] improve debug message --- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index 74e99801941..fb0796e753e 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -126,7 +126,7 @@ def process_table(tb, col_index, sex_expected=None, callback_post=None): Additionally, it standardizes the dimension values. """ - paths.log.info(f"Pivoting table with columns {col_index}") + paths.log.info(f"processing table {tb.name}") if sex_expected is None: sex_expected = {"female", "male", "total"} From 5b7ae8f0e9764c397893c75eb30c42c0278658d8 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 19:03:37 +0100 Subject: [PATCH 24/35] wip --- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index fb0796e753e..e9a35820b00 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -37,13 +37,12 @@ def _sanity_check_lt(tb): assert (row_dups["country"].unique() == "Switzerland").all() & ( row_dups["year"] <= 1931 ).all(), "Unexpected duplicates in life tables!" - tb = tb.loc[~(tb["format"] == "5x1") & (tb["age"] == "110+")] flag = ( (tb_lt["country"] == "Switzerland") & (tb_lt["age"] == "110+") & (tb_lt["type"] == "cohort") - & (tb_lt["sex"] == "Males") + & (tb_lt["sex"] == "male") & (tb_lt["year"] <= 1931) & (tb_lt["year"] >= 1913) ) @@ -126,7 +125,7 @@ def process_table(tb, col_index, sex_expected=None, callback_post=None): Additionally, it standardizes the dimension values. """ - paths.log.info(f"processing table {tb.name}") + paths.log.info(f"processing table {tb.m.short_name}") if sex_expected is None: sex_expected = {"female", "male", "total"} From 3d13f26789eabb07ffb8132856be1c772430db4c Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 19:11:49 +0100 Subject: [PATCH 25/35] wip --- etl/steps/data/grapher/hmd/2024-11-27/hmd.py | 58 +++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/grapher/hmd/2024-11-27/hmd.py b/etl/steps/data/grapher/hmd/2024-11-27/hmd.py index e3ee265c8c8..88cb5a79bd8 100644 --- a/etl/steps/data/grapher/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/grapher/hmd/2024-11-27/hmd.py @@ -5,6 +5,12 @@ # Get paths and naming conventions for current step. paths = PathFinder(__file__) +INDICATORS_RELEVANT_LT = [ + "central_death_rate", + "life_expectancy", + "probability_of_death", +] + def run(dest_dir: str) -> None: # @@ -14,15 +20,63 @@ def run(dest_dir: str) -> None: ds_garden = paths.load_dataset("hmd") # Read table from garden dataset. - tb = ds_garden.read("hmd", reset_index=False) + tb_lt = ds_garden.read("life_tables") + tb_exposure = ds_garden.read("exposures") + tb_deaths = ds_garden.read("deaths") + tb_pop = ds_garden.read("population") + tb_births = ds_garden.read("births") + tb_ratios = ds_garden.read("diff_ratios") + + # Filter relevant dimensions + tb_lt = keep_only_relevant_dimensions(tb_lt) + tb_exposure = keep_only_relevant_dimensions(tb_exposure) + tb_deaths = keep_only_relevant_dimensions(tb_deaths) + tb_pop = keep_only_relevant_dimensions(tb_pop) + tb_ratios = keep_only_relevant_dimensions(tb_ratios) # # Save outputs. # + cols_index = ["country", "year", "sex", "age", "type"] + tables = [ + tb_lt.format(cols_index), + tb_exposure.format(cols_index), + tb_deaths.format(["country", "year", "sex", "age"]), + tb_pop.format(["country", "year", "sex", "age"]), + tb_births.format(["country", "year", "sex"]), + tb_ratios.format(["country", "year", "age", "type"]), + ] # Create a new grapher dataset with the same metadata as the garden dataset. ds_grapher = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata ) # Save changes in the new grapher dataset. ds_grapher.save() + + +def keep_only_relevant_dimensions(tb): + """Keep only relevant dimensions. + + - We only preserve 5-year age groups, and specific 1-year age groups. + - We only preserve 1-year observation periods. + + """ + AGES_SINGLE = [ + 0, + 10, + 15, + 25, + 45, + 65, + 80, + ] + AGES_SINGLE = list(map(str, AGES_SINGLE)) + ["110+"] + flag_1 = tb["age"].isin(AGES_SINGLE) + flag_2 = tb["age"].str.contains( + "-", + ) + + tb = tb.loc[flag_1 | flag_2] + + return tb From 7620082c3e29072ba423d1d1bdf119268ed21721 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 19:27:02 +0100 Subject: [PATCH 26/35] fix dimension --- .../data/garden/hmd/2024-11-27/hmd.meta.yml | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml index ee587f04a76..5629dd3f077 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml @@ -3,7 +3,6 @@ definitions: common: presentation: attribution_short: HMD - title_variant: << sex + 's, ' if sex != 'total' >><< type + ' tables'>> topic_tags: - Life Expectancy @@ -57,6 +56,7 @@ tables: life_tables: common: presentation: + title_variant: << sex + 's, ' if sex != 'total' >><< type + ' tables'>> topic_tags: - Life Expectancy @@ -203,6 +203,12 @@ tables: {tables.life_tables.variables.life_expectancy.title} {definitions.others.title_public_dim} exposures: + common: + presentation: + title_variant: << sex + 's, ' if sex != 'total' >><< type + ' tables'>> + topic_tags: + - Life Expectancy + variables: exposure: title: Exposure-to-risk @@ -224,6 +230,7 @@ tables: presentation: topic_tags: - Global Health + title_variant: << sex + 's, ' if sex != 'total' >> variables: deaths: @@ -247,6 +254,7 @@ tables: presentation: topic_tags: - Population Growth + title_variant: << sex + 's, ' if sex != 'total' >> variables: population: @@ -274,6 +282,7 @@ tables: presentation: topic_tags: - Fertility Rate + title_variant: << sex + 's, ' if sex != 'total' >> variables: births: @@ -293,7 +302,7 @@ tables: {tables.births.variables.births.title} {definitions.others.title_public_dim} birth_rate: - title: Births + title: Birth rate unit: births per 1,000 people description_short: |- <% if sex == 'total' %> @@ -309,6 +318,12 @@ tables: {tables.births.variables.births.title} {definitions.others.title_public_dim} diff_ratios: + common: + presentation: + topic_tags: + - Life Expectancy + title_variant: << sex + 's, ' if sex != 'total' >><< type + ' tables'>> + variables: central_death_rate_mf_ratio: title: Central death rate ratio (m/f) @@ -368,7 +383,8 @@ tables: display: numDecimalPlaces: 1 name: |- - Life expectancy (female-to-male ratio) at << 'birth' if (age == '0') else age >>, << type >> + Life expectancy (female-to-male ratio) at << 'birth' if (age == '0') else age >>, << + >> presentation: title_public: Life expectancy at << age if age != '0' else 'birth'>> title_variant: female-to-male ratio, << type + ' tables'>> From e39832ed069d318797988da889267d925a7fbdea Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 19:39:06 +0100 Subject: [PATCH 27/35] wip --- etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml index 5629dd3f077..09a7f08c099 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml @@ -322,7 +322,6 @@ tables: presentation: topic_tags: - Life Expectancy - title_variant: << sex + 's, ' if sex != 'total' >><< type + ' tables'>> variables: central_death_rate_mf_ratio: @@ -337,7 +336,7 @@ tables: presentation: title_public: Central death rate {definitions.others.title_public_dim} title_variant: |- - male-to-female ratio, << type + ' tables'>> + male-to-female ratio, << type >> tables topic_tags: - Life Expectancy - Gender Ratio @@ -362,7 +361,7 @@ tables: Life expectancy (female-male difference) at << 'birth' if (age == '0') else age >>, << type >> presentation: title_public: Life expectancy at << age if age != '0' else 'birth'>> - title_variant: female-male difference, << type + ' tables'>> + title_variant: female-male difference, << type >> tables topic_tags: - Life Expectancy - Gender Ratio @@ -383,11 +382,10 @@ tables: display: numDecimalPlaces: 1 name: |- - Life expectancy (female-to-male ratio) at << 'birth' if (age == '0') else age >>, << - >> + Life expectancy (female-to-male ratio) at << 'birth' if (age == '0') else age >>, << type >> presentation: title_public: Life expectancy at << age if age != '0' else 'birth'>> - title_variant: female-to-male ratio, << type + ' tables'>> + title_variant: female-to-male ratio, << type >> tables topic_tags: - Life Expectancy - Gender Ratio From 9ee7af7d6112941f09f9268687ef9679dfa983f6 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 20:09:42 +0100 Subject: [PATCH 28/35] wip --- .../data/garden/hmd/2024-11-27/hmd.meta.yml | 2 +- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml index 09a7f08c099..d6d27a09415 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml @@ -332,7 +332,7 @@ tables: processing_level: major display: name: |- - Central death rate (male-to-female ratio) {definitions.others.display_name_dim} + Central death rate (male-to-female ratio) at << 'birth' if (age == '0') else age >>, << type >> presentation: title_public: Central death rate {definitions.others.title_public_dim} title_variant: |- diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index e9a35820b00..4248b2c416c 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -1,5 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" +import numpy as np from owid.catalog import Table from etl.data_helpers import geo @@ -85,10 +86,16 @@ def _sanity_check_lt(tb): tb=tb_births, col_index=["country", "year", "sex"], ) - tb_pop_agg = tb_pop.groupby(["country", "year", "sex"], as_index=False)["population"].sum() - tb_births = tb_births.merge(tb_pop_agg, on=["country", "year", "sex"], how="left") - tb_births["birth_rate"] = tb_births["births"] / tb_births["population"] * 1_000 - tb_births = tb_births.drop(columns=["population"]) + + def add_birth_rate(tb_pop, tb_births): + tb_pop_agg = tb_pop.groupby(["country", "year", "sex"], as_index=False)["population"].sum() + tb_births = tb_births.merge(tb_pop_agg, on=["country", "year", "sex"], how="left") + tb_births["birth_rate"] = tb_births["births"] / tb_births["population"] * 1_000 + tb_births["birth_rate"] = tb_births["birth_rate"].replace([np.inf, -np.inf], np.nan) + tb_births = tb_births.drop(columns=["population"]) + return tb_births + + tb_births = add_birth_rate(tb_pop, tb_births) # 6/ Create table with differences and ratios tb_ratios = make_table_diffs_ratios(tb_lt) @@ -212,5 +219,6 @@ def make_table_diffs_ratios(tb: Table) -> Table: for col in tb_new.columns: if col not in cols_index: tb_new[col] = tb_new[col].copy_metadata(tb["life_expectancy"]) + tb_new[col] = tb_new[col].replace([np.inf, -np.inf], np.nan) return tb_new From 2bbe5c9d702b2006f230c3b66cde2802586ad817 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 21:12:05 +0100 Subject: [PATCH 29/35] wip --- etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml index d6d27a09415..0cd4ff29340 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.meta.yml @@ -81,7 +81,6 @@ tables: {tables.life_tables.variables.central_death_rate.title} {definitions.others.title_public_dim} topic_tags: - Life Expectancy - - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> probability_of_death: title: Probability of death @@ -101,7 +100,6 @@ tables: {tables.life_tables.variables.probability_of_death.title} {definitions.others.title_public_dim} topic_tags: - Life Expectancy - - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> average_survival_length: title: Average survival length @@ -139,7 +137,6 @@ tables: {tables.life_tables.variables.number_deaths.title} {definitions.others.title_public_dim} topic_tags: - Life Expectancy - - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> number_person_years_lived: title: Number of person-years lived @@ -340,7 +337,6 @@ tables: topic_tags: - Life Expectancy - Gender Ratio - - << 'Child & Infant Mortality' if age in ['0', '1-4'] else 'Causes of Death'>> life_expectancy_fm_diff: title: Life expectancy difference (f-m) From 035f63942aec1f60110610eefdf95d155aa0e2e7 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 21:25:46 +0100 Subject: [PATCH 30/35] nit memory opt --- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index 4248b2c416c..a3c3419c3cb 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -25,6 +25,10 @@ def run(dest_dir: str) -> None: tb_pop = ds_meadow.read("population") tb_births = ds_meadow.read("births") + # Drop NaNs + tb_exp = tb_exp.dropna(subset="exposure") + tb_births = tb_births.dropna(subset="births") + # # Process data. # From a42189f4f7fe4baa7b3c0aa1e7af55ee3979c165 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 21:41:55 +0100 Subject: [PATCH 31/35] wip --- dag/archive/demography.yml | 6 + dag/demography.yml | 12 +- .../2024-12-02/survivor_percentiles.meta.yml | 44 ++++++ .../2024-12-02/survivor_percentiles.py | 137 ++++++++++++++++++ .../2024-12-02/survivor_percentiles.py | 32 ++++ 5 files changed, 225 insertions(+), 6 deletions(-) create mode 100644 etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml create mode 100644 etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py create mode 100644 etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py diff --git a/dag/archive/demography.yml b/dag/archive/demography.yml index 252fa1205de..3879d5f3c6f 100644 --- a/dag/archive/demography.yml +++ b/dag/archive/demography.yml @@ -64,3 +64,9 @@ steps: - snapshot://hmd/2022-12-07/hmd.zip data://garden/hmd/2022-12-07/life_tables: - data://meadow/hmd/2022-12-07/life_tables + + # Survivorship ages (HMD-derived) + data://garden/demography/2023-09-27/survivor_percentiles: + - data://garden/hmd/2023-09-19/hmd + data://grapher/demography/2023-09-27/survivor_percentiles: + - data://garden/demography/2023-09-27/survivor_percentiles diff --git a/dag/demography.yml b/dag/demography.yml index f1a8f87c25f..cdd5e679957 100644 --- a/dag/demography.yml +++ b/dag/demography.yml @@ -137,12 +137,6 @@ steps: data://grapher/demography/2023-10-04/gini_le: - data://garden/demography/2023-10-04/gini_le - # Survivorship ages (HMD-derived) - data://garden/demography/2023-09-27/survivor_percentiles: - - data://garden/hmd/2023-09-19/hmd - data://grapher/demography/2023-09-27/survivor_percentiles: - - data://garden/demography/2023-09-27/survivor_percentiles - # Phi-gender life expectancy inequality data://garden/demography/2023-10-03/phi_gender_le: - data://garden/demography/2023-10-03/life_tables @@ -253,3 +247,9 @@ steps: - data://meadow/demography/2024-11-26/multiple_births data://grapher/demography/2024-11-26/multiple_births: - data://garden/demography/2024-11-26/multiple_births + + # Survivorship ages (HMD-derived) + data://garden/demography/2024-12-02/survivor_percentiles: + - data://garden/hmd/2024-11-27/hmd + data://grapher/demography/2024-12-02/survivor_percentiles: + - data://garden/demography/2024-12-02/survivor_percentiles diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml new file mode 100644 index 00000000000..cf84b684014 --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml @@ -0,0 +1,44 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Life Expectancy + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/en/latest/architecture/metadata/reference/dataset/ +dataset: + title: Survivorship percentiles (HMD, Alvarez and Vaupel; 2023) + update_period_days: 365 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/en/latest/architecture/metadata/reference/tables/ +tables: + survivor_percentiles: + variables: + age: + title: Survivorship age + unit: years + processing_level: major + description_short: |- + <%- if percentile == 1 -%> + The age until which the 1st percentile (99% of the population) of the population would survive until, if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year. + <%- else -%> + The age until which the << percentile>>th percentile (<< 100 - percentile|int>>% of the population) of the population would survive until, if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year. + <%- endif -%> + + description_processing: |- + This was calculated with the method published in Alvarez and Vaupel (2023), with code provided by the authors: + + Jesús-Adrián Alvarez, James W. Vaupel; Mortality as a Function of Survival. Demography 1 February 2023; 60 (1): 327–342. doi: https://doi.org/10.1215/00703370-10429097 + + These estimates were regenerated for data from more recent years in the Human Mortality Database. + + Original R code from: https://github.com/jssalvrz/s-ages + description_key: + - This is calculated with the period life tables indicators. + display: + numDecimalPlaces: 1 + presentation: + attribution: |- + Alvarez & Vaupel (2023); Human Mortality Database (2023) diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py new file mode 100644 index 00000000000..88de89c2dbd --- /dev/null +++ b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py @@ -0,0 +1,137 @@ +"""Load a meadow dataset and create a garden dataset. + +Methods used here are taken from https://github.com/jssalvrz/s-ages. Authors of Citation: Alvarez, J.-A., & Vaupel, J. W. (2023). Mortality as a Function of Survival. Demography, 60(1), 327–342. https://doi.org/10.1215/00703370-10429097 + + +Dr. Saloni Dattani translated the R scripts into Python: + - Original: https://github.com/jssalvrz/s-ages + - Translated: https://github.com/saloni-nd/misc/tree/main/survivorship-ages + +Lucas Rodes-Guirao adapted the python code for ETL. +""" + +import numpy as np +import pandas as pd +from owid.catalog import Table +from scipy.integrate import cumulative_trapezoid as cumtrapz +from scipy.interpolate import InterpolatedUnivariateSpline + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + paths.log.info("load data.") + # Load meadow dataset. + ds_meadow = paths.load_dataset("hmd") + + # Read table from meadow dataset. + tb_deaths = ds_meadow.read("deaths") + tb_exposure = ds_meadow.read("exposures") + + # + # Process data. + # + # Combine tables, drop NaNs + tb = tb_deaths.merge(tb_exposure, on=["country", "year", "sex", "age"], how="outer") + tb = tb.dropna(subset=["deaths", "exposure"], how="all") + + # Keep format="1x1", and sex="both" + paths.log.info("keep period & 1-year data.") + tb = tb.loc[tb["age"].str.match(r"^\d{3}\+?$") & (tb["type"] == "period")] + + # Drop unused columns + tb = tb.drop(columns=["type"]) + + # 110+ -> 110 + paths.log.info("replace 110+ -> 100, set Dtypes.") + tb["age"] = tb["age"].replace({"110+": "110"}).astype(int) + + # Sort + tb = tb.sort_values(["year", "age"]) + + # Actual calculation + paths.log.info("calculate surviorship ages (can take some minutes)...") + columns_grouping = ["country", "sex", "year"] + tb = tb.groupby(columns_grouping).apply(lambda group: obtain_survivorship_ages(group)).reset_index() # type: ignore + + # Unpivot + paths.log.info("reshape table") + tb = tb.melt( + id_vars=["country", "sex", "year"], + value_vars=["s1", "s10", "s20", "s30", "s40", "s50", "s60", "s70", "s80", "s90", "s99"], + var_name="percentile", + value_name="age", + ) + tb = tb.dropna(subset=["percentile"]) + tb["percentile"] = tb["percentile"].str.replace("s", "").astype(int) + tb["percentile"] = 100 - tb["percentile"] + + # Propagate metadata + tb["age"] = tb["age"].copy_metadata(tb_exposure["exposure"]) + + # Set index + paths.log.info("format") + tb = tb.format(["country", "year", "sex", "percentile"], short_name="survivor_percentiles") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def obtain_survivorship_ages(tb_group: Table, start_age: int = 0, end_age: int = 110) -> pd.DataFrame: + """Get survivorship ages given a life and deaths table. + + Output dataframe has a column for each percentile of survivorship age. + + tb_group is expected to be a subset of the compelte table. It should only concern a particular (country, year, sex) triple. + """ + # Step 1: Apply splines, get Mx for each (country, year, sex, age) + ## Define splines + ### We could use CubicSpline (k=3 order), but it provides slightly different results hence, for precaution, we sticked to InterpolatedUnivariateSpline. + ### This is equivalent to R function interpSpline + spline_deaths = InterpolatedUnivariateSpline(tb_group["age"], tb_group["deaths"], k=3) + spline_exposures = InterpolatedUnivariateSpline(tb_group["age"], tb_group["exposure"], k=3) + + ## Define age range (with step 0.01) + age_range = np.arange(start_age, end_age, 0.01) + + # Run splines over age range + deaths_spline = np.abs(spline_deaths(age_range)) + exposure_spline = np.abs(spline_exposures(age_range)) + exposure_spline[exposure_spline == 0] = np.nan + survival_age_spline = np.abs(deaths_spline / exposure_spline) + + # Step 2: Calculate survival, density, hazard, and cumulative hazards + ## Estimate parameters + Hx = cumtrapz(y=survival_age_spline, x=age_range, initial=0) # Hazard CDF + Sx = np.exp(-Hx) # Survivor function + + # Step 3: Calculate survivorship ages from parameters + out = {} + out["s0"] = max(age_range) + ## I'm using a for loop to simplify the logic here + for i in range(1, 101): + try: + sx_rounded = np.ceil((100 * Sx).round(3)) + value = age_range[sx_rounded == i][0] + out[f"s{i}"] = value + except IndexError: + out[f"s{i}"] = np.nan + + # Create output dataframe + df = pd.DataFrame(out, index=[0]) + + return df diff --git a/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py b/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py new file mode 100644 index 00000000000..ea3d8dd07e2 --- /dev/null +++ b/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py @@ -0,0 +1,32 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("survivor_percentiles") + + # Read table from garden dataset. + + # + # Process data. + # + tables = list(ds_garden) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() From 326aec6149469a0149e3b05df3ba29c99be2382b Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 21:58:39 +0100 Subject: [PATCH 32/35] fix origins propagation --- .../data/garden/demography/2024-12-02/survivor_percentiles.py | 2 +- etl/steps/data/garden/hmd/2024-11-27/hmd.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py index 88de89c2dbd..c1b4ab4d326 100644 --- a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py +++ b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py @@ -73,7 +73,7 @@ def run(dest_dir: str) -> None: tb["percentile"] = 100 - tb["percentile"] # Propagate metadata - tb["age"] = tb["age"].copy_metadata(tb_exposure["exposure"]) + tb["age"].metadata.origins = tb_exposure["exposure"].m.origins.copy() # Set index paths.log.info("format") diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.py b/etl/steps/data/garden/hmd/2024-11-27/hmd.py index a3c3419c3cb..d1fde80301b 100644 --- a/etl/steps/data/garden/hmd/2024-11-27/hmd.py +++ b/etl/steps/data/garden/hmd/2024-11-27/hmd.py @@ -222,7 +222,7 @@ def make_table_diffs_ratios(tb: Table) -> Table: # Add metadata back for col in tb_new.columns: if col not in cols_index: - tb_new[col] = tb_new[col].copy_metadata(tb["life_expectancy"]) + tb_new[col].metadata.origins = tb["life_expectancy"].m.origins.copy() tb_new[col] = tb_new[col].replace([np.inf, -np.inf], np.nan) return tb_new From 4e2fba8b59bfb74c99a095bc8a206e685da5d3e1 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 22:16:37 +0100 Subject: [PATCH 33/35] grapher --- .../grapher/demography/2024-12-02/survivor_percentiles.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py b/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py index ea3d8dd07e2..1e319eaee4c 100644 --- a/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py +++ b/etl/steps/data/grapher/demography/2024-12-02/survivor_percentiles.py @@ -25,7 +25,10 @@ def run(dest_dir: str) -> None: # # Create a new grapher dataset with the same metadata as the garden dataset. ds_grapher = create_dataset( - dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_garden.metadata, ) # Save changes in the new grapher dataset. From 5e5b4389f77b7a441dcf47aed1b337b55c8fea14 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 22:19:18 +0100 Subject: [PATCH 34/35] dataset title --- .../garden/demography/2024-12-02/survivor_percentiles.meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml index cf84b684014..9c06c6ce986 100644 --- a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml @@ -8,7 +8,7 @@ definitions: # Learn more about the available fields: # http://docs.owid.io/projects/etl/en/latest/architecture/metadata/reference/dataset/ dataset: - title: Survivorship percentiles (HMD, Alvarez and Vaupel; 2023) + title: Survivorship percentiles (HMD; Alvarez and Vaupel 2023) update_period_days: 365 # Learn more about the available fields: From a9f2ddf8cb4a10af01eb71248eed1f97f782b3d2 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 2 Dec 2024 22:57:10 +0100 Subject: [PATCH 35/35] minor metadata update --- .../demography/2024-12-02/survivor_percentiles.meta.yml | 2 +- .../data/garden/demography/2024-12-02/survivor_percentiles.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml index 9c06c6ce986..07e19bde3f9 100644 --- a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml +++ b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml @@ -41,4 +41,4 @@ tables: numDecimalPlaces: 1 presentation: attribution: |- - Alvarez & Vaupel (2023); Human Mortality Database (2023) + Alvarez & Vaupel (2023); Human Mortality Database (2024) diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py index c1b4ab4d326..1f2b1ef59cc 100644 --- a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py +++ b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py @@ -39,11 +39,11 @@ def run(dest_dir: str) -> None: # # Combine tables, drop NaNs tb = tb_deaths.merge(tb_exposure, on=["country", "year", "sex", "age"], how="outer") - tb = tb.dropna(subset=["deaths", "exposure"], how="all") + tb = tb.dropna(subset=["deaths", "exposure"], how="any") # Keep format="1x1", and sex="both" paths.log.info("keep period & 1-year data.") - tb = tb.loc[tb["age"].str.match(r"^\d{3}\+?$") & (tb["type"] == "period")] + tb = tb.loc[tb["age"].str.match(r"^(\d{1,3}|d{3}\+)$") & (tb["type"] == "period")] # Drop unused columns tb = tb.drop(columns=["type"])