diff --git a/dag/demography.yml b/dag/demography.yml index 453a694c419..c17ad96c410 100644 --- a/dag/demography.yml +++ b/dag/demography.yml @@ -122,6 +122,7 @@ steps: - data://garden/un/2022-07-11/un_wpp data://grapher/un/2024-03-14/un_wpp_most: - data://garden/un/2024-03-14/un_wpp_most + ######################################################################## # Life expectancy # ######################################################################## @@ -252,16 +253,20 @@ steps: data://grapher/demography/2024-12-03/fertility_rate: - data://garden/demography/2024-12-03/fertility_rate - # OMM: Mean Age at Birth -- HFD + UN WPP - # data://garden/demography/2024-12-03/mean_age_birth: - # - data://garden/hmd/2024-11-19/hfd - # - data://garden/un/2024-07-12/un_wpp - # data://grapher/demography/2024-12-03/mean_age_birth: - # - data://garden/demography/2024-12-03/mean_age_birth - # OMM: Birth rate -- HFD + UN WPP data://garden/demography/2024-12-03/birth_rate: - data://garden/hmd/2024-12-01/hmd - data://garden/un/2024-07-12/un_wpp data://grapher/demography/2024-12-03/birth_rate: - data://garden/demography/2024-12-03/birth_rate + + # HMD country data + data://meadow/hmd/2024-12-03/hmd_country: + - snapshot://hmd/2024-12-01/hmd_country.zip + + # HMD - Birth rate by month + data://garden/hmd/2024-12-03/hmd_country: + - data://meadow/hmd/2024-12-03/hmd_country + - data://garden/hmd/2024-12-01/hmd + data://grapher/hmd/2024-12-03/hmd_country: + - data://garden/hmd/2024-12-03/hmd_country diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.countries.json b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.countries.json new file mode 100644 index 00000000000..26b62193714 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.countries.json @@ -0,0 +1,47 @@ +{ + "AUS": "Australia", + "AUT": "Austria", + "BEL": "Belgium", + "BGR": "Bulgaria", + "BLR": "Belarus", + "CAN": "Canada", + "CHE": "Switzerland", + "CHL": "Chile", + "CZE": "Czechia", + "DNK": 
"Denmark", + "ESP": "Spain", + "EST": "Estonia", + "FIN": "Finland", + "GRC": "Greece", + "HKG": "Hong Kong", + "HRV": "Croatia", + "HUN": "Hungary", + "IRL": "Ireland", + "ISL": "Iceland", + "ISR": "Israel", + "ITA": "Italy", + "JPN": "Japan", + "KOR": "South Korea", + "LTU": "Lithuania", + "LUX": "Luxembourg", + "LVA": "Latvia", + "NLD": "Netherlands", + "NOR": "Norway", + "POL": "Poland", + "PRT": "Portugal", + "RUS": "Russia", + "SVK": "Slovakia", + "SVN": "Slovenia", + "SWE": "Sweden", + "UKR": "Ukraine", + "USA": "United States", + "DEUTE": "East Germany", + "DEUTNP": "Germany", + "DEUTW": "West Germany", + "FRATNP": "France", + "GBRTENW": "England and Wales", + "GBR_NIR": "Northern Ireland", + "GBR_NP": "United Kingdom", + "GBR_SCO": "Scotland", + "NZL_NP": "New Zealand" +} diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.excluded_countries.json b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.excluded_countries.json new file mode 100644 index 00000000000..3a7f14126b5 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.excluded_countries.json @@ -0,0 +1,4 @@ +[ + "FRACNP", + "GBRCENW" +] diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.meta.yml b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.meta.yml new file mode 100644 index 00000000000..e5ba0b87a6c --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.meta.yml @@ -0,0 +1,74 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Fertility Rate + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + title: Birth rate by month (HMD) + update_period_days: 365 + +tables: + birth_rate: + variables: + birth_rate: + title: Birth rate (monthly) + unit: births per 1,000 people + description_short: |- + The total number of births per 1,000 people in a given month. 
+ display: + name: |- + Birth rate + + birth_rate_per_day: + title: Daily birth rate (average in month) + unit: births per 1,000 people + description_short: |- + The average daily number of births, per 1,000 people, calculated monthly. + display: + name: |- + Birth rate, per day + + birth_rate_month: + variables: + birth_rate: + title: Birth rate (monthly) - << month >> + unit: births per 1,000 people + description_short: |- + The total number of births per 1,000 people in << month >>. + display: + name: |- + Birth rate + + birth_rate_per_day: + title: Daily birth rate (average in month) - << month >> + unit: births per 1,000 people + description_short: |- + The average daily number of births, per 1,000 people, calculated for << month >>. + display: + name: |- + Birth rate, per day + + birth_rate_month_max: + variables: + month_max: + title: Month ordinal with the peak daily birth rate + unit: "" + description_short: |- + Number corresponding to the month with the highest daily birth rate. + month_max_name: + title: Month name with the peak daily birth rate + unit: "" + description_short: |- + Month with the highest daily birth rate. + birth_rate_per_day_max: + title: Peak daily birth rate + unit: births per 1,000 people + description_short: |- + The highest average daily number of births, per 1,000 people, recorded in the given year. + display: + name: |- + Maximum birth rate, per day diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py new file mode 100644 index 00000000000..c7964f636d9 --- /dev/null +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py @@ -0,0 +1,160 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import calendar + +import numpy as np +import pandas as pd + +from etl.data_helpers import geo +from etl.data_helpers.misc import interpolate_table +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("hmd_country") + ds_hmd = paths.load_dataset("hmd") + + # Read table from meadow dataset. + tb_month = ds_meadow.read("monthly") + tb_pop = ds_hmd.read("population") + + # + # Process data. + # + tb_month_long, tb_month_dimensions, tb_month_max = make_monthly_tables(tb_month, tb_pop) + tables = [ + tb_month_long.format(["country", "date"], short_name="birth_rate"), + tb_month_dimensions.format(["country", "year", "month"], short_name="birth_rate_month"), + tb_month_max.format(["country", "year"], short_name="birth_rate_month_max"), + ] + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def make_monthly_tables(tb, tb_pop): + ## Discard unknown/total values + tb = tb.loc[~tb["month"].isin(["TOT", "UNK"])] + tb["month"] = tb["month"].astype(int) + ## Create date column. 
TODO: check what day of the month to assign + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=1)) + # Harmonize country names + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + warn_on_unknown_excluded_countries=False, + ) + + # Add population to monthly birth data table + tb = add_population_column(tb, tb_pop) + + # Estimate metrics + tb = estimate_metrics(tb) + + # Sort rows + tb = tb.sort_values(["country", "date"]) + + # Classic time-series, with date-values + tb_long = tb[["country", "date", "birth_rate", "birth_rate_per_day"]] + + # Month as a dimension + tb_dimensions = tb[["country", "year", "month", "birth_rate", "birth_rate_per_day"]].copy() + tb_dimensions["month"] = tb_dimensions["month"].apply(lambda x: calendar.month_name[x]) + + # For each year, ID of the month with highest birth rate per day + tb_month_max = tb.loc[ + tb.groupby(["country", "year"])["birth_rate_per_day"].idxmax(), + ["country", "year", "month", "birth_rate_per_day"], + ].rename(columns={"month": "month_max", "birth_rate_per_day": "birth_rate_per_day_max"}) + tb_month_max["month_max_name"] = tb_month_max["month_max"].apply(lambda x: calendar.month_name[x]) + + return tb_long, tb_dimensions, tb_month_max + + +def clean_table(tb): + """Filter rows, harmonize country names, add date column.""" + # Filter unwanted month categories, set dtype + tb = tb.loc[~tb["month"].isin(["TOT", "UNK"])] + tb["month"] = tb["month"].astype(int) + ## Create date column. 
TODO: check what day of the month to assign + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=1)) + # Harmonize country names + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.directory / (paths.short_name + "_month.countries.json"), + excluded_countries_file=paths.excluded_countries_path, + warn_on_unknown_excluded_countries=False, + ) + + return tb + + +def add_population_column(tb, tb_pop): + """Add population column to main table for each date.""" + # Prepare population table + tb_pop = _prepare_population_table(tb_pop) + # Merge population table with main table + tb = tb.merge(tb_pop, on=["country", "date"], how="left") + tb = tb.sort_values(["country", "date"]) + # Interpolate to get monthly population estimates + tb_ = interpolate_table( + tb[["country", "date", "population"]], + entity_col="country", + time_col="date", + time_mode="none", + ) + tb = tb.drop(columns="population").merge(tb_, on=["country", "date"], how="left") + + return tb + + +def _prepare_population_table(tb): + """Prepare population table to merge with main table. + + Original table is given in years, but we need it in days! We use linear interpolation for that. + """ + tb_aux = tb.loc[(tb["sex"] == "total") & ~(tb["age"].str.contains("-")), ["country", "year", "population"]] + tb_aux = tb_aux.groupby(["country", "year"], as_index=False)["population"].sum() + ## Assign a day to population. 
TODO: Check if this is true + tb_aux["date"] = pd.to_datetime(tb_aux["year"].astype(str) + "-01-01") + tb_aux = tb_aux.drop(columns="year") + + return tb_aux + + +def estimate_metrics(tb): + """Estimate metrics: birth rate and birth rate per day.""" + # Get days in month + tb["days_in_month"] = tb.apply(lambda row: calendar.monthrange(row["year"], row["month"])[1], axis=1) + # Estimate rates + tb["birth_rate"] = tb["births"] / tb["population"] * 1_000 + tb["birth_rate_per_day"] = tb["birth_rate"] / tb["days_in_month"] + # Check + assert tb[["birth_rate", "birth_rate_per_day"]].notna().all().all() + # Replace INF values with NAs + tb[["birth_rate", "birth_rate_per_day"]] = tb[["birth_rate", "birth_rate_per_day"]].replace( + [np.inf, -np.inf], pd.NA + ) + # Drop NAs + tb = tb.dropna(subset=["birth_rate", "birth_rate_per_day"]) + + return tb diff --git a/etl/steps/data/grapher/hmd/2024-12-03/hmd_country.py b/etl/steps/data/grapher/hmd/2024-12-03/hmd_country.py new file mode 100644 index 00000000000..8b3a73fafe6 --- /dev/null +++ b/etl/steps/data/grapher/hmd/2024-12-03/hmd_country.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("hmd_country") + + # Read table from garden dataset. + tables = list(ds_garden) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() diff --git a/etl/steps/data/meadow/hmd/2024-12-03/hmd_country.py b/etl/steps/data/meadow/hmd/2024-12-03/hmd_country.py new file mode 100644 index 00000000000..ef50429e65c --- /dev/null +++ b/etl/steps/data/meadow/hmd/2024-12-03/hmd_country.py @@ -0,0 +1,69 @@ +"""Load a snapshot and create a meadow dataset.""" + +from pathlib import Path + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("hmd_country.zip") + + # Load data from snapshot. + paths.log.info("Loading data from snapshot.") + tbs = [] + with snap.extract_to_tempdir() as tmp_dir: + p = Path(tmp_dir) + files = p.glob("**/InputDB/*month.txt") + for f in files: + tb_ = pr.read_csv( + f, + na_values=["."], + metadata=snap.to_table_metadata(), + origin=snap.m.origin, + ) + tb_.columns = tb_.columns.str.strip() + tb_ = tb_.rename( + columns={ + "NoteCode1": "Note1", + "NoteCode2": "Note2", + "NoteCode3": "Note3", + } + ) + tbs.append(tb_) + + # Concatenate + paths.log.info("Concatenating tables.") + tb = pr.concat(tbs, ignore_index=True) + tb = tb.rename(columns={"PopName": "country"}) + + # + # Process data. + # + paths.log.info("Processing data.") + tb = tb.groupby(["country", "Year", "Month"], as_index=False)["Births"].mean() + tb = tb.astype( + { + "country": "string", + "Month": "string", + } + ) + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year", "month"], short_name="monthly") + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/snapshots/hmd/2024-12-01/hmd_country.py b/snapshots/hmd/2024-12-01/hmd_country.py new file mode 100644 index 00000000000..356e913b41d --- /dev/null +++ b/snapshots/hmd/2024-12-01/hmd_country.py @@ -0,0 +1,32 @@ +"""Download data manually: + +- Go to https://mortality.org/Data/ZippedDataFiles +- Scroll down to "By country" section +- Click on "All HMD countries" + +Note: you need to be logged in to download the data. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"hmd/{SNAPSHOT_VERSION}/hmd_country.zip") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/hmd/2024-12-01/hmd_country.zip.dvc b/snapshots/hmd/2024-12-01/hmd_country.zip.dvc new file mode 100644 index 00000000000..9a0d86340f9 --- /dev/null +++ b/snapshots/hmd/2024-12-01/hmd_country.zip.dvc @@ -0,0 +1,79 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Human Mortality Database, by country + description: |- + The Human Mortality Database (HMD) contains original calculations of all-cause death rates and life tables for national populations (countries or areas), as well as the input data used in constructing those tables. The input data consist of death counts from vital statistics, plus census counts, birth counts, and population estimates from various sources. 
+ + + # Scope and basic principles + + The database is limited by design to populations where death registration and census data are virtually complete, since this type of information is required for the uniform method used to reconstruct historical data series. As a result, the countries and areas included here are relatively wealthy and for the most part highly industrialized. + + The main goal of the Human Mortality Database is to document the longevity revolution of the modern era and to facilitate research into its causes and consequences. As much as possible, the authors of the database have followed four guiding principles: comparability, flexibility, accessibility, reproducibility. + + + # Computing death rates and life tables + + Their process for computing mortality rates and life tables can be described in terms of six steps, corresponding to six data types that are available from the HMD. Here is an overview of the process: + + 1. Births. Annual counts of live births by sex are collected for each population over the longest possible time period. These counts are used mainly for making population estimates at younger ages. + 2. Deaths. Death counts are collected at the finest level of detail available. If raw data are aggregated, uniform methods are used to estimate death counts by completed age (i.e., age-last-birthday at time of death), calendar year of death, and calendar year of birth. + 3. Population size. Annual estimates of population size on January 1st are either obtained from another source or are derived from census data plus birth and death counts. + 4. Exposure-to-risk. Estimates of the population exposed to the risk of death during some age-time interval are based on annual (January 1st) population estimates, with a small correction that reflects the timing of deaths within the interval. + 5. Death rates. 
Death rates are always a ratio of the death count for a given age-time interval divided by an estimate of the exposure-to-risk in the same interval. + 6. Life tables. To build a life table, probabilities of death are computed from death rates. These probabilities are used to construct life tables, which include life expectancies and other useful indicators of mortality and longevity. + + + # Corrections to the data + + The data presented here have been corrected for gross errors (e.g., a processing error whereby 3,800 becomes 38,000 in a published statistical table would be obvious in most cases, and it would be corrected). However, the authors have not attempted to correct the data for systematic age misstatement (misreporting of age) or coverage errors (over- or under-enumeration of people or events). + + Some available studies assess the completeness of census coverage or death registration in the various countries, and more work is needed in this area. However, in developing the database thus far, the authors did not consider it feasible or desirable to attempt corrections of this sort, especially since it would be impossible to correct the data by a uniform method across all countries. + + + # Age misreporting + + Populations are included here if there is a well-founded belief that the coverage of their census and vital registration systems is relatively high, and thus, that fruitful analyses by both specialists and non-specialists should be possible with these data. Nevertheless, there is evidence of both age heaping (overreporting ages ending in "0" or "5") and age exaggeration in these data. + + In general, the degree of age heaping in these data varies by the time period and population considered, but it is usually no burden to scientific analysis. In most cases, it is sufficient to analyze data in five-year age groups in order to avoid the false impressions created by this particular form of age misstatement. 
+ + Age exaggeration, on the other hand, is a more insidious problem. The authors' approach is guided by the conventional wisdom that age reporting in death registration systems is typically more reliable than in census counts or official population estimates. For this reason, the authors derive population estimates at older ages from the death counts themselves, employing extinct cohort methods. Such methods eliminate some, but certainly not all, of the biases in old-age mortality estimates due to age exaggeration. + + + # Uniform set of procedures + + A key goal of this project is to follow a uniform set of procedures for each population. This approach does not guarantee the cross-national comparability of the data. Rather, it ensures only that the authors have not introduced biases by the authors' own manipulations. The desire of the authors for uniformity had to face the challenge that raw data come in a variety of formats (for example, 1-year versus 5-year age groups). The authors' general approach to this problem is that the available raw data are used first to estimate two quantities: 1) the number of deaths by completed age, year of birth, and year of death; and 2) population estimates by single years of age on January 1 of each year. For each population, these calculations are performed separately by sex. From these two pieces of information, they compute death rates and life tables in a variety of age-time configurations. + + It is reasonable to ask whether a single procedure is the best method for treating the data from a variety of populations. Here, two points must be considered. First, the authors' uniform methodology is based on procedures that were developed separately, though following similar principles, for various countries and by different researchers. Earlier methods were synthesized by choosing what they considered the best among alternative procedures and by eliminating superficial inconsistencies. 
The second point is that a uniform procedure is possible only because the authors have not attempted to correct the data for reporting and coverage errors. Although some general principles could be followed, such problems would have to be addressed individually for each population. + + Although the authors adhere strictly to a uniform procedure, the data for each population also receive significant individualized attention. Each country or area is assigned to an individual researcher, who takes responsibility for assembling and checking the data for errors. In addition, the person assigned to each country/area checks the authors' data against other available sources. These procedures help to assure a high level of data quality, but assistance from database users in identifying problems is always appreciated! + description_snapshot: |- + HMD data by country. This contains the raw data, including their "input data", which HMD defines as: + + The Input Database houses the raw data that are the basis for all HMD calculations. Input data files for each population are accessible from the country page. + + date_published: "2024-11-13" + # Citation + producer: Human Mortality Database + citation_full: |- + HMD. Human Mortality Database. Max Planck Institute for Demographic Research (Germany), University of California, Berkeley (USA), and French Institute for Demographic Studies (France). Available at www.mortality.org. + + See also the methods protocol: + Wilmoth, J. R., Andreev, K., Jdanov, D., Glei, D. A., Riffe, T., Boe, C., Bubenheim, M., Philipov, D., Shkolnikov, V., Vachon, P., Winant, C., & Barbieri, M. (2021). Methods protocol for the human mortality database (v6). [Available online](https://www.mortality.org/File/GetDocument/Public/Docs/MethodsProtocolV6.pdf) (needs log in to mortality.org). 
+ attribution_short: HMD + # Files + url_main: https://www.mortality.org/Data/ZippedDataFiles + date_accessed: 2024-11-27 + + # License + license: + name: CC BY 4.0 + url: https://www.mortality.org/Data/UserAgreement + +outs: + - md5: efae1882e47e8132bd5a2add9f7e445a + size: 345841896 + path: hmd_country.zip