Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

📊 add birth rate monthly data #3687

Merged
merged 5 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions dag/demography.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ steps:
- data://garden/un/2022-07-11/un_wpp
data://grapher/un/2024-03-14/un_wpp_most:
- data://garden/un/2024-03-14/un_wpp_most

########################################################################
# Life expectancy #
########################################################################
Expand Down Expand Up @@ -252,16 +253,20 @@ steps:
data://grapher/demography/2024-12-03/fertility_rate:
- data://garden/demography/2024-12-03/fertility_rate

# OMM: Mean Age at Birth -- HFD + UN WPP
# data://garden/demography/2024-12-03/mean_age_birth:
# - data://garden/hmd/2024-11-19/hfd
# - data://garden/un/2024-07-12/un_wpp
# data://grapher/demography/2024-12-03/mean_age_birth:
# - data://garden/demography/2024-12-03/mean_age_birth

# OMM: Birth rate -- HMD + UN WPP
data://garden/demography/2024-12-03/birth_rate:
- data://garden/hmd/2024-12-01/hmd
- data://garden/un/2024-07-12/un_wpp
data://grapher/demography/2024-12-03/birth_rate:
- data://garden/demography/2024-12-03/birth_rate

# HMD country data
data://meadow/hmd/2024-12-03/hmd_country:
- snapshot://hmd/2024-12-01/hmd_country.zip

# HMD - Birth rate by month
data://garden/hmd/2024-12-03/hmd_country:
- data://meadow/hmd/2024-12-03/hmd_country
- data://garden/hmd/2024-12-01/hmd
data://grapher/hmd/2024-12-03/hmd_country:
- data://garden/hmd/2024-12-03/hmd_country
47 changes: 47 additions & 0 deletions etl/steps/data/garden/hmd/2024-12-03/hmd_country.countries.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"AUS": "Australia",
"AUT": "Austria",
"BEL": "Belgium",
"BGR": "Bulgaria",
"BLR": "Belarus",
"CAN": "Canada",
"CHE": "Switzerland",
"CHL": "Chile",
"CZE": "Czechia",
"DNK": "Denmark",
"ESP": "Spain",
"EST": "Estonia",
"FIN": "Finland",
"GRC": "Greece",
"HKG": "Hong Kong",
"HRV": "Croatia",
"HUN": "Hungary",
"IRL": "Ireland",
"ISL": "Iceland",
"ISR": "Israel",
"ITA": "Italy",
"JPN": "Japan",
"KOR": "South Korea",
"LTU": "Lithuania",
"LUX": "Luxembourg",
"LVA": "Latvia",
"NLD": "Netherlands",
"NOR": "Norway",
"POL": "Poland",
"PRT": "Portugal",
"RUS": "Russia",
"SVK": "Slovakia",
"SVN": "Slovenia",
"SWE": "Sweden",
"UKR": "Ukraine",
"USA": "United States",
"DEUTE": "East Germany",
"DEUTNP": "Germany",
"DEUTW": "West Germany",
"FRATNP": "France",
"GBRTENW": "England and Wales",
"GBR_NIR": "Northern Ireland",
"GBR_NP": "United Kingdom",
"GBR_SCO": "Scotland",
"NZL_NP": "New Zealand"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[
"FRACNP",
"GBRCENW"
]
74 changes: 74 additions & 0 deletions etl/steps/data/garden/hmd/2024-12-03/hmd_country.meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Fertility Rate

# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
title: Birth rate by month (HMD)
update_period_days: 365

tables:
birth_rate:
variables:
birth_rate:
title: Birth rate (monthly)
unit: births per 1,000 people
description_short: |-
The total number of births per 1,000 people in a given month.
display:
name: |-
Birth rate

birth_rate_per_day:
title: Daily birth rate (average in month)
unit: births per 1,000 people
description_short: |-
The average daily number of births, per 1,000 people, calculated monthly.
display:
name: |-
Birth rate, per day

birth_rate_month:
variables:
birth_rate:
title: Birth rate (monthly) - << month >>
unit: births per 1,000 people
description_short: |-
The total number of births per 1,000 people in <<month>>.
display:
name: |-
Birth rate

birth_rate_per_day:
title: Daily birth rate (average in month) - << month >>
unit: births per 1,000 people
description_short: |-
The average daily number of births, per 1,000 people, calculated for <<month>>.
display:
name: |-
Birth rate, per day

birth_rate_month_max:
variables:
month_max:
title: Month ordinal with the peak daily birth rate
unit: ""
description_short: |-
Number corresponding to the month with the highest daily birth rate.
month_max_name:
title: Month name with the peak daily birth rate
unit: ""
description_short: |-
Month with the highest daily birth rate.
birth_rate_per_day_max:
title: Peak daily birth rate
unit: births per 1,000 people
description_short: |-
The highest average daily number of births, per 1,000 people, recorded in the given year.
display:
name: |-
Maximum birth rate, per day
160 changes: 160 additions & 0 deletions etl/steps/data/garden/hmd/2024-12-03/hmd_country.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""Load a meadow dataset and create a garden dataset."""

import calendar

import numpy as np
import pandas as pd

from etl.data_helpers import geo
from etl.data_helpers.misc import interpolate_table
from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Build the garden dataset holding the monthly birth-rate tables.

    Reads the "monthly" table from the `hmd_country` meadow dataset and the
    "population" table from the `hmd` dataset, derives three tables from them
    and saves them as a new dataset under `dest_dir`.
    """
    #
    # Load inputs.
    #
    # Datasets this step depends on.
    ds_meadow = paths.load_dataset("hmd_country")
    ds_hmd = paths.load_dataset("hmd")

    # Tables needed from each dataset.
    tb_month = ds_meadow.read("monthly")
    tb_pop = ds_hmd.read("population")

    #
    # Process data.
    #
    tb_long, tb_dimensions, tb_max = make_monthly_tables(tb_month, tb_pop)

    # Set index columns and short name on each output table.
    tb_long = tb_long.format(["country", "date"], short_name="birth_rate")
    tb_dimensions = tb_dimensions.format(["country", "year", "month"], short_name="birth_rate_month")
    tb_max = tb_max.format(["country", "year"], short_name="birth_rate_month_max")

    #
    # Save outputs.
    #
    # The new garden dataset uses the meadow dataset's metadata as default.
    ds_garden = create_dataset(
        dest_dir,
        tables=[tb_long, tb_dimensions, tb_max],
        check_variables_metadata=True,
        default_metadata=ds_meadow.metadata,
    )

    # Write the dataset to disk.
    ds_garden.save()


def make_monthly_tables(tb, tb_pop):
    """Build the three birth-rate tables from the raw monthly table.

    Parameters
    ----------
    tb:
        Monthly table. The code reads its "country", "year", "month" and
        "births" columns; "month" may contain the non-numeric markers "TOT"
        and "UNK", which are discarded.
    tb_pop:
        Population table, passed through to `add_population_column`.

    Returns
    -------
    tuple
        `(tb_long, tb_dimensions, tb_month_max)`:

        - `tb_long`: one row per country and date, with both rate columns.
        - `tb_dimensions`: one row per country, year and month, where month is
          the English month name.
        - `tb_month_max`: for each country and year, the month (number and
          name) with the highest `birth_rate_per_day`, and that value.
    """
    ## Discard unknown/total values. Copy so that the column assignments below act on an
    ## independent table instead of on a slice of the caller's table.
    tb = tb.loc[~tb["month"].isin(["TOT", "UNK"])].copy()
    tb["month"] = tb["month"].astype(int)
    ## Create date column. TODO: check what day of the month to assign
    tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=1))
    # Harmonize country names
    tb = geo.harmonize_countries(
        df=tb,
        countries_file=paths.country_mapping_path,
        excluded_countries_file=paths.excluded_countries_path,
        warn_on_unknown_excluded_countries=False,
    )

    # Add population to monthly birth data table
    tb = add_population_column(tb, tb_pop)

    # Estimate metrics
    tb = estimate_metrics(tb)

    # Sort rows
    tb = tb.sort_values(["country", "date"])

    # Classic time-series, with date-values
    tb_long = tb[["country", "date", "birth_rate", "birth_rate_per_day"]]

    # Month as a dimension (copy before overwriting the month column with its name)
    tb_dimensions = tb[["country", "year", "month", "birth_rate", "birth_rate_per_day"]].copy()
    tb_dimensions["month"] = tb_dimensions["month"].apply(lambda x: calendar.month_name[x])

    # For each year, ID of the month with highest birth rate per day
    tb_month_max = tb.loc[
        tb.groupby(["country", "year"])["birth_rate_per_day"].idxmax(),
        ["country", "year", "month", "birth_rate_per_day"],
    ].rename(columns={"month": "month_max", "birth_rate_per_day": "birth_rate_per_day_max"})
    tb_month_max["month_max_name"] = tb_month_max["month_max"].apply(lambda x: calendar.month_name[x])

    return tb_long, tb_dimensions, tb_month_max


def clean_table(tb):
    """Filter rows, harmonize country names, add date column.

    Rows whose "month" is "TOT" or "UNK" are dropped, "month" is cast to int,
    and a "date" column is built from "year" and "month" using day 1.
    Returns the harmonized table; the input table is not modified.
    """
    # Filter unwanted month categories, set dtype. Copy so that the assignments below
    # act on an independent table instead of on a slice of the caller's table.
    tb = tb.loc[~tb["month"].isin(["TOT", "UNK"])].copy()
    tb["month"] = tb["month"].astype(int)
    ## Create date column. TODO: check what day of the month to assign
    tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=1))
    # Harmonize country names, using this step's own country mapping (the same file
    # that `make_monthly_tables` uses).
    tb = geo.harmonize_countries(
        df=tb,
        countries_file=paths.country_mapping_path,
        excluded_countries_file=paths.excluded_countries_path,
        warn_on_unknown_excluded_countries=False,
    )

    return tb


def add_population_column(tb, tb_pop):
    """Attach a "population" column to the monthly table.

    The population table is first reshaped by `_prepare_population_table`,
    left-joined onto `tb` by country and date, and the resulting "population"
    column is then passed through `interpolate_table` before being joined
    back onto the table.
    """
    join_keys = ["country", "date"]

    # Population reshaped so that it carries a date column
    population = _prepare_population_table(tb_pop)

    # Left join keeps every row of the main table; rows without a matching date get no value
    merged = tb.merge(population, on=join_keys, how="left")
    merged = merged.sort_values(join_keys)

    # Interpolate the population column per country along the date axis
    # NOTE(review): presumably this fills the rows left empty by the join above — confirm
    # against `interpolate_table` with time_mode="none".
    population_filled = interpolate_table(
        merged[[*join_keys, "population"]],
        entity_col="country",
        time_col="date",
        time_mode="none",
    )

    # Swap the sparse population column for the interpolated one
    without_population = merged.drop(columns="population")
    return without_population.merge(population_filled, on=join_keys, how="left")


def _prepare_population_table(tb):
"""Prepare population table to merge with main table.

Original table is given in years, but we need it in days! We use linear interpolation for that.
"""
tb_aux = tb.loc[(tb["sex"] == "total") & ~(tb["age"].str.contains("-")), ["country", "year", "population"]]
tb_aux = tb_aux.groupby(["country", "year"], as_index=False)["population"].sum()
## Assign a day to population. TODO: Check if this is true
tb_aux["date"] = pd.to_datetime(tb_aux["year"].astype(str) + "-01-01")
tb_aux = tb_aux.drop(columns="year")

return tb_aux


def estimate_metrics(tb):
    """Estimate metrics: birth rate and birth rate per day.

    Parameters
    ----------
    tb:
        Table with "year", "month" (integer 1-12), "births" and "population"
        columns.

    Returns
    -------
    A copy of `tb` with three added columns ("days_in_month", "birth_rate",
    "birth_rate_per_day"), without the rows whose rates are infinite (e.g.
    zero population). The input table is left untouched.

    Raises
    ------
    AssertionError
        If any rate is missing (NaN) before infinite values are discarded.
    """
    # Work on a copy so that the caller's table does not gain columns as a side effect
    tb = tb.copy()
    rate_columns = ["birth_rate", "birth_rate_per_day"]

    # Get days in month (vectorised; leap years are handled by the datetime accessor)
    month_start = pd.to_datetime(tb[["year", "month"]].assign(day=1))
    tb["days_in_month"] = month_start.dt.days_in_month

    # Estimate rates
    tb["birth_rate"] = tb["births"] / tb["population"] * 1_000
    # NOTE(review): `birth_rate` is already per 1,000 people, so the extra factor below makes
    # this births per 1,000,000 people per day, while the dataset metadata declares the unit as
    # "births per 1,000 people". Left unchanged on purpose — confirm which unit is intended.
    tb["birth_rate_per_day"] = tb["birth_rate"] / tb["days_in_month"] * 1_000

    # Check
    assert tb[rate_columns].notna().all().all(), "Found missing birth rates (missing births or population?)"

    # Replace INF values with NaN. Using np.nan rather than pd.NA keeps the columns as floats;
    # pd.NA would turn them into object columns whenever an infinite value is present.
    tb[rate_columns] = tb[rate_columns].replace([np.inf, -np.inf], np.nan)

    # Drop NAs
    tb = tb.dropna(subset=rate_columns)

    return tb
28 changes: 28 additions & 0 deletions etl/steps/data/grapher/hmd/2024-12-03/hmd_country.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Build the grapher dataset from every table of the `hmd_country` garden dataset."""
    #
    # Load inputs.
    #
    # Garden dataset this step depends on.
    ds_garden = paths.load_dataset("hmd_country")

    # Collect all of its tables.
    garden_tables = list(ds_garden)

    #
    # Save outputs.
    #
    # The new grapher dataset uses the garden dataset's metadata as default.
    ds_grapher = create_dataset(
        dest_dir,
        tables=garden_tables,
        check_variables_metadata=True,
        default_metadata=ds_garden.metadata,
    )

    # Write the dataset to disk.
    ds_grapher.save()
Loading
Loading