-
-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
📊 add birth rate monthly data (#3687)
* 📊 add birth rate monthly data * wip * wip * wip * wip
- Loading branch information
1 parent
1445369
commit edc48ee
Showing
9 changed files
with
505 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
47 changes: 47 additions & 0 deletions
47
etl/steps/data/garden/hmd/2024-12-03/hmd_country.countries.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
{ | ||
"AUS": "Australia", | ||
"AUT": "Austria", | ||
"BEL": "Belgium", | ||
"BGR": "Bulgaria", | ||
"BLR": "Belarus", | ||
"CAN": "Canada", | ||
"CHE": "Switzerland", | ||
"CHL": "Chile", | ||
"CZE": "Czechia", | ||
"DNK": "Denmark", | ||
"ESP": "Spain", | ||
"EST": "Estonia", | ||
"FIN": "Finland", | ||
"GRC": "Greece", | ||
"HKG": "Hong Kong", | ||
"HRV": "Croatia", | ||
"HUN": "Hungary", | ||
"IRL": "Ireland", | ||
"ISL": "Iceland", | ||
"ISR": "Israel", | ||
"ITA": "Italy", | ||
"JPN": "Japan", | ||
"KOR": "South Korea", | ||
"LTU": "Lithuania", | ||
"LUX": "Luxembourg", | ||
"LVA": "Latvia", | ||
"NLD": "Netherlands", | ||
"NOR": "Norway", | ||
"POL": "Poland", | ||
"PRT": "Portugal", | ||
"RUS": "Russia", | ||
"SVK": "Slovakia", | ||
"SVN": "Slovenia", | ||
"SWE": "Sweden", | ||
"UKR": "Ukraine", | ||
"USA": "United States", | ||
"DEUTE": "East Germany", | ||
"DEUTNP": "Germany", | ||
"DEUTW": "West Germany", | ||
"FRATNP": "France", | ||
"GBRTENW": "England and Wales", | ||
"GBR_NIR": "Northern Ireland", | ||
"GBR_NP": "United Kingdom", | ||
"GBR_SCO": "Scotland", | ||
"NZL_NP": "New Zealand" | ||
} |
4 changes: 4 additions & 0 deletions
4
etl/steps/data/garden/hmd/2024-12-03/hmd_country.excluded_countries.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
[ | ||
"FRACNP", | ||
"GBRCENW" | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# NOTE: To learn more about the fields, hover over their names. | ||
definitions: | ||
common: | ||
presentation: | ||
topic_tags: | ||
- Fertility Rate | ||
|
||
# Learn more about the available fields: | ||
# http://docs.owid.io/projects/etl/architecture/metadata/reference/ | ||
dataset: | ||
title: Birth rate by month (HMD) | ||
update_period_days: 365 | ||
|
||
tables: | ||
birth_rate: | ||
variables: | ||
birth_rate: | ||
title: Birth rate (monthly) | ||
unit: births per 1,000 people | ||
description_short: |- | ||
The total number of births per 1,000 people in a given month. | ||
display: | ||
name: |- | ||
Birth rate | ||
birth_rate_per_day: | ||
title: Daily birth rate (average in month) | ||
unit: births per million people | ||
description_short: |- | ||
The average daily number of births, per million people, calculated monthly. | ||
display: | ||
name: |- | ||
Birth rate, per day | ||
birth_rate_month: | ||
variables: | ||
birth_rate: | ||
title: Birth rate (monthly) - << month >> | ||
unit: births per 1,000 people | ||
description_short: |- | ||
The total number of births per 1,000 people in <<month>>. | ||
display: | ||
name: |- | ||
Birth rate | ||
birth_rate_per_day: | ||
title: Daily birth rate (average in month) - << month >> | ||
unit: births per million people | ||
description_short: |- | ||
The average daily number of births, per million people, calculated for <<month>>. | ||
display: | ||
name: |- | ||
Birth rate, per day | ||
birth_rate_month_max: | ||
variables: | ||
month_max: | ||
title: Month ordinal with the peak daily birth rate | ||
unit: "" | ||
description_short: |- | ||
Number corresponding to the month with the highest daily birth rate. | ||
month_max_name: | ||
title: Month name with the peak daily birth rate | ||
unit: "" | ||
description_short: |- | ||
Month with the highest daily birth rate. | ||
birth_rate_per_day_max: | ||
title: Peak daily birth rate | ||
unit: births per million people | ||
description_short: |- | ||
The highest average daily number of births, per million people, recorded in the given year. | ||
display: | ||
name: |- | ||
Maximum birth rate, per day |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
"""Load a meadow dataset and create a garden dataset.""" | ||
|
||
import calendar | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from etl.data_helpers import geo | ||
from etl.data_helpers.misc import interpolate_table | ||
from etl.helpers import PathFinder, create_dataset | ||
|
||
# Get paths and naming conventions for current step. | ||
paths = PathFinder(__file__) | ||
|
||
|
||
def run(dest_dir: str) -> None:
    """Build the garden dataset with monthly birth rates.

    Reads the `monthly` table from the `hmd_country` dataset and the `population`
    table from the `hmd` dataset, derives the three output tables and saves them
    as a new dataset under `dest_dir`.
    """
    #
    # Load inputs.
    #
    # Monthly births come from `hmd_country`; population comes from `hmd`.
    ds_meadow = paths.load_dataset("hmd_country")
    ds_population = paths.load_dataset("hmd")

    tb_births = ds_meadow.read("monthly")
    tb_population = ds_population.read("population")

    #
    # Process data.
    #
    tb_long, tb_dimensions, tb_peak = make_monthly_tables(tb_births, tb_population)

    # Each output table is formatted with its own index columns and short name.
    output_specs = (
        (tb_long, ["country", "date"], "birth_rate"),
        (tb_dimensions, ["country", "year", "month"], "birth_rate_month"),
        (tb_peak, ["country", "year"], "birth_rate_month_max"),
    )
    tables = [table.format(keys, short_name=name) for table, keys, name in output_specs]

    #
    # Save outputs.
    #
    # The new dataset takes its default metadata from the `hmd_country` dataset.
    ds_garden = create_dataset(
        dest_dir,
        tables=tables,
        check_variables_metadata=True,
        default_metadata=ds_meadow.metadata,
    )
    ds_garden.save()
|
||
|
||
def make_monthly_tables(tb, tb_pop):
    """Build the three monthly birth-rate tables.

    Parameters
    ----------
    tb
        Monthly births table. Must contain `country`, `year`, `month` and `births`;
        `month` may include the non-numeric categories "TOT" and "UNK", which are dropped.
    tb_pop
        Population table, passed on to `add_population_column`.

    Returns
    -------
    tuple
        `(tb_long, tb_dimensions, tb_month_max)`:
        - `tb_long`: one row per country and date, with both rates.
        - `tb_dimensions`: one row per country, year and month (month as its English name).
        - `tb_month_max`: one row per country and year, with the number and name of the
          month that has the highest daily birth rate, and that rate.
    """
    ## Discard unknown/total values. Copy so the assignments below act on an
    ## independent table instead of a slice of the input (avoids SettingWithCopyWarning).
    tb = tb.loc[~tb["month"].isin(["TOT", "UNK"])].copy()
    tb["month"] = tb["month"].astype(int)
    ## Create date column. TODO: check what day of the month to assign
    tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=1))
    # Harmonize country names
    tb = geo.harmonize_countries(
        df=tb,
        countries_file=paths.country_mapping_path,
        excluded_countries_file=paths.excluded_countries_path,
        warn_on_unknown_excluded_countries=False,
    )

    # Add population to monthly birth data table
    tb = add_population_column(tb, tb_pop)

    # Estimate metrics
    tb = estimate_metrics(tb)

    # Sort rows
    tb = tb.sort_values(["country", "date"])

    # Classic time-series, with date-values
    tb_long = tb[["country", "date", "birth_rate", "birth_rate_per_day"]]

    # Month as a dimension. Copy before overwriting `month` so `tb` keeps the numeric
    # month, which is still needed below.
    tb_dimensions = tb[["country", "year", "month", "birth_rate", "birth_rate_per_day"]].copy()
    tb_dimensions["month"] = tb_dimensions["month"].apply(lambda x: calendar.month_name[x])

    # For each year, ID of the month with highest birth rate per day.
    # `idxmax` returns index labels, which are then used to pick the matching rows.
    tb_month_max = tb.loc[
        tb.groupby(["country", "year"])["birth_rate_per_day"].idxmax(),
        ["country", "year", "month", "birth_rate_per_day"],
    ].rename(columns={"month": "month_max", "birth_rate_per_day": "birth_rate_per_day_max"})
    tb_month_max["month_max_name"] = tb_month_max["month_max"].apply(lambda x: calendar.month_name[x])

    return tb_long, tb_dimensions, tb_month_max
|
||
|
||
def clean_table(tb):
    """Filter rows, harmonize country names, add date column.

    Rows whose `month` is "TOT" or "UNK" are dropped, `month` is cast to int, a `date`
    column (first day of each month) is added, and country names are harmonized with
    this step's country mapping and exclusion files.

    Returns the cleaned table; the input table is not modified.
    """
    # Filter unwanted month categories, set dtype. Copy so the assignments below act on
    # an independent table instead of a slice of the input.
    tb = tb.loc[~tb["month"].isin(["TOT", "UNK"])].copy()
    tb["month"] = tb["month"].astype(int)
    ## Create date column. TODO: check what day of the month to assign
    tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=1))
    # Harmonize country names, using the same mapping file as `make_monthly_tables`.
    tb = geo.harmonize_countries(
        df=tb,
        countries_file=paths.country_mapping_path,
        excluded_countries_file=paths.excluded_countries_path,
        warn_on_unknown_excluded_countries=False,
    )

    return tb
|
||
|
||
def add_population_column(tb, tb_pop):
    """Attach a `population` column to the monthly table.

    The population table is first reduced to one value per country and date, then
    left-joined onto `tb`. Dates without a population value are filled by
    `interpolate_table` (intended to give monthly population estimates).
    """
    # One population value per country and date
    population = _prepare_population_table(tb_pop)

    # Left join keeps every row of the main table; rows without a match get a missing population
    merged = tb.merge(population, on=["country", "date"], how="left")
    merged = merged.sort_values(["country", "date"])

    # Interpolate to get monthly population estimates
    population_filled = interpolate_table(
        merged[["country", "date", "population"]],
        entity_col="country",
        time_col="date",
        time_mode="none",
    )

    # Swap the sparse population column for the interpolated one
    merged = merged.drop(columns="population")
    return merged.merge(population_filled, on=["country", "date"], how="left")
|
||
|
||
def _prepare_population_table(tb): | ||
"""Prepare population table to merge with main table. | ||
Original table is given in years, but we need it in days! We use linear interpolation for that. | ||
""" | ||
tb_aux = tb.loc[(tb["sex"] == "total") & ~(tb["age"].str.contains("-")), ["country", "year", "population"]] | ||
tb_aux = tb_aux.groupby(["country", "year"], as_index=False)["population"].sum() | ||
## Assign a day to population. TODO: Check if this is true | ||
tb_aux["date"] = pd.to_datetime(tb_aux["year"].astype(str) + "-01-01") | ||
tb_aux = tb_aux.drop(columns="year") | ||
|
||
return tb_aux | ||
|
||
|
||
def estimate_metrics(tb):
    """Estimate metrics: birth rate and birth rate per day.

    Parameters
    ----------
    tb
        Table with columns `year`, `month` (integer month number), `births` and `population`.

    Returns
    -------
    A new table (the input is left untouched) with three extra columns:
    - `days_in_month`: number of days of the given month and year.
    - `birth_rate`: births per 1,000 people in the month.
    - `birth_rate_per_day`: `birth_rate` divided by the days in the month and multiplied
      by 1,000 again, i.e. births per million people per day.
    Rows where either rate is infinite are dropped.

    Raises
    ------
    AssertionError
        If any rate is missing before infinite values are handled.
    """
    rate_columns = ["birth_rate", "birth_rate_per_day"]

    # Work on a copy so the caller's table does not gain columns as a side effect.
    tb = tb.copy()

    # Get days in month (vectorized; handles leap years)
    tb["days_in_month"] = pd.to_datetime(tb[["year", "month"]].assign(day=1)).dt.days_in_month

    # Estimate rates
    tb["birth_rate"] = tb["births"] / tb["population"] * 1_000
    # NOTE(review): the second factor of 1,000 makes this a rate per million people, not
    # per 1,000; make sure the unit declared in the step's metadata agrees.
    tb["birth_rate_per_day"] = tb["birth_rate"] / tb["days_in_month"] * 1_000

    # Check: missing rates are not expected at this point (infinite values are handled below)
    assert tb[rate_columns].notna().all().all(), "Found missing values in estimated birth rates"

    # Replace INF values with NAs
    tb[rate_columns] = tb[rate_columns].replace([np.inf, -np.inf], pd.NA)
    # Drop NAs
    tb = tb.dropna(subset=rate_columns)

    return tb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
"""Load a garden dataset and create a grapher dataset.""" | ||
|
||
from etl.helpers import PathFinder, create_dataset | ||
|
||
# Get paths and naming conventions for current step. | ||
paths = PathFinder(__file__) | ||
|
||
|
||
def run(dest_dir: str) -> None:
    """Create the grapher dataset from all tables of the `hmd_country` dataset.

    Every table found in the loaded dataset is passed on unchanged; the new dataset
    takes its default metadata from the loaded one and is saved under `dest_dir`.
    """
    #
    # Load inputs.
    #
    garden_dataset = paths.load_dataset("hmd_country")

    #
    # Save outputs.
    #
    # Iterating the dataset yields its tables; all of them are forwarded as they are.
    grapher_dataset = create_dataset(
        dest_dir,
        tables=list(garden_dataset),
        check_variables_metadata=True,
        default_metadata=garden_dataset.metadata,
    )
    grapher_dataset.save()
Oops, something went wrong.