Skip to content

Commit

Permalink
🔨 Use groupby(..., observed=True) by default (#3226)
Browse files Browse the repository at this point in the history
* 🔨 Make observed=True default
  • Loading branch information
Marigold authored Sep 3, 2024
1 parent ddbbb8b commit 48f2fa0
Show file tree
Hide file tree
Showing 12 changed files with 36 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def add_world(tb: Table, ds_regions: Dataset) -> Table:
numeric_cols = [col for col in df_regions.columns if col not in ["country", "year", "field"]]

# Group the filtered data by "year" and "field" and aggregate the data based on the defined rules
result = df_regions.groupby(["year", "field"])[numeric_cols].agg(sum_with_nan).reset_index()
result = df_regions.groupby(["year", "field"], observed=False)[numeric_cols].agg(sum_with_nan).reset_index()

# Assign the aggregated data to a new country named "World"
result["country"] = "World"
Expand Down
2 changes: 1 addition & 1 deletion etl/steps/data/garden/demography/2023-10-04/gini_le.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def run(dest_dir: str) -> None:
tb = tb.sort_values(["location", "year", "sex", "age"])

# Estimates
tb = tb.groupby(["location", "year", "sex"], as_index=False).apply(gini_from_mx)
tb = tb.groupby(["location", "year", "sex"], as_index=False, observed=False).apply(gini_from_mx)
tb.life_expectancy_gini.m.origins = origins

# Rename columns
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def run(dest_dir: str) -> None:

# Format dataframe
log.info("avian: format dataframe")
tb = tb.groupby(["date", "country", "indicator"], as_index=False).size()
tb = tb.groupby(["date", "country", "indicator"], as_index=False, observed=False).size()
tb = tb.pivot(index=["date", "country"], columns="indicator", values="size").reset_index()

# Add regions
Expand Down Expand Up @@ -88,7 +88,7 @@ def add_regions(tb: Table, ds_regions: Dataset) -> Table:
# Add region
tb_region = tb[tb["country"].isin(countries_in_region)].copy()
tb_region["country"] = region
tb_region = tb_region.groupby(["date", "country"], as_index=False)["avian_cases"].sum()
tb_region = tb_region.groupby(["date", "country"], as_index=False, observed=False)["avian_cases"].sum()

# Combine
tb = pr.concat([tb, tb_region], ignore_index=True)
Expand All @@ -102,7 +102,7 @@ def add_world(tb: Table) -> Table:
tb_world = tb[~tb["country"].isin(REGIONS)].copy()

# Aggregate
tb_world = tb_world.groupby("date", as_index=False)["avian_cases"].sum()
tb_world = tb_world.groupby("date", as_index=False, observed=False)["avian_cases"].sum()
tb_world["country"] = "World"

# Combine
Expand Down
2 changes: 1 addition & 1 deletion etl/steps/data/garden/news/2024-05-08/guardian_mentions.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def run(dest_dir: str) -> None:
columns={col: f"{col}_10y_avg" for col in tb_10y_avg.columns if col not in COLUMN_INDEX}
)
columns_indicators = [col for col in tb_10y_avg.columns if col not in COLUMN_INDEX]
tb_10y_avg = tb_10y_avg.groupby("country")[columns_indicators].mean().reset_index()
tb_10y_avg = tb_10y_avg.groupby("country", observed=False)[columns_indicators].mean().reset_index()
tb_10y_avg["year"] = YEAR_DEC_MAX

# Estimate log(10-year average)
Expand Down
4 changes: 2 additions & 2 deletions etl/steps/data/garden/oecd/2023-09-21/plastic_fate_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ def run(dest_dir: str) -> None:
tb = tb.drop(columns=["country"])
tb = tb.rename(columns={"region": "country"})
# Ensure the regions with the same country name are summed
tb = tb.groupby(["year", "plastic_fate", "country"])["value"].sum().reset_index()
tb = tb.groupby(["year", "plastic_fate", "country"], observed=False)["value"].sum().reset_index()
# Add the metadata back to the table
tb.metadata = metadata
# Calculate the global totals
total_df = tb.groupby(["year", "plastic_fate"])["value"].sum()
total_df = tb.groupby(["year", "plastic_fate"], observed=False)["value"].sum()
total_df = total_df.reset_index()

total_df["country"] = "World"
Expand Down
7 changes: 3 additions & 4 deletions etl/steps/data/garden/papers/2023-10-20/anthromes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Load a meadow dataset and create a garden dataset."""


import numpy as np
import owid.catalog.processing as pr
from owid.catalog import Table
Expand Down Expand Up @@ -183,12 +182,12 @@ def calculate_share_of_land_type(tb: Table, land_areas: Table) -> Table:


def calculate_area_of_each_land_type(tb: Table) -> Table:
tb_global = tb.groupby(["year", "value"])["land_ar"].sum()
tb_global = tb.groupby(["year", "value"], observed=False)["land_ar"].sum()
tb_global = tb_global.reset_index()
tb_global["regn_nm"] = "World"
tb_global["land_ar"] = tb_global["land_ar"].replace(np.nan, 0)

tb_regional = tb.groupby(["year", "value", "regn_nm"])["land_ar"].sum()
tb_regional = tb.groupby(["year", "value", "regn_nm"], observed=False)["land_ar"].sum()
tb_regional = tb_regional.reset_index()
tb_regional["land_ar"] = tb_regional["land_ar"].replace(np.nan, 0)

Expand All @@ -203,7 +202,7 @@ def calculate_regional_land_areas(tb: Table) -> Table:
"""
# Calculate land area for each region
land_areas = tb.groupby("regn_nm").sum().drop(columns="id").reset_index()
land_areas = tb.groupby("regn_nm", observed=False).sum().drop(columns="id").reset_index()
# Add a row for 'world' with the sum of 'land_ar'
world_row = Table({"regn_nm": ["World"], "land_ar": [land_areas["land_ar"].sum()]})
# Concatenate the 'world_row' DataFrame with the original DataFrame
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,9 @@ def run(dest_dir: str) -> None:
tb["country"].dtype.name == "category"
), "The 'country' column must be of type 'category for subsequent aggregations to be correct'"

total_df["total_killed"] = tb.groupby(["country", "year"])["nkill"].sum()
total_df["total_wounded"] = tb.groupby(["country", "year"])["nwound"].sum()
total_df["total_incident_counts"] = tb.groupby(["country", "year"]).size()
total_df["total_killed"] = tb.groupby(["country", "year"], observed=False)["nkill"].sum()
total_df["total_wounded"] = tb.groupby(["country", "year"], observed=False)["nwound"].sum()
total_df["total_incident_counts"] = tb.groupby(["country", "year"], observed=False).size()

# Add GTD regions to number of deaths, attacks and wounded
total_df = add_regions(tb, total_df)
Expand Down Expand Up @@ -349,7 +349,7 @@ def add_regions(df, total_df):
of killed, wounded, and incidents.
"""
grouped_regions_df = df.groupby(["region_txt", "year"])
grouped_regions_df = df.groupby(["region_txt", "year"], observed=False)
summary_regions_df = pd.DataFrame()

for column in ["nkill", "nwound"]:
Expand Down Expand Up @@ -384,9 +384,9 @@ def generate_summary_dataframe(df, group_column, target_columns):
summaries, and the index name "region_txt" is renamed to "country."
"""
if group_column != "region_txt":
grouped_df = df.groupby(["country", "year", group_column])
grouped_df = df.groupby(["country", "year", group_column], observed=False)
else:
grouped_df = df.groupby(["year", group_column])
grouped_df = df.groupby(["year", group_column], observed=False)

summary_df = pd.DataFrame()

Expand All @@ -395,7 +395,7 @@ def generate_summary_dataframe(df, group_column, target_columns):

summary_df["total_incident_counts"] = grouped_df.size()
if group_column != "region_txt":
grouped_regions_df = df.groupby(["region_txt", "year", group_column])
grouped_regions_df = df.groupby(["region_txt", "year", group_column], observed=False)
summary_regions_df = pd.DataFrame()

for column in target_columns:
Expand Down Expand Up @@ -456,10 +456,14 @@ def severity(tb):
"severity," and "total_incident_severity."
"""
total_severity_country = pd.DataFrame()
total_severity_country["total_incident_severity"] = tb.groupby(["country", "year", "severity"]).size()
total_severity_country["total_incident_severity"] = tb.groupby(
["country", "year", "severity"], observed=False
).size()

total_severity_regions = pd.DataFrame()
total_severity_regions["total_incident_severity"] = tb.groupby(["region_txt", "year", "severity"]).size()
total_severity_regions["total_incident_severity"] = tb.groupby(
["region_txt", "year", "severity"], observed=False
).size()

total_severity_regions = total_severity_regions.rename_axis(index={"region_txt": "country"})
merge_GTD_regions = pd.concat([total_severity_country, total_severity_regions])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def combining_sexes_for_all_age_groups(tb: Table) -> Table:
msk = tb["age_group"].isin(age_groups_with_both_sexes)
tb_age = tb[~msk]
tb_gr = (
tb_age.groupby(["country", "year", "age_group", "risk_factor"], dropna=False)[
tb_age.groupby(["country", "year", "age_group", "risk_factor"], dropna=False, observed=False)[
["best", "lo", "hi", "population"]
]
.sum()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def combining_sexes_for_all_age_groups(tb: Table) -> Table:
msk = tb["age_group"].isin(age_groups_with_both_sexes)
tb_age = tb[~msk]
tb_gr = (
tb_age.groupby(["country", "year", "age_group", "cohort_type"], dropna=False)[
tb_age.groupby(["country", "year", "age_group", "cohort_type"], dropna=False, observed=False)[
["coh", "succ", "fail", "died", "lost", "neval"]
]
.sum()
Expand All @@ -91,7 +91,7 @@ def add_region_sum_aggregates(tb: Table, ds_regions: Dataset, ds_income_groups:
Calculate region aggregates for all for each combination of age-group, sex and risk factor in the dataset.
"""
# Create the groups we want to aggregate over.
tb_gr = tb.groupby(["year", "age_group", "sex", "cohort_type"])
tb_gr = tb.groupby(["year", "age_group", "sex", "cohort_type"], observed=False)
tb_gr_out = Table()
for gr_name, gr in tb_gr:
for region in REGIONS_TO_ADD:
Expand Down
6 changes: 3 additions & 3 deletions etl/steps/data/garden/un/2024-03-14/un_wpp_most.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def run(dest_dir: str) -> None:
if age_group == 10:
tb_pop_filter = create_ten_year_age_groups(tb_pop)
# Group by country and year, and apply the custom function
tb_pop_filter = tb_pop_filter.groupby(["location", "year"]).apply(get_largest_age_group)
tb_pop_filter = tb_pop_filter.groupby(["location", "year"], observed=False).apply(get_largest_age_group)
# The function above creates NAs for some locations that don't appear to be in the table e.g. Vatican, Melanesia, so dropping here
tb_pop_filter = tb_pop_filter.dropna()
tb_pop_filter = tb_pop_filter.reset_index(drop=True)
Expand Down Expand Up @@ -55,11 +55,11 @@ def create_ten_year_age_groups(tb: Table) -> Table:

# Create the 0-9 and 10-19 age groups
tb_0_9 = tb[(tb.age == "0-4") | (tb.age == "5-9")]
tb_0_9 = tb_0_9.groupby(["location", "year"])["value"].sum().reset_index()
tb_0_9 = tb_0_9.groupby(["location", "year"], observed=False)["value"].sum().reset_index()
tb_0_9["age"] = "0-9"

tb_10_19 = tb[(tb.age == "10-14") | (tb.age == "15-19")]
tb_10_19 = tb_10_19.groupby(["location", "year"])["value"].sum().reset_index()
tb_10_19 = tb_10_19.groupby(["location", "year"], observed=False)["value"].sum().reset_index()
tb_10_19["age"] = "10-19"
# Drop the 0-4, 5-9, 10-14 and 15-19 age groups
tb = tb[(tb.age != "0-4") & (tb.age != "5-9") & (tb.age != "10-14") & (tb.age != "15-19")]
Expand Down
8 changes: 5 additions & 3 deletions lib/catalog/owid/catalog/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1000,9 +1000,11 @@ def __ipow__(self, other: Union[Scalar, Series, variables.Variable, "Table"]) ->
def sort_index(self, *args, **kwargs) -> "Table":
return super().sort_index(*args, **kwargs) # type: ignore

def groupby(self, *args, **kwargs) -> "TableGroupBy":
"""Groupby that preserves metadata."""
return TableGroupBy(pd.DataFrame.groupby(self.copy(deep=False), *args, **kwargs), self.metadata, self._fields)
def groupby(self, *args, observed=True, **kwargs) -> "TableGroupBy":
"""Groupby that preserves metadata. It uses observed=True by default."""
return TableGroupBy(
pd.DataFrame.groupby(self.copy(deep=False), *args, observed=observed, **kwargs), self.metadata, self._fields
)

def rolling(self, *args, **kwargs) -> "TableRolling":
"""Rolling operation that preserves metadata."""
Expand Down
2 changes: 1 addition & 1 deletion lib/catalog/tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1085,7 +1085,7 @@ def test_groupby_iteration(table_1) -> None:
def test_groupby_observed_default(table_1) -> None:
table_1 = table_1.astype({"a": "category"}).query("a != 3")
gt = table_1.groupby("a").min()
assert len(gt) == 3
assert len(gt) == 2


def test_groupby_levels(table_1) -> None:
Expand Down

0 comments on commit 48f2fa0

Please sign in to comment.