From 48f2fa063159bc2bbaf1159106ca991d0d5130ae Mon Sep 17 00:00:00 2001
From: Mojmir Vinkler
Date: Tue, 3 Sep 2024 21:16:54 +0200
Subject: [PATCH] :hammer: Use `groupby(..., observed=True)` by default
 (#3226)

* :hammer: Make observed=True default

---
 .../2023-07-25/cset.py                        |  2 +-
 .../garden/demography/2023-10-04/gini_le.py   |  2 +-
 .../avian_influenza_h5n1_kucharski.py         |  6 +++---
 .../news/2024-05-08/guardian_mentions.py      |  2 +-
 .../oecd/2023-09-21/plastic_fate_regions.py   |  4 ++--
 .../garden/papers/2023-10-20/anthromes.py     |  7 +++----
 .../2023-07-20/global_terrorism_database.py   | 22 +++++++++++++---------
 .../2023-11-27/burden_disaggregated.py        |  2 +-
 .../2023-11-27/outcomes_disagg.py             |  4 ++--
 .../data/garden/un/2024-03-14/un_wpp_most.py  |  6 +++---
 lib/catalog/owid/catalog/tables.py            |  8 +++++---
 lib/catalog/tests/test_tables.py              |  2 +-
 12 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/etl/steps/data/garden/artificial_intelligence/2023-07-25/cset.py b/etl/steps/data/garden/artificial_intelligence/2023-07-25/cset.py
index 71809832767..31a5a989f1c 100644
--- a/etl/steps/data/garden/artificial_intelligence/2023-07-25/cset.py
+++ b/etl/steps/data/garden/artificial_intelligence/2023-07-25/cset.py
@@ -57,7 +57,7 @@ def add_world(tb: Table, ds_regions: Dataset) -> Table:
     numeric_cols = [col for col in df_regions.columns if col not in ["country", "year", "field"]]
 
     # Group the filtered data by "year" and "field" and aggregate the data based on the defined rules
-    result = df_regions.groupby(["year", "field"])[numeric_cols].agg(sum_with_nan).reset_index()
+    result = df_regions.groupby(["year", "field"], observed=False)[numeric_cols].agg(sum_with_nan).reset_index()
 
     # Assign the aggregated data to a new country named "World"
     result["country"] = "World"
diff --git a/etl/steps/data/garden/demography/2023-10-04/gini_le.py b/etl/steps/data/garden/demography/2023-10-04/gini_le.py
index 1b9e8199d29..5c184892e25 100644
--- a/etl/steps/data/garden/demography/2023-10-04/gini_le.py
+++ b/etl/steps/data/garden/demography/2023-10-04/gini_le.py
@@ -57,7 +57,7 @@ def run(dest_dir: str) -> None:
     tb = tb.sort_values(["location", "year", "sex", "age"])
 
     # Estimates
-    tb = tb.groupby(["location", "year", "sex"], as_index=False).apply(gini_from_mx)
+    tb = tb.groupby(["location", "year", "sex"], as_index=False, observed=False).apply(gini_from_mx)
     tb.life_expectancy_gini.m.origins = origins
 
     # Rename columns
diff --git a/etl/steps/data/garden/health/2023-08-14/avian_influenza_h5n1_kucharski.py b/etl/steps/data/garden/health/2023-08-14/avian_influenza_h5n1_kucharski.py
index 393ca5b4f0f..77a242da910 100644
--- a/etl/steps/data/garden/health/2023-08-14/avian_influenza_h5n1_kucharski.py
+++ b/etl/steps/data/garden/health/2023-08-14/avian_influenza_h5n1_kucharski.py
@@ -56,7 +56,7 @@ def run(dest_dir: str) -> None:
 
     # Format dataframe
     log.info("avian: format dataframe")
-    tb = tb.groupby(["date", "country", "indicator"], as_index=False).size()
+    tb = tb.groupby(["date", "country", "indicator"], as_index=False, observed=False).size()
     tb = tb.pivot(index=["date", "country"], columns="indicator", values="size").reset_index()
 
     # Add regions
@@ -88,7 +88,7 @@ def add_regions(tb: Table, ds_regions: Dataset) -> Table:
         # Add region
         tb_region = tb[tb["country"].isin(countries_in_region)].copy()
         tb_region["country"] = region
-        tb_region = tb_region.groupby(["date", "country"], as_index=False)["avian_cases"].sum()
+        tb_region = tb_region.groupby(["date", "country"], as_index=False, observed=False)["avian_cases"].sum()
 
         # Combine
         tb = pr.concat([tb, tb_region], ignore_index=True)
@@ -102,7 +102,7 @@ def add_world(tb: Table) -> Table:
     tb_world = tb[~tb["country"].isin(REGIONS)].copy()
 
     # Aggregate
-    tb_world = tb_world.groupby("date", as_index=False)["avian_cases"].sum()
+    tb_world = tb_world.groupby("date", as_index=False, observed=False)["avian_cases"].sum()
     tb_world["country"] = "World"
 
     # Combine
diff --git a/etl/steps/data/garden/news/2024-05-08/guardian_mentions.py b/etl/steps/data/garden/news/2024-05-08/guardian_mentions.py
index 82c81f04a28..249f9644caf 100644
--- a/etl/steps/data/garden/news/2024-05-08/guardian_mentions.py
+++ b/etl/steps/data/garden/news/2024-05-08/guardian_mentions.py
@@ -88,7 +88,7 @@ def run(dest_dir: str) -> None:
         columns={col: f"{col}_10y_avg" for col in tb_10y_avg.columns if col not in COLUMN_INDEX}
     )
     columns_indicators = [col for col in tb_10y_avg.columns if col not in COLUMN_INDEX]
-    tb_10y_avg = tb_10y_avg.groupby("country")[columns_indicators].mean().reset_index()
+    tb_10y_avg = tb_10y_avg.groupby("country", observed=False)[columns_indicators].mean().reset_index()
     tb_10y_avg["year"] = YEAR_DEC_MAX
 
     # Estimate log(10-year average)
diff --git a/etl/steps/data/garden/oecd/2023-09-21/plastic_fate_regions.py b/etl/steps/data/garden/oecd/2023-09-21/plastic_fate_regions.py
index 78c1195fc43..de02ae4471f 100644
--- a/etl/steps/data/garden/oecd/2023-09-21/plastic_fate_regions.py
+++ b/etl/steps/data/garden/oecd/2023-09-21/plastic_fate_regions.py
@@ -52,11 +52,11 @@ def run(dest_dir: str) -> None:
     tb = tb.drop(columns=["country"])
     tb = tb.rename(columns={"region": "country"})
     # Ensure the regions with the same country name are summed
-    tb = tb.groupby(["year", "plastic_fate", "country"])["value"].sum().reset_index()
+    tb = tb.groupby(["year", "plastic_fate", "country"], observed=False)["value"].sum().reset_index()
     # Add the metadata back to the table
     tb.metadata = metadata
 
     # Calculate the global totals
-    total_df = tb.groupby(["year", "plastic_fate"])["value"].sum()
+    total_df = tb.groupby(["year", "plastic_fate"], observed=False)["value"].sum()
     total_df = total_df.reset_index()
     total_df["country"] = "World"
diff --git a/etl/steps/data/garden/papers/2023-10-20/anthromes.py b/etl/steps/data/garden/papers/2023-10-20/anthromes.py
index 327369b5e67..94173117116 100644
--- a/etl/steps/data/garden/papers/2023-10-20/anthromes.py
+++ b/etl/steps/data/garden/papers/2023-10-20/anthromes.py
@@ -1,6 +1,5 @@
 """Load a meadow dataset and create a garden dataset."""
 
-
 import numpy as np
 import owid.catalog.processing as pr
 from owid.catalog import Table
@@ -183,12 +182,12 @@ def calculate_share_of_land_type(tb: Table, land_areas: Table) -> Table:
 
 
 def calculate_area_of_each_land_type(tb: Table) -> Table:
-    tb_global = tb.groupby(["year", "value"])["land_ar"].sum()
+    tb_global = tb.groupby(["year", "value"], observed=False)["land_ar"].sum()
     tb_global = tb_global.reset_index()
     tb_global["regn_nm"] = "World"
     tb_global["land_ar"] = tb_global["land_ar"].replace(np.nan, 0)
 
-    tb_regional = tb.groupby(["year", "value", "regn_nm"])["land_ar"].sum()
+    tb_regional = tb.groupby(["year", "value", "regn_nm"], observed=False)["land_ar"].sum()
     tb_regional = tb_regional.reset_index()
     tb_regional["land_ar"] = tb_regional["land_ar"].replace(np.nan, 0)
 
@@ -203,7 +202,7 @@ def calculate_regional_land_areas(tb: Table) -> Table:
     """
     # Calculate land area for each region
-    land_areas = tb.groupby("regn_nm").sum().drop(columns="id").reset_index()
+    land_areas = tb.groupby("regn_nm", observed=False).sum().drop(columns="id").reset_index()
 
     # Add a row for 'world' with the sum of 'land_ar'
     world_row = Table({"regn_nm": ["World"], "land_ar": [land_areas["land_ar"].sum()]})
     # Concatenate the 'world_row' DataFrame with the original DataFrame
diff --git a/etl/steps/data/garden/terrorism/2023-07-20/global_terrorism_database.py b/etl/steps/data/garden/terrorism/2023-07-20/global_terrorism_database.py
index 8d3832fb6cf..a95fdfde7ec 100644
--- a/etl/steps/data/garden/terrorism/2023-07-20/global_terrorism_database.py
+++ b/etl/steps/data/garden/terrorism/2023-07-20/global_terrorism_database.py
@@ -117,9 +117,9 @@ def run(dest_dir: str) -> None:
         tb["country"].dtype.name == "category"
     ), "The 'country' column must be of type 'category for subsequent aggregations to be correct'"
 
-    total_df["total_killed"] = tb.groupby(["country", "year"])["nkill"].sum()
-    total_df["total_wounded"] = tb.groupby(["country", "year"])["nwound"].sum()
-    total_df["total_incident_counts"] = tb.groupby(["country", "year"]).size()
+    total_df["total_killed"] = tb.groupby(["country", "year"], observed=False)["nkill"].sum()
+    total_df["total_wounded"] = tb.groupby(["country", "year"], observed=False)["nwound"].sum()
+    total_df["total_incident_counts"] = tb.groupby(["country", "year"], observed=False).size()
 
     # Add GTD regions to number of deaths, attacks and wounded
     total_df = add_regions(tb, total_df)
@@ -349,7 +349,7 @@ def add_regions(df, total_df):
     of killed, wounded, and incidents.
 
     """
-    grouped_regions_df = df.groupby(["region_txt", "year"])
+    grouped_regions_df = df.groupby(["region_txt", "year"], observed=False)
     summary_regions_df = pd.DataFrame()
 
     for column in ["nkill", "nwound"]:
@@ -384,9 +384,9 @@ def generate_summary_dataframe(df, group_column, target_columns):
     summaries, and the index name "region_txt" is renamed to "country."
     """
     if group_column != "region_txt":
-        grouped_df = df.groupby(["country", "year", group_column])
+        grouped_df = df.groupby(["country", "year", group_column], observed=False)
     else:
-        grouped_df = df.groupby(["year", group_column])
+        grouped_df = df.groupby(["year", group_column], observed=False)
 
     summary_df = pd.DataFrame()
 
@@ -395,7 +395,7 @@ def generate_summary_dataframe(df, group_column, target_columns):
     summary_df["total_incident_counts"] = grouped_df.size()
 
     if group_column != "region_txt":
-        grouped_regions_df = df.groupby(["region_txt", "year", group_column])
+        grouped_regions_df = df.groupby(["region_txt", "year", group_column], observed=False)
         summary_regions_df = pd.DataFrame()
 
         for column in target_columns:
@@ -456,10 +456,14 @@ def severity(tb):
     "severity," and "total_incident_severity."
     """
     total_severity_country = pd.DataFrame()
-    total_severity_country["total_incident_severity"] = tb.groupby(["country", "year", "severity"]).size()
+    total_severity_country["total_incident_severity"] = tb.groupby(
+        ["country", "year", "severity"], observed=False
+    ).size()
 
     total_severity_regions = pd.DataFrame()
-    total_severity_regions["total_incident_severity"] = tb.groupby(["region_txt", "year", "severity"]).size()
+    total_severity_regions["total_incident_severity"] = tb.groupby(
+        ["region_txt", "year", "severity"], observed=False
+    ).size()
     total_severity_regions = total_severity_regions.rename_axis(index={"region_txt": "country"})
 
     merge_GTD_regions = pd.concat([total_severity_country, total_severity_regions])
diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/burden_disaggregated.py b/etl/steps/data/garden/tuberculosis/2023-11-27/burden_disaggregated.py
index 38cb49bf2ea..366243373b6 100644
--- a/etl/steps/data/garden/tuberculosis/2023-11-27/burden_disaggregated.py
+++ b/etl/steps/data/garden/tuberculosis/2023-11-27/burden_disaggregated.py
@@ -95,7 +95,7 @@ def combining_sexes_for_all_age_groups(tb: Table) -> Table:
     msk = tb["age_group"].isin(age_groups_with_both_sexes)
     tb_age = tb[~msk]
     tb_gr = (
-        tb_age.groupby(["country", "year", "age_group", "risk_factor"], dropna=False)[
+        tb_age.groupby(["country", "year", "age_group", "risk_factor"], dropna=False, observed=False)[
             ["best", "lo", "hi", "population"]
         ]
         .sum()
diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/outcomes_disagg.py b/etl/steps/data/garden/tuberculosis/2023-11-27/outcomes_disagg.py
index f233d498e01..e625fd4306a 100644
--- a/etl/steps/data/garden/tuberculosis/2023-11-27/outcomes_disagg.py
+++ b/etl/steps/data/garden/tuberculosis/2023-11-27/outcomes_disagg.py
@@ -72,7 +72,7 @@ def combining_sexes_for_all_age_groups(tb: Table) -> Table:
     msk = tb["age_group"].isin(age_groups_with_both_sexes)
     tb_age = tb[~msk]
     tb_gr = (
-        tb_age.groupby(["country", "year", "age_group", "cohort_type"], dropna=False)[
+        tb_age.groupby(["country", "year", "age_group", "cohort_type"], dropna=False, observed=False)[
            ["coh", "succ", "fail", "died", "lost", "neval"]
         ]
         .sum()
@@ -91,7 +91,7 @@ def add_region_sum_aggregates(tb: Table, ds_regions: Dataset, ds_income_groups:
     Calculate region aggregates for all for each combination of age-group, sex and risk factor in the dataset.
     """
     # Create the groups we want to aggregate over.
-    tb_gr = tb.groupby(["year", "age_group", "sex", "cohort_type"])
+    tb_gr = tb.groupby(["year", "age_group", "sex", "cohort_type"], observed=False)
     tb_gr_out = Table()
     for gr_name, gr in tb_gr:
         for region in REGIONS_TO_ADD:
diff --git a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py
index 2e9de35f978..c067d01d25e 100644
--- a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py
+++ b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py
@@ -23,7 +23,7 @@ def run(dest_dir: str) -> None:
         if age_group == 10:
             tb_pop_filter = create_ten_year_age_groups(tb_pop)
         # Group by country and year, and apply the custom function
-        tb_pop_filter = tb_pop_filter.groupby(["location", "year"]).apply(get_largest_age_group)
+        tb_pop_filter = tb_pop_filter.groupby(["location", "year"], observed=False).apply(get_largest_age_group)
         # The function above creates NAs for some locations that don't appear to be in the table e.g. Vatican, Melanesia, so dropping here
         tb_pop_filter = tb_pop_filter.dropna()
         tb_pop_filter = tb_pop_filter.reset_index(drop=True)
@@ -55,11 +55,11 @@ def create_ten_year_age_groups(tb: Table) -> Table:
 
     # Create the 0-9 and 10-19 age groups
     tb_0_9 = tb[(tb.age == "0-4") | (tb.age == "5-9")]
-    tb_0_9 = tb_0_9.groupby(["location", "year"])["value"].sum().reset_index()
+    tb_0_9 = tb_0_9.groupby(["location", "year"], observed=False)["value"].sum().reset_index()
     tb_0_9["age"] = "0-9"
 
     tb_10_19 = tb[(tb.age == "10-14") | (tb.age == "15-19")]
-    tb_10_19 = tb_10_19.groupby(["location", "year"])["value"].sum().reset_index()
+    tb_10_19 = tb_10_19.groupby(["location", "year"], observed=False)["value"].sum().reset_index()
     tb_10_19["age"] = "10-19"
     # Drop the 0-4, 5-9, 10-14 and 15-19 age groups
     tb = tb[(tb.age != "0-4") & (tb.age != "5-9") & (tb.age != "10-14") & (tb.age != "15-19")]
diff --git a/lib/catalog/owid/catalog/tables.py b/lib/catalog/owid/catalog/tables.py
index 404e3fc904a..d4a1eae52b1 100644
--- a/lib/catalog/owid/catalog/tables.py
+++ b/lib/catalog/owid/catalog/tables.py
@@ -1000,9 +1000,11 @@ def __ipow__(self, other: Union[Scalar, Series, variables.Variable, "Table"]) ->
     def sort_index(self, *args, **kwargs) -> "Table":
         return super().sort_index(*args, **kwargs)  # type: ignore
 
-    def groupby(self, *args, **kwargs) -> "TableGroupBy":
-        """Groupby that preserves metadata."""
-        return TableGroupBy(pd.DataFrame.groupby(self.copy(deep=False), *args, **kwargs), self.metadata, self._fields)
+    def groupby(self, *args, observed=True, **kwargs) -> "TableGroupBy":
+        """Groupby that preserves metadata. It uses observed=True by default."""
+        return TableGroupBy(
+            pd.DataFrame.groupby(self.copy(deep=False), *args, observed=observed, **kwargs), self.metadata, self._fields
+        )
 
     def rolling(self, *args, **kwargs) -> "TableRolling":
         """Rolling operation that preserves metadata."""
diff --git a/lib/catalog/tests/test_tables.py b/lib/catalog/tests/test_tables.py
index 3c3f20173ed..b6e5f265ab2 100644
--- a/lib/catalog/tests/test_tables.py
+++ b/lib/catalog/tests/test_tables.py
@@ -1085,7 +1085,7 @@ def test_groupby_iteration(table_1) -> None:
 def test_groupby_observed_default(table_1) -> None:
     table_1 = table_1.astype({"a": "category"}).query("a != 3")
     gt = table_1.groupby("a").min()
-    assert len(gt) == 3
+    assert len(gt) == 2
 
 
 def test_groupby_levels(table_1) -> None:
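Note for reviewers: below is a minimal, illustrative sketch of the semantics this patch flips. It uses plain pandas with a made-up frame; since Table.groupby forwards to pd.DataFrame.groupby (see the tables.py hunk above), the behavior carries over.

    import pandas as pd

    # "a" is categorical and declares a category (3) that never occurs in the
    # data, mirroring test_groupby_observed_default above.
    df = pd.DataFrame({"a": pd.Categorical([1, 2], categories=[1, 2, 3]), "b": [10, 20]})

    # observed=False keeps every declared category, emitting a row for the empty
    # group 3 (its sum is 0). This was the old default, now passed explicitly at
    # the ETL call sites above to preserve their existing output.
    print(len(df.groupby("a", observed=False)["b"].sum()))  # 3

    # observed=True keeps only the categories actually present in the data; it is
    # the new Table.groupby default, hence the updated assertion (3 -> 2) in the test.
    print(len(df.groupby("a", observed=True)["b"].sum()))  # 2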