From 1df562a17187fe795c5203ec53a6458c0e31f152 Mon Sep 17 00:00:00 2001
From: Mojmir Vinkler
Date: Thu, 21 Nov 2024 10:10:11 +0100
Subject: [PATCH] :hammer: Changes for compatibility with pandas 3.0 (#3550)

:sparkles: Changes for compatibility with pandas 3.0

---------

Co-authored-by: Joris Van den Bossche
---
 lib/datautils/owid/datautils/dataframes.py | 17 +++++++++++-
 lib/datautils/tests/test_dataframes.py     |  5 ++--
 lib/repack/tests/test_repack.py            |  2 +-
 tests/data_helpers/test_geo.py             | 31 +++++++++++-----------
 4 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/lib/datautils/owid/datautils/dataframes.py b/lib/datautils/owid/datautils/dataframes.py
index 8eac2d987a2..195f6712336 100644
--- a/lib/datautils/owid/datautils/dataframes.py
+++ b/lib/datautils/owid/datautils/dataframes.py
@@ -108,7 +108,7 @@ def compare(
     # Compare, column by column, the elements of the two dataframes.
     compared = pd.DataFrame()
     for col in columns:
-        if (df1[col].dtype in (object, "category")) or (df2[col].dtype in (object, "category")):
+        if (df1[col].dtype in (object, "category", "string")) or (df2[col].dtype in (object, "category", "string")):
            # Apply a direct comparison for strings or categories
            compared_row = df1[col].values == df2[col].values
         else:
@@ -461,6 +461,13 @@ def map_series(
     # Replace those nans with their original values, except if they were actually meant to be mapped to nan.
     # For example, if {"bad_value": np.nan} was part of the mapping, do not replace those nans back to "bad_value".
 
+    # if we are setting values from the original series, ensure we have the same dtype
+    try:
+        series_mapped = series_mapped.astype(series.dtype, copy=False)
+    except ValueError:
+        # casting NaNs to integer will fail
+        pass
+
     # Detect values in the mapping that were intended to be mapped to nan.
     values_mapped_to_nan = [
         original_value for original_value, target_value in mapping.items() if pd.isnull(target_value)
@@ -632,6 +639,14 @@ def combine_two_overlapping_dataframes(
     # Give priority to df1 on overlapping values.
     combined, df2 = df1.align(df2)
 
+    new_columns = df2.columns.difference(df1.columns)
+    for col in new_columns:
+        try:
+            combined[col] = combined[col].astype(df2[col].dtype, copy=False)
+        except ValueError:
+            # casting NaNs to integer will fail
+            pass
+
     # Fill missing values in df1 with values from df2.
     combined = combined.fillna(df2)
 
diff --git a/lib/datautils/tests/test_dataframes.py b/lib/datautils/tests/test_dataframes.py
index ebb87011d49..d6b81a51c7b 100644
--- a/lib/datautils/tests/test_dataframes.py
+++ b/lib/datautils/tests/test_dataframes.py
@@ -326,13 +326,14 @@ def test_default_aggregate_with_some_nans_ignored_different_types_and_more_nans(
             }
         ).set_index("year")
         df_out["value_03"] = df_out["value_03"].astype(object)
-        assert dataframes.groupby_agg(
+        result = dataframes.groupby_agg(
             df_in,
             ["year"],
             aggregations=None,
             num_allowed_nans=None,
             frac_allowed_nans=None,
-        ).equals(df_out)
+        )
+        assert result.equals(df_out)
 
     def test_default_aggregate_with_num_allowed_nans_zero(self):
         df_in = pd.DataFrame(
diff --git a/lib/repack/tests/test_repack.py b/lib/repack/tests/test_repack.py
index c5f57f48d8d..54a29d24ee4 100644
--- a/lib/repack/tests/test_repack.py
+++ b/lib/repack/tests/test_repack.py
@@ -197,7 +197,7 @@ def test_repack_object_with_nan_string():
 
 def test_repack_category():
     s = pd.Series(["a", "b", "c", None])
-    assert s.dtype == np.object_
+    assert s.dtype == np.object_ or s.dtype == "str"
     v = repack.repack_series(s)
     assert v.dtype == "category"
 
diff --git a/tests/data_helpers/test_geo.py b/tests/data_helpers/test_geo.py
index 637cd6e6d38..813cec00e7e 100644
--- a/tests/data_helpers/test_geo.py
+++ b/tests/data_helpers/test_geo.py
@@ -189,18 +189,19 @@ def test_one_country_unchanged_and_another_unknown(self):
 
     def test_two_unknown_countries_made_nan(self):
         df_in = pd.DataFrame({"country": ["Country 1", "country_04"], "some_variable": [1, 2]})
-        df_out = pd.DataFrame({"country": [np.nan, np.nan], "some_variable": [1, 2]})
-        df_out["country"] = df_out["country"].astype(object)
-        assert dataframes.are_equal(
-            df1=df_out,
-            df2=geo.harmonize_countries(
-                df=df_in,
-                countries_file="MOCK_COUNTRIES_FILE",
-                make_missing_countries_nan=True,
-                warn_on_unused_countries=False,
-                warn_on_missing_countries=False,
-            ),
-        )[0]
+        df_out = pd.DataFrame({"country": [pd.NA, pd.NA], "some_variable": [1, 2]})
+        df_out["country"] = df_out["country"].astype("str")
+
+        result = geo.harmonize_countries(
+            df=df_in,
+            countries_file="MOCK_COUNTRIES_FILE",
+            make_missing_countries_nan=True,
+            warn_on_unused_countries=False,
+            warn_on_missing_countries=False,
+        )
+        df_out.country = df_out.country.astype("string")
+        result.country = result.country.astype("string")
+        assert dataframes.are_equal(df1=df_out, df2=result)[0]
 
     def test_one_unknown_country_made_nan_and_a_known_country_changed(self):
         df_in = pd.DataFrame({"country": ["Country 1", "country_02"], "some_variable": [1, 2]})
@@ -220,10 +221,8 @@ def test_on_dataframe_with_no_countries(self):
         df_in = pd.DataFrame({"country": []})
         df_out = pd.DataFrame({"country": []})
         df_out["country"] = df_out["country"].astype(object)
-        assert dataframes.are_equal(
-            df1=df_out,
-            df2=geo.harmonize_countries(df=df_in, countries_file="MOCK_COUNTRIES_FILE", warn_on_unused_countries=False),
-        )[0]
+        result = geo.harmonize_countries(df=df_in, countries_file="MOCK_COUNTRIES_FILE", warn_on_unused_countries=False)
+        assert result.empty
 
     def test_change_country_column_name(self):
         df_in = pd.DataFrame({"Country": ["country_02"]})
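
A minimal sketch of the dtype behaviour these changes accommodate (assuming pandas 3.0's default string-dtype inference; the series and mapping below are illustrative and not part of the patch):

    import pandas as pd

    # Under pandas 3.0, plain string data is inferred as the new "str" dtype instead of
    # object, so checks against object alone no longer cover string columns.
    s = pd.Series(["spain", "france", "bad_value"])

    # Mapping through a dict leaves unmapped entries as missing values; casting the result
    # back to the input's dtype (as map_series now attempts) keeps the output dtype-stable.
    mapped = s.map({"spain": "Spain", "france": "France"})
    try:
        mapped = mapped.astype(s.dtype)
    except ValueError:
        # e.g. missing values cannot be cast to an integer dtype
        pass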