From 1df562a17187fe795c5203ec53a6458c0e31f152 Mon Sep 17 00:00:00 2001
From: Mojmir Vinkler
Date: Thu, 21 Nov 2024 10:10:11 +0100
Subject: [PATCH] :hammer: Changes for compatibility with pandas 3.0 (#3550)

:sparkles: Changes for compatibility with pandas 3.0

---------

Co-authored-by: Joris Van den Bossche
---
 lib/datautils/owid/datautils/dataframes.py | 17 +++++++++++-
 lib/datautils/tests/test_dataframes.py     |  5 ++--
 lib/repack/tests/test_repack.py            |  2 +-
 tests/data_helpers/test_geo.py             | 31 +++++++++++-----------
 4 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/lib/datautils/owid/datautils/dataframes.py b/lib/datautils/owid/datautils/dataframes.py
index 8eac2d987a2..195f6712336 100644
--- a/lib/datautils/owid/datautils/dataframes.py
+++ b/lib/datautils/owid/datautils/dataframes.py
@@ -108,7 +108,7 @@ def compare(
     # Compare, column by column, the elements of the two dataframes.
     compared = pd.DataFrame()
     for col in columns:
-        if (df1[col].dtype in (object, "category")) or (df2[col].dtype in (object, "category")):
+        if (df1[col].dtype in (object, "category", "string")) or (df2[col].dtype in (object, "category", "string")):
            # Apply a direct comparison for strings or categories
            compared_row = df1[col].values == df2[col].values
         else:
@@ -461,6 +461,13 @@ def map_series(
     # Replace those nans with their original values, except if they were actually meant to be mapped to nan.
     # For example, if {"bad_value": np.nan} was part of the mapping, do not replace those nans back to "bad_value".
 
+    # if we are setting values from the original series, ensure we have the same dtype
+    try:
+        series_mapped = series_mapped.astype(series.dtype, copy=False)
+    except ValueError:
+        # casting NaNs to integer will fail
+        pass
+
     # Detect values in the mapping that were intended to be mapped to nan.
     values_mapped_to_nan = [
         original_value for original_value, target_value in mapping.items() if pd.isnull(target_value)
@@ -632,6 +639,14 @@ def combine_two_overlapping_dataframes(
     # Give priority to df1 on overlapping values.
     combined, df2 = df1.align(df2)
 
+    new_columns = df2.columns.difference(df1.columns)
+    for col in new_columns:
+        try:
+            combined[col] = combined[col].astype(df2[col].dtype, copy=False)
+        except ValueError:
+            # casting NaNs to integer will fail
+            pass
+
     # Fill missing values in df1 with values from df2.
     combined = combined.fillna(df2)
 
diff --git a/lib/datautils/tests/test_dataframes.py b/lib/datautils/tests/test_dataframes.py
index ebb87011d49..d6b81a51c7b 100644
--- a/lib/datautils/tests/test_dataframes.py
+++ b/lib/datautils/tests/test_dataframes.py
@@ -326,13 +326,14 @@ def test_default_aggregate_with_some_nans_ignored_different_types_and_more_nans(
             }
         ).set_index("year")
         df_out["value_03"] = df_out["value_03"].astype(object)
-        assert dataframes.groupby_agg(
+        result = dataframes.groupby_agg(
             df_in,
             ["year"],
             aggregations=None,
             num_allowed_nans=None,
             frac_allowed_nans=None,
-        ).equals(df_out)
+        )
+        assert result.equals(df_out)
 
     def test_default_aggregate_with_num_allowed_nans_zero(self):
         df_in = pd.DataFrame(
diff --git a/lib/repack/tests/test_repack.py b/lib/repack/tests/test_repack.py
index c5f57f48d8d..54a29d24ee4 100644
--- a/lib/repack/tests/test_repack.py
+++ b/lib/repack/tests/test_repack.py
@@ -197,7 +197,7 @@ def test_repack_object_with_nan_string():
 
 def test_repack_category():
     s = pd.Series(["a", "b", "c", None])
-    assert s.dtype == np.object_
+    assert s.dtype == np.object_ or s.dtype == "str"
     v = repack.repack_series(s)
     assert v.dtype == "category"
 
diff --git a/tests/data_helpers/test_geo.py b/tests/data_helpers/test_geo.py
index 637cd6e6d38..813cec00e7e 100644
--- a/tests/data_helpers/test_geo.py
+++ b/tests/data_helpers/test_geo.py
@@ -189,18 +189,19 @@ def test_one_country_unchanged_and_another_unknown(self):
 
     def test_two_unknown_countries_made_nan(self):
         df_in = pd.DataFrame({"country": ["Country 1", "country_04"], "some_variable": [1, 2]})
-        df_out = pd.DataFrame({"country": [np.nan, np.nan], "some_variable": [1, 2]})
-        df_out["country"] = df_out["country"].astype(object)
-        assert dataframes.are_equal(
-            df1=df_out,
-            df2=geo.harmonize_countries(
-                df=df_in,
-                countries_file="MOCK_COUNTRIES_FILE",
-                make_missing_countries_nan=True,
-                warn_on_unused_countries=False,
-                warn_on_missing_countries=False,
-            ),
-        )[0]
+        df_out = pd.DataFrame({"country": [pd.NA, pd.NA], "some_variable": [1, 2]})
+        df_out["country"] = df_out["country"].astype("str")
+
+        result = geo.harmonize_countries(
+            df=df_in,
+            countries_file="MOCK_COUNTRIES_FILE",
+            make_missing_countries_nan=True,
+            warn_on_unused_countries=False,
+            warn_on_missing_countries=False,
+        )
+        df_out.country = df_out.country.astype("string")
+        result.country = result.country.astype("string")
+        assert dataframes.are_equal(df1=df_out, df2=result)[0]
 
     def test_one_unknown_country_made_nan_and_a_known_country_changed(self):
         df_in = pd.DataFrame({"country": ["Country 1", "country_02"], "some_variable": [1, 2]})
@@ -220,10 +221,8 @@ def test_on_dataframe_with_no_countries(self):
         df_in = pd.DataFrame({"country": []})
         df_out = pd.DataFrame({"country": []})
         df_out["country"] = df_out["country"].astype(object)
-        assert dataframes.are_equal(
-            df1=df_out,
-            df2=geo.harmonize_countries(df=df_in, countries_file="MOCK_COUNTRIES_FILE", warn_on_unused_countries=False),
-        )[0]
+        result = geo.harmonize_countries(df=df_in, countries_file="MOCK_COUNTRIES_FILE", warn_on_unused_countries=False)
+        assert result.empty
 
     def test_change_country_column_name(self):
         df_in = pd.DataFrame({"Country": ["country_02"]})
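
A minimal sketch of the dtype behaviour these changes accommodate (assuming pandas 3.0's default string-dtype inference; the series and mapping below are illustrative and not part of the patch):

    import pandas as pd

    # Under pandas 3.0, plain string data is inferred as the new "str" dtype instead of
    # object, so checks against object alone no longer cover string columns.
    s = pd.Series(["spain", "france", "bad_value"])

    # Mapping through a dict leaves unmapped entries as missing values; casting the result
    # back to the input's dtype (as map_series now attempts) keeps the output dtype-stable.
    mapped = s.map({"spain": "Spain", "france": "France"})
    try:
        mapped = mapped.astype(s.dtype)
    except ValueError:
        # e.g. missing values cannot be cast to an integer dtype
        pass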