Skip to content

Commit

Permalink
🔨 Changes for compatibility with pandas 3.0 (#3550)
Browse files: browse the repository at this point in the history
✨ Changes for compatibility with pandas 3.0

---------

Co-authored-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
Marigold and jorisvandenbossche authored Nov 21, 2024
1 parent 8ff6409 commit 1df562a
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 20 deletions.
17 changes: 16 additions & 1 deletion lib/datautils/owid/datautils/dataframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def compare(
# Compare, column by column, the elements of the two dataframes.
compared = pd.DataFrame()
for col in columns:
if (df1[col].dtype in (object, "category")) or (df2[col].dtype in (object, "category")):
if (df1[col].dtype in (object, "category", "string")) or (df2[col].dtype in (object, "category", "string")):
# Apply a direct comparison for strings or categories
compared_row = df1[col].values == df2[col].values
else:
Expand Down Expand Up @@ -461,6 +461,13 @@ def map_series(
# Replace those nans with their original values, except if they were actually meant to be mapped to nan.
# For example, if {"bad_value": np.nan} was part of the mapping, do not replace those nans back to "bad_value".

# if we are setting values from the original series, ensure we have the same dtype
try:
series_mapped = series_mapped.astype(series.dtype, copy=False)
except ValueError:
# casting NaNs to integer will fail
pass

# Detect values in the mapping that were intended to be mapped to nan.
values_mapped_to_nan = [
original_value for original_value, target_value in mapping.items() if pd.isnull(target_value)
Expand Down Expand Up @@ -632,6 +639,14 @@ def combine_two_overlapping_dataframes(
# Give priority to df1 on overlapping values.
combined, df2 = df1.align(df2)

new_columns = df2.columns.difference(df1.columns)
for col in new_columns:
try:
combined[col] = combined[col].astype(df2[col].dtype, copy=False)
except ValueError:
# casting NaNs to integer will fail
pass

# Fill missing values in df1 with values from df2.
combined = combined.fillna(df2)

Expand Down
5 changes: 3 additions & 2 deletions lib/datautils/tests/test_dataframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,14 @@ def test_default_aggregate_with_some_nans_ignored_different_types_and_more_nans(
}
).set_index("year")
df_out["value_03"] = df_out["value_03"].astype(object)
assert dataframes.groupby_agg(
result = dataframes.groupby_agg(
df_in,
["year"],
aggregations=None,
num_allowed_nans=None,
frac_allowed_nans=None,
).equals(df_out)
)
assert result.equals(df_out)

def test_default_aggregate_with_num_allowed_nans_zero(self):
df_in = pd.DataFrame(
Expand Down
2 changes: 1 addition & 1 deletion lib/repack/tests/test_repack.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def test_repack_object_with_nan_string():

def test_repack_category():
s = pd.Series(["a", "b", "c", None])
assert s.dtype == np.object_
assert s.dtype == np.object_ or s.dtype == "str"

v = repack.repack_series(s)
assert v.dtype == "category"
Expand Down
31 changes: 15 additions & 16 deletions tests/data_helpers/test_geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,18 +189,19 @@ def test_one_country_unchanged_and_another_unknown(self):

def test_two_unknown_countries_made_nan(self):
df_in = pd.DataFrame({"country": ["Country 1", "country_04"], "some_variable": [1, 2]})
df_out = pd.DataFrame({"country": [np.nan, np.nan], "some_variable": [1, 2]})
df_out["country"] = df_out["country"].astype(object)
assert dataframes.are_equal(
df1=df_out,
df2=geo.harmonize_countries(
df=df_in,
countries_file="MOCK_COUNTRIES_FILE",
make_missing_countries_nan=True,
warn_on_unused_countries=False,
warn_on_missing_countries=False,
),
)[0]
df_out = pd.DataFrame({"country": [pd.NA, pd.NA], "some_variable": [1, 2]})
df_out["country"] = df_out["country"].astype("str")

result = geo.harmonize_countries(
df=df_in,
countries_file="MOCK_COUNTRIES_FILE",
make_missing_countries_nan=True,
warn_on_unused_countries=False,
warn_on_missing_countries=False,
)
df_out.country = df_out.country.astype("string")
result.country = result.country.astype("string")
assert dataframes.are_equal(df1=df_out, df2=result)[0]

def test_one_unknown_country_made_nan_and_a_known_country_changed(self):
df_in = pd.DataFrame({"country": ["Country 1", "country_02"], "some_variable": [1, 2]})
Expand All @@ -220,10 +221,8 @@ def test_on_dataframe_with_no_countries(self):
df_in = pd.DataFrame({"country": []})
df_out = pd.DataFrame({"country": []})
df_out["country"] = df_out["country"].astype(object)
assert dataframes.are_equal(
df1=df_out,
df2=geo.harmonize_countries(df=df_in, countries_file="MOCK_COUNTRIES_FILE", warn_on_unused_countries=False),
)[0]
result = geo.harmonize_countries(df=df_in, countries_file="MOCK_COUNTRIES_FILE", warn_on_unused_countries=False)
assert result.empty

def test_change_country_column_name(self):
df_in = pd.DataFrame({"Country": ["country_02"]})
Expand Down

0 comments on commit 1df562a

Please sign in to comment.