Skip to content

Commit

Permalink
check for is_string_dtype and unsupported mixed type (#335)
Browse files Browse the repository at this point in the history
  • Loading branch information
fdosani authored Oct 9, 2024
1 parent 60c52d2 commit 759efa2
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 24 deletions.
2 changes: 1 addition & 1 deletion datacompy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
Then extended to carry that functionality over to Spark Dataframes.
"""

__version__ = "0.13.3"
__version__ = "0.14.0"

import platform
from warnings import warn
Expand Down
22 changes: 18 additions & 4 deletions datacompy/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,11 @@ def columns_equal(
- Non-numeric values (i.e. where np.isclose can't be used) will just
trigger True on two nulls or exact matches.
Notes
-----
As of version ``0.14.0``, if either column is of a mixed data type, the
comparison short-circuits and returns ``False`` for every row.
Parameters
----------
col_1 : Pandas.Series
Expand All @@ -792,6 +797,15 @@ def columns_equal(
values don't match.
"""
compare: pd.Series[bool]

# short circuit if comparing mixed type columns. We don't want to support this moving forward.
if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype(
col_2
).startswith("mixed"):
compare = pd.Series(False, index=col_1.index)
compare.index = col_1.index
return compare

try:
compare = pd.Series(
np.isclose(col_1, col_2, rtol=rel_tol, atol=abs_tol, equal_nan=True)
Expand All @@ -810,15 +824,15 @@ def columns_equal(
except (ValueError, TypeError):
try:
if ignore_spaces:
if col_1.dtype.kind == "O":
if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
col_1 = col_1.str.strip()
if col_2.dtype.kind == "O":
if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
col_2 = col_2.str.strip()

if ignore_case:
if col_1.dtype.kind == "O":
if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
col_1 = col_1.str.upper()
if col_2.dtype.kind == "O":
if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
col_2 = col_2.str.upper()

if {col_1.dtype.kind, col_2.dtype.kind} == {"M", "O"}:
Expand Down
38 changes: 19 additions & 19 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_string_columns_equal_with_ignore_spaces():
something||False
|something|False
||True"""
df = pd.read_csv(io.StringIO(data), sep="|")
df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
expect_out = df["expected"]
assert_series_equal(expect_out, actual_out, check_names=False)
Expand All @@ -119,7 +119,7 @@ def test_string_columns_equal_with_ignore_spaces_and_case():
something||False
|something|False
||True"""
df = pd.read_csv(io.StringIO(data), sep="|")
df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
actual_out = datacompy.columns_equal(
df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
)
Expand Down Expand Up @@ -160,7 +160,7 @@ def test_date_columns_equal_with_ignore_spaces():
2017-01-01||False
|2017-01-01|False
||True"""
df = pd.read_csv(io.StringIO(data), sep="|")
df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
# First compare just the strings
actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
expect_out = df["expected"]
Expand Down Expand Up @@ -192,7 +192,7 @@ def test_date_columns_equal_with_ignore_spaces_and_case():
2017-01-01||False
|2017-01-01|False
||True"""
df = pd.read_csv(io.StringIO(data), sep="|")
df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
# First compare just the strings
actual_out = datacompy.columns_equal(
df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
Expand Down Expand Up @@ -364,10 +364,10 @@ def test_infinity_and_beyond():
def test_mixed_column():
df = pd.DataFrame(
[
{"a": "hi", "b": "hi", "expected": True},
{"a": 1, "b": 1, "expected": True},
{"a": np.inf, "b": np.inf, "expected": True},
{"a": Decimal("1"), "b": Decimal("1"), "expected": True},
{"a": "hi", "b": "hi", "expected": False},
{"a": 1, "b": 1, "expected": False},
{"a": np.inf, "b": np.inf, "expected": False},
{"a": Decimal("1"), "b": Decimal("1"), "expected": False},
{"a": 1, "b": "1", "expected": False},
{"a": 1, "b": "yo", "expected": False},
]
Expand All @@ -380,10 +380,10 @@ def test_mixed_column():
def test_mixed_column_with_ignore_spaces():
df = pd.DataFrame(
[
{"a": "hi", "b": "hi ", "expected": True},
{"a": 1, "b": 1, "expected": True},
{"a": np.inf, "b": np.inf, "expected": True},
{"a": Decimal("1"), "b": Decimal("1"), "expected": True},
{"a": "hi", "b": "hi ", "expected": False},
{"a": 1, "b": 1, "expected": False},
{"a": np.inf, "b": np.inf, "expected": False},
{"a": Decimal("1"), "b": Decimal("1"), "expected": False},
{"a": 1, "b": "1 ", "expected": False},
{"a": 1, "b": "yo ", "expected": False},
]
Expand All @@ -396,15 +396,15 @@ def test_mixed_column_with_ignore_spaces():
def test_mixed_column_with_ignore_spaces_and_case():
df = pd.DataFrame(
[
{"a": "hi", "b": "hi ", "expected": True},
{"a": 1, "b": 1, "expected": True},
{"a": np.inf, "b": np.inf, "expected": True},
{"a": Decimal("1"), "b": Decimal("1"), "expected": True},
{"a": "hi", "b": "hi ", "expected": False},
{"a": 1, "b": 1, "expected": False},
{"a": np.inf, "b": np.inf, "expected": False},
{"a": Decimal("1"), "b": Decimal("1"), "expected": False},
{"a": 1, "b": "1 ", "expected": False},
{"a": 1, "b": "yo ", "expected": False},
{"a": "Hi", "b": "hI ", "expected": True},
{"a": "HI", "b": "HI ", "expected": True},
{"a": "hi", "b": "hi ", "expected": True},
{"a": "Hi", "b": "hI ", "expected": False},
{"a": "HI", "b": "HI ", "expected": False},
{"a": "hi", "b": "hi ", "expected": False},
]
)
actual_out = datacompy.columns_equal(
Expand Down

0 comments on commit 759efa2

Please sign in to comment.