Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into string-dtype-tests-frame-replace-fillna
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Nov 15, 2024
2 parents f0fa390 + fba5f08 commit e156770
Show file tree
Hide file tree
Showing 10 changed files with 58 additions and 81 deletions.
6 changes: 5 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1644,7 +1644,11 @@ def _accumulate(
else:
data_to_accum = data_to_accum.cast(pa.int64())

result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
try:
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
except pa.ArrowNotImplementedError as err:
msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
raise TypeError(msg) from err

if convert_to_int:
result = result.cast(pa_dtype)
Expand Down
30 changes: 10 additions & 20 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,18 +218,12 @@ def transform(row):
def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string):
# GH 21224
if using_infer_string:
if df.dtypes.iloc[0].storage == "pyarrow":
import pyarrow as pa

# TODO(infer_string)
# should raise a proper TypeError instead of propagating the pyarrow error

expected = (expected, pa.lib.ArrowNotImplementedError)
else:
expected = (expected, NotImplementedError)
expected = (expected, NotImplementedError)

msg = (
"can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform"
"can't multiply sequence by non-int of type 'str'"
"|cannot perform cumprod with type str" # NotImplementedError python backend
"|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow
)
warn = None if isinstance(func, str) else FutureWarning
with pytest.raises(expected, match=msg):
Expand Down Expand Up @@ -259,16 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri
if func == "median" or func is np.nanmedian or func is np.median:
msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"

if using_infer_string:
if series.dtype.storage == "pyarrow":
import pyarrow as pa

# TODO(infer_string)
# should raise a proper TypeError instead of propagating the pyarrow error
expected = (expected, pa.lib.ArrowNotImplementedError)
else:
expected = (expected, NotImplementedError)
msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform"
if using_infer_string and func == "cumprod":
expected = (expected, NotImplementedError)

msg = (
msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation"
)
warn = None if isinstance(func, str) else FutureWarning

with pytest.raises(expected, match=msg):
Expand Down
13 changes: 8 additions & 5 deletions pandas/tests/apply/test_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import WASM

from pandas.core.dtypes.common import is_number
Expand Down Expand Up @@ -81,7 +79,6 @@ def test_apply_np_transformer(float_frame, op, how):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"series, func, expected",
chain(
Expand Down Expand Up @@ -140,7 +137,6 @@ def test_agg_cython_table_series(series, func, expected):
assert result == expected


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"series, func, expected",
chain(
Expand All @@ -163,10 +159,17 @@ def test_agg_cython_table_series(series, func, expected):
),
),
)
def test_agg_cython_table_transform_series(series, func, expected):
def test_agg_cython_table_transform_series(request, series, func, expected):
# GH21224
# test transforming functions in
# pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
if series.dtype == "string" and func == "cumsum":
request.applymarker(
pytest.mark.xfail(
raises=(TypeError, NotImplementedError),
reason="TODO(infer_string) cumsum not yet implemented for string",
)
)
warn = None if isinstance(func, str) else FutureWarning
with tm.assert_produces_warning(warn, match="is currently using Series.*"):
result = series.agg(func)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques
request.applymarker(
pytest.mark.xfail(
reason=f"{all_numeric_accumulations} not implemented for {pa_type}",
raises=NotImplementedError,
raises=TypeError,
)
)

Expand Down
8 changes: 1 addition & 7 deletions pandas/tests/io/json/test_json_table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
Expand All @@ -27,10 +25,6 @@
set_default_names,
)

pytestmark = pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string)", strict=False
)


@pytest.fixture
def df_schema():
Expand Down Expand Up @@ -127,7 +121,7 @@ def test_multiindex(self, df_schema, using_infer_string):
expected["fields"][0] = {
"name": "level_0",
"type": "any",
"extDtype": "string",
"extDtype": "str",
}
expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"}
assert result == expected
Expand Down
14 changes: 4 additions & 10 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def datetime_frame(self):
# since that doesn't round-trip, see GH#33711
df = DataFrame(
np.random.default_rng(2).standard_normal((30, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=30, freq="B"),
)
df.index = df.index._with_freq(None)
Expand Down Expand Up @@ -184,7 +184,6 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame):

assert_json_roundtrip_equal(result, expected, orient)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("dtype", [False, np.int64])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame):
Expand Down Expand Up @@ -270,7 +269,6 @@ def test_roundtrip_empty(self, orient, convert_axes):

tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame):
# TODO: improve coverage with date_format parameter
Expand Down Expand Up @@ -698,7 +696,6 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string

tm.assert_series_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize("dtype", [False, None])
def test_series_roundtrip_object(self, orient, dtype, object_series):
data = StringIO(object_series.to_json(orient=orient))
Expand All @@ -710,6 +707,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series):
if orient != "split":
expected.name = None

if using_string_dtype():
expected = expected.astype("str")

tm.assert_series_equal(result, expected)

def test_series_roundtrip_empty(self, orient):
Expand Down Expand Up @@ -808,7 +808,6 @@ def test_path(self, float_frame, int_frame, datetime_frame):
df.to_json(path)
read_json(path)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_axis_dates(self, datetime_series, datetime_frame):
# frame
json = StringIO(datetime_frame.to_json())
Expand All @@ -821,7 +820,6 @@ def test_axis_dates(self, datetime_series, datetime_frame):
tm.assert_series_equal(result, datetime_series, check_names=False)
assert result.name is None

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_convert_dates(self, datetime_series, datetime_frame):
# frame
df = datetime_frame
Expand Down Expand Up @@ -912,7 +910,6 @@ def test_convert_dates_infer(self, infer_word):
result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]]
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"date,date_unit",
[
Expand Down Expand Up @@ -973,7 +970,6 @@ def test_date_format_series_raises(self, datetime_series):
with pytest.raises(ValueError, match=msg):
ts.to_json(date_format="iso", date_unit="foo")

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_date_unit(self, unit, datetime_frame):
df = datetime_frame
df["date"] = Timestamp("20130101 20:43:42").as_unit("ns")
Expand Down Expand Up @@ -1114,7 +1110,6 @@ def test_round_trip_exception(self, datapath):
res = res.fillna(np.nan)
tm.assert_frame_equal(res, df)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.network
@pytest.mark.single_cpu
@pytest.mark.parametrize(
Expand Down Expand Up @@ -1555,7 +1550,6 @@ def test_data_frame_size_after_to_json(self):

assert size_before == size_after

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]]
)
Expand Down
13 changes: 6 additions & 7 deletions pandas/tests/io/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import (
PyperclipException,
PyperclipWindowsException,
Expand All @@ -26,10 +24,6 @@
init_qt_clipboard,
)

pytestmark = pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string)", strict=False
)


def build_kwargs(sep, excel):
kwargs = {}
Expand Down Expand Up @@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data):

@pytest.mark.parametrize("engine", ["c", "python"])
def test_read_clipboard_dtype_backend(
self, clipboard, string_storage, dtype_backend, engine
self, clipboard, string_storage, dtype_backend, engine, using_infer_string
):
# GH#50502
if dtype_backend == "pyarrow":
Expand Down Expand Up @@ -396,6 +390,11 @@ def test_read_clipboard_dtype_backend(
)
expected["g"] = ArrowExtensionArray(pa.array([None, None]))

if using_infer_string:
expected.columns = expected.columns.astype(
pd.StringDtype(string_storage, na_value=np.nan)
)

tm.assert_frame_equal(result, expected)

def test_invalid_dtype_backend(self):
Expand Down
33 changes: 15 additions & 18 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ def test_bytesiowrapper_returns_correct_bytes(self):
assert result == data.encode("utf-8")

# Test that pyarrow can handle a file opened with get_handle
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_get_handle_pyarrow_compat(self):
pa_csv = pytest.importorskip("pyarrow.csv")

Expand All @@ -155,6 +154,8 @@ def test_get_handle_pyarrow_compat(self):
s = StringIO(data)
with icom.get_handle(s, "rb", is_text=False) as handles:
df = pa_csv.read_csv(handles.handle).to_pandas()
# TODO will have to update this when pyarrow's to_pandas() is fixed
expected = expected.astype("object")
tm.assert_frame_equal(df, expected)
assert not s.closed

Expand Down Expand Up @@ -338,7 +339,6 @@ def test_read_fspath_all(self, reader, module, path, datapath):
("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"),
],
)
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_write_fspath_all(self, writer_name, writer_kwargs, module):
if writer_name in ["to_latex"]: # uses Styler implementation
pytest.importorskip("jinja2")
Expand All @@ -365,7 +365,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module):
expected = f_path.read()
assert result == expected

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support")
def test_write_fspath_hdf5(self):
# Same test as write_fspath_all, except HDF5 files aren't
# necessarily byte-for-byte identical for a given dataframe, so we'll
Expand Down Expand Up @@ -438,14 +438,13 @@ def test_unknown_engine(self):
with tm.ensure_clean() as path:
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
df.to_csv(path)
with pytest.raises(ValueError, match="Unknown engine"):
pd.read_csv(path, engine="pyt")

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_binary_mode(self):
"""
'encoding' shouldn't be passed to 'open' in binary mode.
Expand All @@ -455,8 +454,8 @@ def test_binary_mode(self):
with tm.ensure_clean() as path:
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
df.to_csv(path, mode="w+b")
tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
Expand All @@ -473,8 +472,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_):
"""
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
with tm.ensure_clean() as path:
with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"):
Expand Down Expand Up @@ -504,15 +503,14 @@ def test_is_fsspec_url():
assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize("encoding", [None, "utf-8"])
@pytest.mark.parametrize("format", ["csv", "json"])
def test_codecs_encoding(encoding, format):
# GH39247
expected = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
with tm.ensure_clean() as path:
with codecs.open(path, mode="w", encoding=encoding) as handle:
Expand All @@ -525,13 +523,12 @@ def test_codecs_encoding(encoding, format):
tm.assert_frame_equal(expected, df)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_codecs_get_writer_reader():
# GH39247
expected = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
with tm.ensure_clean() as path:
with open(path, "wb") as handle:
Expand All @@ -556,8 +553,8 @@ def test_explicit_encoding(io_class, mode, msg):
# wrong mode is requested
expected = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
with io_class() as buffer:
with pytest.raises(TypeError, match=msg):
Expand Down
Loading

0 comments on commit e156770

Please sign in to comment.