diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fcc50c5b6b20f..e0c93db0afb07 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1644,7 +1644,11 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + try: + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + except pa.ArrowNotImplementedError as err: + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) from err if convert_to_int: result = result.cast(pa_dtype) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index e19c21f81b3e1..0503bf9166ec7 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -218,18 +218,12 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - if df.dtypes.iloc[0].storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) + expected = (expected, NotImplementedError) msg = ( - "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + "can't multiply sequence by non-int of type 'str'" + "|cannot perform cumprod with type str" # NotImplementedError python backend + "|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): @@ -259,16 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" - if using_infer_string: - if series.dtype.storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) - msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" + if using_infer_string and func == "cumprod": + expected = (expected, NotImplementedError) + + msg = ( + msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 732652f24e2eb..c52168ae48ca8 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import WASM from pandas.core.dtypes.common import is_number @@ -81,7 +79,6 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -140,7 +137,6 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -163,10 +159,17 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(series, func, expected): +def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if series.dtype == "string" and func == "cumsum": + request.applymarker( + pytest.mark.xfail( + raises=(TypeError, NotImplementedError), + reason="TODO(infer_string) cumsum not yet implemented for string", + ) + ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f0ff11e5fa3f7..9defb97394635 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -441,7 +441,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", - raises=NotImplementedError, + raises=TypeError, ) ) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 7f367ded39863..7936982e4a055 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -27,10 +25,6 @@ set_default_names, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def df_schema(): @@ -127,7 +121,7 @@ def test_multiindex(self, df_schema, using_infer_string): expected["fields"][0] = { "name": "level_0", "type": "any", - "extDtype": "string", + "extDtype": "str", } expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d3328d1dfcaef..ad9dbf7554a8b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -84,7 +84,7 @@ def datetime_frame(self): # since that doesn't round-trip, see GH#33711 df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=30, freq="B"), ) df.index = df.index._with_freq(None) @@ -184,7 +184,6 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): assert_json_roundtrip_equal(result, expected, orient) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): @@ -270,7 +269,6 @@ def test_roundtrip_empty(self, orient, convert_axes): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter @@ -698,7 +696,6 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) @@ -710,6 +707,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series): if orient != "split": expected.name = None + if using_string_dtype(): + expected = expected.astype("str") + tm.assert_series_equal(result, expected) def test_series_roundtrip_empty(self, orient): @@ -808,7 +808,6 @@ def test_path(self, float_frame, int_frame, datetime_frame): df.to_json(path) read_json(path) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) @@ -821,7 +820,6 @@ def test_axis_dates(self, datetime_series, datetime_frame): tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame @@ -912,7 +910,6 @@ def test_convert_dates_infer(self, infer_word): result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "date,date_unit", [ @@ -973,7 +970,6 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_date_unit(self, unit, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101 20:43:42").as_unit("ns") @@ -1114,7 +1110,6 @@ def test_round_trip_exception(self, datapath): res = res.fillna(np.nan) tm.assert_frame_equal(res, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( @@ -1555,7 +1550,6 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 541cc39606047..b5e97314caf03 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -26,10 +24,6 @@ init_qt_clipboard, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def build_kwargs(sep, excel): kwargs = {} @@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine, using_infer_string ): # GH#50502 if dtype_backend == "pyarrow": @@ -396,6 +390,11 @@ def test_read_clipboard_dtype_backend( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_invalid_dtype_backend(self): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 4f3f613f71542..506276a25dad6 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -140,7 +140,6 @@ def test_bytesiowrapper_returns_correct_bytes(self): assert result == data.encode("utf-8") # Test that pyarrow can handle a file opened with get_handle - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_handle_pyarrow_compat(self): pa_csv = pytest.importorskip("pyarrow.csv") @@ -155,6 +154,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() + # TODO will have to update this when pyarrow' to_pandas() is fixed + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed @@ -338,7 +339,6 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], ) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_write_fspath_all(self, writer_name, writer_kwargs, module): if writer_name in ["to_latex"]: # uses Styler implementation pytest.importorskip("jinja2") @@ -365,7 +365,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll @@ -438,14 +438,13 @@ def test_unknown_engine(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_binary_mode(self): """ 'encoding' shouldn't be passed to 'open' in binary mode. @@ -455,8 +454,8 @@ def test_binary_mode(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @@ -473,8 +472,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"): @@ -504,15 +503,14 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with codecs.open(path, mode="w", encoding=encoding) as handle: @@ -525,13 +523,12 @@ def test_codecs_encoding(encoding, format): tm.assert_frame_equal(expected, df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with open(path, "wb") as handle: @@ -556,8 +553,8 @@ def test_explicit_encoding(io_class, mode, msg): # wrong mode is requested expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with io_class() as buffer: with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 5eb202dd5aa24..fd1e9b4fdf211 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -12,8 +12,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas as pd @@ -139,7 +137,6 @@ def test_compression_warning(compression_only): df.to_csv(handles.handle, compression=compression_only) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_compression_binary(compression_only): """ Binary file handles support compression. @@ -148,8 +145,8 @@ def test_compression_binary(compression_only): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) # with a file @@ -180,8 +177,8 @@ def test_gzip_reproducibility_file_name(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} @@ -203,8 +200,8 @@ def test_gzip_reproducibility_file_object(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index a9e7b2da03a4d..48580003f6c5e 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -158,7 +158,6 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension @@ -171,8 +170,8 @@ def test_to_csv_compression_encoding_gcs( """ df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # reference of compressed and encoded file