Skip to content

Commit

Permalink
Backport PR #60320 on branch 2.3.x (TST (string dtype): resolve xfails in common IO tests) (#60325)
Browse files Browse the repository at this point in the history

Backport PR #60320: TST (string dtype): resolve xfails in common IO tests

Co-authored-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
meeseeksmachine and jorisvandenbossche authored Nov 15, 2024
1 parent fe1f4f9 commit e37ffb3
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 37 deletions.
13 changes: 6 additions & 7 deletions pandas/tests/io/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import (
PyperclipException,
PyperclipWindowsException,
Expand All @@ -26,10 +24,6 @@
init_qt_clipboard,
)

pytestmark = pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string)", strict=False
)


def build_kwargs(sep, excel):
kwargs = {}
Expand Down Expand Up @@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data):

@pytest.mark.parametrize("engine", ["c", "python"])
def test_read_clipboard_dtype_backend(
self, clipboard, string_storage, dtype_backend, engine
self, clipboard, string_storage, dtype_backend, engine, using_infer_string
):
# GH#50502
if dtype_backend == "pyarrow":
Expand Down Expand Up @@ -396,6 +390,11 @@ def test_read_clipboard_dtype_backend(
)
expected["g"] = ArrowExtensionArray(pa.array([None, None]))

if using_infer_string:
expected.columns = expected.columns.astype(
pd.StringDtype(string_storage, na_value=np.nan)
)

tm.assert_frame_equal(result, expected)

def test_invalid_dtype_backend(self):
Expand Down
33 changes: 15 additions & 18 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ def test_bytesiowrapper_returns_correct_bytes(self):
assert result == data.encode("utf-8")

# Test that pyarrow can handle a file opened with get_handle
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_get_handle_pyarrow_compat(self):
pa_csv = pytest.importorskip("pyarrow.csv")

Expand All @@ -169,6 +168,8 @@ def test_get_handle_pyarrow_compat(self):
s = StringIO(data)
with icom.get_handle(s, "rb", is_text=False) as handles:
df = pa_csv.read_csv(handles.handle).to_pandas()
# TODO will have to update this when pyarrow's to_pandas() is fixed
expected = expected.astype("object")
tm.assert_frame_equal(df, expected)
assert not s.closed

Expand Down Expand Up @@ -352,7 +353,6 @@ def test_read_fspath_all(self, reader, module, path, datapath):
("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"),
],
)
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_write_fspath_all(self, writer_name, writer_kwargs, module):
if writer_name in ["to_latex"]: # uses Styler implementation
pytest.importorskip("jinja2")
Expand All @@ -379,7 +379,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module):
expected = f_path.read()
assert result == expected

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support")
def test_write_fspath_hdf5(self):
# Same test as write_fspath_all, except HDF5 files aren't
# necessarily byte-for-byte identical for a given dataframe, so we'll
Expand Down Expand Up @@ -450,14 +450,13 @@ def test_unknown_engine(self):
with tm.ensure_clean() as path:
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
df.to_csv(path)
with pytest.raises(ValueError, match="Unknown engine"):
pd.read_csv(path, engine="pyt")

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_binary_mode(self):
"""
'encoding' shouldn't be passed to 'open' in binary mode.
Expand All @@ -467,8 +466,8 @@ def test_binary_mode(self):
with tm.ensure_clean() as path:
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
df.to_csv(path, mode="w+b")
tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
Expand All @@ -485,8 +484,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_):
"""
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
with tm.ensure_clean() as path:
with tm.assert_produces_warning(UnicodeWarning):
Expand Down Expand Up @@ -516,15 +515,14 @@ def test_is_fsspec_url():
assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize("encoding", [None, "utf-8"])
@pytest.mark.parametrize("format", ["csv", "json"])
def test_codecs_encoding(encoding, format):
# GH39247
expected = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
with tm.ensure_clean() as path:
with codecs.open(path, mode="w", encoding=encoding) as handle:
Expand All @@ -537,13 +535,12 @@ def test_codecs_encoding(encoding, format):
tm.assert_frame_equal(expected, df)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_codecs_get_writer_reader():
# GH39247
expected = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
with tm.ensure_clean() as path:
with open(path, "wb") as handle:
Expand All @@ -568,8 +565,8 @@ def test_explicit_encoding(io_class, mode, msg):
# wrong mode is requested
expected = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
with io_class() as buffer:
with pytest.raises(TypeError, match=msg):
Expand Down
15 changes: 6 additions & 9 deletions pandas/tests/io/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import is_platform_windows

import pandas as pd
Expand Down Expand Up @@ -139,7 +137,6 @@ def test_compression_warning(compression_only):
df.to_csv(handles.handle, compression=compression_only)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_compression_binary(compression_only):
"""
Binary file handles support compression.
Expand All @@ -148,8 +145,8 @@ def test_compression_binary(compression_only):
"""
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)

# with a file
Expand Down Expand Up @@ -180,8 +177,8 @@ def test_gzip_reproducibility_file_name():
"""
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
compression_options = {"method": "gzip", "mtime": 1}

Expand All @@ -203,8 +200,8 @@ def test_gzip_reproducibility_file_object():
"""
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=pd.Index(list("ABCD"), dtype=object),
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
columns=pd.Index(list("ABCD")),
index=pd.Index([f"i-{i}" for i in range(30)]),
)
compression_options = {"method": "gzip", "mtime": 1}

Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/io/test_gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str):
assert result == expected


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize("encoding", ["utf-8", "cp1251"])
def test_to_csv_compression_encoding_gcs(
gcs_buffer, compression_only, encoding, compression_to_extension
Expand All @@ -160,8 +159,8 @@ def test_to_csv_compression_encoding_gcs(
"""
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)

# reference of compressed and encoded file
Expand Down

0 comments on commit e37ffb3

Please sign in to comment.