From f5b3c07cd48e2e86071fd28b6ea6549af9c8dec0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 16 Nov 2024 10:42:17 -0500 Subject: [PATCH 1/3] CI: Fix fastparquet failure from new release --- pandas/tests/io/test_parquet.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6ef7105cf5ccc..287d6c6dbb9b7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1174,7 +1174,6 @@ def test_non_nanosecond_timestamps(self, temp_file): class TestParquetFastParquet(Base): - @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") def test_basic(self, fp, df_full): pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") @@ -1213,10 +1212,6 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) - @pytest.mark.xfail( - Version(np.__version__) >= Version("2.0.0"), - reason="fastparquet uses np.float_ in numpy2", - ) def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") @@ -1331,9 +1326,6 @@ def test_empty_dataframe(self, fp): expected = df.copy() check_round_trip(df, fp, expected=expected) - @pytest.mark.xfail( - reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929" - ) def test_timezone_aware_index(self, fp, timezone_aware_date_list): idx = 5 * [timezone_aware_date_list] From 8a1791e0501aaddcf86609895529f4dfbe39215f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 16 Nov 2024 13:55:28 -0500 Subject: [PATCH 2/3] More fixes --- pandas/tests/io/test_fsspec.py | 1 - pandas/tests/io/test_gcs.py | 3 --- pandas/tests/io/test_parquet.py | 37 ++++++++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index aa9c47ea0e63c..4bf39fda555ca 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -176,7 +176,6 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 48580003f6c5e..f68ef5fa2e0e5 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under17p0 from pandas import ( @@ -207,7 +205,6 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 287d6c6dbb9b7..31cdb6626d237 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1174,8 +1174,17 @@ def test_non_nanosecond_timestamps(self, temp_file): class TestParquetFastParquet(Base): - def test_basic(self, fp, df_full): + def test_basic(self, fp, df_full, request): pytz = pytest.importorskip("pytz") + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=("datetime_with_nat gets incorrect values"), + ) + ) + tz = pytz.timezone("US/Eastern") df = df_full @@ -1212,7 +1221,17 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) - def test_bool_with_none(self, fp): + def test_bool_with_none(self, fp, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0") and Version( + np.__version__ + ) >= Version("2.0.0"): + request.applymarker( + pytest.mark.xfail( + reason=("fastparquet uses np.float_ in numpy2"), + ) + ) df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") # Fastparquet bug in 0.7.1 makes it so that this dtype becomes @@ -1326,7 +1345,19 @@ def test_empty_dataframe(self, fp): expected = df.copy() check_round_trip(df, fp, expected=expected) - def test_timezone_aware_index(self, fp, timezone_aware_date_list): + def test_timezone_aware_index(self, fp, timezone_aware_date_list, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=( + "fastparquet bug, see " + "https://github.com/dask/fastparquet/issues/929" + ), + ) + ) + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) From b91a48cc0b0d141ce0e8e052153b09c499c62193 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 16 Nov 2024 17:52:02 -0500 Subject: [PATCH 3/3] Skip for pyarrow string build --- pandas/tests/io/test_fsspec.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 4bf39fda555ca..5340560884afe 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,6 +5,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, date_range, @@ -176,6 +178,9 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet" +) def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet")