From 6439e31199dd294435ac1a3005b052dc027a332d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:41:25 -0400 Subject: [PATCH] Unblock Python test failures (#1220) * Install latest dask/distributed in CI * Add dask/distributed git installs to GPU CI envs * Pin pandas to <2.1 for now * Bump CI * Temporarily allow pyarrow string conversion * Disable p2p and pyarrow string conversion for now * Remove pandas<2.1 pin * Fix datetime formatting in compat tests * cuDF string columns are not compatible with nsmallest/nlargest * xfail q38 for now * mark xgboost tests as flaky on macOS * nudge CI * xfail q64 * More specific FIXMES for query regressions * Increase retries count for flaky mac tests --- conftest.py | 7 ++----- continuous_integration/gpuci/build.sh | 12 +++++++----- continuous_integration/gpuci/environment-3.10.yaml | 6 +++++- continuous_integration/gpuci/environment-3.9.yaml | 6 +++++- dask_sql/_compat.py | 1 + dask_sql/physical/utils/sort.py | 7 ++----- tests/integration/test_compatibility.py | 8 ++++++-- tests/integration/test_model.py | 2 ++ tests/unit/test_queries.py | 2 ++ 9 files changed, 32 insertions(+), 19 deletions(-) diff --git a/conftest.py b/conftest.py index 172fa3028..62559c061 100644 --- a/conftest.py +++ b/conftest.py @@ -12,17 +12,14 @@ def pytest_addoption(parser): def pytest_runtest_setup(item): - # TODO: explore adding support for pyarrow string columns + # TODO: get pyarrow strings and p2p shuffle working dask.config.set({"dataframe.convert-string": False}) + dask.config.set({"dataframe.shuffle.algorithm": "tasks"}) if "gpu" in item.keywords: if not item.config.getoption("--rungpu"): pytest.skip("need --rungpu option to run") - # FIXME: P2P shuffle isn't fully supported on GPU, so we must explicitly disable it - dask.config.set({"dataframe.shuffle.algorithm": "tasks"}) # manually enable cudf decimal support dask.config.set({"sql.mappings.decimal_support": "cudf"}) - else: - dask.config.set({"dataframe.shuffle.algorithm": None}) if "queries" in item.keywords and not item.config.getoption("--runqueries"): pytest.skip("need --runqueries option to run") diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh index 790e75540..175fc78da 100644 --- a/continuous_integration/gpuci/build.sh +++ b/continuous_integration/gpuci/build.sh @@ -37,6 +37,9 @@ gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate dask_sql +gpuci_logger "Update conda env" +gpuci_mamba_retry env update -n dask_sql -f continuous_integration/gpuci/environment-${PYTHON_VER}.yaml + gpuci_logger "Install awscli" gpuci_mamba_retry install -y -c conda-forge awscli @@ -46,11 +49,10 @@ gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}parquet_2gb_sor gpuci_logger "Download query files" gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}queries/" tests/unit/queries/ --recursive -# TODO: source install once dask/distributed are unpinned by dask-cuda -# gpuci_logger "Install dask" -# python -m pip install git+https://github.com/dask/dask -# gpuci_logger "Install distributed" -# python -m pip install git+https://github.com/dask/distributed +gpuci_logger "Install dask" +python -m pip install git+https://github.com/dask/dask +gpuci_logger "Install distributed" +python -m pip install git+https://github.com/dask/distributed gpuci_logger "Install dask-sql" pip install -e . -vv diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index bd572aa56..bb6d00032 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -7,6 +7,7 @@ channels: - nodefaults dependencies: - c-compiler +- zlib - dask>=2022.3.0 - fastapi>=0.92.0 - fugue>=0.7.3 @@ -48,4 +49,7 @@ dependencies: - ucx-py=0.34 - xgboost=*=rapidsai_py* - libxgboost=*=rapidsai_h* -- zlib +- pip +- pip: + - git+https://github.com/dask/dask + - git+https://github.com/dask/distributed diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index c9051fda1..47ca34e24 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -7,6 +7,7 @@ channels: - nodefaults dependencies: - c-compiler +- zlib - dask>=2022.3.0 - fastapi>=0.92.0 - fugue>=0.7.3 @@ -48,4 +49,7 @@ dependencies: - ucx-py=0.34 - xgboost=*=rapidsai_py* - libxgboost=*=rapidsai_h* -- zlib +- pip +- pip: + - git+https://github.com/dask/dask + - git+https://github.com/dask/distributed diff --git a/dask_sql/_compat.py b/dask_sql/_compat.py index be8cfbae5..9a80ad0d4 100644 --- a/dask_sql/_compat.py +++ b/dask_sql/_compat.py @@ -9,6 +9,7 @@ INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0") PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0") +PANDAS_GT_210 = _pandas_version >= parseVersion("2.1.0") # TODO: remove if prompt-toolkit min version gets bumped PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29") diff --git a/dask_sql/physical/utils/sort.py b/dask_sql/physical/utils/sort.py index 8ac103ff1..f4299e3dc 100644 --- a/dask_sql/physical/utils/sort.py +++ b/dask_sql/physical/utils/sort.py @@ -130,11 +130,8 @@ def is_topk_optimizable( sort_num_rows is None or not single_ascending or any(sort_null_first) - # pandas doesnt support nsmallest/nlargest with object dtypes - or ( - "pandas" in str(df._partition_type) - and any(df[sort_columns].dtypes == "object") - ) + # pandas/cudf don't support nsmallest/nlargest with object dtypes + or any(df[sort_columns].dtypes == "object") or ( sort_num_rows * len(df.columns) > dask_config.get("sql.sort.topk-nelem-limit") diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index b34d64bbb..c55c7b327 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -18,6 +18,7 @@ import pytest from dask_sql import Context +from dask_sql._compat import PANDAS_GT_210 from dask_sql.utils import ParsingException from tests.utils import assert_eq @@ -28,8 +29,11 @@ def cast_datetime_to_string(df): if not cols: return df - # Casting directly to string loses second precision - df[cols] = df[cols].astype("object").astype("string") + strf = "%Y-%m-%dT%H:%M:%S" if PANDAS_GT_210 else "%Y-%m-%d %H:%M:%S" + + for col in cols: + df[col] = df[col].dt.strftime(strf) + return df diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 9980fc103..550f9c7ee 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -88,6 +88,7 @@ def test_training_and_prediction(c, gpu_client): check_trained_model(c, df_name=timeseries) +@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'") @pytest.mark.xfail( sys.platform == "win32", reason="'xgboost.core.XGBoostError: Failed to poll' on Windows only", @@ -626,6 +627,7 @@ def test_mlflow_export(c, tmpdir): ) +@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'") def test_mlflow_export_xgboost(c, client, tmpdir): # Test only when mlflow & xgboost was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") diff --git a/tests/unit/test_queries.py b/tests/unit/test_queries.py index bfaedfcee..43b234bba 100644 --- a/tests/unit/test_queries.py +++ b/tests/unit/test_queries.py @@ -16,6 +16,7 @@ 28, 35, 36, + 38, # FIXME: failing due to https://github.com/rapidsai/cudf/issues/14200 39, 41, 44, @@ -24,6 +25,7 @@ 51, 57, 62, + 64, # FIXME: failing after cudf#14167 and #14079 67, 69, 70,