Unblock Python test failures (#1220)
* Install latest dask/distributed in CI

* Add dask/distributed git installs to GPU CI envs

* Pin pandas to <2.1 for now

* Bump CI

* Temporarily allow pyarrow string conversion

* Disable p2p and pyarrow string conversion for now

* Remove pandas<2.1 pin

* Fix datetime formatting in compat tests

* cuDF string columns are not compatible with nsmallest/nlargest

* xfail q38 for now

* Mark xgboost tests as flaky on macOS

* Nudge CI

* xfail q64

* More specific FIXMES for query regressions

* Increase retry count for flaky macOS tests
charlesbluca authored Oct 2, 2023
1 parent a71b3eb commit 6439e31
Showing 9 changed files with 32 additions and 19 deletions.
7 changes: 2 additions & 5 deletions conftest.py
@@ -12,17 +12,14 @@ def pytest_addoption(parser):


 def pytest_runtest_setup(item):
-    # TODO: explore adding support for pyarrow string columns
+    # TODO: get pyarrow strings and p2p shuffle working
     dask.config.set({"dataframe.convert-string": False})
+    dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
     if "gpu" in item.keywords:
         if not item.config.getoption("--rungpu"):
             pytest.skip("need --rungpu option to run")
-        # FIXME: P2P shuffle isn't fully supported on GPU, so we must explicitly disable it
-        dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
         # manually enable cudf decimal support
         dask.config.set({"sql.mappings.decimal_support": "cudf"})
-    else:
-        dask.config.set({"dataframe.shuffle.algorithm": None})
     if "queries" in item.keywords and not item.config.getoption("--runqueries"):
         pytest.skip("need --runqueries option to run")

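The net effect of the hook above is easiest to see by reading the configuration back. A minimal standalone sketch (plain dask, outside of pytest):

import dask

# Mirror of the session-wide defaults the hook now applies to every test:
# keep pyarrow-backed strings off and use the task-based shuffle rather
# than p2p, on CPU and GPU runs alike.
dask.config.set({"dataframe.convert-string": False})
dask.config.set({"dataframe.shuffle.algorithm": "tasks"})

assert dask.config.get("dataframe.convert-string") is False
assert dask.config.get("dataframe.shuffle.algorithm") == "tasks"
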
12 changes: 7 additions & 5 deletions continuous_integration/gpuci/build.sh
@@ -37,6 +37,9 @@ gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate dask_sql
 
+gpuci_logger "Update conda env"
+gpuci_mamba_retry env update -n dask_sql -f continuous_integration/gpuci/environment-${PYTHON_VER}.yaml
+
 gpuci_logger "Install awscli"
 gpuci_mamba_retry install -y -c conda-forge awscli
@@ -46,11 +49,10 @@ gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}parquet_2gb_sor
gpuci_logger "Download query files"
gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}queries/" tests/unit/queries/ --recursive

# TODO: source install once dask/distributed are unpinned by dask-cuda
# gpuci_logger "Install dask"
# python -m pip install git+https://github.com/dask/dask
# gpuci_logger "Install distributed"
# python -m pip install git+https://github.com/dask/distributed
gpuci_logger "Install dask"
python -m pip install git+https://github.com/dask/dask
gpuci_logger "Install distributed"
python -m pip install git+https://github.com/dask/distributed

gpuci_logger "Install dask-sql"
pip install -e . -vv
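
A quick way to confirm that the source installs took effect is to check for a development version suffix; a sketch with illustrative version strings, not values taken from CI logs:

import dask
import distributed

# Git installs typically report versions like "2023.9.2+13.g1a2b3c4"
# (illustrative); a bare release string would suggest the pinned conda
# packages were never replaced.
print("dask:", dask.__version__)
print("distributed:", distributed.__version__)
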
6 changes: 5 additions & 1 deletion continuous_integration/gpuci/environment-3.10.yaml
@@ -7,6 +7,7 @@ channels:
   - nodefaults
 dependencies:
   - c-compiler
+  - zlib
   - dask>=2022.3.0
   - fastapi>=0.92.0
   - fugue>=0.7.3
@@ -48,4 +49,7 @@ dependencies:
   - ucx-py=0.34
   - xgboost=*=rapidsai_py*
   - libxgboost=*=rapidsai_h*
-  - zlib
+  - pip
+  - pip:
+      - git+https://github.com/dask/dask
+      - git+https://github.com/dask/distributed
6 changes: 5 additions & 1 deletion continuous_integration/gpuci/environment-3.9.yaml
@@ -7,6 +7,7 @@ channels:
   - nodefaults
 dependencies:
   - c-compiler
+  - zlib
   - dask>=2022.3.0
   - fastapi>=0.92.0
   - fugue>=0.7.3
@@ -48,4 +49,7 @@ dependencies:
   - ucx-py=0.34
   - xgboost=*=rapidsai_py*
   - libxgboost=*=rapidsai_h*
-  - zlib
+  - pip
+  - pip:
+      - git+https://github.com/dask/dask
+      - git+https://github.com/dask/distributed
1 change: 1 addition & 0 deletions dask_sql/_compat.py
@@ -9,6 +9,7 @@

 INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0")
 PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0")
+PANDAS_GT_210 = _pandas_version >= parseVersion("2.1.0")
 
 # TODO: remove if prompt-toolkit min version gets bumped
 PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29")
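
For reference, the new flag follows the same pattern as the surrounding ones: a boolean computed once at import time. A rough sketch, assuming _pandas_version is the parsed pandas.__version__ as the neighboring lines suggest:

import pandas as pd
from packaging.version import parse as parseVersion

_pandas_version = parseVersion(pd.__version__)

# True on pandas 2.1.0 and newer; lets callers branch on behavior that
# changed in that release.
PANDAS_GT_210 = _pandas_version >= parseVersion("2.1.0")
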
7 changes: 2 additions & 5 deletions dask_sql/physical/utils/sort.py
@@ -130,11 +130,8 @@ def is_topk_optimizable(
         sort_num_rows is None
         or not single_ascending
         or any(sort_null_first)
-        # pandas doesnt support nsmallest/nlargest with object dtypes
-        or (
-            "pandas" in str(df._partition_type)
-            and any(df[sort_columns].dtypes == "object")
-        )
+        # pandas/cudf don't support nsmallest/nlargest with object dtypes
+        or any(df[sort_columns].dtypes == "object")
         or (
             sort_num_rows * len(df.columns)
             > dask_config.get("sql.sort.topk-nelem-limit")
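
The broadened guard reflects a limitation shared by pandas and cudf; a quick sketch of the failure mode it now avoids for both backends:

import pandas as pd

df = pd.DataFrame({"x": ["b", "a", "c"]})

try:
    df.nsmallest(2, "x")  # object-dtype column
except TypeError as exc:
    # pandas rejects nsmallest/nlargest on object dtypes with an error
    # along the lines of: "Column 'x' has dtype object, cannot use
    # method 'nsmallest' with this dtype"
    print(exc)
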
8 changes: 6 additions & 2 deletions tests/integration/test_compatibility.py
@@ -18,6 +18,7 @@
 import pytest
 
 from dask_sql import Context
+from dask_sql._compat import PANDAS_GT_210
 from dask_sql.utils import ParsingException
 from tests.utils import assert_eq
 
@@ -28,8 +29,11 @@ def cast_datetime_to_string(df):
     if not cols:
         return df
 
-    # Casting directly to string loses second precision
-    df[cols] = df[cols].astype("object").astype("string")
+    strf = "%Y-%m-%dT%H:%M:%S" if PANDAS_GT_210 else "%Y-%m-%d %H:%M:%S"
+
+    for col in cols:
+        df[col] = df[col].dt.strftime(strf)
 
     return df


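The helper now pins the separator explicitly because the default string rendering of datetimes changed around pandas 2.1 (ISO 8601 with a "T"). A minimal sketch of the two formats the PANDAS_GT_210 branch switches between:

import pandas as pd

s = pd.Series(pd.to_datetime(["2023-10-02 12:34:56"]))

# Explicit strftime makes the output deterministic on every pandas version.
print(s.dt.strftime("%Y-%m-%dT%H:%M:%S")[0])  # 2023-10-02T12:34:56
print(s.dt.strftime("%Y-%m-%d %H:%M:%S")[0])  # 2023-10-02 12:34:56
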
2 changes: 2 additions & 0 deletions tests/integration/test_model.py
@@ -88,6 +88,7 @@ def test_training_and_prediction(c, gpu_client):
     check_trained_model(c, df_name=timeseries)
 
 
+@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'")
 @pytest.mark.xfail(
     sys.platform == "win32",
     reason="'xgboost.core.XGBoostError: Failed to poll' on Windows only",
@@ -626,6 +627,7 @@ def test_mlflow_export(c, tmpdir):
 )
 
 
+@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'")
 def test_mlflow_export_xgboost(c, client, tmpdir):
     # Test only when mlflow & xgboost was installed
     mlflow = pytest.importorskip("mlflow", reason="mlflow not installed")
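
Here pytest.mark.flaky comes from the pytest-rerunfailures plugin, whose condition argument restricts reruns to matching platforms; when given as a string it is evaluated, which is why the test module imports sys. A sketch of the marker in isolation (test name is illustrative):

import sys

import pytest

# Retried up to 8 times, but only where the condition holds (macOS);
# everywhere else the test runs exactly once.
@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'")
def test_occasionally_flaky_on_macos():
    assert True
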
2 changes: 2 additions & 0 deletions tests/unit/test_queries.py
@@ -16,6 +16,7 @@
     28,
     35,
     36,
+    38,  # FIXME: failing due to https://github.com/rapidsai/cudf/issues/14200
     39,
     41,
     44,
@@ -24,6 +25,7 @@
     51,
     57,
     62,
+    64,  # FIXME: failing after cudf#14167 and #14079
     67,
     69,
     70,
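
These IDs extend the module's xfail list. A hypothetical sketch of how such a list typically feeds the parametrized query tests — names and ranges here are illustrative, not the file's actual code:

import pytest

XFAIL_QUERIES = (38, 64)  # illustrative subset

@pytest.mark.parametrize(
    "query_id",
    [
        pytest.param(q, marks=pytest.mark.xfail(reason="known regression"))
        if q in XFAIL_QUERIES
        else q
        for q in range(1, 100)
    ],
)
def test_query(query_id):
    # Placeholder body; the real test would load and run the SQL file.
    assert query_id > 0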
