From 6439e31199dd294435ac1a3005b052dc027a332d Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Mon, 2 Oct 2023 17:41:25 -0400
Subject: [PATCH] Unblock Python test failures (#1220)

* Install latest dask/distributed in CI

* Add dask/distributed git installs to GPU CI envs

* Pin pandas to <2.1 for now

* Bump CI

* Temporarily allow pyarrow string conversion

* Disable p2p and pyarrow string conversion for now

* Remove pandas<2.1 pin

* Fix datetime formatting in compat tests

* cuDF string columns are not compatible with nsmallest/nlargest

* xfail q38 for now

* mark xgboost tests as flaky on macOS

* nudge CI

* xfail q64

* More specific FIXMES for query regressions

* Increase retries count for flaky mac tests
---
 conftest.py                                        |  7 ++-----
 continuous_integration/gpuci/build.sh              | 12 +++++++-----
 continuous_integration/gpuci/environment-3.10.yaml |  6 +++++-
 continuous_integration/gpuci/environment-3.9.yaml  |  6 +++++-
 dask_sql/_compat.py                                |  1 +
 dask_sql/physical/utils/sort.py                    |  7 ++-----
 tests/integration/test_compatibility.py            |  8 ++++++--
 tests/integration/test_model.py                    |  2 ++
 tests/unit/test_queries.py                         |  2 ++
 9 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/conftest.py b/conftest.py
index 172fa3028..62559c061 100644
--- a/conftest.py
+++ b/conftest.py
@@ -12,17 +12,14 @@ def pytest_addoption(parser):
 
 
 def pytest_runtest_setup(item):
-    # TODO: explore adding support for pyarrow string columns
+    # TODO: get pyarrow strings and p2p shuffle working
     dask.config.set({"dataframe.convert-string": False})
+    dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
     if "gpu" in item.keywords:
         if not item.config.getoption("--rungpu"):
             pytest.skip("need --rungpu option to run")
-        # FIXME: P2P shuffle isn't fully supported on GPU, so we must explicitly disable it
-        dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
         # manually enable cudf decimal support
         dask.config.set({"sql.mappings.decimal_support": "cudf"})
-    else:
-        dask.config.set({"dataframe.shuffle.algorithm": None})
     if "queries" in item.keywords and not item.config.getoption("--runqueries"):
         pytest.skip("need --runqueries option to run")
 
diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh
index 790e75540..175fc78da 100644
--- a/continuous_integration/gpuci/build.sh
+++ b/continuous_integration/gpuci/build.sh
@@ -37,6 +37,9 @@ gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate dask_sql
 
+gpuci_logger "Update conda env"
+gpuci_mamba_retry env update -n dask_sql -f continuous_integration/gpuci/environment-${PYTHON_VER}.yaml
+
 gpuci_logger "Install awscli"
 gpuci_mamba_retry install -y -c conda-forge awscli
 
@@ -46,11 +49,10 @@ gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}parquet_2gb_sor
 gpuci_logger "Download query files"
 gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}queries/" tests/unit/queries/ --recursive
 
-# TODO: source install once dask/distributed are unpinned by dask-cuda
-# gpuci_logger "Install dask"
-# python -m pip install git+https://github.com/dask/dask
-# gpuci_logger "Install distributed"
-# python -m pip install git+https://github.com/dask/distributed
+gpuci_logger "Install dask"
+python -m pip install git+https://github.com/dask/dask
+gpuci_logger "Install distributed"
+python -m pip install git+https://github.com/dask/distributed
 
 gpuci_logger "Install dask-sql"
 pip install -e . -vv
diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml
index bd572aa56..bb6d00032 100644
--- a/continuous_integration/gpuci/environment-3.10.yaml
+++ b/continuous_integration/gpuci/environment-3.10.yaml
@@ -7,6 +7,7 @@ channels:
 - nodefaults
 dependencies:
 - c-compiler
+- zlib
 - dask>=2022.3.0
 - fastapi>=0.92.0
 - fugue>=0.7.3
@@ -48,4 +49,7 @@ dependencies:
 - ucx-py=0.34
 - xgboost=*=rapidsai_py*
 - libxgboost=*=rapidsai_h*
-- zlib
+- pip
+- pip:
+    - git+https://github.com/dask/dask
+    - git+https://github.com/dask/distributed
diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml
index c9051fda1..47ca34e24 100644
--- a/continuous_integration/gpuci/environment-3.9.yaml
+++ b/continuous_integration/gpuci/environment-3.9.yaml
@@ -7,6 +7,7 @@ channels:
 - nodefaults
 dependencies:
 - c-compiler
+- zlib
 - dask>=2022.3.0
 - fastapi>=0.92.0
 - fugue>=0.7.3
@@ -48,4 +49,7 @@ dependencies:
 - ucx-py=0.34
 - xgboost=*=rapidsai_py*
 - libxgboost=*=rapidsai_h*
-- zlib
+- pip
+- pip:
+    - git+https://github.com/dask/dask
+    - git+https://github.com/dask/distributed
diff --git a/dask_sql/_compat.py b/dask_sql/_compat.py
index be8cfbae5..9a80ad0d4 100644
--- a/dask_sql/_compat.py
+++ b/dask_sql/_compat.py
@@ -9,6 +9,7 @@
 
 INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0")
 PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0")
+PANDAS_GT_210 = _pandas_version >= parseVersion("2.1.0")
 
 # TODO: remove if prompt-toolkit min version gets bumped
 PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29")
diff --git a/dask_sql/physical/utils/sort.py b/dask_sql/physical/utils/sort.py
index 8ac103ff1..f4299e3dc 100644
--- a/dask_sql/physical/utils/sort.py
+++ b/dask_sql/physical/utils/sort.py
@@ -130,11 +130,8 @@ def is_topk_optimizable(
         sort_num_rows is None
         or not single_ascending
         or any(sort_null_first)
-        # pandas doesnt support nsmallest/nlargest with object dtypes
-        or (
-            "pandas" in str(df._partition_type)
-            and any(df[sort_columns].dtypes == "object")
-        )
+        # pandas/cudf don't support nsmallest/nlargest with object dtypes
+        or any(df[sort_columns].dtypes == "object")
         or (
             sort_num_rows * len(df.columns)
             > dask_config.get("sql.sort.topk-nelem-limit")
diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py
index b34d64bbb..c55c7b327 100644
--- a/tests/integration/test_compatibility.py
+++ b/tests/integration/test_compatibility.py
@@ -18,6 +18,7 @@
 import pytest
 
 from dask_sql import Context
+from dask_sql._compat import PANDAS_GT_210
 from dask_sql.utils import ParsingException
 from tests.utils import assert_eq
 
@@ -28,8 +29,11 @@ def cast_datetime_to_string(df):
     if not cols:
         return df
 
-    # Casting directly to string loses second precision
-    df[cols] = df[cols].astype("object").astype("string")
+    strf = "%Y-%m-%dT%H:%M:%S" if PANDAS_GT_210 else "%Y-%m-%d %H:%M:%S"
+
+    for col in cols:
+        df[col] = df[col].dt.strftime(strf)
+
     return df
 
 
diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py
index 9980fc103..550f9c7ee 100644
--- a/tests/integration/test_model.py
+++ b/tests/integration/test_model.py
@@ -88,6 +88,7 @@ def test_training_and_prediction(c, gpu_client):
     check_trained_model(c, df_name=timeseries)
 
 
+@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'")
 @pytest.mark.xfail(
     sys.platform == "win32",
     reason="'xgboost.core.XGBoostError: Failed to poll' on Windows only",
@@ -626,6 +627,7 @@ def test_mlflow_export(c, tmpdir):
         )
 
 
+@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'")
 def test_mlflow_export_xgboost(c, client, tmpdir):
     # Test only when mlflow & xgboost was installed
     mlflow = pytest.importorskip("mlflow", reason="mlflow not installed")
diff --git a/tests/unit/test_queries.py b/tests/unit/test_queries.py
index bfaedfcee..43b234bba 100644
--- a/tests/unit/test_queries.py
+++ b/tests/unit/test_queries.py
@@ -16,6 +16,7 @@
     28,
     35,
     36,
+    38,  # FIXME: failing due to https://github.com/rapidsai/cudf/issues/14200
     39,
     41,
     44,
@@ -24,6 +25,7 @@
     51,
     57,
     62,
+    64,  # FIXME: failing after cudf#14167 and #14079
     67,
     69,
     70,