Unblock Python test failures (#1220)
* Install latest dask/distributed in CI

* Add dask/distributed git installs to GPU CI envs

* Pin pandas to <2.1 for now

* Bump CI

* Temporarily allow pyarrow string conversion

* Disable p2p and pyarrow string conversion for now

* Remove pandas<2.1 pin

* Fix datetime formatting in compat tests

* cuDF string columns are not compatible with nsmallest/nlargest

* xfail q38 for now

* Mark xgboost tests as flaky on macOS

* Nudge CI

* xfail q64

* More specific FIXMES for query regressions

* Increase retry count for flaky macOS tests
charlesbluca authored Oct 2, 2023
1 parent a71b3eb commit 6439e31
Showing 9 changed files with 32 additions and 19 deletions.
7 changes: 2 additions & 5 deletions conftest.py
@@ -12,17 +12,14 @@ def pytest_addoption(parser):


 def pytest_runtest_setup(item):
-    # TODO: explore adding support for pyarrow string columns
+    # TODO: get pyarrow strings and p2p shuffle working
     dask.config.set({"dataframe.convert-string": False})
+    dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
     if "gpu" in item.keywords:
         if not item.config.getoption("--rungpu"):
             pytest.skip("need --rungpu option to run")
-        # FIXME: P2P shuffle isn't fully supported on GPU, so we must explicitly disable it
-        dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
         # manually enable cudf decimal support
         dask.config.set({"sql.mappings.decimal_support": "cudf"})
-    else:
-        dask.config.set({"dataframe.shuffle.algorithm": None})
     if "queries" in item.keywords and not item.config.getoption("--runqueries"):
         pytest.skip("need --runqueries option to run")

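The net effect of the hook above is easiest to see by reading the configuration back. A minimal standalone sketch (plain dask, outside of pytest):

import dask

# Mirror of the session-wide defaults the hook now applies to every test:
# keep pyarrow-backed strings off and use the task-based shuffle rather
# than p2p, on CPU and GPU runs alike.
dask.config.set({"dataframe.convert-string": False})
dask.config.set({"dataframe.shuffle.algorithm": "tasks"})

assert dask.config.get("dataframe.convert-string") is False
assert dask.config.get("dataframe.shuffle.algorithm") == "tasks"
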
12 changes: 7 additions & 5 deletions continuous_integration/gpuci/build.sh
@@ -37,6 +37,9 @@ gpuci_logger "Activate conda env"
 . /opt/conda/etc/profile.d/conda.sh
 conda activate dask_sql
 
+gpuci_logger "Update conda env"
+gpuci_mamba_retry env update -n dask_sql -f continuous_integration/gpuci/environment-${PYTHON_VER}.yaml
+
 gpuci_logger "Install awscli"
 gpuci_mamba_retry install -y -c conda-forge awscli
@@ -46,11 +49,10 @@ gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}parquet_2gb_sor
gpuci_logger "Download query files"
gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}queries/" tests/unit/queries/ --recursive

# TODO: source install once dask/distributed are unpinned by dask-cuda
# gpuci_logger "Install dask"
# python -m pip install git+https://github.com/dask/dask
# gpuci_logger "Install distributed"
# python -m pip install git+https://github.com/dask/distributed
gpuci_logger "Install dask"
python -m pip install git+https://github.com/dask/dask
gpuci_logger "Install distributed"
python -m pip install git+https://github.com/dask/distributed

gpuci_logger "Install dask-sql"
pip install -e . -vv
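
A quick way to confirm that the source installs took effect is to check for a development version suffix; a sketch with illustrative version strings, not values taken from CI logs:

import dask
import distributed

# Git installs typically report versions like "2023.9.2+13.g1a2b3c4"
# (illustrative); a bare release string would suggest the pinned conda
# packages were never replaced.
print("dask:", dask.__version__)
print("distributed:", distributed.__version__)
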
6 changes: 5 additions & 1 deletion continuous_integration/gpuci/environment-3.10.yaml
@@ -7,6 +7,7 @@ channels:
   - nodefaults
 dependencies:
   - c-compiler
+  - zlib
   - dask>=2022.3.0
   - fastapi>=0.92.0
   - fugue>=0.7.3
@@ -48,4 +49,7 @@ dependencies:
   - ucx-py=0.34
   - xgboost=*=rapidsai_py*
   - libxgboost=*=rapidsai_h*
-  - zlib
+  - pip
+  - pip:
+      - git+https://github.com/dask/dask
+      - git+https://github.com/dask/distributed
6 changes: 5 additions & 1 deletion continuous_integration/gpuci/environment-3.9.yaml
@@ -7,6 +7,7 @@ channels:
   - nodefaults
 dependencies:
   - c-compiler
+  - zlib
   - dask>=2022.3.0
   - fastapi>=0.92.0
   - fugue>=0.7.3
@@ -48,4 +49,7 @@ dependencies:
   - ucx-py=0.34
   - xgboost=*=rapidsai_py*
   - libxgboost=*=rapidsai_h*
-  - zlib
+  - pip
+  - pip:
+      - git+https://github.com/dask/dask
+      - git+https://github.com/dask/distributed
1 change: 1 addition & 0 deletions dask_sql/_compat.py
@@ -9,6 +9,7 @@

 INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0")
 PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0")
+PANDAS_GT_210 = _pandas_version >= parseVersion("2.1.0")
 
 # TODO: remove if prompt-toolkit min version gets bumped
 PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29")
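
For reference, the new flag follows the same pattern as the surrounding ones: a boolean computed once at import time. A rough sketch, assuming _pandas_version is the parsed pandas.__version__ as the neighboring lines suggest:

import pandas as pd
from packaging.version import parse as parseVersion

_pandas_version = parseVersion(pd.__version__)

# True on pandas 2.1.0 and newer; lets callers branch on behavior that
# changed in that release.
PANDAS_GT_210 = _pandas_version >= parseVersion("2.1.0")
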
7 changes: 2 additions & 5 deletions dask_sql/physical/utils/sort.py
@@ -130,11 +130,8 @@ def is_topk_optimizable(
         sort_num_rows is None
         or not single_ascending
         or any(sort_null_first)
-        # pandas doesnt support nsmallest/nlargest with object dtypes
-        or (
-            "pandas" in str(df._partition_type)
-            and any(df[sort_columns].dtypes == "object")
-        )
+        # pandas/cudf don't support nsmallest/nlargest with object dtypes
+        or any(df[sort_columns].dtypes == "object")
         or (
             sort_num_rows * len(df.columns)
             > dask_config.get("sql.sort.topk-nelem-limit")
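
The broadened guard reflects a limitation shared by pandas and cudf; a quick sketch of the failure mode it now avoids for both backends:

import pandas as pd

df = pd.DataFrame({"x": ["b", "a", "c"]})

try:
    df.nsmallest(2, "x")  # object-dtype column
except TypeError as exc:
    # pandas rejects nsmallest/nlargest on object dtypes with an error
    # along the lines of: "Column 'x' has dtype object, cannot use
    # method 'nsmallest' with this dtype"
    print(exc)
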
8 changes: 6 additions & 2 deletions tests/integration/test_compatibility.py
@@ -18,6 +18,7 @@
 import pytest
 
 from dask_sql import Context
+from dask_sql._compat import PANDAS_GT_210
 from dask_sql.utils import ParsingException
 from tests.utils import assert_eq
 
@@ -28,8 +29,11 @@ def cast_datetime_to_string(df):
     if not cols:
         return df
 
-    # Casting directly to string loses second precision
-    df[cols] = df[cols].astype("object").astype("string")
+    strf = "%Y-%m-%dT%H:%M:%S" if PANDAS_GT_210 else "%Y-%m-%d %H:%M:%S"
+
+    for col in cols:
+        df[col] = df[col].dt.strftime(strf)
 
     return df


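The helper now pins the separator explicitly because the default string rendering of datetimes changed around pandas 2.1 (ISO 8601 with a "T"). A minimal sketch of the two formats the PANDAS_GT_210 branch switches between:

import pandas as pd

s = pd.Series(pd.to_datetime(["2023-10-02 12:34:56"]))

# Explicit strftime makes the output deterministic on every pandas version.
print(s.dt.strftime("%Y-%m-%dT%H:%M:%S")[0])  # 2023-10-02T12:34:56
print(s.dt.strftime("%Y-%m-%d %H:%M:%S")[0])  # 2023-10-02 12:34:56
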
2 changes: 2 additions & 0 deletions tests/integration/test_model.py
@@ -88,6 +88,7 @@ def test_training_and_prediction(c, gpu_client):
     check_trained_model(c, df_name=timeseries)
 
 
+@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'")
 @pytest.mark.xfail(
     sys.platform == "win32",
     reason="'xgboost.core.XGBoostError: Failed to poll' on Windows only",
@@ -626,6 +627,7 @@ def test_mlflow_export(c, tmpdir):
 )
 
 
+@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'")
 def test_mlflow_export_xgboost(c, client, tmpdir):
     # Test only when mlflow & xgboost was installed
     mlflow = pytest.importorskip("mlflow", reason="mlflow not installed")
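
Here pytest.mark.flaky comes from the pytest-rerunfailures plugin, whose condition argument restricts reruns to matching platforms; when given as a string it is evaluated, which is why the test module imports sys. A sketch of the marker in isolation (test name is illustrative):

import sys

import pytest

# Retried up to 8 times, but only where the condition holds (macOS);
# everywhere else the test runs exactly once.
@pytest.mark.flaky(reruns=8, condition="sys.platform == 'darwin'")
def test_occasionally_flaky_on_macos():
    assert True
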
2 changes: 2 additions & 0 deletions tests/unit/test_queries.py
@@ -16,6 +16,7 @@
     28,
     35,
     36,
+    38,  # FIXME: failing due to https://github.com/rapidsai/cudf/issues/14200
     39,
     41,
     44,
@@ -24,6 +25,7 @@
     51,
     57,
     62,
+    64,  # FIXME: failing after cudf#14167 and #14079
     67,
     69,
     70,
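
These IDs extend the module's xfail list. A hypothetical sketch of how such a list typically feeds the parametrized query tests — names and ranges here are illustrative, not the file's actual code:

import pytest

XFAIL_QUERIES = (38, 64)  # illustrative subset

@pytest.mark.parametrize(
    "query_id",
    [
        pytest.param(q, marks=pytest.mark.xfail(reason="known regression"))
        if q in XFAIL_QUERIES
        else q
        for q in range(1, 100)
    ],
)
def test_query(query_id):
    # Placeholder body; the real test would load and run the SQL file.
    assert query_id > 0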
