From 670643331dbabd9abb0612da6e0428d0b524ee90 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 1 Mar 2024 15:15:38 -0500 Subject: [PATCH] Pin dask to 2024.1.1 (#1301) * Bump dask min version to 2023.6.0 * Remove dask compat code * Linting * Bump pyarrow and uvicorn deps to unblock environment solve * Undo unintentional pytest mindep change * Pin to dask 2024.1.1 * Tighten sklearn xfail in test_model.py * Drop tpot from 3.12 CI deps * Explicitly add xgboost to testing deps --- continuous_integration/docker/conda.txt | 4 ++-- continuous_integration/docker/main.dockerfile | 4 ++-- continuous_integration/environment-3.10.yaml | 5 +++-- continuous_integration/environment-3.11.yaml | 5 +++-- continuous_integration/environment-3.12.yaml | 8 +++++--- continuous_integration/environment-3.9.yaml | 5 +++-- .../gpuci/environment-3.10.yaml | 5 +++-- .../gpuci/environment-3.9.yaml | 5 +++-- continuous_integration/recipe/meta.yaml | 4 ++-- dask_sql/_compat.py | 11 ----------- dask_sql/physical/rel/logical/join.py | 9 --------- dask_sql/physical/rex/core/call.py | 15 +++++---------- dask_sql/physical/utils/filter.py | 6 ------ docs/environment.yml | 4 ++-- docs/requirements-docs.txt | 4 ++-- pyproject.toml | 6 +++--- tests/integration/test_filter.py | 9 --------- tests/integration/test_join.py | 5 ----- tests/integration/test_model.py | 10 +++++++--- tests/integration/test_rex.py | 19 ++----------------- tests/unit/test_utils.py | 9 --------- 21 files changed, 47 insertions(+), 105 deletions(-) diff --git a/continuous_integration/docker/conda.txt b/continuous_integration/docker/conda.txt index eefd4351c..64892c882 100644 --- a/continuous_integration/docker/conda.txt +++ b/continuous_integration/docker/conda.txt @@ -1,5 +1,5 @@ python>=3.9 -dask>=2022.3.0 +dask==2024.1.1 pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 @@ -12,7 +12,7 @@ sphinx>=3.2.1 tzlocal>=2.1 fastapi>=0.92.0 httpx>=0.24.1 -uvicorn>=0.13.4 +uvicorn>=0.14 pyarrow>=14.0.1 prompt_toolkit>=3.0.8 pygments>=2.7.1 diff --git a/continuous_integration/docker/main.dockerfile b/continuous_integration/docker/main.dockerfile index 202f38c95..2a8c2ed5d 100644 --- a/continuous_integration/docker/main.dockerfile +++ b/continuous_integration/docker/main.dockerfile @@ -16,11 +16,11 @@ RUN mamba install -y \ # build requirements "maturin>=1.3,<1.4" \ # core dependencies - "dask>=2022.3.0" \ + "dask==2024.1.1" \ "pandas>=1.4.0" \ "fastapi>=0.92.0" \ "httpx>=0.24.1" \ - "uvicorn>=0.13.4" \ + "uvicorn>=0.14" \ "tzlocal>=2.1" \ "prompt_toolkit>=3.0.8" \ "pygments>=2.7.1" \ diff --git a/continuous_integration/environment-3.10.yaml b/continuous_integration/environment-3.10.yaml index 503133d35..94059ac5a 100644 --- a/continuous_integration/environment-3.10.yaml +++ b/continuous_integration/environment-3.10.yaml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2022.3.0 +- dask==2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -26,6 +26,7 @@ dependencies: - pytest-xdist - pytest - python=3.10 +- py-xgboost>=1.7.0 - scikit-learn>=1.0.0 - sphinx - sqlalchemy @@ -33,5 +34,5 @@ dependencies: # FIXME: https://github.com/fugue-project/fugue/issues/526 - triad<0.9.2 - tzlocal>=2.1 -- uvicorn>=0.13.4 +- uvicorn>=0.14 - zlib diff --git a/continuous_integration/environment-3.11.yaml b/continuous_integration/environment-3.11.yaml index afd0cd696..5074dd695 100644 --- a/continuous_integration/environment-3.11.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2022.3.0 +- dask==2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -26,6 +26,7 @@ dependencies: - pytest-xdist - pytest - python=3.11 +- py-xgboost>=1.7.0 - scikit-learn>=1.0.0 - sphinx - sqlalchemy @@ -33,5 +34,5 @@ dependencies: # FIXME: https://github.com/fugue-project/fugue/issues/526 - triad<0.9.2 - tzlocal>=2.1 -- uvicorn>=0.13.4 +- uvicorn>=0.14 - zlib diff --git a/continuous_integration/environment-3.12.yaml b/continuous_integration/environment-3.12.yaml index dda03f444..390735f83 100644 --- a/continuous_integration/environment-3.12.yaml +++ b/continuous_integration/environment-3.12.yaml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - c-compiler -- dask>=2022.3.0 +- dask==2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -27,12 +27,14 @@ dependencies: - pytest-xdist - pytest - python=3.12 +- py-xgboost>=1.7.0 - scikit-learn>=1.0.0 - sphinx - sqlalchemy -- tpot>=0.12.0 +# TODO: add once tpot supports python 3.12 +# - tpot>=0.12.0 # FIXME: https://github.com/fugue-project/fugue/issues/526 - triad<0.9.2 - tzlocal>=2.1 -- uvicorn>=0.13.4 +- uvicorn>=0.14 - zlib diff --git a/continuous_integration/environment-3.9.yaml b/continuous_integration/environment-3.9.yaml index b6f57e069..015e829d0 100644 --- a/continuous_integration/environment-3.9.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - c-compiler -- dask=2022.3.0 +- dask=2024.1.1 - fastapi=0.92.0 - fugue=0.7.3 - httpx=0.24.1 @@ -26,6 +26,7 @@ dependencies: - pytest-xdist - pytest - python=3.9 +- py-xgboost=1.7.0 - scikit-learn=1.0.0 - sphinx # TODO: remove this constraint when we require pandas>2 @@ -34,5 +35,5 @@ dependencies: # FIXME: https://github.com/fugue-project/fugue/issues/526 - triad<0.9.2 - tzlocal=2.1 -- uvicorn=0.13.4 +- uvicorn=0.14 - zlib diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 7d59d05f1..8cb539352 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -9,7 +9,7 @@ channels: dependencies: - c-compiler - zlib -- dask>=2022.3.0 +- dask==2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -32,6 +32,7 @@ dependencies: - pytest-xdist - pytest - python=3.10 +- py-xgboost>=1.7.0 - scikit-learn>=1.0.0 - sphinx - sqlalchemy @@ -39,7 +40,7 @@ dependencies: # FIXME: https://github.com/fugue-project/fugue/issues/526 - triad<0.9.2 - tzlocal>=2.1 -- uvicorn>=0.13.4 +- uvicorn>=0.14 # GPU-specific requirements - cudatoolkit=11.8 - cudf=24.04 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 3ec47d753..8fa8c7f6d 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -9,7 +9,7 @@ channels: dependencies: - c-compiler - zlib -- dask>=2022.3.0 +- dask==2024.1.1 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -32,6 +32,7 @@ dependencies: - pytest-xdist - pytest - python=3.9 +- py-xgboost>=1.7.0 - scikit-learn>=1.0.0 - sphinx - sqlalchemy @@ -39,7 +40,7 @@ dependencies: # FIXME: https://github.com/fugue-project/fugue/issues/526 - triad<0.9.2 - tzlocal>=2.1 -- uvicorn>=0.13.4 +- uvicorn>=0.14 # GPU-specific requirements - cudatoolkit=11.8 - cudf=24.04 diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 60a5aa299..914f9da0b 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,11 +32,11 @@ requirements: - xz # [linux64] run: - python - - dask >=2022.3.0 + - dask ==2024.1.1 - pandas >=1.4.0 - fastapi >=0.92.0 - httpx >=0.24.1 - - uvicorn >=0.13.4 + - uvicorn >=0.14 - tzlocal >=2.1 - prompt-toolkit >=3.0.8 - pygments >=2.7.1 diff --git a/dask_sql/_compat.py b/dask_sql/_compat.py index be8cfbae5..c637ef385 100644 --- a/dask_sql/_compat.py +++ b/dask_sql/_compat.py @@ -1,23 +1,12 @@ -import dask import pandas as pd import prompt_toolkit from packaging.version import parse as parseVersion _pandas_version = parseVersion(pd.__version__) _prompt_toolkit_version = parseVersion(prompt_toolkit.__version__) -_dask_version = parseVersion(dask.__version__) INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0") PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0") # TODO: remove if prompt-toolkit min version gets bumped PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29") - -# TODO: remove when dask min version gets bumped -BROADCAST_JOIN_SUPPORT_WORKING = _dask_version > parseVersion("2023.1.0") - -# Parquet predicate-support version checks -PQ_NOT_IN_SUPPORT = parseVersion(dask.__version__) > parseVersion("2023.5.1") -PQ_IS_SUPPORT = parseVersion(dask.__version__) >= parseVersion("2023.3.1") - -DASK_CUDF_TODATETIME_SUPPORT = _dask_version >= parseVersion("2023.5.1") diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index 374c74420..06bb34ca3 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -9,7 +9,6 @@ from dask.base import tokenize from dask.highlevelgraph import HighLevelGraph -from dask_sql._compat import BROADCAST_JOIN_SUPPORT_WORKING from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rel.logical.filter import filter_or_scalar @@ -259,14 +258,6 @@ def _join_on_columns( added_columns = list(lhs_columns_to_add.keys()) broadcast = dask_config.get("sql.join.broadcast") - if not BROADCAST_JOIN_SUPPORT_WORKING and ( - isinstance(broadcast, float) or broadcast - ): - warnings.warn( - "Broadcast Joins may not work as expected with dask<2023.1.1" - "For more information refer to https://github.com/dask/dask/issues/9851" - " and https://github.com/dask/dask/issues/9870" - ) if join_type == "leftanti" and not is_cudf_type(df_lhs_with_tmp): df = df_lhs_with_tmp.merge( df_rhs_with_tmp, diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index a6e3ac98e..ae17b2027 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -16,7 +16,6 @@ from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data -from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import ( @@ -964,15 +963,11 @@ def date_part(self, what, df: SeriesOrScalar): elif what in {"YEAR", "YEARS"}: return df.year elif what == "DATE": - if isinstance(df, pd.Timestamp): - return df.date() - else: - if is_cudf_type(df) and not DASK_CUDF_TODATETIME_SUPPORT: - raise RuntimeError( - "Dask-cuDF to_datetime support requires Dask version >= 2023.5.1" - ) - else: - return dd.to_datetime(df.strftime("%Y-%m-%d")) + return ( + df.date() + if isinstance(df, pd.Timestamp) + else dd.to_datetime(df.strftime("%Y-%m-%d")) + ) else: raise NotImplementedError(f"Extraction of {what} is not (yet) implemented.") diff --git a/dask_sql/physical/utils/filter.py b/dask_sql/physical/utils/filter.py index ae564244d..aff9ab5ef 100644 --- a/dask_sql/physical/utils/filter.py +++ b/dask_sql/physical/utils/filter.py @@ -11,8 +11,6 @@ from dask.layers import DataFrameIOLayer from dask.utils import M, apply, is_arraylike -from dask_sql._compat import PQ_IS_SUPPORT, PQ_NOT_IN_SUPPORT - logger = logging.getLogger(__name__) @@ -501,8 +499,6 @@ def _get_blockwise_input(input_index, indices: list, dsk: RegenerableGraph): def _inv(symbol: str): - if symbol == "in" and not PQ_NOT_IN_SUPPORT: - raise ValueError("This version of dask does not support 'not in'") return { ">": "<", "<": ">", @@ -568,8 +564,6 @@ def _blockwise_isin_dnf(op, indices: list, dsk: RegenerableGraph) -> DNF: def _blockwise_isna_dnf(op, indices: list, dsk: RegenerableGraph) -> DNF: # Return DNF expression pattern for `isna` - if not PQ_IS_SUPPORT: - raise ValueError("This version of dask does not support 'is' predicates.") left = _get_blockwise_input(0, indices, dsk) return DNF((left, "is", None)) diff --git a/docs/environment.yml b/docs/environment.yml index 2d0e08ba0..98b2f0f08 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -6,14 +6,14 @@ dependencies: - sphinx>=4.0.0 - sphinx-tabs - dask-sphinx-theme>=2.0.3 - - dask>=2022.3.0 + - dask==2024.1.1 - pandas>=1.4.0 - fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 - triad<0.9.2 - fastapi>=0.92.0 - httpx>=0.24.1 - - uvicorn>=0.13.4 + - uvicorn>=0.14 - tzlocal>=2.1 - prompt_toolkit>=3.0.8 - pygments>=2.7.1 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 1f2052a92..689599446 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,14 +1,14 @@ sphinx>=4.0.0 sphinx-tabs dask-sphinx-theme>=3.0.0 -dask>=2022.3.0 +dask==2024.1.1 pandas>=1.4.0 fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 triad<0.9.2 fastapi>=0.92.0 httpx>=0.24.1 -uvicorn>=0.13.4 +uvicorn>=0.14 tzlocal>=2.1 prompt_toolkit>=3.0.8 pygments>=2.7.1 diff --git a/pyproject.toml b/pyproject.toml index c2e26c823..bcbd0a06e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,12 +27,12 @@ classifiers = [ readme = "README.md" requires-python = ">=3.9" dependencies = [ - "dask[dataframe]>=2022.3.0", - "distributed>=2022.3.0", + "dask[dataframe]==2024.1.1", + "distributed==2024.1.1", "pandas>=1.4.0", "fastapi>=0.92.0", "httpx>=0.24.1", - "uvicorn>=0.13.4", + "uvicorn>=0.14", "tzlocal>=2.1", "prompt_toolkit>=3.0.8", "pygments>=2.7.1", diff --git a/tests/integration/test_filter.py b/tests/integration/test_filter.py index cede43185..41c51d5fb 100644 --- a/tests/integration/test_filter.py +++ b/tests/integration/test_filter.py @@ -5,7 +5,6 @@ from dask.utils_test import hlg_layer from packaging.version import parse as parseVersion -from dask_sql._compat import PQ_IS_SUPPORT, PQ_NOT_IN_SUPPORT from tests.utils import assert_eq DASK_GT_2022_4_2 = parseVersion(dask.__version__) >= parseVersion("2022.4.2") @@ -182,10 +181,6 @@ def test_filter_year(c): "SELECT * FROM parquet_ddf WHERE b NOT IN (1, 3, 5, 6)", lambda x: x[~x["b"].isin([1, 3, 5, 6])], [[("b", "not in", (1, 3, 5, 6))]], - marks=pytest.mark.skipif( - not PQ_NOT_IN_SUPPORT, - reason="Requires https://github.com/dask/dask/pull/10320", - ), ), ( "SELECT a FROM parquet_ddf WHERE (b > 5 AND b < 10) OR a = 1", @@ -317,10 +312,6 @@ def test_filter_decimal(c, gpu): c.drop_table("df") -@pytest.mark.skipif( - not PQ_IS_SUPPORT, - reason="Requires https://github.com/dask/dask/pull/10320", -) def test_predicate_pushdown_isna(tmpdir): from dask_sql.context import Context diff --git a/tests/integration/test_join.py b/tests/integration/test_join.py index 8254ccbfe..e6257ca02 100644 --- a/tests/integration/test_join.py +++ b/tests/integration/test_join.py @@ -5,7 +5,6 @@ from dask.utils_test import hlg_layer from dask_sql import Context -from dask_sql._compat import BROADCAST_JOIN_SUPPORT_WORKING from dask_sql.datacontainer import Statistics from tests.utils import assert_eq @@ -524,10 +523,6 @@ def test_join_reorder(c): assert_eq(result_df, expected_df, check_index=False) -@pytest.mark.xfail( - not BROADCAST_JOIN_SUPPORT_WORKING, - reason="Broadcast Joins do not work as expected with dask<2023.1.1", -) @pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_broadcast_join(c, client, gpu): df1 = dd.from_pandas( diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 1dcba616b..4ef441f23 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -20,7 +20,7 @@ sklearn = pytest.importorskip("sklearn") -SKLEARN_GT_130 = parseVersion(sklearn.__version__) >= parseVersion("1.4") +SKLEARN_EQ_140 = parseVersion(sklearn.__version__) == parseVersion("1.4.0") def check_trained_model(c, model_name="my_model", df_name="timeseries"): @@ -909,7 +909,9 @@ def test_ml_experiment(c, client): ) -@pytest.mark.xfail(reason="tpot is broken with sklearn>=1.4", condition=SKLEARN_GT_130) +@pytest.mark.xfail( + reason="tpot is broken with sklearn==1.4.0", condition=SKLEARN_EQ_140 +) def test_experiment_automl_classifier(c, client): tpot = pytest.importorskip("tpot", reason="tpot not installed") @@ -934,7 +936,9 @@ def test_experiment_automl_classifier(c, client): check_trained_model(c, "my_automl_exp1") -@pytest.mark.xfail(reason="tpot is broken with sklearn>=1.4", condition=SKLEARN_GT_130) +@pytest.mark.xfail( + reason="tpot is broken with sklearn==1.4.0", condition=SKLEARN_EQ_140 +) def test_experiment_automl_regressor(c, client): tpot = pytest.importorskip("tpot", reason="tpot not installed") diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 5f0af726e..b02607f61 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -5,7 +5,6 @@ import pandas as pd import pytest -from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT from tests.utils import assert_eq @@ -1050,14 +1049,7 @@ def test_totimestamp(c, gpu): False, pytest.param( True, - marks=( - pytest.mark.gpu, - pytest.mark.xfail( - not DASK_CUDF_TODATETIME_SUPPORT, - reason="Requires https://github.com/dask/dask/pull/9881", - raises=RuntimeError, - ), - ), + marks=(pytest.mark.gpu,), ), ], ) @@ -1114,14 +1106,7 @@ def test_extract_date(c, gpu): False, pytest.param( True, - marks=( - pytest.mark.gpu, - pytest.mark.xfail( - not DASK_CUDF_TODATETIME_SUPPORT, - reason="Requires https://github.com/dask/dask/pull/9881", - raises=RuntimeError, - ), - ), + marks=(pytest.mark.gpu,), ), ], ) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 7c5df717d..6dac75837 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -3,7 +3,6 @@ from dask import dataframe as dd from dask.utils_test import hlg_layer -from dask_sql._compat import PQ_IS_SUPPORT, PQ_NOT_IN_SUPPORT from dask_sql.physical.utils.filter import attempt_predicate_pushdown from dask_sql.utils import Pluggable, is_frame @@ -84,10 +83,6 @@ def test_predicate_pushdown_logical(parquet_ddf): assert got_filters == expected_filters -@pytest.mark.skipif( - not PQ_NOT_IN_SUPPORT, - reason="Requires https://github.com/dask/dask/pull/10320", -) def test_predicate_pushdown_in(parquet_ddf): filtered_df = parquet_ddf[ (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) @@ -108,10 +103,6 @@ def test_predicate_pushdown_in(parquet_ddf): assert got_filters == expected_filters -@pytest.mark.skipif( - not PQ_IS_SUPPORT, - reason="Requires dask>=2023.3.1", -) def test_predicate_pushdown_isna(parquet_ddf): filtered_df = parquet_ddf[ (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2)