From ce51df344ae8f91be6f2a7563d2e150b8741125a Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 9 Jan 2024 11:16:27 -0500 Subject: [PATCH] Drop python 3.8, add 3.11 & 3.12 to testing (#1143) * Initial python version bump in CI * Add libprotobuf to GPU CI environments * Replace mentions of old env files * Remove strict channel priority to try to unblock env solves? * Establish minimum version for mlflow * Revert "Remove strict channel priority to try to unblock env solves?" This reverts commit e4548339f7284915ed6a5eb79121ed6a2ae260d3. * Try strict channel priority without nodefaults * Bump mlflow min version to fix windows failures * Build python 3.11 wheels * Run wheel builds in PR test * Try protoc action in wheels build to unblock * Skip hive testing on 3.11 for now * Fix workflow syntax errors * Stop running wheel CI * Bump pyo3 abi minor version * Initial run of pyupgrade to py39 * Continue marking test_dask_fsql as flaky * More places to drop 3.8 * Try running tests on python 3.12 * Add environment file * Skip sasl installation * Drop protoc build dep * Drop mlflow constraint * Set min version for mlflow * Drop mlflow from 3.12 tests for now * Relocate docker/server files to continuous_integration * Unpin dask/distributed * unpin 3.9 gpu environment * add 3.12 to classifiers * unpin dask in gpuci 3.9 --- .github/workflows/conda.yml | 4 +- .github/workflows/docker.yml | 6 +-- .github/workflows/test-upstream.yml | 11 ++---- .github/workflows/test.yml | 13 +++---- CONTRIBUTING.md | 2 +- Cargo.toml | 2 +- README.md | 2 +- .../docker}/cloud.dockerfile | 0 .../docker}/conda.txt | 4 +- .../docker}/main.dockerfile | 6 +-- ...nt-3.10-dev.yaml => environment-3.10.yaml} | 7 +--- ...ent-3.9-dev.yaml => environment-3.11.yaml} | 9 ++--- continuous_integration/environment-3.12.yaml | 38 +++++++++++++++++++ ...ment-3.8-dev.yaml => environment-3.9.yaml} | 11 ++---- .../gpuci/environment-3.10.yaml | 15 +++----- .../gpuci/environment-3.9.yaml | 15 +++----- continuous_integration/recipe/meta.yaml | 2 +- .../scripts}/startup_script.py | 0 dask_sql/context.py | 18 ++++----- dask_sql/datacontainer.py | 30 +++++++-------- dask_sql/integrations/fugue.py | 4 +- dask_sql/integrations/ipython.py | 4 +- dask_sql/physical/rel/base.py | 4 +- dask_sql/physical/rel/custom/wrappers.py | 8 ++-- dask_sql/physical/rel/logical/aggregate.py | 18 ++++----- dask_sql/physical/rel/logical/join.py | 8 ++-- dask_sql/physical/rel/logical/table_scan.py | 2 +- dask_sql/physical/rel/logical/window.py | 24 ++++++------ dask_sql/physical/utils/groupby.py | 4 +- dask_sql/physical/utils/sort.py | 22 +++++------ dask_sql/physical/utils/statistics.py | 5 +-- dask_sql/utils.py | 6 +-- docs/environment.yml | 3 +- docs/requirements-docs.txt | 2 +- docs/source/installation.rst | 2 +- docs/source/server.rst | 2 +- pyproject.toml | 9 +++-- tests/integration/test_cmd.py | 3 +- tests/integration/test_fugue.py | 2 +- tests/integration/test_model.py | 2 +- tests/unit/test_ml_utils.py | 2 +- 41 files changed, 172 insertions(+), 159 deletions(-) rename {docker => continuous_integration/docker}/cloud.dockerfile (100%) rename {docker => continuous_integration/docker}/conda.txt (89%) rename {docker => continuous_integration/docker}/main.dockerfile (89%) rename continuous_integration/{environment-3.10-dev.yaml => environment-3.10.yaml} (77%) rename continuous_integration/{environment-3.9-dev.yaml => environment-3.11.yaml} (75%) create mode 100644 continuous_integration/environment-3.12.yaml rename continuous_integration/{environment-3.8-dev.yaml => environment-3.9.yaml} (73%) rename {scripts => continuous_integration/scripts}/startup_script.py (100%) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index a5ba2f1c6..549b3411a 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -31,7 +31,7 @@ jobs: strategy: fail-fast: false matrix: - python: ["3.8", "3.9", "3.10"] + python: ["3.9", "3.10", "3.11", "3.12"] arch: ["linux-64", "linux-aarch64"] steps: - name: Manage disk space @@ -72,7 +72,7 @@ jobs: with: miniforge-variant: Mambaforge use-mamba: true - python-version: "3.8" + python-version: "3.9" channel-priority: strict - name: Install dependencies run: | diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 658dc93e6..8b59c589f 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -11,7 +11,7 @@ on: - Cargo.toml - Cargo.lock - pyproject.toml - - docker/** + - continuous_integration/docker/** - .github/workflows/docker.yml # When this workflow is queued, automatically cancel any previous running @@ -47,7 +47,7 @@ jobs: uses: docker/build-push-action@v5 with: context: . - file: ./docker/main.dockerfile + file: ./continuous_integration/docker/main.dockerfile build-args: DOCKER_META_VERSION=${{ steps.docker_meta_main.outputs.version }} platforms: ${{ matrix.platform }} tags: ${{ steps.docker_meta_main.outputs.tags }} @@ -68,7 +68,7 @@ jobs: uses: docker/build-push-action@v5 with: context: . - file: ./docker/cloud.dockerfile + file: ./continuous_integration/docker/cloud.dockerfile build-args: DOCKER_META_VERSION=${{ steps.docker_meta_main.outputs.version }} platforms: ${{ matrix.platform }} tags: ${{ steps.docker_meta_cloud.outputs.tags }} diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index e84387296..f9c08bade 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -36,21 +36,21 @@ jobs: name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }})" runs-on: ${{ matrix.os }} env: - CONDA_FILE: continuous_integration/environment-${{ matrix.python }}-dev.yaml + CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }} strategy: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python: ["3.8", "3.9", "3.10"] + python: ["3.9", "3.10", "3.11", "3.12"] distributed: [false] include: # run tests on a distributed client - os: "ubuntu-latest" - python: "3.8" + python: "3.9" distributed: true - os: "ubuntu-latest" - python: "3.10" + python: "3.11" distributed: true steps: - uses: actions/checkout@v4 @@ -75,7 +75,6 @@ jobs: - name: Install hive testing dependencies if: matrix.os == 'ubuntu-latest' run: | - mamba install -c conda-forge "sasl>=0.3.1" docker pull bde2020/hive:2.3.2-postgresql-metastore docker pull bde2020/hive-metastore-postgresql:2.3.0 - name: Install upstream dev Dask @@ -109,8 +108,6 @@ jobs: with: miniforge-variant: Mambaforge use-mamba: true - # TODO: drop support for python 3.8, add support for python 3.11 - # https://github.com/dask-contrib/dask-sql/pull/1143 python-version: "3.9" channel-priority: strict - name: Optionally update upstream cargo dependencies diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b45f924b8..ef1398881 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,21 +37,21 @@ jobs: needs: [detect-ci-trigger] runs-on: ${{ matrix.os }} env: - CONDA_FILE: continuous_integration/environment-${{ matrix.python }}-dev.yaml + CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }} strategy: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python: ["3.8", "3.9", "3.10"] + python: ["3.9", "3.10", "3.11", "3.12"] distributed: [false] include: # run tests on a distributed client - os: "ubuntu-latest" - python: "3.8" + python: "3.9" distributed: true - os: "ubuntu-latest" - python: "3.10" + python: "3.11" distributed: true steps: - uses: actions/checkout@v4 @@ -76,7 +76,6 @@ jobs: - name: Install hive testing dependencies if: matrix.os == 'ubuntu-latest' run: | - mamba install -c conda-forge "sasl>=0.3.1" docker pull bde2020/hive:2.3.2-postgresql-metastore docker pull bde2020/hive-metastore-postgresql:2.3.0 - name: Optionally install upstream dev Dask @@ -107,9 +106,7 @@ jobs: with: miniforge-variant: Mambaforge use-mamba: true - # TODO: drop support for python 3.8, add support for python 3.11 - # https://github.com/dask-contrib/dask-sql/pull/1143 - python-version: ${{ needs.detect-ci-trigger.outputs.triggered == 'true' && '3.9' || '3.8' }} + python-version: "3.9" channel-priority: strict - name: Install dependencies and nothing else run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f5adc85a..3c14cd7f2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,7 +19,7 @@ rustup update To initialize and activate the conda environment for a given Python version: ``` -conda env create -f dask-sql/continuous_integration/environment-{$PYTHON_VER}-dev.yaml +conda env create -f dask-sql/continuous_integration/environment-{$PYTHON_VER}.yaml conda activate dask-sql ``` diff --git a/Cargo.toml b/Cargo.toml index 24b1db8b0..1ac50cf57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ async-trait = "0.1.74" datafusion-python = { git = "https://github.com/apache/arrow-datafusion-python.git", ref = "da6c183" } env_logger = "0.10" log = "^0.4" -pyo3 = { version = "0.19.2", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.19.2", features = ["extension-module", "abi3", "abi3-py39"] } pyo3-log = "0.9.0" [build-dependencies] diff --git a/README.md b/README.md index d08aa0328..7359aae2d 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ If you want to have the newest (unreleased) `dask-sql` version or if you plan to Create a new conda environment and install the development environment: - conda env create -f continuous_integration/environment-3.9-dev.yaml + conda env create -f continuous_integration/environment-3.9.yaml It is not recommended to use `pip` instead of `conda` for the environment setup. diff --git a/docker/cloud.dockerfile b/continuous_integration/docker/cloud.dockerfile similarity index 100% rename from docker/cloud.dockerfile rename to continuous_integration/docker/cloud.dockerfile diff --git a/docker/conda.txt b/continuous_integration/docker/conda.txt similarity index 89% rename from docker/conda.txt rename to continuous_integration/docker/conda.txt index 492baaa66..270c2febd 100644 --- a/docker/conda.txt +++ b/continuous_integration/docker/conda.txt @@ -1,5 +1,5 @@ -python>=3.8 -dask>=2022.3.0,<=2023.11.0 +python>=3.9 +dask>=2022.3.0 pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 diff --git a/docker/main.dockerfile b/continuous_integration/docker/main.dockerfile similarity index 89% rename from docker/main.dockerfile rename to continuous_integration/docker/main.dockerfile index 98fb970b4..78cd46938 100644 --- a/docker/main.dockerfile +++ b/continuous_integration/docker/main.dockerfile @@ -11,12 +11,12 @@ RUN sh /rustup-init.sh -y --default-toolchain=stable --profile=minimal \ ENV PATH="/root/.cargo/bin:${PATH}" # Install conda dependencies for dask-sql -COPY docker/conda.txt /opt/dask_sql/ +COPY continuous_integration/docker/conda.txt /opt/dask_sql/ RUN mamba install -y \ # build requirements "maturin>=1.3,<1.4" \ # core dependencies - "dask>=2022.3.0,<=2023.11.0" \ + "dask>=2022.3.0" \ "pandas>=1.4.0" \ "fastapi>=0.92.0" \ "httpx>=0.24.1" \ @@ -44,7 +44,7 @@ RUN cd /opt/dask_sql/ \ && CONDA_PREFIX="/opt/conda/" maturin develop # Set the script to execute -COPY scripts/startup_script.py /opt/dask_sql/startup_script.py +COPY continuous_integration/scripts/startup_script.py /opt/dask_sql/startup_script.py EXPOSE 8080 ENTRYPOINT [ "/usr/bin/prepare.sh", "/opt/conda/bin/python", "/opt/dask_sql/startup_script.py" ] diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10.yaml similarity index 77% rename from continuous_integration/environment-3.10-dev.yaml rename to continuous_integration/environment-3.10.yaml index bca6e8c66..b0557a915 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10.yaml @@ -1,10 +1,9 @@ name: dask-sql channels: - conda-forge -- nodefaults dependencies: - c-compiler -- dask>=2022.3.0,<=2023.11.0 +- dask>=2022.3.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -12,9 +11,7 @@ dependencies: - jsonschema - lightgbm - maturin>=1.3,<1.4 -# FIXME: mlflow 2.6.0 has import issues related to pydantic -# https://github.com/mlflow/mlflow/issues/9331 -- mlflow<2.6 +- mlflow>=2.0 - mock - numpy>=1.21.6 - pandas>=1.4.0 diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.11.yaml similarity index 75% rename from continuous_integration/environment-3.9-dev.yaml rename to continuous_integration/environment-3.11.yaml index 730ed0129..1bcf46d45 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -1,10 +1,9 @@ name: dask-sql channels: - conda-forge -- nodefaults dependencies: - c-compiler -- dask>=2022.3.0,<=2023.11.0 +- dask>=2022.3.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -12,9 +11,7 @@ dependencies: - jsonschema - lightgbm - maturin>=1.3,<1.4 -# FIXME: mlflow 2.6.0 has import issues related to pydantic -# https://github.com/mlflow/mlflow/issues/9331 -- mlflow<2.6 +- mlflow>=2.0 - mock - numpy>=1.21.6 - pandas>=1.4.0 @@ -28,7 +25,7 @@ dependencies: - pytest-rerunfailures - pytest-xdist - pytest -- python=3.9 +- python=3.11 - scikit-learn>=1.0.0 - sphinx - sqlalchemy<2 diff --git a/continuous_integration/environment-3.12.yaml b/continuous_integration/environment-3.12.yaml new file mode 100644 index 000000000..18a67409b --- /dev/null +++ b/continuous_integration/environment-3.12.yaml @@ -0,0 +1,38 @@ +name: dask-sql +channels: +- conda-forge +dependencies: +- c-compiler +- dask>=2022.3.0 +- fastapi>=0.92.0 +- fugue>=0.7.3 +- httpx>=0.24.1 +- intake>=0.6.0 +- jsonschema +- lightgbm +- maturin>=1.3,<1.4 +# TODO: add once mlflow 3.12 builds are available +# - mlflow>=2.0 +- mock +- numpy>=1.21.6 +- pandas>=1.4.0 +- pre-commit +- prompt_toolkit>=3.0.8 +- psycopg2 +- pyarrow>=6.0.2 +- pygments>=2.7.1 +- pyhive +- pytest-cov +- pytest-rerunfailures +- pytest-xdist +- pytest +- python=3.12 +- scikit-learn>=1.0.0 +- sphinx +- sqlalchemy<2 +- tpot>=0.12.0 +# FIXME: https://github.com/fugue-project/fugue/issues/526 +- triad<0.9.2 +- tzlocal>=2.1 +- uvicorn>=0.13.4 +- zlib diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.9.yaml similarity index 73% rename from continuous_integration/environment-3.8-dev.yaml rename to continuous_integration/environment-3.9.yaml index d09ea9b64..a627318c1 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -1,7 +1,6 @@ -name: dask-sql +name: dask-sql-py39 channels: - conda-forge -- nodefaults dependencies: - c-compiler - dask=2022.3.0 @@ -11,10 +10,8 @@ dependencies: - intake=0.6.0 - jsonschema - lightgbm -- maturin>=1.3,<1.4 -# FIXME: mlflow 2.6.0 has import issues related to pydantic -# https://github.com/mlflow/mlflow/issues/9331 -- mlflow<2.6 +- maturin=1.3 +- mlflow=2.0 - mock - numpy=1.21.6 - pandas=1.4.0 @@ -28,7 +25,7 @@ dependencies: - pytest-rerunfailures - pytest-xdist - pytest -- python=3.8 +- python=3.9 - scikit-learn=1.0.0 - sphinx - sqlalchemy<2 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 370577a54..2420e949f 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -8,7 +8,7 @@ channels: dependencies: - c-compiler - zlib -- dask>=2022.3.0,<=2023.11.0 +- dask>=2022.3.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -16,9 +16,7 @@ dependencies: - jsonschema - lightgbm - maturin>=1.3,<1.4 -# FIXME: mlflow 2.6.0 has import issues related to pydantic -# https://github.com/mlflow/mlflow/issues/9331 -- mlflow<2.6 +- mlflow>=2.0 - mock - numpy>=1.21.6 - pandas>=1.4.0 @@ -51,8 +49,7 @@ dependencies: - ucx-py=0.36 - xgboost=*=rapidsai_py* - libxgboost=*=rapidsai_h* -# TODO: unpin after RAPIDS 24.02 release -# - pip -# - pip: -# - git+https://github.com/dask/dask -# - git+https://github.com/dask/distributed +- pip +- pip: + - git+https://github.com/dask/dask + - git+https://github.com/dask/distributed diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 2c9ed4a6c..f88cf57c7 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -8,7 +8,7 @@ channels: dependencies: - c-compiler - zlib -- dask>=2022.3.0,<=2023.11.0 +- dask>=2022.3.0 - fastapi>=0.92.0 - fugue>=0.7.3 - httpx>=0.24.1 @@ -16,9 +16,7 @@ dependencies: - jsonschema - lightgbm - maturin>=1.3,<1.4 -# FIXME: mlflow 2.6.0 has import issues related to pydantic -# https://github.com/mlflow/mlflow/issues/9331 -- mlflow<2.6 +- mlflow>=2.0 - mock - numpy>=1.21.6 - pandas>=1.4.0 @@ -51,8 +49,7 @@ dependencies: - ucx-py=0.36 - xgboost=*=rapidsai_py* - libxgboost=*=rapidsai_h* -# TODO: unpin after RAPIDS 24.02 release -# - pip -# - pip: -# - git+https://github.com/dask/dask -# - git+https://github.com/dask/distributed +- pip +- pip: + - git+https://github.com/dask/dask + - git+https://github.com/dask/distributed diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 16c943c38..60a5aa299 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,7 +32,7 @@ requirements: - xz # [linux64] run: - python - - dask >=2022.3.0,<=2023.11.0 + - dask >=2022.3.0 - pandas >=1.4.0 - fastapi >=0.92.0 - httpx >=0.24.1 diff --git a/scripts/startup_script.py b/continuous_integration/scripts/startup_script.py similarity index 100% rename from scripts/startup_script.py rename to continuous_integration/scripts/startup_script.py diff --git a/dask_sql/context.py b/dask_sql/context.py index 19bba68ae..faab98e90 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -2,7 +2,7 @@ import inspect import logging from collections import Counter -from typing import Any, Callable, Dict, List, Tuple, Union +from typing import Any, Callable, Union import dask.dataframe as dd import pandas as pd @@ -309,7 +309,7 @@ def register_function( self, f: Callable, name: str, - parameters: List[Tuple[str, type]], + parameters: list[tuple[str, type]], return_type: type, replace: bool = False, schema_name: str = None, @@ -400,7 +400,7 @@ def register_aggregation( self, f: dd.Aggregation, name: str, - parameters: List[Tuple[str, type]], + parameters: list[tuple[str, type]], return_type: type, replace: bool = False, schema_name: str = None, @@ -467,9 +467,9 @@ def sql( self, sql: Any, return_futures: bool = True, - dataframes: Dict[str, Union[dd.DataFrame, pd.DataFrame]] = None, + dataframes: dict[str, Union[dd.DataFrame, pd.DataFrame]] = None, gpu: bool = False, - config_options: Dict[str, Any] = None, + config_options: dict[str, Any] = None, ) -> Union[dd.DataFrame, pd.DataFrame]: """ Query the registered tables with the given SQL. @@ -519,7 +519,7 @@ def sql( def explain( self, sql: str, - dataframes: Dict[str, Union[dd.DataFrame, pd.DataFrame]] = None, + dataframes: dict[str, Union[dd.DataFrame, pd.DataFrame]] = None, gpu: bool = False, ) -> str: """ @@ -606,7 +606,7 @@ def register_model( self, model_name: str, model: Any, - training_columns: List[str], + training_columns: list[str], schema_name: str = None, ): """ @@ -708,7 +708,7 @@ def stop_server(self): # pragma: no cover self.sql_server = None - def fqn(self, tbl: "DaskTable") -> Tuple[str, str]: + def fqn(self, tbl: "DaskTable") -> tuple[str, str]: """ Return the fully qualified name of an object, maybe including the schema name. @@ -908,7 +908,7 @@ def _register_callable( f: Any, name: str, aggregation: bool, - parameters: List[Tuple[str, type]], + parameters: list[tuple[str, type]], return_type: type, replace: bool = False, schema_name=None, diff --git a/dask_sql/datacontainer.py b/dask_sql/datacontainer.py index e4c93a8f5..db2c82abf 100644 --- a/dask_sql/datacontainer.py +++ b/dask_sql/datacontainer.py @@ -1,5 +1,5 @@ from collections import namedtuple -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Union import dask.dataframe as dd import pandas as pd @@ -28,8 +28,8 @@ class ColumnContainer: def __init__( self, - frontend_columns: List[str], - frontend_backend_mapping: Union[Dict[str, ColumnType], None] = None, + frontend_columns: list[str], + frontend_backend_mapping: Union[dict[str, ColumnType], None] = None, ): assert all( isinstance(col, str) for col in frontend_columns @@ -50,7 +50,7 @@ def _copy(self) -> ColumnContainer: self._frontend_columns.copy(), self._frontend_backend_mapping.copy() ) - def limit_to(self, fields: List[str]) -> ColumnContainer: + def limit_to(self, fields: list[str]) -> ColumnContainer: """ Create a new ColumnContainer, which has frontend columns limited to only the ones given as parameter. @@ -64,7 +64,7 @@ def limit_to(self, fields: List[str]) -> ColumnContainer: cc._frontend_columns = [str(x) for x in fields] return cc - def rename(self, columns: Dict[str, str]) -> ColumnContainer: + def rename(self, columns: dict[str, str]) -> ColumnContainer: """ Return a new ColumnContainer where the frontend columns are renamed according to the given mapping. @@ -84,7 +84,7 @@ def rename(self, columns: Dict[str, str]) -> ColumnContainer: return cc def rename_handle_duplicates( - self, from_columns: List[str], to_columns: List[str] + self, from_columns: list[str], to_columns: list[str] ) -> ColumnContainer: """ Same as `rename` but additionally handles presence of @@ -105,14 +105,14 @@ def rename_handle_duplicates( return cc - def mapping(self) -> List[Tuple[str, ColumnType]]: + def mapping(self) -> list[tuple[str, ColumnType]]: """ The mapping from frontend columns to backend columns. """ return list(self._frontend_backend_mapping.items()) @property - def columns(self) -> List[str]: + def columns(self) -> list[str]: """ The stored frontend columns in the correct order """ @@ -281,10 +281,10 @@ def __hash__(self): class SchemaContainer: def __init__(self, name: str): self.__name__ = name - self.tables: Dict[str, DataContainer] = {} - self.statistics: Dict[str, Statistics] = {} - self.experiments: Dict[str, pd.DataFrame] = {} - self.models: Dict[str, Tuple[Any, List[str]]] = {} - self.functions: Dict[str, UDF] = {} - self.function_lists: List[FunctionDescription] = [] - self.filepaths: Dict[str, str] = {} + self.tables: dict[str, DataContainer] = {} + self.statistics: dict[str, Statistics] = {} + self.experiments: dict[str, pd.DataFrame] = {} + self.models: dict[str, tuple[Any, list[str]]] = {} + self.functions: dict[str, UDF] = {} + self.function_lists: list[FunctionDescription] = [] + self.filepaths: dict[str, str] = {} diff --git a/dask_sql/integrations/fugue.py b/dask_sql/integrations/fugue.py index cdc5fbdae..c1123d652 100644 --- a/dask_sql/integrations/fugue.py +++ b/dask_sql/integrations/fugue.py @@ -11,7 +11,7 @@ "Can not load the fugue module. If you want to use this integration, you need to install it." ) -from typing import Any, Dict, Optional +from typing import Any, Optional import dask.dataframe as dd @@ -94,7 +94,7 @@ def fsql_dask( ctx: Optional[Context] = None, register: bool = False, fugue_conf: Any = None, -) -> Dict[str, dd.DataFrame]: +) -> dict[str, dd.DataFrame]: """FugueSQL utility function that can consume Context directly. FugueSQL is a language extending standard SQL. It makes SQL eligible to describe end to end workflows. It also enables you to invoke python extensions in the SQL like language. diff --git a/dask_sql/integrations/ipython.py b/dask_sql/integrations/ipython.py index 08843c00c..b68f7acf5 100644 --- a/dask_sql/integrations/ipython.py +++ b/dask_sql/integrations/ipython.py @@ -1,5 +1,5 @@ import time -from typing import TYPE_CHECKING, Dict, List +from typing import TYPE_CHECKING from dask_sql.mappings import _SQL_TO_PYTHON_FRAMES from dask_sql.physical.rex.core import RexCallPlugin @@ -153,6 +153,6 @@ def _register_syntax_highlighting(): # pragma: no cover display.display_javascript(js + _JS_ENABLE_DASK_SQL, raw=True) -def _create_set(keys: List[str]) -> Dict[str, bool]: # pragma: no cover +def _create_set(keys: list[str]) -> dict[str, bool]: # pragma: no cover """Small helper function to turn a list into the correct format for codemirror""" return {key: True for key in keys} diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index 5f70cde4e..5215dfe28 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -1,5 +1,5 @@ import logging -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Optional import dask.dataframe as dd @@ -68,7 +68,7 @@ def assert_inputs( rel: "LogicalPlan", n: int = 1, context: "dask_sql.Context" = None, - ) -> List[dd.DataFrame]: + ) -> list[dd.DataFrame]: """ LogicalPlan nodes build on top of others. Those are called the "input" of the LogicalPlan. diff --git a/dask_sql/physical/rel/custom/wrappers.py b/dask_sql/physical/rel/custom/wrappers.py index a1ab18534..49d4adb64 100644 --- a/dask_sql/physical/rel/custom/wrappers.py +++ b/dask_sql/physical/rel/custom/wrappers.py @@ -3,7 +3,7 @@ """Meta-estimators for parallelizing estimators using the scikit-learn API.""" import logging import warnings -from typing import Any, Callable, Tuple, Union +from typing import Any, Callable, Union import dask.array as da import dask.dataframe as dd @@ -34,9 +34,9 @@ # Scorers -accuracy_scorer: Tuple[Any, Any] = (accuracy_score, {}) +accuracy_scorer: tuple[Any, Any] = (accuracy_score, {}) neg_mean_squared_error_scorer = (mean_squared_error, dict(greater_is_better=False)) -r2_scorer: Tuple[Any, Any] = (r2_score, {}) +r2_scorer: tuple[Any, Any] = (r2_score, {}) neg_log_loss_scorer = (log_loss, dict(greater_is_better=False, needs_proba=True)) @@ -504,7 +504,7 @@ def __init__( self.shuffle_blocks = shuffle_blocks self.random_state = random_state self.assume_equal_chunks = assume_equal_chunks - super(Incremental, self).__init__( + super().__init__( estimator=estimator, scoring=scoring, predict_meta=predict_meta, diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index dd2f9f41d..1af2748f5 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -2,7 +2,7 @@ import operator from collections import defaultdict from functools import reduce -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple +from typing import TYPE_CHECKING, Any, Callable import dask.dataframe as dd import pandas as pd @@ -289,9 +289,9 @@ def _do_aggregations( self, rel: "LogicalPlan", dc: DataContainer, - group_columns: List[str], + group_columns: list[str], context: "dask_sql.Context", - ) -> Tuple[dd.DataFrame, List[str]]: + ) -> tuple[dd.DataFrame, list[str]]: """ Main functionality: return the result dataframe and the output column order @@ -381,9 +381,9 @@ def _collect_aggregations( cc: ColumnContainer, context: "dask_sql.Context", additional_column_name: str, - output_column_order: List[str], - ) -> Tuple[ - Dict[Tuple[str, str], List[Tuple[str, str, Any]]], List[str], dd.DataFrame + output_column_order: list[str], + ) -> tuple[ + dict[tuple[str, str], list[tuple[str, str, Any]]], list[str], dd.DataFrame ]: """ Collect all aggregations together, which have the same filter column @@ -524,10 +524,10 @@ def _perform_aggregation( dc: DataContainer, filter_column: str, distinct_column: str, - aggregations: List[Tuple[str, str, Any]], + aggregations: list[tuple[str, str, Any]], additional_column_name: str, - group_columns: List[str], - groupby_agg_options: Dict[str, Any] = {}, + group_columns: list[str], + groupby_agg_options: dict[str, Any] = {}, ): tmp_df = dc.df diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index 1657d2bf4..374c74420 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -2,7 +2,7 @@ import operator import warnings from functools import reduce -from typing import TYPE_CHECKING, List, Tuple +from typing import TYPE_CHECKING import dask.dataframe as dd from dask import config as dask_config @@ -223,8 +223,8 @@ def _join_on_columns( self, df_lhs_renamed: dd.DataFrame, df_rhs_renamed: dd.DataFrame, - lhs_on: List[str], - rhs_on: List[str], + lhs_on: list[str], + rhs_on: list[str], join_type: str, ) -> dd.DataFrame: @@ -290,7 +290,7 @@ def _join_on_columns( def _split_join_condition( self, join_condition: "Expression" - ) -> Tuple[List[str], List[str], List["Expression"]]: + ) -> tuple[list[str], list[str], list["Expression"]]: if str(join_condition.getRexType()) in ["RexType.Literal", "RexType.Reference"]: return [], [], [join_condition] elif not str(join_condition.getRexType()) == "RexType.Call": diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index 53e1d29be..4a9cecc25 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -43,7 +43,7 @@ def convert( # The table(s) we need to return dask_table = rel.getTable() - schema_name, table_name = [n.lower() for n in context.fqn(dask_table)] + schema_name, table_name = (n.lower() for n in context.fqn(dask_table)) dc = context.schema[schema_name].tables[table_name] diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index aba788bc3..42b0f9613 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -1,7 +1,7 @@ import logging from collections import namedtuple from functools import partial -from typing import TYPE_CHECKING, Callable, List, Optional, Tuple +from typing import TYPE_CHECKING, Callable, Optional import dask.dataframe as dd import numpy as np @@ -109,7 +109,7 @@ def _get_window_bounds( min_periods: Optional[int] = None, center: Optional[bool] = None, closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray]: if self.start is None: start = np.zeros(num_values, dtype=np.int64) else: @@ -141,7 +141,7 @@ def get_window_bounds( center: Optional[bool] = None, closed: Optional[str] = None, step: Optional[int] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray]: return self._get_window_bounds(num_values, min_periods, center, closed) else: @@ -152,18 +152,18 @@ def get_window_bounds( min_periods: Optional[int] = None, center: Optional[bool] = None, closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray]: return self._get_window_bounds(num_values, min_periods, center, closed) def map_on_each_group( partitioned_group: pd.DataFrame, - sort_columns: List[str], - sort_ascending: List[bool], - sort_null_first: List[bool], + sort_columns: list[str], + sort_ascending: list[bool], + sort_null_first: list[bool], lower_bound: BoundDescription, upper_bound: BoundDescription, - operations: List[Tuple[Callable, str, List[str]]], + operations: list[tuple[Callable, str, list[str]]], ): """Internal function mapped on each group of the dataframe after partitioning""" # Apply sorting @@ -261,7 +261,7 @@ def _apply_window( rel, window, dc: DataContainer, - field_names: List[str], + field_names: list[str], context: "dask_sql.Context", ): temporary_columns = [] @@ -366,7 +366,7 @@ def _extract_groupby( window, dc: DataContainer, context: "dask_sql.Context", - ) -> Tuple[dd.DataFrame, str]: + ) -> tuple[dd.DataFrame, str]: """Prepare grouping columns we can later use while applying the main function""" partition_keys = rel.window().getPartitionExprs(window) if partition_keys: @@ -385,7 +385,7 @@ def _extract_groupby( def _extract_ordering( self, rel, window, cc: ColumnContainer - ) -> Tuple[str, str, str]: + ) -> tuple[str, str, str]: """Prepare sorting information we can later use while applying the main function""" logger.debug( "Error is about to be encountered, FIX me when bindings are available in subsequent PR" @@ -407,7 +407,7 @@ def _extract_operations( df: dd.DataFrame, dc: DataContainer, context: "dask_sql.Context", - ) -> List[Tuple[Callable, str, List[str]]]: + ) -> list[tuple[Callable, str, list[str]]]: # Finally apply the actual function on each group separately operations = [] diff --git a/dask_sql/physical/utils/groupby.py b/dask_sql/physical/utils/groupby.py index 97070bdd0..089219181 100644 --- a/dask_sql/physical/utils/groupby.py +++ b/dask_sql/physical/utils/groupby.py @@ -1,12 +1,10 @@ -from typing import List - import dask.dataframe as dd from dask_sql.utils import new_temporary_column def get_groupby_with_nulls_cols( - df: dd.DataFrame, group_columns: List[str], additional_column_name: str = None + df: dd.DataFrame, group_columns: list[str], additional_column_name: str = None ): """ SQL and dask are treating null columns a bit different: diff --git a/dask_sql/physical/utils/sort.py b/dask_sql/physical/utils/sort.py index c35704a32..b39c7993d 100644 --- a/dask_sql/physical/utils/sort.py +++ b/dask_sql/physical/utils/sort.py @@ -1,5 +1,3 @@ -from typing import List - import dask.dataframe as dd import pandas as pd from dask import config as dask_config @@ -10,9 +8,9 @@ def apply_sort( df: dd.DataFrame, - sort_columns: List[str], - sort_ascending: List[bool], - sort_null_first: List[bool], + sort_columns: list[str], + sort_ascending: list[bool], + sort_null_first: list[bool], sort_num_rows: int = None, ) -> dd.DataFrame: # when sort_values doesn't support lists of ascending / null @@ -79,8 +77,8 @@ def apply_sort( def topk_sort( df: dd.DataFrame, - sort_columns: List[str], - sort_ascending: List[bool], + sort_columns: list[str], + sort_ascending: list[bool], sort_num_rows: int = None, ): if sort_ascending[0]: @@ -91,9 +89,9 @@ def topk_sort( def sort_partition_func( partition: pd.DataFrame, - sort_columns: List[str], - sort_ascending: List[bool], - sort_null_first: List[bool], + sort_columns: list[str], + sort_ascending: list[bool], + sort_null_first: list[bool], **kwargs, ): if partition.empty: @@ -121,9 +119,9 @@ def sort_partition_func( def is_topk_optimizable( df: dd.DataFrame, - sort_columns: List[str], + sort_columns: list[str], single_ascending: bool, - sort_null_first: List[bool], + sort_null_first: list[bool], sort_num_rows: int = None, ): if ( diff --git a/dask_sql/physical/utils/statistics.py b/dask_sql/physical/utils/statistics.py index 4dc06b91a..1ada03de8 100644 --- a/dask_sql/physical/utils/statistics.py +++ b/dask_sql/physical/utils/statistics.py @@ -4,7 +4,6 @@ import logging from collections import defaultdict from functools import lru_cache -from typing import List import dask import dask.dataframe as dd @@ -21,10 +20,10 @@ def parquet_statistics( ddf: dd.DataFrame, - columns: List | None = None, + columns: list | None = None, parallel: int | False | None = None, **compute_kwargs, -) -> List[dict] | None: +) -> list[dict] | None: """Extract Parquet statistics from a Dask DataFrame collection WARNING: This API is experimental diff --git a/dask_sql/utils.py b/dask_sql/utils.py index 454eecb7f..8e2673b3e 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -2,7 +2,7 @@ import logging from collections import defaultdict from datetime import datetime -from typing import Any, Dict +from typing import Any from uuid import uuid4 import dask.dataframe as dd @@ -142,8 +142,8 @@ def __str__(self): def convert_sql_kwargs( - sql_kwargs: Dict[str, str], -) -> Dict[str, Any]: + sql_kwargs: dict[str, str], +) -> dict[str, Any]: """ Convert the Rust Vec of key/value pairs into a Dict containing the keys and values """ diff --git a/docs/environment.yml b/docs/environment.yml index 87c4b78d0..2d0e08ba0 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -1,13 +1,12 @@ name: dask-sql-docs channels: - conda-forge - - nodefaults dependencies: - python=3.9 - sphinx>=4.0.0 - sphinx-tabs - dask-sphinx-theme>=2.0.3 - - dask>=2022.3.0,<=2023.11.0 + - dask>=2022.3.0 - pandas>=1.4.0 - fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index ed931135a..1f2052a92 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx>=4.0.0 sphinx-tabs dask-sphinx-theme>=3.0.0 -dask>=2022.3.0,<=2023.11.0 +dask>=2022.3.0 pandas>=1.4.0 fugue>=0.7.3 # FIXME: https://github.com/fugue-project/fugue/issues/526 diff --git a/docs/source/installation.rst b/docs/source/installation.rst index a2a3ee895..4404facbb 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -74,7 +74,7 @@ Create a new conda environment and install the development environment: .. code-block:: bash - conda env create -f continuous_integration/environment-3.9-dev.yaml + conda env create -f continuous_integration/environment-3.9.yaml It is not recommended to use ``pip`` instead of ``conda``. diff --git a/docs/source/server.rst b/docs/source/server.rst index 70ad902e9..5e26b04c6 100644 --- a/docs/source/server.rst +++ b/docs/source/server.rst @@ -121,7 +121,7 @@ To run a standalone SQL server in your ``dask`` cluster, follow these three step FROM nbraun/dask-sql - COPY startup_script.py /opt/dask_sql/startup_script.py + COPY continuous_integration/docker/startup_script.py /opt/dask_sql/startup_script.py ENTRYPOINT [ "/opt/conda/bin/python", "/opt/dask_sql/startup_script.py" ] diff --git a/pyproject.toml b/pyproject.toml index 921cac04d..75ec4519f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,17 +17,18 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering", "Topic :: System :: Distributed Computing", ] readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ - "dask[dataframe]>=2022.3.0,<=2023.11.0", - "distributed>=2022.3.0,<=2023.11.0", + "dask[dataframe]>=2022.3.0", + "distributed>=2022.3.0", "pandas>=1.4.0", "fastapi>=0.92.0", "httpx>=0.24.1", diff --git a/tests/integration/test_cmd.py b/tests/integration/test_cmd.py index 63e2da9c5..0ffe82eb0 100644 --- a/tests/integration/test_cmd.py +++ b/tests/integration/test_cmd.py @@ -1,6 +1,7 @@ +from unittest.mock import MagicMock, patch + import pytest from dask import config as dask_config -from mock import MagicMock, patch from prompt_toolkit.application import create_app_session from prompt_toolkit.input import create_pipe_input from prompt_toolkit.output import DummyOutput diff --git a/tests/integration/test_fugue.py b/tests/integration/test_fugue.py index 7faf17ce1..1e1bcd2c4 100644 --- a/tests/integration/test_fugue.py +++ b/tests/integration/test_fugue.py @@ -40,7 +40,7 @@ def test_fugue_fsql(client): assert_eq(return_df, pd.DataFrame({"a": [1], "b": ["world"]})) -@pytest.mark.flaky(reruns=4, condition="sys.version_info < (3, 9)") +@pytest.mark.flaky(reruns=4, condition="sys.version_info < (3, 10)") def test_dask_fsql(client): def assert_fsql(df: pd.DataFrame) -> None: assert_eq(df, pd.DataFrame({"a": [1]})) diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index e503e6af2..973802fe4 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -363,7 +363,7 @@ def test_correct_argument_passing(c): boolean=False, array=[1, 2], dict={"a": 1}, - set=set([1, 2, 3]), + set={1, 2, 3}, ) diff --git a/tests/unit/test_ml_utils.py b/tests/unit/test_ml_utils.py index d092c824d..7130b2bed 100644 --- a/tests/unit/test_ml_utils.py +++ b/tests/unit/test_ml_utils.py @@ -61,7 +61,7 @@ def check_random_state(random_state): elif isinstance(random_state, da.random.RandomState): return random_state else: - raise TypeError("Unexpected type '{}'".format(type(random_state))) + raise TypeError(f"Unexpected type '{type(random_state)}'") def make_classification(