diff --git a/dask_planner/.cargo/config.toml b/.cargo/config.toml similarity index 100% rename from dask_planner/.cargo/config.toml rename to .cargo/config.toml diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 80cdd7357..000000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -dask_sql/_version.py export-subst diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 527d01fa2..1ff63a673 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,4 +2,7 @@ * @ayushdg @charlesbluca @galipremsagar # rust codeowners -dask_planner/ @ayushdg @charlesbluca @galipremsagar @jdye64 +.cargo/ @ayushdg @charlesbluca @galipremsagar @jdye64 +src/ @ayushdg @charlesbluca @galipremsagar @jdye64 +Cargo.toml @ayushdg @charlesbluca @galipremsagar @jdye64 +Cargo.lock @ayushdg @charlesbluca @galipremsagar @jdye64 diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a1e24044a..c6195a9ae 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,7 +1,7 @@ version: 2 updates: - package-ecosystem: "cargo" - directory: "/dask_planner" + directory: "/" schedule: interval: "daily" ignore: @@ -18,6 +18,5 @@ updates: # Check for updates to GitHub Actions every weekday interval: "weekly" ignore: - # ignore cibw patch updates + # prefer updating cibuildwheel manually as needed - dependency-name: "pypa/cibuildwheel" - update-types: ["version-update:semver-patch"] diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 0efe5b656..63a67da6c 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -3,14 +3,11 @@ on: push: branches: - main - - datafusion-sql-planner pull_request: paths: - - setup.py - - dask_planner/Cargo.toml - - dask_planner/Cargo.lock - - dask_planner/pyproject.toml - - dask_planner/rust-toolchain.toml + - Cargo.toml + - Cargo.lock + - pyproject.toml - continuous_integration/recipe/** - .github/workflows/conda.yml schedule: @@ -29,9 +26,44 @@ defaults: jobs: conda: - name: Build 
(and optionally upload) the conda nightly + name: "Build conda nightlies (python: ${{ matrix.python }}, arch: ${{ matrix.arch }})" runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python: ["3.8", "3.9", "3.10"] + arch: ["linux-64", "linux-aarch64"] steps: + - name: Manage disk space + if: matrix.arch == 'linux-aarch64' + run: | + sudo mkdir -p /opt/empty_dir || true + for d in \ + /opt/ghc \ + /opt/hostedtoolcache \ + /usr/lib/jvm \ + /usr/local/.ghcup \ + /usr/local/lib/android \ + /usr/local/share/powershell \ + /usr/share/dotnet \ + /usr/share/swift \ + ; do + sudo rsync --stats -a --delete /opt/empty_dir/ $d || true + done + sudo apt-get purge -y -f firefox \ + google-chrome-stable \ + microsoft-edge-stable + sudo apt-get autoremove -y >& /dev/null + sudo apt-get autoclean -y >& /dev/null + sudo docker image prune --all --force + df -h + - name: Create swapfile + if: matrix.arch == 'linux-aarch64' + run: | + sudo fallocate -l 10GiB /swapfile || true + sudo chmod 600 /swapfile || true + sudo mkswap /swapfile || true + sudo swapon /swapfile || true - uses: actions/checkout@v3 with: fetch-depth: 0 @@ -49,23 +81,36 @@ jobs: which python pip list mamba list - - name: Build conda package + - name: Build conda packages run: | # suffix for nightly package versions export VERSION_SUFFIX=a`date +%y%m%d` conda mambabuild continuous_integration/recipe \ + --python ${{ matrix.python }} \ + --variants "{target_platform: [${{ matrix.arch }}]}" \ + --error-overlinking \ + --no-test \ --no-anaconda-upload \ - --output-folder . 
- - name: Upload conda package + --output-folder packages + - name: Test conda packages + if: matrix.arch == 'linux-64' # can only test native platform packages + run: | + conda mambabuild --test packages/${{ matrix.arch }}/*.tar.bz2 + - name: Upload conda packages as artifacts + uses: actions/upload-artifact@v3 + with: + name: "conda nightlies (python - ${{ matrix.python }}, arch - ${{ matrix.arch }})" + # need to install all conda channel metadata to properly install locally + path: packages/ + - name: Upload conda packages to Anaconda if: | github.event_name == 'push' && github.repository == 'dask-contrib/dask-sql' env: ANACONDA_API_TOKEN: ${{ secrets.DASK_CONDA_TOKEN }} - LABEL: ${{ github.ref == 'refs/heads/datafusion-sql-planner' && 'dev_datafusion' || 'dev' }} run: | # install anaconda for upload mamba install -c conda-forge anaconda-client - anaconda upload --label $LABEL linux-64/*.tar.bz2 + anaconda upload --label dev packages/${{ matrix.arch }}/*.tar.bz2 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index aa2fd6482..a0c95beac 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,4 +1,4 @@ -name: Upload Docker image +name: Build Docker image on: release: @@ -6,6 +6,13 @@ on: push: branches: - main + pull_request: + paths: + - Cargo.toml + - Cargo.lock + - pyproject.toml + - docker/** + - .github/workflows/docker.yml # When this workflow is queued, automatically cancel any previous running # or pending jobs from the same branch @@ -17,15 +24,16 @@ jobs: push_to_registry: name: Push Docker image to Docker Hub runs-on: ubuntu-latest - if: github.repository == 'dask-contrib/dask-sql' + env: + DOCKER_PUSH: ${{ contains(['push', 'release'], github.event_name) && github.repository == 'dask-contrib/dask-sql' }} + strategy: + fail-fast: false + matrix: + platform: ["linux/amd64", "linux/arm64", "linux/386"] steps: - - name: Check out the repo - uses: actions/checkout@v3 - - name: Set up QEMU - uses: 
docker/setup-qemu-action@v2 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + - uses: actions/checkout@v3 - name: Login to DockerHub + if: ${{ fromJSON(env.DOCKER_PUSH) }} uses: docker/login-action@v2 with: username: ${{ secrets.DOCKER_USERNAME }} @@ -41,10 +49,16 @@ jobs: context: . file: ./docker/main.dockerfile build-args: DOCKER_META_VERSION=${{ steps.docker_meta_main.outputs.version }} - platforms: linux/amd64,linux/arm64,linux/386 + platforms: ${{ matrix.platform }} tags: ${{ steps.docker_meta_main.outputs.tags }} labels: ${{ steps.docker_meta_main.outputs.labels }} - push: true + push: ${{ fromJSON(env.DOCKER_PUSH) }} + load: ${{ !fromJSON(env.DOCKER_PUSH) }} + - name: Check images + run: | + df -h + docker image ls + docker image inspect ${{ steps.docker_meta_main.outputs.tags }} - name: Docker meta for cloud image id: docker_meta_cloud uses: crazy-max/ghaction-docker-meta@v4 @@ -56,7 +70,8 @@ jobs: context: . file: ./docker/cloud.dockerfile build-args: DOCKER_META_VERSION=${{ steps.docker_meta_main.outputs.version }} - platforms: linux/amd64,linux/arm64,linux/386 + platforms: ${{ matrix.platform }} tags: ${{ steps.docker_meta_cloud.outputs.tags }} labels: ${{ steps.docker_meta_cloud.outputs.labels }} - push: true + push: ${{ fromJSON(env.DOCKER_PUSH) }} + load: ${{ !fromJSON(env.DOCKER_PUSH) }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 393987d92..ebacfad99 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,101 +2,177 @@ name: Upload Python package on: release: types: [created] + pull_request: + paths: + - .github/workflows/release.yml + - dask_sql/__init__.py -# Required shell entrypoint to have properly activated conda environments -defaults: - run: - shell: bash -l {0} +# When this workflow is queued, automatically cancel any previous running +# or pending jobs from the same branch +concurrency: + group: release-${{ github.head_ref }} + cancel-in-progress: true 
+ +env: + upload: ${{ github.event_name == 'release' && github.repository == 'dask-contrib/dask-sql' }} jobs: - wheels: - name: Build and publish py3.${{ matrix.python }} wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} + linux: + name: Build and publish wheels for linux ${{ matrix.target }} + runs-on: ubuntu-latest strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - # corresponds to python 3.9, 3.10, 3.11 - python: ["9", "10", "11"] + target: [x86_64, aarch64] + steps: + - uses: actions/checkout@v3 + - name: Install Protoc + uses: arduino/setup-protoc@v1 + if: matrix.target == 'aarch64' + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Build wheels for x86_64 + if: matrix.target == 'x86_64' + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist + sccache: 'true' + manylinux: '2_17' + before-script-linux: > + DOWNLOAD_URL=$(curl --retry 6 --retry-delay 10 -s https://api.github.com/repos/protocolbuffers/protobuf/releases/latest | grep -o '"browser_download_url": "[^"]*' | cut -d'"' -f4 | grep "\linux-x86_64.zip$") && + curl --retry 6 --retry-delay 10 -LO $DOWNLOAD_URL && + unzip protoc-*-linux-x86_64.zip -d $HOME/.local + docker-options: --env PROTOC=/root/.local/bin/protoc + - name: Build wheels for aarch64 + if: matrix.target == 'aarch64' + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist --zig + sccache: 'true' + manylinux: '2_17' + - name: Check dist files + run: | + pip install twine + + twine check dist/* + ls -lh dist/ + - name: Upload binary wheels + uses: actions/upload-artifact@v3 + with: + name: wheels for linux ${{ matrix.target }} + path: dist/* + - name: Publish package + if: env.upload == 'true' + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: twine upload dist/* + + 
windows: + name: Build and publish wheels for windows + runs-on: windows-latest steps: - uses: actions/checkout@v3 + - name: Install Protoc + uses: arduino/setup-protoc@v1 with: - fetch-depth: 0 - - name: Set up QEMU for linux aarch64 - if: contains(matrix.os, 'ubuntu') - uses: docker/setup-qemu-action@v2 + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: actions/setup-python@v4 with: - platforms: arm64 - - name: Add arm64 target for macos - if: contains(matrix.os, 'macos') - run: rustup target add aarch64-apple-darwin + python-version: '3.10' + architecture: x64 - name: Build wheels - uses: pypa/cibuildwheel@v2.11.3 + uses: PyO3/maturin-action@v1 + with: + target: x64 + args: --release --out dist + sccache: 'true' + - name: Check dist files + run: | + pip install twine + + twine check dist/* + ls dist/ + - name: Upload binary wheels + uses: actions/upload-artifact@v3 + with: + name: wheels for windows + path: dist/* + - name: Publish package + if: env.upload == 'true' env: - CIBW_BUILD: 'cp3${{ matrix.python }}-*' - CIBW_SKIP: '*musllinux*' - CIBW_ARCHS_LINUX: 'aarch64 x86_64' - CIBW_ARCHS_WINDOWS: 'AMD64' - CIBW_ARCHS_MACOS: 'x86_64 arm64' - CIBW_ENVIRONMENT_LINUX: 'CARGO_NET_GIT_FETCH_WITH_CLI="true" PATH="$HOME/.cargo/bin:$PATH"' - # Without CARGO_NET_GIT_FETCH_WITH_CLI we oom (https://github.com/rust-lang/cargo/issues/10583) - CIBW_ENVIRONMENT_WINDOWS: 'PATH="$UserProfile\.cargo\bin;$PATH"' - CIBW_BEFORE_BUILD: 'pip install -U setuptools-rust' - CIBW_BEFORE_BUILD_LINUX: > - pip install -U setuptools-rust && - curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain=stable --profile=minimal -y && - rustup show + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: twine upload dist/* + + macos: + name: Build and publish wheels for macos ${{ matrix.target }} + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + target: [x86_64, aarch64] + steps: + - uses: actions/checkout@v3 + - 
name: Install Protoc + uses: arduino/setup-protoc@v1 with: - package-dir: . - output-dir: dist - config-file: "dask_planner/pyproject.toml" - - name: Set up Python - uses: conda-incubator/setup-miniconda@v2.2.0 + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: actions/setup-python@v4 with: - miniforge-variant: Mambaforge - use-mamba: true - python-version: "3.9" - channel-priority: strict + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist + sccache: 'true' - name: Check dist files run: | - mamba install twine + pip install twine twine check dist/* ls -lh dist/ - name: Upload binary wheels uses: actions/upload-artifact@v3 with: - name: wheels for py3.${{ matrix.python }} on ${{ matrix.os }} + name: wheels for macos ${{ matrix.target }} path: dist/* - name: Publish package + if: env.upload == 'true' env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: twine upload dist/* + sdist: - name: Build and publish source distribution runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - name: Build sdist + uses: PyO3/maturin-action@v1 with: - fetch-depth: 0 - - name: Set up Python - uses: conda-incubator/setup-miniconda@v2.2.0 + command: sdist + args: --out dist + - uses: actions/setup-python@v4 with: - miniforge-variant: Mambaforge - use-mamba: true - python-version: "3.9" - channel-priority: strict - - name: Build source distribution - run: | - mamba install setuptools-rust twine - - python setup.py sdist + python-version: '3.10' - name: Check dist files run: | + pip install twine + twine check dist/* ls -lh dist/ - name: Publish source distribution + if: env.upload == 'true' env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7e983172b..9759add1a 100644 --- a/.github/workflows/rust.yml +++ 
b/.github/workflows/rust.yml @@ -51,8 +51,7 @@ jobs: - name: Optionally update upstream dependencies if: needs.detect-ci-trigger.outputs.triggered == 'true' run: | - cd dask_planner - bash update-dependencies.sh + bash continuous_integration/scripts/update-dependencies.sh - name: Install Protoc uses: arduino/setup-protoc@v1 with: @@ -60,11 +59,9 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Check workspace in debug mode run: | - cd dask_planner cargo check - name: Check workspace in release mode run: | - cd dask_planner cargo check --release # test the crate @@ -84,8 +81,7 @@ jobs: - name: Optionally update upstream dependencies if: needs.detect-ci-trigger.outputs.triggered == 'true' run: | - cd dask_planner - bash update-dependencies.sh + bash continuous_integration/scripts/update-dependencies.sh - name: Install Protoc uses: arduino/setup-protoc@v1 with: @@ -93,5 +89,4 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Run tests run: | - cd dask_planner cargo test diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index a07761c76..ce8d0bc5f 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -68,11 +68,10 @@ jobs: - name: Optionally update upstream cargo dependencies if: env.which_upstream == 'DataFusion' run: | - cd dask_planner - bash update-dependencies.sh + bash continuous_integration/scripts/update-dependencies.sh - name: Build the Rust DataFusion bindings run: | - python setup.py build install + maturin develop - name: Install hive testing dependencies if: matrix.os == 'ubuntu-latest' run: | @@ -122,11 +121,9 @@ jobs: env: UPDATE_ALL_CARGO_DEPS: false run: | - cd dask_planner - bash update-dependencies.sh + bash continuous_integration/scripts/update-dependencies.sh - name: Install dependencies and nothing else run: | - mamba install setuptools-rust pip install -e . 
-vv which python diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 83916d15f..b675cb206 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,7 +72,7 @@ jobs: shared-key: test - name: Build the Rust DataFusion bindings run: | - python setup.py build install + maturin develop - name: Install hive testing dependencies # FIXME: sasl is not available on python 3.11 if: | @@ -119,7 +119,6 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Install dependencies and nothing else run: | - mamba install "setuptools-rust>=1.5.2" pip install -e . -vv which python diff --git a/.github/workflows/update-gpuci.yml b/.github/workflows/update-gpuci.yml index e9909d3ec..40c1cf2de 100644 --- a/.github/workflows/update-gpuci.yml +++ b/.github/workflows/update-gpuci.yml @@ -51,9 +51,9 @@ jobs: run: | echo RAPIDS_VER=${{ steps.rapids_current.outputs.RAPIDS_VER_0 }} >> $GITHUB_ENV echo UCX_PY_VER=$(curl -sL https://version.gpuci.io/rapids/${{ steps.rapids_current.outputs.RAPIDS_VER_0 }}) >> $GITHUB_ENV - echo NEW_CUDF_VER=${FULL_CUDF_VER::-4} >> $GITHUB_ENV - echo NEW_CUML_VER=${FULL_CUML_VER::-4} >> $GITHUB_ENV - echo NEW_UCX_PY_VER=${FULL_UCX_PY_VER::-4} >> $GITHUB_ENV + echo NEW_CUDF_VER=$(echo $FULL_CUDF_VER | cut -d'.' -f1,2) >> $GITHUB_ENV + echo NEW_CUML_VER=$(echo $FULL_CUML_VER | cut -d'.' -f1,2) >> $GITHUB_ENV + echo NEW_UCX_PY_VER=$(echo $FULL_UCX_PY_VER | cut -d'.' 
-f1,2) >> $GITHUB_ENV - name: Update RAPIDS version uses: jacobtomlinson/gha-find-replace@v3 diff --git a/.gitignore b/.gitignore index 245817fc1..18b2a3d83 100644 --- a/.gitignore +++ b/.gitignore @@ -46,23 +46,16 @@ venv # IDE .idea .vscode -planner/.classpath -planner/.project -planner/.settings/ -planner/.idea -planner/*.iml *.swp # project specific -planner/dependency-reduced-pom.xml -planner/target/ -dask_sql/jar -.next/ dask-worker-space/ node_modules/ docs/source/_build/ tests/unit/queries tests/unit/data +target/* +packages/* # Ignore development specific local testing files dev_tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ed701014a..094c4ada1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,9 +20,9 @@ repos: rev: v1.0 hooks: - id: cargo-check - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--'] - id: clippy - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--', '-D', 'warnings'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--', '-D', 'warnings'] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.2.0 hooks: @@ -39,4 +39,4 @@ repos: entry: cargo +nightly fmt language: system types: [rust] - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--'] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8ca9ae32a..3c14cd7f2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,17 +39,17 @@ DataFusion provides Dask-SQL with key functionality. ### Building Building the Dask-SQL Rust codebase is a straightforward process. If you create and activate the Dask-SQL Conda environment the Rust compiler and all necessary components will be installed for you during that process and therefore requires no further manual setup. 
-`setuptools-rust` is used by Dask-SQL for building and bundling the resulting Rust binaries. This helps make building and installing the Rust binaries feel much more like a native Python workflow. +`maturin` is used by Dask-SQL for building and bundling the resulting Rust binaries. This helps make building and installing the Rust binaries feel much more like a native Python workflow. -More details about the building setup can be found at [setup.py](setup.py) and searching for `rust_extensions` which is the hook for the Rust code build and inclusion. +More details about the building setup can be found in [pyproject.toml](pyproject.toml) and [Cargo.toml](Cargo.toml). -Note that while `setuptools-rust` is used by CI and should be used during your development cycle, if the need arises to do something more specific that is not yet supported by `setuptools-rust` you can opt to use `cargo` directly from the command line. +Note that while `maturin` is used by CI and should be used during your development cycle, if the need arises to do something more specific that is not yet supported by `maturin` you can opt to use `cargo` directly from the command line. #### Building with Python -Building Dask-SQL is straightforward with Python. To build run ```python setup.py install```. This will build both the Rust and Python codebase and install it into your locally activated conda environment. While not required, if you have updated dependencies for Rust you might prefer a clean build. To clean your setup run ```python setup.py clean``` and then run ```python setup.py install``` +Building Dask-SQL is straightforward with Python. To build run ```pip install .```. This will build both the Rust and Python codebase and install it into your locally activated conda environment; note that if your Rust dependencies have been updated, this command must be rerun to rebuild the Rust codebase. #### DataFusion Modules -DataFusion is broken down into a few modules.
We consume those modules in our [Cargo.toml](dask_planner/Cargo.toml). The modules that we use currently are +DataFusion is broken down into a few modules. We consume those modules in our [Cargo.toml](Cargo.toml). The modules that we use currently are - `datafusion-common` - Datastructures and core logic - `datafusion-expr` - Expression based logic and operators @@ -57,9 +57,7 @@ DataFusion is broken down into a few modules. We consume those modules in our [C - `datafusion-optimizer` - Optimization logic and datastructures for modifying current plans into more efficient ones. #### Retrieving Upstream Dependencies -During development you might find yourself needing some upstream DataFusion changes not present in the projects current version. Luckily this can easily be achieved by updating [Cargo.toml](dask_planner/Cargo.toml) and changing the `rev` to the SHA of the version you need. Note that the same SHA should be used for all DataFusion modules. - -After updating the `Cargo.toml` file the codebase can be re-built to reflect those changes by running `python setup.py install` +During development you might find yourself needing some upstream DataFusion changes not present in the project's current version. Luckily this can easily be achieved by updating [Cargo.toml](Cargo.toml) and changing the `rev` to the SHA of the version you need. Note that the same SHA should be used for all DataFusion modules. #### Local Documentation Sometimes when building against the latest Github commits for DataFusion you may find that the features you are consuming do not have their documentation public yet. In this case it can be helpful to build the DataFusion documentation locally so that it can be referenced to assist with development. Here is a rough outline for building that documentation locally.
@@ -72,40 +70,40 @@ Sometimes when building against the latest Github commits for DataFusion you may ### Datastructures While working in the Rust codebase there are a few datastructures that you should make yourself familiar with. This section does not aim to verbosely list out all of the datastructure with in the project but rather just the key datastructures that you are likely to encounter while working on almost any feature/issue. The aim is to give you a better overview of the codebase without having to manually dig through the all the source code. -- [`PyLogicalPlan`](dask_planner/src/sql/logical.rs) -> [DataFusion LogicalPlan](https://docs.rs/datafusion/latest/datafusion/logical_plan/enum.LogicalPlan.html) +- [`PyLogicalPlan`](src/sql/logical.rs) -> [DataFusion LogicalPlan](https://docs.rs/datafusion/latest/datafusion/logical_plan/enum.LogicalPlan.html) - Often encountered in Python code with variable name `rel` - Python serializable umbrella representation of the entire LogicalPlan that was generated by DataFusion - Provides access to `DaskTable` instances and type information for each table - Access to individual nodes in the logical plan tree. Ex: `TableScan` -- [`DaskSQLContext`](dask_planner/src/sql.rs) +- [`DaskSQLContext`](src/sql.rs) - Analogous to Python `Context` - Contains metadata about the tables, schemas, functions, operators, and configurations that are persent within the current execution context - When adding custom functions/UDFs this is the location that you would register them - Entry point for parsing SQL strings to sql node trees. This is the location Python will begin its interactions with Rust -- [`PyExpr`](dask_planner/src/expression.rs) -> [DataFusion Expr](https://docs.rs/datafusion/latest/datafusion/prelude/enum.Expr.html) +- [`PyExpr`](src/expression.rs) -> [DataFusion Expr](https://docs.rs/datafusion/latest/datafusion/prelude/enum.Expr.html) - Arguably where most of your time will be spent - Represents a single node in sql tree. 
Ex: `avg(age)` from `SELECT avg(age) FROM people` - Is associate with a single `RexType` - Can contain literal values or represent function calls, `avg()` for example - The expressions "index" in the tree can be retrieved by calling `PyExpr.index()` on an instance. This is useful when mapping frontend column names in Dask code to backend Dataframe columns - Certain `PyExpr`s contain operands. Ex: `2 + 2` would contain 3 operands. 1) A literal `PyExpr` instance with value 2 2) Another literal `PyExpr` instance with a value of 2. 3) A `+` `PyExpr` representing the addition of the 2 literals. -- [`DaskSqlOptimizer`](dask_planner/src/sql/optimizer.rs) +- [`DaskSqlOptimizer`](src/sql/optimizer.rs) - Registering location for all Dask-SQL specific logical plan optimizations - Optimizations that are written either custom or use from another source, DataFusion, are registered here in the order they are wished to be executed - Represents functions that modify/convert an original `PyLogicalPlan` into another `PyLogicalPlan` that would be more efficient when running in the underlying Dask framework -- [`RelDataType`](dask_planner/src/sql/types/rel_data_type.rs) +- [`RelDataType`](src/sql/types/rel_data_type.rs) - Not a fan of this name, was chosen to match existing Calcite logic - Represents a "row" in a table - Contains a list of "columns" that are present in that row - - [RelDataTypeField](dask_planner/src/sql/types/rel_data_type_field.rs) -- [RelDataTypeField](dask_planner/src/sql/types/rel_data_type_field.rs) + - [RelDataTypeField](src/sql/types/rel_data_type_field.rs) +- [RelDataTypeField](src/sql/types/rel_data_type_field.rs) - Represents an individual column in a table - Contains: - `qualifier` - schema the field belongs to - `name` - name of the column/field - `data_type` - `DaskTypeMap` instance containing information about the SQL type and underlying Arrow DataType - `index` - location of the field in the LogicalPlan -- [DaskTypeMap](dask_planner/src/sql/types.rs) +- 
[DaskTypeMap](src/sql/types.rs) - Maps a conventional SQL type to an underlying Arrow DataType diff --git a/dask_planner/Cargo.lock b/Cargo.lock similarity index 67% rename from dask_planner/Cargo.lock rename to Cargo.lock index 840ff8ad8..6c24af8d7 100644 --- a/dask_planner/Cargo.lock +++ b/Cargo.lock @@ -29,9 +29,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" dependencies = [ "memchr", ] @@ -51,6 +51,18 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -85,8 +97,8 @@ dependencies = [ "serde", "serde_json", "snap", - "strum", - "strum_macros", + "strum 0.24.1", + "strum_macros 0.24.3", "thiserror", "typed-builder", "uuid", @@ -107,15 +119,15 @@ checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" [[package]] name = "arrayvec" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" +checksum = "8868f09ff8cea88b079da74ae569d9b8c62a23c68c746240b704ee6f7525c89c" [[package]] name = "arrow" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990dfa1a9328504aa135820da1c95066537b69ad94c04881b785f64328e0fa6b" +checksum = 
"2feeebd77b34b0bc88f224e06d01c27da4733997cc4789a4e056196656cdc59a" dependencies = [ "ahash", "arrow-arith", @@ -136,9 +148,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b2e52de0ab54173f9b08232b7184c26af82ee7ab4ac77c83396633c90199fa" +checksum = "7173f5dc49c0ecb5135f52565af33afd3fdc9a12d13bd6f9973e8b96305e4b2e" dependencies = [ "arrow-array", "arrow-buffer", @@ -151,9 +163,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10849b60c17dbabb334be1f4ef7550701aa58082b71335ce1ed586601b2f423" +checksum = "63d7ea725f7d1f8bb2cffc53ef538557e95fc802e217d5be25122d402e22f3d0" dependencies = [ "ahash", "arrow-buffer", @@ -162,15 +174,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "num", ] [[package]] name = "arrow-buffer" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0746ae991b186be39933147117f8339eb1c4bbbea1c8ad37e7bf5851a1a06ba" +checksum = "bdbe439e077f484e5000b9e1d47b5e4c0d15f2b311a8f5bcc682553d5d67a722" dependencies = [ "half", "num", @@ -178,9 +190,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b88897802515d7b193e38b27ddd9d9e43923d410a9e46307582d756959ee9595" +checksum = "93913cc14875770aa1eef5e310765e855effa352c094cb1c7c00607d0f37b4e1" dependencies = [ "arrow-array", "arrow-buffer", @@ -189,15 +201,16 @@ dependencies = [ "arrow-select", "chrono", "comfy-table", + "half", "lexical-core", "num", ] [[package]] name = "arrow-csv" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1c8220d9741fc37961262710ceebd8451a5b393de57c464f0267ffdda1775c0a" +checksum = "ef55b67c55ed877e6fe7b923121c19dae5e31ca70249ea2779a17b58fb0fbd9a" dependencies = [ "arrow-array", "arrow-buffer", @@ -214,9 +227,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533f937efa1aaad9dc86f6a0e382c2fa736a4943e2090c946138079bdf060cef" +checksum = "d4f4f4a3c54614126a71ab91f6631c9743eb4643d6e9318b74191da9dc6e028b" dependencies = [ "arrow-buffer", "arrow-schema", @@ -226,9 +239,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18b75296ff01833f602552dff26a423fc213db8e5049b540ca4a00b1c957e41c" +checksum = "d41a3659f984a524ef1c2981d43747b24d8eec78e2425267fcd0ef34ce71cd18" dependencies = [ "arrow-array", "arrow-buffer", @@ -240,9 +253,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e501d3de4d612c90677594896ca6c0fa075665a7ff980dc4189bb531c17e19f6" +checksum = "10b95faa95a378f56ef32d84cc0104ea998c39ef7cd1faaa6b4cebf8ea92846d" dependencies = [ "arrow-array", "arrow-buffer", @@ -251,17 +264,18 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap", + "indexmap 2.0.0", "lexical-core", "num", + "serde", "serde_json", ] [[package]] name = "arrow-ord" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33d2671eb3793f9410230ac3efb0e6d36307be8a2dac5fad58ac9abde8e9f01e" +checksum = "c68549a4284d9f8b39586afb8d5ff8158b8f0286353a4844deb1d11cf1ba1f26" dependencies = [ "arrow-array", "arrow-buffer", @@ -274,9 +288,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "36.0.0" +version = "43.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc11fa039338cebbf4e29cf709c8ac1d6a65c7540063d4a25f991ab255ca85c8" +checksum = "0a75a4a757afc301ce010adadff54d79d66140c4282ed3de565f6ccb716a5cf3" dependencies = [ "ahash", "arrow-array", @@ -284,23 +298,23 @@ dependencies = [ "arrow-data", "arrow-schema", "half", - "hashbrown 0.13.2", + "hashbrown 0.14.0", ] [[package]] name = "arrow-schema" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d04f17f7b86ded0b5baf98fe6123391c4343e031acc3ccc5fa604cc180bff220" +checksum = "2bebcb57eef570b15afbcf2d07d813eb476fde9f6dd69c81004d6476c197e87e" dependencies = [ - "bitflags 2.2.1", + "bitflags 2.3.2", ] [[package]] name = "arrow-select" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "163e35de698098ff5f5f672ada9dc1f82533f10407c7a11e2cd09f3bcf31d18a" +checksum = "f6e2943fa433a48921e914417173816af64eef61c0a3d448280e6c40a62df221" dependencies = [ "arrow-array", "arrow-buffer", @@ -311,24 +325,25 @@ dependencies = [ [[package]] name = "arrow-string" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdfbed1b10209f0dc68e6aa4c43dc76079af65880965c7c3b73f641f23d4aba" +checksum = "bbc92ed638851774f6d7af1ad900b92bc1486746497511868b4298fcbcfa35af" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", + "num", "regex", - "regex-syntax 0.6.29", + "regex-syntax", ] [[package]] name = "async-compression" -version = "0.3.15" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a" +checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11" dependencies = [ "bzip2", "flate2", @@ -338,8 +353,8 @@ dependencies = [ "pin-project-lite", "tokio", "xz2", - "zstd 
0.11.2+zstd.1.5.2", - "zstd-safe 5.0.2+zstd.1.5.2", + "zstd", + "zstd-safe", ] [[package]] @@ -350,18 +365,18 @@ checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.23", ] [[package]] name = "async-trait" -version = "0.1.68" +version = "0.1.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" +checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.23", ] [[package]] @@ -372,9 +387,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" -version = "0.21.0" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" [[package]] name = "bitflags" @@ -384,9 +399,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.2.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a6904aef64d73cf10ab17ebace7befb918b82164785cb89907993be7f83813" +checksum = "6dbe3c979c178231552ecba20214a8272df4e09f232a87aef4320cf06539aded" [[package]] name = "blake2" @@ -399,9 +414,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.3.3" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" +checksum = "729b71f35bd3fa1a4c86b85d32c8b9069ea7fe14f7a53cfabb65f62d4265b888" dependencies = [ "arrayref", "arrayvec", @@ -441,32 +456,11 @@ dependencies = [ "alloc-stdlib", ] -[[package]] -name = "bstr" -version = "1.4.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" -dependencies = [ - "memchr", - "once_cell", - "regex-automata", - "serde", -] - -[[package]] -name = "btoi" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd6407f73a9b8b6162d8a2ef999fe6afd7cc15902ebf42c5cd296addf17e0ad" -dependencies = [ - "num-traits", -] - [[package]] name = "bumpalo" -version = "3.12.1" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "byteorder" @@ -518,17 +512,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.24" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" dependencies = [ + "android-tzdata", "iana-time-zone", - "js-sys", - "num-integer", "num-traits", "serde", - "time 0.1.45", - "wasm-bindgen", "winapi", ] @@ -554,30 +545,14 @@ dependencies = [ "phf_codegen", ] -[[package]] -name = "clru" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8191fa7302e03607ff0e237d4246cc043ff5b3cb9409d995172ba3bea16b807" - -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", -] - [[package]] name = "comfy-table" -version = "6.1.4" +version = "7.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" +checksum = "9ab77dbd8adecaf3f0db40581631b995f312a8a5ae3aa9993188bb8f23d83a5b" dependencies = [ - "strum", - "strum_macros", + "strum 0.24.1", + "strum_macros 0.24.3", "unicode-width", ] @@ -605,9 +580,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13418e745008f7349ec7e449155f419a61b92b58a99cc3616942b926825ec76b" +checksum = "21a53c0a4d288377e7415b53dcfc3c04da5cdc2cc95c8d5ac178b58f0b861ad6" [[package]] name = "core-foundation-sys" @@ -651,9 +626,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b015497079b9a9d69c02ad25de6c0a6edef051ea6360a327d0bd05802ef64ad" +checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086" dependencies = [ "csv-core", "itoa", @@ -670,50 +645,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "cxx" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" -dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", -] - -[[package]] -name = "cxx-build" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" -dependencies = [ - "cc", - "codespan-reporting", - "once_cell", - "proc-macro2", - "quote", - "scratch", - "syn 2.0.15", -] - -[[package]] -name = "cxxbridge-flags" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.15", -] - [[package]] name = "dashmap" version = "5.4.0" @@ -728,27 +659,29 @@ dependencies = [ ] [[package]] -name = "dask_planner" -version = "0.1.0" +name = "dask-sql" +version = "2023.10.1" dependencies = [ "async-trait", "datafusion-python", "env_logger", "log", "pyo3", - "pyo3-build-config", + "pyo3-build-config 0.20.0", "pyo3-log", ] [[package]] name = "datafusion" -version = "22.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bdb93fee4f30368f1f71bfd5cd28882ec9fab0183db7924827b76129d33227c" +checksum = "5ddbcb2dda5b5033537457992ebde78938014390b2b19f9f4282e3be0e18b0c3" dependencies = [ "ahash", "apache-avro", "arrow", + "arrow-array", + "arrow-schema", "async-compression", "async-trait", "bytes", @@ -760,14 +693,14 @@ dependencies = [ "datafusion-expr", "datafusion-optimizer", "datafusion-physical-expr", - "datafusion-row", "datafusion-sql", "flate2", "futures", "glob", - "hashbrown 0.13.2", - "indexmap", - "itertools", + "half", + "hashbrown 0.14.0", + "indexmap 2.0.0", + "itertools 0.11.0", "lazy_static", "log", "num-traits", @@ -782,19 +715,18 @@ dependencies = [ "sqlparser", "tempfile", "tokio", - "tokio-stream", "tokio-util", "url", "uuid", "xz2", - "zstd 0.12.3+zstd.1.5.2", + "zstd", ] [[package]] name = "datafusion-common" -version = "22.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82401ce129e601d406012b6d718f8978ba84c386e1c342fa155877120d68824" +checksum = "85fbb7b4da925031311743ab96662d55f0f7342d3692744f184f99b2257ef435" dependencies = [ "apache-avro", "arrow", @@ -809,14 +741,14 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "22.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b08b2078aed21a27239cd93f3015e492a58b0d50ebeeaf8d2236cf108ef583ce" +checksum = "5bb3617466d894eb0ad11d06bab1e6e89c571c0a27d660685d327d0c6e1e1ccd" dependencies = [ "dashmap", "datafusion-common", "datafusion-expr", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "log", "object_store", "parking_lot", @@ -827,21 +759,24 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "22.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b5b977ce9695fb4c67614266ec57f384fc11e9a9f9b3e6d0e62b9c5a9f2c1f" +checksum = "3bd8220a0dfcdfddcc785cd7e71770ef1ce54fbe1e08984e5adf537027ecb6de" dependencies = [ "ahash", "arrow", "datafusion-common", + "lazy_static", "sqlparser", + "strum 0.25.0", + "strum_macros 0.25.1", ] [[package]] name = "datafusion-optimizer" -version = "22.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0b2bb9e73ed778d1bc5af63a270f0154bf6eab5099c77668a6362296888e46b" +checksum = "1d685a100c66952aaadd0cbe766df46d1887d58fc8bcf3589e6387787f18492b" dependencies = [ "arrow", "async-trait", @@ -849,34 +784,37 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.13.2", - "itertools", + "hashbrown 0.14.0", + "itertools 0.11.0", "log", - "regex-syntax 0.6.29", + "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "22.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80cd8ea5ab0a07b1b2a3e17d5909f1b1035bd129ffeeb5c66842a32e682f8f79" +checksum = "0f2c635da9b05b4b4c6c8d935f46fd99f9b6225f834091cf4e3c8a045b68beab" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", "arrow-schema", + "base64", "blake2", "blake3", "chrono", "datafusion-common", "datafusion-expr", - "datafusion-row", "half", - "hashbrown 0.13.2", - "indexmap", - "itertools", + "hashbrown 0.14.0", + "hex", + "indexmap 2.0.0", + "itertools 0.11.0", "lazy_static", + 
"libc", + "log", "md-5", "paste", "petgraph", @@ -889,8 +827,9 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "22.0.0" -source = "git+https://github.com/apache/arrow-datafusion-python.git?rev=9493638#94936380e58a266f5dd5de6b70a06d3aa36fbe22" +version = "28.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a2441774e84875ae16a8b5277090ed6ab77ce94ab1820c315ed02cd3813de29" dependencies = [ "async-trait", "datafusion", @@ -903,33 +842,23 @@ dependencies = [ "mimalloc", "object_store", "parking_lot", + "prost", + "prost-types", "pyo3", - "pyo3-build-config", + "pyo3-build-config 0.19.2", "rand", - "regex-syntax 0.6.29", - "syn 2.0.15", + "regex-syntax", + "syn 2.0.23", "tokio", "url", "uuid", ] -[[package]] -name = "datafusion-row" -version = "22.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a95d6badab19fd6e9195fdc5209ac0a7e5ce9bcdedc67767b9ffc1b4e645760" -dependencies = [ - "arrow", - "datafusion-common", - "paste", - "rand", -] - [[package]] name = "datafusion-sql" -version = "22.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37a78f8fc67123c4357e63bc0c87622a2a663d26f074958d749a633d0ecde90f" +checksum = "b3ef8abf4dd84d3f20c910822b52779c035ab7f4f2d5e7125ede3bae618e9de8" dependencies = [ "arrow", "arrow-schema", @@ -941,63 +870,38 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "22.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae6ed64a2005f0d78f2b1b3ec3f8148183f4523d5d364e5367115f8d8a82b7df" +checksum = "2c97d351bbd6bd6497e7c9606ddd3c00cd63e9d185d7ab96fc8a66cf3c449177" dependencies = [ "async-recursion", "chrono", "datafusion", - "itertools", + "itertools 0.11.0", "object_store", "prost", + "prost-types", "substrait", "tokio", ] [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", "subtle", ] -[[package]] -name = "dirs" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "doc-comment" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" -[[package]] -name = "dunce" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56ce8c6da7551ec6c462cbaf3bfbc75131ebbfa1c944aeaa9dab51ca1c5f0c3b" - [[package]] name = "dyn-clone" version = "1.0.11" @@ -1032,6 +936,12 @@ dependencies = [ "termcolor", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.1" @@ -1040,7 +950,7 @@ checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -1062,18 +972,6 @@ dependencies = [ "instant", ] -[[package]] -name = "filetime" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" -dependencies = [ - "cfg-if", - "libc", - 
"redox_syscall 0.2.16", - "windows-sys 0.48.0", -] - [[package]] name = "fixedbitset" version = "0.4.2" @@ -1082,9 +980,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "23.1.21" +version = "23.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77f5399c2c9c50ae9418e522842ad362f61ee48b346ac106807bd355a8a7c619" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1108,9 +1006,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" dependencies = [ "percent-encoding", ] @@ -1171,7 +1069,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.23", ] [[package]] @@ -1216,555 +1114,28 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.9" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", -] - -[[package]] -name = "gix" -version = "0.43.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c256ea71cc1967faaefdaad15f334146b7c806f12460dcafd3afed845c8c78dd" -dependencies = [ - "gix-actor", - "gix-attributes", - "gix-config", - "gix-credentials", - "gix-date", - "gix-diff", - "gix-discover", - "gix-features 0.28.1", - "gix-glob", - "gix-hash 0.10.4", - "gix-hashtable", 
- "gix-index", - "gix-lock", - "gix-mailmap", - "gix-object", - "gix-odb", - "gix-pack", - "gix-path", - "gix-prompt", - "gix-ref", - "gix-refspec", - "gix-revision", - "gix-sec", - "gix-tempfile", - "gix-traverse", - "gix-url", - "gix-validate", - "gix-worktree", - "log", - "once_cell", - "signal-hook", - "smallvec", - "thiserror", - "unicode-normalization", -] - -[[package]] -name = "gix-actor" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc22b0cdc52237667c301dd7cdc6ead8f8f73c9f824e9942c8ebd6b764f6c0bf" -dependencies = [ - "bstr", - "btoi", - "gix-date", - "itoa", - "nom", - "thiserror", -] - -[[package]] -name = "gix-attributes" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2231a25934a240d0a4b6f4478401c73ee81d8be52de0293eedbc172334abf3e1" -dependencies = [ - "bstr", - "gix-features 0.28.1", - "gix-glob", - "gix-path", - "gix-quote", - "thiserror", - "unicode-bom", -] - -[[package]] -name = "gix-bitmap" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55a95f4942360766c3880bdb2b4b57f1ef73b190fc424755e7fdf480430af618" -dependencies = [ - "thiserror", -] - -[[package]] -name = "gix-chunk" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0d39583cab06464b8bf73b3f1707458270f0e7383cb24c3c9c1a16e6f792978" -dependencies = [ - "thiserror", -] - -[[package]] -name = "gix-command" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2c6f75c1e0f924de39e750880a6e21307194bb1ab773efe3c7d2d787277f8ab" -dependencies = [ - "bstr", -] - -[[package]] -name = "gix-config" -version = "0.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbad5ce54a8fc997acc50febd89ec80fa6e97cb7f8d0654cb229936407489d8" -dependencies = [ - "bstr", - "gix-config-value", - "gix-features 0.28.1", - "gix-glob", - "gix-path", - 
"gix-ref", - "gix-sec", - "log", - "memchr", - "nom", - "once_cell", - "smallvec", - "thiserror", - "unicode-bom", -] - -[[package]] -name = "gix-config-value" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d09154c0c8677e4da0ec35e896f56ee3e338e741b9599fae06075edd83a4081c" -dependencies = [ - "bitflags 1.3.2", - "bstr", - "gix-path", - "libc", - "thiserror", -] - -[[package]] -name = "gix-credentials" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "750b684197374518ea057e0a0594713e07683faa0a3f43c0f93d97f64130ad8d" -dependencies = [ - "bstr", - "gix-command", - "gix-config-value", - "gix-path", - "gix-prompt", - "gix-sec", - "gix-url", - "thiserror", -] - -[[package]] -name = "gix-date" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b96271912ce39822501616f177dea7218784e6c63be90d5f36322ff3a722aae2" -dependencies = [ - "bstr", - "itoa", - "thiserror", - "time 0.3.20", -] - -[[package]] -name = "gix-diff" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "103a0fa79b0d438f5ecb662502f052e530ace4fe1fe8e1c83c0c6da76d728e67" -dependencies = [ - "gix-hash 0.10.4", - "gix-object", - "imara-diff", - "thiserror", -] - -[[package]] -name = "gix-discover" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eba8ba458cb8f4a6c33409b0fe650b1258655175a7ffd1d24fafd3ed31d880b" -dependencies = [ - "bstr", - "dunce", - "gix-hash 0.10.4", - "gix-path", - "gix-ref", - "gix-sec", - "thiserror", -] - -[[package]] -name = "gix-features" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b76f9a80f6dd7be66442ae86e1f534effad9546676a392acc95e269d0c21c22" -dependencies = [ - "crc32fast", - "flate2", - "gix-hash 0.10.4", - "libc", - "once_cell", - "prodash", - "sha1_smol", - "thiserror", - "walkdir", -] - 
-[[package]] -name = "gix-features" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf69b0f5c701cc3ae22d3204b671907668f6437ca88862d355eaf9bc47a4f897" -dependencies = [ - "gix-hash 0.11.1", - "libc", -] - -[[package]] -name = "gix-fs" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b37a1832f691fdc09910bd267f9a2e413737c1f9ec68c6e31f9e802616278a9" -dependencies = [ - "gix-features 0.29.0", -] - -[[package]] -name = "gix-glob" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93e43efd776bc543f46f0fd0ca3d920c37af71a764a16f2aebd89765e9ff2993" -dependencies = [ - "bitflags 1.3.2", - "bstr", -] - -[[package]] -name = "gix-hash" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a258595457bc192d1f1c59d0d168a1e34e2be9b97a614e14995416185de41a7" -dependencies = [ - "hex", - "thiserror", -] - -[[package]] -name = "gix-hash" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "078eec3ac2808cc03f0bddd2704cb661da5c5dc33b41a9d7947b141d499c7c42" -dependencies = [ - "hex", - "thiserror", -] - -[[package]] -name = "gix-hashtable" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e55e40dfd694884f0eb78796c5bddcf2f8b295dace47039099dd7e76534973" -dependencies = [ - "gix-hash 0.10.4", - "hashbrown 0.13.2", - "parking_lot", -] - -[[package]] -name = "gix-index" -version = "0.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "717ab601ece7921f59fe86849dbe27d44a46ebb883b5885732c4f30df4996177" -dependencies = [ - "bitflags 1.3.2", - "bstr", - "btoi", - "filetime", - "gix-bitmap", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-lock", - "gix-object", - "gix-traverse", - "itoa", - "memmap2", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-lock" -version = 
"5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c693d7f05730fa74a7c467150adc7cea393518410c65f0672f80226b8111555" -dependencies = [ - "gix-tempfile", - "gix-utils", - "thiserror", -] - -[[package]] -name = "gix-mailmap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b66aea5e52875cd4915f4957a6f4b75831a36981e2ec3f5fad9e370e444fe1a" -dependencies = [ - "bstr", - "gix-actor", - "thiserror", -] - -[[package]] -name = "gix-object" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8df068db9180ee935fbb70504848369e270bdcb576b05c0faa8b9fd3b86fc017" -dependencies = [ - "bstr", - "btoi", - "gix-actor", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-validate", - "hex", - "itoa", - "nom", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-odb" -version = "0.43.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83af2e3e36005bfe010927f0dff41fb5acc3e3d89c6f1174135b3a34086bda2" -dependencies = [ - "arc-swap", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-object", - "gix-pack", - "gix-path", - "gix-quote", - "parking_lot", - "tempfile", - "thiserror", -] - -[[package]] -name = "gix-pack" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9401911c7fe032ad7b31c6a6b5be59cb283d1d6c999417a8215056efe6d635f3" -dependencies = [ - "clru", - "gix-chunk", - "gix-diff", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "gix-path", - "gix-tempfile", - "gix-traverse", - "memmap2", - "parking_lot", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-path" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32370dce200bb951df013e03dff35b4233fc7a89458642b047629b91734a7e19" -dependencies = [ - "bstr", - "thiserror", -] - -[[package]] -name = "gix-prompt" -version = "0.3.3" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f3034d4d935aef2c7bf719aaa54b88c520e82413118d886ae880a31d5bdee57" -dependencies = [ - "gix-command", - "gix-config-value", - "nix", - "parking_lot", - "thiserror", -] - -[[package]] -name = "gix-quote" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a282f5a8d9ee0b09ec47390ac727350c48f2f5c76d803cd8da6b3e7ad56e0bcb" -dependencies = [ - "bstr", - "btoi", - "thiserror", -] - -[[package]] -name = "gix-ref" -version = "0.27.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e909396ed3b176823991ccc391c276ae2a015e54edaafa3566d35123cfac9d" -dependencies = [ - "gix-actor", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-lock", - "gix-object", - "gix-path", - "gix-tempfile", - "gix-validate", - "memmap2", - "nom", - "thiserror", + "wasi", ] [[package]] -name = "gix-refspec" -version = "0.9.0" +name = "git2" +version = "0.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba332462bda2e8efeae4302b39a6ed01ad56ef772fd5b7ef197cf2798294d65" -dependencies = [ - "bstr", - "gix-hash 0.10.4", - "gix-revision", - "gix-validate", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-revision" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c6f6ff53f888858afc24bf12628446a14279ceec148df6194481f306f553ad2" -dependencies = [ - "bstr", - "gix-date", - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "thiserror", -] - -[[package]] -name = "gix-sec" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8ffa5bf0772f9b01de501c035b6b084cf9b8bb07dec41e3afc6a17336a65f47" +checksum = "7b989d6a7ca95a362cf2cfc5ad688b3a467be1f87e480b8dad07fee8c79b0044" dependencies = [ "bitflags 1.3.2", - "dirs", - "gix-path", - "libc", - "windows 0.43.0", -] - -[[package]] -name = "gix-tempfile" -version = "5.0.3" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71a0d32f34e71e86586124225caefd78dabc605d0486de580d717653addf182" -dependencies = [ - "gix-fs", "libc", - "once_cell", - "parking_lot", - "signal-hook", - "signal-hook-registry", - "tempfile", -] - -[[package]] -name = "gix-traverse" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9a4a07bb22168dc79c60e1a6a41919d198187ca83d8a5940ad8d7122a45df3" -dependencies = [ - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "thiserror", -] - -[[package]] -name = "gix-url" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6a22b4b32ad14d68f7b7fb6458fa58d44b01797d94c1b8f4db2d9c7b3c366b5" -dependencies = [ - "bstr", - "gix-features 0.28.1", - "gix-path", - "home", - "thiserror", + "libgit2-sys", + "log", "url", ] -[[package]] -name = "gix-utils" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c10b69beac219acb8df673187a1f07dde2d74092f974fb3f9eb385aeb667c909" -dependencies = [ - "fastrand", -] - -[[package]] -name = "gix-validate" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bd629d3680773e1785e585d76fd4295b740b559cad9141517300d99a0c8c049" -dependencies = [ - "bstr", - "thiserror", -] - -[[package]] -name = "gix-worktree" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54ec9a000b4f24af706c3cc680c7cda235656cbe3216336522f5692773b8a301" -dependencies = [ - "bstr", - "gix-attributes", - "gix-features 0.28.1", - "gix-glob", - "gix-hash 0.10.4", - "gix-index", - "gix-object", - "gix-path", - "io-close", - "thiserror", -] - [[package]] name = "glob" version = "0.3.1" @@ -1773,9 +1144,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.18" +version = "0.3.19" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" +checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" dependencies = [ "bytes", "fnv", @@ -1783,7 +1154,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap", + "indexmap 1.9.3", "slab", "tokio", "tokio-util", @@ -1815,6 +1186,16 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +dependencies = [ + "ahash", + "allocator-api2", +] + [[package]] name = "heck" version = "0.4.1" @@ -1842,15 +1223,6 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -[[package]] -name = "home" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" -dependencies = [ - "windows-sys 0.48.0", -] - [[package]] name = "http" version = "0.2.9" @@ -1917,9 +1289,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.23.2" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" +checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http", "hyper", @@ -1930,56 +1302,55 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.56" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" dependencies = [ "android_system_properties", "core-foundation-sys", 
"iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows 0.48.0", + "windows", ] [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "cxx", - "cxx-build", + "cc", ] [[package]] name = "idna" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" dependencies = [ "unicode-bidi", "unicode-normalization", ] [[package]] -name = "imara-diff" -version = "0.1.5" +name = "indexmap" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e98c1d0ad70fc91b8b9654b1f33db55e59579d3b3de2bffdced0fdb810570cb8" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ - "ahash", + "autocfg", "hashbrown 0.12.3", ] [[package]] name = "indexmap" -version = "1.9.3" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" dependencies = [ - "autocfg", - "hashbrown 0.12.3", + "equivalent", + "hashbrown 0.14.0", ] [[package]] @@ -2003,25 +1374,15 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-close" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cadcf447f06744f8ce713d2d6239bb5bde2c357a452397a9ed90c625da390bc" -dependencies = [ - "libc", - "winapi", -] - [[package]] name 
= "io-lifetimes" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ "hermit-abi 0.3.1", "libc", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -2039,7 +1400,7 @@ dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", "rustix", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -2051,6 +1412,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.6" @@ -2068,9 +1438,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -2147,15 +1517,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.142" +version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" +checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" [[package]] name = "libflate" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97822bf791bd4d5b403713886a5fbe8bf49520fe78e323b0dc480ca1a03e50b0" +checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18" dependencies = [ "adler32", "crc32fast", @@ -2168,14 +1538,26 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a52d3a8bfc85f250440e4424db7d857e241a3aebbbe301f3eb606ab15c39acbf" dependencies = [ - "rle-decode-fast", + "rle-decode-fast", +] + +[[package]] +name = "libgit2-sys" +version = "0.15.2+1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a80df2e11fb4a61f4ba2ab42dbe7f74468da143f1a75c74e11dee7c813f694fa" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", ] [[package]] name = "libm" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" [[package]] name = "libmimalloc-sys" @@ -2188,25 +1570,28 @@ dependencies = [ ] [[package]] -name = "link-cplusplus" -version = "1.0.8" +name = "libz-sys" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +checksum = "56ee889ecc9568871456d42f603d6a0ce59ff328d291063a45cbdf0036baf6db" dependencies = [ "cc", + "libc", + "pkg-config", + "vcpkg", ] [[package]] name = "linux-raw-sys" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "lock_api" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ "autocfg", "scopeguard", @@ -2214,12 +1599,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.17" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" -dependencies = [ - "cfg-if", -] +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "lz4" @@ -2267,20 +1649,11 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" -[[package]] -name = "memmap2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" -dependencies = [ - "libc", -] - [[package]] name = "memoffset" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -2300,12 +1673,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - [[package]] name = "miniz_oxide" version = "0.7.1" @@ -2317,14 +1684,13 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", - "log", - "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.45.0", + "wasi", + "windows-sys", ] [[package]] @@ -2333,28 +1699,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "nix" -version = "0.26.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" -dependencies = [ - "bitflags 1.3.2", - "cfg-if", - "libc", - "static_assertions", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - [[package]] name = "num" version = "0.4.0" @@ -2442,27 +1786,20 @@ dependencies = [ "libc", ] -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" -dependencies = [ - "libc", -] - [[package]] name = "object_store" -version = "0.5.6" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec9cd6ca25e796a49fa242876d1c4de36a24a6da5258e9f0bc062dbf5e81c53b" +checksum = "27c776db4f332b571958444982ff641d2531417a326ca368995073b639205d58" dependencies = [ "async-trait", "base64", "bytes", "chrono", "futures", - "itertools", + "humantime", + "hyper", + "itertools 0.10.5", "parking_lot", "percent-encoding", "quick-xml", @@ -2481,9 +1818,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.1" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "ordered-float" @@ -2506,22 +1843,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.7" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.2.16", + "redox_syscall", "smallvec", - "windows-sys 0.45.0", + "windows-targets", ] [[package]] name = "parquet" -version = "36.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "321a15f8332645759f29875b07f8233d16ed8ec1b3582223de81625a9f8506b7" +checksum = "ec7267a9607c3f955d4d0ac41b88a67cecc0d8d009173ad3da390699a6cb3750" dependencies = [ "ahash", "arrow-array", @@ -2537,17 +1874,18 @@ dependencies = [ "chrono", "flate2", "futures", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "lz4", "num", "num-bigint", + "object_store", "paste", "seq-macro", "snap", "thrift", "tokio", "twox-hash", - "zstd 0.12.3+zstd.1.5.2", + "zstd", ] [[package]] @@ -2567,9 +1905,9 @@ checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" [[package]] name = "percent-encoding" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "petgraph" @@ -2578,7 +1916,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" dependencies = [ "fixedbitset", - "indexmap", + "indexmap 1.9.3", ] [[package]] @@ -2645,12 +1983,12 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - 
"syn 2.0.15", + "syn 2.0.23", ] [[package]] @@ -2661,19 +1999,13 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.56" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" +checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" dependencies = [ "unicode-ident", ] -[[package]] -name = "prodash" -version = "23.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9516b775656bc3e8985e19cd4b8c0c0de045095074e453d2c0a513b5f978392d" - [[package]] name = "prost" version = "0.11.9" @@ -2692,7 +2024,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck", - "itertools", + "itertools 0.10.5", "lazy_static", "log", "multimap", @@ -2711,7 +2043,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "proc-macro2", "quote", "syn 1.0.109", @@ -2728,16 +2060,16 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b1ac5b3731ba34fdaa9785f8d74d17448cd18f30cf19e0c7e7b1fdb5272109" +checksum = "ffb88ae05f306b4bfcde40ac4a51dc0b05936a9207a4b75b798c7729c4258a59" dependencies = [ "cfg-if", "indoc", "libc", "memoffset", "parking_lot", - "pyo3-build-config", + "pyo3-build-config 0.19.2", "pyo3-ffi", "pyo3-macros", "unindent", @@ -2745,9 +2077,19 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.18.3" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "076c73d0bc438f7a4ef6fdd0c3bb4732149136abd952b110ac93e4edb13a6ba5" +dependencies = [ + "once_cell", + 
"target-lexicon", +] + +[[package]] +name = "pyo3-build-config" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cb946f5ac61bb61a5014924910d936ebd2b23b705f7a4a3c40b05c720b079a3" +checksum = "a96fe70b176a89cff78f2fa7b3c930081e163d5379b4dcdf993e3ae29ca662e5" dependencies = [ "once_cell", "target-lexicon", @@ -2755,19 +2097,19 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd4d7c5337821916ea2a1d21d1092e8443cf34879e53a0ac653fbb98f44ff65c" +checksum = "922ede8759e8600ad4da3195ae41259654b9c55da4f7eec84a0ccc7d067a70a4" dependencies = [ "libc", - "pyo3-build-config", + "pyo3-build-config 0.19.2", ] [[package]] name = "pyo3-log" -version = "0.8.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9c8b57fe71fb5dcf38970ebedc2b1531cf1c14b1b9b4c560a182a57e115575c" +checksum = "4c10808ee7250403bedb24bc30c32493e93875fef7ba3e4292226fe924f398bd" dependencies = [ "arc-swap", "log", @@ -2776,9 +2118,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d39c55dab3fc5a4b25bbd1ac10a2da452c4aca13bb450f22818a002e29648d" +checksum = "8a5caec6a1dd355964a841fcbeeb1b89fe4146c87295573f94228911af3cc5a2" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2788,9 +2130,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97daff08a4c48320587b5224cc98d609e3c27b6d437315bd40b605c98eeb5918" +checksum = "e0b78ccbb160db1556cdb6fd96c50334c5d4ec44dc5e0a968d0a1208fa0efa8b" dependencies = [ "proc-macro2", "quote", @@ -2815,9 +2157,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.26" +version = "1.0.29" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" dependencies = [ "proc-macro2", ] @@ -2852,15 +2194,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.3.5" @@ -2870,51 +2203,28 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", -] - [[package]] name = "regex" -version = "1.8.1" +version = "1.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.1", + "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - -[[package]] -name = "regex-syntax" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c" +checksum = 
"436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" [[package]] name = "regress" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d995d590bd8ec096d1893f414bf3f5e8b0ee4c9eed9a5642b9766ef2c8e2e8e9" +checksum = "82a9ecfa0cb04d0b04dddb99b8ccf4f66bc8dfd23df694b398570bd8ae3a50fb" dependencies = [ "hashbrown 0.13.2", "memchr", @@ -2922,9 +2232,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.17" +version = "0.11.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13293b639a097af28fc8a90f22add145a9c954e49d77da06263d58cf44d5fb91" +checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" dependencies = [ "base64", "bytes", @@ -2993,28 +2303,28 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.19" +version = "0.37.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" +checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" dependencies = [ "bitflags 1.3.2", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] name = "rustls" -version = "0.20.8" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" +checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e" dependencies = [ "log", "ring", + "rustls-webpki", "sct", - "webpki", ] [[package]] @@ -3026,6 +2336,16 @@ dependencies = [ "base64", ] +[[package]] +name = "rustls-webpki" +version = "0.100.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -3077,12 +2397,6 @@ version = 
"1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "scratch" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" - [[package]] name = "sct" version = "0.7.0" @@ -3107,22 +2421,22 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.160" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.160" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" +checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.23", ] [[package]] @@ -3149,13 +2463,14 @@ dependencies = [ [[package]] name = "serde_tokenstream" -version = "0.1.7" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "797ba1d80299b264f3aac68ab5d12e5825a561749db4df7cd7c8083900c5d4e9" +checksum = "8a00ffd23fd882d096f09fcaae2a9de8329a328628e86027e049ee051dc1621f" dependencies = [ "proc-macro2", + "quote", "serde", - "syn 1.0.109", + "syn 2.0.23", ] [[package]] @@ -3176,19 +2491,13 @@ version = "0.9.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9d684e3ec7de3bf5466b32bd75303ac16f0736426e5a4e0d6e489559ce1249c" dependencies = [ - "indexmap", + "indexmap 1.9.3", "itoa", "ryu", "serde", "unsafe-libyaml", ] -[[package]] -name = "sha1_smol" -version = "1.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" - [[package]] name = "sha2" version = "0.10.6" @@ -3200,25 +2509,6 @@ dependencies = [ "digest", ] -[[package]] -name = "signal-hook" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" -dependencies = [ - "libc", - "signal-hook-registry", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" -dependencies = [ - "libc", -] - [[package]] name = "siphasher" version = "0.3.10" @@ -3286,9 +2576,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "sqlparser" -version = "0.32.0" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0366f270dbabb5cc2e4c88427dc4c08bba144f81e32fbd459a013f26a4d16aa0" +checksum = "ca597d77c98894be1f965f2e4e2d2a61575d4998088e655476c73715c54b2b43" dependencies = [ "log", "sqlparser_derive", @@ -3317,6 +2607,15 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros 0.25.1", +] + [[package]] name = "strum_macros" version = "0.24.3" @@ -3330,13 +2629,26 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "strum_macros" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6069ca09d878a33f883cc06aaa9718ede171841d3832450354410b718b097232" +dependencies = [ + "heck", + 
"proc-macro2", + "quote", + "rustversion", + "syn 2.0.23", +] + [[package]] name = "substrait" -version = "0.7.5" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ae64fb7ad0670c7d6d53d57b1b91beb2212afc30e164cc8edb02d6b2cff32a" +checksum = "2ac1ce8315086b127ca0abf162c62279550942bb26ebf7946fe17fe114446472" dependencies = [ - "gix", + "git2", "heck", "prettyplease", "prost", @@ -3347,16 +2659,16 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.15", + "syn 2.0.23", "typify", "walkdir", ] [[package]] name = "subtle" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" @@ -3371,9 +2683,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.15" +version = "2.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" +checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" dependencies = [ "proc-macro2", "quote", @@ -3388,15 +2700,16 @@ checksum = "fd1ba337640d60c3e96bc6f0638a939b9c9a7f2c316a1598c279828b3d1dc8c5" [[package]] name = "tempfile" -version = "3.5.0" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" dependencies = [ + "autocfg", "cfg-if", "fastrand", - "redox_syscall 0.3.5", + "redox_syscall", "rustix", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -3425,7 +2738,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.23", ] [[package]] @@ -3439,46 
+2752,6 @@ dependencies = [ "ordered-float", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "time" -version = "0.3.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" -dependencies = [ - "itoa", - "libc", - "num_threads", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" - -[[package]] -name = "time-macros" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" -dependencies = [ - "time-core", -] - [[package]] name = "tiny-keccak" version = "2.0.2" @@ -3505,9 +2778,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.28.0" +version = "1.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" +checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" dependencies = [ "autocfg", "bytes", @@ -3518,7 +2791,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -3529,29 +2802,17 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.23", ] [[package]] name = "tokio-rustls" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ "rustls", "tokio", - "webpki", -] - -[[package]] -name = "tokio-stream" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", ] [[package]] @@ -3594,14 +2855,14 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.23", ] [[package]] name = "tracing-core" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", ] @@ -3641,9 +2902,9 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "typify" -version = "0.0.11" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bfde96849e25d7feef1bbf652e9cfc51deb63203fdc07b115b8bc3bcfe20b9" +checksum = "be9bb640c0eece20cac2028ebbc2ca1a3d17e3b1ddd98540309c309ed178d158" dependencies = [ "typify-impl", "typify-macro", @@ -3651,9 +2912,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.0.11" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95d27d749378ceab6ec22188ed7ad102205c89ddb92ab662371c850ffc71aa1a" +checksum = "5c8d9ecedde2fd77e975c38eeb9ca40b34ad0247b2259c6e6bbd2a8d6cc2444f" dependencies = [ "heck", "log", @@ -3662,16 +2923,16 @@ dependencies = [ "regress", "schemars", "serde_json", - "syn 1.0.109", + "syn 2.0.23", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = 
"0.0.11" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35db6fc2bd9220ecdac6eeb88158824b83610de3dda0c6d0f2142b49efd858b0" +checksum = "c08942cd65d458d2da15777a649cb6400cb545f17964f1ca965583f22e9cc3a9" dependencies = [ "proc-macro2", "quote", @@ -3679,7 +2940,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 1.0.109", + "syn 2.0.23", "typify-impl", ] @@ -3689,17 +2950,11 @@ version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" -[[package]] -name = "unicode-bom" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63ec69f541d875b783ca40184d655f2927c95f0bffd486faa83cd3ac3529ec32" - [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" [[package]] name = "unicode-normalization" @@ -3742,9 +2997,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.3.1" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" dependencies = [ "form_urlencoded", "idna", @@ -3753,14 +3008,20 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.2" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dad5567ad0cf5b760e5665964bec1b47dfd077ba8a2544b513f3556d3d239a2" +checksum = "0fa2982af2eec27de306107c027578ff7f423d65f7250e40ce0fea8f45248b81" dependencies = [ "getrandom", "serde", ] +[[package]] +name = "vcpkg" +version = "0.2.15" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -3787,12 +3048,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -3801,9 +3056,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3811,24 +3066,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.23", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.34" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" dependencies = [ "cfg-if", "js-sys", @@ -3838,9 +3093,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3848,22 +3103,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.23", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-streams" @@ -3880,9 +3135,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.61" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" dependencies = [ "js-sys", "wasm-bindgen", @@ -3949,37 +3204,13 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows" -version = "0.43.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04662ed0e3e5630dfa9b26e4cb823b817f1a9addda855d973a9458c236556244" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - 
"windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" dependencies = [ - "windows-targets 0.48.0", -] - -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", + "windows-targets", ] [[package]] @@ -3988,22 +3219,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.48.0", -] - -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows-targets", ] [[package]] @@ -4012,93 +3228,51 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - [[package]] name = "windows_aarch64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - [[package]] name = "windows_i686_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - [[package]] name = "windows_i686_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - [[package]] name = "windows_x86_64_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" -[[package]] 
-name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - [[package]] name = "windows_x86_64_msvc" version = "0.48.0" @@ -4144,32 +3318,13 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe 5.0.2+zstd.1.5.2", -] - [[package]] name = "zstd" version = "0.12.3+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" dependencies = [ - "zstd-safe 6.0.5+zstd.1.5.4", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 000000000..867727871 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "dask-sql" +repository = "https://github.com/dask-contrib/dask-sql" +version = "2023.10.1" +description = "Bindings for DataFusion used by Dask-SQL" +readme = "README.md" +license = "Apache-2.0" +edition = "2021" +rust-version = "1.65" +include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", 
"Cargo.lock"] + +[dependencies] +async-trait = "0.1.74" +datafusion-python = "28.0.0" +env_logger = "0.10" +log = "^0.4" +pyo3 = { version = "0.19.1", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3-log = "0.9.0" + +[build-dependencies] +pyo3-build-config = "0.20.0" + +[lib] +name = "dask_sql" +crate-type = ["cdylib", "rlib"] + +[profile.release] +lto = true +codegen-units = 1 diff --git a/MANIFEST.in b/MANIFEST.in index d0108fedd..05f034af2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,2 @@ recursive-include dask_sql *.yaml recursive-include dask_planner * - -include versioneer.py -include dask_sql/_version.py diff --git a/README.md b/README.md index 3a034b9ff..7359aae2d 100644 --- a/README.md +++ b/README.md @@ -110,10 +110,7 @@ After that, you can install the package in development mode pip install -e ".[dev]" The Rust DataFusion bindings are built as part of the `pip install`. -If changes are made to the Rust source in `dask_planner/`, another build/install must be run to recompile the bindings: - - python setup.py build install - +Note that if changes are made to the Rust source in `src/`, another build must be run to recompile the bindings. This repository uses [pre-commit](https://pre-commit.com/) hooks. 
To install them, call pre-commit install diff --git a/conftest.py b/conftest.py index d840bfd86..62559c061 100644 --- a/conftest.py +++ b/conftest.py @@ -7,17 +7,28 @@ def pytest_addoption(parser): parser.addoption("--rungpu", action="store_true", help="run tests meant for GPU") parser.addoption("--runqueries", action="store_true", help="run test queries") + parser.addoption("--data_dir", help="specify file path to the data") + parser.addoption("--queries_dir", help="specify file path to the queries") def pytest_runtest_setup(item): + # TODO: get pyarrow strings and p2p shuffle working + dask.config.set({"dataframe.convert-string": False}) + dask.config.set({"dataframe.shuffle.algorithm": "tasks"}) if "gpu" in item.keywords: if not item.config.getoption("--rungpu"): pytest.skip("need --rungpu option to run") - # FIXME: P2P shuffle isn't fully supported on GPU, so we must explicitly disable it - dask.config.set({"dataframe.shuffle.algorithm": "tasks"}) # manually enable cudf decimal support dask.config.set({"sql.mappings.decimal_support": "cudf"}) - else: - dask.config.set({"dataframe.shuffle.algorithm": None}) if "queries" in item.keywords and not item.config.getoption("--runqueries"): pytest.skip("need --runqueries option to run") + + +@pytest.fixture(scope="session") +def data_dir(request): + return request.config.getoption("--data_dir") + + +@pytest.fixture(scope="session") +def queries_dir(request): + return request.config.getoption("--queries_dir") diff --git a/continuous_integration/environment-3.10.yaml b/continuous_integration/environment-3.10.yaml index 7add8b087..b9612daec 100644 --- a/continuous_integration/environment-3.10.yaml +++ b/continuous_integration/environment-3.10.yaml @@ -4,24 +4,24 @@ channels: dependencies: - c-compiler - dask>=2022.3.0 -# FIXME: handling is needed for httpx-based fastapi>=0.87.0 -- fastapi>=0.69.0,<0.87.0 +- fastapi>=0.92.0 - fugue>=0.7.3 +- httpx>=0.24.1 - intake>=0.6.0 - jsonschema - libprotobuf=3 - lightgbm -- 
maturin>=0.12.8 -- mlflow>=1.20 +- maturin>=1.1,<1.2 +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock -# tpot imports fail with numpy >=1.24.0 -# https://github.com/EpistasisLab/tpot/issues/1281 -- numpy<1.24.0 +- numpy>=1.21.6 - pandas>=1.4.0 - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov @@ -30,9 +30,10 @@ dependencies: - pytest - python=3.10 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 -- tpot +- tpot>=0.12.0 - tzlocal>=2.1 - uvicorn>=0.13.4 +- libprotobuf=3 +- zlib diff --git a/continuous_integration/environment-3.11.yaml b/continuous_integration/environment-3.11.yaml index 256aa20ee..55f848351 100644 --- a/continuous_integration/environment-3.11.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -4,24 +4,24 @@ channels: dependencies: - c-compiler - dask>=2022.3.0 -# FIXME: handling is needed for httpx-based fastapi>=0.87.0 -- fastapi>=0.69.0,<0.87.0 +- fastapi>=0.92.0 - fugue>=0.7.3 +- httpx>=0.24.1 - intake>=0.6.0 - jsonschema - libprotobuf=3 - lightgbm -- maturin>=0.12.8 -- mlflow>=1.20 +- maturin>=1.1,<1.2 +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock -# tpot imports fail with numpy >=1.24.0 -# https://github.com/EpistasisLab/tpot/issues/1281 -- numpy<1.24.0 +- numpy>=1.21.6 - pandas>=1.4.0 - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov @@ -30,9 +30,10 @@ dependencies: - pytest - python=3.11 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 -- tpot +- tpot>=0.12.0 - tzlocal>=2.1 - uvicorn>=0.13.4 +- libprotobuf=3 +- zlib diff --git a/continuous_integration/environment-3.9.yaml b/continuous_integration/environment-3.9.yaml index 1d1de0aba..9ae90d561 100644 --- 
a/continuous_integration/environment-3.9.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -4,23 +4,24 @@ channels: dependencies: - c-compiler - dask=2022.3.0 -- fastapi=0.69.0 +- fastapi=0.92.0 - fugue=0.7.3 +- httpx=0.24.1 - intake=0.6.0 - jsonschema - libprotobuf=3 - lightgbm -- maturin=0.12.8 -- mlflow=1.20 +- maturin=1.1 +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock -# tpot imports fail with numpy >=1.24.0 -# https://github.com/EpistasisLab/tpot/issues/1281 -- numpy<1.24.0 +- numpy=1.21.6 - pandas=1.4.0 - pre-commit - prompt_toolkit=3.0.8 - psycopg2 -- pyarrow=6.0.1 +- pyarrow=6.0.2 - pygments=2.7.1 - pyhive - pytest-cov @@ -29,9 +30,10 @@ dependencies: - pytest - python=3.9 - scikit-learn=1.0.0 -- setuptools-rust=1.5.2 - sphinx - sqlalchemy<2 -- tpot +- tpot>=0.12.0 - tzlocal=2.1 - uvicorn=0.13.4 +- libprotobuf=3 +- zlib diff --git a/continuous_integration/gpuci/axis.yaml b/continuous_integration/gpuci/axis.yaml index a7029f427..dd7525507 100644 --- a/continuous_integration/gpuci/axis.yaml +++ b/continuous_integration/gpuci/axis.yaml @@ -3,12 +3,12 @@ PYTHON_VER: - "3.10" CUDA_VER: -- "11.5" +- "11.5.2" LINUX_VER: -- ubuntu18.04 +- ubuntu20.04 RAPIDS_VER: -- "23.06" +- "23.12" excludes: diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh index 790e75540..a26d236b9 100644 --- a/continuous_integration/gpuci/build.sh +++ b/continuous_integration/gpuci/build.sh @@ -37,6 +37,9 @@ gpuci_logger "Activate conda env" . 
/opt/conda/etc/profile.d/conda.sh conda activate dask_sql +gpuci_logger "Update conda env" +gpuci_mamba_retry env update -n dask_sql -f continuous_integration/gpuci/environment-${PYTHON_VER}.yaml + gpuci_logger "Install awscli" gpuci_mamba_retry install -y -c conda-forge awscli @@ -46,12 +49,6 @@ gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}parquet_2gb_sor gpuci_logger "Download query files" gpuci_retry aws s3 cp --only-show-errors "${DASK_SQL_BUCKET_NAME}queries/" tests/unit/queries/ --recursive -# TODO: source install once dask/distributed are unpinned by dask-cuda -# gpuci_logger "Install dask" -# python -m pip install git+https://github.com/dask/dask -# gpuci_logger "Install distributed" -# python -m pip install git+https://github.com/dask/distributed - gpuci_logger "Install dask-sql" pip install -e . -vv diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 5afdb6e4f..14d1cfb95 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -7,22 +7,26 @@ channels: - nodefaults dependencies: - c-compiler +- zlib - dask>=2022.3.0 -# FIXME: handling is needed for httpx-based fastapi>=0.87.0 -- fastapi>=0.69.0,<0.87.0 +- fastapi>=0.92.0 - fugue>=0.7.3 +- httpx>=0.24.1 - intake>=0.6.0 - jsonschema - libprotobuf=3 - lightgbm -- maturin>=0.12.8 -- mlflow>=1.20 +- maturin>=1.1,<1.2 +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock +- numpy>=1.21.6 - pandas>=1.4.0 - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov @@ -31,21 +35,23 @@ dependencies: - pytest - python=3.10 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 -- tpot +- tpot>=0.12.0 - tzlocal>=2.1 - uvicorn>=0.13.4 # GPU-specific requirements - cudatoolkit=11.5 -- cudf=23.06 -- cuml=23.06 
-- dask-cudf=23.06 -- dask-cuda=23.06 -# tpot imports fail with numpy >=1.24.0 -# https://github.com/EpistasisLab/tpot/issues/1281 -- numpy>=1.20.1, <1.24.0 +- cudf=23.12 +- cuml=23.12 +- dask-cudf=23.12 +- dask-cuda=23.12 - ucx-proc=*=gpu -- ucx-py=0.32 -- xgboost=*rapidsai23.06 +- ucx-py=0.35 +- xgboost=*=rapidsai_py* +- libxgboost=*=rapidsai_h* +# TODO: unpin after RAPIDS 23.12 release +# - pip +# - pip: +# - git+https://github.com/dask/dask +# - git+https://github.com/dask/distributed diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 6de3a1217..ca1a8ca10 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -7,21 +7,26 @@ channels: - nodefaults dependencies: - c-compiler +- zlib - dask=2022.3.0 -- fastapi=0.69.0 +- fastapi=0.92.0 - fugue=0.7.3 +- httpx=0.24.1 - intake=0.6.0 - jsonschema - libprotobuf=3 - lightgbm -- maturin=0.12.8 -- mlflow>=1.20 +- maturin=1.1 +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock +- numpy=1.21.6 - pandas=1.4.0 - pre-commit - prompt_toolkit=3.0.8 - psycopg2 -- pyarrow=6.0.1 +- pyarrow=6.0.2 - pygments=2.7.1 - pyhive - pytest-cov @@ -30,21 +35,23 @@ dependencies: - pytest - python=3.9 - scikit-learn=1.0.0 -- setuptools-rust=1.5.2 - sphinx - sqlalchemy<2 -- tpot +- tpot=0.12.0 - tzlocal=2.1 - uvicorn=0.13.4 # GPU-specific requirements - cudatoolkit=11.5 -- cudf=23.06 -- cuml=23.06 -- dask-cudf=23.06 -- dask-cuda=23.06 -# tpot imports fail with numpy >=1.24.0 -# https://github.com/EpistasisLab/tpot/issues/1281 -- numpy>=1.20.1, <1.24.0 +- cudf=23.12 +- cuml=23.12 +- dask-cudf=23.12 +- dask-cuda=23.12 - ucx-proc=*=gpu -- ucx-py=0.32 -- xgboost=*rapidsai23.06 +- ucx-py=0.35 +- xgboost=*=rapidsai_py* +- libxgboost=*=rapidsai_h* +# TODO: unpin after RAPIDS 23.12 release +# - pip +# - pip: +# - 
git+https://github.com/dask/dask +# - git+https://github.com/dask/distributed diff --git a/continuous_integration/recipe/build.sh b/continuous_integration/recipe/build.sh new file mode 100644 index 000000000..3750311d7 --- /dev/null +++ b/continuous_integration/recipe/build.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +set -ex + +# See https://github.com/conda-forge/rust-feedstock/blob/master/recipe/build.sh for cc env explanation +if [ "$c_compiler" = gcc ] ; then + case "$target_platform" in + linux-64) rust_env_arch=X86_64_UNKNOWN_LINUX_GNU ;; + linux-aarch64) rust_env_arch=AARCH64_UNKNOWN_LINUX_GNU ;; + linux-ppc64le) rust_env_arch=POWERPC64LE_UNKNOWN_LINUX_GNU ;; + *) echo "unknown target_platform $target_platform" ; exit 1 ;; + esac + + export CARGO_TARGET_${rust_env_arch}_LINKER=$CC +fi + +declare -a _xtra_maturin_args + +mkdir -p $SRC_DIR/.cargo + +if [ "$target_platform" = "osx-64" ] ; then + cat <<EOF >> $SRC_DIR/.cargo/config +[target.x86_64-apple-darwin] +linker = "$CC" +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +EOF + + _xtra_maturin_args+=(--target=x86_64-apple-darwin) + +elif [ "$target_platform" = "osx-arm64" ] ; then + cat <<EOF >> $SRC_DIR/.cargo/config +# Required for intermediate codegen stuff +[target.x86_64-apple-darwin] +linker = "$CC_FOR_BUILD" + +# Required for final binary artifacts for target +[target.aarch64-apple-darwin] +linker = "$CC" +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +EOF + _xtra_maturin_args+=(--target=aarch64-apple-darwin) + + # This variable must be set to the directory containing the target's libpython DSO + export PYO3_CROSS_LIB_DIR=$PREFIX/lib + + # xref: https://github.com/PyO3/pyo3/commit/7beb2720 + export PYO3_PYTHON_VERSION=${PY_VER} + + # xref: https://github.com/conda-forge/python-feedstock/issues/621 + sed -i.bak 's,aarch64,arm64,g' $BUILD_PREFIX/venv/lib/os-patch.py + sed -i.bak 's,aarch64,arm64,g' $BUILD_PREFIX/venv/lib/platform-patch.py +fi +
+maturin build -vv -j "${CPU_COUNT}" --release --strip --manylinux off --interpreter="${PYTHON}" "${_xtra_maturin_args[@]}" + +"${PYTHON}" -m pip install $SRC_DIR/target/wheels/dask_sql*.whl --no-deps -vv diff --git a/continuous_integration/recipe/conda_build_config.yaml b/continuous_integration/recipe/conda_build_config.yaml index fe14e3873..aa6e0d578 100644 --- a/continuous_integration/recipe/conda_build_config.yaml +++ b/continuous_integration/recipe/conda_build_config.yaml @@ -1,8 +1,4 @@ -python: - - 3.9 - - 3.10 - - 3.11 -c_compiler_version: - - 11 rust_compiler_version: - 1.69 +maturin: + - 1.1 diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 5ad457621..fbe498eda 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -13,29 +13,32 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - skip: true # [py2k] entry_points: - dask-sql-server = dask_sql.server.app:main - dask-sql = dask_sql.cmd:main string: py{{ python | replace(".", "") }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script: RUST_BACKTRACE=full {{ PYTHON }} -m pip install . 
--no-deps -vv requirements: build: + - python # [build_platform != target_platform] + - cross-python_{{ target_platform }} # [build_platform != target_platform] + - maturin # [build_platform != target_platform] + - libprotobuf + - zlib # [build_platform != target_platform] - {{ compiler('c') }} - {{ compiler('rust') }} - - setuptools-rust >=1.5.2 host: - pip - python - - setuptools-rust >=1.5.2 - - libprotobuf =3 + - maturin + - zlib + - xz # [linux64] run: - python - dask >=2022.3.0 - pandas >=1.4.0 - # FIXME: handling is needed for httpx-based fastapi>=0.87.0 - - fastapi >=0.69.0,<0.87.0 + - fastapi >=0.92.0 + - httpx >=0.24.1 - uvicorn >=0.13.4 - tzlocal >=2.1 - prompt-toolkit >=3.0.8 diff --git a/dask_planner/update-dependencies.sh b/continuous_integration/scripts/update-dependencies.sh similarity index 100% rename from dask_planner/update-dependencies.sh rename to continuous_integration/scripts/update-dependencies.sh diff --git a/dask_planner/.classpath b/dask_planner/.classpath deleted file mode 100644 index b14b13a76..000000000 --- a/dask_planner/.classpath +++ /dev/null @@ -1,55 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dask_planner/.gitignore b/dask_planner/.gitignore deleted file mode 100644 index c8f044299..000000000 --- a/dask_planner/.gitignore +++ /dev/null @@ -1,72 +0,0 @@ -/target - -# Byte-compiled / optimized / DLL files -__pycache__/ -.pytest_cache/ -*.py[cod] - -# C extensions -*.so - -# Distribution / packaging -.Python -.venv/ -env/ -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -include/ -man/ -venv/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt -pip-selfcheck.json - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.cache -nosetests.xml -coverage.xml - -# Translations -*.mo - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject - -# Rope -.ropeproject 
- -# Django stuff: -*.log -*.pot - -.DS_Store - -# Sphinx documentation -docs/_build/ - -# PyCharm -.idea/ - -# VSCode -.vscode/ - -# Pyenv -.python-version diff --git a/dask_planner/.settings/org.eclipse.core.resources.prefs b/dask_planner/.settings/org.eclipse.core.resources.prefs deleted file mode 100644 index 92920805e..000000000 --- a/dask_planner/.settings/org.eclipse.core.resources.prefs +++ /dev/null @@ -1,5 +0,0 @@ -eclipse.preferences.version=1 -encoding//src/main/java=UTF-8 -encoding//src/main/resources=UTF-8 -encoding//target/generated-sources/annotations=UTF-8 -encoding/=UTF-8 diff --git a/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs b/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs deleted file mode 100644 index d4313d4b2..000000000 --- a/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs +++ /dev/null @@ -1,2 +0,0 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.apt.aptEnabled=false diff --git a/dask_planner/.settings/org.eclipse.jdt.core.prefs b/dask_planner/.settings/org.eclipse.jdt.core.prefs deleted file mode 100644 index 1b6e1ef22..000000000 --- a/dask_planner/.settings/org.eclipse.jdt.core.prefs +++ /dev/null @@ -1,9 +0,0 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 -org.eclipse.jdt.core.compiler.compliance=1.8 -org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled -org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning -org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore -org.eclipse.jdt.core.compiler.processAnnotations=disabled -org.eclipse.jdt.core.compiler.release=disabled -org.eclipse.jdt.core.compiler.source=1.8 diff --git a/dask_planner/.settings/org.eclipse.m2e.core.prefs b/dask_planner/.settings/org.eclipse.m2e.core.prefs deleted file mode 100644 index f897a7f1c..000000000 --- a/dask_planner/.settings/org.eclipse.m2e.core.prefs +++ /dev/null @@ -1,4 +0,0 @@ -activeProfiles= -eclipse.preferences.version=1 
-resolveWorkspaceProjects=true -version=1 diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml deleted file mode 100644 index 550fbe2ff..000000000 --- a/dask_planner/Cargo.toml +++ /dev/null @@ -1,23 +0,0 @@ -[package] -name = "dask_planner" -repository = "https://github.com/dask-contrib/dask-sql" -version = "0.1.0" -description = "Bindings for DataFusion used by Dask-SQL" -readme = "README.md" -license = "Apache-2.0" -edition = "2021" -rust-version = "1.65" - -[dependencies] -async-trait = "0.1.68" -datafusion-python = { git = "https://github.com/apache/arrow-datafusion-python.git", rev = "9493638" } -env_logger = "0.10" -log = "^0.4" -pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py39"] } -pyo3-log = "0.8.1" - -[build-dependencies] -pyo3-build-config = "0.18.3" - -[lib] -crate-type = ["cdylib"] diff --git a/dask_planner/MANIFEST.in b/dask_planner/MANIFEST.in deleted file mode 100644 index 7c68298bd..000000000 --- a/dask_planner/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ -include Cargo.toml -recursive-include src * diff --git a/dask_planner/README.md b/dask_planner/README.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/dask_planner/pyproject.toml b/dask_planner/pyproject.toml deleted file mode 100644 index 1fa5119d1..000000000 --- a/dask_planner/pyproject.toml +++ /dev/null @@ -1,11 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel", "setuptools-rust"] - -[project] -name = "datafusion_planner" -requires-python = ">=3.9" -classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] diff --git a/dask_sql/__init__.py b/dask_sql/__init__.py index 756486b74..803c753d1 100644 --- a/dask_sql/__init__.py +++ b/dask_sql/__init__.py @@ -1,9 +1,15 @@ -from . import _version, config +# FIXME: can we modify TLS model of Rust object to avoid aarch64 glibc bug? 
+# https://github.com/dask-contrib/dask-sql/issues/1169 +from . import _datafusion_lib # isort:skip + +import importlib.metadata + +from . import config from .cmd import cmd_loop from .context import Context from .datacontainer import Statistics from .server.app import run_server -__version__ = _version.get_versions()["version"] +__version__ = importlib.metadata.version(__name__) __all__ = [__version__, cmd_loop, Context, run_server, Statistics] diff --git a/dask_sql/_compat.py b/dask_sql/_compat.py index 20d69852c..be8cfbae5 100644 --- a/dask_sql/_compat.py +++ b/dask_sql/_compat.py @@ -8,9 +8,16 @@ _dask_version = parseVersion(dask.__version__) INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0") +PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0") # TODO: remove if prompt-toolkit min version gets bumped PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29") # TODO: remove when dask min version gets bumped BROADCAST_JOIN_SUPPORT_WORKING = _dask_version > parseVersion("2023.1.0") + +# Parquet predicate-support version checks +PQ_NOT_IN_SUPPORT = parseVersion(dask.__version__) > parseVersion("2023.5.1") +PQ_IS_SUPPORT = parseVersion(dask.__version__) >= parseVersion("2023.3.1") + +DASK_CUDF_TODATETIME_SUPPORT = _dask_version >= parseVersion("2023.5.1") diff --git a/dask_sql/_version.py b/dask_sql/_version.py deleted file mode 100644 index 3ea999fa9..000000000 --- a/dask_sql/_version.py +++ /dev/null @@ -1,693 +0,0 @@ -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. 
-# Generated by versioneer-0.27 -# https://github.com/python-versioneer/python-versioneer - -"""Git implementation of _version.py.""" - -import errno -import functools -import os -import re -import subprocess -import sys -from typing import Callable, Dict - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). - git_refnames = "$Format:%d$" - git_full = "$Format:%H$" - git_date = "$Format:%ci$" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "pep440" - cfg.tag_prefix = "" - cfg.parentdir_prefix = "dask-sql-" - cfg.versionfile_source = "dask_sql/_version.py" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY: Dict[str, str] = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} - - -def register_vcs_handler(vcs, method): # decorator - """Create decorator to mark a method as the handler of a VCS.""" - - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - process = None - - popen_kwargs = {} - if sys.platform == "win32": - # This hides the console window if 
pythonw.exe is used - startupinfo = subprocess.STARTUPINFO() - startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - popen_kwargs["startupinfo"] = startupinfo - - for command in commands: - try: - dispcmd = str([command] + args) - # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen( - [command] + args, - cwd=cwd, - env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr else None), - **popen_kwargs, - ) - break - except OSError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = process.communicate()[0].strip().decode() - if process.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, process.returncode - return stdout, process.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. 
We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for _ in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - with open(versionfile_abs, "r") as fobj: - for line in fobj: - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - except OSError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if "refnames" not in keywords: - raise NotThisMethod("Short version file found") - date = keywords.get("date") - if date is not None: - # Use only the last line. Previous lines may contain GPG signature - # information. 
- date = date.splitlines()[-1] - - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = {r.strip() for r in refnames.strip("()").split(",")} - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r"\d", r)} - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. 
"2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix) :] - # Filter out refs that exactly match prefix or that don't start - # with a number once the prefix is stripped (mostly a concern - # when prefix is '') - if not re.match(r"\d", r): - continue - if verbose: - print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - # GIT_DIR can interfere with correct operation of Versioneer. - # It may be intended to be passed to the Versioneer-versioned project, - # but that should not change where we get our version from. 
- env = os.environ.copy() - env.pop("GIT_DIR", None) - runner = functools.partial(runner, env=env) - - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner( - GITS, - [ - "describe", - "--tags", - "--dirty", - "--always", - "--long", - "--match", - f"{tag_prefix}[[:digit:]]*", - ], - cwd=root, - ) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) - # --abbrev-ref was added in git-1.6.3 - if rc != 0 or branch_name is None: - raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") - branch_name = branch_name.strip() - - if branch_name == "HEAD": - # If we aren't exactly on a branch, pick a branch which represents - # the current commit. If all else fails, we are on a branchless - # commit. - branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) - # --contains was added in git-1.5.4 - if rc != 0 or branches is None: - raise NotThisMethod("'git branch --contains' returned error") - branches = branches.split("\n") - - # Remove the first line if we're running detached - if "(" in branches[0]: - branches.pop(0) - - # Strip off the leading "* " from the list of branches. 
- branches = [branch[2:] for branch in branches] - if "master" in branches: - branch_name = "master" - elif not branches: - branch_name = None - else: - # Pick the first branch that is returned. Good or bad. - branch_name = branches[0] - - pieces["branch"] = branch_name - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) - if not mo: - # unparsable. Maybe git-describe is misbehaving? - pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) - pieces["distance"] = len(out.split()) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() - # Use only the last line. Previous lines may contain GPG signature - # information. 
- date = date.splitlines()[-1] - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_branch(pieces): - """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . - - The ".dev0" means not master branch. Note that .dev0 sorts backwards - (a feature branch will appear "older" than the master branch). - - Exceptions: - 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0" - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def pep440_split_post(ver): - """Split pep440 version string at the post-release segment. 
- - Returns the release segments before the post-release and the - post-release version number (or -1 if no post-release segment is present). - """ - vc = str.split(ver, ".post") - return vc[0], int(vc[1] or 0) if len(vc) == 2 else None - - -def render_pep440_pre(pieces): - """TAG[.postN.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post0.devDISTANCE - """ - if pieces["closest-tag"]: - if pieces["distance"]: - # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) - rendered = tag_version - if post_version is not None: - rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) - else: - rendered += ".post0.dev%d" % (pieces["distance"]) - else: - # no commits, use the tag as the version - rendered = pieces["closest-tag"] - else: - # exception #1 - rendered = "0.post0.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_post_branch(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . - - The ".dev0" means not master branch. - - Exceptions: - 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-branch": - rendered = render_pep440_branch(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-post-branch": - rendered = render_pep440_post_branch(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. 
- - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. - for _ in cfg.versionfile_source.split("/"): - root = os.path.dirname(root) - except NameError: - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None, - } - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } diff --git a/dask_sql/context.py b/dask_sql/context.py index 6426a8dc7..1023d5714 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -10,7 +10,7 @@ from dask.base import optimize from dask.utils_test import hlg_layer -from dask_planner.rust import ( +from dask_sql._datafusion_lib import ( DaskSchema, DaskSQLContext, DaskTable, @@ -42,7 +42,7 @@ from dask_sql.mappings import python_to_sql_type from dask_sql.physical.rel import RelConverter, custom, logical from dask_sql.physical.rex import RexConverter, core -from dask_sql.utils import OptimizationException, ParsingException +from dask_sql.utils import ParsingException logger = logging.getLogger(__name__) @@ -102,6 +102,10 @@ def __init__(self, logging_level=logging.INFO): self.context = DaskSQLContext(self.catalog_name, self.schema_name) self.context.register_schema(self.schema_name, DaskSchema(self.schema_name)) + self.context.apply_dynamic_partition_pruning( + 
dask_config.get("sql.dynamic_partition_pruning") + ) + # # Register any default plugins, if nothing was registered before. RelConverter.add_plugin_class(logical.DaskAggregatePlugin, replace=False) RelConverter.add_plugin_class(logical.DaskCrossJoinPlugin, replace=False) @@ -795,6 +799,9 @@ def _get_ral(self, sql): """Helper function to turn the sql query into a relational algebra and resulting column names""" logger.debug(f"Entering _get_ral('{sql}')") + self.context.apply_dynamic_partition_pruning( + dask_config.get("sql.dynamic_partition_pruning") + ) # get the schema of what we currently have registered schemas = self._prepare_schemas() @@ -824,8 +831,9 @@ def _get_ral(self, sql): try: rel = self.context.optimize_relational_algebra(nonOptimizedRel) except DFOptimizationException as oe: + # Use original plan and warn about inability to optimize plan rel = nonOptimizedRel - raise OptimizationException(str(oe)) from None + logger.warn(str(oe)) else: rel = nonOptimizedRel @@ -838,15 +846,19 @@ def _get_ral(self, sql): def _compute_table_from_rel(self, rel: "LogicalPlan", return_futures: bool = True): dc = RelConverter.convert(rel, context=self) - # Optimization might remove some alias projects. Make sure to keep them here. - select_names = [field for field in rel.getRowType().getFieldList()] - if rel.get_current_node_type() == "Explain": return dc if dc is None: return + # Optimization might remove some alias projects. Make sure to keep them here. + select_names = [field for field in rel.getRowType().getFieldList()] + if select_names: + cc = dc.column_container + + select_names = select_names[: len(cc.columns)] + # Use FQ name if not unique and simple name if it is unique. 
If a join contains the same column # names the output col is prepended with the fully qualified column name field_counts = Counter([field.getName() for field in select_names]) @@ -857,7 +869,6 @@ def _compute_table_from_rel(self, rel: "LogicalPlan", return_futures: bool = Tru for field in select_names ] - cc = dc.column_container cc = cc.rename( { df_col: select_name diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index 4d0eb9cce..14bc547f0 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -6,7 +6,7 @@ import dask.dataframe as dd -from dask_planner.rust import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index a7ff50147..ca0e23691 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -1,4 +1,5 @@ import logging +from datetime import datetime from typing import Any import dask.array as da @@ -7,7 +8,7 @@ import numpy as np import pandas as pd -from dask_planner.rust import DaskTypeMap, SqlTypeName +from dask_sql._datafusion_lib import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) @@ -121,6 +122,26 @@ def python_to_sql_type(python_type) -> "DaskTypeMap": ) +def parse_datetime(obj): + formats = [ + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d", + "%d-%m-%Y %H:%M:%S", + "%d-%m-%Y", + "%m/%d/%Y %H:%M:%S", + "%m/%d/%Y", + ] + + for f in formats: + try: + datetime_obj = datetime.strptime(obj, f) + return datetime_obj + except ValueError: + pass + + raise ValueError("Unable to parse datetime: " + obj) + + def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any: """Mapping between SQL and python values (of correct type).""" # In most of the cases, we turn the value first into a string. 
@@ -191,6 +212,7 @@ def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any: or sql_type == SqlTypeName.DATE ): if isinstance(literal_value, str): + literal_value = parse_datetime(literal_value) literal_value = np.datetime64(literal_value) elif str(literal_value) == "None": # NULL time @@ -309,13 +331,19 @@ def cast_column_type( def cast_column_to_type(col: dd.Series, expected_type: str): """Cast the given column to the expected type""" + pdt = pd.api.types + + is_dt_ns = pdt.is_datetime64_ns_dtype + is_dt_tz = lambda t: is_dt_ns(t) and pdt.is_datetime64tz_dtype(t) + is_dt_ntz = lambda t: is_dt_ns(t) and not pdt.is_datetime64tz_dtype(t) + current_type = col.dtype if similar_type(current_type, expected_type): logger.debug("...not converting.") return None - if pd.api.types.is_integer_dtype(expected_type): + if pdt.is_integer_dtype(expected_type): if pd.api.types.is_float_dtype(current_type): logger.debug("...truncating...") # Currently "trunc" can not be applied to NA (the pandas missing value type), @@ -323,10 +351,14 @@ def cast_column_to_type(col: dd.Series, expected_type: str): # For our use case, that does not matter, as the conversion to integer later # will convert both NA and np.NaN to NA. 
col = da.trunc(col.fillna(value=np.NaN)) - elif pd.api.types.is_timedelta64_dtype(current_type): + elif pdt.is_timedelta64_dtype(current_type): logger.debug(f"Explicitly casting from {current_type} to np.int64") return col.astype(np.int64) + if is_dt_tz(current_type) and is_dt_ntz(expected_type): + # casting from timezone-aware to timezone-naive datatypes with astype is deprecated in pandas 2 + return col.dt.tz_localize(None) + logger.debug(f"Need to cast from {current_type} to {expected_type}") return col.astype(expected_type) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index 3d42a84b6..5215dfe28 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -1,5 +1,5 @@ import logging -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import dask.dataframe as dd @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan, RelDataType + from dask_sql._datafusion_lib import LogicalPlan, RelDataType logger = logging.getLogger(__name__) @@ -30,7 +30,7 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> dd.DataFra @staticmethod def fix_column_to_row_type( - cc: ColumnContainer, row_type: "RelDataType" + cc: ColumnContainer, row_type: "RelDataType", join_type: Optional[str] = None ) -> ColumnContainer: """ Make sure that the given column container @@ -39,6 +39,8 @@ def fix_column_to_row_type( and will just "blindly" rename the columns. 
""" field_names = [str(x) for x in row_type.getFieldNames()] + if join_type in ("leftsemi", "leftanti"): + field_names = field_names[: len(cc.columns)] logger.debug(f"Renaming {cc.columns} to {field_names}") cc = cc.rename_handle_duplicates( @@ -84,7 +86,9 @@ def assert_inputs( return [RelConverter.convert(input_rel, context) for input_rel in input_rels] @staticmethod - def fix_dtype_to_row_type(dc: DataContainer, row_type: "RelDataType"): + def fix_dtype_to_row_type( + dc: DataContainer, row_type: "RelDataType", join_type: Optional[str] = None + ): """ Fix the dtype of the given data container (or: the df within it) to the data type given as argument. @@ -98,9 +102,12 @@ def fix_dtype_to_row_type(dc: DataContainer, row_type: "RelDataType"): df = dc.df cc = dc.column_container + field_list = row_type.getFieldList() + if join_type in ("leftsemi", "leftanti"): + field_list = field_list[: len(cc.columns)] + field_types = { - str(field.getQualifiedName()): field.getType() - for field in row_type.getFieldList() + str(field.getQualifiedName()): field.getType() for field in field_list } for field_name, field_type in field_types.items(): diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 29ad8c327..6d2beceff 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index 9c8a159b0..b29eb7737 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class AlterSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/analyze_table.py 
b/dask_sql/physical/rel/custom/analyze_table.py index 14011ccef..49308cf3a 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class AnalyzeTablePlugin(BaseRelPlugin): @@ -47,26 +47,22 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai df = dc.df # Calculate statistics - statistics = dd.from_pandas( - pd.DataFrame({col: [] for col in columns}), npartitions=1 - ) - statistics = statistics.append(df[[mapping(col) for col in columns]].describe()) - - # Add additional information - statistics = statistics.append( - pd.Series( - { - col: str(python_to_sql_type(df[mapping(col)].dtype)).lower() - for col in columns - }, - name="data_type", - ) - ) - statistics = statistics.append( - pd.Series( - {col: col for col in columns}, - name="col_name", - ) + statistics = dd.concat( + [ + df[[mapping(col) for col in columns]].describe(), + pd.DataFrame( + { + mapping(col): str( + python_to_sql_type(df[mapping(col)].dtype) + ).lower() + for col in columns + }, + index=["data_type"], + ), + pd.DataFrame( + {mapping(col): col for col in columns}, index=["col_name"] + ), + ] ) cc = ColumnContainer(statistics.columns) diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 52ed37b55..e55d31a90 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_memory_table.py b/dask_sql/physical/rel/custom/create_memory_table.py index 760857563..3c829fb42 100644 --- 
a/dask_sql/physical/rel/custom/create_memory_table.py +++ b/dask_sql/physical/rel/custom/create_memory_table.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 36b165230..cbe61abf7 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index d915a6b0b..422ac7c3b 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class DescribeModelPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index c7ce70610..71ac114f2 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 444662e2b..5491fcaa4 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff 
--git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index 07cf9979e..08446c43c 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index 917d712c3..0bb5c79b4 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_columns.py b/dask_sql/physical/rel/custom/show_columns.py index 6b0b94fe9..2da4f4535 100644 --- a/dask_sql/physical/rel/custom/show_columns.py +++ b/dask_sql/physical/rel/custom/show_columns.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class ShowColumnsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index 3f879dd38..28e495810 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class ShowModelsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_schemas.py b/dask_sql/physical/rel/custom/show_schemas.py index 98b9f8ab3..fb69c5359 100644 --- a/dask_sql/physical/rel/custom/show_schemas.py +++ b/dask_sql/physical/rel/custom/show_schemas.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner 
import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class ShowSchemasPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_tables.py b/dask_sql/physical/rel/custom/show_tables.py index d79b4052b..05fb8a66c 100644 --- a/dask_sql/physical/rel/custom/show_tables.py +++ b/dask_sql/physical/rel/custom/show_tables.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class ShowTablesPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 889dd2b1c..f5fc65b7d 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class UseSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 24c339381..1af2748f5 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) @@ -127,6 +127,7 @@ class DaskAggregatePlugin(BaseRelPlugin): "avg": AggregationSpecification("mean", AggregationOnPandas("mean")), "stddev": AggregationSpecification("std", AggregationOnPandas("std")), "stddevsamp": AggregationSpecification("std", AggregationOnPandas("std")), + "stddev_samp": AggregationSpecification("std", AggregationOnPandas("std")), "stddevpop": AggregationSpecification( dd.Aggregation( "stddevpop", @@ -142,6 +143,21 @@ class DaskAggregatePlugin(BaseRelPlugin): ** (1 / 2), ) ), + "stddev_pop": AggregationSpecification( + dd.Aggregation( + "stddev_pop", + lambda s: (s.count(), s.sum(), s.agg(lambda x: 
(x**2).sum())), + lambda count, sum, sum_of_squares: ( + count.sum(), + sum.sum(), + sum_of_squares.sum(), + ), + lambda count, sum, sum_of_squares: ( + (sum_of_squares / count) - (sum / count) ** 2 + ) + ** (1 / 2), + ) + ), "bit_and": AggregationSpecification( ReduceAggregation("bit_and", operator.and_) ), @@ -198,6 +214,20 @@ class DaskAggregatePlugin(BaseRelPlugin): ), ) ), + "variance_pop": AggregationSpecification( + dd.Aggregation( + "variance_pop", + lambda s: (s.count(), s.sum(), s.agg(lambda x: (x**2).sum())), + lambda count, sum, sum_of_squares: ( + count.sum(), + sum.sum(), + sum_of_squares.sum(), + ), + lambda count, sum, sum_of_squares: ( + (sum_of_squares / count) - (sum / count) ** 2 + ), + ) + ), } def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index 5f32d3257..d1c74c8cc 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index 23f8d1cd3..453f63de5 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index 69d20fca3..0e4875d0c 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -4,7 +4,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan 
class ExplainPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index 178121fef..af3685a11 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -1,5 +1,5 @@ import logging -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, List, Union import dask.config as dask_config import dask.dataframe as dd @@ -12,12 +12,16 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) -def filter_or_scalar(df: dd.DataFrame, filter_condition: Union[np.bool_, dd.Series]): +def filter_or_scalar( + df: dd.DataFrame, + filter_condition: Union[np.bool_, dd.Series], + add_filters: List = None, +): """ Some (complex) SQL queries can lead to a strange condition which is always true or false. We do not need to filter in this case. @@ -35,7 +39,7 @@ def filter_or_scalar(df: dd.DataFrame, filter_condition: Union[np.bool_, dd.Seri filter_condition = filter_condition.fillna(False) out = df[filter_condition] if dask_config.get("sql.predicate_pushdown"): - return attempt_predicate_pushdown(out) + return attempt_predicate_pushdown(out, add_filters=add_filters) else: return out diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index 320913e92..374c74420 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -14,10 +14,11 @@ from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rel.logical.filter import filter_or_scalar from dask_sql.physical.rex import RexConverter +from dask_sql.utils import is_cudf_type if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) @@ -45,7 +46,8 @@ class DaskJoinPlugin(BaseRelPlugin): 
"LEFT": "left", "RIGHT": "right", "FULL": "outer", - "LEFTSEMI": "inner", # TODO: Need research here! This is likely not a true inner join + "LEFTSEMI": "leftsemi", + "LEFTANTI": "leftanti", } def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: @@ -74,6 +76,10 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai join_type = join.getJoinType() join_type = self.JOIN_TYPE_MAPPING[str(join_type)] + # TODO: update with correct implementation of leftsemi for CPU + # https://github.com/dask-contrib/dask-sql/issues/1190 + if join_type == "leftsemi" and not is_cudf_type(df_lhs_renamed): + join_type = "inner" # 3. The join condition can have two forms, that we can understand # (a) a = b @@ -170,14 +176,19 @@ def merge_single_partitions(lhs_partition, rhs_partition): # 6. So the next step is to make sure # we have the correct column order (and to remove the temporary join columns) - correct_column_order = list(df_lhs_renamed.columns) + list( - df_rhs_renamed.columns - ) + if join_type in ("leftsemi", "leftanti"): + correct_column_order = list(df_lhs_renamed.columns) + else: + correct_column_order = list(df_lhs_renamed.columns) + list( + df_rhs_renamed.columns + ) cc = ColumnContainer(df.columns).limit_to(correct_column_order) # and to rename them like the rel specifies row_type = rel.getRowType() field_specifications = [str(f) for f in row_type.getFieldNames()] + if join_type in ("leftsemi", "leftanti"): + field_specifications = field_specifications[: len(cc.columns)] cc = cc.rename( { @@ -185,7 +196,7 @@ def merge_single_partitions(lhs_partition, rhs_partition): for from_col, to_col in zip(cc.columns, field_specifications) } ) - cc = self.fix_column_to_row_type(cc, row_type) + cc = self.fix_column_to_row_type(cc, row_type, join_type) dc = DataContainer(df, cc) # 7. 
Last but not least we apply any filters by and-chaining together the filters @@ -202,7 +213,7 @@ def merge_single_partitions(lhs_partition, rhs_partition): df = filter_or_scalar(df, filter_condition) dc = DataContainer(df, cc) - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) + dc = self.fix_dtype_to_row_type(dc, rel.getRowType(), join_type) # # Rename underlying DataFrame column names back to their original values before returning # df = dc.assign() # dc = DataContainer(df, ColumnContainer(cc.columns)) @@ -227,7 +238,7 @@ def _join_on_columns( [~df_lhs_renamed.iloc[:, index].isna() for index in lhs_on], ) df_lhs_renamed = df_lhs_renamed[df_lhs_filter] - if join_type in ["inner", "left"]: + if join_type in ["inner", "left", "leftanti", "leftsemi"]: df_rhs_filter = reduce( operator.and_, [~df_rhs_renamed.iloc[:, index].isna() for index in rhs_on], @@ -256,12 +267,24 @@ def _join_on_columns( "For more information refer to https://github.com/dask/dask/issues/9851" " and https://github.com/dask/dask/issues/9870" ) - df = df_lhs_with_tmp.merge( - df_rhs_with_tmp, - on=added_columns, - how=join_type, - broadcast=broadcast, - ).drop(columns=added_columns) + if join_type == "leftanti" and not is_cudf_type(df_lhs_with_tmp): + df = df_lhs_with_tmp.merge( + df_rhs_with_tmp, + on=added_columns, + how="left", + broadcast=broadcast, + indicator=True, + ).drop(columns=added_columns) + df = df[df["_merge"] == "left_only"].drop( + columns=["_merge"] + list(df_rhs_with_tmp.columns), errors="ignore" + ) + else: + df = df_lhs_with_tmp.merge( + df_rhs_with_tmp, + on=added_columns, + how=join_type, + broadcast=broadcast, + ).drop(columns=added_columns) return df diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 3e2fc6434..9bd2be562 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -12,7 +12,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from 
dask_sql._datafusion_lib import LogicalPlan class DaskLimitPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index b990e21b4..0a7637f59 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,7 +1,7 @@ import logging from typing import TYPE_CHECKING -from dask_planner.rust import RexType +from dask_sql._datafusion_lib import RexType from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter @@ -9,7 +9,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 2e1376d41..9dfccdc49 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class DaskSortPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index 2473167d7..14be8928f 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan class SubqueryAlias(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index 62cb38256..4a9cecc25 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -3,6 +3,8 @@ from functools import reduce from typing import TYPE_CHECKING +from dask.utils_test import hlg_layer + from dask_sql.datacontainer 
import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rel.logical.filter import filter_or_scalar @@ -10,7 +12,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) @@ -77,16 +79,38 @@ def _apply_projections(self, table_scan, dask_table, dc): def _apply_filters(self, table_scan, rel, dc, context): df = dc.df cc = dc.column_container - filters = table_scan.getFilters() - # All partial filters here are applied in conjunction (&) - if filters: + all_filters = table_scan.getFilters() + conjunctive_dnf_filters = table_scan.getDNFFilters().filtered_exprs + non_dnf_filters = table_scan.getDNFFilters().io_unfilterable_exprs + + if conjunctive_dnf_filters: + # Extract the PyExprs from the conjunctive DNF filters + filter_exprs = [f[0] for f in conjunctive_dnf_filters] + if non_dnf_filters: + filter_exprs.extend(non_dnf_filters) + + df_condition = reduce( + operator.and_, + [ + RexConverter.convert(rel, rex, dc, context=context) + for rex in filter_exprs + ], + ) + df = filter_or_scalar( + df, df_condition, add_filters=[f[1] for f in conjunctive_dnf_filters] + ) + elif all_filters: df_condition = reduce( operator.and_, [ RexConverter.convert(rel, rex, dc, context=context) - for rex in filters + for rex in all_filters ], ) df = filter_or_scalar(df, df_condition) + try: + logger.debug(hlg_layer(df.dask, "read-parquet").creation_info) + except KeyError: + pass return DataContainer(df, cc) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 830f7f981..f31ced797 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan def _extract_df(obj_cc, obj_df, output_field_names): diff --git 
a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index 2ac20ec3b..42b0f9613 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -17,7 +17,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 5724a4536..d74ad6309 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from dask_planner.rust import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index 71431cbb4..1713e496d 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index 40c373766..7486bc9c5 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexAliasPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index ab2ecaeba..8db8ca048 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -1,11 +1,13 @@ import logging import operator import re +import warnings from datetime import datetime from functools import partial, reduce from typing import 
TYPE_CHECKING, Any, Callable, Union import dask.array as da +import dask.config as dask_config import dask.dataframe as dd import numpy as np import pandas as pd @@ -14,13 +16,15 @@ from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data -from dask_planner.rust import SqlTypeName +from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import ( cast_column_to_type, sql_to_python_type, sql_to_python_value, ) +from dask_sql.physical.rel import RelConverter from dask_sql.physical.rex import RexConverter from dask_sql.physical.rex.base import BaseRexPlugin from dask_sql.physical.rex.core.literal import SargPythonImplementation @@ -34,7 +38,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] @@ -44,7 +48,11 @@ def as_timelike(op): if isinstance(op, np.int64): return np.timedelta64(op, "D") elif isinstance(op, str): - return np.datetime64(op) + try: + return np.datetime64(op) + except ValueError: + op = datetime.strptime(op, "%Y-%m-%d") + return np.datetime64(op.strftime("%Y-%m-%d")) elif pd.api.types.is_datetime64_dtype(op) or isinstance(op, np.timedelta64): return op else: @@ -60,6 +68,12 @@ class Operation: # True, if the operation should also get the REX needs_rex = False + # True, if the operation should also needs the Context, possible subquery Relation expansion + needs_context = False + + # True, if the operation needs the original relation algebra + needs_rel = False + @staticmethod def op_needs_dc(op): return hasattr(op, "needs_dc") and op.needs_dc @@ -68,6 +82,14 @@ def op_needs_dc(op): def op_needs_rex(op): return hasattr(op, "needs_rex") and op.needs_rex + @staticmethod + def op_needs_context(op): + return 
hasattr(op, "needs_context") and op.needs_context + + @staticmethod + def op_needs_rel(op): + return hasattr(op, "needs_rel") and op.needs_rel + def __init__(self, f: Callable): """Init with the given function""" self.f = f @@ -81,6 +103,8 @@ def of(self, op: "Operation") -> "Operation": new_op = Operation(lambda *x, **kwargs: self(op(*x, **kwargs))) new_op.needs_dc = Operation.op_needs_dc(op) new_op.needs_rex = Operation.op_needs_rex(op) + new_op.needs_context = Operation.op_needs_context(op) + new_op.needs_rel = Operation.op_needs_rel(op) return new_op @@ -250,6 +274,9 @@ def cast(self, operand, rex=None) -> SeriesOrScalar: if output_type == "DECIMAL": sql_type_args = rex.getPrecisionScale() + if output_type == "TIMESTAMP" and pd.api.types.is_integer_dtype(operand): + operand = operand * 10**9 + if not is_frame(operand): # pragma: no cover return sql_to_python_value(sql_type, operand) @@ -612,17 +639,8 @@ def to_timestamp(self, df, format): format = format.replace('"', "") format = format.replace("'", "") - # TODO: format timestamps for GPU tests - if is_cudf_type(df): - if format != default_format: - raise RuntimeError("Non-default timestamp formats not supported on GPU") - if df.dtype == "object": - return df - else: - nanoseconds_to_seconds = 10**9 - return df * nanoseconds_to_seconds # String cases - elif type(df) == str: + if type(df) == str: return np.datetime64(datetime.strptime(df, format)) elif df.dtype == "object": return dd.to_datetime(df, format=format) @@ -655,7 +673,11 @@ def timestampadd(self, unit, interval, df: SeriesOrScalar): interval = int(interval) if interval < 0: raise RuntimeError(f"Negative time interval {interval} is not supported.") - df = df.astype("datetime64[ns]") + df = ( + df.astype("datetime64[s]") + if pd.api.types.is_integer_dtype(df) + else df.astype("datetime64[ns]") + ) if is_cudf_type(df): from cudf import DateOffset @@ -699,11 +721,23 @@ def __init__(self): super().__init__(self.datetime_sub) def datetime_sub(self, unit, 
df1, df2): + if pd.api.types.is_integer_dtype(df1): + df1 = df1 * 10**9 + if pd.api.types.is_integer_dtype(df2): + df2 = df2 * 10**9 + if "datetime64[s]" == str(getattr(df1, "dtype", "")): + df1 = df1.astype("datetime64[ns]") + if "datetime64[s]" == str(getattr(df2, "dtype", "")): + df2 = df2.astype("datetime64[ns]") + subtraction_op = ReduceOperation( operation=operator.sub, unary_operation=lambda x: -x ) result = subtraction_op(df2, df1) + if is_cudf_type(df1): + result = result.astype("int") + if unit in {"NANOSECOND", "NANOSECONDS"}: return result elif unit in {"MICROSECOND", "MICROSECONDS"}: @@ -926,9 +960,19 @@ def date_part(self, what, df: SeriesOrScalar): elif what in {"SECOND", "SECONDS"}: return df.second elif what in {"WEEK", "WEEKS"}: - return df.week + return df.isocalendar().week if PANDAS_GT_200 else df.week elif what in {"YEAR", "YEARS"}: return df.year + elif what == "DATE": + if isinstance(df, pd.Timestamp): + return df.date() + else: + if is_cudf_type(df) and not DASK_CUDF_TODATETIME_SUPPORT: + raise RuntimeError( + "Dask-cuDF to_datetime support requires Dask version >= 2023.5.1" + ) + else: + return dd.to_datetime(df.strftime("%Y-%m-%d")) else: raise NotImplementedError(f"Extraction of {what} is not (yet) implemented.") @@ -966,6 +1010,39 @@ def inList(self, series: dd.Series, *operands, rex=None): return ~result if rex.isNegated() else result +class InSubqueryOperation(Operation): + """ + Returns a boolean of whether an expression is/isn't in a Subquery Expression result + """ + + needs_rex = True + needs_context = True + needs_rel = True + + def __init__(self): + super().__init__(self.inSubquery) + + def inSubquery( + self, series: dd.Series, *operands, rel=None, rex=None, context=None + ): + sub_rel = rex.getSubqueryLogicalPlan() + dc = RelConverter.convert(sub_rel, context=context) + + # Extract the specified column/Series from the Dataframe + fq_column_name = rex.column_name(rel).split(".") + + # FIXME: dask's isin doesn't support dask 
frames as arguments + # so we need to compute here + col = dc.df[fq_column_name[-1]].compute() + + warnings.warn( + "Dask doesn't support Dask frames as input for .isin, so we must force an early computation", + ResourceWarning, + ) + + return series.isin(col) + + class RexCallPlugin(BaseRexPlugin): """ RexCall is used for expressions, which calculate something. @@ -1015,6 +1092,7 @@ class RexCallPlugin(BaseRexPlugin): "negative": NegativeOperation(), "not": NotOperation(), "in list": InListOperation(), + "in subquery": InSubqueryOperation(), "is null": IsNullOperation(), "is not null": NotOperation().of(IsNullOperation()), "is true": IsTrueOperation(), @@ -1056,6 +1134,9 @@ class RexCallPlugin(BaseRexPlugin): "characterlength": TensorScalarOperation( lambda x: x.str.len(), lambda x: len(x) ), + "character_length": TensorScalarOperation( + lambda x: x.str.len(), lambda x: len(x) + ), "upper": TensorScalarOperation(lambda x: x.str.upper(), lambda x: x.upper()), "lower": TensorScalarOperation(lambda x: x.str.lower(), lambda x: x.lower()), "position": PositionOperation(), @@ -1070,6 +1151,7 @@ class RexCallPlugin(BaseRexPlugin): "coalesce": CoalesceOperation(), "replace": ReplaceOperation(), # date/time operations + "extract_date": ExtractOperation(), "localtime": Operation(lambda *args: pd.Timestamp.now()), "localtimestamp": Operation(lambda *args: pd.Timestamp.now()), "current_time": Operation(lambda *args: pd.Timestamp.now()), @@ -1082,6 +1164,7 @@ class RexCallPlugin(BaseRexPlugin): "dsql_totimestamp": ToTimestampOperation(), # Temporary UDF functions that need to be moved after this POC "datepart": ExtractOperation(), + "date_part": ExtractOperation(), "year": YearOperation(), "timestampadd": TimeStampAddOperation(), "timestampceil": CeilFloorOperation("ceil"), @@ -1103,6 +1186,21 @@ def convert( for o in expr.getOperands() ] + # FIXME: cuDF doesn't support binops between decimal columns and numpy ints / floats + if dask_config.get("sql.mappings.decimal_support") 
== "cudf" and any( + str(getattr(o, "dtype", None)) == "decimal128" for o in operands + ): + from decimal import Decimal + + operands = [ + Decimal(str(o)) + if isinstance(o, float) + else o.item() + if np.isscalar(o) and pd.api.types.is_integer_dtype(o) + else o + for o in operands + ] + # Now use the operator name in the mapping schema_name = context.schema_name operator_name = expr.getOperatorName().lower() @@ -1113,7 +1211,9 @@ def convert( try: operation = context.schema[schema_name].functions[operator_name] except KeyError: # pragma: no cover - raise NotImplementedError(f"{operator_name} not (yet) implemented") + raise NotImplementedError( + f"RexCall operator '{operator_name}' not (yet) implemented" + ) logger.debug( f"Executing {operator_name} on {[str(LoggableDataFrame(df)) for df in operands]}" @@ -1125,6 +1225,10 @@ def convert( kwargs["dc"] = dc if Operation.op_needs_rex(operation): kwargs["rex"] = expr + if Operation.op_needs_context(operation): + kwargs["context"] = context + if Operation.op_needs_rel(operation): + kwargs["rel"] = rel return operation(*operands, **kwargs) # TODO: We have information on the typing here - we should use it diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 4272c832e..4d2c0f929 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexInputRefPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 73e3b8185..da0eeb128 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -5,14 +5,14 @@ import dask.dataframe as dd import numpy as np -from dask_planner.rust import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer 
import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 5e0a33098..60a07c0b9 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexScalarSubqueryPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/utils/filter.py b/dask_sql/physical/utils/filter.py index 5309289c4..ae564244d 100644 --- a/dask_sql/physical/utils/filter.py +++ b/dask_sql/physical/utils/filter.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import itertools import logging import operator @@ -5,14 +7,21 @@ import dask.dataframe as dd import numpy as np from dask.blockwise import Blockwise -from dask.highlevelgraph import HighLevelGraph +from dask.highlevelgraph import HighLevelGraph, MaterializedLayer from dask.layers import DataFrameIOLayer from dask.utils import M, apply, is_arraylike +from dask_sql._compat import PQ_IS_SUPPORT, PQ_NOT_IN_SUPPORT + logger = logging.getLogger(__name__) -def attempt_predicate_pushdown(ddf: dd.DataFrame) -> dd.DataFrame: +def attempt_predicate_pushdown( + ddf: dd.DataFrame, + preserve_filters: bool = True, + extract_filters: bool = True, + add_filters: list | tuple | DNF | None = None, +) -> dd.DataFrame: """Use graph information to update IO-level filters The original `ddf` will be returned if/when the @@ -24,8 +33,27 @@ def attempt_predicate_pushdown(ddf: dd.DataFrame) -> dd.DataFrame: is due to the fact that `npartitions` and `divisions` may change when this optimization is applied 
(invalidating npartition/divisions-specific logic in following Layers). + + Parameters + ---------- + ddf + Dask-DataFrame target for predicate pushdown. + preserve_filters + Whether to preserve pre-existing filters in the case that either + `add_filters` is specified, or `extract_filters` is `True` and + filters are successfully extracted from `ddf`. Default is `True`. + extract_filters + Whether to extract filters from the task graph of `ddf`. Default + is `True`. + add_filters + Custom filters to manually add to the IO layer of `ddf`. """ + if not (extract_filters or add_filters): + # Not extracting filters from the graph or + # manually adding user-defined filters. Return + return ddf + # Check that we have a supported `ddf` object if not isinstance(ddf, dd.DataFrame): raise ValueError( @@ -48,17 +76,19 @@ def attempt_predicate_pushdown(ddf: dd.DataFrame) -> dd.DataFrame: creation_info = ( (v.creation_info or {}) if hasattr(v, "creation_info") else {} ) - if ( - "filters" not in creation_info.get("kwargs", {}) - or creation_info["kwargs"]["filters"] is not None - ): - # No filters support, or filters is already set + if "filters" not in creation_info.get("kwargs", {}): + # No filters support return ddf if len(io_layer) != 1: # Not a single IO layer return ddf io_layer = io_layer.pop() + # Get pre-existing filters + existing_filters = ( + ddf.dask.layers[io_layer].creation_info.get("kwargs", {}).get("filters") + ) + # Start by converting the HLG to a `RegenerableGraph`. # Succeeding here means that all layers in the graph # are regenerable. 
@@ -71,28 +101,56 @@ def attempt_predicate_pushdown(ddf: dd.DataFrame) -> dd.DataFrame: ) return ddf - # Extract a DNF-formatted filter expression name = ddf._name - try: - filters = dsk.layers[name]._dnf_filter_expression(dsk) - if not isinstance(filters, frozenset): - # No filters encountered - return ddf - filters = filters.to_list_tuple() - except ValueError: - # DNF dispatching failed for 1+ layers - logger.warning( - "Predicate pushdown optimization skipped. One or more " - "layers has an unknown filter expression." - ) + extracted_filters = DNF(None) + if extract_filters: + # Extract a DNF-formatted filter expression + try: + extracted_filters = dsk.layers[name]._dnf_filter_expression(dsk) + except (ValueError, TypeError): + # DNF dispatching failed for 1+ layers + logger.warning( + "Predicate pushdown optimization skipped. One or more " + "layers has an unknown filter expression." + ) + + # Combine filters + filters = DNF(None) + if preserve_filters: + filters = filters.combine(existing_filters) + if extract_filters: + filters = filters.combine(extracted_filters) + if add_filters: + filters = filters.combine(add_filters) + if not filters: + # No filters encountered return ddf + filters = filters.to_list_tuple() + + # FIXME: pyarrow doesn't seem to like converting datetime64[D] to scalars + # so we must convert any we encounter to datetime64[ns] + filters = [ + [ + ( + col, + op, + val.astype("datetime64[ns]") + if isinstance(val, np.datetime64) and val.dtype == "datetime64[D]" + else val, + ) + for col, op, val in sublist + ] + for sublist in filters + ] # Regenerate collection with filtered IO layer try: + _regen_cache = {} return dsk.layers[name]._regenerate_collection( dsk, # TODO: shouldn't need to specify index=False after dask#9661 is merged new_kwargs={io_layer: {"filters": filters, "index": False}}, + _regen_cache=_regen_cache, ) except ValueError as err: # Most-likely failed to apply filters in read_parquet. 
@@ -108,49 +166,103 @@ def attempt_predicate_pushdown(ddf: dd.DataFrame) -> dd.DataFrame: return ddf -class Or(frozenset): - """Helper class for 'OR' expressions""" +class DNF: + """Manage filters in Disjunctive Normal Form (DNF)""" - def to_list_tuple(self): - # NDF "or" is List[List[Tuple]] - def _maybe_list(val): - if isinstance(val, tuple) and val and isinstance(val[0], (tuple, list)): - return list(val) - return [val] + class _Or(frozenset): + """Fozen set of disjunctions""" - return [ - _maybe_list(val.to_list_tuple()) - if hasattr(val, "to_list_tuple") - else _maybe_list(val) - for val in self - ] + def to_list_tuple(self) -> list: + # DNF "or" is List[List[Tuple]] + def _maybe_list(val): + if isinstance(val, tuple) and val and isinstance(val[0], (tuple, list)): + return list(val) + return [val] + return [ + _maybe_list(val.to_list_tuple()) + if hasattr(val, "to_list_tuple") + else _maybe_list(val) + for val in self + ] -class And(frozenset): - """Helper class for 'AND' expressions""" + class _And(frozenset): + """Frozen set of conjunctions""" - def to_list_tuple(self): - # NDF "and" is List[Tuple] - return tuple( - val.to_list_tuple() if hasattr(val, "to_list_tuple") else val - for val in self - ) + def to_list_tuple(self) -> list: + # DNF "and" is List[Tuple] + return tuple( + val.to_list_tuple() if hasattr(val, "to_list_tuple") else val + for val in self + ) + _filters: _And | _Or | None # Underlying filter expression -def to_dnf(expr): - """Normalize a boolean filter expression to disjunctive normal form (DNF)""" + def __init__(self, filters: DNF | _And | _Or | list | tuple | None) -> DNF: + if isinstance(filters, DNF): + self._filters = filters._filters + else: + self._filters = self.normalize(filters) + + def to_list_tuple(self) -> list: + return self._filters.to_list_tuple() + + def __bool__(self) -> bool: + return bool(self._filters) + + @classmethod + def normalize(cls, filters: _And | _Or | list | tuple | None): + """Convert raw filters to the 
`_Or(_And)` DNF representation""" + + def _valid_tuple(predicate: tuple): + col, op, val = predicate + if isinstance(col, tuple): + raise TypeError("filters must be List[Tuple] or List[List[Tuple]]") + if op in ("in", "not in"): + return (col, op, tuple(val)) + else: + return predicate + + def _valid_list(conjunction: list): + valid = [] + for predicate in conjunction: + if not isinstance(predicate, tuple): + raise TypeError(f"Predicate must be a tuple, got {predicate}") + valid.append(_valid_tuple(predicate)) + return valid + + if not filters: + result = None + elif isinstance(filters, list): + conjunctions = filters if isinstance(filters[0], list) else [filters] + result = cls._Or( + [cls._And(_valid_list(conjunction)) for conjunction in conjunctions] + ) + elif isinstance(filters, tuple): + result = cls._Or((cls._And((_valid_tuple(filters),)),)) + elif isinstance(filters, cls._Or): + result = cls._Or(se for e in filters for se in cls.normalize(e)) + elif isinstance(filters, cls._And): + total = [] + for c in itertools.product(*[cls.normalize(e) for e in filters]): + total.append(cls._And(se for e in c for se in e)) + result = cls._Or(total) + else: + raise TypeError(f"{type(filters)} not a supported type for DNF") + return result - # Credit: https://stackoverflow.com/a/58372345 - if not isinstance(expr, (Or, And)): - result = Or((And((expr,)),)) - elif isinstance(expr, Or): - result = Or(se for e in expr for se in to_dnf(e)) - elif isinstance(expr, And): - total = [] - for c in itertools.product(*[to_dnf(e) for e in expr]): - total.append(And(se for e in c for se in e)) - result = Or(total) - return result + def combine(self, other: DNF | _And | _Or | list | tuple | None) -> DNF: + """Combine with another DNF object""" + if not isinstance(other, DNF): + other = DNF(other) + assert isinstance(other, DNF) + if self._filters is None: + result = other._filters + elif other._filters is None: + result = self._filters + else: + result = self._And([self._filters, 
other._filters]) + return DNF(result) # Define all supported comparison functions @@ -170,20 +282,65 @@ def to_dnf(expr): np.not_equal: "!=", } +# Define all regenerable "pass-through" ops +# that do not affect filters. +_pass_through_ops = {M.fillna, M.astype} + # Define set of all "regenerable" operations. # Predicate pushdown is supported for graphs # comprised of `Blockwise` layers based on these # operations -_regenerable_ops = set(_comparison_symbols.keys()) | { - operator.and_, - operator.or_, - operator.getitem, - M.fillna, -} +_regenerable_ops = ( + set(_comparison_symbols.keys()) + | { + operator.and_, + operator.or_, + operator.getitem, + operator.inv, + M.isin, + M.isna, + } + | _pass_through_ops +) # Specify functions that must be generated with # a different API at the dataframe-collection level -_special_op_mappings = {M.fillna: dd._Frame.fillna} +_special_op_mappings = { + M.fillna: dd._Frame.fillna, + M.isin: dd._Frame.isin, + M.isna: dd._Frame.isna, + M.astype: dd._Frame.astype, +} + +# Convert _pass_through_ops to respect "special" mappings +_pass_through_ops = {_special_op_mappings.get(op, op) for op in _pass_through_ops} + + +def _preprocess_layers(input_layers): + # NOTE: This is a Layer-specific work-around to deal with + # the fact that `dd._Frame.isin(values)` will add a distinct + # `MaterializedLayer` for the `values` argument. 
+ # See: https://github.com/dask-contrib/dask-sql/issues/607 + skip = set() + layers = input_layers.copy() + for key, layer in layers.items(): + if key.startswith("isin-") and isinstance(layer, Blockwise): + indices = list(layer.indices) + for i, (k, ind) in enumerate(layer.indices): + if ( + ind is None + and isinstance(layers.get(k), MaterializedLayer) + and isinstance(layers[k].get(k), (np.ndarray, tuple)) + ): + # Replace `indices[i]` with a literal value and + # make sure we skip the `MaterializedLayer` that + # we are now fusing into the `isin` + value = layers[k][k] + value = value[0](*value[1:]) if callable(value[0]) else value + indices[i] = (value, None) + skip.add(k) + layer.indices = tuple(indices) + return {k: v for k, v in layers.items() if k not in skip} class RegenerableLayer: @@ -211,7 +368,8 @@ def _regenerate_collection( # Return regenerated layer if the work was # already done - _regen_cache = _regen_cache or {} + if _regen_cache is None: + _regen_cache = {} if self.layer.output in _regen_cache: return _regen_cache[self.layer.output] @@ -246,6 +404,7 @@ def _regenerate_collection( regen_kwargs = self.creation_info.get("kwargs", {}).copy() regen_kwargs = {k: v for k, v in self.creation_info.get("kwargs", {}).items()} regen_kwargs.update((new_kwargs or {}).get(self.layer.output, {})) + result = func(*inputs, *regen_args, **regen_kwargs) _regen_cache[self.layer.output] = result return result @@ -261,8 +420,14 @@ def _dnf_filter_expression(self, dsk): func = _blockwise_logical_dnf elif op == operator.getitem: func = _blockwise_getitem_dnf - elif op == dd._Frame.fillna: - func = _blockwise_fillna_dnf + elif op == dd._Frame.isin: + func = _blockwise_isin_dnf + elif op == dd._Frame.isna: + func = _blockwise_isna_dnf + elif op == operator.inv: + func = _blockwise_inv_dnf + elif op in _pass_through_ops: + func = _blockwise_pass_through_dnf else: raise ValueError(f"No DNF expression for {op}") @@ -288,7 +453,7 @@ def from_hlg(cls, hlg: HighLevelGraph): 
raise TypeError(f"Expected HighLevelGraph, got {type(hlg)}") _layers = {} - for key, layer in hlg.layers.items(): + for key, layer in _preprocess_layers(hlg.layers).items(): regenerable_layer = None if isinstance(layer, DataFrameIOLayer): regenerable_layer = RegenerableLayer(layer, layer.creation_info or {}) @@ -335,36 +500,50 @@ def _get_blockwise_input(input_index, indices: list, dsk: RegenerableGraph): return dsk.layers[key]._dnf_filter_expression(dsk) -def _blockwise_comparison_dnf(op, indices: list, dsk: RegenerableGraph): +def _inv(symbol: str): + if symbol == "in" and not PQ_NOT_IN_SUPPORT: + raise ValueError("This version of dask does not support 'not in'") + return { + ">": "<", + "<": ">", + ">=": "<=", + "<=": ">=", + "in": "not in", + "not in": "in", + "is": "is not", + "is not": "is", + }.get(symbol, symbol) + + +def _blockwise_comparison_dnf(op, indices: list, dsk: RegenerableGraph) -> DNF: # Return DNF expression pattern for a simple comparison left = _get_blockwise_input(0, indices, dsk) right = _get_blockwise_input(1, indices, dsk) - def _inv(symbol: str): - return { - ">": "<", - "<": ">", - ">=": "<=", - "<=": ">=", - }.get(symbol, symbol) - if is_arraylike(left) and hasattr(left, "item") and left.size == 1: left = left.item() # Need inverse comparison in read_parquet - return (right, _inv(_comparison_symbols[op]), left) + return DNF((right, _inv(_comparison_symbols[op]), left)) if is_arraylike(right) and hasattr(right, "item") and right.size == 1: right = right.item() - return to_dnf((left, _comparison_symbols[op], right)) + return DNF((left, _comparison_symbols[op], right)) -def _blockwise_logical_dnf(op, indices: list, dsk: RegenerableGraph): +def _blockwise_logical_dnf(op, indices: list, dsk: RegenerableGraph) -> DNF: # Return DNF expression pattern for logical "and" or "or" left = _get_blockwise_input(0, indices, dsk) right = _get_blockwise_input(1, indices, dsk) + + filters = [] + for val in [left, right]: + if not isinstance(val, (tuple, 
DNF)): + raise TypeError(f"Invalid logical operand: {val}") + filters.append(DNF(val)._filters) + if op == operator.or_: - return to_dnf(Or([left, right])) + return DNF(DNF._Or(filters)) elif op == operator.and_: - return to_dnf(And([left, right])) + return DNF(DNF._And(filters)) else: raise ValueError @@ -375,6 +554,39 @@ def _blockwise_getitem_dnf(op, indices: list, dsk: RegenerableGraph): return key -def _blockwise_fillna_dnf(op, indices: list, dsk: RegenerableGraph): +def _blockwise_pass_through_dnf(op, indices: list, dsk: RegenerableGraph): # Return dnf of input collection return _get_blockwise_input(0, indices, dsk) + + +def _blockwise_isin_dnf(op, indices: list, dsk: RegenerableGraph) -> DNF: + # Return DNF expression pattern for a simple "in" comparison + left = _get_blockwise_input(0, indices, dsk) + right = _get_blockwise_input(1, indices, dsk) + return DNF((left, "in", tuple(right))) + + +def _blockwise_isna_dnf(op, indices: list, dsk: RegenerableGraph) -> DNF: + # Return DNF expression pattern for `isna` + if not PQ_IS_SUPPORT: + raise ValueError("This version of dask does not support 'is' predicates.") + left = _get_blockwise_input(0, indices, dsk) + return DNF((left, "is", None)) + + +def _blockwise_inv_dnf(op, indices: list, dsk: RegenerableGraph) -> DNF: + # Return DNF expression pattern for the inverse of a comparison + expr = _get_blockwise_input(0, indices, dsk).to_list_tuple() + new_expr = [] + count = 0 + for conjunction in expr: + new_conjunction = [] + for col, op, val in conjunction: + count += 1 + new_conjunction.append((col, _inv(op), val)) + new_expr.append(DNF._And(new_conjunction)) + if count > 1: + # Havent taken the time to think through + # general inversion yet. 
+ raise ValueError("inv(DNF) case not implemented.") + return DNF(DNF._Or(new_expr)) diff --git a/dask_sql/physical/utils/sort.py b/dask_sql/physical/utils/sort.py index c54feae8f..b39c7993d 100644 --- a/dask_sql/physical/utils/sort.py +++ b/dask_sql/physical/utils/sort.py @@ -55,7 +55,7 @@ def apply_sort( by=sort_columns, ascending=sort_ascending[0], na_position="first" if sort_null_first[0] else "last", - ignore_index=True, + # ignore_index=True, ).persist() except ValueError: pass @@ -128,11 +128,8 @@ def is_topk_optimizable( sort_num_rows is None or not single_ascending or any(sort_null_first) - # pandas doesnt support nsmallest/nlargest with object dtypes - or ( - "pandas" in str(df._partition_type) - and any(df[sort_columns].dtypes == "object") - ) + # pandas/cudf don't support nsmallest/nlargest with object dtypes + or any(df[sort_columns].dtypes == "object") or ( sort_num_rows * len(df.columns) > dask_config.get("sql.sort.topk-nelem-limit") diff --git a/dask_sql/server/presto_jdbc.py b/dask_sql/server/presto_jdbc.py index d3c3880cb..02f77a1b4 100644 --- a/dask_sql/server/presto_jdbc.py +++ b/dask_sql/server/presto_jdbc.py @@ -37,15 +37,15 @@ def create_meta_data(c: Context): # catalogs = pd.DataFrame().append(create_catalog_row(catalog), ignore_index=True) # c.create_table("catalogs", catalogs, schema_name=system_schema) - schemas = pd.DataFrame().append(create_schema_row(), ignore_index=True) + schemas = pd.DataFrame(create_schema_row(), index=[0]) c.create_table("schemas", schemas, schema_name=system_schema) schema_rows = [] - tables = pd.DataFrame().append(create_table_row(), ignore_index=True) + tables = pd.DataFrame(create_table_row(), index=[0]) c.create_table("tables", tables, schema_name=system_schema) table_rows = [] - columns = pd.DataFrame().append(create_column_row(), ignore_index=True) + columns = pd.DataFrame(create_column_row(), index=[0]) c.create_table("columns", columns, schema_name=system_schema) column_rows = [] diff --git 
a/dask_sql/sql-schema.yaml b/dask_sql/sql-schema.yaml index 63aad44aa..eaab6936a 100644 --- a/dask_sql/sql-schema.yaml +++ b/dask_sql/sql-schema.yaml @@ -64,6 +64,11 @@ properties: description: | Whether to try pushing down filter predicates into IO (when possible). + dynamic_partition_pruning: + type: boolean + description: | + Whether to apply the dynamic partition pruning optimizer rule. + sort: type: object properties: diff --git a/dask_sql/sql.yaml b/dask_sql/sql.yaml index 786bc2e74..42434d20d 100644 --- a/dask_sql/sql.yaml +++ b/dask_sql/sql.yaml @@ -16,6 +16,8 @@ sql: predicate_pushdown: True + dynamic_partition_pruning: True + sort: topk-nelem-limit: 1000000 diff --git a/dask_sql/utils.py b/dask_sql/utils.py index 7426ca64d..8e2673b3e 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd -from dask_planner.rust import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value diff --git a/docker/conda.txt b/docker/conda.txt index 174991243..5f33680a9 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -4,16 +4,16 @@ pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 maven>=3.6.0 -pytest>=6.0.1 +pytest>=6.0.2 pytest-cov>=2.10.1 pytest-xdist mock>=4.0.3 sphinx>=3.2.1 tzlocal>=2.1 -# FIXME: handling is needed for httpx-based fastapi>=0.87.0 -fastapi>=0.69.0,<0.87.0 +fastapi>=0.92.0 +httpx>=0.24.1 uvicorn>=0.13.4 -pyarrow>=6.0.1 +pyarrow>=6.0.2 prompt_toolkit>=3.0.8 pygments>=2.7.1 scikit-learn>=1.0.0 @@ -21,4 +21,4 @@ intake>=0.6.0 pre-commit>=2.11.1 black=22.10.0 isort=5.12.0 -setuptools-rust>=1.5.2 +maturin>=1.1,<1.2 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index da965a53c..f17e0181f 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -14,32 +14,34 @@ ENV PATH="/root/.cargo/bin:${PATH}" COPY docker/conda.txt /opt/dask_sql/ RUN mamba install -y \ # build requirements - 
"setuptools-rust>=1.5.2" \ + "maturin>=1.1,<1.2" \ # core dependencies "dask>=2022.3.0" \ "pandas>=1.4.0" \ - # FIXME: handling is needed for httpx-based fastapi>=0.87.0 - "fastapi>=0.69.0,<0.87.0" \ + "fastapi>=0.92.0" \ + "httpx>=0.24.1" \ "uvicorn>=0.13.4" \ "tzlocal>=2.1" \ "prompt_toolkit>=3.0.8" \ "pygments>=2.7.1" \ tabulate \ # additional dependencies - "pyarrow>=6.0.1" \ + "pyarrow>=6.0.2" \ "scikit-learn>=1.0.0" \ "intake>=0.6.0" \ && conda clean -ay # install dask-sql -COPY setup.py /opt/dask_sql/ +COPY Cargo.toml /opt/dask_sql/ +COPY Cargo.lock /opt/dask_sql/ +COPY pyproject.toml /opt/dask_sql/ COPY setup.cfg /opt/dask_sql/ -COPY versioneer.py /opt/dask_sql/ +COPY README.md /opt/dask_sql/ COPY .git /opt/dask_sql/.git -COPY dask_planner /opt/dask_sql/dask_planner +COPY src /opt/dask_sql/src COPY dask_sql /opt/dask_sql/dask_sql RUN cd /opt/dask_sql/ \ - && pip install -e . -vv + && CONDA_PREFIX="/opt/conda/" maturin develop # Set the script to execute COPY scripts/startup_script.py /opt/dask_sql/startup_script.py diff --git a/docs/environment.yml b/docs/environment.yml index bb463d028..c4f5ad52b 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -9,13 +9,12 @@ dependencies: - dask>=2022.3.0 - pandas>=1.4.0 - fugue>=0.7.3 - # FIXME: handling is needed for httpx-based fastapi>=0.87.0 - - fastapi>=0.69.0,<0.87.0 + - fastapi>=0.92.0 + - httpx>=0.24.1 - uvicorn>=0.13.4 - tzlocal>=2.1 - prompt_toolkit>=3.0.8 - pygments>=2.7.1 - tabulate - - setuptools-rust>=1.5.2 - ucx-proc=*=cpu - rust>=1.65.0 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index c9d8c6b0e..cce9cb599 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -4,11 +4,11 @@ dask-sphinx-theme>=3.0.0 dask>=2022.3.0 pandas>=1.4.0 fugue>=0.7.3 -# FIXME: handling is needed for httpx-based fastapi>=0.87.0 -fastapi>=0.69.0,<0.87.0 +fastapi>=0.92.0 +httpx>=0.24.1 uvicorn>=0.13.4 tzlocal>=2.1 prompt_toolkit>=3.0.8 pygments>=2.7.1 tabulate 
-setuptools-rust>=1.5.2 +maturin>=1.1,<1.2 diff --git a/docs/source/how_does_it_work.rst b/docs/source/how_does_it_work.rst index 32c736431..67d2eab01 100644 --- a/docs/source/how_does_it_work.rst +++ b/docs/source/how_does_it_work.rst @@ -22,7 +22,7 @@ No matter of via the Python API (:ref:`api`), the command line client (:ref:`cmd This function will first give the SQL string to the dask_planner Rust crate via the ``PyO3`` library. Inside this crate, Apache Arrow DataFusion is used to first parse the SQL string and then turn it into a relational algebra. For this, DataFusion uses the SQL language description specified in the `sqlparser-rs library `_ -We also include `SQL extensions specific to Dask-SQL `_. They specify custom language features, such as the ``CREATE MODEL`` statement. +We also include `SQL extensions specific to Dask-SQL `_. They specify custom language features, such as the ``CREATE MODEL`` statement. 3. SQL is (maybe) optimized --------------------------- diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 2ca0e99c5..4404facbb 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -84,12 +84,7 @@ After that, you can install the package in development mode pip install -e ".[dev]" -To compile the Rust code (after changes), run - -.. code-block:: bash - - python setup.py build_ext - +To compile the Rust code (after changes), the above command must be rerun. You can run the tests (after installation) with .. 
code-block:: bash diff --git a/pyproject.toml b/pyproject.toml index dfed2ba50..3caa92ddb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,86 @@ [build-system] -requires = ["setuptools", "wheel", "setuptools-rust"] +requires = ["maturin>=1.1,<1.2"] +build-backend = "maturin" -[tool.isort] -profile = "black" +[project] +name = "dask_sql" +description = "SQL query layer for Dask" +maintainers = [{name = "Nils Braun", email = "nilslennartbraun@gmail.com"}] +license = {text = "MIT"} +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Rust", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering", + "Topic :: System :: Distributed Computing", +] +readme = "README.md" +requires-python = ">=3.8" +dependencies = [ + "dask[dataframe]>=2022.3.0", + "distributed>=2022.3.0", + "pandas>=1.4.0", + "fastapi>=0.92.0", + "httpx>=0.24.1", + "uvicorn>=0.13.4", + "tzlocal>=2.1", + "prompt_toolkit>=3.0.8", + "pygments>=2.7.1", + "tabulate", +] +dynamic = ["version"] + +[project.urls] +Homepage = "https://github.com/dask-contrib/dask-sql" +Documentation = "https://dask-sql.readthedocs.io" +Source = "https://github.com/dask-contrib/dask-sql" + +[project.optional-dependencies] +dev = [ + "pytest>=6.0.1", + "pytest-cov>=2.10.1", + "mock>=4.0.3", + "sphinx>=3.2.1", + "pyarrow>=6.0.2", + "scikit-learn>=1.0.0", + "intake>=0.6.0", + "pre-commit", + "black==22.10.0", + "isort==5.12.0", +] +fugue = ["fugue>=0.7.3"] + +[project.entry-points."fugue.plugins"] +dasksql = "dask_sql.integrations.fugue:_register_engines[fugue]" + +[project.scripts] +dask-sql 
= "dask_sql.cmd:main" +dask-sql-server = "dask_sql.server.app:main" + +[tool.setuptools] +include-package-data = true +zip-safe = false +license-files = ["LICENSE.txt"] + +[tool.setuptools.packages] +find = {namespaces = false} [tool.maturin] +module-name = "dask_sql._datafusion_lib" include = [ { path = "Cargo.lock", format = "sdist" } ] -exclude = [".github/**", "ci/**", ".asf.yaml"] -# Require Cargo.lock is up to date +exclude = [".github/**", "continuous_integration/**"] locked = true + +[tool.isort] +profile = "black" diff --git a/setup.cfg b/setup.cfg index e8d125d4c..99254d2f6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ # https://flake8.readthedocs.io/en/latest/user/configuration.html # https://flake8.readthedocs.io/en/latest/user/error-codes.html # https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes -exclude = __init__.py,versioneer.py +exclude = __init__.py ignore = E203, # whitespace before ':' E231,E241, # Multiple spaces around "," @@ -19,11 +19,3 @@ per-file-ignores = # Ambiguous variable name E741, max-line-length = 150 - -[versioneer] -VCS = git -style = pep440 -versionfile_source = dask_sql/_version.py -versionfile_build = dask_sql/_version.py -tag_prefix = -parentdir_prefix = dask-sql- diff --git a/setup.py b/setup.py deleted file mode 100644 index 0109a8714..000000000 --- a/setup.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -import sys - -from setuptools import find_packages, setup -from setuptools_rust import Binding, RustExtension - -import versioneer - -long_description = "" -if os.path.exists("README.md"): - with open("README.md") as f: - long_description = f.read() - -needs_sphinx = "build_sphinx" in sys.argv -sphinx_requirements = ["sphinx>=3.2.1", "sphinx_rtd_theme"] if needs_sphinx else [] -debug_build = "debug" in sys.argv - -cmdclass = versioneer.get_cmdclass() - -setup( - name="dask_sql", - version=versioneer.get_version(), - description="SQL query layer for Dask", - url="https://github.com/dask-contrib/dask-sql/", 
- maintainer="Nils Braun", - maintainer_email="nilslennartbraun@gmail.com", - license="MIT", - long_description=long_description, - long_description_content_type="text/markdown", - packages=find_packages( - include=["dask_sql", "dask_sql.*", "dask_planner", "dask_planner.*"] - ), - package_data={"dask_sql": ["sql*.yaml"]}, - rust_extensions=[ - RustExtension( - "dask_planner.rust", - binding=Binding.PyO3, - path="dask_planner/Cargo.toml", - debug=debug_build, - ) - ], - python_requires=">=3.9", - setup_requires=sphinx_requirements, - install_requires=[ - "dask[dataframe]>=2022.3.0", - "distributed>=2022.3.0", - "pandas>=1.4.0", - # FIXME: handling is needed for httpx-based fastapi>=0.87.0 - "fastapi>=0.69.0,<0.87.0", - "uvicorn>=0.13.4", - "tzlocal>=2.1", - "prompt_toolkit>=3.0.8", - "pygments>=2.7.1", - "tabulate", - ], - extras_require={ - "dev": [ - "pytest>=6.0.1", - "pytest-cov>=2.10.1", - "mock>=4.0.3", - "sphinx>=3.2.1", - "pyarrow>=6.0.1", - "scikit-learn>=1.0.0", - "intake>=0.6.0", - "pre-commit", - "black==22.10.0", - "isort==5.12.0", - ], - "fugue": ["fugue>=0.7.3"], - }, - entry_points={ - "console_scripts": [ - "dask-sql-server = dask_sql.server.app:main", - "dask-sql = dask_sql.cmd:main", - ], - "fugue.plugins": [ - "dasksql = dask_sql.integrations.fugue:_register_engines[fugue]" - ], - }, - zip_safe=False, - cmdclass=cmdclass, - command_options={ - "build_sphinx": { - "source_dir": ("setup.py", "docs"), - } - }, -) diff --git a/dask_planner/src/dialect.rs b/src/dialect.rs similarity index 84% rename from dask_planner/src/dialect.rs rename to src/dialect.rs index 9fe013f3d..da4e213e1 100644 --- a/dask_planner/src/dialect.rs +++ b/src/dialect.rs @@ -77,6 +77,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "floor" => { @@ -108,6 +109,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } 
Token::Word(w) if w.value.to_lowercase() == "timestampadd" => { @@ -136,6 +138,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "timestampdiff" => { @@ -163,6 +166,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "to_timestamp" => { @@ -192,6 +196,37 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], + }))) + } + Token::Word(w) if w.value.to_lowercase() == "extract" => { + // EXTRACT(DATE FROM d) + parser.next_token(); // skip extract + parser.expect_token(&Token::LParen)?; + if !parser.parse_keywords(&[Keyword::DATE, Keyword::FROM]) { + // Parse EXTRACT(x FROM d) as normal + parser.prev_token(); + parser.prev_token(); + return Ok(None); + } + let expr = parser.parse_expr()?; + parser.expect_token(&Token::RParen)?; + + // convert to function args + let args = vec![ + FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Value( + Value::SingleQuotedString("DATE".to_string()), + ))), + FunctionArg::Unnamed(FunctionArgExpr::Expr(expr)), + ]; + + Ok(Some(Expr::Function(Function { + name: ObjectName(vec![Ident::new("extract_date")]), + args, + over: None, + distinct: false, + special: false, + order_by: vec![], }))) } _ => Ok(None), diff --git a/dask_planner/src/error.rs b/src/error.rs similarity index 100% rename from dask_planner/src/error.rs rename to src/error.rs diff --git a/dask_planner/src/expression.rs b/src/expression.rs similarity index 80% rename from dask_planner/src/expression.rs rename to src/expression.rs index 150d332b8..fccfa9d87 100644 --- a/dask_planner/src/expression.rs +++ b/src/expression.rs @@ -4,7 +4,21 @@ use datafusion_python::{ datafusion::arrow::datatypes::DataType, datafusion_common::{Column, DFField, DFSchema, ScalarValue}, datafusion_expr::{ - expr::{AggregateFunction, BinaryExpr, Cast, Sort, 
TryCast, WindowFunction}, + expr::{ + AggregateFunction, + AggregateUDF, + Alias, + BinaryExpr, + Cast, + Exists, + InList, + InSubquery, + ScalarFunction, + ScalarUDF, + Sort, + TryCast, + WindowFunction, + }, lit, utils::exprlist_to_fields, Between, @@ -30,7 +44,7 @@ use crate::{ }; /// An PyExpr that can be used on a DataFrame -#[pyclass(name = "Expression", module = "datafusion", subclass)] +#[pyclass(name = "Expression", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyExpr { pub expr: Expr, @@ -44,7 +58,7 @@ impl From for Expr { } } -#[pyclass(name = "ScalarValue", module = "datafusion", subclass)] +#[pyclass(name = "ScalarValue", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyScalarValue { pub scalar_value: ScalarValue, @@ -91,9 +105,10 @@ impl PyExpr { fn _rex_type(&self, expr: &Expr) -> RexType { match expr { Expr::Alias(..) => RexType::Alias, - Expr::Column(..) | Expr::QualifiedWildcard { .. } | Expr::GetIndexedField { .. } => { - RexType::Reference - } + Expr::Column(..) + | Expr::QualifiedWildcard { .. } + | Expr::GetIndexedField { .. } + | Expr::Wildcard => RexType::Reference, Expr::ScalarVariable(..) | Expr::Literal(..) => RexType::Literal, Expr::BinaryExpr { .. } | Expr::Not(..) @@ -101,7 +116,6 @@ impl PyExpr { | Expr::Negative(..) | Expr::IsNull(..) | Expr::Like { .. } - | Expr::ILike { .. } | Expr::SimilarTo { .. } | Expr::Between { .. } | Expr::Case { .. } @@ -113,7 +127,6 @@ impl PyExpr { | Expr::WindowFunction { .. } | Expr::AggregateUDF { .. } | Expr::InList { .. } - | Expr::Wildcard | Expr::ScalarUDF { .. } | Expr::Exists { .. } | Expr::InSubquery { .. 
} @@ -153,6 +166,9 @@ impl PyExpr { pub fn subquery_plan(&self) -> PyResult { match &self.expr { Expr::ScalarSubquery(subquery) => Ok(subquery.subquery.as_ref().clone().into()), + Expr::InSubquery(insubquery) => { + Ok(insubquery.subquery.subquery.as_ref().clone().into()) + } _ => Err(py_type_err(format!( "Attempted to extract a LogicalPlan instance from invalid Expr {:?}. Only Subquery and related variants are supported for this operation.", @@ -184,49 +200,61 @@ impl PyExpr { schema.merge(plan.schema().as_ref()); } let name = get_expr_name(&self.expr).map_err(py_runtime_err)?; - schema - .index_of_column(&Column::from_qualified_name(name.clone())) - .or_else(|_| { - // Handles cases when from_qualified_name doesn't format the Column correctly. - // "name" will always contain the name of the column. Anything in addition to - // that will be separated by a '.' and should be further referenced. - let parts = name.split('.').collect::>(); - let tbl_reference = match parts.len() { - // Single element means name contains just the column name so no TableReference - 1 => None, - // Tablename.column_name - 2 => Some( - TableReference::Bare { - table: Cow::Borrowed(parts[0]), + if name != "*" { + schema + .index_of_column(&Column::from_qualified_name(name.clone())) + .or_else(|_| { + // Handles cases when from_qualified_name doesn't format the Column correctly. + // "name" will always contain the name of the column. Anything in addition to + // that will be separated by a '.' and should be further referenced. 
+ match &self.expr { + Expr::Column(col) => { + schema.index_of_column(col).map_err(py_runtime_err) } - .to_owned_reference(), - ), - // Schema_name.table_name.column_name - 3 => Some( - TableReference::Partial { - schema: Cow::Borrowed(parts[0]), - table: Cow::Borrowed(parts[1]), + _ => { + let parts = name.split('.').collect::>(); + let tbl_reference = match parts.len() { + // Single element means name contains just the column name so no TableReference + 1 => None, + // Tablename.column_name + 2 => Some( + TableReference::Bare { + table: Cow::Borrowed(parts[0]), + } + .to_owned_reference(), + ), + // Schema_name.table_name.column_name + 3 => Some( + TableReference::Partial { + schema: Cow::Borrowed(parts[0]), + table: Cow::Borrowed(parts[1]), + } + .to_owned_reference(), + ), + // catalog_name.schema_name.table_name.column_name + 4 => Some( + TableReference::Full { + catalog: Cow::Borrowed(parts[0]), + schema: Cow::Borrowed(parts[1]), + table: Cow::Borrowed(parts[2]), + } + .to_owned_reference(), + ), + _ => None, + }; + + let col = Column { + relation: tbl_reference.clone(), + name: parts[parts.len() - 1].to_string(), + }; + schema.index_of_column(&col).map_err(py_runtime_err) } - .to_owned_reference(), - ), - // catalog_name.schema_name.table_name.column_name - 4 => Some( - TableReference::Full { - catalog: Cow::Borrowed(parts[0]), - schema: Cow::Borrowed(parts[1]), - table: Cow::Borrowed(parts[2]), - } - .to_owned_reference(), - ), - _ => None, - }; - - let col = Column { - relation: tbl_reference.clone(), - name: parts[parts.len() - 1].to_string(), - }; - schema.index_of_column(&col).map_err(py_runtime_err) - }) + } + }) + } else { + // Since this is wildcard any Column will do, just use first one + Ok(0) + } } _ => Err(py_runtime_err( "We need a valid LogicalPlan instance to get the Expr's index in the schema", @@ -271,7 +299,6 @@ impl PyExpr { | Expr::IsNotTrue(_) | Expr::IsNotFalse(_) | Expr::Like { .. } - | Expr::ILike { .. } | Expr::SimilarTo { .. 
} | Expr::IsNotUnknown(_) | Expr::Case { .. } @@ -315,8 +342,7 @@ impl PyExpr { } // Expr(s) that house the Expr instance to return in their bounded params - Expr::Alias(expr, ..) - | Expr::Not(expr) + Expr::Not(expr) | Expr::IsNull(expr) | Expr::IsNotNull(expr) | Expr::IsTrue(expr) @@ -330,15 +356,15 @@ impl PyExpr { | Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) | Expr::Sort(Sort { expr, .. }) - | Expr::InSubquery { expr, .. } => { + | Expr::InSubquery(InSubquery { expr, .. }) => { Ok(vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]) } // Expr variants containing a collection of Expr(s) for operands Expr::AggregateFunction(AggregateFunction { args, .. }) - | Expr::AggregateUDF { args, .. } - | Expr::ScalarFunction { args, .. } - | Expr::ScalarUDF { args, .. } + | Expr::AggregateUDF(AggregateUDF { args, .. }) + | Expr::ScalarFunction(ScalarFunction { args, .. }) + | Expr::ScalarUDF(ScalarUDF { args, .. }) | Expr::WindowFunction(WindowFunction { args, .. 
}) => Ok(args .iter() .map(|arg| PyExpr::from(arg.clone(), self.input_plan.clone())) @@ -353,21 +379,34 @@ impl PyExpr { let mut operands: Vec = Vec::new(); if let Some(e) = expr { - operands.push(PyExpr::from(*e.clone(), self.input_plan.clone())); + for (when, then) in when_then_expr { + operands.push(PyExpr::from( + Expr::BinaryExpr(BinaryExpr::new( + Box::new(*e.clone()), + Operator::Eq, + Box::new(*when.clone()), + )), + self.input_plan.clone(), + )); + operands.push(PyExpr::from(*then.clone(), self.input_plan.clone())); + } + } else { + for (when, then) in when_then_expr { + operands.push(PyExpr::from(*when.clone(), self.input_plan.clone())); + operands.push(PyExpr::from(*then.clone(), self.input_plan.clone())); + } }; - for (when, then) in when_then_expr { - operands.push(PyExpr::from(*when.clone(), self.input_plan.clone())); - operands.push(PyExpr::from(*then.clone(), self.input_plan.clone())); - } - if let Some(e) = else_expr { operands.push(PyExpr::from(*e.clone(), self.input_plan.clone())); }; Ok(operands) } - Expr::InList { expr, list, .. } => { + Expr::Alias(Alias { expr, .. }) => { + Ok(vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]) + } + Expr::InList(InList { expr, list, .. }) => { let mut operands: Vec = vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]; for list_elem in list { @@ -384,10 +423,6 @@ impl PyExpr { PyExpr::from(*expr.clone(), self.input_plan.clone()), PyExpr::from(*pattern.clone(), self.input_plan.clone()), ]), - Expr::ILike(Like { expr, pattern, .. }) => Ok(vec![ - PyExpr::from(*expr.clone(), self.input_plan.clone()), - PyExpr::from(*pattern.clone(), self.input_plan.clone()), - ]), Expr::SimilarTo(Like { expr, pattern, .. 
}) => Ok(vec![ PyExpr::from(*expr.clone(), self.input_plan.clone()), PyExpr::from(*pattern.clone(), self.input_plan.clone()), @@ -402,11 +437,14 @@ impl PyExpr { PyExpr::from(*low.clone(), self.input_plan.clone()), PyExpr::from(*high.clone(), self.input_plan.clone()), ]), + Expr::Wildcard => Ok(vec![PyExpr::from( + self.expr.clone(), + self.input_plan.clone(), + )]), // Currently un-support/implemented Expr types for Rex Call operations Expr::GroupingSet(..) | Expr::OuterReferenceColumn(_, _) - | Expr::Wildcard | Expr::QualifiedWildcard { .. } | Expr::ScalarSubquery(..) | Expr::Placeholder { .. } @@ -425,8 +463,8 @@ impl PyExpr { op, right: _, }) => format!("{op}"), - Expr::ScalarFunction { fun, args: _ } => format!("{fun}"), - Expr::ScalarUDF { fun, .. } => fun.name.clone(), + Expr::ScalarFunction(ScalarFunction { fun, args: _ }) => format!("{fun}"), + Expr::ScalarUDF(ScalarUDF { fun, .. }) => fun.name.clone(), Expr::Cast { .. } => "cast".to_string(), Expr::Between { .. } => "between".to_string(), Expr::Case { .. } => "case".to_string(), @@ -439,21 +477,19 @@ impl PyExpr { Expr::IsNotFalse(_) => "is not false".to_string(), Expr::IsNotUnknown(_) => "is not unknown".to_string(), Expr::InList { .. } => "in list".to_string(), + Expr::InSubquery(..) => "in subquery".to_string(), Expr::Negative(..) => "negative".to_string(), Expr::Not(..) => "not".to_string(), - Expr::Like(Like { negated, .. }) => { - if *negated { - "not like".to_string() - } else { - "like".to_string() - } - } - Expr::ILike(Like { negated, .. }) => { - if *negated { - "not ilike".to_string() - } else { - "ilike".to_string() - } + Expr::Like(Like { + negated, + case_insensitive, + .. + }) => { + format!( + "{}{}like", + if *negated { "not " } else { "" }, + if *case_insensitive { "i" } else { "" } + ) } Expr::SimilarTo(Like { negated, .. }) => { if *negated { @@ -546,8 +582,13 @@ impl PyExpr { ScalarValue::List(..) => "List", ScalarValue::Struct(..) 
=> "Struct", ScalarValue::FixedSizeBinary(_, _) => "FixedSizeBinary", + ScalarValue::Fixedsizelist(..) => "Fixedsizelist", + ScalarValue::DurationSecond(..) => "DurationSecond", + ScalarValue::DurationMillisecond(..) => "DurationMillisecond", + ScalarValue::DurationMicrosecond(..) => "DurationMicrosecond", + ScalarValue::DurationNanosecond(..) => "DurationNanosecond", }, - Expr::ScalarFunction { fun, args: _ } => match fun { + Expr::ScalarFunction(ScalarFunction { fun, args: _ }) => match fun { BuiltinScalarFunction::Abs => "Abs", BuiltinScalarFunction::DatePart => "DatePart", _ => { @@ -627,9 +668,9 @@ impl PyExpr { pub fn get_filter_expr(&self) -> PyResult> { // TODO refactor to avoid duplication match &self.expr { - Expr::Alias(expr, _) => match expr.as_ref() { + Expr::Alias(Alias { expr, .. }) => match expr.as_ref() { Expr::AggregateFunction(AggregateFunction { filter, .. }) - | Expr::AggregateUDF { filter, .. } => match filter { + | Expr::AggregateUDF(AggregateUDF { filter, .. }) => match filter { Some(filter) => { Ok(Some(PyExpr::from(*filter.clone(), self.input_plan.clone()))) } @@ -640,7 +681,7 @@ impl PyExpr { )), }, Expr::AggregateFunction(AggregateFunction { filter, .. }) - | Expr::AggregateUDF { filter, .. } => match filter { + | Expr::AggregateUDF(AggregateUDF { filter, .. }) => match filter { Some(filter) => Ok(Some(PyExpr::from(*filter.clone(), self.input_plan.clone()))), None => Ok(None), }, @@ -729,7 +770,10 @@ impl PyExpr { ScalarValue::TimestampNanosecond(iv, tz) | ScalarValue::TimestampMicrosecond(iv, tz) | ScalarValue::TimestampMillisecond(iv, tz) - | ScalarValue::TimestampSecond(iv, tz) => Ok((*iv, tz.clone())), + | ScalarValue::TimestampSecond(iv, tz) => match tz { + Some(time_zone) => Ok((*iv, Some(time_zone.to_string()))), + None => Ok((*iv, None)), + }, other => Err(unexpected_literal_value(other)), } } @@ -780,9 +824,9 @@ impl PyExpr { pub fn is_negated(&self) -> PyResult { match &self.expr { Expr::Between(Between { negated, .. 
}) - | Expr::Exists { negated, .. } - | Expr::InList { negated, .. } - | Expr::InSubquery { negated, .. } => Ok(*negated), + | Expr::Exists(Exists { negated, .. }) + | Expr::InList(InList { negated, .. }) + | Expr::InSubquery(InSubquery { negated, .. }) => Ok(*negated), _ => Err(py_type_err(format!( "unknown Expr type {:?} encountered", &self.expr @@ -796,7 +840,7 @@ impl PyExpr { match &self.expr { Expr::AggregateFunction(funct) => Ok(funct.distinct), Expr::AggregateUDF { .. } => Ok(false), - Expr::Alias(expr, _) => match expr.as_ref() { + Expr::Alias(Alias { expr, .. }) => match expr.as_ref() { Expr::AggregateFunction(funct) => Ok(funct.distinct), Expr::AggregateUDF { .. } => Ok(false), _ => Err(py_type_err( @@ -837,9 +881,9 @@ impl PyExpr { #[pyo3(name = "getEscapeChar")] pub fn get_escape_char(&self) -> PyResult> { match &self.expr { - Expr::Like(Like { escape_char, .. }) - | Expr::ILike(Like { escape_char, .. }) - | Expr::SimilarTo(Like { escape_char, .. }) => Ok(*escape_char), + Expr::Like(Like { escape_char, .. }) | Expr::SimilarTo(Like { escape_char, .. }) => { + Ok(*escape_char) + } _ => Err(py_type_err(format!( "Provided Expr {:?} not one of Like/ILike/SimilarTo", &self.expr @@ -867,7 +911,11 @@ fn unexpected_literal_value(value: &ScalarValue) -> PyErr { fn get_expr_name(expr: &Expr) -> Result { match expr { - Expr::Alias(expr, _) => get_expr_name(expr), + Expr::Alias(Alias { expr, .. }) => get_expr_name(expr), + Expr::Wildcard => { + // 'Wildcard' means any and all columns. We get the first valid column name here + Ok("*".to_owned()) + } _ => Ok(expr.canonical_name()), } } @@ -880,6 +928,11 @@ pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { // appear in projections) so we just delegate to the contained expression instead expr_to_field(expr, input_plan) } + Expr::Wildcard => { + // Any column will do. 
We use the first column to keep things consistent + Ok(input_plan.schema().field(0).clone()) + } + Expr::InSubquery(insubquery) => expr_to_field(&insubquery.expr, input_plan), _ => { let fields = exprlist_to_fields(&[expr.clone()], input_plan).map_err(DaskPlannerError::from)?; diff --git a/dask_planner/src/lib.rs b/src/lib.rs similarity index 90% rename from dask_planner/src/lib.rs rename to src/lib.rs index f5305d900..921478973 100644 --- a/dask_planner/src/lib.rs +++ b/src/lib.rs @@ -12,8 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -#[pyo3(name = "rust")] -fn rust(py: Python, m: &PyModule) -> PyResult<()> { +fn _datafusion_lib(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); @@ -41,7 +40,7 @@ fn rust(py: Python, m: &PyModule) -> PyResult<()> { py.get_type::(), )?; - debug!("dask_planner Python module loaded"); + debug!("dask_sql native library loaded"); Ok(()) } diff --git a/dask_planner/src/parser.rs b/src/parser.rs similarity index 95% rename from dask_planner/src/parser.rs rename to src/parser.rs index 3147e6309..100f9c137 100644 --- a/dask_planner/src/parser.rs +++ b/src/parser.rs @@ -30,7 +30,7 @@ pub enum CustomExpr { Nested(Vec<(String, PySqlArg)>), } -#[pyclass(name = "SqlArg", module = "datafusion")] +#[pyclass(name = "SqlArg", module = "dask_sql")] #[derive(Debug, Clone, PartialEq, Eq)] pub struct PySqlArg { expr: Option, @@ -1374,14 +1374,7 @@ mod test { let statements = DaskParser::parse_sql(sql).unwrap(); assert_eq!(1, statements.len()); let actual = format!("{:?}", statements[0]); - let expected = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"timestampadd\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Value(SingleQuotedString(\"YEAR\")))), \ - Unnamed(Expr(Value(Number(\"2\", false)))), \ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None 
})))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"timestampadd\", quote_style: None }]), args: [Unnamed(Expr(Value(SingleQuotedString(\"YEAR\")))), Unnamed(Expr(Value(Number(\"2\", false)))), Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None })))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; assert!(actual.contains(expected)); } @@ -1391,26 +1384,16 @@ mod test { let statements1 = DaskParser::parse_sql(sql1).unwrap(); assert_eq!(1, statements1.len()); let actual1 = format!("{:?}", statements1[0]); - let expected1 = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), \ - Unnamed(Expr(Value(SingleQuotedString(\"%Y-%m-%d %H:%M:%S\"))))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected1 = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), args: [Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), Unnamed(Expr(Value(SingleQuotedString(\"%Y-%m-%d %H:%M:%S\"))))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident 
{ value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; + assert!(actual1.contains(expected1)); let sql2 = "SELECT TO_TIMESTAMP(d, \"%d/%m/%Y\") FROM t"; let statements2 = DaskParser::parse_sql(sql2).unwrap(); assert_eq!(1, statements2.len()); let actual2 = format!("{:?}", statements2[0]); - let expected2 = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), \ - Unnamed(Expr(Value(SingleQuotedString(\"\\\"%d/%m/%Y\\\"\"))))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected2 = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), args: [Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), Unnamed(Expr(Value(SingleQuotedString(\"\\\"%d/%m/%Y\\\"\"))))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; + assert!(actual2.contains(expected2)); } diff --git a/dask_planner/src/sql.rs b/src/sql.rs similarity index 89% rename from dask_planner/src/sql.rs rename to src/sql.rs index 22f6d01ac..c9a600225 100644 --- a/dask_planner/src/sql.rs +++ b/src/sql.rs @@ -21,7 +21,7 
@@ use datafusion_python::{ }, datafusion_expr::{ logical_plan::Extension, - AccumulatorFunctionImplementation, + AccumulatorFactoryFunction, AggregateUDF, LogicalPlan, ReturnTypeFunction, @@ -78,27 +78,14 @@ use crate::{ /// /// The following example demonstrates how to generate an optimized LogicalPlan /// from SQL using DaskSQLContext. -/// -/// ``` -/// use datafusion_python::datafusion::prelude::*; -/// -/// # use datafusion_python::datafusion_common::Result; -/// # #[tokio::main] -/// # async fn main() -> Result<()> { -/// let mut ctx = DaskSQLContext::new(); -/// let parsed_sql = ctx.parse_sql("SELECT COUNT(*) FROM test_table"); -/// let nonOptimizedRelAlgebra = ctx.logical_relational_algebra(parsed_sql); -/// let optmizedRelAlg = ctx.optimizeRelationalAlgebra(nonOptimizedRelAlgebra); -/// # Ok(()) -/// # } -/// ``` -#[pyclass(name = "DaskSQLContext", module = "dask_planner", subclass)] +#[pyclass(name = "DaskSQLContext", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskSQLContext { current_catalog: String, current_schema: String, schemas: HashMap, options: ConfigOptions, + dynamic_partition_pruning: bool, } impl ContextProvider for DaskSQLContext { @@ -243,6 +230,11 @@ impl ContextProvider for DaskSQLContext { DataType::Int64, DataType::Timestamp(TimeUnit::Nanosecond, None), ]), + TypeSignature::Exact(vec![ + DataType::Utf8, + DataType::Int64, + DataType::Int64, + ]), ], Volatility::Immutable, ); @@ -250,11 +242,23 @@ impl ContextProvider for DaskSQLContext { return Some(Arc::new(ScalarUDF::new(name, &sig, &rtf, &fun))); } "timestampdiff" => { - let sig = Signature::exact( + let sig = Signature::one_of( vec![ - DataType::Utf8, - DataType::Timestamp(TimeUnit::Nanosecond, None), - DataType::Timestamp(TimeUnit::Nanosecond, None), + TypeSignature::Exact(vec![ + DataType::Utf8, + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ]), + TypeSignature::Exact(vec![ + DataType::Utf8, + 
DataType::Date64, + DataType::Date64, + ]), + TypeSignature::Exact(vec![ + DataType::Utf8, + DataType::Int64, + DataType::Int64, + ]), ], Volatility::Immutable, ); @@ -309,6 +313,20 @@ impl ContextProvider for DaskSQLContext { let rtf: ReturnTypeFunction = Arc::new(|_| Ok(Arc::new(DataType::Int64))); return Some(Arc::new(ScalarUDF::new(name, &sig, &rtf, &fun))); } + "extract_date" => { + let sig = Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Utf8, DataType::Date64]), + TypeSignature::Exact(vec![ + DataType::Utf8, + DataType::Timestamp(TimeUnit::Nanosecond, None), + ]), + ], + Volatility::Immutable, + ); + let rtf: ReturnTypeFunction = Arc::new(|_| Ok(Arc::new(DataType::Date64))); + return Some(Arc::new(ScalarUDF::new(name, &sig, &rtf, &fun))); + } _ => (), } @@ -353,7 +371,7 @@ impl ContextProvider for DaskSQLContext { } fn get_aggregate_meta(&self, name: &str) -> Option> { - let acc: AccumulatorFunctionImplementation = + let acc: AccumulatorFactoryFunction = Arc::new(|_return_type| Err(DataFusionError::NotImplemented("".to_string()))); let st: StateTypeFunction = @@ -446,6 +464,13 @@ impl ContextProvider for DaskSQLContext { fn options(&self) -> &ConfigOptions { &self.options } + + fn get_window_meta( + &self, + _name: &str, + ) -> Option> { + unimplemented!("RUST: get_window_meta is not yet implemented for DaskSQLContext") + } } #[pymethods] @@ -457,9 +482,15 @@ impl DaskSQLContext { current_schema: default_schema_name.to_owned(), schemas: HashMap::new(), options: ConfigOptions::new(), + dynamic_partition_pruning: false, } } + pub fn apply_dynamic_partition_pruning(&mut self, config: bool) -> PyResult<()> { + self.dynamic_partition_pruning = config; + Ok(()) + } + /// Change the current schema pub fn use_schema(&mut self, schema_name: &str) -> PyResult<()> { if self.schemas.contains_key(schema_name) { @@ -546,13 +577,31 @@ impl DaskSQLContext { warn!("This LogicalPlan does not support Optimization. 
Returning original"); Ok(existing_plan) } - _ => optimizer::DaskSqlOptimizer::new() - .optimize(existing_plan.original_plan) - .map(|k| PyLogicalPlan { - original_plan: k, - current_node: None, - }) - .map_err(py_optimization_exp), + _ => { + let optimized_plan = optimizer::DaskSqlOptimizer::new() + .optimize(existing_plan.original_plan) + .map(|k| PyLogicalPlan { + original_plan: k, + current_node: None, + }) + .map_err(py_optimization_exp); + + if let Ok(optimized_plan) = optimized_plan { + if self.dynamic_partition_pruning { + optimizer::DaskSqlOptimizer::dynamic_partition_pruner() + .optimize_once(optimized_plan.original_plan) + .map(|k| PyLogicalPlan { + original_plan: k, + current_node: None, + }) + .map_err(py_optimization_exp) + } else { + Ok(optimized_plan) + } + } else { + optimized_plan + } + } } } Err(e) => Err(py_optimization_exp(e)), diff --git a/dask_planner/src/sql/column.rs b/src/sql/column.rs similarity index 91% rename from dask_planner/src/sql/column.rs rename to src/sql/column.rs index 63f043901..32250c382 100644 --- a/dask_planner/src/sql/column.rs +++ b/src/sql/column.rs @@ -1,7 +1,7 @@ use datafusion_python::datafusion_common::Column; use pyo3::prelude::*; -#[pyclass(name = "Column", module = "dask_planner", subclass)] +#[pyclass(name = "Column", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyColumn { /// Original Column instance diff --git a/dask_planner/src/sql/exceptions.rs b/src/sql/exceptions.rs similarity index 100% rename from dask_planner/src/sql/exceptions.rs rename to src/sql/exceptions.rs diff --git a/dask_planner/src/sql/function.rs b/src/sql/function.rs similarity index 93% rename from dask_planner/src/sql/function.rs rename to src/sql/function.rs index 39fa7635e..4169d386c 100644 --- a/dask_planner/src/sql/function.rs +++ b/src/sql/function.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; use super::types::PyDataType; -#[pyclass(name = "DaskFunction", module = "dask_planner", subclass)] +#[pyclass(name = 
"DaskFunction", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskFunction { #[pyo3(get, set)] diff --git a/dask_planner/src/sql/logical.rs b/src/sql/logical.rs similarity index 95% rename from dask_planner/src/sql/logical.rs rename to src/sql/logical.rs index d2096ba9b..e8f5f9f6f 100644 --- a/dask_planner/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -37,7 +37,7 @@ pub mod window; use datafusion_python::{ datafusion_common::{DFSchemaRef, DataFusionError}, - datafusion_expr::LogicalPlan, + datafusion_expr::{DdlStatement, LogicalPlan}, }; use pyo3::prelude::*; @@ -62,7 +62,7 @@ use self::{ }; use crate::{error::Result, sql::exceptions::py_type_err}; -#[pyclass(name = "LogicalPlan", module = "dask_planner", subclass)] +#[pyclass(name = "LogicalPlan", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyLogicalPlan { /// The original LogicalPlan that was parsed by DataFusion from the input SQL @@ -315,18 +315,19 @@ impl PyLogicalPlan { LogicalPlan::TableScan(_table_scan) => "TableScan", LogicalPlan::EmptyRelation(_empty_relation) => "EmptyRelation", LogicalPlan::Limit(_limit) => "Limit", - LogicalPlan::CreateExternalTable(_create_external_table) => "CreateExternalTable", - LogicalPlan::CreateMemoryTable(_create_memory_table) => "CreateMemoryTable", - LogicalPlan::DropTable(_drop_table) => "DropTable", - LogicalPlan::DropView(_drop_view) => "DropView", + LogicalPlan::Ddl(DdlStatement::CreateExternalTable { .. }) => "CreateExternalTable", + LogicalPlan::Ddl(DdlStatement::CreateMemoryTable { .. }) => "CreateMemoryTable", + LogicalPlan::Ddl(DdlStatement::DropTable { .. }) => "DropTable", + LogicalPlan::Ddl(DdlStatement::DropView { .. 
}) => "DropView", LogicalPlan::Values(_values) => "Values", LogicalPlan::Explain(_explain) => "Explain", LogicalPlan::Analyze(_analyze) => "Analyze", LogicalPlan::Subquery(_sub_query) => "Subquery", LogicalPlan::SubqueryAlias(_sqalias) => "SubqueryAlias", - LogicalPlan::CreateCatalogSchema(_create) => "CreateCatalogSchema", - LogicalPlan::CreateCatalog(_create_catalog) => "CreateCatalog", - LogicalPlan::CreateView(_create_view) => "CreateView", + LogicalPlan::Ddl(DdlStatement::CreateCatalogSchema { .. }) => "CreateCatalogSchema", + LogicalPlan::Ddl(DdlStatement::DropCatalogSchema { .. }) => "DropCatalogSchema", + LogicalPlan::Ddl(DdlStatement::CreateCatalog { .. }) => "CreateCatalog", + LogicalPlan::Ddl(DdlStatement::CreateView { .. }) => "CreateView", LogicalPlan::Statement(_) => "Statement", // Further examine and return the name that is a possible Dask-SQL Extension type LogicalPlan::Extension(extension) => { diff --git a/dask_planner/src/sql/logical/aggregate.rs b/src/sql/logical/aggregate.rs similarity index 87% rename from dask_planner/src/sql/logical/aggregate.rs rename to src/sql/logical/aggregate.rs index 0acc8b86e..1c4074239 100644 --- a/dask_planner/src/sql/logical/aggregate.rs +++ b/src/sql/logical/aggregate.rs @@ -1,5 +1,5 @@ use datafusion_python::datafusion_expr::{ - expr::AggregateFunction, + expr::{AggregateFunction, AggregateUDF, Alias}, logical_plan::{Aggregate, Distinct}, Expr, LogicalPlan, @@ -11,7 +11,7 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "Aggregate", module = "dask_planner", subclass)] +#[pyclass(name = "Aggregate", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyAggregate { aggregate: Option, @@ -73,9 +73,9 @@ impl PyAggregate { impl PyAggregate { fn _aggregation_arguments(&self, expr: &Expr) -> PyResult> { match expr { - Expr::Alias(expr, _) => self._aggregation_arguments(expr.as_ref()), + Expr::Alias(Alias { expr, .. 
}) => self._aggregation_arguments(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { fun: _, args, .. }) - | Expr::AggregateUDF { fun: _, args, .. } => match &self.aggregate { + | Expr::AggregateUDF(AggregateUDF { fun: _, args, .. }) => match &self.aggregate { Some(e) => py_expr_list(&e.input, args), None => Ok(vec![]), }, @@ -88,9 +88,9 @@ impl PyAggregate { fn _agg_func_name(expr: &Expr) -> PyResult { match expr { - Expr::Alias(expr, _) => _agg_func_name(expr.as_ref()), + Expr::Alias(Alias { expr, .. }) => _agg_func_name(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { fun, .. }) => Ok(fun.to_string()), - Expr::AggregateUDF { fun, .. } => Ok(fun.name.clone()), + Expr::AggregateUDF(AggregateUDF { fun, .. }) => Ok(fun.name.clone()), _ => Err(py_type_err( "Encountered a non Aggregate type in agg_func_name", )), @@ -99,7 +99,7 @@ fn _agg_func_name(expr: &Expr) -> PyResult { fn _distinct_agg_expr(expr: &Expr) -> PyResult { match expr { - Expr::Alias(expr, _) => _distinct_agg_expr(expr.as_ref()), + Expr::Alias(Alias { expr, .. }) => _distinct_agg_expr(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { distinct, .. }) => Ok(*distinct), Expr::AggregateUDF { .. 
} => { // DataFusion does not support DISTINCT in UDAFs diff --git a/dask_planner/src/sql/logical/alter_schema.rs b/src/sql/logical/alter_schema.rs similarity index 98% rename from dask_planner/src/sql/logical/alter_schema.rs rename to src/sql/logical/alter_schema.rs index 742ae513f..a7a8696b8 100644 --- a/dask_planner/src/sql/logical/alter_schema.rs +++ b/src/sql/logical/alter_schema.rs @@ -96,7 +96,7 @@ impl UserDefinedLogicalNode for AlterSchemaPlanNode { } } -#[pyclass(name = "AlterSchema", module = "dask_planner", subclass)] +#[pyclass(name = "AlterSchema", module = "dask_sql", subclass)] pub struct PyAlterSchema { pub(crate) alter_schema: AlterSchemaPlanNode, } diff --git a/dask_planner/src/sql/logical/alter_table.rs b/src/sql/logical/alter_table.rs similarity index 98% rename from dask_planner/src/sql/logical/alter_table.rs rename to src/sql/logical/alter_table.rs index 7f51a15c3..d6b49315b 100644 --- a/dask_planner/src/sql/logical/alter_table.rs +++ b/src/sql/logical/alter_table.rs @@ -102,7 +102,7 @@ impl UserDefinedLogicalNode for AlterTablePlanNode { } } -#[pyclass(name = "AlterTable", module = "dask_planner", subclass)] +#[pyclass(name = "AlterTable", module = "dask_sql", subclass)] pub struct PyAlterTable { pub(crate) alter_table: AlterTablePlanNode, } diff --git a/dask_planner/src/sql/logical/analyze_table.rs b/src/sql/logical/analyze_table.rs similarity index 98% rename from dask_planner/src/sql/logical/analyze_table.rs rename to src/sql/logical/analyze_table.rs index 9fa7fb219..6876c3704 100644 --- a/dask_planner/src/sql/logical/analyze_table.rs +++ b/src/sql/logical/analyze_table.rs @@ -99,7 +99,7 @@ impl UserDefinedLogicalNode for AnalyzeTablePlanNode { } } -#[pyclass(name = "AnalyzeTable", module = "dask_planner", subclass)] +#[pyclass(name = "AnalyzeTable", module = "dask_sql", subclass)] pub struct PyAnalyzeTable { pub(crate) analyze_table: AnalyzeTablePlanNode, } diff --git a/dask_planner/src/sql/logical/create_catalog_schema.rs 
b/src/sql/logical/create_catalog_schema.rs similarity index 98% rename from dask_planner/src/sql/logical/create_catalog_schema.rs rename to src/sql/logical/create_catalog_schema.rs index bc89b02ce..82a1426af 100644 --- a/dask_planner/src/sql/logical/create_catalog_schema.rs +++ b/src/sql/logical/create_catalog_schema.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for CreateCatalogSchemaPlanNode { } } -#[pyclass(name = "CreateCatalogSchema", module = "dask_planner", subclass)] +#[pyclass(name = "CreateCatalogSchema", module = "dask_sql", subclass)] pub struct PyCreateCatalogSchema { pub(crate) create_catalog_schema: CreateCatalogSchemaPlanNode, } diff --git a/dask_planner/src/sql/logical/create_experiment.rs b/src/sql/logical/create_experiment.rs similarity index 98% rename from dask_planner/src/sql/logical/create_experiment.rs rename to src/sql/logical/create_experiment.rs index 313357d75..06fe9d856 100644 --- a/dask_planner/src/sql/logical/create_experiment.rs +++ b/src/sql/logical/create_experiment.rs @@ -105,7 +105,7 @@ impl UserDefinedLogicalNode for CreateExperimentPlanNode { } } -#[pyclass(name = "CreateExperiment", module = "dask_planner", subclass)] +#[pyclass(name = "CreateExperiment", module = "dask_sql", subclass)] pub struct PyCreateExperiment { pub(crate) create_experiment: CreateExperimentPlanNode, } diff --git a/dask_planner/src/sql/logical/create_memory_table.rs b/src/sql/logical/create_memory_table.rs similarity index 89% rename from dask_planner/src/sql/logical/create_memory_table.rs rename to src/sql/logical/create_memory_table.rs index 668295e0f..53ff9432e 100644 --- a/dask_planner/src/sql/logical/create_memory_table.rs +++ b/src/sql/logical/create_memory_table.rs @@ -1,12 +1,13 @@ use datafusion_python::datafusion_expr::{ logical_plan::{CreateMemoryTable, CreateView}, + DdlStatement, LogicalPlan, }; use pyo3::prelude::*; use crate::sql::{exceptions::py_type_err, logical::PyLogicalPlan}; -#[pyclass(name = "CreateMemoryTable", module = 
"dask_planner", subclass)] +#[pyclass(name = "CreateMemoryTable", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyCreateMemoryTable { create_memory_table: Option, @@ -85,13 +86,13 @@ impl TryFrom for PyCreateMemoryTable { fn try_from(logical_plan: LogicalPlan) -> Result { Ok(match logical_plan { - LogicalPlan::CreateMemoryTable(create_memory_table) => PyCreateMemoryTable { - create_memory_table: Some(create_memory_table), + LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(cmt)) => PyCreateMemoryTable { + create_memory_table: Some(cmt), create_view: None, }, - LogicalPlan::CreateView(create_view) => PyCreateMemoryTable { + LogicalPlan::Ddl(DdlStatement::CreateView(cv)) => PyCreateMemoryTable { create_memory_table: None, - create_view: Some(create_view), + create_view: Some(cv), }, _ => return Err(py_type_err("unexpected plan")), }) diff --git a/dask_planner/src/sql/logical/create_model.rs b/src/sql/logical/create_model.rs similarity index 98% rename from dask_planner/src/sql/logical/create_model.rs rename to src/sql/logical/create_model.rs index 782fe3325..7dbcdff95 100644 --- a/dask_planner/src/sql/logical/create_model.rs +++ b/src/sql/logical/create_model.rs @@ -101,7 +101,7 @@ impl UserDefinedLogicalNode for CreateModelPlanNode { } } -#[pyclass(name = "CreateModel", module = "dask_planner", subclass)] +#[pyclass(name = "CreateModel", module = "dask_sql", subclass)] pub struct PyCreateModel { pub(crate) create_model: CreateModelPlanNode, } diff --git a/dask_planner/src/sql/logical/create_table.rs b/src/sql/logical/create_table.rs similarity index 98% rename from dask_planner/src/sql/logical/create_table.rs rename to src/sql/logical/create_table.rs index 9271130c7..1c423415f 100644 --- a/dask_planner/src/sql/logical/create_table.rs +++ b/src/sql/logical/create_table.rs @@ -100,7 +100,7 @@ impl UserDefinedLogicalNode for CreateTablePlanNode { } } -#[pyclass(name = "CreateTable", module = "dask_planner", subclass)] +#[pyclass(name = "CreateTable", 
module = "dask_sql", subclass)] pub struct PyCreateTable { pub(crate) create_table: CreateTablePlanNode, } diff --git a/dask_planner/src/sql/logical/describe_model.rs b/src/sql/logical/describe_model.rs similarity index 97% rename from dask_planner/src/sql/logical/describe_model.rs rename to src/sql/logical/describe_model.rs index cb2087376..3e3563fe1 100644 --- a/dask_planner/src/sql/logical/describe_model.rs +++ b/src/sql/logical/describe_model.rs @@ -89,7 +89,7 @@ impl UserDefinedLogicalNode for DescribeModelPlanNode { } } -#[pyclass(name = "DescribeModel", module = "dask_planner", subclass)] +#[pyclass(name = "DescribeModel", module = "dask_sql", subclass)] pub struct PyDescribeModel { pub(crate) describe_model: DescribeModelPlanNode, } diff --git a/dask_planner/src/sql/logical/drop_model.rs b/src/sql/logical/drop_model.rs similarity index 98% rename from dask_planner/src/sql/logical/drop_model.rs rename to src/sql/logical/drop_model.rs index 71074905d..2715cb067 100644 --- a/dask_planner/src/sql/logical/drop_model.rs +++ b/src/sql/logical/drop_model.rs @@ -92,7 +92,7 @@ impl UserDefinedLogicalNode for DropModelPlanNode { } } -#[pyclass(name = "DropModel", module = "dask_planner", subclass)] +#[pyclass(name = "DropModel", module = "dask_sql", subclass)] pub struct PyDropModel { pub(crate) drop_model: DropModelPlanNode, } diff --git a/dask_planner/src/sql/logical/drop_schema.rs b/src/sql/logical/drop_schema.rs similarity index 97% rename from dask_planner/src/sql/logical/drop_schema.rs rename to src/sql/logical/drop_schema.rs index 2022a61c9..78d252d11 100644 --- a/dask_planner/src/sql/logical/drop_schema.rs +++ b/src/sql/logical/drop_schema.rs @@ -88,7 +88,7 @@ impl UserDefinedLogicalNode for DropSchemaPlanNode { } } -#[pyclass(name = "DropSchema", module = "dask_planner", subclass)] +#[pyclass(name = "DropSchema", module = "dask_sql", subclass)] pub struct PyDropSchema { pub(crate) drop_schema: DropSchemaPlanNode, } diff --git 
a/dask_planner/src/sql/logical/drop_table.rs b/src/sql/logical/drop_table.rs similarity index 71% rename from dask_planner/src/sql/logical/drop_table.rs rename to src/sql/logical/drop_table.rs index 7d58e8a47..504a104c1 100644 --- a/dask_planner/src/sql/logical/drop_table.rs +++ b/src/sql/logical/drop_table.rs @@ -1,9 +1,12 @@ -use datafusion_python::datafusion_expr::logical_plan::{DropTable, LogicalPlan}; +use datafusion_python::datafusion_expr::{ + logical_plan::{DropTable, LogicalPlan}, + DdlStatement, +}; use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "DropTable", module = "dask_planner", subclass)] +#[pyclass(name = "DropTable", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyDropTable { drop_table: DropTable, @@ -27,7 +30,7 @@ impl TryFrom for PyDropTable { fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - LogicalPlan::DropTable(drop_table) => Ok(PyDropTable { drop_table }), + LogicalPlan::Ddl(DdlStatement::DropTable(drop_table)) => Ok(PyDropTable { drop_table }), _ => Err(py_type_err("unexpected plan")), } } diff --git a/dask_planner/src/sql/logical/empty_relation.rs b/src/sql/logical/empty_relation.rs similarity index 94% rename from dask_planner/src/sql/logical/empty_relation.rs rename to src/sql/logical/empty_relation.rs index 5bd6659ce..6356f9c85 100644 --- a/dask_planner/src/sql/logical/empty_relation.rs +++ b/src/sql/logical/empty_relation.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "EmptyRelation", module = "dask_planner", subclass)] +#[pyclass(name = "EmptyRelation", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyEmptyRelation { empty_relation: EmptyRelation, diff --git a/dask_planner/src/sql/logical/explain.rs b/src/sql/logical/explain.rs similarity index 93% rename from dask_planner/src/sql/logical/explain.rs rename to src/sql/logical/explain.rs index 17f1e4ee2..839a731d8 100644 --- 
a/dask_planner/src/sql/logical/explain.rs +++ b/src/sql/logical/explain.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "Explain", module = "dask_planner", subclass)] +#[pyclass(name = "Explain", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyExplain { explain: Explain, diff --git a/dask_planner/src/sql/logical/export_model.rs b/src/sql/logical/export_model.rs similarity index 98% rename from dask_planner/src/sql/logical/export_model.rs rename to src/sql/logical/export_model.rs index e38551b58..58b5f7fad 100644 --- a/dask_planner/src/sql/logical/export_model.rs +++ b/src/sql/logical/export_model.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for ExportModelPlanNode { } } -#[pyclass(name = "ExportModel", module = "dask_planner", subclass)] +#[pyclass(name = "ExportModel", module = "dask_sql", subclass)] pub struct PyExportModel { pub(crate) export_model: ExportModelPlanNode, } diff --git a/dask_planner/src/sql/logical/filter.rs b/src/sql/logical/filter.rs similarity index 93% rename from dask_planner/src/sql/logical/filter.rs rename to src/sql/logical/filter.rs index a50d508ff..f2dc2e702 100644 --- a/dask_planner/src/sql/logical/filter.rs +++ b/src/sql/logical/filter.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Filter", module = "dask_planner", subclass)] +#[pyclass(name = "Filter", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyFilter { filter: Filter, diff --git a/dask_planner/src/sql/logical/join.rs b/src/sql/logical/join.rs similarity index 98% rename from dask_planner/src/sql/logical/join.rs rename to src/sql/logical/join.rs index d6c31b55b..3261e9217 100644 --- a/dask_planner/src/sql/logical/join.rs +++ b/src/sql/logical/join.rs @@ -15,7 +15,7 @@ use crate::{ sql::{column, exceptions::py_type_err}, }; -#[pyclass(name = "Join", module = "dask_planner", subclass)] +#[pyclass(name = "Join", module = 
"dask_sql", subclass)] #[derive(Clone)] pub struct PyJoin { join: Join, diff --git a/dask_planner/src/sql/logical/limit.rs b/src/sql/logical/limit.rs similarity index 95% rename from dask_planner/src/sql/logical/limit.rs rename to src/sql/logical/limit.rs index 189fdeea0..04d783fdd 100644 --- a/dask_planner/src/sql/logical/limit.rs +++ b/src/sql/logical/limit.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Limit", module = "dask_planner", subclass)] +#[pyclass(name = "Limit", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyLimit { limit: Limit, diff --git a/dask_planner/src/sql/logical/predict_model.rs b/src/sql/logical/predict_model.rs similarity index 98% rename from dask_planner/src/sql/logical/predict_model.rs rename to src/sql/logical/predict_model.rs index e8d723d2c..3f68ffdb4 100644 --- a/dask_planner/src/sql/logical/predict_model.rs +++ b/src/sql/logical/predict_model.rs @@ -89,7 +89,7 @@ impl UserDefinedLogicalNode for PredictModelPlanNode { } } -#[pyclass(name = "PredictModel", module = "dask_planner", subclass)] +#[pyclass(name = "PredictModel", module = "dask_sql", subclass)] pub struct PyPredictModel { pub(crate) predict_model: PredictModelPlanNode, } diff --git a/dask_planner/src/sql/logical/projection.rs b/src/sql/logical/projection.rs similarity index 83% rename from dask_planner/src/sql/logical/projection.rs rename to src/sql/logical/projection.rs index 99ed0d684..56e5e28d8 100644 --- a/dask_planner/src/sql/logical/projection.rs +++ b/src/sql/logical/projection.rs @@ -1,9 +1,14 @@ -use datafusion_python::datafusion_expr::{logical_plan::Projection, Expr, LogicalPlan}; +use datafusion_python::datafusion_expr::{ + expr::Alias, + logical_plan::Projection, + Expr, + LogicalPlan, +}; use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Projection", module = "dask_planner", subclass)] +#[pyclass(name = "Projection", 
module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyProjection { pub(crate) projection: Projection, @@ -14,7 +19,7 @@ impl PyProjection { fn projected_expressions(&mut self, local_expr: &PyExpr) -> Vec { let mut projs: Vec = Vec::new(); match &local_expr.expr { - Expr::Alias(expr, _name) => { + Expr::Alias(Alias { expr, .. }) => { let py_expr: PyExpr = PyExpr::from(*expr.clone(), Some(vec![self.projection.input.clone()])); projs.extend_from_slice(self.projected_expressions(&py_expr).as_slice()); @@ -35,9 +40,9 @@ impl PyProjection { PyExpr::from(expression, Some(vec![self.projection.input.clone()])); for expr in self.projected_expressions(&py_expr) { match expr.expr { - Expr::Alias(ex, name) => named.push(( + Expr::Alias(Alias { expr, name }) => named.push(( name.to_string(), - PyExpr::from(*ex, Some(vec![self.projection.input.clone()])), + PyExpr::from(*expr, Some(vec![self.projection.input.clone()])), )), _ => { if let Ok(name) = expr._column_name(&self.projection.input) { diff --git a/dask_planner/src/sql/logical/repartition_by.rs b/src/sql/logical/repartition_by.rs similarity index 96% rename from dask_planner/src/sql/logical/repartition_by.rs rename to src/sql/logical/repartition_by.rs index e931b88e7..687958571 100644 --- a/dask_planner/src/sql/logical/repartition_by.rs +++ b/src/sql/logical/repartition_by.rs @@ -10,7 +10,7 @@ use crate::{ sql::{exceptions::py_type_err, logical}, }; -#[pyclass(name = "RepartitionBy", module = "dask_planner", subclass)] +#[pyclass(name = "RepartitionBy", module = "dask_sql", subclass)] pub struct PyRepartitionBy { pub(crate) repartition: Repartition, } diff --git a/dask_planner/src/sql/logical/show_columns.rs b/src/sql/logical/show_columns.rs similarity index 98% rename from dask_planner/src/sql/logical/show_columns.rs rename to src/sql/logical/show_columns.rs index adfb584ef..cdd844127 100644 --- a/dask_planner/src/sql/logical/show_columns.rs +++ b/src/sql/logical/show_columns.rs @@ -92,7 +92,7 @@ impl 
UserDefinedLogicalNode for ShowColumnsPlanNode { } } -#[pyclass(name = "ShowColumns", module = "dask_planner", subclass)] +#[pyclass(name = "ShowColumns", module = "dask_sql", subclass)] pub struct PyShowColumns { pub(crate) show_columns: ShowColumnsPlanNode, } diff --git a/dask_planner/src/sql/logical/show_models.rs b/src/sql/logical/show_models.rs similarity index 97% rename from dask_planner/src/sql/logical/show_models.rs rename to src/sql/logical/show_models.rs index 026a179a5..a228769de 100644 --- a/dask_planner/src/sql/logical/show_models.rs +++ b/src/sql/logical/show_models.rs @@ -85,7 +85,7 @@ impl UserDefinedLogicalNode for ShowModelsPlanNode { } } -#[pyclass(name = "ShowModels", module = "dask_planner", subclass)] +#[pyclass(name = "ShowModels", module = "dask_sql", subclass)] pub struct PyShowModels { pub(crate) show_models: ShowModelsPlanNode, } diff --git a/dask_planner/src/sql/logical/show_schemas.rs b/src/sql/logical/show_schemas.rs similarity index 98% rename from dask_planner/src/sql/logical/show_schemas.rs rename to src/sql/logical/show_schemas.rs index 3e3ed4783..454afb51d 100644 --- a/dask_planner/src/sql/logical/show_schemas.rs +++ b/src/sql/logical/show_schemas.rs @@ -91,7 +91,7 @@ impl UserDefinedLogicalNode for ShowSchemasPlanNode { } } -#[pyclass(name = "ShowSchema", module = "dask_planner", subclass)] +#[pyclass(name = "ShowSchema", module = "dask_sql", subclass)] pub struct PyShowSchema { pub(crate) show_schema: ShowSchemasPlanNode, } diff --git a/dask_planner/src/sql/logical/show_tables.rs b/src/sql/logical/show_tables.rs similarity index 98% rename from dask_planner/src/sql/logical/show_tables.rs rename to src/sql/logical/show_tables.rs index 987f2546e..c01022828 100644 --- a/dask_planner/src/sql/logical/show_tables.rs +++ b/src/sql/logical/show_tables.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for ShowTablesPlanNode { } } -#[pyclass(name = "ShowTables", module = "dask_planner", subclass)] +#[pyclass(name = "ShowTables", module = 
"dask_sql", subclass)] pub struct PyShowTables { pub(crate) show_tables: ShowTablesPlanNode, } diff --git a/dask_planner/src/sql/logical/sort.rs b/src/sql/logical/sort.rs similarity index 93% rename from dask_planner/src/sql/logical/sort.rs rename to src/sql/logical/sort.rs index 9abcd3906..5a1f862a1 100644 --- a/dask_planner/src/sql/logical/sort.rs +++ b/src/sql/logical/sort.rs @@ -6,7 +6,7 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "Sort", module = "dask_planner", subclass)] +#[pyclass(name = "Sort", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PySort { sort: Sort, diff --git a/dask_planner/src/sql/logical/subquery_alias.rs b/src/sql/logical/subquery_alias.rs similarity index 85% rename from dask_planner/src/sql/logical/subquery_alias.rs rename to src/sql/logical/subquery_alias.rs index 1b23e5dc4..e98c78203 100644 --- a/dask_planner/src/sql/logical/subquery_alias.rs +++ b/src/sql/logical/subquery_alias.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "SubqueryAlias", module = "dask_planner", subclass)] +#[pyclass(name = "SubqueryAlias", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PySubqueryAlias { subquery_alias: SubqueryAlias, @@ -14,7 +14,7 @@ impl PySubqueryAlias { /// Returns a Vec of the sort expressions #[pyo3(name = "getAlias")] pub fn alias(&self) -> PyResult { - Ok(self.subquery_alias.alias.clone()) + Ok(self.subquery_alias.alias.clone().to_string()) } } diff --git a/dask_planner/src/sql/logical/table_scan.rs b/src/sql/logical/table_scan.rs similarity index 79% rename from dask_planner/src/sql/logical/table_scan.rs rename to src/sql/logical/table_scan.rs index 679d24c49..c9cb92ebd 100644 --- a/dask_planner/src/sql/logical/table_scan.rs +++ b/src/sql/logical/table_scan.rs @@ -1,8 +1,13 @@ -use std::sync::Arc; +use std::{sync::Arc, vec}; use datafusion_python::{ datafusion_common::{DFSchema, ScalarValue}, - datafusion_expr::{logical_plan::TableScan, 
Expr, LogicalPlan}, + datafusion_expr::{ + expr::{Alias, InList}, + logical_plan::TableScan, + Expr, + LogicalPlan, + }, }; use pyo3::prelude::*; @@ -12,14 +17,15 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "TableScan", module = "dask_planner", subclass)] +#[pyclass(name = "TableScan", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyTableScan { pub(crate) table_scan: TableScan, input: Arc, } -#[pyclass(name = "FilteredResult", module = "dask_planner", subclass)] +type FilterTuple = (String, String, Option>); +#[pyclass(name = "FilteredResult", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyFilteredResult { // Certain Expr(s) do not have supporting logic in pyarrow for IO filtering @@ -31,7 +37,7 @@ pub struct PyFilteredResult { // Expr(s) that can have their filtering logic performed in the pyarrow IO logic // are stored here in a DNF format that is expected by pyarrow. #[pyo3(get)] - pub filtered_exprs: Vec<(String, String, Vec)>, + pub filtered_exprs: Vec<(PyExpr, FilterTuple)>, } impl PyTableScan { @@ -45,16 +51,17 @@ impl PyTableScan { /// it as well if needed. pub fn _expand_dnf_filter( filter: &Expr, + input: &Arc, py: Python, - ) -> Result)>, DaskPlannerError> { - let mut filter_tuple: Vec<(String, String, Vec)> = Vec::new(); + ) -> Result, DaskPlannerError> { + let mut filter_tuple: Vec<(PyExpr, FilterTuple)> = Vec::new(); match filter { - Expr::InList { + Expr::InList(InList { expr, list, negated, - } => { + }) => { // Only handle simple Expr(s) for InList operations for now if PyTableScan::_valid_expr_type(list) { // While ANSI SQL would not allow for anything other than a Column or Literal @@ -62,7 +69,7 @@ impl PyTableScan { // IF it is something else it is returned to Dask to handle let ident = match *expr.clone() { Expr::Column(col) => Ok(col.name), - Expr::Alias(_, name) => Ok(name), + Expr::Alias(Alias { name, .. 
}) => Ok(name), Expr::Literal(val) => Ok(format!("{}", val)), _ => Err(DaskPlannerError::InvalidIOFilter(format!( "Invalid InList Expr type `{}`. using in Dask instead", @@ -75,7 +82,7 @@ impl PyTableScan { .iter() .map(|f| match f { Expr::Column(col) => Ok(col.name.clone().into_py(py)), - Expr::Alias(_, name) => Ok(name.clone().into_py(py)), + Expr::Alias(Alias { name, ..}) => Ok(name.clone().into_py(py)), Expr::Literal(val) => match val { ScalarValue::Boolean(val) => Ok(val.unwrap().into_py(py)), ScalarValue::Float32(val) => Ok(val.unwrap().into_py(py)), @@ -100,9 +107,12 @@ impl PyTableScan { .collect(); filter_tuple.push(( - ident.unwrap_or(expr.canonical_name()), - op.to_string(), - il?, + PyExpr::from(filter.clone(), Some(vec![input.clone()])), + ( + ident.unwrap_or(expr.canonical_name()), + op.to_string(), + Some(il?), + ), )); Ok(filter_tuple) } else { @@ -110,15 +120,35 @@ impl PyTableScan { "Invalid identifying column Expr instance `{}`. using in Dask instead", filter )); - Err::)>, DaskPlannerError>(er) + Err::, DaskPlannerError>(er) } } + Expr::IsNotNull(expr) => { + // Only handle simple Expr(s) for IsNotNull operations for now + let ident = match *expr.clone() { + Expr::Column(col) => Ok(col.name), + _ => Err(DaskPlannerError::InvalidIOFilter(format!( + "Invalid IsNotNull Expr type `{}`. 
using in Dask instead", + filter + ))), + }; + + filter_tuple.push(( + PyExpr::from(filter.clone(), Some(vec![input.clone()])), + ( + ident.unwrap_or(expr.canonical_name()), + "is not".to_string(), + None, + ), + )); + Ok(filter_tuple) + } _ => { let er = DaskPlannerError::InvalidIOFilter(format!( "Unable to apply filter: `{}` to IO reader, using in Dask instead", filter )); - Err::)>, DaskPlannerError>(er) + Err::, DaskPlannerError>(er) } } } @@ -132,12 +162,12 @@ impl PyTableScan { filters: &[Expr], py: Python, ) -> PyFilteredResult { - let mut filtered_exprs: Vec<(String, String, Vec)> = Vec::new(); + let mut filtered_exprs: Vec<(PyExpr, FilterTuple)> = Vec::new(); let mut unfiltered_exprs: Vec = Vec::new(); filters .iter() - .for_each(|f| match PyTableScan::_expand_dnf_filter(f, py) { + .for_each(|f| match PyTableScan::_expand_dnf_filter(f, input, py) { Ok(mut expanded_dnf_filter) => filtered_exprs.append(&mut expanded_dnf_filter), Err(_e) => { unfiltered_exprs.push(PyExpr::from(f.clone(), Some(vec![input.clone()]))) diff --git a/dask_planner/src/sql/logical/use_schema.rs b/src/sql/logical/use_schema.rs similarity index 97% rename from dask_planner/src/sql/logical/use_schema.rs rename to src/sql/logical/use_schema.rs index 7c2206310..0f804ce7a 100644 --- a/dask_planner/src/sql/logical/use_schema.rs +++ b/src/sql/logical/use_schema.rs @@ -85,7 +85,7 @@ impl UserDefinedLogicalNode for UseSchemaPlanNode { } } -#[pyclass(name = "UseSchema", module = "dask_planner", subclass)] +#[pyclass(name = "UseSchema", module = "dask_sql", subclass)] pub struct PyUseSchema { pub(crate) use_schema: UseSchemaPlanNode, } diff --git a/dask_planner/src/sql/logical/window.rs b/src/sql/logical/window.rs similarity index 96% rename from dask_planner/src/sql/logical/window.rs rename to src/sql/logical/window.rs index e104ccdb3..3dd9d8c0d 100644 --- a/dask_planner/src/sql/logical/window.rs +++ b/src/sql/logical/window.rs @@ -17,19 +17,19 @@ use crate::{ sql::exceptions::py_type_err, }; 
-#[pyclass(name = "Window", module = "dask_planner", subclass)] +#[pyclass(name = "Window", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindow { window: Window, } -#[pyclass(name = "WindowFrame", module = "dask_planner", subclass)] +#[pyclass(name = "WindowFrame", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindowFrame { window_frame: WindowFrame, } -#[pyclass(name = "WindowFrameBound", module = "dask_planner", subclass)] +#[pyclass(name = "WindowFrameBound", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindowFrameBound { frame_bound: WindowFrameBound, diff --git a/dask_planner/src/sql/optimizer.rs b/src/sql/optimizer.rs similarity index 85% rename from dask_planner/src/sql/optimizer.rs rename to src/sql/optimizer.rs index 68577cf2c..85f335572 100644 --- a/dask_planner/src/sql/optimizer.rs +++ b/src/sql/optimizer.rs @@ -1,11 +1,16 @@ +// Declare optimizer modules +pub mod decorrelate_where_exists; +pub mod decorrelate_where_in; +pub mod dynamic_partition_pruning; +pub mod join_reorder; +pub mod utils; + use std::sync::Arc; use datafusion_python::{ datafusion_common::DataFusionError, datafusion_expr::LogicalPlan, datafusion_optimizer::{ - decorrelate_where_exists::DecorrelateWhereExists, - decorrelate_where_in::DecorrelateWhereIn, eliminate_cross_join::EliminateCrossJoin, eliminate_limit::EliminateLimit, eliminate_outer_join::EliminateOuterJoin, @@ -22,10 +27,11 @@ use datafusion_python::{ OptimizerContext, }, }; -use log::{debug, trace}; - -mod join_reorder; +use decorrelate_where_exists::DecorrelateWhereExists; +use decorrelate_where_in::DecorrelateWhereIn; +use dynamic_partition_pruning::DynamicPartitionPruning; use join_reorder::JoinReorder; +use log::{debug, trace}; /// Houses the optimization logic for Dask-SQL. 
This optimization controls the optimizations /// and their ordering in regards to their impact on the underlying `LogicalPlan` instance @@ -86,6 +92,17 @@ impl DaskSqlOptimizer { } } + // Create a separate instance of this optimization rule, since we want to ensure that it only + // runs one time + pub fn dynamic_partition_pruner() -> Self { + let rule: Vec> = + vec![Arc::new(DynamicPartitionPruning::new())]; + + Self { + optimizer: Optimizer::with_rules(rule), + } + } + /// Iterates through the configured `OptimizerRule`(s) to transform the input `LogicalPlan` /// to its final optimized form pub(crate) fn optimize(&self, plan: LogicalPlan) -> Result { @@ -93,6 +110,14 @@ impl DaskSqlOptimizer { self.optimizer.optimize(&plan, &config, Self::observe) } + /// Iterates once through the configured `OptimizerRule`(s) to transform the input `LogicalPlan` + /// to its final optimized form + pub(crate) fn optimize_once(&self, plan: LogicalPlan) -> Result { + let mut config = OptimizerContext::new(); + config = OptimizerContext::with_max_passes(config, 1); + self.optimizer.optimize(&plan, &config, Self::observe) + } + fn observe(optimized_plan: &LogicalPlan, optimization: &dyn OptimizerRule) { trace!( "== AFTER APPLYING RULE {} ==\n{}\n", @@ -129,17 +154,7 @@ mod tests { AND (cast('2002-05-08' as date) + interval '5 days')\ )"; let plan = test_sql(sql)?; - let expected = r#"Projection: test.col_int32 - Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.__value - CrossJoin: - TableScan: test projection=[col_int32] - SubqueryAlias: __scalar_sq_1 - Projection: AVG(test.col_int32) AS __value - Aggregate: groupBy=[[]], aggr=[[AVG(test.col_int32)]] - Projection: test.col_int32 - Filter: test.col_utf8 >= Utf8("2002-05-08") AND test.col_utf8 <= Utf8("2002-05-13") - TableScan: test projection=[col_int32, col_utf8]"#; - assert_eq!(expected, format!("{:?}", plan)); + assert!(format!("{:?}", plan).contains(r#"<= Date32("11820")"#)); Ok(()) } @@ -212,6 +227,13 @@ mod tests { fn 
get_variable_type(&self, _variable_names: &[String]) -> Option { None } + + fn get_window_meta( + &self, + _name: &str, + ) -> Option> { + None + } } struct MyTableSource { diff --git a/src/sql/optimizer/decorrelate_where_exists.rs b/src/sql/optimizer/decorrelate_where_exists.rs new file mode 100644 index 000000000..5944c83ae --- /dev/null +++ b/src/sql/optimizer/decorrelate_where_exists.rs @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use datafusion_python::{ + datafusion_common::{Column, DataFusionError, Result}, + datafusion_expr::{ + expr::Exists, + logical_plan::{Distinct, Filter, JoinType, Subquery}, + Expr, + LogicalPlan, + LogicalPlanBuilder, + }, + datafusion_optimizer::optimizer::{ApplyOrder, OptimizerConfig, OptimizerRule}, +}; + +use crate::sql::optimizer::utils::{ + collect_subquery_cols, + conjunction, + extract_join_filters, + split_conjunction, +}; + +/// Optimizer rule for rewriting subquery filters to joins +#[derive(Default)] +pub struct DecorrelateWhereExists {} + +impl DecorrelateWhereExists { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } + + /// Finds expressions that have a where in subquery (and recurse when found) + /// + /// # Arguments + /// + /// * `predicate` - A conjunction to split and search + /// * `optimizer_config` - For generating unique subquery aliases + /// + /// Returns a tuple (subqueries, non-subquery expressions) + fn extract_subquery_exprs( + &self, + predicate: &Expr, + config: &dyn OptimizerConfig, + ) -> Result<(Vec, Vec)> { + let filters = split_conjunction(predicate); + + let mut subqueries = vec![]; + let mut others = vec![]; + for it in filters.iter() { + match it { + Expr::Exists(Exists { subquery, negated }) => { + let subquery_plan = self + .try_optimize(&subquery.subquery, config)? 
+ .map(Arc::new) + .unwrap_or_else(|| subquery.subquery.clone()); + let new_subquery = subquery.with_plan(subquery_plan); + subqueries.push(SubqueryInfo::new(new_subquery, *negated)); + } + _ => others.push((*it).clone()), + } + } + + Ok((subqueries, others)) + } +} + +impl OptimizerRule for DecorrelateWhereExists { + fn try_optimize( + &self, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, + ) -> Result> { + match plan { + LogicalPlan::Filter(filter) => { + let (subqueries, other_exprs) = + self.extract_subquery_exprs(&filter.predicate, config)?; + if subqueries.is_empty() { + // regular filter, no subquery exists clause here + return Ok(None); + } + + // iterate through all exists clauses in predicate, turning each into a join + let mut cur_input = filter.input.as_ref().clone(); + for subquery in subqueries { + if let Some(x) = optimize_exists(&subquery, &cur_input)? { + cur_input = x; + } else { + return Ok(None); + } + } + + let expr = conjunction(other_exprs); + if let Some(expr) = expr { + let new_filter = Filter::try_new(expr, Arc::new(cur_input))?; + cur_input = LogicalPlan::Filter(new_filter); + } + + Ok(Some(cur_input)) + } + _ => Ok(None), + } + } + + fn name(&self) -> &str { + "decorrelate_where_exists" + } + + fn apply_order(&self) -> Option { + Some(ApplyOrder::TopDown) + } +} + +/// Takes a query like: +/// +/// SELECT t1.id +/// FROM t1 +/// WHERE exists +/// ( +/// SELECT t2.id FROM t2 WHERE t1.id = t2.id +/// ) +/// +/// and optimizes it into: +/// +/// SELECT t1.id +/// FROM t1 LEFT SEMI +/// JOIN t2 +/// ON t1.id = t2.id +/// +/// # Arguments +/// +/// * query_info - The subquery and negated(exists/not exists) info. +/// * outer_input - The non-subquery portion (relation t1) +fn optimize_exists( + query_info: &SubqueryInfo, + outer_input: &LogicalPlan, +) -> Result> { + let subquery = query_info.query.subquery.as_ref(); + if let Some((join_filter, optimized_subquery)) = optimize_subquery(subquery)? 
{ + // join our sub query into the main plan + let join_type = match query_info.negated { + true => JoinType::LeftAnti, + false => JoinType::LeftSemi, + }; + + let new_plan = LogicalPlanBuilder::from(outer_input.clone()) + .join( + optimized_subquery, + join_type, + (Vec::::new(), Vec::::new()), + Some(join_filter), + )? + .build()?; + + Ok(Some(new_plan)) + } else { + Ok(None) + } +} +/// Optimize the subquery and extract the possible join filter. +/// This function can't optimize non-correlated subquery, and will return None. +fn optimize_subquery(subquery: &LogicalPlan) -> Result> { + match subquery { + LogicalPlan::Distinct(subqry_distinct) => { + let distinct_input = &subqry_distinct.input; + let optimized_plan = optimize_subquery(distinct_input)?.map(|(filters, right)| { + ( + filters, + LogicalPlan::Distinct(Distinct { + input: Arc::new(right), + }), + ) + }); + Ok(optimized_plan) + } + LogicalPlan::Projection(projection) => { + // extract join filters + let (join_filters, subquery_input) = extract_join_filters(&projection.input)?; + // cannot optimize non-correlated subquery + if join_filters.is_empty() { + return Ok(None); + } + let input_schema = subquery_input.schema(); + let project_exprs: Vec = + collect_subquery_cols(&join_filters, input_schema.clone())? + .into_iter() + .map(Expr::Column) + .collect(); + let right = LogicalPlanBuilder::from(subquery_input) + .project(project_exprs)? + .build()?; + + // join_filters is not empty. 
+ let join_filter = conjunction(join_filters).ok_or_else(|| { + DataFusionError::Internal("join filters should not be empty".to_string()) + })?; + Ok(Some((join_filter, right))) + } + _ => Ok(None), + } +} + +struct SubqueryInfo { + query: Subquery, + negated: bool, +} + +impl SubqueryInfo { + pub fn new(query: Subquery, negated: bool) -> Self { + Self { query, negated } + } +} diff --git a/src/sql/optimizer/decorrelate_where_in.rs b/src/sql/optimizer/decorrelate_where_in.rs new file mode 100644 index 000000000..014f22092 --- /dev/null +++ b/src/sql/optimizer/decorrelate_where_in.rs @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use datafusion_python::{ + datafusion_common::{alias::AliasGenerator, context, Column, Result}, + datafusion_expr::{ + expr::InSubquery, + expr_rewriter::unnormalize_col, + logical_plan::{JoinType, Projection, Subquery}, + Expr, + Filter, + LogicalPlan, + LogicalPlanBuilder, + }, + datafusion_optimizer::optimizer::{ApplyOrder, OptimizerConfig, OptimizerRule}, +}; +use log::debug; + +use crate::sql::optimizer::utils::{ + collect_subquery_cols, + conjunction, + extract_join_filters, + only_or_err, + replace_qualified_name, + split_conjunction, +}; + +#[derive(Default)] +pub struct DecorrelateWhereIn { + alias: AliasGenerator, +} + +impl DecorrelateWhereIn { + #[allow(missing_docs)] + pub fn new() -> Self { + Self::default() + } + + /// Finds expressions that have a where in subquery (and recurses when found) + /// + /// # Arguments + /// + /// * `predicate` - A conjunction to split and search + /// * `optimizer_config` - For generating unique subquery aliases + /// + /// Returns a tuple (subqueries, non-subquery expressions) + fn extract_subquery_exprs( + &self, + predicate: &Expr, + config: &dyn OptimizerConfig, + ) -> Result<(Vec, Vec)> { + let filters = split_conjunction(predicate); // TODO: disjunctions + + let mut subqueries = vec![]; + let mut others = vec![]; + for it in filters.iter() { + match it { + Expr::InSubquery(InSubquery { + expr, + subquery, + negated, + }) => { + let subquery_plan = self + .try_optimize(&subquery.subquery, config)? 
+ .map(Arc::new) + .unwrap_or_else(|| subquery.subquery.clone()); + let new_subquery = subquery.with_plan(subquery_plan); + subqueries.push(SubqueryInfo::new(new_subquery, (**expr).clone(), *negated)); + // TODO: if subquery doesn't get optimized, optimized children are lost + } + _ => others.push((*it).clone()), + } + } + + Ok((subqueries, others)) + } +} + +impl OptimizerRule for DecorrelateWhereIn { + fn try_optimize( + &self, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, + ) -> Result> { + match plan { + LogicalPlan::Filter(filter) => { + let (subqueries, other_exprs) = + self.extract_subquery_exprs(&filter.predicate, config)?; + if subqueries.is_empty() { + // regular filter, no subquery exists clause here + return Ok(None); + } + + // iterate through all exists clauses in predicate, turning each into a join + let mut cur_input = filter.input.as_ref().clone(); + for subquery in subqueries { + cur_input = optimize_where_in(&subquery, &cur_input, &self.alias)?; + } + + let expr = conjunction(other_exprs); + if let Some(expr) = expr { + let new_filter = Filter::try_new(expr, Arc::new(cur_input))?; + cur_input = LogicalPlan::Filter(new_filter); + } + + Ok(Some(cur_input)) + } + _ => Ok(None), + } + } + + fn name(&self) -> &str { + "decorrelate_where_in" + } + + fn apply_order(&self) -> Option { + Some(ApplyOrder::TopDown) + } +} + +/// Optimize the where in subquery to left-anti/left-semi join. +/// If the subquery is a correlated subquery, we need extract the join predicate from the subquery. 
+/// +/// For example, given a query like: +/// `select t1.a, t1.b from t1 where t1 in (select t2.a from t2 where t1.b = t2.b and t1.c > t2.c)` +/// +/// The optimized plan will be: +/// +/// ```text +/// Projection: t1.a, t1.b +/// LeftSemi Join: Filter: t1.a = __correlated_sq_1.a AND t1.b = __correlated_sq_1.b AND t1.c > __correlated_sq_1.c +/// TableScan: t1 +/// SubqueryAlias: __correlated_sq_1 +/// Projection: t2.a AS a, t2.b, t2.c +/// TableScan: t2 +/// ``` +fn optimize_where_in( + query_info: &SubqueryInfo, + left: &LogicalPlan, + alias: &AliasGenerator, +) -> Result { + let projection = Projection::try_from_plan(&query_info.query.subquery) + .map_err(|e| context!("a projection is required", e))?; + let subquery_input = projection.input.clone(); + // TODO add the validate logic to Analyzer + let subquery_expr = only_or_err(projection.expr.as_slice()) + .map_err(|e| context!("single expression projection required", e))?; + + // extract join filters + let (join_filters, subquery_input) = extract_join_filters(subquery_input.as_ref())?; + + // in_predicate may be also include in the join filters, remove it from the join filters. + let in_predicate = Expr::eq(query_info.where_in_expr.clone(), subquery_expr.clone()); + let join_filters = remove_duplicated_filter(join_filters, in_predicate); + + // replace qualified name with subquery alias. 
+ let subquery_alias = alias.next("__correlated_sq"); + let input_schema = subquery_input.schema(); + let mut subquery_cols = collect_subquery_cols(&join_filters, input_schema.clone())?; + let join_filter = conjunction(join_filters).map_or(Ok(None), |filter| { + replace_qualified_name(filter, &subquery_cols, &subquery_alias).map(Option::Some) + })?; + + // add projection + if let Expr::Column(col) = subquery_expr { + subquery_cols.remove(col); + } + let subquery_expr_name = format!("{:?}", unnormalize_col(subquery_expr.clone())); + let first_expr = subquery_expr.clone().alias(subquery_expr_name.clone()); + let projection_exprs: Vec = [first_expr] + .into_iter() + .chain(subquery_cols.into_iter().map(Expr::Column)) + .collect(); + + let right = LogicalPlanBuilder::from(subquery_input) + .project(projection_exprs)? + .alias(subquery_alias.clone())? + .build()?; + + // join our sub query into the main plan + let join_type = match query_info.negated { + true => JoinType::LeftAnti, + false => JoinType::LeftSemi, + }; + let right_join_col = Column::new(Some(subquery_alias), subquery_expr_name); + let in_predicate = Expr::eq( + query_info.where_in_expr.clone(), + Expr::Column(right_join_col), + ); + let join_filter = join_filter + .map(|filter| in_predicate.clone().and(filter)) + .unwrap_or_else(|| in_predicate); + + let new_plan = LogicalPlanBuilder::from(left.clone()) + .join( + right, + join_type, + (Vec::::new(), Vec::::new()), + Some(join_filter), + )? 
+ .build()?; + + debug!("where in optimized:\n{}", new_plan.display_indent()); + Ok(new_plan) +} + +fn remove_duplicated_filter(filters: Vec, in_predicate: Expr) -> Vec { + filters + .into_iter() + .filter(|filter| { + if filter == &in_predicate { + return false; + } + + // ignore the binary order + !match (filter, &in_predicate) { + (Expr::BinaryExpr(a_expr), Expr::BinaryExpr(b_expr)) => { + (a_expr.op == b_expr.op) + && (a_expr.left == b_expr.left && a_expr.right == b_expr.right) + || (a_expr.left == b_expr.right && a_expr.right == b_expr.left) + } + _ => false, + } + }) + .collect::>() +} + +struct SubqueryInfo { + query: Subquery, + where_in_expr: Expr, + negated: bool, +} + +impl SubqueryInfo { + pub fn new(query: Subquery, expr: Expr, negated: bool) -> Self { + Self { + query, + where_in_expr: expr, + negated, + } + } +} diff --git a/src/sql/optimizer/dynamic_partition_pruning.rs b/src/sql/optimizer/dynamic_partition_pruning.rs new file mode 100644 index 000000000..d7e1a8be5 --- /dev/null +++ b/src/sql/optimizer/dynamic_partition_pruning.rs @@ -0,0 +1,1088 @@ +//! Optimizer rule for dynamic partition pruning (DPP) +//! +//! DPP refers to a query optimization rule in which distinct values in an inner join are used as +//! filters in a table scan. This allows us to eliminate all other rows which do not fit the join +//! condition from being read at all. +//! +//! Furthermore, a table involved in a join may be filtered during a scan, which allows us to +//! further prune the values to be read. 
+
+use std::{
+    collections::{HashMap, HashSet},
+    fs,
+    hash::{Hash, Hasher},
+};
+
+use datafusion_python::{
+    datafusion::parquet::{
+        basic::Type as BasicType,
+        file::reader::{FileReader, SerializedFileReader},
+        record::{reader::RowIter, RowAccessor},
+        schema::{parser::parse_message_type, types::Type},
+    },
+    datafusion_common::{Column, Result, ScalarValue},
+    datafusion_expr::{
+        expr::InList,
+        logical_plan::LogicalPlan,
+        utils::from_plan,
+        Expr,
+        JoinType,
+        Operator,
+        TableScan,
+    },
+    datafusion_optimizer::{OptimizerConfig, OptimizerRule},
+};
+use log::warn;
+
+use crate::sql::table::DaskTableSource;
+
+// Optimizer rule for dynamic partition pruning
+pub struct DynamicPartitionPruning {}
+
+impl DynamicPartitionPruning {
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl OptimizerRule for DynamicPartitionPruning {
+    fn name(&self) -> &str {
+        "dynamic_partition_pruning"
+    }
+
+    /// Reads the small ("dimension") side of each inner join and pushes the observed join-key
+    /// values into the scans of the large ("fact") tables as extra filters.
+    ///
+    /// Returns `Ok(None)` when the plan has no usable inner joins or table scans.
+    fn try_optimize(
+        &self,
+        plan: &LogicalPlan,
+        _config: &dyn OptimizerConfig,
+    ) -> Result<Option<LogicalPlan>> {
+        // TODO: Consider allowing the fact_dimension_ratio to be configured by the
+        // user. See issue: https://github.com/dask-contrib/dask-sql/issues/1121
+        const FACT_DIMENSION_RATIO: f64 = 0.3;
+
+        // Parse the LogicalPlan and store tables and columns being (inner) joined upon. We do this
+        // by creating a HashSet of all InnerJoins' join.on and join.filters
+        let join_conds = gather_joins(plan);
+        let tables = gather_tables(plan);
+        let aliases = gather_aliases(plan);
+
+        if join_conds.is_empty() || tables.is_empty() {
+            // No InnerJoins to optimize with
+            return Ok(None);
+        }
+
+        // Find the size of the largest table in the query (never below 1, so the division
+        // below is always well defined)
+        let largest_size = tables
+            .values()
+            .map(|table| table.size.unwrap_or(0) as f64)
+            .fold(1_f64, f64::max);
+
+        // A table is a dimension table when it is small relative to the largest table; tables
+        // of unknown size are treated as being as large as the largest one
+        let is_dimension = |info: &TableInfo| {
+            info.size.unwrap_or(largest_size as usize) as f64 / largest_size
+                < FACT_DIMENSION_RATIO
+        };
+
+        let mut join_values = vec![];
+        let mut join_tables = vec![];
+        let mut join_fields = vec![];
+        let mut fact_tables = HashSet::new();
+
+        // Iterate through all inner joins in the query
+        for join_cond in &join_conds {
+            for (left_on, right_on) in &join_cond.on {
+                // Obtain tables and columns (fields) involved in join.
+                // For now, if it is not a join between columns then we skip the rule
+                // TODO: https://github.com/dask-contrib/dask-sql/issues/1121
+                let (left_col, right_col) = match (left_on, right_on) {
+                    (Expr::Column(l), Expr::Column(r)) => (l, r),
+                    _ => continue,
+                };
+                // Unqualified columns carry no table name to look up, so skip them instead
+                // of panicking on a missing relation
+                let (mut left_table, mut right_table) =
+                    match (&left_col.relation, &right_col.relation) {
+                        (Some(l), Some(r)) => (l.to_string(), r.to_string()),
+                        _ => continue,
+                    };
+                let (left_field, right_field) = (left_col.name.clone(), right_col.name.clone());
+
+                // Check if join uses an alias instead of the table name itself. Need to use
+                // the actual table name to obtain its filepath
+                if let Some(t) = aliases.get(&left_table) {
+                    left_table = t.clone();
+                }
+                if let Some(t) = aliases.get(&right_table) {
+                    right_table = t.clone();
+                }
+
+                // A more complicated alias, e.g. an alias for a nested select, means it's not
+                // obvious which file(s) should be read
+                let (left_info, right_info) =
+                    match (tables.get(&left_table), tables.get(&right_table)) {
+                        (Some(l), Some(r)) => (l, r),
+                        _ => continue,
+                    };
+
+                // Determine whether a table is a fact or dimension table. If it's a dimension
+                // table, we should read it in and use the rule
+                let left_filtered_table = if is_dimension(left_info) {
+                    read_table(left_table.clone(), left_field.clone(), tables.clone())
+                } else {
+                    fact_tables.insert(left_table.clone());
+                    None
+                };
+                let right_filtered_table = if is_dimension(right_info) {
+                    read_table(right_table.clone(), right_field.clone(), tables.clone())
+                } else {
+                    fact_tables.insert(right_table.clone());
+                    None
+                };
+
+                join_values.push((left_filtered_table, right_filtered_table));
+                join_tables.push((left_table, right_table));
+                join_fields.push((left_field, right_field));
+            }
+        }
+        // Creates HashMap of all tables and field with their unique values to be set in the
+        // TableScan
+        let filter_values = combine_sets(join_values, join_tables, join_fields, fact_tables);
+        // Optimize and return the plan
+        optimize_table_scans(plan, filter_values)
+    }
+}
+
+/// Represents relevant information in an
InnerJoin +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct JoinInfo { + /// Equijoin clause expressed as pairs of (left, right) join expressions + on: Vec<(Expr, Expr)>, + /// Filters applied during join (non-equi conditions) + /// TODO: https://github.com/dask-contrib/dask-sql/issues/1121 + filter: Option, +} + +// This function parses through the LogicalPlan, grabs relevant information from an InnerJoin, and +// adds them to a HashSet +fn gather_joins(plan: &LogicalPlan) -> HashSet { + let mut current_plan = plan.clone(); + let mut join_info = HashSet::new(); + loop { + if current_plan.inputs().is_empty() { + break; + } else if current_plan.inputs().len() > 1 { + match current_plan { + LogicalPlan::Join(ref j) => { + if j.join_type == JoinType::Inner { + // Store tables and columns that are being (inner) joined upon + let info = JoinInfo { + on: j.on.clone(), + filter: j.filter.clone(), + }; + join_info.insert(info); + + // Recurse on left and right inputs of Join + let (left_joins, right_joins) = + (gather_joins(&j.left), gather_joins(&j.right)); + + // Add left_joins and right_joins to HashSet + join_info.extend(left_joins); + join_info.extend(right_joins); + } else { + // We don't run the rule if there are non-inner joins in the query + return HashSet::new(); + } + } + LogicalPlan::CrossJoin(ref c) => { + // Recurse on left and right inputs of CrossJoin + let (left_joins, right_joins) = (gather_joins(&c.left), gather_joins(&c.right)); + + // Add left_joins and right_joins to HashSet + join_info.extend(left_joins); + join_info.extend(right_joins); + } + LogicalPlan::Union(ref u) => { + // Recurse on inputs vector of Union + for input in &u.inputs { + let joins = gather_joins(input); + + // Add joins to HashSet + join_info.extend(joins); + } + } + _ => { + warn!("Skipping optimizer rule 'DynamicPartitionPruning'"); + return HashSet::new(); + } + } + break; + } else { + // Move on to next step + current_plan = current_plan.inputs()[0].clone(); + } + } + 
join_info +} + +/// Represents relevant information in a TableScan +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct TableInfo { + /// The name of the table + table_name: String, + /// The path and filename of the table + filepath: String, + /// The number of rows in the table + size: Option, + /// Optional expressions to be used as filters by the table provider + filters: Vec, +} + +// This function parses through the LogicalPlan, grabs relevant information from a TableScan, and +// adds them to a HashMap where the key is the table name +fn gather_tables(plan: &LogicalPlan) -> HashMap { + let mut current_plan = plan.clone(); + let mut tables = HashMap::new(); + loop { + if current_plan.inputs().is_empty() { + if let LogicalPlan::TableScan(ref t) = current_plan { + // Use TableScan to get the filepath and/or size + let filepath = get_filepath(¤t_plan); + let size = get_table_size(¤t_plan); + match filepath { + Some(f) => { + // TODO: Add better handling for when a table is read in more than once + // https://github.com/dask-contrib/dask-sql/issues/1121 + if tables.contains_key(&t.table_name.to_string()) { + return HashMap::new(); + } + + tables.insert( + t.table_name.to_string(), + TableInfo { + table_name: t.table_name.to_string(), + filepath: f.clone(), + size, + filters: t.filters.clone(), + }, + ); + break; + } + None => return HashMap::new(), + } + } + break; + } else if current_plan.inputs().len() > 1 { + match current_plan { + LogicalPlan::Join(ref j) => { + // Recurse on left and right inputs of Join + let (left_tables, right_tables) = + (gather_tables(&j.left), gather_tables(&j.right)); + + if check_table_overlaps(&tables, &left_tables, &right_tables) { + return HashMap::new(); + } + + // Add left_tables and right_tables to HashMap + tables.extend(left_tables); + tables.extend(right_tables); + } + LogicalPlan::CrossJoin(ref c) => { + // Recurse on left and right inputs of CrossJoin + let (left_tables, right_tables) = + (gather_tables(&c.left), 
gather_tables(&c.right)); + + if check_table_overlaps(&tables, &left_tables, &right_tables) { + return HashMap::new(); + } + + // Add left_tables and right_tables to HashMap + tables.extend(left_tables); + tables.extend(right_tables); + } + LogicalPlan::Union(ref u) => { + // Recurse on inputs vector of Union + for input in &u.inputs { + let union_tables = gather_tables(input); + + // TODO: Add better handling for when a table is read in more than once + // https://github.com/dask-contrib/dask-sql/issues/1121 + if tables.keys().any(|k| union_tables.contains_key(k)) + || union_tables.keys().any(|k| tables.contains_key(k)) + { + return HashMap::new(); + } + + // Add union_tables to HashMap + tables.extend(union_tables); + } + } + _ => { + warn!("Skipping optimizer rule 'DynamicPartitionPruning'"); + return HashMap::new(); + } + } + break; + } else { + // Move on to next step + current_plan = current_plan.inputs()[0].clone(); + } + } + tables +} + +// TODO: Add better handling for when a table is read in more than once +// https://github.com/dask-contrib/dask-sql/issues/1121 +fn check_table_overlaps( + m1: &HashMap, + m2: &HashMap, + m3: &HashMap, +) -> bool { + m1.keys().any(|k| m2.contains_key(k)) + || m2.keys().any(|k| m1.contains_key(k)) + || m1.keys().any(|k| m3.contains_key(k)) + || m3.keys().any(|k| m1.contains_key(k)) + || m2.keys().any(|k| m3.contains_key(k)) + || m3.keys().any(|k| m2.contains_key(k)) +} + +fn get_filepath(plan: &LogicalPlan) -> Option<&String> { + match plan { + LogicalPlan::TableScan(scan) => scan + .source + .as_any() + .downcast_ref::()? + .filepath(), + _ => None, + } +} + +fn get_table_size(plan: &LogicalPlan) -> Option { + match plan { + LogicalPlan::TableScan(scan) => scan + .source + .as_any() + .downcast_ref::()? 
+ .statistics() + .map(|stats| stats.get_row_count() as usize), + _ => None, + } +} + +// This function parses through the LogicalPlan, grabs any aliases, and adds them to a HashMap +// where the key is the alias name and the value is the table name +fn gather_aliases(plan: &LogicalPlan) -> HashMap { + let mut current_plan = plan.clone(); + let mut aliases = HashMap::new(); + loop { + if current_plan.inputs().is_empty() { + break; + } else if current_plan.inputs().len() > 1 { + match current_plan { + LogicalPlan::Join(ref j) => { + // Recurse on left and right inputs of Join + let (left_aliases, right_aliases) = + (gather_aliases(&j.left), gather_aliases(&j.right)); + + // Add left_aliases and right_aliases to HashMap + aliases.extend(left_aliases); + aliases.extend(right_aliases); + } + LogicalPlan::CrossJoin(ref c) => { + // Recurse on left and right inputs of CrossJoin + let (left_aliases, right_aliases) = + (gather_aliases(&c.left), gather_aliases(&c.right)); + + // Add left_aliases and right_aliases to HashMap + aliases.extend(left_aliases); + aliases.extend(right_aliases); + } + LogicalPlan::Union(ref u) => { + // Recurse on inputs vector of Union + for input in &u.inputs { + let union_aliases = gather_aliases(input); + + // Add union_aliases to HashMap + aliases.extend(union_aliases); + } + } + _ => { + return HashMap::new(); + } + } + break; + } else { + if let LogicalPlan::SubqueryAlias(ref s) = current_plan { + match *s.input { + LogicalPlan::TableScan(ref t) => { + aliases.insert(s.alias.to_string(), t.table_name.to_string().clone()); + } + // Sometimes a TableScan is immediately followed by a Projection, so we can + // still use the alias for the table + LogicalPlan::Projection(ref p) => { + if let LogicalPlan::TableScan(ref t) = *p.input { + aliases.insert(s.alias.to_string(), t.table_name.to_string().clone()); + } + } + _ => (), + } + } + // Move on to next step + current_plan = current_plan.inputs()[0].clone(); + } + } + aliases +} + +// Wrapper for 
floats, since they are not hashable +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] +struct FloatWrapper(f64); + +impl Eq for FloatWrapper {} + +impl Hash for FloatWrapper { + fn hash(&self, state: &mut H) { + // Convert the f64 to a u64 using transmute + let bits: u64 = self.0.to_bits(); + // Use the u64's hash implementation + bits.hash(state); + } +} + +// Wrapper for possible row value types +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +enum RowValue { + String(Option), + Int64(Option), + Int32(Option), + Double(Option), +} + +// This function uses the table name, column name, and filters to read in the relevant columns, +// filter out row values, and construct a HashSet of relevant row values for the specified column, +// i.e., the column involved in the join +fn read_table( + table_string: String, + field_string: String, + tables: HashMap, +) -> Option> { + // Obtain filepaths to all relevant Parquet files, e.g., in a directory of Parquet files + let paths = fs::read_dir(tables.get(&table_string).unwrap().filepath.clone()).unwrap(); + let mut files = vec![]; + for path in paths { + files.push(path.unwrap().path().display().to_string()) + } + + // Using the filepaths to the Parquet tables, obtain the schemas of the relevant tables + let schema: &Type = &SerializedFileReader::try_from(files[0].clone()) + .unwrap() + .metadata() + .file_metadata() + .schema() + .clone(); + + // Use the schemas of the relevant tables to obtain the physical type of the relevant columns + let physical_type = get_physical_type(schema, field_string.clone()); + + // A TableScan may include existing filters. These conditions should be used to filter the data + // after being read. 
Therefore, the columns involved in these filters should be read in as well + let filters = tables.get(&table_string).unwrap().filters.clone(); + let filtered_fields = get_filtered_fields(&filters, schema, field_string.clone()); + let filtered_string = filtered_fields.0; + let filtered_types = filtered_fields.1; + let filtered_names = filtered_fields.2; + + if filters.len() != filtered_names.len() { + warn!("Unable to check existing filters for optimizer rule 'DynamicPartitionPruning'"); + return None; + } + + // Specify which columns to include in the reader, then read in the rows + let repetition = get_repetition(schema, field_string.clone()); + let physical_type = physical_type.unwrap().to_string(); + let projection_schema = "message schema { ".to_owned() + + &filtered_string + + &repetition.unwrap() + + " " + + &physical_type + + " " + + &field_string + + "; }"; + let projection = parse_message_type(&projection_schema).ok(); + + let mut rows = Vec::new(); + for file in files { + let reader_result = SerializedFileReader::try_from(&*file.clone()); + if let Ok(reader) = reader_result { + let row_iter_result = RowIter::from_file_into(Box::new(reader)) + .project(projection.clone()) + .ok(); + if let Some(row_iter) = row_iter_result { + rows.extend(row_iter.map(|r| r.expect("Parquet error encountered"))); + } else { + // TODO: Investigate cases when this would happen + rows.clear(); + break; + } + } else { + rows.clear(); + break; + } + } + if rows.is_empty() { + return None; + } + + // Create HashSets for the join column values + let mut value_set: HashSet = HashSet::new(); + for row in rows { + // Since a TableScan may have its own filters, we want to ensure that the values in + // value_set satisfy the TableScan filters + let mut satisfies_filters = true; + let mut row_index = 0; + for index in 0..filters.len() { + if filtered_names[index] != field_string { + let current_type = &filtered_types[index]; + match current_type.as_str() { + "BYTE_ARRAY" => { + let 
string_value = row.get_string(row_index).ok(); + if !satisfies_string(string_value, filters[index].clone()) { + satisfies_filters = false; + } + } + "INT64" => { + let long_value = row.get_long(row_index).ok(); + if !satisfies_int64(long_value, filters[index].clone()) { + satisfies_filters = false; + } + } + "INT32" => { + let int_value = row.get_int(row_index).ok(); + if !satisfies_int32(int_value, filters[index].clone()) { + satisfies_filters = false; + } + } + "DOUBLE" => { + let double_value = row.get_double(row_index).ok(); + if !satisfies_float(double_value, filters[index].clone()) { + satisfies_filters = false; + } + } + u => panic!("Unknown PhysicalType {u}"), + } + row_index += 1; + } + } + // After verifying that the row satisfies all existing filters, we add the column value to + // the HashSet + if satisfies_filters { + match physical_type.as_str() { + "BYTE_ARRAY" => { + let r = row.get_string(row_index).ok(); + value_set.insert(RowValue::String(r.cloned())); + } + "INT64" => { + let r = row.get_long(row_index).ok(); + value_set.insert(RowValue::Int64(r)); + } + "INT32" => { + let r = row.get_int(row_index).ok(); + value_set.insert(RowValue::Int32(r)); + } + "DOUBLE" => { + let r = row.get_double(row_index).ok(); + if let Some(f) = r { + value_set.insert(RowValue::Double(Some(FloatWrapper(f)))); + } else { + value_set.insert(RowValue::Double(None)); + } + } + _ => panic!("Unknown PhysicalType"), + } + } + } + + Some(value_set) +} + +// A column has a physical_type (INT64, etc.) that needs to be included when specifying which +// columns to read in. To get the physical_type, we grab it from the schema +fn get_physical_type(schema: &Type, field: String) -> Option { + match schema { + Type::GroupType { + basic_info: _, + fields, + } => { + for f in fields { + let match_field = &*f.clone(); + match match_field { + Type::PrimitiveType { + basic_info, + physical_type, + .. 
+ } => { + if basic_info.name() == field { + return Some(*physical_type); + } + } + _ => return None, + } + } + None + } + _ => None, + } +} + +// A column has a repetition (i.e., REQUIRED or OPTIONAL) that needs to be included when specifying +// which columns to read in. To get the repetition, we grab it from the schema +fn get_repetition(schema: &Type, field: String) -> Option { + match schema { + Type::GroupType { + basic_info: _, + fields, + } => { + for f in fields { + let match_field = &*f.clone(); + match match_field { + Type::PrimitiveType { basic_info, .. } => { + if basic_info.name() == field { + return Some(basic_info.repetition().to_string()); + } + } + _ => return None, + } + } + None + } + _ => None, + } +} + +// This is a helper function to deal with TableScan filters for reading in the data. The first +// value returned is a string representation of the projection used to read in the relevant +// columns. The second value returned is a vector of the physical_type of each column that has has +// a filter, in the order that they are being read. The third value returned is a vector of the +// column names, in the order that they are being read. 
+fn get_filtered_fields( + filters: &Vec, + schema: &Type, + field: String, +) -> (String, Vec, Vec) { + // Used to create a string representation of the projection + // for the TableScan filters to be read + let mut filtered_fields = vec![]; + // All physical types involved in TableScan filters + let mut filtered_types = vec![]; + // All columns involved in TableScan filters + let mut filtered_columns = vec![]; + for filter in filters { + match filter { + Expr::BinaryExpr(b) => { + if let Expr::Column(column) = &*b.left { + push_filtered_fields( + column, + schema, + field.clone(), + &mut filtered_fields, + &mut filtered_columns, + &mut filtered_types, + ); + } + } + Expr::IsNotNull(e) => { + if let Expr::Column(column) = &**e { + push_filtered_fields( + column, + schema, + field.clone(), + &mut filtered_fields, + &mut filtered_columns, + &mut filtered_types, + ); + } + } + _ => (), + } + } + (filtered_fields.join(""), filtered_types, filtered_columns) +} + +// Helper function for get_filtered_fields +fn push_filtered_fields( + column: &Column, + schema: &Type, + field: String, + filtered_fields: &mut Vec, + filtered_columns: &mut Vec, + filtered_types: &mut Vec, +) { + let current_field = column.name.clone(); + let physical_type = get_physical_type(schema, current_field.clone()) + .unwrap() + .to_string(); + if current_field != field { + let repetition = get_repetition(schema, current_field.clone()); + filtered_fields.push(repetition.unwrap()); + filtered_fields.push(" ".to_string()); + + filtered_fields.push(physical_type.clone()); + filtered_fields.push(" ".to_string()); + + filtered_fields.push(current_field.clone()); + filtered_fields.push("; ".to_string()); + } + filtered_types.push(physical_type); + filtered_columns.push(current_field); +} + +// Returns a boolean representing whether a string satisfies a given filter +fn satisfies_string(string_value: Option<&String>, filter: Expr) -> bool { + match filter { + Expr::BinaryExpr(b) => match b.op { + 
Operator::Eq => Expr::Literal(ScalarValue::Utf8(string_value.cloned())) == *b.right, + Operator::NotEq => Expr::Literal(ScalarValue::Utf8(string_value.cloned())) != *b.right, + _ => { + panic!("Unknown satisfies_string operator"); + } + }, + Expr::IsNotNull(_) => string_value.is_some(), + _ => { + panic!("Unknown satisfies_string Expr"); + } + } +} + +// Returns a boolean representing whether an Int64 satisfies a given filter +fn satisfies_int64(long_value: Option, filter: Expr) -> bool { + match filter { + Expr::BinaryExpr(b) => { + let filter_value = *b.right; + let int_value: i64 = match filter_value { + Expr::Literal(ScalarValue::Int64(i)) => i.unwrap(), + Expr::Literal(ScalarValue::Int32(i)) => i64::from(i.unwrap()), + Expr::Literal(ScalarValue::Float64(i)) => i.unwrap() as i64, + Expr::Literal(ScalarValue::TimestampNanosecond(i, None)) => i.unwrap(), + Expr::Literal(ScalarValue::Date32(i)) => i64::from(i.unwrap()), + // TODO: Add logic to check if the string can be converted to a timestamp + Expr::Literal(ScalarValue::Utf8(_)) => return false, + _ => { + panic!("Unknown ScalarValue type {filter_value}"); + } + }; + let filter_value = Expr::Literal(ScalarValue::Int64(Some(int_value))); + match b.op { + Operator::Eq => Expr::Literal(ScalarValue::Int64(long_value)) == filter_value, + Operator::NotEq => Expr::Literal(ScalarValue::Int64(long_value)) != filter_value, + Operator::Gt => Expr::Literal(ScalarValue::Int64(long_value)) > filter_value, + Operator::Lt => Expr::Literal(ScalarValue::Int64(long_value)) < filter_value, + Operator::GtEq => Expr::Literal(ScalarValue::Int64(long_value)) >= filter_value, + Operator::LtEq => Expr::Literal(ScalarValue::Int64(long_value)) <= filter_value, + _ => { + panic!("Unknown satisfies_int64 operator"); + } + } + } + Expr::IsNotNull(_) => long_value.is_some(), + _ => { + panic!("Unknown satisfies_int64 Expr"); + } + } +} + +// Returns a boolean representing whether an Int32 satisfies a given filter +fn 
satisfies_int32(long_value: Option, filter: Expr) -> bool { + match filter { + Expr::BinaryExpr(b) => { + let filter_value = *b.right; + let int_value: i32 = match filter_value { + Expr::Literal(ScalarValue::Int64(i)) => i.unwrap() as i32, + Expr::Literal(ScalarValue::Int32(i)) => i.unwrap(), + Expr::Literal(ScalarValue::Float64(i)) => i.unwrap() as i32, + _ => { + panic!("Unknown ScalarValue type {filter_value}"); + } + }; + let filter_value = Expr::Literal(ScalarValue::Int32(Some(int_value))); + match b.op { + Operator::Eq => Expr::Literal(ScalarValue::Int32(long_value)) == filter_value, + Operator::NotEq => Expr::Literal(ScalarValue::Int32(long_value)) != filter_value, + Operator::Gt => Expr::Literal(ScalarValue::Int32(long_value)) > filter_value, + Operator::Lt => Expr::Literal(ScalarValue::Int32(long_value)) < filter_value, + Operator::GtEq => Expr::Literal(ScalarValue::Int32(long_value)) >= filter_value, + Operator::LtEq => Expr::Literal(ScalarValue::Int32(long_value)) <= filter_value, + _ => { + panic!("Unknown satisfies_int32 operator"); + } + } + } + Expr::IsNotNull(_) => long_value.is_some(), + _ => { + panic!("Unknown satisfies_int32 Expr"); + } + } +} + +// Returns a boolean representing whether an Float64 satisfies a given filter +fn satisfies_float(long_value: Option, filter: Expr) -> bool { + match filter { + Expr::BinaryExpr(b) => { + let filter_value = *b.right; + let float_value: f64 = match filter_value { + Expr::Literal(ScalarValue::Int64(i)) => i.unwrap() as f64, + Expr::Literal(ScalarValue::Int32(i)) => i.unwrap() as f64, + Expr::Literal(ScalarValue::Float64(i)) => i.unwrap(), + _ => { + panic!("Unknown ScalarValue type {filter_value}"); + } + }; + let filter_value = Expr::Literal(ScalarValue::Float64(Some(float_value))); + match b.op { + Operator::Eq => Expr::Literal(ScalarValue::Float64(long_value)) == filter_value, + Operator::NotEq => Expr::Literal(ScalarValue::Float64(long_value)) != filter_value, + Operator::Gt => 
Expr::Literal(ScalarValue::Float64(long_value)) > filter_value, + Operator::Lt => Expr::Literal(ScalarValue::Float64(long_value)) < filter_value, + Operator::GtEq => Expr::Literal(ScalarValue::Float64(long_value)) >= filter_value, + Operator::LtEq => Expr::Literal(ScalarValue::Float64(long_value)) <= filter_value, + _ => { + panic!("Unknown satisfies_float operator"); + } + } + } + Expr::IsNotNull(_) => long_value.is_some(), + _ => { + panic!("Unknown satisfies_float Expr"); + } + } +} + +// Used to simplify the signature of combine_sets +type RowHashSet = HashSet; +type RowOptionHashSet = Option; +type RowTuple = (RowOptionHashSet, RowOptionHashSet); +type RowVec = Vec; + +// Given a vector of hashsets to be set as TableScan filters, a vector of tuples representing the +// tables involved in a join, a vector of tuples representing the columns involved in a join, and +// a hashset of fact tables in the query; return a hashmap where the key is a tuple of the table +// and column names, and the value is the hashset representing the INLIST filter specified in the +// TableScan. 
+fn combine_sets( + join_values: RowVec, + join_tables: Vec<(String, String)>, + join_fields: Vec<(String, String)>, + fact_tables: HashSet, +) -> HashMap<(String, String), HashSet> { + let mut sets: HashMap<(String, String), HashSet> = HashMap::new(); + for i in 0..join_values.len() { + // Case when we were able to read in both tables involved in the join + if let (Some(set1), Some(set2)) = (&join_values[i].0, &join_values[i].1) { + // The INLIST vector will be the intersection of both hashsets + let set_intersection = set1.intersection(set2); + let mut values = HashSet::new(); + for value in set_intersection { + values.insert(value.clone()); + } + + let current_table = join_tables[i].0.clone(); + // We only create INLIST filters for fact tables + if fact_tables.contains(¤t_table) { + let current_field = join_fields[i].0.clone(); + add_to_existing_set(&mut sets, values.clone(), current_table, current_field); + } + + let current_table = join_tables[i].1.clone(); + // We only create INLIST filters for fact tables + if fact_tables.contains(¤t_table) { + let current_field = join_fields[i].1.clone(); + add_to_existing_set(&mut sets, values.clone(), current_table, current_field); + } + // Case when we were only able to read in the left table of the join + } else if let Some(values) = &join_values[i].0 { + let current_table = join_tables[i].0.clone(); + // We only create INLIST filters for fact tables + if fact_tables.contains(¤t_table) { + let current_field = join_fields[i].0.clone(); + add_to_existing_set(&mut sets, values.clone(), current_table, current_field); + } + + let current_table = join_tables[i].1.clone(); + // We only create INLIST filters for fact tables + if fact_tables.contains(¤t_table) { + let current_field = join_fields[i].1.clone(); + add_to_existing_set(&mut sets, values.clone(), current_table, current_field); + } + // Case when we were only able to read in the right table of the join + } else if let Some(values) = &join_values[i].1 { + let 
current_table = join_tables[i].0.clone(); + // We only create INLIST filters for fact tables + if fact_tables.contains(¤t_table) { + let current_field = join_fields[i].0.clone(); + add_to_existing_set(&mut sets, values.clone(), current_table, current_field); + } + + let current_table = join_tables[i].1.clone(); + // We only create INLIST filters for fact tables + if fact_tables.contains(¤t_table) { + let current_field = join_fields[i].1.clone(); + add_to_existing_set(&mut sets, values.clone(), current_table, current_field); + } + } + } + sets +} + +// Given a mutable hashmap (the hashmap which will eventually be returned by the `combine_sets` +// function), a hashset of values, a table name, and a column name; insert the hashset of values +// into the hashmap, where the key is a tuple of the table and column names. +fn add_to_existing_set( + sets: &mut HashMap<(String, String), HashSet>, + values: HashSet, + current_table: String, + current_field: String, +) { + let existing_set = sets.get(&(current_table.clone(), current_field.clone())); + match existing_set { + // If the tuple for (current_table, current_field) already exists, then we want to combine + // the existing set with the new hashset being inserted; to do this, we take the + // intersection of both sets. 
+ Some(s) => { + let s = s.clone(); + let v = values.iter().cloned().collect::>(); + let s = s.intersection(&v); + let mut set_intersection = HashSet::new(); + for i in s { + set_intersection.insert(i.clone()); + } + sets.insert((current_table, current_field), set_intersection.clone()); + } + // If the tuple for (current_table, current_field) does not already exist as a key in the + // hashmap, then simply create it and set the hashset as the value + None => { + sets.insert((current_table, current_field), values); + } + } +} + +// Given a LogicalPlan and a hashmap where the key is a tuple containing a table name and column +// and the value is a hashset of unique row values, parse the LogicalPlan and insert INLIST filters +// at the TableScan level. +fn optimize_table_scans( + plan: &LogicalPlan, + filter_values: HashMap<(String, String), HashSet>, +) -> Result> { + // Replaces existing TableScan with a new TableScan which includes + // the new binary expression filter created from reading in the join columns + match plan { + LogicalPlan::TableScan(t) => { + let table_name = t.table_name.to_string(); + let table_filters: HashMap<(String, String), HashSet> = filter_values + .iter() + .filter(|(key, _value)| key.0 == table_name) + .map(|(key, value)| ((key.0.to_owned(), key.1.to_owned()), value.clone())) + .collect(); + let mut updated_filters = t.filters.clone(); + for (key, value) in table_filters.iter() { + let current_expr = + format_inlist_expr(value.clone(), key.0.to_owned(), key.1.to_owned()); + if let Some(e) = current_expr { + updated_filters.push(e); + } + } + let scan = LogicalPlan::TableScan(TableScan { + table_name: t.table_name.clone(), + source: t.source.clone(), + projection: t.projection.clone(), + projected_schema: t.projected_schema.clone(), + filters: updated_filters, + fetch: t.fetch, + }); + Ok(Some(scan)) + } + _ => optimize_children(plan, filter_values), + } +} + +// Given a hashset of values, a table name, and a column name, return a 
DataFusion INLIST Expr +fn format_inlist_expr( + value_set: HashSet, + join_table: String, + join_field: String, +) -> Option { + let expr = Box::new(Expr::Column(Column::new(Some(join_table), join_field))); + let mut list: Vec = vec![]; + + // Need to correctly format the ScalarValue type + for value in value_set { + if let RowValue::String(s) = value { + if s.is_some() { + let v = Expr::Literal(ScalarValue::Utf8(s)); + list.push(v); + } + } else if let RowValue::Int64(l) = value { + if l.is_some() { + let v = Expr::Literal(ScalarValue::Int64(l)); + list.push(v); + } + } else if let RowValue::Int32(i) = value { + if i.is_some() { + let v = Expr::Literal(ScalarValue::Int32(i)); + list.push(v); + } + } else if let RowValue::Double(Some(f)) = value { + let v = Expr::Literal(ScalarValue::Float64(Some(f.0))); + list.push(v); + } + } + + if list.is_empty() { + None + } else { + Some(Expr::InList(InList { + expr, + list, + negated: false, + })) + } +} + +// Given a LogicalPlan and the same hashmap as the `optimize_table_scans` function, correctly +// iterate through the LogicalPlan nodes. Similar to DataFusion's `optimize_children` function, but +// recurses on the `optimize_table_scans` function instead. 
+fn optimize_children( + plan: &LogicalPlan, + filter_values: HashMap<(String, String), HashSet>, +) -> Result> { + let new_exprs = plan.expressions(); + let mut new_inputs = Vec::with_capacity(plan.inputs().len()); + let mut plan_is_changed = false; + for input in plan.inputs() { + let new_input = optimize_table_scans(input, filter_values.clone())?; + plan_is_changed = plan_is_changed || new_input.is_some(); + new_inputs.push(new_input.unwrap_or_else(|| input.clone())) + } + if plan_is_changed { + Ok(Some(from_plan(plan, &new_exprs, &new_inputs)?)) + } else { + Ok(None) + } +} diff --git a/dask_planner/src/sql/optimizer/join_reorder.rs b/src/sql/optimizer/join_reorder.rs similarity index 100% rename from dask_planner/src/sql/optimizer/join_reorder.rs rename to src/sql/optimizer/join_reorder.rs diff --git a/src/sql/optimizer/utils.rs b/src/sql/optimizer/utils.rs new file mode 100644 index 000000000..f72bbe5c3 --- /dev/null +++ b/src/sql/optimizer/utils.rs @@ -0,0 +1,516 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Collection of utility functions that are leveraged by the query optimizer rules + +use std::{ + collections::{BTreeSet, HashMap}, + sync::Arc, +}; + +use datafusion_python::{ + datafusion_common::{Column, DFSchema, DFSchemaRef, Result}, + datafusion_expr::{ + and, + expr::{Alias, BinaryExpr}, + expr_rewriter::{replace_col, strip_outer_reference}, + logical_plan::{Filter, LogicalPlan}, + Expr, + LogicalPlanBuilder, + Operator, + }, + datafusion_optimizer::optimizer::{OptimizerConfig, OptimizerRule}, +}; +use log::{debug, trace}; + +#[allow(dead_code)] +/// Convenience rule for writing optimizers: recursively invoke +/// optimize on plan's children and then return a node of the same +/// type. Useful for optimizer rules which want to leave the type +/// of plan unchanged but still apply to the children. +/// This also handles the case when the `plan` is a [`LogicalPlan::Explain`]. +/// +/// Returning `Ok(None)` indicates that the plan can't be optimized by the `optimizer`. +pub fn optimize_children( + optimizer: &impl OptimizerRule, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, +) -> Result> { + let mut new_inputs = Vec::with_capacity(plan.inputs().len()); + let mut plan_is_changed = false; + for input in plan.inputs() { + let new_input = optimizer.try_optimize(input, config)?; + plan_is_changed = plan_is_changed || new_input.is_some(); + new_inputs.push(new_input.unwrap_or_else(|| input.clone())) + } + if plan_is_changed { + Ok(Some(plan.with_new_inputs(&new_inputs)?)) + } else { + Ok(None) + } +} + +/// Splits a conjunctive [`Expr`] such as `A AND B AND C` => `[A, B, C]` +/// +/// See [`split_conjunction_owned`] for more details and an example. 
+pub fn split_conjunction(expr: &Expr) -> Vec<&Expr> { + split_conjunction_impl(expr, vec![]) +} + +fn split_conjunction_impl<'a>(expr: &'a Expr, mut exprs: Vec<&'a Expr>) -> Vec<&'a Expr> { + match expr { + Expr::BinaryExpr(BinaryExpr { + right, + op: Operator::And, + left, + }) => { + let exprs = split_conjunction_impl(left, exprs); + split_conjunction_impl(right, exprs) + } + Expr::Alias(Alias { expr, .. }) => split_conjunction_impl(expr, exprs), + other => { + exprs.push(other); + exprs + } + } +} + +/// Extract join predicates from the correclated subquery. +/// The join predicate means that the expression references columns +/// from both the subquery and outer table or only from the outer table. +/// +/// Returns join predicates and subquery(extracted). +pub(crate) fn extract_join_filters(maybe_filter: &LogicalPlan) -> Result<(Vec, LogicalPlan)> { + if let LogicalPlan::Filter(plan_filter) = maybe_filter { + let subquery_filter_exprs = split_conjunction(&plan_filter.predicate); + let (join_filters, subquery_filters) = find_join_exprs(subquery_filter_exprs)?; + // if the subquery still has filter expressions, restore them. + let mut plan = LogicalPlanBuilder::from((*plan_filter.input).clone()); + if let Some(expr) = conjunction(subquery_filters) { + plan = plan.filter(expr)? 
+ } + + Ok((join_filters, plan.build()?)) + } else { + Ok((vec![], maybe_filter.clone())) + } +} + +#[allow(dead_code)] +/// Splits an owned conjunctive [`Expr`] such as `A AND B AND C` => `[A, B, C]` +/// +/// This is often used to "split" filter expressions such as `col1 = 5 +/// AND col2 = 10` into [`col1 = 5`, `col2 = 10`]; +/// +/// # Example +/// ``` +/// # use datafusion_python::datafusion_expr::{col, lit}; +/// # use datafusion_python::datafusion_optimizer::utils::split_conjunction_owned; +/// // a=1 AND b=2 +/// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); +/// +/// // [a=1, b=2] +/// let split = vec![ +/// col("a").eq(lit(1)), +/// col("b").eq(lit(2)), +/// ]; +/// +/// // use split_conjunction_owned to split them +/// assert_eq!(split_conjunction_owned(expr), split); +/// ``` +pub fn split_conjunction_owned(expr: Expr) -> Vec { + split_binary_owned(expr, Operator::And) +} + +#[allow(dead_code)] +/// Splits an owned binary operator tree [`Expr`] such as `A B C` => `[A, B, C]` +/// +/// This is often used to "split" expressions such as `col1 = 5 +/// AND col2 = 10` into [`col1 = 5`, `col2 = 10`]; +/// +/// # Example +/// ``` +/// # use datafusion_python::datafusion_expr::{col, lit, Operator}; +/// # use datafusion_python::datafusion_optimizer::utils::split_binary_owned; +/// # use std::ops::Add; +/// // a=1 + b=2 +/// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2))); +/// +/// // [a=1, b=2] +/// let split = vec![ +/// col("a").eq(lit(1)), +/// col("b").eq(lit(2)), +/// ]; +/// +/// // use split_binary_owned to split them +/// assert_eq!(split_binary_owned(expr, Operator::Plus), split); +/// ``` +pub fn split_binary_owned(expr: Expr, op: Operator) -> Vec { + split_binary_owned_impl(expr, op, vec![]) +} + +#[allow(dead_code)] +fn split_binary_owned_impl(expr: Expr, operator: Operator, mut exprs: Vec) -> Vec { + match expr { + Expr::BinaryExpr(BinaryExpr { right, op, left }) if op == operator => { + let exprs = 
split_binary_owned_impl(*left, operator, exprs); + split_binary_owned_impl(*right, operator, exprs) + } + Expr::Alias(Alias { expr, .. }) => split_binary_owned_impl(*expr, operator, exprs), + other => { + exprs.push(other); + exprs + } + } +} + +#[allow(dead_code)] +/// Splits an binary operator tree [`Expr`] such as `A B C` => `[A, B, C]` +/// +/// See [`split_binary_owned`] for more details and an example. +pub fn split_binary(expr: &Expr, op: Operator) -> Vec<&Expr> { + split_binary_impl(expr, op, vec![]) +} + +#[allow(dead_code)] +fn split_binary_impl<'a>( + expr: &'a Expr, + operator: Operator, + mut exprs: Vec<&'a Expr>, +) -> Vec<&'a Expr> { + match expr { + Expr::BinaryExpr(BinaryExpr { right, op, left }) if *op == operator => { + let exprs = split_binary_impl(left, operator, exprs); + split_binary_impl(right, operator, exprs) + } + Expr::Alias(Alias { expr, .. }) => split_binary_impl(expr, operator, exprs), + other => { + exprs.push(other); + exprs + } + } +} + +/// Combines an array of filter expressions into a single filter +/// expression consisting of the input filter expressions joined with +/// logical AND. +/// +/// Returns None if the filters array is empty. +/// +/// # Example +/// ``` +/// # use datafusion_python::datafusion_expr::{col, lit}; +/// # use datafusion_python::datafusion_optimizer::utils::conjunction; +/// // a=1 AND b=2 +/// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); +/// +/// // [a=1, b=2] +/// let split = vec![ +/// col("a").eq(lit(1)), +/// col("b").eq(lit(2)), +/// ]; +/// +/// // use conjunction to join them together with `AND` +/// assert_eq!(conjunction(split), Some(expr)); +/// ``` +pub fn conjunction(filters: impl IntoIterator) -> Option { + filters.into_iter().reduce(|accum, expr| accum.and(expr)) +} + +#[allow(dead_code)] +/// Combines an array of filter expressions into a single filter +/// expression consisting of the input filter expressions joined with +/// logical OR. 
+/// +/// Returns None if the filters array is empty. +pub fn disjunction(filters: impl IntoIterator) -> Option { + filters.into_iter().reduce(|accum, expr| accum.or(expr)) +} + +/// returns a new [LogicalPlan] that wraps `plan` in a [LogicalPlan::Filter] with +/// its predicate be all `predicates` ANDed. +#[allow(dead_code)] +pub fn add_filter(plan: LogicalPlan, predicates: &[&Expr]) -> Result { + // reduce filters to a single filter with an AND + let predicate = predicates + .iter() + .skip(1) + .fold(predicates[0].clone(), |acc, predicate| { + and(acc, (*predicate).to_owned()) + }); + + Ok(LogicalPlan::Filter(Filter::try_new( + predicate, + Arc::new(plan), + )?)) +} + +/// Looks for correlating expressions: for example, a binary expression with one field from the subquery, and +/// one not in the subquery (closed upon from outer scope) +/// +/// # Arguments +/// +/// * `exprs` - List of expressions that may or may not be joins +/// +/// # Return value +/// +/// Tuple of (expressions containing joins, remaining non-join expressions) +pub fn find_join_exprs(exprs: Vec<&Expr>) -> Result<(Vec, Vec)> { + let mut joins = vec![]; + let mut others = vec![]; + for filter in exprs.into_iter() { + // If the expression contains correlated predicates, add it to join filters + if filter.contains_outer() { + if !matches!(filter, Expr::BinaryExpr(BinaryExpr{ left, op: Operator::Eq, right }) if left.eq(right)) + { + joins.push(strip_outer_reference((*filter).clone())); + } + } else { + others.push((*filter).clone()); + } + } + + Ok((joins, others)) +} + +/// Returns the first (and only) element in a slice, or an error +/// +/// # Arguments +/// +/// * `slice` - The slice to extract from +/// +/// # Return value +/// +/// The first element, or an error +pub fn only_or_err(slice: &[T]) -> Result<&T> { + match slice { + [it] => Ok(it), + [] => Err(datafusion_python::datafusion_common::DataFusionError::Plan( + "No items found!".to_owned(), + )), + _ => 
Err(datafusion_python::datafusion_common::DataFusionError::Plan( + "More than one item found!".to_owned(), + )), + } +} + +/// merge inputs schema into a single schema. +#[allow(dead_code)] +pub fn merge_schema(inputs: Vec<&LogicalPlan>) -> DFSchema { + if inputs.len() == 1 { + inputs[0].schema().clone().as_ref().clone() + } else { + inputs + .iter() + .map(|input| input.schema()) + .fold(DFSchema::empty(), |mut lhs, rhs| { + lhs.merge(rhs); + lhs + }) + } +} + +pub(crate) fn collect_subquery_cols( + exprs: &[Expr], + subquery_schema: DFSchemaRef, +) -> Result> { + exprs.iter().try_fold(BTreeSet::new(), |mut cols, expr| { + let mut using_cols: Vec = vec![]; + for col in expr.to_columns()?.into_iter() { + if subquery_schema.has_column(&col) { + using_cols.push(col); + } + } + + cols.extend(using_cols); + Result::<_>::Ok(cols) + }) +} + +pub(crate) fn replace_qualified_name( + expr: Expr, + cols: &BTreeSet, + subquery_alias: &str, +) -> Result { + let alias_cols: Vec = cols + .iter() + .map(|col| Column::from_qualified_name(format!("{}.{}", subquery_alias, col.name))) + .collect(); + let replace_map: HashMap<&Column, &Column> = cols.iter().zip(alias_cols.iter()).collect(); + + replace_col(expr, &replace_map) +} + +#[allow(dead_code)] +/// Log the plan in debug/tracing mode after some part of the optimizer runs +pub fn log_plan(description: &str, plan: &LogicalPlan) { + debug!("{description}:\n{}\n", plan.display_indent()); + trace!("{description}::\n{}\n", plan.display_indent_schema()); +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use datafusion_python::{ + datafusion::arrow::datatypes::DataType, + datafusion_common::Column, + datafusion_expr::{col, expr::Cast, lit, utils::expr_to_columns}, + }; + + use super::*; + + #[test] + fn test_split_conjunction() { + let expr = col("a"); + let result = split_conjunction(&expr); + assert_eq!(result, vec![&expr]); + } + + #[test] + fn test_split_conjunction_two() { + let expr = 
col("a").eq(lit(5)).and(col("b")); + let expr1 = col("a").eq(lit(5)); + let expr2 = col("b"); + + let result = split_conjunction(&expr); + assert_eq!(result, vec![&expr1, &expr2]); + } + + #[test] + fn test_split_conjunction_alias() { + let expr = col("a").eq(lit(5)).and(col("b").alias("the_alias")); + let expr1 = col("a").eq(lit(5)); + let expr2 = col("b"); // has no alias + + let result = split_conjunction(&expr); + assert_eq!(result, vec![&expr1, &expr2]); + } + + #[test] + fn test_split_conjunction_or() { + let expr = col("a").eq(lit(5)).or(col("b")); + let result = split_conjunction(&expr); + assert_eq!(result, vec![&expr]); + } + + #[test] + fn test_split_binary_owned() { + let expr = col("a"); + assert_eq!(split_binary_owned(expr.clone(), Operator::And), vec![expr]); + } + + #[test] + fn test_split_binary_owned_two() { + assert_eq!( + split_binary_owned(col("a").eq(lit(5)).and(col("b")), Operator::And), + vec![col("a").eq(lit(5)), col("b")] + ); + } + + #[test] + fn test_split_binary_owned_different_op() { + let expr = col("a").eq(lit(5)).or(col("b")); + assert_eq!( + // expr is connected by OR, but pass in AND + split_binary_owned(expr.clone(), Operator::And), + vec![expr] + ); + } + + #[test] + fn test_split_conjunction_owned() { + let expr = col("a"); + assert_eq!(split_conjunction_owned(expr.clone()), vec![expr]); + } + + #[test] + fn test_split_conjunction_owned_two() { + assert_eq!( + split_conjunction_owned(col("a").eq(lit(5)).and(col("b"))), + vec![col("a").eq(lit(5)), col("b")] + ); + } + + #[test] + fn test_split_conjunction_owned_alias() { + assert_eq!( + split_conjunction_owned(col("a").eq(lit(5)).and(col("b").alias("the_alias"))), + vec![ + col("a").eq(lit(5)), + // no alias on b + col("b"), + ] + ); + } + + #[test] + fn test_conjunction_empty() { + assert_eq!(conjunction(vec![]), None); + } + + #[test] + fn test_conjunction() { + // `[A, B, C]` + let expr = conjunction(vec![col("a"), col("b"), col("c")]); + + // --> `(A AND B) AND C` + 
assert_eq!(expr, Some(col("a").and(col("b")).and(col("c")))); + + // which is different than `A AND (B AND C)` + assert_ne!(expr, Some(col("a").and(col("b").and(col("c"))))); + } + + #[test] + fn test_disjunction_empty() { + assert_eq!(disjunction(vec![]), None); + } + + #[test] + fn test_disjunction() { + // `[A, B, C]` + let expr = disjunction(vec![col("a"), col("b"), col("c")]); + + // --> `(A OR B) OR C` + assert_eq!(expr, Some(col("a").or(col("b")).or(col("c")))); + + // which is different than `A OR (B OR C)` + assert_ne!(expr, Some(col("a").or(col("b").or(col("c"))))); + } + + #[test] + fn test_split_conjunction_owned_or() { + let expr = col("a").eq(lit(5)).or(col("b")); + assert_eq!(split_conjunction_owned(expr.clone()), vec![expr]); + } + + #[test] + fn test_collect_expr() -> Result<()> { + let mut accum: HashSet = HashSet::new(); + expr_to_columns( + &Expr::Cast(Cast::new(Box::new(col("a")), DataType::Float64)), + &mut accum, + )?; + expr_to_columns( + &Expr::Cast(Cast::new(Box::new(col("a")), DataType::Float64)), + &mut accum, + )?; + assert_eq!(1, accum.len()); + assert!(accum.contains(&Column::from_name("a"))); + Ok(()) + } +} diff --git a/dask_planner/src/sql/parser_utils.rs b/src/sql/parser_utils.rs similarity index 100% rename from dask_planner/src/sql/parser_utils.rs rename to src/sql/parser_utils.rs diff --git a/dask_planner/src/sql/schema.rs b/src/sql/schema.rs similarity index 95% rename from dask_planner/src/sql/schema.rs rename to src/sql/schema.rs index 0975391f4..804db700f 100644 --- a/dask_planner/src/sql/schema.rs +++ b/src/sql/schema.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use super::types::PyDataType; use crate::sql::{function::DaskFunction, table}; -#[pyclass(name = "DaskSchema", module = "dask_planner", subclass)] +#[pyclass(name = "DaskSchema", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskSchema { #[pyo3(get, set)] diff --git a/dask_planner/src/sql/statement.rs b/src/sql/statement.rs similarity index 88% 
rename from dask_planner/src/sql/statement.rs rename to src/sql/statement.rs index f8fabc109..40fc9f268 100644 --- a/dask_planner/src/sql/statement.rs +++ b/src/sql/statement.rs @@ -2,7 +2,7 @@ use pyo3::prelude::*; use crate::parser::DaskStatement; -#[pyclass(name = "Statement", module = "dask_planner", subclass)] +#[pyclass(name = "Statement", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyStatement { pub statement: DaskStatement, diff --git a/dask_planner/src/sql/table.rs b/src/sql/table.rs similarity index 97% rename from dask_planner/src/sql/table.rs rename to src/sql/table.rs index f25f891ec..1c2585bef 100644 --- a/dask_planner/src/sql/table.rs +++ b/src/sql/table.rs @@ -2,7 +2,7 @@ use std::{any::Any, sync::Arc}; use async_trait::async_trait; use datafusion_python::{ - datafusion::arrow::datatypes::{DataType, Field, SchemaRef}, + datafusion::arrow::datatypes::{DataType, Fields, SchemaRef}, datafusion_common::DFField, datafusion_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableSource}, datafusion_optimizer::utils::split_conjunction, @@ -51,7 +51,6 @@ impl DaskTableSource { } /// Access optional filepath associated with this table source - #[allow(dead_code)] pub fn filepath(&self) -> Option<&String> { self.filepath.as_ref() } @@ -91,7 +90,7 @@ fn is_supported_push_down_expr(_expr: &Expr) -> bool { true } -#[pyclass(name = "DaskStatistics", module = "dask_planner", subclass)] +#[pyclass(name = "DaskStatistics", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskStatistics { row_count: f64, @@ -110,7 +109,7 @@ impl DaskStatistics { } } -#[pyclass(name = "DaskTable", module = "dask_planner", subclass)] +#[pyclass(name = "DaskTable", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskTable { pub(crate) schema_name: Option, @@ -195,7 +194,7 @@ pub(crate) fn table_from_logical_plan( // Get the TableProvider for this Table instance let tbl_provider: Arc = table_scan.source.clone(); let 
tbl_schema: SchemaRef = tbl_provider.schema(); - let fields: &Vec = tbl_schema.fields(); + let fields: &Fields = tbl_schema.fields(); let mut cols: Vec<(String, DaskTypeMap)> = Vec::new(); for field in fields { diff --git a/dask_planner/src/sql/types.rs b/src/sql/types.rs similarity index 95% rename from dask_planner/src/sql/types.rs rename to src/sql/types.rs index ceff904a6..34af22342 100644 --- a/dask_planner/src/sql/types.rs +++ b/src/sql/types.rs @@ -1,6 +1,8 @@ pub mod rel_data_type; pub mod rel_data_type_field; +use std::sync::Arc; + use datafusion_python::{ datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}, datafusion_sql::sqlparser::{ast::DataType as SQLType, parser::Parser, tokenizer::Tokenizer}, @@ -10,7 +12,7 @@ use pyo3::{prelude::*, types::PyDict}; use crate::{dialect::DaskDialect, error::DaskPlannerError, sql::exceptions::py_type_err}; #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "RexType", module = "datafusion")] +#[pyclass(name = "RexType", module = "dask_sql")] pub enum RexType { Alias, Literal, @@ -21,7 +23,7 @@ pub enum RexType { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "DaskTypeMap", module = "datafusion", subclass)] +#[pyclass(name = "DaskTypeMap", module = "dask_sql", subclass)] /// Represents a Python Data Type. 
This is needed instead of simple /// Enum instances because PyO3 can only support unit variants as /// of version 0.16 which means Enums like `DataType::TIMESTAMP_WITH_LOCAL_TIME_ZONE` @@ -54,10 +56,12 @@ impl DaskTypeMap { SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { let (unit, tz) = match py_kwargs { Some(dict) => { - let tz: Option = match dict.get_item("tz") { + let tz: Option> = match dict.get_item("tz") { Some(e) => { let res: PyResult = e.extract(); - Some(res.unwrap()) + Some(Arc::from(>::as_ref( + &res.unwrap(), + ))) } None => None, }; @@ -85,10 +89,12 @@ impl DaskTypeMap { SqlTypeName::TIMESTAMP => { let (unit, tz) = match py_kwargs { Some(dict) => { - let tz: Option = match dict.get_item("tz") { + let tz: Option> = match dict.get_item("tz") { Some(e) => { let res: PyResult = e.extract(); - Some(res.unwrap()) + Some(Arc::from(>::as_ref( + &res.unwrap(), + ))) } None => None, }; @@ -161,7 +167,7 @@ impl DaskTypeMap { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "PyDataType", module = "datafusion", subclass)] +#[pyclass(name = "PyDataType", module = "dask_sql", subclass)] pub struct PyDataType { data_type: DataType, } @@ -204,7 +210,7 @@ impl From for PyDataType { #[allow(non_camel_case_types)] #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "SqlTypeName", module = "datafusion")] +#[pyclass(name = "SqlTypeName", module = "dask_sql")] pub enum SqlTypeName { ANY, ARRAY, diff --git a/dask_planner/src/sql/types/rel_data_type.rs b/src/sql/types/rel_data_type.rs similarity index 98% rename from dask_planner/src/sql/types/rel_data_type.rs rename to src/sql/types/rel_data_type.rs index 1ae3646b0..59cb0fb7c 100644 --- a/dask_planner/src/sql/types/rel_data_type.rs +++ b/src/sql/types/rel_data_type.rs @@ -8,7 +8,7 @@ const PRECISION_NOT_SPECIFIED: i32 = i32::MIN; const SCALE_NOT_SPECIFIED: i32 = -1; /// RelDataType represents the type of a scalar expression 
or entire row returned from a relational expression. -#[pyclass(name = "RelDataType", module = "dask_planner", subclass)] +#[pyclass(name = "RelDataType", module = "dask_sql", subclass)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct RelDataType { nullable: bool, diff --git a/dask_planner/src/sql/types/rel_data_type_field.rs b/src/sql/types/rel_data_type_field.rs similarity index 98% rename from dask_planner/src/sql/types/rel_data_type_field.rs rename to src/sql/types/rel_data_type_field.rs index 13f036d0e..3694d0bce 100644 --- a/dask_planner/src/sql/types/rel_data_type_field.rs +++ b/src/sql/types/rel_data_type_field.rs @@ -12,7 +12,7 @@ use crate::{ }; /// RelDataTypeField represents the definition of a field in a structured RelDataType. -#[pyclass(name = "RelDataTypeField", module = "dask_planner", subclass)] +#[pyclass(name = "RelDataTypeField", module = "dask_sql", subclass)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct RelDataTypeField { qualifier: Option, diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index 65fc3b156..90b6f3828 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -169,12 +169,21 @@ def gpu_string_table(string_table): @pytest.fixture() def gpu_datetime_table(datetime_table): - # cudf doesn't have support for timezoned datetime data - datetime_table["timezone"] = datetime_table["timezone"].astype("datetime64[ns]") - datetime_table["utc_timezone"] = datetime_table["utc_timezone"].astype( - "datetime64[ns]" - ) - return cudf.from_pandas(datetime_table) if cudf else None + if cudf: + # TODO: remove once `from_pandas` has support for timezone-aware data + # https://github.com/rapidsai/cudf/issues/13611 + df = datetime_table.copy() + df["timezone"] = df["timezone"].dt.tz_localize(None) + df["utc_timezone"] = df["utc_timezone"].dt.tz_localize(None) + gdf = cudf.from_pandas(df) + gdf["timezone"] = gdf["timezone"].dt.tz_localize( + 
str(datetime_table["timezone"].dt.tz) + ) + gdf["utc_timezone"] = gdf["utc_timezone"].dt.tz_localize( + str(datetime_table["utc_timezone"].dt.tz) + ) + return gdf + return None @pytest.fixture() diff --git a/tests/integration/test_analyze.py b/tests/integration/test_analyze.py index 8371476c1..a7ccf65b6 100644 --- a/tests/integration/test_analyze.py +++ b/tests/integration/test_analyze.py @@ -1,3 +1,4 @@ +import dask.dataframe as dd import pandas as pd from dask_sql.mappings import python_to_sql_type @@ -8,24 +9,21 @@ def test_analyze(c, df): result_df = c.sql("ANALYZE TABLE df COMPUTE STATISTICS FOR ALL COLUMNS") # extract table and compute stats with Dask manually - expected_df = ( - c.sql("SELECT * FROM df") - .describe() - .append( - pd.Series( + expected_df = dd.concat( + [ + c.sql("SELECT * FROM df").describe(), + pd.DataFrame( { col: str(python_to_sql_type(df[col].dtype)).lower() for col in df.columns }, - name="data_type", - ) - ) - .append( - pd.Series( + index=["data_type"], + ), + pd.DataFrame( {col: col for col in df.columns}, - name="col_name", - ) - ) + index=["col_name"], + ), + ] ) assert_eq(result_df, expected_df) diff --git a/tests/integration/test_cmd.py b/tests/integration/test_cmd.py index 847936e22..0ffe82eb0 100644 --- a/tests/integration/test_cmd.py +++ b/tests/integration/test_cmd.py @@ -107,13 +107,14 @@ def test_meta_commands(c, client, capsys): captured = capsys.readouterr() assert "Schema not_exists not available\n" == captured.out + # FIXME: Revert to 8787 once https://github.com/dask/distributed/issues/8071 is fixed with pytest.raises( OSError, - match="Timed out .* to tcp://localhost:8787 after 5 s", + match="Timed out .* to tcp://localhost:8788 after 5 s", ): with dask_config.set({"distributed.comm.timeouts.connect": 5}): - client = _meta_commands("\\dsc localhost:8787", context=c, client=client) - assert client.scheduler.__dict__["addr"] == "localhost:8787" + client = _meta_commands("\\dsc localhost:8788", context=c, client=client) 
+ assert client.scheduler.__dict__["addr"] == "localhost:8788" def test_connection_info(c, client, capsys): diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index b34d64bbb..7b8808629 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -28,8 +28,9 @@ def cast_datetime_to_string(df): if not cols: return df - # Casting directly to string loses second precision - df[cols] = df[cols].astype("object").astype("string") + for col in cols: + df[col] = df[col].dt.strftime("%Y-%m-%d %H:%M:%S") + return df diff --git a/tests/integration/test_filter.py b/tests/integration/test_filter.py index 0388aced8..cede43185 100644 --- a/tests/integration/test_filter.py +++ b/tests/integration/test_filter.py @@ -5,6 +5,7 @@ from dask.utils_test import hlg_layer from packaging.version import parse as parseVersion +from dask_sql._compat import PQ_IS_SUPPORT, PQ_NOT_IN_SUPPORT from tests.utils import assert_eq DASK_GT_2022_4_2 = parseVersion(dask.__version__) >= parseVersion("2022.4.2") @@ -91,9 +92,8 @@ def test_filter_cast_date(c, input_table, request): CAST(timezone AS DATE) > DATE '2014-08-01' """ ) - expected_df = datetime_table[ - datetime_table["timezone"].astype(" pd.Timestamp("2014-08-01") ] assert_eq(return_df, expected_df) @@ -109,6 +109,9 @@ def test_filter_cast_date(c, input_table, request): ), ], ) +@pytest.mark.xfail( + reason="Need support for non-UTC timezoned literals, see https://github.com/dask-contrib/dask-sql/issues/1193" +) def test_filter_cast_timestamp(c, input_table, request): datetime_table = request.getfixturevalue(input_table) return_df = c.sql( @@ -162,10 +165,26 @@ def test_filter_year(c): ), pytest.param( "SELECT * FROM parquet_ddf WHERE b IN (1, 3, 5, 6)", - lambda x: x[(x["b"] == 1) | (x["b"] == 3) | (x["b"] == 5) | (x["b"] == 6)], - [[("b", "==", 1)], [("b", "==", 3)], [("b", "==", 5)], [("b", "==", 6)]], - marks=pytest.mark.xfail( - reason="WIP 
https://github.com/dask-contrib/dask-sql/issues/607" + lambda x: x[x["b"].isin([1, 3, 5, 6])], + [[("b", "in", (1, 3, 5, 6))]], + ), + pytest.param( + "SELECT * FROM parquet_ddf WHERE c IN ('A', 'B', 'C', 'D')", + lambda x: x[x["c"].isin(["A", "B", "C", "D"])], + [[("c", "in", ("A", "B", "C", "D"))]], + ), + pytest.param( + "SELECT * FROM parquet_ddf WHERE b NOT IN (1, 6)", + lambda x: x[(x["b"] != 1) & (x["b"] != 6)], + [[("b", "!=", 1), ("b", "!=", 6)]], + ), + pytest.param( + "SELECT * FROM parquet_ddf WHERE b NOT IN (1, 3, 5, 6)", + lambda x: x[~x["b"].isin([1, 3, 5, 6])], + [[("b", "not in", (1, 3, 5, 6))]], + marks=pytest.mark.skipif( + not PQ_NOT_IN_SUPPORT, + reason="Requires https://github.com/dask/dask/pull/10320", ), ), ( @@ -296,3 +315,55 @@ def test_filter_decimal(c, gpu): assert_eq(result_df, expected_df, check_index=False) c.drop_table("df") + + +@pytest.mark.skipif( + not PQ_IS_SUPPORT, + reason="Requires https://github.com/dask/dask/pull/10320", +) +def test_predicate_pushdown_isna(tmpdir): + from dask_sql.context import Context + + c = Context() + + path = str(tmpdir) + dd.from_pandas( + pd.DataFrame( + { + "a": [1, 2, None] * 5, + "b": range(15), + "index": range(15), + } + ), + npartitions=3, + ).to_parquet(path + "/df1") + df1 = dd.read_parquet(path + "/df1", index="index") + c.create_table("df1", df1) + + dd.from_pandas( + pd.DataFrame( + { + "a": [None, 2, 3] * 5, + "b": range(15), + "index": range(15), + }, + ), + npartitions=3, + ).to_parquet(path + "/df2") + df2 = dd.read_parquet(path + "/df2", index="index") + c.create_table("df2", df2) + + return_df = c.sql("SELECT df1.a FROM df1, df2 WHERE df1.a = df2.a") + + # Check for predicate pushdown + filters = [[("a", "is not", None)]] + got_filters = hlg_layer(return_df.dask, "read-parquet").creation_info["kwargs"][ + "filters" + ] + + got_filters = frozenset(frozenset(v) for v in got_filters) + expect_filters = frozenset(frozenset(v) for v in filters) + + assert got_filters == expect_filters + 
assert all(return_df.compute() == 2) + assert len(return_df) == 25 diff --git a/tests/integration/test_groupby.py b/tests/integration/test_groupby.py index 597ed04c2..92a2464ab 100644 --- a/tests/integration/test_groupby.py +++ b/tests/integration/test_groupby.py @@ -488,7 +488,7 @@ def test_covar_aggregation(c, timeseries_df): pytest.param("gpu_user_table_1", marks=pytest.mark.gpu), ], ) -@pytest.mark.parametrize("split_out", [None, 2, 4]) +@pytest.mark.parametrize("split_out", [1, 2, 4]) def test_groupby_split_out(c, input_table, split_out, request): user_table = request.getfixturevalue(input_table) diff --git a/tests/integration/test_jdbc.py b/tests/integration/test_jdbc.py index c4634311f..aa39737ad 100644 --- a/tests/integration/test_jdbc.py +++ b/tests/integration/test_jdbc.py @@ -19,8 +19,7 @@ def c(): c = Context() c.create_schema(schema) - row = create_table_row() - tables = pd.DataFrame().append(row, ignore_index=True) + tables = pd.DataFrame(create_table_row(), index=[0]) tables = tables.astype({"AN_INT": "int64"}) c.create_table(table, tables, schema_name=schema) diff --git a/tests/integration/test_join.py b/tests/integration/test_join.py index 3b131541c..3f19a3211 100644 --- a/tests/integration/test_join.py +++ b/tests/integration/test_join.py @@ -86,6 +86,56 @@ def test_join_left(c): assert_eq(return_df, expected_df, check_index=False) +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) +def test_join_left_anti(c, gpu): + df1 = pd.DataFrame({"id": [1, 1, 2, 4], "a": ["a", "b", "c", "d"]}) + df2 = pd.DataFrame({"id": [2, 1, 2, 3], "b": ["c", "c", "a", "c"]}) + c.create_table("df_1", df1, gpu=gpu) + c.create_table("df_2", df2, gpu=gpu) + + return_df = c.sql( + """ + SELECT lhs.id, lhs.a + FROM df_1 AS lhs + LEFT ANTI JOIN df_2 AS rhs + ON lhs.id = rhs.id + """ + ) + expected_df = pd.DataFrame( + { + "id": [4], + "a": ["d"], + } + ) + + assert_eq(return_df, expected_df, check_index=False) + + +@pytest.mark.gpu +def 
test_join_left_semi(c): + df1 = pd.DataFrame({"id": [1, 1, 2, 4], "a": ["a", "b", "c", "d"]}) + df2 = pd.DataFrame({"id": [2, 1, 2, 3], "b": ["c", "c", "a", "c"]}) + c.create_table("df_1", df1, gpu=True) + c.create_table("df_2", df2, gpu=True) + + return_df = c.sql( + """ + SELECT lhs.id, lhs.a + FROM df_1 AS lhs + LEFT SEMI JOIN df_2 AS rhs + ON lhs.id = rhs.id + """ + ) + expected_df = pd.DataFrame( + { + "id": [1, 1, 2], + "a": ["a", "b", "c"], + } + ) + + assert_eq(return_df, expected_df, check_index=False) + + def test_join_right(c): return_df = c.sql( """ @@ -119,7 +169,7 @@ def test_join_cross(c, user_table_1, department_table): user_table_1["key"] = 1 department_table["key"] = 1 - expected_df = dd.merge(user_table_1, department_table, on="key").drop("key", 1) + expected_df = dd.merge(user_table_1, department_table, on="key").drop(columns="key") assert_eq(return_df, expected_df, check_index=False) @@ -327,7 +377,7 @@ def test_intersect(c): limit 100 """ ) - assert actual_df["COUNT(UInt8(1))"].compute()[0] == 3 + assert actual_df["COUNT(*)"].compute()[0] == 3 # Join df_simple against itself, and then that result against df_wide. 
Nothing should match so therefore result should be 0 actual_df = c.sql( @@ -342,7 +392,7 @@ def test_intersect(c): limit 100 """ ) - assert len(actual_df["COUNT(UInt8(1))"]) == 0 + assert len(actual_df["COUNT(*)"]) == 0 actual_df = c.sql( """ diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py index 5d47e2ec4..973802fe4 100644 --- a/tests/integration/test_model.py +++ b/tests/integration/test_model.py @@ -92,6 +92,9 @@ def test_training_and_prediction(c, gpu_client): sys.platform == "win32", reason="'xgboost.core.XGBoostError: Failed to poll' on Windows only", ) +@pytest.mark.xfail( + sys.platform == "darwin", reason="Intermittent socket errors on macOS", strict=False +) @pytest.mark.parametrize( "gpu_client", [False, pytest.param(True, marks=pytest.mark.gpu)], indirect=True ) @@ -626,6 +629,9 @@ def test_mlflow_export(c, tmpdir): ) +@pytest.mark.xfail( + sys.platform == "darwin", reason="Intermittent socket errors on macOS", strict=False +) def test_mlflow_export_xgboost(c, client, tmpdir): # Test only when mlflow & xgboost was installed mlflow = pytest.importorskip("mlflow", reason="mlflow not installed") diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index d79927372..e099a3ddb 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT from tests.utils import assert_eq @@ -32,7 +33,8 @@ def test_case(c, df): CASE WHEN (a < 2) OR (3 < a AND a < 4) THEN 42 ELSE 47 END AS "S6", - CASE WHEN (1 < a AND a <= 4) THEN 1 ELSE 0 END AS "S7" + CASE WHEN (1 < a AND a <= 4) THEN 1 ELSE 0 END AS "S7", + CASE a WHEN 2 THEN 5 ELSE a + 1 END AS "S8" FROM df """ ) @@ -46,6 +48,7 @@ def test_case(c, df): ) expected_df["S6"] = df.a.apply(lambda a: 42 if ((a < 2) or (3 < a < 4)) else 47) expected_df["S7"] = df.a.apply(lambda a: 1 if (1 < a <= 4) else 0) + expected_df["S8"] = df.a.apply(lambda a: 5 if a 
== 2 else a + 1) # Do not check dtypes, as pandas versions are inconsistent here assert_eq(result_df, expected_df, check_dtype=False) @@ -404,8 +407,7 @@ def test_coalesce(c, gpu): COALESCE(NULL, 'hi') as c3, COALESCE(NULL, NULL, 'bye', 5/0) as c4, COALESCE(NULL, 3/2, NULL, 'fly') as c5, - COALESCE(SUM(b), 'why', 2.2) as c6, - COALESCE(NULL, MEAN(b), MEAN(a), 4/0) as c7 + COALESCE(NULL, MEAN(b), MEAN(a), 4/0) as c6 FROM df """ ) @@ -416,9 +418,8 @@ def test_coalesce(c, gpu): "c2": [np.nan], "c3": ["hi"], "c4": ["bye"], - "c5": ["1"], - "c6": ["why"], - "c7": [2.0], + "c5": ["1.5"], + "c6": [2.0], } ) @@ -757,10 +758,11 @@ def test_date_functions(c): EXTRACT(SECOND FROM d) AS "second", EXTRACT(WEEK FROM d) AS "week", EXTRACT(YEAR FROM d) AS "year", + EXTRACT(DATE FROM d) AS "date", LAST_DAY(d) as "last_day", - TIMESTAMPADD(YEAR, 2, d) as "plus_1_year", + TIMESTAMPADD(YEAR, 1, d) as "plus_1_year", TIMESTAMPADD(MONTH, 1, d) as "plus_1_month", TIMESTAMPADD(WEEK, 1, d) as "plus_1_week", TIMESTAMPADD(DAY, 1, d) as "plus_1_day", @@ -804,8 +806,9 @@ def test_date_functions(c): "second": [42], "week": [39], "year": [2021], + "date": [datetime(2021, 10, 3)], "last_day": [datetime(2021, 10, 31, 15, 53, 42, 47)], - "plus_1_year": [datetime(2023, 10, 3, 15, 53, 42, 47)], + "plus_1_year": [datetime(2022, 10, 3, 15, 53, 42, 47)], "plus_1_month": [datetime(2021, 11, 3, 15, 53, 42, 47)], "plus_1_week": [datetime(2021, 10, 10, 15, 53, 42, 47)], "plus_1_day": [datetime(2021, 10, 4, 15, 53, 42, 47)], @@ -935,21 +938,7 @@ def test_timestampdiff(c): assert_eq(ddf, expected_df, check_dtype=False) -@pytest.mark.parametrize( - "gpu", - [ - False, - pytest.param( - True, - marks=( - pytest.mark.gpu, - pytest.mark.xfail( - reason="Failing due to dask-cudf bug https://github.com/rapidsai/cudf/issues/12062" - ), - ), - ), - ], -) +@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)]) def test_totimestamp(c, gpu): df = pd.DataFrame( { @@ -1052,3 +1041,198 @@ def 
test_totimestamp(c, gpu): } ) assert_eq(df, expected_df, check_dtype=False) + + +@pytest.mark.parametrize( + "gpu", + [ + False, + pytest.param( + True, + marks=( + pytest.mark.gpu, + pytest.mark.xfail( + not DASK_CUDF_TODATETIME_SUPPORT, + reason="Requires https://github.com/dask/dask/pull/9881", + raises=RuntimeError, + ), + ), + ), + ], +) +def test_extract_date(c, gpu): + df = pd.DataFrame( + { + "a": [1, 2, 3], + "b": [4, 5, 6], + } + ) + df["t"] = [datetime(2021, 1, 1), datetime(2022, 2, 2), datetime(2023, 3, 3)] + c.create_table("df", df, gpu=gpu) + + result = c.sql("SELECT EXTRACT(DATE FROM t) AS e FROM df") + expected_df = pd.DataFrame( + {"e": [datetime(2021, 1, 1), datetime(2022, 2, 2), datetime(2023, 3, 3)]} + ) + assert_eq(result, expected_df) + + result = c.sql("SELECT * FROM df WHERE EXTRACT(DATE FROM t) > '2021-02-01'") + expected_df = pd.DataFrame( + { + "a": [2, 3], + "b": [5, 6], + "t": [datetime(2022, 2, 2), datetime(2023, 3, 3)], + } + ) + assert_eq(result, expected_df, check_index=False) + + result = c.sql( + "SELECT * FROM df WHERE EXTRACT(DATE FROM t) BETWEEN '2020-10-01' AND '2022-10-10'" + ) + expected_df = pd.DataFrame( + {"a": [1, 2], "b": [4, 5], "t": [datetime(2021, 1, 1), datetime(2022, 2, 2)]} + ) + assert_eq(result, expected_df) + + result = c.sql("SELECT TIMESTAMPADD(YEAR, 1, EXTRACT(DATE FROM t)) AS ta FROM df") + expected_df = pd.DataFrame( + {"ta": [datetime(2022, 1, 1), datetime(2023, 2, 2), datetime(2024, 3, 3)]} + ) + assert_eq(result, expected_df) + + result = c.sql("SELECT EXTRACT(DATE FROM t) + INTERVAL '2 days' AS i FROM df") + expected_df = pd.DataFrame( + {"i": [datetime(2021, 1, 3), datetime(2022, 2, 4), datetime(2023, 3, 5)]} + ) + assert_eq(result, expected_df) + + +@pytest.mark.parametrize( + "gpu", + [ + False, + pytest.param( + True, + marks=( + pytest.mark.gpu, + pytest.mark.xfail( + not DASK_CUDF_TODATETIME_SUPPORT, + reason="Requires https://github.com/dask/dask/pull/9881", + raises=RuntimeError, + ), + ), + ), 
+ ], +) +def test_scalar_timestamps(c, gpu): + df = pd.DataFrame({"d": [1203073300, 1503073700]}) + c.create_table("df", df, gpu=gpu) + + expected_df = pd.DataFrame( + { + "dt": [datetime(2008, 2, 20, 11, 1, 40), datetime(2017, 8, 23, 16, 28, 20)], + } + ) + + df1 = c.sql("SELECT to_timestamp(d) + INTERVAL '5 days' AS dt FROM df") + assert_eq(df1, expected_df) + df2 = c.sql("SELECT CAST(d AS TIMESTAMP) + INTERVAL '5 days' AS dt FROM df") + assert_eq(df2, expected_df) + + df1 = c.sql("SELECT TIMESTAMPADD(DAY, 5, to_timestamp(d)) AS dt FROM df") + assert_eq(df1, expected_df) + df2 = c.sql("SELECT TIMESTAMPADD(DAY, 5, d) AS dt FROM df") + assert_eq(df2, expected_df) + df3 = c.sql("SELECT TIMESTAMPADD(DAY, 5, CAST(d AS TIMESTAMP)) AS dt FROM df") + assert_eq(df3, expected_df) + + expected_df = pd.DataFrame({"day": [15, 18]}) + df1 = c.sql("SELECT EXTRACT(DAY FROM to_timestamp(d)) AS day FROM df") + assert_eq(df1, expected_df, check_dtype=False) + df2 = c.sql("SELECT EXTRACT(DAY FROM CAST(d AS TIMESTAMP)) AS day FROM df") + assert_eq(df2, expected_df, check_dtype=False) + + expected_df = pd.DataFrame( + { + "ceil_to_day": [datetime(2008, 2, 16), datetime(2017, 8, 19)], + } + ) + df1 = c.sql("SELECT CEIL(to_timestamp(d) TO DAY) AS ceil_to_day FROM df") + assert_eq(df1, expected_df) + df2 = c.sql("SELECT CEIL(CAST(d AS TIMESTAMP) TO DAY) AS ceil_to_day FROM df") + assert_eq(df2, expected_df) + + expected_df = pd.DataFrame( + { + "floor_to_day": [datetime(2008, 2, 15), datetime(2017, 8, 18)], + } + ) + df1 = c.sql("SELECT FLOOR(to_timestamp(d) TO DAY) AS floor_to_day FROM df") + assert_eq(df1, expected_df) + df2 = c.sql("SELECT FLOOR(CAST(d AS TIMESTAMP) TO DAY) AS floor_to_day FROM df") + assert_eq(df2, expected_df) + + df = pd.DataFrame({"d1": [1203073300], "d2": [1503073700]}) + c.create_table("df", df, gpu=gpu) + expected_df = pd.DataFrame({"dt": [3472]}) + df1 = c.sql( + "SELECT TIMESTAMPDIFF(DAY, to_timestamp(d1), to_timestamp(d2)) AS dt FROM df" + ) + # TODO: The 
GPU case returns an incorrect value here + if not gpu: + assert_eq(df1, expected_df) + df2 = c.sql("SELECT TIMESTAMPDIFF(DAY, d1, d2) AS dt FROM df") + assert_eq(df2, expected_df, check_dtype=False) + df3 = c.sql( + "SELECT TIMESTAMPDIFF(DAY, CAST(d1 AS TIMESTAMP), CAST(d2 AS TIMESTAMP)) AS dt FROM df" + ) + assert_eq(df3, expected_df) + + scalar1 = 1203073300 + scalar2 = 1503073700 + + expected_df = pd.DataFrame({"dt": [datetime(2008, 2, 20, 11, 1, 40)]}) + + df1 = c.sql(f"SELECT to_timestamp({scalar1}) + INTERVAL '5 days' AS dt") + assert_eq(df1, expected_df) + # TODO: Fix seconds/nanoseconds conversion + # df2 = c.sql(f"SELECT CAST({scalar1} AS TIMESTAMP) + INTERVAL '5 days' AS dt") + # assert_eq(df2, expected_df) + + df1 = c.sql(f"SELECT TIMESTAMPADD(DAY, 5, to_timestamp({scalar1})) AS dt") + assert_eq(df1, expected_df) + df2 = c.sql(f"SELECT TIMESTAMPADD(DAY, 5, {scalar1}) AS dt") + assert_eq(df2, expected_df) + df3 = c.sql(f"SELECT TIMESTAMPADD(DAY, 5, CAST({scalar1} AS TIMESTAMP)) AS dt") + assert_eq(df3, expected_df) + + expected_df = pd.DataFrame({"day": [15]}) + df1 = c.sql(f"SELECT EXTRACT(DAY FROM to_timestamp({scalar1})) AS day") + assert_eq(df1, expected_df, check_dtype=False) + # TODO: Fix seconds/nanoseconds conversion + # df2 = c.sql(f"SELECT EXTRACT(DAY FROM CAST({scalar1} AS TIMESTAMP)) AS day") + # assert_eq(df2, expected_df, check_dtype=False) + + expected_df = pd.DataFrame({"ceil_to_day": [datetime(2008, 2, 16)]}) + df1 = c.sql(f"SELECT CEIL(to_timestamp({scalar1}) TO DAY) AS ceil_to_day") + assert_eq(df1, expected_df) + df2 = c.sql(f"SELECT CEIL(CAST({scalar1} AS TIMESTAMP) TO DAY) AS ceil_to_day") + assert_eq(df2, expected_df) + + expected_df = pd.DataFrame({"floor_to_day": [datetime(2008, 2, 15)]}) + df1 = c.sql(f"SELECT FLOOR(to_timestamp({scalar1}) TO DAY) AS floor_to_day") + assert_eq(df1, expected_df) + df2 = c.sql(f"SELECT FLOOR(CAST({scalar1} AS TIMESTAMP) TO DAY) AS floor_to_day") + assert_eq(df2, expected_df) + + expected_df = 
pd.DataFrame({"dt": [3472]}) + df1 = c.sql( + f"SELECT TIMESTAMPDIFF(DAY, to_timestamp({scalar1}), to_timestamp({scalar2})) AS dt" + ) + assert_eq(df1, expected_df) + df2 = c.sql(f"SELECT TIMESTAMPDIFF(DAY, {scalar1}, {scalar2}) AS dt") + assert_eq(df2, expected_df, check_dtype=False) + df3 = c.sql( + f"SELECT TIMESTAMPDIFF(DAY, CAST({scalar1} AS TIMESTAMP), CAST({scalar2} AS TIMESTAMP)) AS dt" + ) + assert_eq(df3, expected_df) diff --git a/tests/integration/test_select.py b/tests/integration/test_select.py index 92ca6b53d..53ebdc224 100644 --- a/tests/integration/test_select.py +++ b/tests/integration/test_select.py @@ -4,6 +4,7 @@ from dask.dataframe.optimize import optimize_dataframe_getitem from dask.utils_test import hlg_layer +from dask_sql._compat import PANDAS_GT_200 from dask_sql.utils import ParsingException from tests.utils import assert_eq @@ -33,7 +34,10 @@ def test_select_column(c, df): def test_select_different_types(c): expected_df = pd.DataFrame( { - "date": pd.to_datetime(["2022-01-21 17:34", "2022-01-21", "17:34", pd.NaT]), + "date": pd.to_datetime( + ["2022-01-21 17:34", "2022-01-21", "17:34", pd.NaT], + format="mixed" if PANDAS_GT_200 else None, + ), "string": ["this is a test", "another test", "äölüć", ""], "integer": [1, 2, -4, 5], "float": [-1.1, np.NaN, pd.NA, np.sqrt(2)], @@ -163,13 +167,13 @@ def test_date_casting(c, input_table, request): expected_df = datetime_table expected_df["timezone"] = ( - expected_df["timezone"].astype(" 1] + pushdown_df = attempt_predicate_pushdown(filtered_df) + got_filters = hlg_layer(pushdown_df.dask, "read-parquet").creation_info["kwargs"][ + "filters" + ] + got_filters = frozenset(frozenset(v) for v in got_filters) + expected_filters = [[("a", ">", 1)]] + expected_filters = frozenset(frozenset(v) for v in expected_filters) + assert got_filters == expected_filters + + +def test_predicate_pushdown_logical(parquet_ddf): + filtered_df = parquet_ddf[ + (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) | 
(parquet_ddf["a"] == -1) + ] + + pushdown_df = attempt_predicate_pushdown(filtered_df) + got_filters = hlg_layer(pushdown_df.dask, "read-parquet").creation_info["kwargs"][ + "filters" + ] + got_filters = frozenset(frozenset(v) for v in got_filters) + expected_filters = [[("a", ">", 1), ("b", "<", 2)], [("a", "==", -1)]] + expected_filters = frozenset(frozenset(v) for v in expected_filters) + assert got_filters == expected_filters + + +@pytest.mark.skipif( + not PQ_NOT_IN_SUPPORT, + reason="Requires https://github.com/dask/dask/pull/10320", +) +def test_predicate_pushdown_in(parquet_ddf): + filtered_df = parquet_ddf[ + (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) + | (parquet_ddf["a"] == -1) & parquet_ddf["c"].isin(("A", "B", "C")) + | ~parquet_ddf["b"].isin((5, 6, 7)) + ] + pushdown_df = attempt_predicate_pushdown(filtered_df) + got_filters = hlg_layer(pushdown_df.dask, "read-parquet").creation_info["kwargs"][ + "filters" + ] + got_filters = frozenset(frozenset(v) for v in got_filters) + expected_filters = [ + [("b", "<", 2), ("a", ">", 1)], + [("a", "==", -1), ("c", "in", ("A", "B", "C"))], + [("b", "not in", (5, 6, 7))], + ] + expected_filters = frozenset(frozenset(v) for v in expected_filters) + assert got_filters == expected_filters + + +@pytest.mark.skipif( + not PQ_IS_SUPPORT, + reason="Requires dask>=2023.3.1", +) +def test_predicate_pushdown_isna(parquet_ddf): + filtered_df = parquet_ddf[ + (parquet_ddf["a"] > 1) & (parquet_ddf["b"] < 2) + | (parquet_ddf["a"] == -1) & ~parquet_ddf["c"].isna() + | parquet_ddf["b"].isna() + ] + pushdown_df = attempt_predicate_pushdown(filtered_df) + got_filters = hlg_layer(pushdown_df.dask, "read-parquet").creation_info["kwargs"][ + "filters" + ] + got_filters = frozenset(frozenset(v) for v in got_filters) + expected_filters = [ + [("b", "<", 2), ("a", ">", 1)], + [("a", "==", -1), ("c", "is not", None)], + [("b", "is", None)], + ] + expected_filters = frozenset(frozenset(v) for v in expected_filters) + assert got_filters 
== expected_filters + + +def test_predicate_pushdown_add_filters(parquet_ddf): + filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] + pushdown_df = attempt_predicate_pushdown( + filtered_df, + add_filters=("b", "<", 2), + ) + got_filters = hlg_layer(pushdown_df.dask, "read-parquet").creation_info["kwargs"][ + "filters" + ] + got_filters = frozenset(frozenset(v) for v in got_filters) + expected_filters = [ + [("a", ">", 1), ("b", "<", 2)], + [("a", "==", -1), ("b", "<", 2)], + ] + expected_filters = frozenset(frozenset(v) for v in expected_filters) + assert got_filters == expected_filters + + +def test_predicate_pushdown_add_filters_no_extract(parquet_ddf): + filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] + pushdown_df = attempt_predicate_pushdown( + filtered_df, + extract_filters=False, + add_filters=("b", "<", 2), + ) + got_filters = hlg_layer(pushdown_df.dask, "read-parquet").creation_info["kwargs"][ + "filters" + ] + got_filters = frozenset(frozenset(v) for v in got_filters) + expected_filters = [[("b", "<", 2)]] + expected_filters = frozenset(frozenset(v) for v in expected_filters) + assert got_filters == expected_filters + + +def test_predicate_pushdown_add_filters_no_preserve(parquet_ddf): + filtered_df = parquet_ddf[(parquet_ddf["a"] > 1) | (parquet_ddf["a"] == -1)] + pushdown_df0 = attempt_predicate_pushdown(filtered_df) + pushdown_df = attempt_predicate_pushdown( + pushdown_df0, + preserve_filters=False, + extract_filters=False, + add_filters=("b", "<", 2), + ) + + got_filters = hlg_layer(pushdown_df.dask, "read-parquet").creation_info["kwargs"][ + "filters" + ] + got_filters = frozenset(frozenset(v) for v in got_filters) + expected_filters = [[("b", "<", 2)]] + expected_filters = frozenset(frozenset(v) for v in expected_filters) + assert got_filters == expected_filters diff --git a/versioneer.py b/versioneer.py deleted file mode 100644 index 76aa1c2b7..000000000 --- a/versioneer.py +++ /dev/null @@ 
-1,2260 +0,0 @@ -# Version: 0.27 - -"""The Versioneer - like a rocketeer, but for versions. - -The Versioneer -============== - -* like a rocketeer, but for versions! -* https://github.com/python-versioneer/python-versioneer -* Brian Warner -* License: Public Domain (Unlicense) -* Compatible with: Python 3.7, 3.8, 3.9, 3.10 and pypy3 -* [![Latest Version][pypi-image]][pypi-url] -* [![Build Status][travis-image]][travis-url] - -This is a tool for managing a recorded version number in setuptools-based -python projects. The goal is to remove the tedious and error-prone "update -the embedded version string" step from your release process. Making a new -release should be as easy as recording a new tag in your version-control -system, and maybe making new tarballs. - - -## Quick Install - -Versioneer provides two installation modes. The "classic" vendored mode installs -a copy of versioneer into your repository. The experimental build-time dependency mode -is intended to allow you to skip this step and simplify the process of upgrading. 
- -### Vendored mode - -* `pip install versioneer` to somewhere in your $PATH - * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is - available, so you can also use `conda install -c conda-forge versioneer` -* add a `[tool.versioneer]` section to your `pyproject.toml` or a - `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) - * Note that you will need to add `tomli` to your build-time dependencies if you - use `pyproject.toml` -* run `versioneer install --vendor` in your source tree, commit the results -* verify version information with `python setup.py version` - -### Build-time dependency mode - -* `pip install versioneer` to somewhere in your $PATH - * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is - available, so you can also use `conda install -c conda-forge versioneer` -* add a `[tool.versioneer]` section to your `pyproject.toml` or a - `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) -* add `versioneer` (with `[toml]` extra, if configuring in `pyproject.toml`) - to the `requires` key of the `build-system` table in `pyproject.toml`: - ```toml - [build-system] - requires = ["setuptools", "versioneer[toml]"] - build-backend = "setuptools.build_meta" - ``` -* run `versioneer install --no-vendor` in your source tree, commit the results -* verify version information with `python setup.py version` - -## Version Identifiers - -Source trees come from a variety of places: - -* a version-control system checkout (mostly used by developers) -* a nightly tarball, produced by build automation -* a snapshot tarball, produced by a web-based VCS browser, like github's - "tarball from tag" feature -* a release tarball, produced by "setup.py sdist", distributed through PyPI - -Within each source tree, the version identifier (either a string or a number, -this tool is format-agnostic) can come from a variety of places: - -* ask the VCS tool itself, e.g. 
"git describe" (for checkouts), which knows - about recent "tags" and an absolute revision-id -* the name of the directory into which the tarball was unpacked -* an expanded VCS keyword ($Id$, etc) -* a `_version.py` created by some earlier build step - -For released software, the version identifier is closely related to a VCS -tag. Some projects use tag names that include more than just the version -string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool -needs to strip the tag prefix to extract the version identifier. For -unreleased software (between tags), the version identifier should provide -enough information to help developers recreate the same tree, while also -giving them an idea of roughly how old the tree is (after version 1.2, before -version 1.3). Many VCS systems can report a description that captures this, -for example `git describe --tags --dirty --always` reports things like -"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the -0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has -uncommitted changes). - -The version identifier is used for multiple purposes: - -* to allow the module to self-identify its version: `myproject.__version__` -* to choose a name and prefix for a 'setup.py sdist' tarball - -## Theory of Operation - -Versioneer works by adding a special `_version.py` file into your source -tree, where your `__init__.py` can import it. This `_version.py` knows how to -dynamically ask the VCS tool for version information at import time. - -`_version.py` also contains `$Revision$` markers, and the installation -process marks `_version.py` to have this marker rewritten with a tag name -during the `git archive` command. As a result, generated tarballs will -contain enough information to get the proper version. 
- -To allow `setup.py` to compute a version too, a `versioneer.py` is added to -the top level of your source tree, next to `setup.py` and the `setup.cfg` -that configures it. This overrides several distutils/setuptools commands to -compute the version when invoked, and changes `setup.py build` and `setup.py -sdist` to replace `_version.py` with a small static file that contains just -the generated version data. - -## Installation - -See [INSTALL.md](./INSTALL.md) for detailed installation instructions. - -## Version-String Flavors - -Code which uses Versioneer can learn about its version string at runtime by -importing `_version` from your main `__init__.py` file and running the -`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can -import the top-level `versioneer.py` and run `get_versions()`. - -Both functions return a dictionary with different flavors of version -information: - -* `['version']`: A condensed version string, rendered using the selected - style. This is the most commonly used value for the project's version - string. The default "pep440" style yields strings like `0.11`, - `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section - below for alternative styles. - -* `['full-revisionid']`: detailed revision identifier. For Git, this is the - full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". - -* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the - commit date in ISO 8601 format. This will be None if the date is not - available. - -* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that - this is only accurate if run in a VCS checkout, otherwise it is likely to - be False or None - -* `['error']`: if the version string could not be computed, this will be set - to a string describing the problem, otherwise it will be None. It may be - useful to throw an exception in setup.py if this is set, to avoid e.g. 
- creating tarballs with a version string of "unknown". - -Some variants are more useful than others. Including `full-revisionid` in a -bug report should allow developers to reconstruct the exact code being tested -(or indicate the presence of local changes that should be shared with the -developers). `version` is suitable for display in an "about" box or a CLI -`--version` output: it can be easily compared against release notes and lists -of bugs fixed in various releases. - -The installer adds the following text to your `__init__.py` to place a basic -version in `YOURPROJECT.__version__`: - - from ._version import get_versions - __version__ = get_versions()['version'] - del get_versions - -## Styles - -The setup.cfg `style=` configuration controls how the VCS information is -rendered into a version string. - -The default style, "pep440", produces a PEP440-compliant string, equal to the -un-prefixed tag name for actual releases, and containing an additional "local -version" section with more detail for in-between builds. For Git, this is -TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags ---dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the -tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and -that this commit is two revisions ("+2") beyond the "0.11" tag. For released -software (exactly equal to a known tag), the identifier will only contain the -stripped tag, e.g. "0.11". - -Other styles are available. See [details.md](details.md) in the Versioneer -source tree for descriptions. - -## Debugging - -Versioneer tries to avoid fatal errors: if something goes wrong, it will tend -to return a version of "0+unknown". To investigate the problem, run `setup.py -version`, which will run the version-lookup code in a verbose mode, and will -display the full contents of `get_versions()` (including the `error` string, -which may help identify what went wrong). 
- -## Known Limitations - -Some situations are known to cause problems for Versioneer. This details the -most significant ones. More can be found on Github -[issues page](https://github.com/python-versioneer/python-versioneer/issues). - -### Subprojects - -Versioneer has limited support for source trees in which `setup.py` is not in -the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are -two common reasons why `setup.py` might not be in the root: - -* Source trees which contain multiple subprojects, such as - [Buildbot](https://github.com/buildbot/buildbot), which contains both - "master" and "slave" subprojects, each with their own `setup.py`, - `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI - distributions (and upload multiple independently-installable tarballs). -* Source trees whose main purpose is to contain a C library, but which also - provide bindings to Python (and perhaps other languages) in subdirectories. - -Versioneer will look for `.git` in parent directories, and most operations -should get the right version string. However `pip` and `setuptools` have bugs -and implementation details which frequently cause `pip install .` from a -subproject directory to fail to find a correct version string (so it usually -defaults to `0+unknown`). - -`pip install --editable .` should work correctly. `setup.py install` might -work too. - -Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in -some later version. - -[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking -this issue. The discussion in -[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the -issue from the Versioneer side in more detail. -[pip PR#3176](https://github.com/pypa/pip/pull/3176) and -[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve -pip to let Versioneer work correctly. 
- -Versioneer-0.16 and earlier only looked for a `.git` directory next to the -`setup.cfg`, so subprojects were completely unsupported with those releases. - -### Editable installs with setuptools <= 18.5 - -`setup.py develop` and `pip install --editable .` allow you to install a -project into a virtualenv once, then continue editing the source code (and -test) without re-installing after every change. - -"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a -convenient way to specify executable scripts that should be installed along -with the python package. - -These both work as expected when using modern setuptools. When using -setuptools-18.5 or earlier, however, certain operations will cause -`pkg_resources.DistributionNotFound` errors when running the entrypoint -script, which must be resolved by re-installing the package. This happens -when the install happens with one version, then the egg_info data is -regenerated while a different version is checked out. Many setup.py commands -cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into -a different virtualenv), so this can be surprising. - -[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes -this one, but upgrading to a newer version of setuptools should probably -resolve it. - - -## Updating Versioneer - -To upgrade your project to a new release of Versioneer, do the following: - -* install the new Versioneer (`pip install -U versioneer` or equivalent) -* edit `setup.cfg` and `pyproject.toml`, if necessary, - to include any new configuration settings indicated by the release notes. - See [UPGRADING](./UPGRADING.md) for details. 
-* re-run `versioneer install --[no-]vendor` in your source tree, to replace - `SRC/_version.py` -* commit any changed files - -## Future Directions - -This tool is designed to make it easily extended to other version-control -systems: all VCS-specific components are in separate directories like -src/git/ . The top-level `versioneer.py` script is assembled from these -components by running make-versioneer.py . In the future, make-versioneer.py -will take a VCS name as an argument, and will construct a version of -`versioneer.py` that is specific to the given VCS. It might also take the -configuration arguments that are currently provided manually during -installation by editing setup.py . Alternatively, it might go the other -direction and include code from all supported VCS systems, reducing the -number of intermediate scripts. - -## Similar projects - -* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time - dependency -* [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of - versioneer -* [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools - plugin - -## License - -To make Versioneer easier to embed, all its code is dedicated to the public -domain. The `_version.py` that it creates is also in the public domain. -Specifically, both are released under the "Unlicense", as described in -https://unlicense.org/. 
- -[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg -[pypi-url]: https://pypi.python.org/pypi/versioneer/ -[travis-image]: -https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg -[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer - -""" -# pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring -# pylint:disable=missing-class-docstring,too-many-branches,too-many-statements -# pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error -# pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with -# pylint:disable=attribute-defined-outside-init,too-many-arguments - -import configparser -import errno -import functools -import json -import os -import re -import subprocess -import sys -from pathlib import Path -from typing import Callable, Dict - -try: - import tomli - - have_tomli = True -except ImportError: - have_tomli = False - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_root(): - """Get the project root directory. - - We require that all commands are run from the project root, i.e. the - directory that contains setup.py, setup.cfg, and versioneer.py . - """ - root = os.path.realpath(os.path.abspath(os.getcwd())) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - # allow 'python path/to/setup.py COMMAND' - root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ( - "Versioneer was unable to run the project root directory. 
" - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND')." - ) - raise VersioneerBadRootError(err) - try: - # Certain runtime workflows (setup.py install/develop in a setuptools - # tree) execute all dependencies in a single python process, so - # "versioneer" may be imported multiple times, and python's shared - # module-import table will cache the first one. So we can't use - # os.path.dirname(__file__), as that will find whichever - # versioneer.py was first imported, even in later projects. - my_path = os.path.realpath(os.path.abspath(__file__)) - me_dir = os.path.normcase(os.path.splitext(my_path)[0]) - vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) - if me_dir != vsr_dir and "VERSIONEER_PEP518" not in globals(): - print( - "Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(my_path), versioneer_py) - ) - except NameError: - pass - return root - - -def get_config_from_root(root): - """Read the project setup.cfg file to determine Versioneer config.""" - # This might raise OSError (if setup.cfg is missing), or - # configparser.NoSectionError (if it lacks a [versioneer] section), or - # configparser.NoOptionError (if it lacks "VCS="). See the docstring at - # the top of versioneer.py for instructions on writing your setup.cfg . 
- root = Path(root) - pyproject_toml = root / "pyproject.toml" - setup_cfg = root / "setup.cfg" - section = None - if pyproject_toml.exists() and have_tomli: - try: - with open(pyproject_toml, "rb") as fobj: - pp = tomli.load(fobj) - section = pp["tool"]["versioneer"] - except (tomli.TOMLDecodeError, KeyError): - pass - if not section: - parser = configparser.ConfigParser() - with open(setup_cfg) as cfg_file: - parser.read_file(cfg_file) - parser.get("versioneer", "VCS") # raise error if missing - - section = parser["versioneer"] - - cfg = VersioneerConfig() - cfg.VCS = section["VCS"] - cfg.style = section.get("style", "") - cfg.versionfile_source = section.get("versionfile_source") - cfg.versionfile_build = section.get("versionfile_build") - cfg.tag_prefix = section.get("tag_prefix") - if cfg.tag_prefix in ("''", '""', None): - cfg.tag_prefix = "" - cfg.parentdir_prefix = section.get("parentdir_prefix") - cfg.verbose = section.get("verbose") - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -# these dictionaries contain VCS-specific tools -LONG_VERSION_PY: Dict[str, str] = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} - - -def register_vcs_handler(vcs, method): # decorator - """Create decorator to mark a method as the handler of a VCS.""" - - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - HANDLERS.setdefault(vcs, {})[method] = f - return f - - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - process = None - - popen_kwargs = {} - if sys.platform == "win32": - # This hides the console window if pythonw.exe is used - startupinfo = subprocess.STARTUPINFO() - startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - popen_kwargs["startupinfo"] = startupinfo - - for command in commands: - try: - dispcmd = str([command] + args) - # remember 
shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen( - [command] + args, - cwd=cwd, - env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr else None), - **popen_kwargs, - ) - break - except OSError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = process.communicate()[0].strip().decode() - if process.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, process.returncode - return stdout, process.returncode - - -LONG_VERSION_PY[ - "git" -] = r''' -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. -# Generated by versioneer-0.27 -# https://github.com/python-versioneer/python-versioneer - -"""Git implementation of _version.py.""" - -import errno -import os -import re -import subprocess -import sys -from typing import Callable, Dict -import functools - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). 
- git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" - git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" - git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "%(STYLE)s" - cfg.tag_prefix = "%(TAG_PREFIX)s" - cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" - cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY: Dict[str, str] = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} - - -def register_vcs_handler(vcs, method): # decorator - """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - process = None - - popen_kwargs = {} - if sys.platform == "win32": - # This hides the console window if pythonw.exe is used - startupinfo = subprocess.STARTUPINFO() - startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - popen_kwargs["startupinfo"] = startupinfo - - for command in commands: - try: - dispcmd = str([command] + args) - # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen([command] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None), **popen_kwargs) - break - except OSError: - e = 
sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %%s" %% dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %%s" %% (commands,)) - return None, None - stdout = process.communicate()[0].strip().decode() - if process.returncode != 0: - if verbose: - print("unable to run %%s (error)" %% dispcmd) - print("stdout was %%s" %% stdout) - return None, process.returncode - return stdout, process.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for _ in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print("Tried directories %%s but none started with prefix %%s" %% - (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - with open(versionfile_abs, "r") as fobj: - for line in fobj: - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - except OSError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if "refnames" not in keywords: - raise NotThisMethod("Short version file found") - date = keywords.get("date") - if date is not None: - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - - # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = {r.strip() for r in refnames.strip("()").split(",")} - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. 
The old git %%d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} - if verbose: - print("discarding '%%s', no digits" %% ",".join(refs - tags)) - if verbose: - print("likely tags: %%s" %% ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - # Filter out refs that exactly match prefix or that don't start - # with a number once the prefix is stripped (mostly a concern - # when prefix is '') - if not re.match(r'\d', r): - continue - if verbose: - print("picking %%s" %% r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - # GIT_DIR can interfere with correct operation of Versioneer. 
- # It may be intended to be passed to the Versioneer-versioned project, - # but that should not change where we get our version from. - env = os.environ.copy() - env.pop("GIT_DIR", None) - runner = functools.partial(runner, env=env) - - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=not verbose) - if rc != 0: - if verbose: - print("Directory %%s not under git control" %% root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, [ - "describe", "--tags", "--dirty", "--always", "--long", - "--match", f"{tag_prefix}[[:digit:]]*" - ], cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], - cwd=root) - # --abbrev-ref was added in git-1.6.3 - if rc != 0 or branch_name is None: - raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") - branch_name = branch_name.strip() - - if branch_name == "HEAD": - # If we aren't exactly on a branch, pick a branch which represents - # the current commit. If all else fails, we are on a branchless - # commit. 
- branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) - # --contains was added in git-1.5.4 - if rc != 0 or branches is None: - raise NotThisMethod("'git branch --contains' returned error") - branches = branches.split("\n") - - # Remove the first line if we're running detached - if "(" in branches[0]: - branches.pop(0) - - # Strip off the leading "* " from the list of branches. - branches = [branch[2:] for branch in branches] - if "master" in branches: - branch_name = "master" - elif not branches: - branch_name = None - else: - # Pick the first branch that is returned. Good or bad. - branch_name = branches[0] - - pieces["branch"] = branch_name - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparsable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%%s'" - %% describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%%s' doesn't start with prefix '%%s'" - print(fmt %% (full_tag, tag_prefix)) - pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" - %% (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) - pieces["distance"] = len(out.split()) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_branch(pieces): - """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . - - The ".dev0" means not master branch. Note that .dev0 sorts backwards - (a feature branch will appear "older" than the master branch). - - Exceptions: - 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0" - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def pep440_split_post(ver): - """Split pep440 version string at the post-release segment. - - Returns the release segments before the post-release and the - post-release version number (or -1 if no post-release segment is present). - """ - vc = str.split(ver, ".post") - return vc[0], int(vc[1] or 0) if len(vc) == 2 else None - - -def render_pep440_pre(pieces): - """TAG[.postN.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 
0.post0.devDISTANCE - """ - if pieces["closest-tag"]: - if pieces["distance"]: - # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) - rendered = tag_version - if post_version is not None: - rendered += ".post%%d.dev%%d" %% (post_version + 1, pieces["distance"]) - else: - rendered += ".post0.dev%%d" %% (pieces["distance"]) - else: - # no commits, use the tag as the version - rendered = pieces["closest-tag"] - else: - # exception #1 - rendered = "0.post0.dev%%d" %% pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%%s" %% pieces["short"] - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%%s" %% pieces["short"] - return rendered - - -def render_pep440_post_branch(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . - - The ".dev0" means not master branch. - - Exceptions: - 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%%s" %% pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+g%%s" %% pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-branch": - rendered = render_pep440_branch(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-post-branch": - rendered = render_pep440_post_branch(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%%s'" %% style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. 
- - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. - for _ in cfg.versionfile_source.split('/'): - root = os.path.dirname(root) - except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} -''' - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - with open(versionfile_abs, "r") as fobj: - for line in fobj: - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - except OSError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if "refnames" not in keywords: - raise NotThisMethod("Short version file found") - date = keywords.get("date") - if date is not None: - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = {r.strip() for r in refnames.strip("()").split(",")} - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. 
The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r"\d", r)} - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix) :] - # Filter out refs that exactly match prefix or that don't start - # with a number once the prefix is stripped (mostly a concern - # when prefix is '') - if not re.match(r"\d", r): - continue - if verbose: - print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - # GIT_DIR can interfere with correct operation of Versioneer. 
- # It may be intended to be passed to the Versioneer-versioned project, - # but that should not change where we get our version from. - env = os.environ.copy() - env.pop("GIT_DIR", None) - runner = functools.partial(runner, env=env) - - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner( - GITS, - [ - "describe", - "--tags", - "--dirty", - "--always", - "--long", - "--match", - f"{tag_prefix}[[:digit:]]*", - ], - cwd=root, - ) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) - # --abbrev-ref was added in git-1.6.3 - if rc != 0 or branch_name is None: - raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") - branch_name = branch_name.strip() - - if branch_name == "HEAD": - # If we aren't exactly on a branch, pick a branch which represents - # the current commit. If all else fails, we are on a branchless - # commit. 
- branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) - # --contains was added in git-1.5.4 - if rc != 0 or branches is None: - raise NotThisMethod("'git branch --contains' returned error") - branches = branches.split("\n") - - # Remove the first line if we're running detached - if "(" in branches[0]: - branches.pop(0) - - # Strip off the leading "* " from the list of branches. - branches = [branch[2:] for branch in branches] - if "master" in branches: - branch_name = "master" - elif not branches: - branch_name = None - else: - # Pick the first branch that is returned. Good or bad. - branch_name = branches[0] - - pieces["branch"] = branch_name - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) - if not mo: - # unparsable. Maybe git-describe is misbehaving? 
- pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) - pieces["distance"] = len(out.split()) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def do_vcs_install(versionfile_source, ipy): - """Git-specific installation logic for Versioneer. - - For Git, this means creating/changing .gitattributes to mark _version.py - for export-subst keyword substitution. 
- """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - files = [versionfile_source] - if ipy: - files.append(ipy) - if "VERSIONEER_PEP518" not in globals(): - try: - my_path = __file__ - if my_path.endswith((".pyc", ".pyo")): - my_path = os.path.splitext(my_path)[0] + ".py" - versioneer_file = os.path.relpath(my_path) - except NameError: - versioneer_file = "versioneer.py" - files.append(versioneer_file) - present = False - try: - with open(".gitattributes", "r") as fobj: - for line in fobj: - if line.strip().startswith(versionfile_source): - if "export-subst" in line.strip().split()[1:]: - present = True - break - except OSError: - pass - if not present: - with open(".gitattributes", "a+") as fobj: - fobj.write(f"{versionfile_source} export-subst\n") - files.append(".gitattributes") - run_command(GITS, ["add", "--"] + files) - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for _ in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -SHORT_VERSION_PY = """ -# This file was generated by 'versioneer.py' (0.27) from -# revision-control system data, or from the parent directory name of an -# unpacked source archive. 
Distribution tarballs contain a pre-generated copy -# of this file. - -import json - -version_json = ''' -%s -''' # END VERSION_JSON - - -def get_versions(): - return json.loads(version_json) -""" - - -def versions_from_file(filename): - """Try to determine the version from _version.py if present.""" - try: - with open(filename) as f: - contents = f.read() - except OSError: - raise NotThisMethod("unable to read _version.py") - mo = re.search( - r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) - if not mo: - mo = re.search( - r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) - if not mo: - raise NotThisMethod("no version_json in _version.py") - return json.loads(mo.group(1)) - - -def write_to_version_file(filename, versions): - """Write the given version number to the given _version.py file.""" - os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) - with open(filename, "w") as f: - f.write(SHORT_VERSION_PY % contents) - - print("set %s to '%s'" % (filename, versions["version"])) - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_branch(pieces): - """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . - - The ".dev0" means not master branch. Note that .dev0 sorts backwards - (a feature branch will appear "older" than the master branch). - - Exceptions: - 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0" - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def pep440_split_post(ver): - """Split pep440 version string at the post-release segment. - - Returns the release segments before the post-release and the - post-release version number (or -1 if no post-release segment is present). - """ - vc = str.split(ver, ".post") - return vc[0], int(vc[1] or 0) if len(vc) == 2 else None - - -def render_pep440_pre(pieces): - """TAG[.postN.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 
0.post0.devDISTANCE - """ - if pieces["closest-tag"]: - if pieces["distance"]: - # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) - rendered = tag_version - if post_version is not None: - rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) - else: - rendered += ".post0.dev%d" % (pieces["distance"]) - else: - # no commits, use the tag as the version - rendered = pieces["closest-tag"] - else: - # exception #1 - rendered = "0.post0.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_post_branch(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . - - The ".dev0" means not master branch. - - Exceptions: - 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-branch": - rendered = render_pep440_branch(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-post-branch": - rendered = render_pep440_post_branch(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } - - -class VersioneerBadRootError(Exception): - """The project root directory is unknown or missing key files.""" - - -def get_versions(verbose=False): - """Get the project version from whatever source is available. - - Returns dict with two keys: 'version' and 'full'. 
- """ - if "versioneer" in sys.modules: - # see the discussion in cmdclass.py:get_cmdclass() - del sys.modules["versioneer"] - - root = get_root() - cfg = get_config_from_root(root) - - assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" - handlers = HANDLERS.get(cfg.VCS) - assert handlers, "unrecognized VCS '%s'" % cfg.VCS - verbose = verbose or cfg.verbose - assert ( - cfg.versionfile_source is not None - ), "please set versioneer.versionfile_source" - assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" - - versionfile_abs = os.path.join(root, cfg.versionfile_source) - - # extract version from first of: _version.py, VCS command (e.g. 'git - # describe'), parentdir. This is meant to work for developers using a - # source checkout, for users of a tarball created by 'setup.py sdist', - # and for users of a tarball/zipball created by 'git archive' or github's - # download-from-tag feature or the equivalent in other VCSes. - - get_keywords_f = handlers.get("get_keywords") - from_keywords_f = handlers.get("keywords") - if get_keywords_f and from_keywords_f: - try: - keywords = get_keywords_f(versionfile_abs) - ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) - if verbose: - print("got version from expanded keyword %s" % ver) - return ver - except NotThisMethod: - pass - - try: - ver = versions_from_file(versionfile_abs) - if verbose: - print("got version from file %s %s" % (versionfile_abs, ver)) - return ver - except NotThisMethod: - pass - - from_vcs_f = handlers.get("pieces_from_vcs") - if from_vcs_f: - try: - pieces = from_vcs_f(cfg.tag_prefix, root, verbose) - ver = render(pieces, cfg.style) - if verbose: - print("got version from VCS %s" % ver) - return ver - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - if verbose: - print("got version from parentdir %s" % ver) - return ver - except NotThisMethod: - pass - - if verbose: - 
print("unable to compute version") - - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } - - -def get_version(): - """Get the short version string for this project.""" - return get_versions()["version"] - - -def get_cmdclass(cmdclass=None): - """Get the custom setuptools subclasses used by Versioneer. - - If the package uses a different cmdclass (e.g. one from numpy), it - should be provide as an argument. - """ - if "versioneer" in sys.modules: - del sys.modules["versioneer"] - # this fixes the "python setup.py develop" case (also 'install' and - # 'easy_install .'), in which subdependencies of the main project are - # built (using setup.py bdist_egg) in the same python process. Assume - # a main project A and a dependency B, which use different versions - # of Versioneer. A's setup.py imports A's Versioneer, leaving it in - # sys.modules by the time B's setup.py is executed, causing B to run - # with the wrong versioneer. Setuptools wraps the sub-dep builds in a - # sandbox that restores sys.modules to it's pre-build state, so the - # parent is protected against the child's "import versioneer". By - # removing ourselves from sys.modules here, before the child build - # happens, we protect the child from the parent's versioneer too. 
- # Also see https://github.com/python-versioneer/python-versioneer/issues/52 - - cmds = {} if cmdclass is None else cmdclass.copy() - - # we add "version" to setuptools - from setuptools import Command - - class cmd_version(Command): - description = "report generated version string" - user_options = [] - boolean_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - vers = get_versions(verbose=True) - print("Version: %s" % vers["version"]) - print(" full-revisionid: %s" % vers.get("full-revisionid")) - print(" dirty: %s" % vers.get("dirty")) - print(" date: %s" % vers.get("date")) - if vers["error"]: - print(" error: %s" % vers["error"]) - - cmds["version"] = cmd_version - - # we override "build_py" in setuptools - # - # most invocation pathways end up running build_py: - # distutils/build -> build_py - # distutils/install -> distutils/build ->.. - # setuptools/bdist_wheel -> distutils/install ->.. - # setuptools/bdist_egg -> distutils/install_lib -> build_py - # setuptools/install -> bdist_egg ->.. - # setuptools/develop -> ? - # pip install: - # copies source tree to a tempdir before running egg_info/etc - # if .git isn't copied too, 'git describe' will fail - # then does setup.py bdist_wheel, or sometimes setup.py install - # setup.py egg_info -> ? - - # pip install -e . and setuptool/editable_wheel will invoke build_py - # but the build_py command is not expected to copy any files. 
- - # we override different "build_py" commands for both environments - if "build_py" in cmds: - _build_py = cmds["build_py"] - else: - from setuptools.command.build_py import build_py as _build_py - - class cmd_build_py(_build_py): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - _build_py.run(self) - if getattr(self, "editable_mode", False): - # During editable installs `.py` and data files are - # not copied to build_lib - return - # now locate _version.py in the new build/ directory and replace - # it with an updated value - if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - cmds["build_py"] = cmd_build_py - - if "build_ext" in cmds: - _build_ext = cmds["build_ext"] - else: - from setuptools.command.build_ext import build_ext as _build_ext - - class cmd_build_ext(_build_ext): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - _build_ext.run(self) - if self.inplace: - # build_ext --inplace will only build extensions in - # build/lib<..> dir with no _version.py to write to. - # As in place builds will already have a _version.py - # in the module dir, we do not need to write one. - return - # now locate _version.py in the new build/ directory and replace - # it with an updated value - target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) - if not os.path.exists(target_versionfile): - print( - f"Warning: {target_versionfile} does not exist, skipping " - "version update. This can happen if you are running build_ext " - "without first running build_py." - ) - return - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - cmds["build_ext"] = cmd_build_ext - - if "cx_Freeze" in sys.modules: # cx_freeze enabled? 
- from cx_Freeze.dist import build_exe as _build_exe - - # nczeczulin reports that py2exe won't like the pep440-style string - # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. - # setup(console=[{ - # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION - # "product_version": versioneer.get_version(), - # ... - - class cmd_build_exe(_build_exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _build_exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - - cmds["build_exe"] = cmd_build_exe - del cmds["build_py"] - - if "py2exe" in sys.modules: # py2exe enabled? 
- try: - from py2exe.setuptools_buildexe import py2exe as _py2exe - except ImportError: - from py2exe.distutils_buildexe import py2exe as _py2exe - - class cmd_py2exe(_py2exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _py2exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - - cmds["py2exe"] = cmd_py2exe - - # sdist farms its file list building out to egg_info - if "egg_info" in cmds: - _egg_info = cmds["egg_info"] - else: - from setuptools.command.egg_info import egg_info as _egg_info - - class cmd_egg_info(_egg_info): - def find_sources(self): - # egg_info.find_sources builds the manifest list and writes it - # in one shot - super().find_sources() - - # Modify the filelist and normalize it - root = get_root() - cfg = get_config_from_root(root) - self.filelist.append("versioneer.py") - if cfg.versionfile_source: - # There are rare cases where versionfile_source might not be - # included by default, so we must be explicit - self.filelist.append(cfg.versionfile_source) - self.filelist.sort() - self.filelist.remove_duplicates() - - # The write method is hidden in the manifest_maker instance that - # generated the filelist and was thrown away - # We will instead replicate their final normalization (to unicode, - # and POSIX-style paths) - from setuptools import unicode_utils - - normalized = [ - unicode_utils.filesys_decode(f).replace(os.sep, "/") - for f in self.filelist.files - ] - - manifest_filename = os.path.join(self.egg_info, "SOURCES.txt") - with open(manifest_filename, "w") as fobj: - 
fobj.write("\n".join(normalized)) - - cmds["egg_info"] = cmd_egg_info - - # we override different "sdist" commands for both environments - if "sdist" in cmds: - _sdist = cmds["sdist"] - else: - from setuptools.command.sdist import sdist as _sdist - - class cmd_sdist(_sdist): - def run(self): - versions = get_versions() - self._versioneer_generated_versions = versions - # unless we update this, the command will keep using the old - # version - self.distribution.metadata.version = versions["version"] - return _sdist.run(self) - - def make_release_tree(self, base_dir, files): - root = get_root() - cfg = get_config_from_root(root) - _sdist.make_release_tree(self, base_dir, files) - # now locate _version.py in the new base_dir directory - # (remembering that it may be a hardlink) and replace it with an - # updated value - target_versionfile = os.path.join(base_dir, cfg.versionfile_source) - print("UPDATING %s" % target_versionfile) - write_to_version_file( - target_versionfile, self._versioneer_generated_versions - ) - - cmds["sdist"] = cmd_sdist - - return cmds - - -CONFIG_ERROR = """ -setup.cfg is missing the necessary Versioneer configuration. You need -a section like: - - [versioneer] - VCS = git - style = pep440 - versionfile_source = src/myproject/_version.py - versionfile_build = myproject/_version.py - tag_prefix = - parentdir_prefix = myproject- - -You will also need to edit your setup.py to use the results: - - import versioneer - setup(version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), ...) - -Please read the docstring in ./versioneer.py for configuration instructions, -edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. -""" - -SAMPLE_CONFIG = """ -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. 
- -[versioneer] -#VCS = git -#style = pep440 -#versionfile_source = -#versionfile_build = -#tag_prefix = -#parentdir_prefix = - -""" - -OLD_SNIPPET = """ -from ._version import get_versions -__version__ = get_versions()['version'] -del get_versions -""" - -INIT_PY_SNIPPET = """ -from . import {0} -__version__ = {0}.get_versions()['version'] -""" - - -def do_setup(): - """Do main VCS-independent setup function for installing Versioneer.""" - root = get_root() - try: - cfg = get_config_from_root(root) - except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: - if isinstance(e, (OSError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", file=sys.stderr) - with open(os.path.join(root, "setup.cfg"), "a") as f: - f.write(SAMPLE_CONFIG) - print(CONFIG_ERROR, file=sys.stderr) - return 1 - - print(" creating %s" % cfg.versionfile_source) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") - if os.path.exists(ipy): - try: - with open(ipy, "r") as f: - old = f.read() - except OSError: - old = "" - module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] - snippet = INIT_PY_SNIPPET.format(module) - if OLD_SNIPPET in old: - print(" replacing boilerplate in %s" % ipy) - with open(ipy, "w") as f: - f.write(old.replace(OLD_SNIPPET, snippet)) - elif snippet not in old: - print(" appending to %s" % ipy) - with open(ipy, "a") as f: - f.write(snippet) - else: - print(" %s unmodified" % ipy) - else: - print(" %s doesn't exist, ok" % ipy) - ipy = None - - # Make VCS-specific changes. For git, this means creating/changing - # .gitattributes to mark _version.py for export-subst keyword - # substitution. 
- do_vcs_install(cfg.versionfile_source, ipy) - return 0 - - -def scan_setup_py(): - """Validate the contents of setup.py against Versioneer's expectations.""" - found = set() - setters = False - errors = 0 - with open("setup.py", "r") as f: - for line in f.readlines(): - if "import versioneer" in line: - found.add("import") - if "versioneer.get_cmdclass()" in line: - found.add("cmdclass") - if "versioneer.get_version()" in line: - found.add("get_version") - if "versioneer.VCS" in line: - setters = True - if "versioneer.versionfile_source" in line: - setters = True - if len(found) != 3: - print("") - print("Your setup.py appears to be missing some important items") - print("(but I might be wrong). Please make sure it has something") - print("roughly like the following:") - print("") - print(" import versioneer") - print(" setup( version=versioneer.get_version(),") - print(" cmdclass=versioneer.get_cmdclass(), ...)") - print("") - errors += 1 - if setters: - print("You should remove lines like 'versioneer.VCS = ' and") - print("'versioneer.versionfile_source = ' . This configuration") - print("now lives in setup.cfg, and should be removed from setup.py") - print("") - errors += 1 - return errors - - -def setup_command(): - """Set up Versioneer and exit with appropriate error code.""" - errors = do_setup() - errors += scan_setup_py() - sys.exit(1 if errors else 0) - - -if __name__ == "__main__": - cmd = sys.argv[1] - if cmd == "setup": - setup_command()