Unpin dask/distributed for development #1319

Merged: 39 commits, merged Apr 15, 2024 (changes shown from 26 commits)

Commits
61513d3 - Unpin dask/distributed for development (charlesbluca, Mar 19, 2024)
0fb485d - First pass at unblocking dask-expr issues - replace _Frame usage (charlesbluca, Mar 22, 2024)
a83be6d - Merge remote-tracking branch 'upstream/main' into unpin-dask-distributed (charlesbluca, Mar 26, 2024)
920cd53 - First pass at unblocking pytest errors (charlesbluca, Mar 26, 2024)
7f952d8 - Disable predicate pushdown & its tests if dask-expr enabled (charlesbluca, Apr 3, 2024)
fc4a248 - Make sure partition_borders is computed in limit map_partitions func (charlesbluca, Apr 3, 2024)
c3d8efe - Skip intake tests for now (charlesbluca, Apr 3, 2024)
4f1747c - Simplify cross join logic to avoid internal graph manipulation (charlesbluca, Apr 5, 2024)
e50e30b - Round trip timeseries fixture to pandas to avoid dask-expr bug (charlesbluca, Apr 5, 2024)
2d5c8c7 - Fix skipif_dask_expr_enabled marker (charlesbluca, Apr 5, 2024)
2fe2e3c - Ignore warnings around mismatched dtypes in joins (charlesbluca, Apr 5, 2024)
a795526 - Add handling for dask-expr to test_broadcast_join (charlesbluca, Apr 5, 2024)
8938a4a - Skip parquet stats tests for now (charlesbluca, Apr 9, 2024)
6305a17 - Skip DPP tests on dask-expr for now (charlesbluca, Apr 9, 2024)
76b55e3 - Pass ddf object as meta for test_describe_model (charlesbluca, Apr 9, 2024)
876d282 - Add dask-expr handling to test_sort_topk (charlesbluca, Apr 9, 2024)
1e72f33 - Avoid using Dask graph internals for random functions (charlesbluca, Apr 9, 2024)
fddf52f - Skip over window count tests for now (charlesbluca, Apr 10, 2024)
72dd6ab - Skip test_over_calls and test_over_with_windows (charlesbluca, Apr 10, 2024)
93bc5dc - Update timeseries fixture comment to acknowledge fix (charlesbluca, Apr 10, 2024)
5cc7cc5 - More detailed messages for window test skips (charlesbluca, Apr 10, 2024)
9ba3f0a - Skip test_join_alias_w_projection for now (charlesbluca, Apr 10, 2024)
ffd695c - Un-xfail test_xgboost_training_prediction on win32 (charlesbluca, Apr 10, 2024)
73d32a2 - Windows failures are still intermittent (charlesbluca, Apr 10, 2024)
35aa225 - Bump rust to 1.73 to circumvent conda sha256 errors (charlesbluca, Apr 10, 2024)
8e6dc05 - Disable query planning in GPU CI for now (charlesbluca, Apr 10, 2024)
9563b89 - Revert "Bump rust to 1.73 to circumvent conda sha256 errors" (charlesbluca, Apr 10, 2024)
b7ebab9 - Use older conda-build version to try and resolve build issues (charlesbluca, Apr 10, 2024)
9f32482 - Pin to an older version of conda-build and boa (charlesbluca, Apr 10, 2024)
207803b - Skip deadlocking xgboost test on GPU (charlesbluca, Apr 11, 2024)
6cbe5a9 - Add subset of testing with query planning disabled (charlesbluca, Apr 12, 2024)
5a279b8 - Add query-planning to job names (charlesbluca, Apr 12, 2024)
7ce1cbf - Fix syntax errors (charlesbluca, Apr 12, 2024)
6d762a5 - Add dask-expr to CI environments, bump to pandas 2 (charlesbluca, Apr 15, 2024)
536d156 - Bump dask/dask-expr to 2024.2.1/0.5 to get around aggregation bug (charlesbluca, Apr 15, 2024)
53c6af5 - Bump dask / dask-expr to 2024.3.1 / 1.0.5 to resolve drop bug (charlesbluca, Apr 15, 2024)
8d1d107 - Bump dask / dask-expr to 2024.4.1 / 1.0.11 to resolve head bug (charlesbluca, Apr 15, 2024)
4a64319 - Remove dask-expr workaround from timeseries fixture (charlesbluca, Apr 15, 2024)
99a5745 - Unpin sqlalchemy in python 3.9 CI environment (charlesbluca, Apr 15, 2024)
Files changed
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -6,7 +6,7 @@ description = "Bindings for DataFusion used by Dask-SQL"
 readme = "README.md"
 license = "Apache-2.0"
 edition = "2021"
-rust-version = "1.72"
+rust-version = "1.73"

Collaborator: All for bumping! Just for my own knowledge, was this a requirement, or are we just doing it because it has been a while? Either way is fine and this blocks nothing; just want to know.

Collaborator (author): I actually just did this to try and triage some issues we've been seeing in the aarch64 builds around hash mismatches:

https://github.com/dask-contrib/dask-sql/actions/runs/8437291358/job/23106757782

The errors in that run seemed to imply that something was wrong with the rust 1.72 packages, but it actually turns out to be an issue with newer versions of conda-build (and/or boa); it looks like 9f32482 unblocked the builds.

 include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"]

 [dependencies]
2 changes: 1 addition & 1 deletion continuous_integration/docker/conda.txt
@@ -1,5 +1,5 @@
 python>=3.9
-dask==2024.1.1
+dask>=2024.1.1
 pandas>=1.4.0
 jpype1>=1.0.2
 openjdk>=8
2 changes: 1 addition & 1 deletion continuous_integration/docker/main.dockerfile
@@ -16,7 +16,7 @@ RUN mamba install -y \
     # build requirements
     "maturin>=1.3,<1.4" \
     # core dependencies
-    "dask==2024.1.1" \
+    "dask>=2024.1.1" \
     "pandas>=1.4.0" \
     "fastapi>=0.92.0" \
     "httpx>=0.24.1" \
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.10.yaml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
 dependencies:
   - c-compiler
-  - dask==2024.1.1
+  - dask>=2024.1.1
   - fastapi>=0.92.0
   - fugue>=0.7.3
   - httpx>=0.24.1
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.11.yaml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
 dependencies:
   - c-compiler
-  - dask==2024.1.1
+  - dask>=2024.1.1
   - fastapi>=0.92.0
   - fugue>=0.7.3
   - httpx>=0.24.1
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.12.yaml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
 dependencies:
   - c-compiler
-  - dask==2024.1.1
+  - dask>=2024.1.1
   - fastapi>=0.92.0
   - fugue>=0.7.3
   - httpx>=0.24.1
4 changes: 3 additions & 1 deletion continuous_integration/gpuci/build.sh
@@ -23,6 +23,8 @@ cd "$WORKSPACE"
 # Determine CUDA release version
 export CUDA_REL=${CUDA_VERSION%.*}

+export DASK_DATAFRAME__QUERY_PLANNING=false
+
 ################################################################################
 # SETUP - Check environment
 ################################################################################
@@ -61,4 +63,4 @@ conda config --show-sources
 conda list --show-channel-urls

 rapids-logger "Python py.test for dask-sql"
-py.test $WORKSPACE -n 4 -v -m gpu --runqueries --rungpu --junitxml="$WORKSPACE/junit-dask-sql.xml" --cov-config="$WORKSPACE/.coveragerc" --cov=dask_sql --cov-report=xml:"$WORKSPACE/dask-sql-coverage.xml" --cov-report term
+py.test $WORKSPACE -n $PARALLEL_LEVEL -v -m gpu --runqueries --rungpu --junitxml="$WORKSPACE/junit-dask-sql.xml" --cov-config="$WORKSPACE/.coveragerc" --cov=dask_sql --cov-report=xml:"$WORKSPACE/dask-sql-coverage.xml" --cov-report term

Collaborator: Nice, good change.
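
The DASK_DATAFRAME__QUERY_PLANNING=false export added in build.sh above corresponds to dask's dataframe.query-planning config option. As a hedged illustration (assuming a dask 2024.x release), the same toggle from Python looks roughly like this; the flag has to be set before dask.dataframe is first imported, since that is when the backend is chosen:

```python
import dask

# Disable the dask-expr backend; this must happen before the first
# `import dask.dataframe`, which is when the backend is selected.
dask.config.set({"dataframe.query-planning": False})

import dask.dataframe as dd  # noqa: E402

print(dd._dask_expr_enabled())  # expected: False
```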

2 changes: 1 addition & 1 deletion continuous_integration/gpuci/environment-3.10.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
   - c-compiler
   - zlib
-  - dask==2024.1.1
+  - dask>=2024.1.1
   - fastapi>=0.92.0
   - fugue>=0.7.3
   - httpx>=0.24.1
2 changes: 1 addition & 1 deletion continuous_integration/gpuci/environment-3.9.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
   - c-compiler
   - zlib
-  - dask==2024.1.1
+  - dask>=2024.1.1
   - fastapi>=0.92.0
   - fugue>=0.7.3
   - httpx>=0.24.1
2 changes: 1 addition & 1 deletion continuous_integration/recipe/conda_build_config.yaml
@@ -5,7 +5,7 @@ c_compiler_version:
 rust_compiler:
   - rust
 rust_compiler_version:
-  - '1.72'
+  - '1.73'
 maturin:
   - '1.3'
 xz: # [linux64]
2 changes: 1 addition & 1 deletion continuous_integration/recipe/meta.yaml
@@ -32,7 +32,7 @@ requirements:
     - xz # [linux64]
   run:
     - python
-    - dask ==2024.1.1
+    - dask >=2024.1.1
     - pandas >=1.4.0
     - fastapi >=0.92.0
     - httpx >=0.24.1
16 changes: 12 additions & 4 deletions dask_sql/context.py
@@ -262,15 +262,23 @@ def create_table(
             self.schema[schema_name].filepaths[table_name.lower()] = input_table
         elif hasattr(input_table, "dask") and dd.utils.is_dataframe_like(input_table):
             try:
-                dask_filepath = hlg_layer(
-                    input_table.dask, "read-parquet"
-                ).creation_info["args"][0]
+                if dd._dask_expr_enabled():
+                    from dask_expr.io.parquet import ReadParquet
+
+                    dask_filepath = None
+                    operations = input_table.find_operations(ReadParquet)
+                    for op in operations:
+                        dask_filepath = op._args[0]
+                else:
+                    dask_filepath = hlg_layer(
+                        input_table.dask, "read-parquet"
+                    ).creation_info["args"][0]
                 dc.filepath = dask_filepath
                 self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath
             except KeyError:
                 logger.debug("Expected 'read-parquet' layer")

-        if parquet_statistics and not statistics:
+        if parquet_statistics and not dd._dask_expr_enabled() and not statistics:
             statistics = parquet_statistics(dc.df)
         if statistics:
             row_count = 0
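
For reference, a minimal sketch of the dask-expr branch added to create_table above: the parquet path is recovered by walking the expression graph for ReadParquet operations rather than inspecting HLG layers. The dataset below is hypothetical, and the `_args[0]` access simply mirrors what this PR does:

```python
import pandas as pd
import dask.dataframe as dd

if dd._dask_expr_enabled():
    from dask_expr.io.parquet import ReadParquet

    # Write a tiny hypothetical dataset so read_parquet has something to scan
    pd.DataFrame({"x": [1, 2, 3]}).to_parquet("example.parquet")
    ddf = dd.read_parquet("example.parquet")

    filepath = None
    # Walk the expression tree for ReadParquet nodes and take their path argument
    for op in ddf.find_operations(ReadParquet):
        filepath = op._args[0]
    print(filepath)  # expected: "example.parquet"
```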
6 changes: 3 additions & 3 deletions dask_sql/physical/rel/custom/wrappers.py
@@ -207,7 +207,7 @@ def transform(self, X):
                 estimator=self._postfit_estimator,
                 meta=output_meta,
             )
-        elif isinstance(X, dd._Frame):
+        elif isinstance(X, dd.DataFrame):
             if output_meta is None:
                 output_meta = _transform(X._meta_nonempty, self._postfit_estimator)
             try:
@@ -305,7 +305,7 @@ def predict(self, X):
             )
             return result

-        elif isinstance(X, dd._Frame):
+        elif isinstance(X, dd.DataFrame):
             if output_meta is None:
                 # dask-dataframe relies on dd.core.no_default
                 # for infering meta
@@ -364,7 +364,7 @@ def predict_proba(self, X):
                 meta=output_meta,
                 chunks=(X.chunks[0], len(self._postfit_estimator.classes_)),
             )
-        elif isinstance(X, dd._Frame):
+        elif isinstance(X, dd.DataFrame):
             if output_meta is None:
                 # dask-dataframe relies on dd.core.no_default
                 # for infering meta
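
The `dd._Frame` base class is not available when query planning (dask-expr) is enabled, hence the switch to the public collection classes above. A small sketch of the difference, with hypothetical data; note that `dd.DataFrame` does not match Series objects, so code that previously relied on `_Frame` for both may need a separate `dd.Series` check:

```python
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)

print(isinstance(ddf, dd.DataFrame))       # True
print(isinstance(ddf["x"], dd.DataFrame))  # False: a Series is not a DataFrame
print(isinstance(ddf["x"], dd.Series))     # True
```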
3 changes: 2 additions & 1 deletion dask_sql/physical/rel/logical/filter.py
@@ -38,7 +38,8 @@ def filter_or_scalar(
     # In SQL, a NULL in a boolean is False on filtering
     filter_condition = filter_condition.fillna(False)
     out = df[filter_condition]
-    if dask_config.get("sql.predicate_pushdown"):
+    # dask-expr should implicitly handle predicate pushdown
+    if dask_config.get("sql.predicate_pushdown") and not dd._dask_expr_enabled():
         return attempt_predicate_pushdown(out, add_filters=add_filters)
     else:
         return out
42 changes: 5 additions & 37 deletions dask_sql/physical/rel/logical/join.py
@@ -6,8 +6,6 @@

 import dask.dataframe as dd
 from dask import config as dask_config
-from dask.base import tokenize
-from dask.highlevelgraph import HighLevelGraph

 from dask_sql.datacontainer import ColumnContainer, DataContainer
 from dask_sql.physical.rel.base import BaseRelPlugin
@@ -132,41 +130,11 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer:
             # TODO: we should implement a shortcut
             # for filter conditions that are always false

-            def merge_single_partitions(lhs_partition, rhs_partition):
-                # Do a cross join with the two partitions
-                # TODO: it would be nice to apply the filter already here
-                # problem: this would mean we need to ship the rex to the
-                # workers (as this is executed on the workers),
-                # which is definitely not possible (java dependency, JVM start...)
-                lhs_partition = lhs_partition.assign(common=1)
-                rhs_partition = rhs_partition.assign(common=1)
-
-                return lhs_partition.merge(rhs_partition, on="common").drop(
-                    columns="common"
-                )
-
-            # Iterate nested over all partitions from lhs and rhs and merge them
-            name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
-            dsk = {
-                (name, i * df_rhs_renamed.npartitions + j): (
-                    merge_single_partitions,
-                    (df_lhs_renamed._name, i),
-                    (df_rhs_renamed._name, j),
-                )
-                for i in range(df_lhs_renamed.npartitions)
-                for j in range(df_rhs_renamed.npartitions)
-            }
-
-            graph = HighLevelGraph.from_collections(
-                name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed]
-            )
-
-            meta = dd.dispatch.concat(
-                [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty], axis=1
-            )
-            # TODO: Do we know the divisions in any way here?
-            divisions = [None] * (len(dsk) + 1)
-            df = dd.DataFrame(graph, name, meta=meta, divisions=divisions)
+            df = dd.merge(
+                df_lhs_renamed.assign(common=1),
+                df_rhs_renamed.assign(common=1),
+                on="common",
+            ).drop(columns="common")

             warnings.warn(
                 "Need to do a cross-join, which is typically very resource heavy",
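
The replacement above relies on the standard constant-key trick for cross joins instead of hand-building a HighLevelGraph, which keeps the logic inside the public dd.merge API and therefore works on both the legacy and dask-expr backends. A minimal sketch with hypothetical data:

```python
import pandas as pd
import dask.dataframe as dd

lhs = dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=1)
rhs = dd.from_pandas(pd.DataFrame({"b": ["x", "y"]}), npartitions=1)

# Cross join: tag both sides with a constant key, merge on it, drop the key
cross = dd.merge(
    lhs.assign(common=1),
    rhs.assign(common=1),
    on="common",
).drop(columns="common")

print(cross.compute())  # 2 x 2 = 4 rows
```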
5 changes: 5 additions & 0 deletions dask_sql/physical/rel/logical/limit.py
@@ -58,6 +58,7 @@ def _apply_limit(self, df: dd.DataFrame, limit: int, offset: int) -> dd.DataFrame:
         # check if the first partition contains our desired window
         if (
             dask_config.get("sql.limit.check-first-partition")
+            and not dd._dask_expr_enabled()
             and all(
                 [
                     isinstance(
@@ -79,6 +80,10 @@ def _apply_limit(self, df: dd.DataFrame, limit: int, offset: int) -> dd.DataFrame:
         def limit_partition_func(df, partition_borders, partition_info=None):
             """Limit the partition to values contained within the specified window, returning an empty dataframe if there are none"""

+            # with dask-expr we may need to explicitly compute here
+            if hasattr(partition_borders, "compute"):
+                partition_borders = partition_borders.compute()
+
             # TODO: remove the `cumsum` call here when dask#9067 is resolved
             partition_borders = partition_borders.cumsum().to_dict()
             partition_index = (
11 changes: 7 additions & 4 deletions dask_sql/physical/rel/logical/table_scan.py
@@ -3,6 +3,7 @@
 from functools import reduce
 from typing import TYPE_CHECKING

+from dask.dataframe import _dask_expr_enabled
 from dask.utils_test import hlg_layer

 from dask_sql.datacontainer import DataContainer
@@ -108,9 +109,11 @@ def _apply_filters(self, table_scan, rel, dc, context):
                 ],
             )
             df = filter_or_scalar(df, df_condition)
-            try:
-                logger.debug(hlg_layer(df.dask, "read-parquet").creation_info)
-            except KeyError:
-                pass
+
+            if not _dask_expr_enabled():
+                try:
+                    logger.debug(hlg_layer(df.dask, "read-parquet").creation_info)
+                except KeyError:
+                    pass

         return DataContainer(df, cc)
38 changes: 13 additions & 25 deletions dask_sql/physical/rex/core/call.py
@@ -11,9 +11,6 @@
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
-from dask.base import tokenize
-from dask.dataframe.core import Series
-from dask.highlevelgraph import HighLevelGraph
 from dask.utils import random_state_data

 from dask_sql._datafusion_lib import SqlTypeName
@@ -828,37 +825,28 @@ def random_function(self, partition, random_state, kwargs):

     def random_frame(self, seed: int, dc: DataContainer, **kwargs) -> dd.Series:
         """This function - in contrast to others in this module - will only ever be called on data frames"""
-
-        random_state = np.random.RandomState(seed=seed)
-
         # Idea taken from dask.DataFrame.sample:
         # initialize a random state for each of the partitions
        # separately and then create a random series
         # for each partition
         df = dc.df
-        name = "sample-" + tokenize(df, random_state)

-        state_data = random_state_data(df.npartitions, random_state)
-        dsk = {
-            (name, i): (
-                self.random_function,
-                (df._name, i),
-                np.random.RandomState(state),
-                kwargs,
-            )
-            for i, state in enumerate(state_data)
-        }
+        state_data = random_state_data(df.npartitions, np.random.RandomState(seed=seed))

-        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
-        random_series = Series(graph, name, ("random", "float64"), df.divisions)
+        def random_partition_func(df, state_data, partition_info=None):
+            """Create a random number for each partition"""
+            partition_index = (
+                partition_info["number"] if partition_info is not None else 0
+            )
+            state = np.random.RandomState(state_data[partition_index])
+            return self.random_function(df, state, kwargs)

-        # This part seems to be stupid, but helps us do a very simple
-        # task without going into the (private) internals of Dask:
-        # copy all meta information from the original input dataframe
-        # This is important so that the returned series looks
-        # exactly like coming from the input dataframe
-        return_df = df.assign(random=random_series)["random"]
-        return return_df
+        random_series = df.map_partitions(
+            random_partition_func, state_data, meta=("random", "float64")
+        )
+
+        return df.assign(random=random_series)["random"]


 class RandOperation(BaseRandomOperation):
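
The rewrite above swaps the hand-built graph for map_partitions with partition_info, so each partition gets its own deterministic seed without touching Dask internals. A standalone sketch of that pattern with hypothetical data (the per-row random series here stands in for dask-sql's random_function):

```python
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.utils import random_state_data

ddf = dd.from_pandas(pd.DataFrame({"x": range(6)}), npartitions=3)

# One independent seed per partition, derived from a single parent state
state_data = random_state_data(ddf.npartitions, np.random.RandomState(seed=42))

def random_partition(df, state_data, partition_info=None):
    """Return one random float per row, seeded by this partition's own state."""
    i = partition_info["number"] if partition_info is not None else 0
    state = np.random.RandomState(state_data[i])
    return pd.Series(state.random_sample(len(df)), index=df.index)

random_series = ddf.map_partitions(
    random_partition, state_data, meta=("random", "float64")
)
print(ddf.assign(random=random_series)["random"].compute())
```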
14 changes: 7 additions & 7 deletions dask_sql/physical/utils/filter.py
@@ -304,10 +304,10 @@ def combine(self, other: DNF | _And | _Or | list | tuple | None) -> DNF:
 # Specify functions that must be generated with
 # a different API at the dataframe-collection level
 _special_op_mappings = {
-    M.fillna: dd._Frame.fillna,
-    M.isin: dd._Frame.isin,
-    M.isna: dd._Frame.isna,
-    M.astype: dd._Frame.astype,
+    M.fillna: dd.DataFrame.fillna,
+    M.isin: dd.DataFrame.isin,
+    M.isna: dd.DataFrame.isna,
+    M.astype: dd.DataFrame.astype,
 }

 # Convert _pass_through_ops to respect "special" mappings
@@ -316,7 +316,7 @@

 def _preprocess_layers(input_layers):
     # NOTE: This is a Layer-specific work-around to deal with
-    # the fact that `dd._Frame.isin(values)` will add a distinct
+    # the fact that `dd.DataFrame.isin(values)` will add a distinct
     # `MaterializedLayer` for the `values` argument.
     # See: https://github.com/dask-contrib/dask-sql/issues/607
     skip = set()
@@ -418,9 +418,9 @@ def _dnf_filter_expression(self, dsk):
             func = _blockwise_logical_dnf
         elif op == operator.getitem:
             func = _blockwise_getitem_dnf
-        elif op == dd._Frame.isin:
+        elif op == dd.DataFrame.isin:
             func = _blockwise_isin_dnf
-        elif op == dd._Frame.isna:
+        elif op == dd.DataFrame.isna:
             func = _blockwise_isna_dnf
         elif op == operator.inv:
             func = _blockwise_inv_dnf
4 changes: 2 additions & 2 deletions docs/environment.yml
@@ -6,7 +6,7 @@ dependencies:
   - sphinx>=4.0.0
   - sphinx-tabs
   - dask-sphinx-theme>=2.0.3
-  - dask==2024.1.1
+  - dask>=2024.1.1
   - pandas>=1.4.0
   - fugue>=0.7.3
   # FIXME: https://github.com/fugue-project/fugue/issues/526
@@ -19,4 +19,4 @@ dependencies:
   - pygments>=2.7.1
   - tabulate
   - ucx-proc=*=cpu
-  - rust>=1.72
+  - rust>=1.73
2 changes: 1 addition & 1 deletion docs/requirements-docs.txt
@@ -1,7 +1,7 @@
 sphinx>=4.0.0
 sphinx-tabs
 dask-sphinx-theme>=3.0.0
-dask==2024.1.1
+dask>=2024.1.1
 pandas>=1.4.0
 fugue>=0.7.3
 # FIXME: https://github.com/fugue-project/fugue/issues/526