Skip to content

Commit

Permalink
Ensure query-planning is disabled in dask (#371)
Browse files Browse the repository at this point in the history
* disable query-planning when merlin.core is imported

* fix typo

* remove tests/__init__.py change

* disable string conversion

* move dask config settings/check to top level (only way to be sure we don't import dd anywhere before disabling these settings)

* clarify error message

* use new configs module for central config validation point

* add missing config module
  • Loading branch information
rjzamora authored Jul 26, 2024
1 parent 6a177d8 commit 2643923
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 4 deletions.
67 changes: 67 additions & 0 deletions merlin/config/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

_DASK_QUERY_PLANNING_ENABLED = False
try:
# Disable query-planning and string conversion
import dask

dask.config.set(
{
"dataframe.query-planning": False,
"dataframe.convert-string": False,
}
)
except ImportError:
dask = None
else:
import sys

import dask.dataframe as dd
from packaging.version import parse

if parse(dask.__version__) > parse("2024.6.0"):
# For newer versions of dask, we can just check
# the official DASK_EXPR_ENABLED constant
_DASK_QUERY_PLANNING_ENABLED = dd.DASK_EXPR_ENABLED
else:
# For older versions of dask, we must assume query
# planning is enabled if dask_expr was imported
# (because we can't know for sure)
_DASK_QUERY_PLANNING_ENABLED = "dask_expr" in sys.modules


def validate_dask_configs() -> None:
    """Central check for problematic config options in Dask.

    Raises
    ------
    NotImplementedError
        If the module-level ``_DASK_QUERY_PLANNING_ENABLED`` flag is set,
        or if Dask is available and its ``dataframe.convert-string``
        config option is currently truthy.
    """
    if _DASK_QUERY_PLANNING_ENABLED:
        raise NotImplementedError(
            "Merlin does not support the query-planning API in "
            "Dask Dataframe yet. Please make sure query-planning is "
            "disabled before dask.dataframe is imported.\n\ne.g. "
            "dask.config.set({'dataframe.query-planning': False})"
            "\n\nOr set the environment variable: "
            "export DASK_DATAFRAME__QUERY_PLANNING=False"
        )

    # ``dask`` is None when the import failed, in which case there is
    # no config to inspect.
    if dask is not None and dask.config.get("dataframe.convert-string"):
        raise NotImplementedError(
            "Merlin does not support automatic string conversion in "
            "Dask Dataframe yet. Please make sure this option is "
            "disabled.\n\ne.g. "
            "dask.config.set({'dataframe.convert-string': False})"
            "\n\nOr set the environment variable: "
            "export DASK_DATAFRAME__CONVERT_STRING=False"
        )
4 changes: 3 additions & 1 deletion merlin/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -14,6 +14,8 @@
# limitations under the License.
#

from merlin.config import validate_dask_configs
from merlin.core import _version

__version__ = _version.get_versions()["version"]
# Fail at import time if Dask is configured in a way Merlin does not support
# (validate_dask_configs raises NotImplementedError in that case).
validate_dask_configs()
6 changes: 5 additions & 1 deletion merlin/dag/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,10 @@
#
# flake8: noqa

from merlin.config import validate_dask_configs

# Run the Dask config check before the submodule imports below, so an
# unsupported configuration raises NotImplementedError first.
validate_dask_configs()

from merlin.dag.graph import Graph
from merlin.dag.node import Node, iter_nodes, postorder_iter_nodes, preorder_iter_nodes
from merlin.dag.operator import DataFormats, Operator, Supports
Expand Down
8 changes: 6 additions & 2 deletions merlin/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -13,8 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

# flake8: noqa

from merlin.config import validate_dask_configs

# Run the Dask config check before the submodule imports below, so an
# unsupported configuration raises NotImplementedError first.
validate_dask_configs()

from merlin.io import dataframe_iter, dataset, shuffle
from merlin.io.dataframe_iter import DataFrameIter
from merlin.io.dataset import MERLIN_METADATA_DIR_NAME, Dataset
Expand Down

0 comments on commit 2643923

Please sign in to comment.