From 26439239625ce131a7e69758f3464ef3afd1f6da Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 26 Jul 2024 12:18:30 -0500 Subject: [PATCH] Ensure query-planning is disabled in dask (#371) * disable query-planning when merlin.core is imported * fix typo * remove tests/__init__.py change * disable string conversion * move dask config settings/check to top level (only way to be sure we don't import dd anywhere before disabling these settings) * clarify error message * use new configs module for central config validation point * add missing config module --- merlin/config/__init__.py | 67 +++++++++++++++++++++++++++++++++++++++ merlin/core/__init__.py | 4 ++- merlin/dag/__init__.py | 6 +++- merlin/io/__init__.py | 8 +++-- 4 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 merlin/config/__init__.py diff --git a/merlin/config/__init__.py b/merlin/config/__init__.py new file mode 100644 index 000000000..a1f37a010 --- /dev/null +++ b/merlin/config/__init__.py @@ -0,0 +1,67 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+_DASK_QUERY_PLANNING_ENABLED = False
+try:
+    # Disable query-planning and string conversion up front
+    import dask
+
+    dask.config.set(
+        {
+            "dataframe.query-planning": False,
+            "dataframe.convert-string": False,
+        }
+    )
+except ImportError:
+    dask = None
+else:
+    import sys
+
+    import dask.dataframe as dd
+    from packaging.version import parse
+
+    if parse(dask.__version__) > parse("2024.6.0"):
+        # For newer versions of dask, we can just check
+        # the official DASK_EXPR_ENABLED constant
+        _DASK_QUERY_PLANNING_ENABLED = dd.DASK_EXPR_ENABLED
+    else:
+        # For older versions of dask, we must assume query
+        # planning is enabled if dask_expr was imported
+        # (because we can't know for sure)
+        _DASK_QUERY_PLANNING_ENABLED = "dask_expr" in sys.modules
+
+
+def validate_dask_configs():
+    """Raise NotImplementedError if an unsupported Dask config option is enabled."""
+    if _DASK_QUERY_PLANNING_ENABLED:
+        raise NotImplementedError(
+            "Merlin does not support the query-planning API in "
+            "Dask Dataframe yet. Please make sure query-planning is "
+            "disabled before dask.dataframe is imported.\n\ne.g. "
+            "dask.config.set({'dataframe.query-planning': False})"
+            "\n\nOr set the environment variable: "
+            "export DASK_DATAFRAME__QUERY_PLANNING=False"
+        )
+
+    if dask is not None and dask.config.get("dataframe.convert-string"):
+        raise NotImplementedError(
+            "Merlin does not support automatic string conversion in "
+            "Dask Dataframe yet. Please make sure this option is "
+            "disabled.\n\ne.g. "
+            "dask.config.set({'dataframe.convert-string': False})"
+            "\n\nOr set the environment variable: "
+            "export DASK_DATAFRAME__CONVERT_STRING=False"
+        )
diff --git a/merlin/core/__init__.py b/merlin/core/__init__.py
index f35898e5d..0dda4f9d5 100644
--- a/merlin/core/__init__.py
+++ b/merlin/core/__init__.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ # limitations under the License. # +from merlin.config import validate_dask_configs from merlin.core import _version __version__ = _version.get_versions()["version"] +validate_dask_configs() diff --git a/merlin/dag/__init__.py b/merlin/dag/__init__.py index dca0c76dd..c668e8945 100644 --- a/merlin/dag/__init__.py +++ b/merlin/dag/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,10 @@ # # flake8: noqa +from merlin.config import validate_dask_configs + +validate_dask_configs() + from merlin.dag.graph import Graph from merlin.dag.node import Node, iter_nodes, postorder_iter_nodes, preorder_iter_nodes from merlin.dag.operator import DataFormats, Operator, Supports diff --git a/merlin/io/__init__.py b/merlin/io/__init__.py index ff4058c5a..851f5a558 100644 --- a/merlin/io/__init__.py +++ b/merlin/io/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # - # flake8: noqa + +from merlin.config import validate_dask_configs + +validate_dask_configs() + from merlin.io import dataframe_iter, dataset, shuffle from merlin.io.dataframe_iter import DataFrameIter from merlin.io.dataset import MERLIN_METADATA_DIR_NAME, Dataset