From 5068f0479ef962b5ba8aa6bb699c10b0d6ebb6ca Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 9 Apr 2024 10:32:45 -0700 Subject: [PATCH 01/11] Move GPU imports and make them optional Signed-off-by: Ayush Dattagupta --- nemo_curator/datasets/doc_dataset.py | 6 +- nemo_curator/gpu_deduplication/utils.py | 76 ------------------------- nemo_curator/modules/__init__.py | 26 +++++---- nemo_curator/modules/exact_dedup.py | 3 +- nemo_curator/modules/fuzzy_dedup.py | 17 ++++-- nemo_curator/utils/distributed_utils.py | 53 +++++++++++++---- nemo_curator/utils/gpu_utils.py | 16 ++++++ 7 files changed, 88 insertions(+), 109 deletions(-) diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index af45f290c..37592b188 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -13,7 +13,6 @@ # limitations under the License. import dask.dataframe as dd -import dask_cudf from nemo_curator.utils.distributed_utils import read_data, write_to_disk from nemo_curator.utils.file_utils import get_all_files_paths_under @@ -182,10 +181,7 @@ def _read_json_or_parquet( ) dfs.append(df) - if backend == "cudf": - raw_data = dask_cudf.concat(dfs, ignore_unknown_divisions=True) - else: - raw_data = dd.concat(dfs, ignore_unknown_divisions=True) + raw_data = dd.concat(dfs, ignore_unknown_divisions=True) elif isinstance(input_files, str): # Single file diff --git a/nemo_curator/gpu_deduplication/utils.py b/nemo_curator/gpu_deduplication/utils.py index ed69477be..f6faefe77 100644 --- a/nemo_curator/gpu_deduplication/utils.py +++ b/nemo_curator/gpu_deduplication/utils.py @@ -13,84 +13,8 @@ # limitations under the License. import argparse -import logging -import os -import socket -from contextlib import nullcontext from time import time -import cudf -from dask_cuda import LocalCUDACluster -from distributed import Client, performance_report - - -def create_logger(rank, log_file, name="logger", log_level=logging.INFO): - # Create the logger - logger = logging.getLogger(name) - logger.setLevel(log_level) - - myhost = socket.gethostname() - - extra = {"host": myhost, "rank": rank} - formatter = logging.Formatter( - "%(asctime)s | %(host)s | Rank %(rank)s | %(message)s" - ) - - # File handler for output - file_handler = logging.FileHandler(log_file, mode="a") - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - logger = logging.LoggerAdapter(logger, extra) - - return logger - - -# TODO: Remove below to use nemo_curator.distributed_utils.get_client -def get_client(args) -> Client: - if args.scheduler_address: - if args.scheduler_file: - raise ValueError( - "Only one of scheduler_address or scheduler_file can be provided" - ) - else: - return Client(address=args.scheduler_address, timeout="30s") - elif args.scheduler_file: - return Client(scheduler_file=args.scheduler_file, timeout="30s") - else: - extra_kwargs = ( - { - "enable_tcp_over_ucx": True, - "enable_nvlink": True, - "enable_infiniband": False, - "enable_rdmacm": False, - } - if args.nvlink_only and args.protocol == "ucx" - else {} - ) - - cluster = LocalCUDACluster( - rmm_pool_size=args.rmm_pool_size, - protocol=args.protocol, - rmm_async=True, - **extra_kwargs, - ) - return Client(cluster) - - -def performance_report_if(path=None, report_name="dask-profile.html"): - if path is not None: - return performance_report(os.path.join(path, report_name)) - else: - return nullcontext() - - -# TODO: Remove below to use nemo_curator.distributed_utils._enable_spilling -def enable_spilling(): - """ - Enables spilling to host memory for cudf - """ - cudf.set_option("spill", True) - def get_num_workers(client): """ diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index d7c099803..caf3dc5ad 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -22,27 +22,33 @@ from .add_id import AddId from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter -from .fuzzy_dedup import LSH, MinHash + from .meta import Sequential from .modify import Modify from .task import TaskDecontamination -# Pytorch related imports must come after all imports that require cugraph, -# because of context cleanup issues b/w pytorch and cugraph -# See this issue: https://github.com/rapidsai/cugraph/issues/2718 -from .distributed_data_classifier import DomainClassifier, QualityClassifier - __all__ = [ - "DomainClassifier", "ExactDuplicates", "Filter", - "LSH", - "MinHash", "Modify", - "QualityClassifier", "Score", "ScoreFilter", "Sequential", "TaskDecontamination", "AddId", ] + +# GPU packages +try: + from .fuzzy_dedup import LSH, MinHash + + __all__ += ["LSH", "MinHash"] +except ModuleNotFoundError: + pass + +# Pytorch related imports must come after all imports that require cugraph, +# because of context cleanup issues b/w pytorch and cugraph +# See this issue: https://github.com/rapidsai/cugraph/issues/2718 +from .distributed_data_classifier import DomainClassifier, QualityClassifier + +__all__ += ["DomainClassifier", "QualityClassifier"] diff --git a/nemo_curator/modules/exact_dedup.py b/nemo_curator/modules/exact_dedup.py index 5d960ac6e..2831f516f 100644 --- a/nemo_curator/modules/exact_dedup.py +++ b/nemo_curator/modules/exact_dedup.py @@ -28,7 +28,8 @@ from nemo_curator._compat import DASK_P2P_ERROR from nemo_curator.datasets import DocumentDataset -from nemo_curator.gpu_deduplication.utils import create_logger, performance_report_if +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import performance_report_if from nemo_curator.utils.gpu_utils import is_cudf_type diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index 3b0576058..1bb5aa0e3 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -22,9 +22,6 @@ from typing import List, Tuple, Union import cudf -import cugraph -import cugraph.dask as dcg -import cugraph.dask.comms.comms as Comms import cupy as cp import dask_cudf import numpy as np @@ -39,8 +36,12 @@ filter_text_rows_by_bucket_batch, merge_left_to_shuffled_right, ) -from nemo_curator.gpu_deduplication.utils import create_logger, performance_report_if -from nemo_curator.utils.distributed_utils import get_current_client, get_num_workers +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import ( + get_current_client, + get_num_workers, + performance_report_if, +) from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import ( convert_str_id_to_int, int_ids_to_str, @@ -1106,6 +1107,10 @@ def _run_connected_components( deduped_parsed_id_path, output_path, ): + import cugraph.dask as dcg + import cugraph.dask.comms.comms as Comms + from cugraph import MultiGraph + Comms.initialize(p2p=True) df = dask_cudf.read_parquet( deduped_encoded_jaccard_path, blocksize="1GB", aggregate_files=True @@ -1120,7 +1125,7 @@ def _run_connected_components( df = df[[self.left_id, self.right_id]].astype(np.int64) df = dask_cudf.concat([df, self_edge_df]) - G = cugraph.MultiGraph(directed=False) + G = MultiGraph(directed=False) G.from_dask_cudf_edgelist( df, source=self.left_id, destination=self.right_id, renumber=False ) diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index 71fa1cdca..33cbe1fa2 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -11,20 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import os os.environ["RAPIDS_NO_INITIALIZE"] = "1" import warnings +from contextlib import nullcontext from pathlib import Path from typing import Union -import cudf import dask.dataframe as dd -import dask_cudf import pandas as pd -from dask.distributed import Client, LocalCluster, get_worker -from dask_cuda import LocalCUDACluster +from dask.distributed import Client, LocalCluster, get_worker, performance_report + +from nemo_curator.utils.gpu_utils import ( + GPU_INSTALL_STRING, + is_cudf_type, + try_dask_cudf_import_and_raise, +) class DotDict: @@ -48,6 +53,12 @@ def start_dask_gpu_local_cluster(args) -> Client: GPUs present on the machine. """ + try: + from dask_cuda import LocalCUDACluster + except ModuleNotFoundError: + raise ModuleNotFoundError( + f"Starting a GPU cluster requires GPU dependencies. {GPU_INSTALL_STRING}" + ) # Setting conservative defaults # which should work across most systems @@ -166,6 +177,8 @@ def _enable_spilling(): i.e., computing on objects that occupy more memory than is available on the GPU. """ + import cudf + cudf.set_option("spill", True) @@ -184,6 +197,9 @@ def read_single_partition( A cudf DataFrame or a pandas DataFrame. """ + if backend == "cudf": + try_dask_cudf_import_and_raise("Backend=cudf requires GPU packages") + if filetype == "jsonl": read_kwargs = {"lines": True} if backend == "cudf": @@ -265,6 +281,9 @@ def read_data( A Dask-cuDF or a Dask-pandas DataFrame. """ + if backend == "cudf": + try_dask_cudf_import_and_raise("Backend=cudf requires GPU packages") + if file_type == "pickle": df = read_pandas_pickle(input_files[0], add_filename=add_filename) df = dd.from_pandas(df, npartitions=16) @@ -369,10 +388,12 @@ def single_partition_write_with_filename(df, output_file_dir, output_type="jsonl warnings.warn(f"Empty partition found") empty_partition = False - if isinstance(df, pd.DataFrame): - success_ser = pd.Series([empty_partition]) - else: + if is_cudf_type(df): + import cudf + success_ser = cudf.Series([empty_partition]) + else: + success_ser = pd.Series([empty_partition]) if empty_partition: filename = df.filename.iloc[0] @@ -425,10 +446,13 @@ def write_to_disk(df, output_file_dir, write_to_filename=False, output_type="jso ) if write_to_filename: - if isinstance(df, dd.DataFrame): - output_meta = pd.Series([True], dtype="bool") - else: + if is_cudf_type(df): + import cudf + output_meta = cudf.Series([True]) + else: + output_meta = pd.Series([True], dtype="bool") + os.makedirs(output_file_dir, exist_ok=True) output = df.map_partitions( single_partition_write_with_filename, @@ -440,7 +464,7 @@ def write_to_disk(df, output_file_dir, write_to_filename=False, output_type="jso output = output.compute() else: if output_type == "jsonl": - if isinstance(df, dask_cudf.DataFrame): + if is_cudf_type(df): # See open issue here: https://github.com/rapidsai/cudf/issues/15211 # df.to_json(output_file_dir, orient="records", lines=True, engine="cudf", force_ascii=False) df.to_json( @@ -521,3 +545,10 @@ def get_current_client(): return Client.current() except ValueError: return None + + +def performance_report_if(path=None, report_name="dask-profile.html"): + if path is not None: + return performance_report(os.path.join(path, report_name)) + else: + return nullcontext() diff --git a/nemo_curator/utils/gpu_utils.py b/nemo_curator/utils/gpu_utils.py index de1c23dfe..d231f13b5 100644 --- a/nemo_curator/utils/gpu_utils.py +++ b/nemo_curator/utils/gpu_utils.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda]` +or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda]` if installing from source""" + def is_cudf_type(obj): """ @@ -23,3 +26,16 @@ def is_cudf_type(obj): str(getattr(obj, "_meta", "")), ] return any("cudf" in obj_type for obj_type in types) + + +def try_dask_cudf_import_and_raise(message_prefix: str): + """ + Try to import cudf/dask-cudf and raise an error message on installing dependencies. + Optionally prepends msg + + """ + try: + import cudf + import dask_cudf + except ModuleNotFoundError: + raise ModuleNotFoundError(f"{message_prefix}. {GPU_INSTALL_STRING}") From 6c56352c078282dc55707bdce42f2afe63b1acb5 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Thu, 11 Apr 2024 11:29:16 -0700 Subject: [PATCH 02/11] Move gpu dependencies to a seperate install Signed-off-by: Ayush Dattagupta --- setup.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index b47ef5c95..8fc60e926 100644 --- a/setup.py +++ b/setup.py @@ -55,10 +55,6 @@ "comment_parser", "beautifulsoup4", "mwparserfromhell @ git+https://github.com/earwig/mwparserfromhell.git@0f89f44", - "cudf-cu12>=24.2", - "dask-cudf-cu12>=24.2", - "cugraph-cu12>=24.2", - "dask-cuda>=24.2", "spacy>=3.6.0, <4.0.0", "presidio-analyzer==2.2.351", "presidio-anonymizer==2.2.351", @@ -68,6 +64,15 @@ # due to this: https://github.com/miso-belica/jusText/issues/47 "lxml[html_clean]", ], + extras_require={ + "cuda12x": [ + "cudf-cu12>=24.2", + "dask-cudf-cu12>=24.2", + "cugraph-cu12>=24.2", + "dask-cuda>=24.2", + "spacy[cuda12x]>=3.6.0, <4.0.0", + ] + }, entry_points={ "console_scripts": [ "get_common_crawl_urls=nemo_curator.scripts.get_common_crawl_urls:console_script", From 48a4da24839face3cbfed3686feb1fe8d1217ba6 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Thu, 18 Apr 2024 22:30:49 -0700 Subject: [PATCH 03/11] Remove unused import Signed-off-by: Ayush Dattagupta --- nemo_curator/modules/fuzzy_dedup.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index 1bb5aa0e3..cd8196448 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -42,10 +42,7 @@ get_num_workers, performance_report_if, ) -from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import ( - convert_str_id_to_int, - int_ids_to_str, -) +from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import int_ids_to_str from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( aggregated_anchor_docs_with_bk_read, get_restart_offsets, From edd926b79e60309ad29e1e078563047d6882dc51 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 22 Apr 2024 11:50:39 -0700 Subject: [PATCH 04/11] Switch to placeholder import that raises on usage Signed-off-by: Ayush Dattagupta --- nemo_curator/modules/__init__.py | 31 +- nemo_curator/modules/fuzzy_dedup.py | 7 +- nemo_curator/utils/distributed_utils.py | 23 +- nemo_curator/utils/gpu_utils.py | 17 +- nemo_curator/utils/import_utils.py | 379 ++++++++++++++++++++++++ 5 files changed, 406 insertions(+), 51 deletions(-) create mode 100644 nemo_curator/utils/import_utils.py diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index caf3dc5ad..e857696d7 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -19,36 +19,35 @@ # See https://github.com/NVIDIA/NeMo-Curator/issues/31 os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" +from nemo_curator.utils.import_utils import gpu_only_import_from + from .add_id import AddId from .exact_dedup import ExactDuplicates from .filter import Filter, Score, ScoreFilter - from .meta import Sequential from .modify import Modify from .task import TaskDecontamination +# GPU packages +LSH = gpu_only_import_from(".fuzzy_dedup", "LSH") +MinHash = gpu_only_import_from(".fuzzy_dedup", "MinHash") + +# Pytorch related imports must come after all imports that require cugraph, +# because of context cleanup issues b/w pytorch and cugraph +# See this issue: https://github.com/rapidsai/cugraph/issues/2718 +from .distributed_data_classifier import DomainClassifier, QualityClassifier + __all__ = [ + "DomainClassifier", "ExactDuplicates", "Filter", + "LSH", + "MinHash", "Modify", + "QualityClassifier", "Score", "ScoreFilter", "Sequential", "TaskDecontamination", "AddId", ] - -# GPU packages -try: - from .fuzzy_dedup import LSH, MinHash - - __all__ += ["LSH", "MinHash"] -except ModuleNotFoundError: - pass - -# Pytorch related imports must come after all imports that require cugraph, -# because of context cleanup issues b/w pytorch and cugraph -# See this issue: https://github.com/rapidsai/cugraph/issues/2718 -from .distributed_data_classifier import DomainClassifier, QualityClassifier - -__all__ += ["DomainClassifier", "QualityClassifier"] diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index cd8196448..b51499678 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -22,9 +22,12 @@ from typing import List, Tuple, Union import cudf +import cugraph.dask as dcg +import cugraph.dask.comms.comms as Comms import cupy as cp import dask_cudf import numpy as np +from cugraph import MultiGraph from dask import dataframe as dd from dask.dataframe.shuffle import shuffle as dd_shuffle from dask.utils import M @@ -1104,10 +1107,6 @@ def _run_connected_components( deduped_parsed_id_path, output_path, ): - import cugraph.dask as dcg - import cugraph.dask.comms.comms as Comms - from cugraph import MultiGraph - Comms.initialize(p2p=True) df = dask_cudf.read_parquet( deduped_encoded_jaccard_path, blocksize="1GB", aggregate_files=True diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index 33cbe1fa2..2d7dc9213 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -25,11 +25,11 @@ import pandas as pd from dask.distributed import Client, LocalCluster, get_worker, performance_report -from nemo_curator.utils.gpu_utils import ( - GPU_INSTALL_STRING, - is_cudf_type, - try_dask_cudf_import_and_raise, -) +from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING, is_cudf_type +from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from + +cudf = gpu_only_import("cudf") +LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") class DotDict: @@ -53,13 +53,6 @@ def start_dask_gpu_local_cluster(args) -> Client: GPUs present on the machine. """ - try: - from dask_cuda import LocalCUDACluster - except ModuleNotFoundError: - raise ModuleNotFoundError( - f"Starting a GPU cluster requires GPU dependencies. {GPU_INSTALL_STRING}" - ) - # Setting conservative defaults # which should work across most systems nvlink_only = getattr(args, "nvlink_only", False) @@ -197,9 +190,6 @@ def read_single_partition( A cudf DataFrame or a pandas DataFrame. """ - if backend == "cudf": - try_dask_cudf_import_and_raise("Backend=cudf requires GPU packages") - if filetype == "jsonl": read_kwargs = {"lines": True} if backend == "cudf": @@ -282,7 +272,8 @@ def read_data( """ if backend == "cudf": - try_dask_cudf_import_and_raise("Backend=cudf requires GPU packages") + # Try using cuDF. If not availible will throw an error. + test_obj = cudf.Series if file_type == "pickle": df = read_pandas_pickle(input_files[0], add_filename=add_filename) diff --git a/nemo_curator/utils/gpu_utils.py b/nemo_curator/utils/gpu_utils.py index d231f13b5..7d11e2d7a 100644 --- a/nemo_curator/utils/gpu_utils.py +++ b/nemo_curator/utils/gpu_utils.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda]` -or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda]` if installing from source""" +GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda-12x]` +or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda-12x]"` if installing from source""" def is_cudf_type(obj): @@ -26,16 +26,3 @@ def is_cudf_type(obj): str(getattr(obj, "_meta", "")), ] return any("cudf" in obj_type for obj_type in types) - - -def try_dask_cudf_import_and_raise(message_prefix: str): - """ - Try to import cudf/dask-cudf and raise an error message on installing dependencies. - Optionally prepends msg - - """ - try: - import cudf - import dask_cudf - except ModuleNotFoundError: - raise ModuleNotFoundError(f"{message_prefix}. {GPU_INSTALL_STRING}") diff --git a/nemo_curator/utils/import_utils.py b/nemo_curator/utils/import_utils.py new file mode 100644 index 000000000..3ec3f3dee --- /dev/null +++ b/nemo_curator/utils/import_utils.py @@ -0,0 +1,379 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import importlib +import logging +import traceback +from contextlib import contextmanager + +from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING + +logger = logging.Logger("import_logger", level=logging.INFO) + + +class UnavailableError(Exception): + """Error thrown if a symbol is unavailable due to an issue importing it""" + + +@contextmanager +def null_decorator(*args, **kwargs): + if len(kwargs) == 0 and len(args) == 1 and callable(args[0]): + return args[0] + else: + + def inner(func): + return func + + return inner + + +class UnavailableMeta(type): + """A metaclass for generating placeholder objects for unavailable symbols + + This metaclass allows errors to be deferred from import time to the time + that a symbol is actually used in order to streamline the usage of optional + dependencies. This is particularly useful for attempted imports of GPU-only + modules which will only be invoked if GPU-only functionality is + specifically used. + + If an attempt to import a symbol fails, this metaclass is used to generate + a class which stands in for that symbol. Any attempt to call the symbol + (instantiate the class) or access its attributes will throw an + UnavailableError exception. Furthermore, this class can be used in + e.g. isinstance checks, since it will (correctly) fail to match any + instance it is compared against. + + In addition to calls and attribute access, a number of dunder methods are + implemented so that other common usages of imported symbols (e.g. + arithmetic) throw an UnavailableError, but this is not guaranteed for + all possible uses. In such cases, other exception types (typically + TypeErrors) will be thrown instead. + """ + + def __new__(meta, name, bases, dct): + if dct.get("_msg", None) is None: + dct["_msg"] = f"{name} could not be imported" + name = f"MISSING{name}" + return super(UnavailableMeta, meta).__new__(meta, name, bases, dct) + + def __call__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __getattr__(cls, name): + raise UnavailableError(cls._msg) + + def __eq__(cls, other): + raise UnavailableError(cls._msg) + + def __lt__(cls, other): + raise UnavailableError(cls._msg) + + def __gt__(cls, other): + raise UnavailableError(cls._msg) + + def __ne__(cls, other): + raise UnavailableError(cls._msg) + + def __abs__(cls, other): + raise UnavailableError(cls._msg) + + def __add__(cls, other): + raise UnavailableError(cls._msg) + + def __radd__(cls, other): + raise UnavailableError(cls._msg) + + def __iadd__(cls, other): + raise UnavailableError(cls._msg) + + def __floordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __rfloordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __ifloordiv__(cls, other): + raise UnavailableError(cls._msg) + + def __lshift__(cls, other): + raise UnavailableError(cls._msg) + + def __rlshift__(cls, other): + raise UnavailableError(cls._msg) + + def __mul__(cls, other): + raise UnavailableError(cls._msg) + + def __rmul__(cls, other): + raise UnavailableError(cls._msg) + + def __imul__(cls, other): + raise UnavailableError(cls._msg) + + def __ilshift__(cls, other): + raise UnavailableError(cls._msg) + + def __pow__(cls, other): + raise UnavailableError(cls._msg) + + def __rpow__(cls, other): + raise UnavailableError(cls._msg) + + def __ipow__(cls, other): + raise UnavailableError(cls._msg) + + def __rshift__(cls, other): + raise UnavailableError(cls._msg) + + def __rrshift__(cls, other): + raise UnavailableError(cls._msg) + + def __irshift__(cls, other): + raise UnavailableError(cls._msg) + + def __sub__(cls, other): + raise UnavailableError(cls._msg) + + def __rsub__(cls, other): + raise UnavailableError(cls._msg) + + def __isub__(cls, other): + raise UnavailableError(cls._msg) + + def __truediv__(cls, other): + raise UnavailableError(cls._msg) + + def __rtruediv__(cls, other): + raise UnavailableError(cls._msg) + + def __itruediv__(cls, other): + raise UnavailableError(cls._msg) + + def __divmod__(cls, other): + raise UnavailableError(cls._msg) + + def __rdivmod__(cls, other): + raise UnavailableError(cls._msg) + + def __neg__(cls): + raise UnavailableError(cls._msg) + + def __invert__(cls): + raise UnavailableError(cls._msg) + + def __hash__(cls): + raise UnavailableError(cls._msg) + + def __index__(cls): + raise UnavailableError(cls._msg) + + def __iter__(cls): + raise UnavailableError(cls._msg) + + def __delitem__(cls, name): + raise UnavailableError(cls._msg) + + def __setitem__(cls, name, value): + raise UnavailableError(cls._msg) + + def __enter__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __get__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __delete__(cls, *args, **kwargs): + raise UnavailableError(cls._msg) + + def __len__(cls): + raise UnavailableError(cls._msg) + + +def is_unavailable(obj): + """Helper to check if given symbol is actually a placeholder""" + return type(obj) is UnavailableMeta + + +class UnavailableNullContext: + """A placeholder class for unavailable context managers + + This context manager will return a value which will throw an + UnavailableError if used in any way, but the context manager itself can be + safely invoked. + """ + + def __init__(self, *args, **kwargs): + pass + + def __enter__(self): + return UnavailableMeta( + "MissingContextValue", + (), + {"_msg": "Attempted to make use of placeholder context return value."}, + ) + + def __exit__(self, *args, **kwargs): + pass + + +def safe_import(module, *, msg=None, alt=None): + """A function used to import modules that may not be available + + This function will attempt to import a module with the given name, but it + will not throw an ImportError if the module is not found. Instead, it will + return a placeholder object which will raise an exception only if used. + + Parameters + ---------- + module: str + The name of the module to import. + msg: str or None + An optional error message to be displayed if this module is used + after a failed import. + alt: object + An optional module to be used in place of the given module if it + fails to import + + Returns + ------- + object + The imported module, the given alternate, or a class derived from + UnavailableMeta. + """ + try: + return importlib.import_module(module) + except ImportError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {module} failed with: {exception_text}") + except Exception: + exception_text = traceback.format_exc() + logger.info(f"Import of {module} failed with: {exception_text}") + if msg is None: + msg = f"{module} could not be imported" + if alt is None: + return UnavailableMeta(module.rsplit(".")[-1], (), {"_msg": msg}) + else: + return alt + + +def safe_import_from(module, symbol, *, msg=None, alt=None): + """A function used to import symbols from modules that may not be available + + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used. + + Parameters + ---------- + module: str + The name of the module in which the symbol is defined. + symbol: str + The name of the symbol to import. + msg: str or None + An optional error message to be displayed if this symbol is used + after a failed import. + alt: object + An optional object to be used in place of the given symbol if it fails + to import + + Returns + ------- + object + The imported symbol, the given alternate, or a class derived from + UnavailableMeta. + """ + try: + imported_module = importlib.import_module(module) + return getattr(imported_module, symbol) + except ImportError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {module} failed with: {exception_text}") + except AttributeError: + exception_text = traceback.format_exc() + logger.debug(f"Import of {symbol} from {module} failed with: {exception_text}") + except Exception: + exception_text = traceback.format_exc() + logger.info(f"Import of {module}.{symbol} failed with: {exception_text}") + if msg is None: + msg = f"{module}.{symbol} could not be imported" + if alt is None: + return UnavailableMeta(symbol, (), {"_msg": msg}) + else: + return alt + + +def gpu_only_import(module, *, alt=None): + """A function used to import modules required only in GPU installs + + This function will attempt to import a module with the given name. + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used with instructions on installing a GPU build. + + Parameters + ---------- + module: str + The name of the module to import. + alt: object + An optional module to be used in place of the given module if it + fails to import in a non-GPU-enabled install + + Returns + ------- + object + The imported module, the given alternate, or a class derived from + UnavailableMeta. + """ + + return safe_import( + module, + msg=f"{module} is not installed in non GPU-enabled installations. {GPU_INSTALL_STRING}", + alt=alt, + ) + + +def gpu_only_import_from(module, symbol, *, alt=None): + """A function used to import symbols required only in GPU installs + + This function will attempt to import a module with the given name. + This function will attempt to import a symbol with the given name from + the given module, but it will not throw an ImportError if the symbol is not + found. Instead, it will return a placeholder object which will raise an + exception only if used with instructions on installing a GPU build. + + Parameters + ---------- + module: str + The name of the module to import. + symbol: str + The name of the symbol to import. + alt: object + An optional object to be used in place of the given symbol if it fails + to import in a non-GPU-enabled install + + Returns + ------- + object + The imported symbol, the given alternate, or a class derived from + UnavailableMeta. + """ + return safe_import_from( + module, + symbol, + msg=f"{module}.{symbol} is not installed in non GPU-enabled installations. {GPU_INSTALL_STRING}", + alt=alt, + ) From 3bed23dfe7c391afc897e10a64347a024d734c17 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 22 Apr 2024 12:09:18 -0700 Subject: [PATCH 05/11] Remove deprecated utils usage Signed-off-by: Ayush Dattagupta --- nemo_curator/scripts/compute_minhashes.py | 9 +++++---- nemo_curator/scripts/connected_components.py | 7 ++++--- nemo_curator/scripts/find_exact_duplicates.py | 3 ++- nemo_curator/scripts/jaccard_compute.py | 8 ++++---- nemo_curator/scripts/jaccard_shuffle.py | 9 +++------ nemo_curator/scripts/map_buckets.py | 9 +++------ nemo_curator/scripts/minhash_lsh.py | 3 ++- 7 files changed, 23 insertions(+), 25 deletions(-) diff --git a/nemo_curator/scripts/compute_minhashes.py b/nemo_curator/scripts/compute_minhashes.py index c7a7e68b2..044653ceb 100644 --- a/nemo_curator/scripts/compute_minhashes.py +++ b/nemo_curator/scripts/compute_minhashes.py @@ -18,12 +18,13 @@ from nemo_curator import MinHash from nemo_curator.datasets import DocumentDataset from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import ( - create_logger, - parse_nc_args, +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger +from nemo_curator.utils.distributed_utils import ( + get_client, performance_report_if, + read_data, ) -from nemo_curator.utils.distributed_utils import get_client, read_data from nemo_curator.utils.file_utils import get_all_files_paths_under diff --git a/nemo_curator/scripts/connected_components.py b/nemo_curator/scripts/connected_components.py index 1ab1282af..c04f0349d 100644 --- a/nemo_curator/scripts/connected_components.py +++ b/nemo_curator/scripts/connected_components.py @@ -15,7 +15,7 @@ import os import time -from nemo_curator.gpu_deduplication.utils import enable_spilling, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import ConnectedComponents from nemo_curator.utils.distributed_utils import get_client @@ -32,9 +32,10 @@ def main(args): st = time.time() output_path = os.path.join(args.output_dir, "connected_components.parquet") args.set_torch_to_use_rmm = False + args.enable_spilling = True + client = get_client(args, cluster_type="gpu") - enable_spilling() - client.run(enable_spilling) + components_stage = ConnectedComponents( cache_dir=args.cache_dir, jaccard_pairs_path=args.jaccard_pairs_path, diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py index 7da01ea8e..16173861d 100644 --- a/nemo_curator/scripts/find_exact_duplicates.py +++ b/nemo_curator/scripts/find_exact_duplicates.py @@ -19,7 +19,8 @@ from nemo_curator.datasets import DocumentDataset from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep -from nemo_curator.gpu_deduplication.utils import create_logger, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger from nemo_curator.modules import ExactDuplicates from nemo_curator.utils.distributed_utils import get_client, read_data from nemo_curator.utils.file_utils import get_all_files_paths_under diff --git a/nemo_curator/scripts/jaccard_compute.py b/nemo_curator/scripts/jaccard_compute.py index f59157164..d16e95654 100644 --- a/nemo_curator/scripts/jaccard_compute.py +++ b/nemo_curator/scripts/jaccard_compute.py @@ -15,13 +15,13 @@ import os import time -from nemo_curator.gpu_deduplication.utils import enable_spilling, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity from nemo_curator.utils.distributed_utils import get_client, get_num_workers def main(args): - description = """Computes the Jaccard similarity between document pairs + """Computes the Jaccard similarity between document pairs from partitioned parquet dataset. Result is a parquet dataset consiting of document id pair along with their Jaccard similarity score. """ @@ -30,9 +30,9 @@ def main(args): output_final_results_path = os.path.join( OUTPUT_PATH, "jaccard_similarity_results.parquet" ) + args.enable_spilling = True client = get_client(args, "gpu") - enable_spilling() - client.run(enable_spilling) + print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) print("Running jaccard compute script", flush=True) diff --git a/nemo_curator/scripts/jaccard_shuffle.py b/nemo_curator/scripts/jaccard_shuffle.py index dc5d20f9b..c01935a61 100644 --- a/nemo_curator/scripts/jaccard_shuffle.py +++ b/nemo_curator/scripts/jaccard_shuffle.py @@ -15,12 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, -) +from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _Shuffle +from nemo_curator.utils.distributed_utils import get_client from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_text_ddf_from_json_path_with_blocksize, ) @@ -38,7 +35,7 @@ def main(args): OUTPUT_PATH = args.output_dir output_shuffled_docs_path = os.path.join(OUTPUT_PATH, "shuffled_docs.parquet") - client = get_client(args) + client = get_client(args, "gpu") client.run(func) print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) diff --git a/nemo_curator/scripts/map_buckets.py b/nemo_curator/scripts/map_buckets.py index 522e4f417..9e3f71a51 100644 --- a/nemo_curator/scripts/map_buckets.py +++ b/nemo_curator/scripts/map_buckets.py @@ -15,12 +15,9 @@ import os import time -from nemo_curator.gpu_deduplication.utils import ( - get_client, - get_num_workers, - parse_nc_args, -) +from nemo_curator.gpu_deduplication.utils import get_num_workers, parse_nc_args from nemo_curator.modules.fuzzy_dedup import _MapBuckets +from nemo_curator.utils.distributed_utils import get_client from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( get_bucket_ddf_from_parquet_path, get_text_ddf_from_json_path_with_blocksize, @@ -157,7 +154,7 @@ def main(args): output_anchor_docs_with_bk_path = os.path.join( OUTPUT_PATH, "anchor_docs_with_bk.parquet" ) - client = get_client(args) + client = get_client(args, "gpu") print(f"Num Workers = {get_num_workers(client)}", flush=True) print("Connected to dask cluster", flush=True) print("Running jaccard map buckets script", flush=True) diff --git a/nemo_curator/scripts/minhash_lsh.py b/nemo_curator/scripts/minhash_lsh.py index fb2c6a90d..ec206dc10 100644 --- a/nemo_curator/scripts/minhash_lsh.py +++ b/nemo_curator/scripts/minhash_lsh.py @@ -24,7 +24,8 @@ from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( convert_str_id_to_int, ) -from nemo_curator.gpu_deduplication.utils import create_logger, parse_nc_args +from nemo_curator.gpu_deduplication.utils import parse_nc_args +from nemo_curator.log import create_logger from nemo_curator.utils.distributed_utils import get_client From 754f3e408fb81f2a6a9920ee17bb6ce8c2511982 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 22 Apr 2024 12:13:20 -0700 Subject: [PATCH 06/11] Add cuML attribution Signed-off-by: Ayush Dattagupta --- nemo_curator/utils/import_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo_curator/utils/import_utils.py b/nemo_curator/utils/import_utils.py index 3ec3f3dee..eb4a3165a 100644 --- a/nemo_curator/utils/import_utils.py +++ b/nemo_curator/utils/import_utils.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# This file is adapted from cuML's safe_imports module: +# https://github.com/rapidsai/cuml/blob/e93166ea0dddfa8ef2f68c6335012af4420bc8ac/python/cuml/internals/safe_imports.py + import importlib import logging From d080e6ed5f5802a8b4fc57406295a0ab0437eaa6 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 22 Apr 2024 12:23:43 -0700 Subject: [PATCH 07/11] Safe import tests, improve install instruction, update gha workflow Signed-off-by: Ayush Dattagupta --- .github/workflows/test.yml | 5 ++--- nemo_curator/utils/gpu_utils.py | 4 ++-- tests/test_fuzzy_dedup.py | 6 ++++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d179a2a57..baa968f47 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,9 +40,8 @@ jobs: # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94 run: | pip install wheel cython - pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com . + pip install --no-cache-dir . pip install pytest - name: Run tests - # TODO: Remove env variable when gpu dependencies are optional run: | - RAPIDS_NO_INITIALIZE=1 python -m pytest -v --cpu + python -m pytest -v --cpu diff --git a/nemo_curator/utils/gpu_utils.py b/nemo_curator/utils/gpu_utils.py index 7d11e2d7a..86ba888fc 100644 --- a/nemo_curator/utils/gpu_utils.py +++ b/nemo_curator/utils/gpu_utils.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda-12x]` -or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda-12x]"` if installing from source""" +GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda12x]` +or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda12x]"` if installing from source""" def is_cudf_type(obj): diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index 3c6a32754..a1acb901f 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -16,14 +16,16 @@ from itertools import combinations from typing import Iterable -import cudf -import dask_cudf import numpy as np import pytest from dask.dataframe.utils import assert_eq from nemo_curator.datasets import DocumentDataset from nemo_curator.modules import LSH, MinHash +from nemo_curator.utils.import_utils import gpu_only_import + +cudf = gpu_only_import("cudf") +dask_cudf = gpu_only_import("dask_cudf") @pytest.fixture From 7d68763c24905e90a630762265aaec347ca9c883 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 22 Apr 2024 18:53:59 -0700 Subject: [PATCH 08/11] Fix pytests due to loc bug Signed-off-by: Ayush Dattagupta --- tests/test_filters.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_filters.py b/tests/test_filters.py index 11bf57388..4ab11c21a 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -149,7 +149,9 @@ def test_retain_score_filter(self, letter_count_data): filtered_data = filter_step(letter_count_data) expected_indices = [2, 3] - expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + # Compute before loc due to https://github.com/dask/dask-expr/issues/1036 + expected_data = letter_count_data.df.compute().loc[expected_indices] + expected_data = DocumentDataset(dd.from_pandas(expected_data, 2)) expected_data.df[score_field] = pd.Series([5, 7], index=expected_data.df.index) assert all_equal( expected_data, filtered_data @@ -168,7 +170,9 @@ def test_filter(self, letter_count_data): filtered_data = filter_step(scored_data) expected_indices = [2, 3] - expected_data = letter_count_data.df.loc[expected_indices] + # Compute before loc due to https://github.com/dask/dask-expr/issues/1036 + expected_data = letter_count_data.df.compute().loc[expected_indices] + expected_data = dd.from_pandas(expected_data, 2) expected_data[score_field] = pd.Series([5, 7], index=expected_data.index) expected_data = DocumentDataset(expected_data) assert all_equal( From f520fb9d35fdabc828d43ec32b9943e481826fec Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 22 Apr 2024 20:01:46 -0700 Subject: [PATCH 09/11] update install instructions Signed-off-by: Ayush Dattagupta --- README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index eb8c37abe..a17a573eb 100644 --- a/README.md +++ b/README.md @@ -37,12 +37,20 @@ These modules are designed to be flexible and allow for reordering with few exce ## Installation -NeMo Curator currently requires Python 3.10 and a GPU with CUDA 12 or above installed in order to be used. +NeMo Curator currently requires Python 3.10 and the GPU accelerated modules require CUDA 12 or above installed in order to be used. -NeMo Curator can be installed manually by cloning the repository and installing as follows: +NeMo Curator can be installed manually by cloning the repository and installing as follows - + +For CPU only modules: +``` +pip install . ``` -pip install --extra-index-url https://pypi.nvidia.com . + +For CPU + CUDA accelerated modules ``` +pip install --extra-index-url https://pypi.nvidia.com ".[cuda12x]" +``` + ### NeMo Framework Container NeMo Curator is available in the [NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo). The NeMo Framework Container provides an end-to-end platform for development of custom generative AI models anywhere. The latest release of NeMo Curator comes preinstalled in the container. From f18444f659f9096fe1bb9dae9c59f58111bd9df5 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Mon, 22 Apr 2024 20:02:28 -0700 Subject: [PATCH 10/11] Raise on non module-not-found errors, update logging Signed-off-by: Ayush Dattagupta --- nemo_curator/modules/__init__.py | 4 ++-- nemo_curator/utils/import_utils.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index e857696d7..434ebecf4 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -29,8 +29,8 @@ from .task import TaskDecontamination # GPU packages -LSH = gpu_only_import_from(".fuzzy_dedup", "LSH") -MinHash = gpu_only_import_from(".fuzzy_dedup", "MinHash") +LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "LSH") +MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "MinHash") # Pytorch related imports must come after all imports that require cugraph, # because of context cleanup issues b/w pytorch and cugraph diff --git a/nemo_curator/utils/import_utils.py b/nemo_curator/utils/import_utils.py index eb4a3165a..8b2308e9b 100644 --- a/nemo_curator/utils/import_utils.py +++ b/nemo_curator/utils/import_utils.py @@ -23,7 +23,8 @@ from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING -logger = logging.Logger("import_logger", level=logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("import_logger") class UnavailableError(Exception): @@ -235,7 +236,7 @@ def safe_import(module, *, msg=None, alt=None): """A function used to import modules that may not be available This function will attempt to import a module with the given name, but it - will not throw an ImportError if the module is not found. Instead, it will + will not throw an ModuleNotFoundError if the module is not found. Instead, it will return a placeholder object which will raise an exception only if used. Parameters @@ -257,12 +258,12 @@ def safe_import(module, *, msg=None, alt=None): """ try: return importlib.import_module(module) - except ImportError: + except ModuleNotFoundError: exception_text = traceback.format_exc() logger.debug(f"Import of {module} failed with: {exception_text}") except Exception: exception_text = traceback.format_exc() - logger.info(f"Import of {module} failed with: {exception_text}") + raise if msg is None: msg = f"{module} could not be imported" if alt is None: @@ -301,15 +302,15 @@ def safe_import_from(module, symbol, *, msg=None, alt=None): try: imported_module = importlib.import_module(module) return getattr(imported_module, symbol) - except ImportError: + except ModuleNotFoundError: exception_text = traceback.format_exc() logger.debug(f"Import of {module} failed with: {exception_text}") except AttributeError: exception_text = traceback.format_exc() - logger.debug(f"Import of {symbol} from {module} failed with: {exception_text}") + logger.info(f"Import of {symbol} from {module} failed with: {exception_text}") except Exception: exception_text = traceback.format_exc() - logger.info(f"Import of {module}.{symbol} failed with: {exception_text}") + raise if msg is None: msg = f"{module}.{symbol} could not be imported" if alt is None: From f508c30f0170fd038645e81d695d252738a5b9e2 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Tue, 23 Apr 2024 11:29:20 -0700 Subject: [PATCH 11/11] Update logging to not change root logger Signed-off-by: Ayush Dattagupta --- nemo_curator/utils/import_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo_curator/utils/import_utils.py b/nemo_curator/utils/import_utils.py index 8b2308e9b..ea78e4597 100644 --- a/nemo_curator/utils/import_utils.py +++ b/nemo_curator/utils/import_utils.py @@ -23,8 +23,9 @@ from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger("import_logger") +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(logging.StreamHandler()) class UnavailableError(Exception):