diff --git a/nemo_curator/datasets/image_text_pair_dataset.py b/nemo_curator/datasets/image_text_pair_dataset.py index b580015ca..9151ee97c 100644 --- a/nemo_curator/datasets/image_text_pair_dataset.py +++ b/nemo_curator/datasets/image_text_pair_dataset.py @@ -79,7 +79,7 @@ def from_webdataset(cls, path: str, id_col: str): path (str): The path to the WebDataset-like format on disk or cloud storage. id_col (str): The column storing the unique identifier for each record. """ - metadata = dask_cudf.read_parquet(path) + metadata = dask_cudf.read_parquet(path, split_row_groups=False, blocksize=None) metadata = metadata.map_partitions(cls._sort_partition, id_col=id_col) tar_files = cls._get_tar_files(path) diff --git a/nemo_curator/image/classifiers/base.py b/nemo_curator/image/classifiers/base.py index 7ad9de01a..40b5215ed 100644 --- a/nemo_curator/image/classifiers/base.py +++ b/nemo_curator/image/classifiers/base.py @@ -17,9 +17,9 @@ import cudf import cupy as cp import torch +from crossfit.backend.cudf.series import create_list_series_from_1d_or_2d_ar from nemo_curator.datasets import ImageTextPairDataset -from nemo_curator.utils.cudf_utils import create_list_series_from_1d_or_2d_ar from nemo_curator.utils.distributed_utils import load_object_on_worker diff --git a/nemo_curator/image/embedders/base.py b/nemo_curator/image/embedders/base.py index d910e1700..428a61f43 100644 --- a/nemo_curator/image/embedders/base.py +++ b/nemo_curator/image/embedders/base.py @@ -16,11 +16,11 @@ import cupy as cp import torch +from crossfit.backend.cudf.series import create_list_series_from_1d_or_2d_ar from tqdm import tqdm from nemo_curator.datasets import ImageTextPairDataset from nemo_curator.image.classifiers import ImageClassifier -from nemo_curator.utils.cudf_utils import create_list_series_from_1d_or_2d_ar from nemo_curator.utils.distributed_utils import load_object_on_worker diff --git a/nemo_curator/utils/cudf_utils.py b/nemo_curator/utils/cudf_utils.py deleted file mode 100644 index 2ec6c1e25..000000000 --- a/nemo_curator/utils/cudf_utils.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import cudf -import cupy as cp -from cudf.core.column import as_column - - -@staticmethod -def create_list_series_from_1d_or_2d_ar(ar, index): - """ - Create a cudf list series from 2d arrays - """ - if len(ar.shape) == 1: - n_rows, *_ = ar.shape - n_cols = 1 - elif len(ar.shape) == 2: - n_rows, n_cols = ar.shape - else: - return RuntimeError(f"Unexpected input shape: {ar.shape}") - data = as_column(ar.flatten()) - offset_col = as_column( - cp.arange(start=0, stop=len(data) + 1, step=n_cols), dtype="int32" - ) - mask_col = cp.full(shape=n_rows, fill_value=cp.bool_(True)) - mask = cudf._lib.transform.bools_to_mask(as_column(mask_col)) - lc = cudf.core.column.ListColumn( - size=n_rows, - dtype=cudf.ListDtype(data.dtype), - mask=mask, - offset=0, - null_count=0, - children=(offset_col, data), - ) - - return cudf.Series(lc, index=index)