Merge branch 'main' into cleanup_run_classifier_helper
sarahyurick authored Jan 3, 2025
2 parents 498910d + 7dfb21a commit 5377c19
Showing 24 changed files with 128 additions and 95 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/release.yml
@@ -25,10 +25,13 @@ on:
        required: true
        default: true
        type: boolean
+      version-bump-branch:
+        type: string
+        required: true
+        description: Branch to target for version bump
jobs:
  release:
-    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.17.4
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.4
    with:
      release-ref: ${{ inputs.release-ref }}
      image-name: nemo_curator_container
@@ -43,6 +46,7 @@ jobs:
      container-workdir: /opt/NeMo-Curator
      library-name: NeMo Curator
      dry-run: ${{ inputs.dry-run }}
+      version-bump-branch: ${{ inputs.version-bump-branch }}
    secrets:
      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,17 @@
# Changelog

+## NeMo Curator 0.6.0
+
+- Synthetic Data Generation for Text Retrieval
+  - LLM-based Filters
+    - Easiness
+    - Answerability
+  - Q&A Retrieval Generation Pipeline
+- Parallel Dataset Curation for Machine Translation
+  - Load/Write Bitext Files
+  - Heuristic filtering (Histogram, Length Ratio)
+  - Classifier filtering (Comet, Cometoid)
+
## NeMo Curator 0.5.0

### Highlights
13 changes: 9 additions & 4 deletions nemo_curator/_compat.py
@@ -39,10 +39,15 @@
except (ImportError, TypeError):
    CURRENT_CUDF_VERSION = parse_version("24.10.0")

-# TODO remove this once 24.12.0 becomes the base version of cudf in nemo-curator
-MINHASH_PERMUTED_AVAILABLE = CURRENT_CUDF_VERSION >= parse_version("24.12.0") or (
-    CURRENT_CUDF_VERSION.is_prerelease
-    and CURRENT_CUDF_VERSION.base_version >= "24.12.0"
+# TODO remove this once 25.02 becomes the base version of cudf in nemo-curator
+
+# minhash in < 24.12 used to have a minhash(txt) api which was deprecated in favor of
+# minhash(a, b) in 25.02 (in 24.12, minhash_permuted(a, b) was introduced)
+MINHASH_DEPRECATED_API = (
+    CURRENT_CUDF_VERSION.base_version < parse_version("24.12").base_version
)
+MINHASH_PERMUTED_AVAILABLE = (CURRENT_CUDF_VERSION.major == 24) & (
+    CURRENT_CUDF_VERSION.minor == 12
+)

# TODO: remove when dask min version gets bumped
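To make the new gates concrete, here is a self-contained sketch that re-runs the same two comparisons against a handful of hypothetical cuDF version strings (the list is illustrative, not from the commit; only the packaging library is required). Note that the base_version comparison is lexicographic on strings, which holds up for cuDF's calendar-style versions:

from packaging.version import parse as parse_version

# Hypothetical installed cuDF versions, for illustration only.
for candidate in ["24.10.0", "24.12.0a1", "24.12.1", "25.02.0"]:
    current = parse_version(candidate)
    # Below a 24.12 base version, only the deprecated minhash(txt) API exists.
    minhash_deprecated_api = (
        current.base_version < parse_version("24.12").base_version
    )
    # minhash_permuted(a, b) shipped only in the 24.12 series.
    minhash_permuted_available = (current.major == 24) and (current.minor == 12)
    print(candidate, minhash_deprecated_api, minhash_permuted_available)

# Prints, in order: True/False, False/True, False/True, False/False.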
6 changes: 3 additions & 3 deletions nemo_curator/datasets/doc_dataset.py
@@ -64,7 +64,7 @@ def read_json(
            input_files: The path of the input file(s).
            backend: The backend to use for reading the data.
            files_per_partition: The number of files to read per partition.
-            add_filename: Whether to add a "filename" column to the DataFrame.
+            add_filename: Whether to add a "file_name" column to the DataFrame.
            input_meta: A dictionary or a string formatted as a dictionary, which outlines
                the field names and their respective data types within the JSONL input file.
            columns: If not None, only these columns will be read from the file.
@@ -102,7 +102,7 @@ def read_parquet(
            input_files: The path of the input file(s).
            backend: The backend to use for reading the data.
            files_per_partition: The number of files to read per partition.
-            add_filename: Whether to add a "filename" column to the DataFrame.
+            add_filename: Whether to add a "file_name" column to the DataFrame.
            columns: If not None, only these columns will be read from the file.
                There is a significant performance gain when specifying columns for Parquet files.
@@ -135,7 +135,7 @@ def read_pickle(
            input_files: The path of the input file(s).
            backend: The backend to use for reading the data.
            files_per_partition: The number of files to read per partition.
-            add_filename: Whether to add a "filename" column to the DataFrame.
+            add_filename: Whether to add a "file_name" column to the DataFrame.
            columns: If not None, only these columns will be read from the file.
        """
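Since several readers gain the renamed column, a short usage sketch may help; the path below is a placeholder, and the pandas backend is chosen only to keep the example CPU-friendly:

from nemo_curator.datasets import DocumentDataset

# Placeholder path; any directory of JSONL files works the same way.
dataset = DocumentDataset.read_json(
    "/path/to/jsonl/dir",
    backend="pandas",
    add_filename=True,  # now adds a "file_name" column (was "filename")
)
print(dataset.df.columns)  # expect "file_name" among the columns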
4 changes: 2 additions & 2 deletions nemo_curator/datasets/parallel_dataset.py
@@ -129,7 +129,7 @@ def read_single_simple_bitext_file_pair(
            tgt_lang (str): Target language, in ISO-639-1 (two character) format (e.g. 'en')
            doc_id (str, optional): A string document id to assign to every segment in the file. Defaults to None.
            backend (str, optional): Backend of the data frame. Defaults to "cudf".
-            add_filename (bool, optional): Add filename as an extra field to every segment in the file. Defaults to False.
+            add_filename (bool, optional): Add "file_name" as an extra field to every segment in the file. Defaults to False.
        Returns:
            Union[dd.DataFrame, dask_cudf.DataFrame]
@@ -162,6 +162,6 @@ def read_single_simple_bitext_file_pair(
        df_combined["tgt_lang"] = tgt_lang

        if add_filename:
-            df_combined["filename"] = remove_path_extension(src_input_file)
+            df_combined["file_name"] = remove_path_extension(src_input_file)

        return df_combined
2 changes: 1 addition & 1 deletion nemo_curator/download/arxiv.py
@@ -403,7 +403,7 @@ def download_arxiv(
        "text": str,
        "id": str,
        "source_id": str,
-        "filename": str,
+        "file_name": str,
    }
    dataset = download_and_extract(
        arxiv_urls,
2 changes: 1 addition & 1 deletion nemo_curator/download/commoncrawl.py
@@ -430,7 +430,7 @@ def download_common_crawl(
        "url": str,
        "warc_id": str,
        "source_id": str,
-        "filename": str,
+        "file_name": str,
    }
    dataset = download_and_extract(
        common_crawl_urls,
2 changes: 1 addition & 1 deletion nemo_curator/download/doc_builder.py
@@ -141,7 +141,7 @@ def _download_and_extract_single_partition(
    partition = pd.DataFrame(records)
    filename = os.path.basename(output_path)
    output_dir = os.path.dirname(output_path)
-    partition["filename"] = filename
+    partition["file_name"] = filename
    single_partition_write_with_filename(partition, output_dir, output_type=output_type)
    if not keep_raw_download:
        os.remove(downloaded_file)
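To illustrate the convention this helper follows, here is a hedged sketch: a pandas partition carries a "file_name" column, and the writer uses it to name the emitted file. The import path for single_partition_write_with_filename is assumed from current NeMo Curator releases, and the shard name is a placeholder:

import pandas as pd

from nemo_curator.utils.distributed_utils import single_partition_write_with_filename

partition = pd.DataFrame({"text": ["doc one", "doc two"]})
partition["file_name"] = "shard_00000.jsonl"  # placeholder output name

# Writes the partition under /tmp/out, named after the "file_name" column.
single_partition_write_with_filename(partition, "/tmp/out", output_type="jsonl")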
2 changes: 1 addition & 1 deletion nemo_curator/download/wikipedia.py
@@ -799,7 +799,7 @@ def download_wikipedia(
        "url": str,
        "language": str,
        "source_id": str,
-        "filename": str,
+        "file_name": str,
    }
    dataset = download_and_extract(
        wikipedia_urls,
9 changes: 5 additions & 4 deletions nemo_curator/modules/__init__.py
@@ -25,7 +25,6 @@
from .config import FuzzyDuplicatesConfig, SemDedupConfig
from .dataset_ops import blend_datasets, Shuffle
from .exact_dedup import ExactDuplicates
-from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter
from .meta import Sequential
from .modify import Modify
from .task import TaskDecontamination
@@ -39,9 +38,7 @@
BucketsToEdges = gpu_only_import_from(
    "nemo_curator.modules.fuzzy_dedup", "BucketsToEdges"
)
-# Pytorch related imports must come after all imports that require cugraph,
-# because of context cleanup issues b/w pytorch and cugraph
-# See this issue: https://github.com/rapidsai/cugraph/issues/2718
+
SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup")
EmbeddingCreator = gpu_only_import_from(
    "nemo_curator.modules.semantic_dedup", "EmbeddingCreator"
@@ -52,6 +49,10 @@
SemanticClusterLevelDedup = gpu_only_import_from(
    "nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup"
)
+# Pytorch related imports must come after all imports that require cugraph,
+# because of context cleanup issues b/w pytorch and cugraph
+# See this issue: https://github.com/rapidsai/cugraph/issues/2718
+from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter

__all__ = [
    "ExactDuplicates",
12 changes: 6 additions & 6 deletions nemo_curator/modules/dataset_ops.py
@@ -19,7 +19,7 @@ def __init__(
        partition_to_filename: Callable[[int], str] = default_filename,
    ) -> None:
        """
-        Randomly permutes the dataset. This will make the original "filename" column invalid, so if the column is present it will be overwritten.
+        Randomly permutes the dataset. This will make the original "file_name" column invalid, so if the column is present it will be overwritten.

        Args:
            seed: The random seed that will be used to determine which partition (file) each datapoint goes to.
                Setting the seed will guarantee determinism, but may be slightly slower (20-30% slower)
@@ -52,8 +52,8 @@ def shuffle_deterministic(self, dataset: DocumentDataset) -> DocumentDataset:
        shuffled_df = dataset.df.set_index(self.rand_col, npartitions=new_npartitions)
        shuffled_df = shuffled_df.reset_index(drop=True)

-        if "filename" in shuffled_df:
-            shuffled_df["filename"] = shuffled_df.map_partitions(self._add_filename)
+        if "file_name" in shuffled_df:
+            shuffled_df["file_name"] = shuffled_df.map_partitions(self._add_filename)

        return DocumentDataset(shuffled_df)

@@ -98,15 +98,15 @@ def _partition_shuffle(self, partition, partition_info=None):
            drop=True
        )

-        if "filename" in partition:
+        if "file_name" in partition:
            filename = self.partition_to_filename(partition_num)
-            partition["filename"] = filename
+            partition["file_name"] = filename

        return partition

    def _add_filename(self, partition, partition_info=None):
        if partition_info is None:
-            return ["filename"] * len(partition)
+            return ["file_name"] * len(partition)

        filename = self.partition_to_filename(partition_info["number"])

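A brief usage sketch of the module whose internals changed above; the input path is a placeholder, and the seed is optional (documented as costing roughly 20-30% extra for determinism):

from nemo_curator import Shuffle
from nemo_curator.datasets import DocumentDataset

dataset = DocumentDataset.read_json("/path/to/jsonl/dir", add_filename=True)

# Any pre-existing "file_name" column is overwritten to match the partition
# each document lands in; partition_to_filename can customize the new names.
shuffle = Shuffle(seed=42)
shuffled = shuffle(dataset)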
36 changes: 24 additions & 12 deletions nemo_curator/modules/fuzzy_dedup.py
@@ -35,7 +35,7 @@
from dask.utils import M
from tqdm import tqdm

-from nemo_curator._compat import MINHASH_PERMUTED_AVAILABLE
+from nemo_curator._compat import MINHASH_DEPRECATED_API, MINHASH_PERMUTED_AVAILABLE
from nemo_curator.datasets import DocumentDataset
from nemo_curator.log import create_logger
from nemo_curator.modules.config import FuzzyDuplicatesConfig
@@ -98,15 +98,17 @@ def __init__(
        """
        self.num_hashes = num_hashes
        self.char_ngram = char_ngrams
-        if MINHASH_PERMUTED_AVAILABLE:
+        if MINHASH_DEPRECATED_API:
+            self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed)
+        else:
            self.seeds = self.generate_hash_permutation_seeds(
                bit_width=64 if use_64bit_hash else 32,
                n_permutations=self.num_hashes,
                seed=seed,
            )
-        else:
-            self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed)

        self.minhash_method = self.minhash64 if use_64bit_hash else self.minhash32

        self.id_field = id_field
        self.text_field = text_field

@@ -171,7 +173,7 @@ def minhash32(
        if not isinstance(ser, cudf.Series):
            raise TypeError("Expected data of type cudf.Series")

-        if not MINHASH_PERMUTED_AVAILABLE:
+        if MINHASH_DEPRECATED_API:
            warnings.warn(
                "Using an outdated minhash implementation, please update to cuDF version 24.12 "
                "or later for improved performance. "
@@ -184,9 +186,14 @@
        seeds_a = cudf.Series(seeds[:, 0], dtype="uint32")
        seeds_b = cudf.Series(seeds[:, 1], dtype="uint32")

-        return ser.str.minhash_permuted(
-            a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
-        )
+        if MINHASH_PERMUTED_AVAILABLE:
+            return ser.str.minhash_permuted(
+                a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
+            )
+        else:
+            return ser.str.minhash(
+                a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
+            )

    def minhash64(
        self, ser: cudf.Series, seeds: np.ndarray, char_ngram: int
@@ -196,7 +203,7 @@ def minhash64(
        """
        if not isinstance(ser, cudf.Series):
            raise TypeError("Expected data of type cudf.Series")
-        if not MINHASH_PERMUTED_AVAILABLE:
+        if MINHASH_DEPRECATED_API:
            warnings.warn(
                "Using an outdated minhash implementation, please update to cuDF version 24.12 "
                "or later for improved performance. "
@@ -209,9 +216,14 @@
        seeds_a = cudf.Series(seeds[:, 0], dtype="uint64")
        seeds_b = cudf.Series(seeds[:, 1], dtype="uint64")

-        return ser.str.minhash64_permuted(
-            a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
-        )
+        if MINHASH_PERMUTED_AVAILABLE:
+            return ser.str.minhash64_permuted(
+                a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
+            )
+        else:
+            return ser.str.minhash64(
+                a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
+            )

    def __call__(self, dataset: DocumentDataset) -> Union[str, DocumentDataset]:
        """
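These gated branches are exercised through the MinHash module; a hedged usage sketch follows (GPU-only, requiring cudf and dask-cudf; the parameter values here are illustrative, not prescriptions):

from nemo_curator import MinHash
from nemo_curator.datasets import DocumentDataset

dataset = DocumentDataset.read_json("/path/to/jsonl/dir", backend="cudf")

# Internally dispatches to minhash/minhash_permuted (or the deprecated
# single-seed API) based on the flags imported from nemo_curator._compat.
minhasher = MinHash(
    seed=42,
    num_hashes=260,
    char_ngrams=5,
    use_64bit_hash=False,
    id_field="id",
    text_field="text",
)
signatures = minhasher(dataset)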
8 changes: 7 additions & 1 deletion nemo_curator/package_info.py
@@ -23,7 +23,13 @@
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV)

__shortversion__ = ".".join(map(str, VERSION[:3]))
-__version__ = __shortversion__ + VERSION[3] + "." + ".".join(VERSION[4:])
+__version__ = __shortversion__
+
+if VERSION[3] != "":
+    __version__ = __version__ + VERSION[3]
+
+if VERSION[4] != "":
+    __version__ = __version__ + "." + ".".join(VERSION[4:])

__package_name__ = "nemo_curator"
__contact_names__ = "NVIDIA"
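The motivation is easiest to see with concrete tuples: the old one-liner appended "." + ".".join(VERSION[4:]) unconditionally, leaving a trailing dot whenever DEV was empty. A small sketch with hypothetical values, mirroring (not importing) the module logic:

def build_version(version):
    # Mirror of the assembly above for a (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV) tuple.
    full = ".".join(map(str, version[:3]))
    if version[3] != "":
        full = full + version[3]
    if version[4] != "":
        full = full + "." + ".".join(version[4:])
    return full

assert build_version((0, 6, 0, "", "")) == "0.6.0"  # no trailing "." anymore
assert build_version((0, 6, 0, "rc0", "")) == "0.6.0rc0"
assert build_version((0, 7, 0, "rc0", "dev0")) == "0.7.0rc0.dev0"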