From 4fb7f54bd5cf44bee7a520b3d40d1a53049f78d4 Mon Sep 17 00:00:00 2001 From: Praateek Mahajan Date: Mon, 23 Dec 2024 19:42:40 -0800 Subject: [PATCH 1/6] Change filename column name to file_name (#449) --- nemo_curator/datasets/doc_dataset.py | 6 +-- nemo_curator/datasets/parallel_dataset.py | 4 +- nemo_curator/download/arxiv.py | 2 +- nemo_curator/download/commoncrawl.py | 2 +- nemo_curator/download/doc_builder.py | 2 +- nemo_curator/download/wikipedia.py | 2 +- nemo_curator/modules/dataset_ops.py | 12 +++--- nemo_curator/utils/distributed_utils.py | 38 +++++++++---------- nemo_curator/utils/file_utils.py | 2 +- tests/test_io.py | 18 ++++----- tests/test_read_data.py | 18 +++------ tests/test_separate_by_metadata.py | 2 +- tests/test_shuffle.py | 16 ++++---- .../peft-curation-with-sdg/docbuilder.py | 2 +- .../peft-curation-with-sdg/synthetic_gen.py | 4 +- tutorials/peft-curation/docbuilder.py | 2 +- tutorials/tinystories/docbuilder.py | 2 +- 17 files changed, 64 insertions(+), 70 deletions(-) diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index 10ffe2230..5b4caf518 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -64,7 +64,7 @@ def read_json( input_files: The path of the input file(s). backend: The backend to use for reading the data. files_per_partition: The number of files to read per partition. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. input_meta: A dictionary or a string formatted as a dictionary, which outlines the field names and their respective data types within the JSONL input file. columns: If not None, only these columns will be read from the file. @@ -102,7 +102,7 @@ def read_parquet( input_files: The path of the input file(s). backend: The backend to use for reading the data. files_per_partition: The number of files to read per partition. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. columns: If not None, only these columns will be read from the file. There is a significant performance gain when specifying columns for Parquet files. @@ -135,7 +135,7 @@ def read_pickle( input_files: The path of the input file(s). backend: The backend to use for reading the data. files_per_partition: The number of files to read per partition. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. columns: If not None, only these columns will be read from the file. """ diff --git a/nemo_curator/datasets/parallel_dataset.py b/nemo_curator/datasets/parallel_dataset.py index eb8952676..b9a5eee1f 100644 --- a/nemo_curator/datasets/parallel_dataset.py +++ b/nemo_curator/datasets/parallel_dataset.py @@ -129,7 +129,7 @@ def read_single_simple_bitext_file_pair( tgt_lang (str): Target language, in ISO-639-1 (two character) format (e.g. 'en') doc_id (str, optional): A string document id to assign to every segment in the file. Defaults to None. backend (str, optional): Backend of the data frame. Defaults to "cudf". - add_filename (bool, optional): Add filename as an extra field to every segment in the file. Defaults to False. + add_filename (bool, optional): Add "file_name" as an extra field to every segment in the file. Defaults to False. 
Returns: Union[dd.DataFrame, dask_cudf.DataFrame] @@ -162,6 +162,6 @@ def read_single_simple_bitext_file_pair( df_combined["tgt_lang"] = tgt_lang if add_filename: - df_combined["filename"] = remove_path_extension(src_input_file) + df_combined["file_name"] = remove_path_extension(src_input_file) return df_combined diff --git a/nemo_curator/download/arxiv.py b/nemo_curator/download/arxiv.py index 54cbca3de..449503e8b 100644 --- a/nemo_curator/download/arxiv.py +++ b/nemo_curator/download/arxiv.py @@ -403,7 +403,7 @@ def download_arxiv( "text": str, "id": str, "source_id": str, - "filename": str, + "file_name": str, } dataset = download_and_extract( arxiv_urls, diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py index 53deffd91..68ad0de48 100644 --- a/nemo_curator/download/commoncrawl.py +++ b/nemo_curator/download/commoncrawl.py @@ -430,7 +430,7 @@ def download_common_crawl( "url": str, "warc_id": str, "source_id": str, - "filename": str, + "file_name": str, } dataset = download_and_extract( common_crawl_urls, diff --git a/nemo_curator/download/doc_builder.py b/nemo_curator/download/doc_builder.py index 122d3ea31..dbeac133e 100644 --- a/nemo_curator/download/doc_builder.py +++ b/nemo_curator/download/doc_builder.py @@ -141,7 +141,7 @@ def _download_and_extract_single_partition( partition = pd.DataFrame(records) filename = os.path.basename(output_path) output_dir = os.path.dirname(output_path) - partition["filename"] = filename + partition["file_name"] = filename single_partition_write_with_filename(partition, output_dir, output_type=output_type) if not keep_raw_download: os.remove(downloaded_file) diff --git a/nemo_curator/download/wikipedia.py b/nemo_curator/download/wikipedia.py index 456142fbb..32088a274 100644 --- a/nemo_curator/download/wikipedia.py +++ b/nemo_curator/download/wikipedia.py @@ -799,7 +799,7 @@ def download_wikipedia( "url": str, "language": str, "source_id": str, - "filename": str, + "file_name": str, } dataset = download_and_extract( wikipedia_urls, diff --git a/nemo_curator/modules/dataset_ops.py b/nemo_curator/modules/dataset_ops.py index 745d741cf..e996946b0 100644 --- a/nemo_curator/modules/dataset_ops.py +++ b/nemo_curator/modules/dataset_ops.py @@ -19,7 +19,7 @@ def __init__( partition_to_filename: Callable[[int], str] = default_filename, ) -> None: """ - Randomly permutes the dataset. This will make the original "filename" column invalid, so if the column is present it will be overwritten. + Randomly permutes the dataset. This will make the original "file_name" column invalid, so if the column is present it will be overwritten. Args: seed: The random seed that will be used to determine which partition (file) each datapoint goes to. 
Setting the seed will guarantee determinism, but may be slightly slower (20-30% slower) @@ -52,8 +52,8 @@ def shuffle_deterministic(self, dataset: DocumentDataset) -> DocumentDataset: shuffled_df = dataset.df.set_index(self.rand_col, npartitions=new_npartitions) shuffled_df = shuffled_df.reset_index(drop=True) - if "filename" in shuffled_df: - shuffled_df["filename"] = shuffled_df.map_partitions(self._add_filename) + if "file_name" in shuffled_df: + shuffled_df["file_name"] = shuffled_df.map_partitions(self._add_filename) return DocumentDataset(shuffled_df) @@ -98,15 +98,15 @@ def _partition_shuffle(self, partition, partition_info=None): drop=True ) - if "filename" in partition: + if "file_name" in partition: filename = self.partition_to_filename(partition_num) - partition["filename"] = filename + partition["file_name"] = filename return partition def _add_filename(self, partition, partition_info=None): if partition_info is None: - return ["filename"] * len(partition) + return ["file_name"] * len(partition) filename = self.partition_to_filename(partition_info["number"]) diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index 89b4415f7..7ff463a70 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -281,8 +281,8 @@ def select_columns( ) -> Union[dd.DataFrame, pd.DataFrame, "cudf.DataFrame"]: # We exclude parquet because the parquet readers already support column selection if filetype in ["jsonl", "json"] and columns is not None: - if add_filename and "filename" not in columns: - columns.append("filename") + if add_filename and "file_name" not in columns: + columns.append("file_name") df = df[columns] return df @@ -299,12 +299,12 @@ def read_single_partition( ) -> Union["cudf.DataFrame", pd.DataFrame]: """ This function reads a file with cuDF, sorts the columns of the DataFrame - and adds a "filename" column. + and adds a "file_name" column. Args: files: The path to the jsonl files to read. backend: The backend to use for reading the data. Either "cudf" or "pandas". - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. input_meta: A dictionary or a string formatted as a dictionary, which outlines the field names and their respective data types within the JSONL input file. columns: If not None, only these columns will be read from the file. @@ -368,7 +368,7 @@ def read_single_partition( for file in files: df = read_f(file, **read_kwargs, **kwargs) if add_filename: - df["filename"] = os.path.basename(file) + df["file_name"] = os.path.basename(file) df = select_columns(df, io_columns, filetype, add_filename) df_ls.append(df) @@ -429,7 +429,7 @@ def extract_filename(path: str) -> str: read_kwargs["include_path_column"] = add_filename read_kwargs["path_converter"] = extract_filename - postprocessing_func = lambda df: df.rename(columns={"path": "filename"}) + postprocessing_func = lambda df: df.rename(columns={"path": "file_name"}) elif file_type == "parquet": if backend == "cudf" and not DASK_CUDF_PARQUET_READ_INCONSISTENT_SCHEMA: @@ -509,7 +509,7 @@ def read_pandas_pickle( Args: file: The path to the pickle file to read. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. columns: If not None, only these columns will be read from the file. Returns: A Pandas DataFrame. 
@@ -543,7 +543,7 @@ def read_data( file_type: The type of the input file(s). backend: The backend to use for reading the data. files_per_partition: The number of files to read per partition. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. input_meta: A dictionary or a string formatted as a dictionary, which outlines the field names and their respective data types within the JSONL input file. columns: If not None, only these columns will be read from the file. @@ -686,14 +686,14 @@ def single_partition_write_with_filename( Args: df: A DataFrame. output_file_dir: The output file path. - keep_filename_column: Boolean representing whether to keep or drop the "filename" column, if it exists. + keep_filename_column: Boolean representing whether to keep or drop the "file_name" column, if it exists. output_type: The type of output file to write. Can be "jsonl" or "parquet". Returns: If the DataFrame is non-empty, return a Series containing a single element, True. If the DataFrame is empty, return a Series containing a single element, False. """ - assert "filename" in df.columns + assert "file_name" in df.columns if len(df) > 0: empty_partition = False @@ -709,14 +709,14 @@ def single_partition_write_with_filename( success_ser = pd.Series([empty_partition]) if not empty_partition: - filenames = df.filename.unique() + filenames = df.file_name.unique() filenames = list(filenames.values_host) if is_cudf_type(df) else list(filenames) num_files = len(filenames) for filename in filenames: - out_df = df[df.filename == filename] if num_files > 1 else df + out_df = df[df.file_name == filename] if num_files > 1 else df if not keep_filename_column: - out_df = out_df.drop("filename", axis=1) + out_df = out_df.drop("file_name", axis=1) filename = ( Path(filename).stem if output_type != "bitext" else Path(filename).name @@ -831,13 +831,13 @@ def write_to_disk( """ This function writes a Dask DataFrame to the specified file path. If write_to_filename is True, then it expects the - DataFrame to have a "filename" column that specifies where to write the document. + DataFrame to have a "file_name" column that specifies where to write the document. Args: df: A Dask DataFrame. output_path: The output file path. - write_to_filename: Boolean representing whether to write the filename using the "filename" column. - keep_filename_column: Boolean representing whether to keep or drop the "filename" column, if it exists. + write_to_filename: Boolean representing whether to write the filename using the "file_name" column. + keep_filename_column: Boolean representing whether to keep or drop the "file_name" column, if it exists. output_type: The type of output file to write. Can be "jsonl" or "parquet". 
""" @@ -856,9 +856,9 @@ def write_to_disk( ) # output_path is a directory - elif write_to_filename and "filename" not in df.columns: + elif write_to_filename and "file_name" not in df.columns: raise ValueError( - "write_using_filename is True but no filename column found in DataFrame" + "write_using_filename is True but no file_name column found in DataFrame" ) if is_cudf_type(df): @@ -890,7 +890,7 @@ def write_to_disk( os.makedirs(output_path, exist_ok=True) tmp_output_file_dir = os.path.join(output_path, ".tmp") os.makedirs(tmp_output_file_dir, exist_ok=True) - file_name = os.path.basename(list(df.filename.unique())[0]) + file_name = os.path.basename(list(df.file_name.unique())[0]) else: tmp_output_file_dir = os.path.join(output_path, ".tmp") os.makedirs(tmp_output_file_dir, exist_ok=True) diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py index f1c957b47..4632346ae 100644 --- a/nemo_curator/utils/file_utils.py +++ b/nemo_curator/utils/file_utils.py @@ -300,7 +300,7 @@ def separate_by_metadata( Args: input_data: Either a DataFrame or a string representing the path to the input directory. - If a DataFrame is provided, it must have a 'filename' column for the shard. + If a DataFrame is provided, it must have a "file_name" column for the shard. output_dir: The base directory for which all metadata based subdirs will be created under metadata_field: The metadata field to split on remove_metadata: Whether to remove the metadata from the dataframe when saving it diff --git a/tests/test_io.py b/tests/test_io.py index 432c00b3d..546ccf27c 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -153,7 +153,7 @@ def test_multifile_single_partition( keep_filename_column, file_ext, ): - df = pd.DataFrame({"a": [1, 2, 3], "filename": ["file0", "file1", "file1"]}) + df = pd.DataFrame({"a": [1, 2, 3], "file_name": ["file0", "file1", "file1"]}) single_partition_write_with_filename( df=df, @@ -165,7 +165,7 @@ def test_multifile_single_partition( assert os.path.exists(tmp_path / f"file1.{file_ext}") if not keep_filename_column: - df = df.drop("filename", axis=1) + df = df.drop("file_name", axis=1) df1 = read_single_partition( files=[tmp_path / f"file0.{file_ext}"], backend="pandas", filetype=file_ext @@ -185,7 +185,7 @@ def test_singlefile_single_partition( keep_filename_column, file_ext, ): - df = pd.DataFrame({"a": [1, 2, 3], "filename": ["file2", "file2", "file2"]}) + df = pd.DataFrame({"a": [1, 2, 3], "file_name": ["file2", "file2", "file2"]}) single_partition_write_with_filename( df=df, @@ -197,14 +197,14 @@ def test_singlefile_single_partition( assert os.path.exists(tmp_path / f"file2.{file_ext}") if not keep_filename_column: - df = df.drop("filename", axis=1) + df = df.drop("file_name", axis=1) got = read_single_partition( files=[tmp_path / f"file2.{file_ext}"], backend="pandas", filetype=file_ext ) assert_eq(got, df) def test_multifile_single_partition_error(self, tmp_path): - df = pd.DataFrame({"a": [1, 2, 3], "filename": ["file0", "file1", "file1"]}) + df = pd.DataFrame({"a": [1, 2, 3], "file_name": ["file0", "file1", "file1"]}) with pytest.raises(ValueError, match="Unknown output type"): single_partition_write_with_filename( @@ -220,13 +220,13 @@ def test_multifile_single_partition_error(self, tmp_path): ], ) def test_multifile_multi_partition(self, tmp_path, file_ext, read_f): - df1 = pd.DataFrame({"a": [1, 2, 3], "filename": ["file1", "file2", "file2"]}) + df1 = pd.DataFrame({"a": [1, 2, 3], "file_name": ["file1", "file2", "file2"]}) df2 = df1.copy() - 
df2["filename"] = "file3" + df2["file_name"] = "file3" df3 = df1.copy() - df3["filename"] = ["file4", "file5", "file6"] + df3["file_name"] = ["file4", "file5", "file6"] ddf = dd.concat([df1, df2, df3]) - ddf["filename"] = ddf["filename"] + f".{file_ext}" + ddf["file_name"] = ddf["file_name"] + f".{file_ext}" write_to_disk( df=ddf, output_path=tmp_path / file_ext, diff --git a/tests/test_read_data.py b/tests/test_read_data.py index a619be3a4..29013479f 100644 --- a/tests/test_read_data.py +++ b/tests/test_read_data.py @@ -300,8 +300,8 @@ def test_read_data_blocksize_add_filename_jsonl(mock_multiple_jsonl_files, backe columns=None, ) - assert "filename" in df.columns - file_names = df["filename"].unique().compute() + assert "file_name" in df.columns + file_names = df["file_name"].unique().compute() if backend == "cudf": file_names = file_names.to_pandas() @@ -340,13 +340,7 @@ def test_read_data_blocksize_add_filename_parquet(mock_multiple_parquet_files, b pytest.param("cudf", "jsonl", marks=pytest.mark.gpu), pytest.param("cudf", "parquet", marks=pytest.mark.gpu), ("pandas", "jsonl"), - pytest.param( - "pandas", - "parquet", - marks=pytest.mark.xfail( - reason="filename column inaccessible with pandas backend and parquet" - ), - ), + ("pandas", "parquet"), ], ) def test_read_data_fpp_add_filename( @@ -369,8 +363,8 @@ def test_read_data_fpp_add_filename( ) assert list(df.columns) == list(df.head().columns) - assert set(df.columns) == {"filename", "id", "text"} - file_names = df["filename"].unique().compute() + assert set(df.columns) == {"file_name", "id", "text"} + file_names = df["file_name"].unique().compute() if backend == "cudf": file_names = file_names.to_pandas() @@ -445,7 +439,7 @@ def test_read_data_select_columns( if not add_filename: assert list(df.columns) == sorted(cols_to_select) else: - assert list(df.columns) == sorted(cols_to_select + ["filename"]) + assert list(df.columns) == sorted(cols_to_select + ["file_name"]) @pytest.mark.parametrize( diff --git a/tests/test_separate_by_metadata.py b/tests/test_separate_by_metadata.py index 020bf21d8..cd7c7147d 100644 --- a/tests/test_separate_by_metadata.py +++ b/tests/test_separate_by_metadata.py @@ -24,7 +24,7 @@ def _write_data(num_files, file_ext): dfs = [] for i in range(num_files): partition = df.copy() - partition["filename"] = f"f{i}.{file_ext}" + partition["file_name"] = f"f{i}.{file_ext}" dfs.append(partition) df = dd.concat(dfs) diff --git a/tests/test_shuffle.py b/tests/test_shuffle.py index a23d47906..2a4afc994 100644 --- a/tests/test_shuffle.py +++ b/tests/test_shuffle.py @@ -61,11 +61,11 @@ def test_filename(self): original_dataset = list_to_dataset( ["one", "two", "three", "four", "five"], npartitions=1 ) - original_dataset.df["filename"] = "original.jsonl" + original_dataset.df["file_name"] = "original.jsonl" expected_data = { "text": ["one", "two", "three", "five", "four"], - "filename": [ + "file_name": [ "file_0000000000.jsonl", "file_0000000000.jsonl", "file_0000000000.jsonl", @@ -86,11 +86,11 @@ def test_custom_filenames(self): original_dataset = list_to_dataset( ["one", "two", "three", "four", "five"], npartitions=1 ) - original_dataset.df["filename"] = "original.jsonl" + original_dataset.df["file_name"] = "original.jsonl" expected_data = { "text": ["one", "two", "three", "five", "four"], - "filename": [ + "file_name": [ "my_0.test", "my_0.test", "my_0.test", @@ -140,11 +140,11 @@ def test_filename(self): original_dataset = list_to_dataset( ["one", "two", "three", "four", "five"], npartitions=1 ) - 
original_dataset.df["filename"] = "original.jsonl" + original_dataset.df["file_name"] = "original.jsonl" expected_data = { "text": ["four", "five", "three", "one", "two"], - "filename": [ + "file_name": [ "file_0000000000.jsonl", "file_0000000001.jsonl", "file_0000000001.jsonl", @@ -163,11 +163,11 @@ def test_custom_filenames(self): original_dataset = list_to_dataset( ["one", "two", "three", "four", "five"], npartitions=1 ) - original_dataset.df["filename"] = "original.jsonl" + original_dataset.df["file_name"] = "original.jsonl" expected_data = { "text": ["four", "five", "three", "one", "two"], - "filename": [ + "file_name": [ "my_0.test", "my_1.test", "my_1.test", diff --git a/tutorials/peft-curation-with-sdg/docbuilder.py b/tutorials/peft-curation-with-sdg/docbuilder.py index f4892f70d..73365ec03 100644 --- a/tutorials/peft-curation-with-sdg/docbuilder.py +++ b/tutorials/peft-curation-with-sdg/docbuilder.py @@ -108,7 +108,7 @@ def iterate(self, file_path): id, extracted_content = extracted_content meta = { - "filename": file_name, + "file_name": file_name, "id": f"law-stackexchange-qa-{id}", } diff --git a/tutorials/peft-curation-with-sdg/synthetic_gen.py b/tutorials/peft-curation-with-sdg/synthetic_gen.py index f084e799d..7bbcf0f82 100644 --- a/tutorials/peft-curation-with-sdg/synthetic_gen.py +++ b/tutorials/peft-curation-with-sdg/synthetic_gen.py @@ -153,7 +153,7 @@ def _write_all_to_file(self, gen_entries, out_fp: str): synth_questions.extend(questions) synth_answers.extend(answers) synth_scores.extend(scores) - synth_filenames.extend([row["filename"] + ".synth"] * self.n_variants) + synth_filenames.extend([row["file_name"] + ".synth"] * self.n_variants) synth_ids.extend( [f"{row['id']}-synth-{i}" for i in range(self.n_variants)] ) @@ -169,7 +169,7 @@ def _write_all_to_file(self, gen_entries, out_fp: str): "question": synth_questions, "answer": synth_answers, "title": synth_titles, - "filename": synth_filenames, + "file_name": synth_filenames, "tags": synth_tags, # Use the same score for both the generated questions and answers. 
"question_score": synth_scores, diff --git a/tutorials/peft-curation/docbuilder.py b/tutorials/peft-curation/docbuilder.py index 3ae0840c9..7b635aa80 100644 --- a/tutorials/peft-curation/docbuilder.py +++ b/tutorials/peft-curation/docbuilder.py @@ -77,7 +77,7 @@ def iterate(self, file_path): self._counter += 1 content = email.group().strip('"').strip() meta = { - "filename": file_name, + "file_name": file_name, "id": f"email-{self._counter}", } extracted_content = self._extractor.extract(content) diff --git a/tutorials/tinystories/docbuilder.py b/tutorials/tinystories/docbuilder.py index 859fb35ab..9a9d9fe5e 100644 --- a/tutorials/tinystories/docbuilder.py +++ b/tutorials/tinystories/docbuilder.py @@ -71,7 +71,7 @@ def split_meta(example): self._counter += 1 content = " ".join(example) meta = { - "filename": file_name, + "file_name": file_name, "id": f"{file_name}-{self._counter}", } From d401333ec9d88c36494befc9ae7515574c4d89fb Mon Sep 17 00:00:00 2001 From: Praateek Mahajan Date: Mon, 30 Dec 2024 05:17:22 -0800 Subject: [PATCH 2/6] Support the new minhash 25.02 api (#445) Signed-off-by: Praateek --- nemo_curator/_compat.py | 13 +++++++---- nemo_curator/modules/fuzzy_dedup.py | 36 +++++++++++++++++++---------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/nemo_curator/_compat.py b/nemo_curator/_compat.py index 5de25ebd8..9125713dd 100644 --- a/nemo_curator/_compat.py +++ b/nemo_curator/_compat.py @@ -39,10 +39,15 @@ except (ImportError, TypeError): CURRENT_CUDF_VERSION = parse_version("24.10.0") -# TODO remove this once 24.12.0 becomes the base version of cudf in nemo-curator -MINHASH_PERMUTED_AVAILABLE = CURRENT_CUDF_VERSION >= parse_version("24.12.0") or ( - CURRENT_CUDF_VERSION.is_prerelease - and CURRENT_CUDF_VERSION.base_version >= "24.12.0" +# TODO remove this once 25.02 becomes the base version of cudf in nemo-curator + +# minhash in < 24.12 used to have a minhash(txt) api which was deprecated in favor of +# minhash(a, b) in 25.02 (in 24.12, minhash_permuted(a,b) was introduced) +MINHASH_DEPRECATED_API = ( + CURRENT_CUDF_VERSION.base_version < parse_version("24.12").base_version +) +MINHASH_PERMUTED_AVAILABLE = (CURRENT_CUDF_VERSION.major == 24) & ( + CURRENT_CUDF_VERSION.minor == 12 ) # TODO: remove when dask min version gets bumped diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index 2986a88f6..3afc4123a 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -35,7 +35,7 @@ from dask.utils import M from tqdm import tqdm -from nemo_curator._compat import MINHASH_PERMUTED_AVAILABLE +from nemo_curator._compat import MINHASH_DEPRECATED_API, MINHASH_PERMUTED_AVAILABLE from nemo_curator.datasets import DocumentDataset from nemo_curator.log import create_logger from nemo_curator.modules.config import FuzzyDuplicatesConfig @@ -98,15 +98,17 @@ def __init__( """ self.num_hashes = num_hashes self.char_ngram = char_ngrams - if MINHASH_PERMUTED_AVAILABLE: + if MINHASH_DEPRECATED_API: + self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed) + else: self.seeds = self.generate_hash_permutation_seeds( bit_width=64 if use_64bit_hash else 32, n_permutations=self.num_hashes, seed=seed, ) - else: - self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed) + self.minhash_method = self.minhash64 if use_64bit_hash else self.minhash32 + self.id_field = id_field self.text_field = text_field @@ -171,7 +173,7 @@ def minhash32( if not isinstance(ser, cudf.Series): raise TypeError("Expected 
data of type cudf.Series") - if not MINHASH_PERMUTED_AVAILABLE: + if MINHASH_DEPRECATED_API: warnings.warn( "Using an outdated minhash implementation, please update to cuDF version 24.12 " "or later for improved performance. " @@ -184,9 +186,14 @@ def minhash32( seeds_a = cudf.Series(seeds[:, 0], dtype="uint32") seeds_b = cudf.Series(seeds[:, 1], dtype="uint32") - return ser.str.minhash_permuted( - a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram - ) + if MINHASH_PERMUTED_AVAILABLE: + return ser.str.minhash_permuted( + a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram + ) + else: + return ser.str.minhash( + a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram + ) def minhash64( self, ser: cudf.Series, seeds: np.ndarray, char_ngram: int @@ -196,7 +203,7 @@ def minhash64( """ if not isinstance(ser, cudf.Series): raise TypeError("Expected data of type cudf.Series") - if not MINHASH_PERMUTED_AVAILABLE: + if MINHASH_DEPRECATED_API: warnings.warn( "Using an outdated minhash implementation, please update to cuDF version 24.12 " "or later for improved performance. " @@ -209,9 +216,14 @@ def minhash64( seeds_a = cudf.Series(seeds[:, 0], dtype="uint64") seeds_b = cudf.Series(seeds[:, 1], dtype="uint64") - return ser.str.minhash64_permuted( - a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram - ) + if MINHASH_PERMUTED_AVAILABLE: + return ser.str.minhash64_permuted( + a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram + ) + else: + return ser.str.minhash64( + a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram + ) def __call__(self, dataset: DocumentDataset) -> Union[str, DocumentDataset]: """ From db411b0480b0b84d481505a92553be12332191cf Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 2 Jan 2025 08:23:23 -0800 Subject: [PATCH 3/6] Reorder import (#460) Signed-off-by: Ryan Wolf --- nemo_curator/modules/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index bc5659311..b792d807c 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -25,7 +25,6 @@ from .config import FuzzyDuplicatesConfig, SemDedupConfig from .dataset_ops import blend_datasets, Shuffle from .exact_dedup import ExactDuplicates -from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter from .meta import Sequential from .modify import Modify from .task import TaskDecontamination @@ -39,9 +38,7 @@ BucketsToEdges = gpu_only_import_from( "nemo_curator.modules.fuzzy_dedup", "BucketsToEdges" ) -# Pytorch related imports must come after all imports that require cugraph, -# because of context cleanup issues b/w pytorch and cugraph -# See this issue: https://github.com/rapidsai/cugraph/issues/2718 + SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup") EmbeddingCreator = gpu_only_import_from( "nemo_curator.modules.semantic_dedup", "EmbeddingCreator" @@ -52,6 +49,10 @@ SemanticClusterLevelDedup = gpu_only_import_from( "nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup" ) +# Pytorch related imports must come after all imports that require cugraph, +# because of context cleanup issues b/w pytorch and cugraph +# See this issue: https://github.com/rapidsai/cugraph/issues/2718 +from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter __all__ = [ "ExactDuplicates", From fd516fbdcd5e925aa4598d10406a64b890ad7e68 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Thu, 2 
Jan 2025 15:12:36 -0800 Subject: [PATCH 4/6] Pin CrossFit 0.0.8 (#464) Signed-off-by: Sarah Yurick --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 89e5ae150..329c2eec4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,7 @@ dependencies = [ "beautifulsoup4", "charset_normalizer>=3.1.0", "comment_parser", - # TODO: Pin CrossFit 0.0.8 when it is released - "crossfit @ git+https://github.com/rapidsai/crossfit.git@main", + "crossfit>=0.0.8", "dask-mpi>=2021.11.0", "dask[complete]>=2021.7.1", "datasets", From 98fb0bc6b19d312eebb988695c5cb40672fda0d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 3 Jan 2025 20:38:26 +0100 Subject: [PATCH 5/6] chore: update changelog 0.6.0 (#466) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: Update changelog Signed-off-by: oliver könig * ci: Bump release workflow Signed-off-by: oliver könig * Update release.yml Signed-off-by: oliver könig * docs: mention extended MNMG support Signed-off-by: oliver könig * bump Signed-off-by: oliver könig * remove note Signed-off-by: oliver könig --------- Signed-off-by: oliver könig --- .github/workflows/release.yml | 8 ++++++-- CHANGELOG.md | 12 ++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f9a53eb88..34a087ea0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,10 +25,13 @@ on: required: true default: true type: boolean - + version-bump-branch: + type: string + required: true + description: Branch to target for version bump jobs: release: - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.17.4 + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.4 with: release-ref: ${{ inputs.release-ref }} image-name: nemo_curator_container @@ -43,6 +46,7 @@ jobs: container-workdir: /opt/NeMo-Curator library-name: NeMo Curator dry-run: ${{ inputs.dry-run }} + version-bump-branch: ${{ inputs.version-bump-branch }} secrets: TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 4de64f094..7109b43d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## NeMo Curator 0.6.0 + +- Synthetic Data Generation for Text Retrieval + - LLM-based Filters + - Easiness + - Answerability + - Q&A Retrieval Generation Pipeline +- Parallel Dataset Curation for Machine Translation + - Load/Write Bitext Files + - Heuristic filtering (Histogram, Length Ratio) + - Classifier filtering (Comet, Cometoid) + ## NeMo Curator 0.5.0 ### Highlights From 7dfb21a86baffd3ed16a774cb86da0daa2eec140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 3 Jan 2025 20:44:21 +0100 Subject: [PATCH 6/6] fix: Generate version number string (#467) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_curator/package_info.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_curator/package_info.py b/nemo_curator/package_info.py index f9ca1dbd2..65c7ae069 100644 --- a/nemo_curator/package_info.py +++ b/nemo_curator/package_info.py @@ -23,7 +23,13 @@ VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV) __shortversion__ = ".".join(map(str, VERSION[:3])) -__version__ = __shortversion__ + VERSION[3] + "." 
+ ".".join(VERSION[4:]) +__version__ = __shortversion__ + +if VERSION[3] != "": + __version__ = __version__ + VERSION[3] + +if VERSION[4] != "": + __version__ = __version__ + "." + ".".join(VERSION[4:]) __package_name__ = "nemo_curator" __contact_names__ = "NVIDIA"