From 4fb7f54bd5cf44bee7a520b3d40d1a53049f78d4 Mon Sep 17 00:00:00 2001 From: Praateek Mahajan Date: Mon, 23 Dec 2024 19:42:40 -0800 Subject: [PATCH 1/6] Change filename column name to file_name (#449) --- nemo_curator/datasets/doc_dataset.py | 6 +-- nemo_curator/datasets/parallel_dataset.py | 4 +- nemo_curator/download/arxiv.py | 2 +- nemo_curator/download/commoncrawl.py | 2 +- nemo_curator/download/doc_builder.py | 2 +- nemo_curator/download/wikipedia.py | 2 +- nemo_curator/modules/dataset_ops.py | 12 +++--- nemo_curator/utils/distributed_utils.py | 38 +++++++++---------- nemo_curator/utils/file_utils.py | 2 +- tests/test_io.py | 18 ++++----- tests/test_read_data.py | 18 +++------ tests/test_separate_by_metadata.py | 2 +- tests/test_shuffle.py | 16 ++++---- .../peft-curation-with-sdg/docbuilder.py | 2 +- .../peft-curation-with-sdg/synthetic_gen.py | 4 +- tutorials/peft-curation/docbuilder.py | 2 +- tutorials/tinystories/docbuilder.py | 2 +- 17 files changed, 64 insertions(+), 70 deletions(-) diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index 10ffe2230..5b4caf518 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -64,7 +64,7 @@ def read_json( input_files: The path of the input file(s). backend: The backend to use for reading the data. files_per_partition: The number of files to read per partition. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. input_meta: A dictionary or a string formatted as a dictionary, which outlines the field names and their respective data types within the JSONL input file. columns: If not None, only these columns will be read from the file. @@ -102,7 +102,7 @@ def read_parquet( input_files: The path of the input file(s). backend: The backend to use for reading the data. files_per_partition: The number of files to read per partition. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. columns: If not None, only these columns will be read from the file. There is a significant performance gain when specifying columns for Parquet files. @@ -135,7 +135,7 @@ def read_pickle( input_files: The path of the input file(s). backend: The backend to use for reading the data. files_per_partition: The number of files to read per partition. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. columns: If not None, only these columns will be read from the file. """ diff --git a/nemo_curator/datasets/parallel_dataset.py b/nemo_curator/datasets/parallel_dataset.py index eb8952676..b9a5eee1f 100644 --- a/nemo_curator/datasets/parallel_dataset.py +++ b/nemo_curator/datasets/parallel_dataset.py @@ -129,7 +129,7 @@ def read_single_simple_bitext_file_pair( tgt_lang (str): Target language, in ISO-639-1 (two character) format (e.g. 'en') doc_id (str, optional): A string document id to assign to every segment in the file. Defaults to None. backend (str, optional): Backend of the data frame. Defaults to "cudf". - add_filename (bool, optional): Add filename as an extra field to every segment in the file. Defaults to False. + add_filename (bool, optional): Add "file_name" as an extra field to every segment in the file. Defaults to False. 
Returns: Union[dd.DataFrame, dask_cudf.DataFrame] @@ -162,6 +162,6 @@ def read_single_simple_bitext_file_pair( df_combined["tgt_lang"] = tgt_lang if add_filename: - df_combined["filename"] = remove_path_extension(src_input_file) + df_combined["file_name"] = remove_path_extension(src_input_file) return df_combined diff --git a/nemo_curator/download/arxiv.py b/nemo_curator/download/arxiv.py index 54cbca3de..449503e8b 100644 --- a/nemo_curator/download/arxiv.py +++ b/nemo_curator/download/arxiv.py @@ -403,7 +403,7 @@ def download_arxiv( "text": str, "id": str, "source_id": str, - "filename": str, + "file_name": str, } dataset = download_and_extract( arxiv_urls, diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py index 53deffd91..68ad0de48 100644 --- a/nemo_curator/download/commoncrawl.py +++ b/nemo_curator/download/commoncrawl.py @@ -430,7 +430,7 @@ def download_common_crawl( "url": str, "warc_id": str, "source_id": str, - "filename": str, + "file_name": str, } dataset = download_and_extract( common_crawl_urls, diff --git a/nemo_curator/download/doc_builder.py b/nemo_curator/download/doc_builder.py index 122d3ea31..dbeac133e 100644 --- a/nemo_curator/download/doc_builder.py +++ b/nemo_curator/download/doc_builder.py @@ -141,7 +141,7 @@ def _download_and_extract_single_partition( partition = pd.DataFrame(records) filename = os.path.basename(output_path) output_dir = os.path.dirname(output_path) - partition["filename"] = filename + partition["file_name"] = filename single_partition_write_with_filename(partition, output_dir, output_type=output_type) if not keep_raw_download: os.remove(downloaded_file) diff --git a/nemo_curator/download/wikipedia.py b/nemo_curator/download/wikipedia.py index 456142fbb..32088a274 100644 --- a/nemo_curator/download/wikipedia.py +++ b/nemo_curator/download/wikipedia.py @@ -799,7 +799,7 @@ def download_wikipedia( "url": str, "language": str, "source_id": str, - "filename": str, + "file_name": str, } dataset = download_and_extract( wikipedia_urls, diff --git a/nemo_curator/modules/dataset_ops.py b/nemo_curator/modules/dataset_ops.py index 745d741cf..e996946b0 100644 --- a/nemo_curator/modules/dataset_ops.py +++ b/nemo_curator/modules/dataset_ops.py @@ -19,7 +19,7 @@ def __init__( partition_to_filename: Callable[[int], str] = default_filename, ) -> None: """ - Randomly permutes the dataset. This will make the original "filename" column invalid, so if the column is present it will be overwritten. + Randomly permutes the dataset. This will make the original "file_name" column invalid, so if the column is present it will be overwritten. Args: seed: The random seed that will be used to determine which partition (file) each datapoint goes to. 
Setting the seed will guarantee determinism, but may be slightly slower (20-30% slower) @@ -52,8 +52,8 @@ def shuffle_deterministic(self, dataset: DocumentDataset) -> DocumentDataset: shuffled_df = dataset.df.set_index(self.rand_col, npartitions=new_npartitions) shuffled_df = shuffled_df.reset_index(drop=True) - if "filename" in shuffled_df: - shuffled_df["filename"] = shuffled_df.map_partitions(self._add_filename) + if "file_name" in shuffled_df: + shuffled_df["file_name"] = shuffled_df.map_partitions(self._add_filename) return DocumentDataset(shuffled_df) @@ -98,15 +98,15 @@ def _partition_shuffle(self, partition, partition_info=None): drop=True ) - if "filename" in partition: + if "file_name" in partition: filename = self.partition_to_filename(partition_num) - partition["filename"] = filename + partition["file_name"] = filename return partition def _add_filename(self, partition, partition_info=None): if partition_info is None: - return ["filename"] * len(partition) + return ["file_name"] * len(partition) filename = self.partition_to_filename(partition_info["number"]) diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py index 89b4415f7..7ff463a70 100644 --- a/nemo_curator/utils/distributed_utils.py +++ b/nemo_curator/utils/distributed_utils.py @@ -281,8 +281,8 @@ def select_columns( ) -> Union[dd.DataFrame, pd.DataFrame, "cudf.DataFrame"]: # We exclude parquet because the parquet readers already support column selection if filetype in ["jsonl", "json"] and columns is not None: - if add_filename and "filename" not in columns: - columns.append("filename") + if add_filename and "file_name" not in columns: + columns.append("file_name") df = df[columns] return df @@ -299,12 +299,12 @@ def read_single_partition( ) -> Union["cudf.DataFrame", pd.DataFrame]: """ This function reads a file with cuDF, sorts the columns of the DataFrame - and adds a "filename" column. + and adds a "file_name" column. Args: files: The path to the jsonl files to read. backend: The backend to use for reading the data. Either "cudf" or "pandas". - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. input_meta: A dictionary or a string formatted as a dictionary, which outlines the field names and their respective data types within the JSONL input file. columns: If not None, only these columns will be read from the file. @@ -368,7 +368,7 @@ def read_single_partition( for file in files: df = read_f(file, **read_kwargs, **kwargs) if add_filename: - df["filename"] = os.path.basename(file) + df["file_name"] = os.path.basename(file) df = select_columns(df, io_columns, filetype, add_filename) df_ls.append(df) @@ -429,7 +429,7 @@ def extract_filename(path: str) -> str: read_kwargs["include_path_column"] = add_filename read_kwargs["path_converter"] = extract_filename - postprocessing_func = lambda df: df.rename(columns={"path": "filename"}) + postprocessing_func = lambda df: df.rename(columns={"path": "file_name"}) elif file_type == "parquet": if backend == "cudf" and not DASK_CUDF_PARQUET_READ_INCONSISTENT_SCHEMA: @@ -509,7 +509,7 @@ def read_pandas_pickle( Args: file: The path to the pickle file to read. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. columns: If not None, only these columns will be read from the file. Returns: A Pandas DataFrame. 
@@ -543,7 +543,7 @@ def read_data( file_type: The type of the input file(s). backend: The backend to use for reading the data. files_per_partition: The number of files to read per partition. - add_filename: Whether to add a "filename" column to the DataFrame. + add_filename: Whether to add a "file_name" column to the DataFrame. input_meta: A dictionary or a string formatted as a dictionary, which outlines the field names and their respective data types within the JSONL input file. columns: If not None, only these columns will be read from the file. @@ -686,14 +686,14 @@ def single_partition_write_with_filename( Args: df: A DataFrame. output_file_dir: The output file path. - keep_filename_column: Boolean representing whether to keep or drop the "filename" column, if it exists. + keep_filename_column: Boolean representing whether to keep or drop the "file_name" column, if it exists. output_type: The type of output file to write. Can be "jsonl" or "parquet". Returns: If the DataFrame is non-empty, return a Series containing a single element, True. If the DataFrame is empty, return a Series containing a single element, False. """ - assert "filename" in df.columns + assert "file_name" in df.columns if len(df) > 0: empty_partition = False @@ -709,14 +709,14 @@ def single_partition_write_with_filename( success_ser = pd.Series([empty_partition]) if not empty_partition: - filenames = df.filename.unique() + filenames = df.file_name.unique() filenames = list(filenames.values_host) if is_cudf_type(df) else list(filenames) num_files = len(filenames) for filename in filenames: - out_df = df[df.filename == filename] if num_files > 1 else df + out_df = df[df.file_name == filename] if num_files > 1 else df if not keep_filename_column: - out_df = out_df.drop("filename", axis=1) + out_df = out_df.drop("file_name", axis=1) filename = ( Path(filename).stem if output_type != "bitext" else Path(filename).name @@ -831,13 +831,13 @@ def write_to_disk( """ This function writes a Dask DataFrame to the specified file path. If write_to_filename is True, then it expects the - DataFrame to have a "filename" column that specifies where to write the document. + DataFrame to have a "file_name" column that specifies where to write the document. Args: df: A Dask DataFrame. output_path: The output file path. - write_to_filename: Boolean representing whether to write the filename using the "filename" column. - keep_filename_column: Boolean representing whether to keep or drop the "filename" column, if it exists. + write_to_filename: Boolean representing whether to write the filename using the "file_name" column. + keep_filename_column: Boolean representing whether to keep or drop the "file_name" column, if it exists. output_type: The type of output file to write. Can be "jsonl" or "parquet". 
""" @@ -856,9 +856,9 @@ def write_to_disk( ) # output_path is a directory - elif write_to_filename and "filename" not in df.columns: + elif write_to_filename and "file_name" not in df.columns: raise ValueError( - "write_using_filename is True but no filename column found in DataFrame" + "write_using_filename is True but no file_name column found in DataFrame" ) if is_cudf_type(df): @@ -890,7 +890,7 @@ def write_to_disk( os.makedirs(output_path, exist_ok=True) tmp_output_file_dir = os.path.join(output_path, ".tmp") os.makedirs(tmp_output_file_dir, exist_ok=True) - file_name = os.path.basename(list(df.filename.unique())[0]) + file_name = os.path.basename(list(df.file_name.unique())[0]) else: tmp_output_file_dir = os.path.join(output_path, ".tmp") os.makedirs(tmp_output_file_dir, exist_ok=True) diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py index f1c957b47..4632346ae 100644 --- a/nemo_curator/utils/file_utils.py +++ b/nemo_curator/utils/file_utils.py @@ -300,7 +300,7 @@ def separate_by_metadata( Args: input_data: Either a DataFrame or a string representing the path to the input directory. - If a DataFrame is provided, it must have a 'filename' column for the shard. + If a DataFrame is provided, it must have a "file_name" column for the shard. output_dir: The base directory for which all metadata based subdirs will be created under metadata_field: The metadata field to split on remove_metadata: Whether to remove the metadata from the dataframe when saving it diff --git a/tests/test_io.py b/tests/test_io.py index 432c00b3d..546ccf27c 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -153,7 +153,7 @@ def test_multifile_single_partition( keep_filename_column, file_ext, ): - df = pd.DataFrame({"a": [1, 2, 3], "filename": ["file0", "file1", "file1"]}) + df = pd.DataFrame({"a": [1, 2, 3], "file_name": ["file0", "file1", "file1"]}) single_partition_write_with_filename( df=df, @@ -165,7 +165,7 @@ def test_multifile_single_partition( assert os.path.exists(tmp_path / f"file1.{file_ext}") if not keep_filename_column: - df = df.drop("filename", axis=1) + df = df.drop("file_name", axis=1) df1 = read_single_partition( files=[tmp_path / f"file0.{file_ext}"], backend="pandas", filetype=file_ext @@ -185,7 +185,7 @@ def test_singlefile_single_partition( keep_filename_column, file_ext, ): - df = pd.DataFrame({"a": [1, 2, 3], "filename": ["file2", "file2", "file2"]}) + df = pd.DataFrame({"a": [1, 2, 3], "file_name": ["file2", "file2", "file2"]}) single_partition_write_with_filename( df=df, @@ -197,14 +197,14 @@ def test_singlefile_single_partition( assert os.path.exists(tmp_path / f"file2.{file_ext}") if not keep_filename_column: - df = df.drop("filename", axis=1) + df = df.drop("file_name", axis=1) got = read_single_partition( files=[tmp_path / f"file2.{file_ext}"], backend="pandas", filetype=file_ext ) assert_eq(got, df) def test_multifile_single_partition_error(self, tmp_path): - df = pd.DataFrame({"a": [1, 2, 3], "filename": ["file0", "file1", "file1"]}) + df = pd.DataFrame({"a": [1, 2, 3], "file_name": ["file0", "file1", "file1"]}) with pytest.raises(ValueError, match="Unknown output type"): single_partition_write_with_filename( @@ -220,13 +220,13 @@ def test_multifile_single_partition_error(self, tmp_path): ], ) def test_multifile_multi_partition(self, tmp_path, file_ext, read_f): - df1 = pd.DataFrame({"a": [1, 2, 3], "filename": ["file1", "file2", "file2"]}) + df1 = pd.DataFrame({"a": [1, 2, 3], "file_name": ["file1", "file2", "file2"]}) df2 = df1.copy() - 
df2["filename"] = "file3" + df2["file_name"] = "file3" df3 = df1.copy() - df3["filename"] = ["file4", "file5", "file6"] + df3["file_name"] = ["file4", "file5", "file6"] ddf = dd.concat([df1, df2, df3]) - ddf["filename"] = ddf["filename"] + f".{file_ext}" + ddf["file_name"] = ddf["file_name"] + f".{file_ext}" write_to_disk( df=ddf, output_path=tmp_path / file_ext, diff --git a/tests/test_read_data.py b/tests/test_read_data.py index a619be3a4..29013479f 100644 --- a/tests/test_read_data.py +++ b/tests/test_read_data.py @@ -300,8 +300,8 @@ def test_read_data_blocksize_add_filename_jsonl(mock_multiple_jsonl_files, backe columns=None, ) - assert "filename" in df.columns - file_names = df["filename"].unique().compute() + assert "file_name" in df.columns + file_names = df["file_name"].unique().compute() if backend == "cudf": file_names = file_names.to_pandas() @@ -340,13 +340,7 @@ def test_read_data_blocksize_add_filename_parquet(mock_multiple_parquet_files, b pytest.param("cudf", "jsonl", marks=pytest.mark.gpu), pytest.param("cudf", "parquet", marks=pytest.mark.gpu), ("pandas", "jsonl"), - pytest.param( - "pandas", - "parquet", - marks=pytest.mark.xfail( - reason="filename column inaccessible with pandas backend and parquet" - ), - ), + ("pandas", "parquet"), ], ) def test_read_data_fpp_add_filename( @@ -369,8 +363,8 @@ def test_read_data_fpp_add_filename( ) assert list(df.columns) == list(df.head().columns) - assert set(df.columns) == {"filename", "id", "text"} - file_names = df["filename"].unique().compute() + assert set(df.columns) == {"file_name", "id", "text"} + file_names = df["file_name"].unique().compute() if backend == "cudf": file_names = file_names.to_pandas() @@ -445,7 +439,7 @@ def test_read_data_select_columns( if not add_filename: assert list(df.columns) == sorted(cols_to_select) else: - assert list(df.columns) == sorted(cols_to_select + ["filename"]) + assert list(df.columns) == sorted(cols_to_select + ["file_name"]) @pytest.mark.parametrize( diff --git a/tests/test_separate_by_metadata.py b/tests/test_separate_by_metadata.py index 020bf21d8..cd7c7147d 100644 --- a/tests/test_separate_by_metadata.py +++ b/tests/test_separate_by_metadata.py @@ -24,7 +24,7 @@ def _write_data(num_files, file_ext): dfs = [] for i in range(num_files): partition = df.copy() - partition["filename"] = f"f{i}.{file_ext}" + partition["file_name"] = f"f{i}.{file_ext}" dfs.append(partition) df = dd.concat(dfs) diff --git a/tests/test_shuffle.py b/tests/test_shuffle.py index a23d47906..2a4afc994 100644 --- a/tests/test_shuffle.py +++ b/tests/test_shuffle.py @@ -61,11 +61,11 @@ def test_filename(self): original_dataset = list_to_dataset( ["one", "two", "three", "four", "five"], npartitions=1 ) - original_dataset.df["filename"] = "original.jsonl" + original_dataset.df["file_name"] = "original.jsonl" expected_data = { "text": ["one", "two", "three", "five", "four"], - "filename": [ + "file_name": [ "file_0000000000.jsonl", "file_0000000000.jsonl", "file_0000000000.jsonl", @@ -86,11 +86,11 @@ def test_custom_filenames(self): original_dataset = list_to_dataset( ["one", "two", "three", "four", "five"], npartitions=1 ) - original_dataset.df["filename"] = "original.jsonl" + original_dataset.df["file_name"] = "original.jsonl" expected_data = { "text": ["one", "two", "three", "five", "four"], - "filename": [ + "file_name": [ "my_0.test", "my_0.test", "my_0.test", @@ -140,11 +140,11 @@ def test_filename(self): original_dataset = list_to_dataset( ["one", "two", "three", "four", "five"], npartitions=1 ) - 
original_dataset.df["filename"] = "original.jsonl" + original_dataset.df["file_name"] = "original.jsonl" expected_data = { "text": ["four", "five", "three", "one", "two"], - "filename": [ + "file_name": [ "file_0000000000.jsonl", "file_0000000001.jsonl", "file_0000000001.jsonl", @@ -163,11 +163,11 @@ def test_custom_filenames(self): original_dataset = list_to_dataset( ["one", "two", "three", "four", "five"], npartitions=1 ) - original_dataset.df["filename"] = "original.jsonl" + original_dataset.df["file_name"] = "original.jsonl" expected_data = { "text": ["four", "five", "three", "one", "two"], - "filename": [ + "file_name": [ "my_0.test", "my_1.test", "my_1.test", diff --git a/tutorials/peft-curation-with-sdg/docbuilder.py b/tutorials/peft-curation-with-sdg/docbuilder.py index f4892f70d..73365ec03 100644 --- a/tutorials/peft-curation-with-sdg/docbuilder.py +++ b/tutorials/peft-curation-with-sdg/docbuilder.py @@ -108,7 +108,7 @@ def iterate(self, file_path): id, extracted_content = extracted_content meta = { - "filename": file_name, + "file_name": file_name, "id": f"law-stackexchange-qa-{id}", } diff --git a/tutorials/peft-curation-with-sdg/synthetic_gen.py b/tutorials/peft-curation-with-sdg/synthetic_gen.py index f084e799d..7bbcf0f82 100644 --- a/tutorials/peft-curation-with-sdg/synthetic_gen.py +++ b/tutorials/peft-curation-with-sdg/synthetic_gen.py @@ -153,7 +153,7 @@ def _write_all_to_file(self, gen_entries, out_fp: str): synth_questions.extend(questions) synth_answers.extend(answers) synth_scores.extend(scores) - synth_filenames.extend([row["filename"] + ".synth"] * self.n_variants) + synth_filenames.extend([row["file_name"] + ".synth"] * self.n_variants) synth_ids.extend( [f"{row['id']}-synth-{i}" for i in range(self.n_variants)] ) @@ -169,7 +169,7 @@ def _write_all_to_file(self, gen_entries, out_fp: str): "question": synth_questions, "answer": synth_answers, "title": synth_titles, - "filename": synth_filenames, + "file_name": synth_filenames, "tags": synth_tags, # Use the same score for both the generated questions and answers. 
"question_score": synth_scores, diff --git a/tutorials/peft-curation/docbuilder.py b/tutorials/peft-curation/docbuilder.py index 3ae0840c9..7b635aa80 100644 --- a/tutorials/peft-curation/docbuilder.py +++ b/tutorials/peft-curation/docbuilder.py @@ -77,7 +77,7 @@ def iterate(self, file_path): self._counter += 1 content = email.group().strip('"').strip() meta = { - "filename": file_name, + "file_name": file_name, "id": f"email-{self._counter}", } extracted_content = self._extractor.extract(content) diff --git a/tutorials/tinystories/docbuilder.py b/tutorials/tinystories/docbuilder.py index 859fb35ab..9a9d9fe5e 100644 --- a/tutorials/tinystories/docbuilder.py +++ b/tutorials/tinystories/docbuilder.py @@ -71,7 +71,7 @@ def split_meta(example): self._counter += 1 content = " ".join(example) meta = { - "filename": file_name, + "file_name": file_name, "id": f"{file_name}-{self._counter}", } From d401333ec9d88c36494befc9ae7515574c4d89fb Mon Sep 17 00:00:00 2001 From: Praateek Mahajan Date: Mon, 30 Dec 2024 05:17:22 -0800 Subject: [PATCH 2/6] Support the new minhash 25.02 api (#445) Signed-off-by: Praateek --- nemo_curator/_compat.py | 13 +++++++---- nemo_curator/modules/fuzzy_dedup.py | 36 +++++++++++++++++++---------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/nemo_curator/_compat.py b/nemo_curator/_compat.py index 5de25ebd8..9125713dd 100644 --- a/nemo_curator/_compat.py +++ b/nemo_curator/_compat.py @@ -39,10 +39,15 @@ except (ImportError, TypeError): CURRENT_CUDF_VERSION = parse_version("24.10.0") -# TODO remove this once 24.12.0 becomes the base version of cudf in nemo-curator -MINHASH_PERMUTED_AVAILABLE = CURRENT_CUDF_VERSION >= parse_version("24.12.0") or ( - CURRENT_CUDF_VERSION.is_prerelease - and CURRENT_CUDF_VERSION.base_version >= "24.12.0" +# TODO remove this once 25.02 becomes the base version of cudf in nemo-curator + +# minhash in < 24.12 used to have a minhash(txt) api which was deprecated in favor of +# minhash(a, b) in 25.02 (in 24.12, minhash_permuted(a,b) was introduced) +MINHASH_DEPRECATED_API = ( + CURRENT_CUDF_VERSION.base_version < parse_version("24.12").base_version +) +MINHASH_PERMUTED_AVAILABLE = (CURRENT_CUDF_VERSION.major == 24) & ( + CURRENT_CUDF_VERSION.minor == 12 ) # TODO: remove when dask min version gets bumped diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index 2986a88f6..3afc4123a 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -35,7 +35,7 @@ from dask.utils import M from tqdm import tqdm -from nemo_curator._compat import MINHASH_PERMUTED_AVAILABLE +from nemo_curator._compat import MINHASH_DEPRECATED_API, MINHASH_PERMUTED_AVAILABLE from nemo_curator.datasets import DocumentDataset from nemo_curator.log import create_logger from nemo_curator.modules.config import FuzzyDuplicatesConfig @@ -98,15 +98,17 @@ def __init__( """ self.num_hashes = num_hashes self.char_ngram = char_ngrams - if MINHASH_PERMUTED_AVAILABLE: + if MINHASH_DEPRECATED_API: + self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed) + else: self.seeds = self.generate_hash_permutation_seeds( bit_width=64 if use_64bit_hash else 32, n_permutations=self.num_hashes, seed=seed, ) - else: - self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed) + self.minhash_method = self.minhash64 if use_64bit_hash else self.minhash32 + self.id_field = id_field self.text_field = text_field @@ -171,7 +173,7 @@ def minhash32( if not isinstance(ser, cudf.Series): raise TypeError("Expected 
data of type cudf.Series") - if not MINHASH_PERMUTED_AVAILABLE: + if MINHASH_DEPRECATED_API: warnings.warn( "Using an outdated minhash implementation, please update to cuDF version 24.12 " "or later for improved performance. " @@ -184,9 +186,14 @@ def minhash32( seeds_a = cudf.Series(seeds[:, 0], dtype="uint32") seeds_b = cudf.Series(seeds[:, 1], dtype="uint32") - return ser.str.minhash_permuted( - a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram - ) + if MINHASH_PERMUTED_AVAILABLE: + return ser.str.minhash_permuted( + a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram + ) + else: + return ser.str.minhash( + a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram + ) def minhash64( self, ser: cudf.Series, seeds: np.ndarray, char_ngram: int @@ -196,7 +203,7 @@ def minhash64( """ if not isinstance(ser, cudf.Series): raise TypeError("Expected data of type cudf.Series") - if not MINHASH_PERMUTED_AVAILABLE: + if MINHASH_DEPRECATED_API: warnings.warn( "Using an outdated minhash implementation, please update to cuDF version 24.12 " "or later for improved performance. " @@ -209,9 +216,14 @@ def minhash64( seeds_a = cudf.Series(seeds[:, 0], dtype="uint64") seeds_b = cudf.Series(seeds[:, 1], dtype="uint64") - return ser.str.minhash64_permuted( - a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram - ) + if MINHASH_PERMUTED_AVAILABLE: + return ser.str.minhash64_permuted( + a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram + ) + else: + return ser.str.minhash64( + a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram + ) def __call__(self, dataset: DocumentDataset) -> Union[str, DocumentDataset]: """ From db411b0480b0b84d481505a92553be12332191cf Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 2 Jan 2025 08:23:23 -0800 Subject: [PATCH 3/6] Reorder import (#460) Signed-off-by: Ryan Wolf --- nemo_curator/modules/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py index bc5659311..b792d807c 100644 --- a/nemo_curator/modules/__init__.py +++ b/nemo_curator/modules/__init__.py @@ -25,7 +25,6 @@ from .config import FuzzyDuplicatesConfig, SemDedupConfig from .dataset_ops import blend_datasets, Shuffle from .exact_dedup import ExactDuplicates -from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter from .meta import Sequential from .modify import Modify from .task import TaskDecontamination @@ -39,9 +38,7 @@ BucketsToEdges = gpu_only_import_from( "nemo_curator.modules.fuzzy_dedup", "BucketsToEdges" ) -# Pytorch related imports must come after all imports that require cugraph, -# because of context cleanup issues b/w pytorch and cugraph -# See this issue: https://github.com/rapidsai/cugraph/issues/2718 + SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup") EmbeddingCreator = gpu_only_import_from( "nemo_curator.modules.semantic_dedup", "EmbeddingCreator" @@ -52,6 +49,10 @@ SemanticClusterLevelDedup = gpu_only_import_from( "nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup" ) +# Pytorch related imports must come after all imports that require cugraph, +# because of context cleanup issues b/w pytorch and cugraph +# See this issue: https://github.com/rapidsai/cugraph/issues/2718 +from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter __all__ = [ "ExactDuplicates", From fd516fbdcd5e925aa4598d10406a64b890ad7e68 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Thu, 2 
Jan 2025 15:12:36 -0800 Subject: [PATCH 4/6] Pin CrossFit 0.0.8 (#464) Signed-off-by: Sarah Yurick --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 89e5ae150..329c2eec4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,7 @@ dependencies = [ "beautifulsoup4", "charset_normalizer>=3.1.0", "comment_parser", - # TODO: Pin CrossFit 0.0.8 when it is released - "crossfit @ git+https://github.com/rapidsai/crossfit.git@main", + "crossfit>=0.0.8", "dask-mpi>=2021.11.0", "dask[complete]>=2021.7.1", "datasets", From 98fb0bc6b19d312eebb988695c5cb40672fda0d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 3 Jan 2025 20:38:26 +0100 Subject: [PATCH 5/6] chore: update changelog 0.6.0 (#466) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: Update changelog Signed-off-by: oliver könig * ci: Bump release workflow Signed-off-by: oliver könig * Update release.yml Signed-off-by: oliver könig * docs: mention extended MNMG support Signed-off-by: oliver könig * bump Signed-off-by: oliver könig * remove note Signed-off-by: oliver könig --------- Signed-off-by: oliver könig --- .github/workflows/release.yml | 8 ++++++-- CHANGELOG.md | 12 ++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f9a53eb88..34a087ea0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,10 +25,13 @@ on: required: true default: true type: boolean - + version-bump-branch: + type: string + required: true + description: Branch to target for version bump jobs: release: - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.17.4 + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.4 with: release-ref: ${{ inputs.release-ref }} image-name: nemo_curator_container @@ -43,6 +46,7 @@ jobs: container-workdir: /opt/NeMo-Curator library-name: NeMo Curator dry-run: ${{ inputs.dry-run }} + version-bump-branch: ${{ inputs.version-bump-branch }} secrets: TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 4de64f094..7109b43d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## NeMo Curator 0.6.0 + +- Synthetic Data Generation for Text Retrieval + - LLM-based Filters + - Easiness + - Answerability + - Q&A Retrieval Generation Pipeline +- Parallel Dataset Curation for Machine Translation + - Load/Write Bitext Files + - Heuristic filtering (Histogram, Length Ratio) + - Classifier filtering (Comet, Cometoid) + ## NeMo Curator 0.5.0 ### Highlights From 7dfb21a86baffd3ed16a774cb86da0daa2eec140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 3 Jan 2025 20:44:21 +0100 Subject: [PATCH 6/6] fix: Generate version number string (#467) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_curator/package_info.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_curator/package_info.py b/nemo_curator/package_info.py index f9ca1dbd2..65c7ae069 100644 --- a/nemo_curator/package_info.py +++ b/nemo_curator/package_info.py @@ -23,7 +23,13 @@ VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV) __shortversion__ = ".".join(map(str, VERSION[:3])) -__version__ = __shortversion__ + VERSION[3] + "." 
+ ".".join(VERSION[4:]) +__version__ = __shortversion__ + +if VERSION[3] != "": + __version__ = __version__ + VERSION[3] + +if VERSION[4] != "": + __version__ = __version__ + "." + ".".join(VERSION[4:]) __package_name__ = "nemo_curator" __contact_names__ = "NVIDIA"