Merge branch 'main' into cleanup_run_classifier_helper
sarahyurick authored Jan 3, 2025
2 parents 498910d + 7dfb21a commit 5377c19
Showing 24 changed files with 128 additions and 95 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/release.yml
@@ -25,10 +25,13 @@ on:
        required: true
        default: true
        type: boolean
+      version-bump-branch:
+        type: string
+        required: true
+        description: Branch to target for version bump
jobs:
  release:
-    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.17.4
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.4
    with:
      release-ref: ${{ inputs.release-ref }}
      image-name: nemo_curator_container
@@ -43,6 +46,7 @@ jobs:
      container-workdir: /opt/NeMo-Curator
      library-name: NeMo Curator
      dry-run: ${{ inputs.dry-run }}
+      version-bump-branch: ${{ inputs.version-bump-branch }}
    secrets:
      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,17 @@
# Changelog

+## NeMo Curator 0.6.0
+
+- Synthetic Data Generation for Text Retrieval
+  - LLM-based Filters
+    - Easiness
+    - Answerability
+  - Q&A Retrieval Generation Pipeline
+- Parallel Dataset Curation for Machine Translation
+  - Load/Write Bitext Files
+  - Heuristic filtering (Histogram, Length Ratio)
+  - Classifier filtering (Comet, Cometoid)
+
## NeMo Curator 0.5.0

### Highlights
13 changes: 9 additions & 4 deletions nemo_curator/_compat.py
@@ -39,10 +39,15 @@
except (ImportError, TypeError):
    CURRENT_CUDF_VERSION = parse_version("24.10.0")

-# TODO remove this once 24.12.0 becomes the base version of cudf in nemo-curator
-MINHASH_PERMUTED_AVAILABLE = CURRENT_CUDF_VERSION >= parse_version("24.12.0") or (
-    CURRENT_CUDF_VERSION.is_prerelease
-    and CURRENT_CUDF_VERSION.base_version >= "24.12.0"
+# TODO remove this once 25.02 becomes the base version of cudf in nemo-curator
+
+# minhash in < 24.12 used to have a minhash(txt) api which was deprecated in favor of
+# minhash(a, b) in 25.02 (in 24.12, minhash_permuted(a, b) was introduced)
+MINHASH_DEPRECATED_API = (
+    CURRENT_CUDF_VERSION.base_version < parse_version("24.12").base_version
)
+MINHASH_PERMUTED_AVAILABLE = (CURRENT_CUDF_VERSION.major == 24) & (
+    CURRENT_CUDF_VERSION.minor == 12
+)

# TODO: remove when dask min version gets bumped
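To make the new gates concrete, here is a self-contained sketch that re-runs the same two comparisons against a handful of hypothetical cuDF version strings (the list is illustrative, not from the commit; only the packaging library is required). Note that the base_version comparison is lexicographic on strings, which holds up for cuDF's calendar-style versions:

from packaging.version import parse as parse_version

# Hypothetical installed cuDF versions, for illustration only.
for candidate in ["24.10.0", "24.12.0a1", "24.12.1", "25.02.0"]:
    current = parse_version(candidate)
    # Below a 24.12 base version, only the deprecated minhash(txt) API exists.
    minhash_deprecated_api = (
        current.base_version < parse_version("24.12").base_version
    )
    # minhash_permuted(a, b) shipped only in the 24.12 series.
    minhash_permuted_available = (current.major == 24) and (current.minor == 12)
    print(candidate, minhash_deprecated_api, minhash_permuted_available)

# Prints, in order: True/False, False/True, False/True, False/False.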
6 changes: 3 additions & 3 deletions nemo_curator/datasets/doc_dataset.py
@@ -64,7 +64,7 @@ def read_json(
            input_files: The path of the input file(s).
            backend: The backend to use for reading the data.
            files_per_partition: The number of files to read per partition.
-            add_filename: Whether to add a "filename" column to the DataFrame.
+            add_filename: Whether to add a "file_name" column to the DataFrame.
            input_meta: A dictionary or a string formatted as a dictionary, which outlines
                the field names and their respective data types within the JSONL input file.
            columns: If not None, only these columns will be read from the file.
@@ -102,7 +102,7 @@ def read_parquet(
            input_files: The path of the input file(s).
            backend: The backend to use for reading the data.
            files_per_partition: The number of files to read per partition.
-            add_filename: Whether to add a "filename" column to the DataFrame.
+            add_filename: Whether to add a "file_name" column to the DataFrame.
            columns: If not None, only these columns will be read from the file.
                There is a significant performance gain when specifying columns for Parquet files.
@@ -135,7 +135,7 @@ def read_pickle(
            input_files: The path of the input file(s).
            backend: The backend to use for reading the data.
            files_per_partition: The number of files to read per partition.
-            add_filename: Whether to add a "filename" column to the DataFrame.
+            add_filename: Whether to add a "file_name" column to the DataFrame.
            columns: If not None, only these columns will be read from the file.
        """
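Since several readers gain the renamed column, a short usage sketch may help; the path below is a placeholder, and the pandas backend is chosen only to keep the example CPU-friendly:

from nemo_curator.datasets import DocumentDataset

# Placeholder path; any directory of JSONL files works the same way.
dataset = DocumentDataset.read_json(
    "/path/to/jsonl/dir",
    backend="pandas",
    add_filename=True,  # now adds a "file_name" column (was "filename")
)
print(dataset.df.columns)  # expect "file_name" among the columns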
4 changes: 2 additions & 2 deletions nemo_curator/datasets/parallel_dataset.py
@@ -129,7 +129,7 @@ def read_single_simple_bitext_file_pair(
            tgt_lang (str): Target language, in ISO-639-1 (two character) format (e.g. 'en')
            doc_id (str, optional): A string document id to assign to every segment in the file. Defaults to None.
            backend (str, optional): Backend of the data frame. Defaults to "cudf".
-            add_filename (bool, optional): Add filename as an extra field to every segment in the file. Defaults to False.
+            add_filename (bool, optional): Add "file_name" as an extra field to every segment in the file. Defaults to False.
        Returns:
            Union[dd.DataFrame, dask_cudf.DataFrame]
@@ -162,6 +162,6 @@ def read_single_simple_bitext_file_pair(
        df_combined["tgt_lang"] = tgt_lang

        if add_filename:
-            df_combined["filename"] = remove_path_extension(src_input_file)
+            df_combined["file_name"] = remove_path_extension(src_input_file)

        return df_combined
2 changes: 1 addition & 1 deletion nemo_curator/download/arxiv.py
@@ -403,7 +403,7 @@ def download_arxiv(
        "text": str,
        "id": str,
        "source_id": str,
-        "filename": str,
+        "file_name": str,
    }
    dataset = download_and_extract(
        arxiv_urls,
2 changes: 1 addition & 1 deletion nemo_curator/download/commoncrawl.py
@@ -430,7 +430,7 @@ def download_common_crawl(
        "url": str,
        "warc_id": str,
        "source_id": str,
-        "filename": str,
+        "file_name": str,
    }
    dataset = download_and_extract(
        common_crawl_urls,
2 changes: 1 addition & 1 deletion nemo_curator/download/doc_builder.py
@@ -141,7 +141,7 @@ def _download_and_extract_single_partition(
    partition = pd.DataFrame(records)
    filename = os.path.basename(output_path)
    output_dir = os.path.dirname(output_path)
-    partition["filename"] = filename
+    partition["file_name"] = filename
    single_partition_write_with_filename(partition, output_dir, output_type=output_type)
    if not keep_raw_download:
        os.remove(downloaded_file)
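To illustrate the convention this helper follows, here is a hedged sketch: a pandas partition carries a "file_name" column, and the writer uses it to name the emitted file. The import path for single_partition_write_with_filename is assumed from current NeMo Curator releases, and the shard name is a placeholder:

import pandas as pd

from nemo_curator.utils.distributed_utils import single_partition_write_with_filename

partition = pd.DataFrame({"text": ["doc one", "doc two"]})
partition["file_name"] = "shard_00000.jsonl"  # placeholder output name

# Writes the partition under /tmp/out, named after the "file_name" column.
single_partition_write_with_filename(partition, "/tmp/out", output_type="jsonl")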
2 changes: 1 addition & 1 deletion nemo_curator/download/wikipedia.py
@@ -799,7 +799,7 @@ def download_wikipedia(
        "url": str,
        "language": str,
        "source_id": str,
-        "filename": str,
+        "file_name": str,
    }
    dataset = download_and_extract(
        wikipedia_urls,
9 changes: 5 additions & 4 deletions nemo_curator/modules/__init__.py
@@ -25,7 +25,6 @@
from .config import FuzzyDuplicatesConfig, SemDedupConfig
from .dataset_ops import blend_datasets, Shuffle
from .exact_dedup import ExactDuplicates
-from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter
from .meta import Sequential
from .modify import Modify
from .task import TaskDecontamination
@@ -39,9 +38,7 @@
BucketsToEdges = gpu_only_import_from(
    "nemo_curator.modules.fuzzy_dedup", "BucketsToEdges"
)
-# Pytorch related imports must come after all imports that require cugraph,
-# because of context cleanup issues b/w pytorch and cugraph
-# See this issue: https://github.com/rapidsai/cugraph/issues/2718
+
SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup")
EmbeddingCreator = gpu_only_import_from(
    "nemo_curator.modules.semantic_dedup", "EmbeddingCreator"
@@ -52,6 +49,10 @@
SemanticClusterLevelDedup = gpu_only_import_from(
    "nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup"
)
+# Pytorch related imports must come after all imports that require cugraph,
+# because of context cleanup issues b/w pytorch and cugraph
+# See this issue: https://github.com/rapidsai/cugraph/issues/2718
+from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter

__all__ = [
    "ExactDuplicates",
12 changes: 6 additions & 6 deletions nemo_curator/modules/dataset_ops.py
@@ -19,7 +19,7 @@ def __init__(
        partition_to_filename: Callable[[int], str] = default_filename,
    ) -> None:
        """
-        Randomly permutes the dataset. This will make the original "filename" column invalid, so if the column is present it will be overwritten.
+        Randomly permutes the dataset. This will make the original "file_name" column invalid, so if the column is present it will be overwritten.

        Args:
            seed: The random seed that will be used to determine which partition (file) each datapoint goes to.
                Setting the seed will guarantee determinism, but may be slightly slower (20-30% slower)
@@ -52,8 +52,8 @@ def shuffle_deterministic(self, dataset: DocumentDataset) -> DocumentDataset:
        shuffled_df = dataset.df.set_index(self.rand_col, npartitions=new_npartitions)
        shuffled_df = shuffled_df.reset_index(drop=True)

-        if "filename" in shuffled_df:
-            shuffled_df["filename"] = shuffled_df.map_partitions(self._add_filename)
+        if "file_name" in shuffled_df:
+            shuffled_df["file_name"] = shuffled_df.map_partitions(self._add_filename)

        return DocumentDataset(shuffled_df)

@@ -98,15 +98,15 @@ def _partition_shuffle(self, partition, partition_info=None):
            drop=True
        )

-        if "filename" in partition:
+        if "file_name" in partition:
            filename = self.partition_to_filename(partition_num)
-            partition["filename"] = filename
+            partition["file_name"] = filename

        return partition

    def _add_filename(self, partition, partition_info=None):
        if partition_info is None:
-            return ["filename"] * len(partition)
+            return ["file_name"] * len(partition)

        filename = self.partition_to_filename(partition_info["number"])

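A brief usage sketch of the module whose internals changed above; the input path is a placeholder, and the seed is optional (documented as costing roughly 20-30% extra for determinism):

from nemo_curator import Shuffle
from nemo_curator.datasets import DocumentDataset

dataset = DocumentDataset.read_json("/path/to/jsonl/dir", add_filename=True)

# Any pre-existing "file_name" column is overwritten to match the partition
# each document lands in; partition_to_filename can customize the new names.
shuffle = Shuffle(seed=42)
shuffled = shuffle(dataset)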
36 changes: 24 additions & 12 deletions nemo_curator/modules/fuzzy_dedup.py
@@ -35,7 +35,7 @@
from dask.utils import M
from tqdm import tqdm

-from nemo_curator._compat import MINHASH_PERMUTED_AVAILABLE
+from nemo_curator._compat import MINHASH_DEPRECATED_API, MINHASH_PERMUTED_AVAILABLE
from nemo_curator.datasets import DocumentDataset
from nemo_curator.log import create_logger
from nemo_curator.modules.config import FuzzyDuplicatesConfig
@@ -98,15 +98,17 @@ def __init__(
        """
        self.num_hashes = num_hashes
        self.char_ngram = char_ngrams
-        if MINHASH_PERMUTED_AVAILABLE:
+        if MINHASH_DEPRECATED_API:
+            self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed)
+        else:
            self.seeds = self.generate_hash_permutation_seeds(
                bit_width=64 if use_64bit_hash else 32,
                n_permutations=self.num_hashes,
                seed=seed,
            )
-        else:
-            self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed)

        self.minhash_method = self.minhash64 if use_64bit_hash else self.minhash32

        self.id_field = id_field
        self.text_field = text_field

@@ -171,7 +173,7 @@ def minhash32(
        if not isinstance(ser, cudf.Series):
            raise TypeError("Expected data of type cudf.Series")

-        if not MINHASH_PERMUTED_AVAILABLE:
+        if MINHASH_DEPRECATED_API:
            warnings.warn(
                "Using an outdated minhash implementation, please update to cuDF version 24.12 "
                "or later for improved performance. "
@@ -184,9 +186,14 @@
        seeds_a = cudf.Series(seeds[:, 0], dtype="uint32")
        seeds_b = cudf.Series(seeds[:, 1], dtype="uint32")

-        return ser.str.minhash_permuted(
-            a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
-        )
+        if MINHASH_PERMUTED_AVAILABLE:
+            return ser.str.minhash_permuted(
+                a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
+            )
+        else:
+            return ser.str.minhash(
+                a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
+            )

    def minhash64(
        self, ser: cudf.Series, seeds: np.ndarray, char_ngram: int
@@ -196,7 +203,7 @@ def minhash64(
        """
        if not isinstance(ser, cudf.Series):
            raise TypeError("Expected data of type cudf.Series")
-        if not MINHASH_PERMUTED_AVAILABLE:
+        if MINHASH_DEPRECATED_API:
            warnings.warn(
                "Using an outdated minhash implementation, please update to cuDF version 24.12 "
                "or later for improved performance. "
@@ -209,9 +216,14 @@
        seeds_a = cudf.Series(seeds[:, 0], dtype="uint64")
        seeds_b = cudf.Series(seeds[:, 1], dtype="uint64")

-        return ser.str.minhash64_permuted(
-            a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
-        )
+        if MINHASH_PERMUTED_AVAILABLE:
+            return ser.str.minhash64_permuted(
+                a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
+            )
+        else:
+            return ser.str.minhash64(
+                a=seeds_a, b=seeds_b, seed=seeds[0][0], width=char_ngram
+            )

    def __call__(self, dataset: DocumentDataset) -> Union[str, DocumentDataset]:
        """
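These gated branches are exercised through the MinHash module; a hedged usage sketch follows (GPU-only, requiring cudf and dask-cudf; the parameter values here are illustrative, not prescriptions):

from nemo_curator import MinHash
from nemo_curator.datasets import DocumentDataset

dataset = DocumentDataset.read_json("/path/to/jsonl/dir", backend="cudf")

# Internally dispatches to minhash/minhash_permuted (or the deprecated
# single-seed API) based on the flags imported from nemo_curator._compat.
minhasher = MinHash(
    seed=42,
    num_hashes=260,
    char_ngrams=5,
    use_64bit_hash=False,
    id_field="id",
    text_field="text",
)
signatures = minhasher(dataset)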
8 changes: 7 additions & 1 deletion nemo_curator/package_info.py
@@ -23,7 +23,13 @@
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV)

__shortversion__ = ".".join(map(str, VERSION[:3]))
-__version__ = __shortversion__ + VERSION[3] + "." + ".".join(VERSION[4:])
+__version__ = __shortversion__
+
+if VERSION[3] != "":
+    __version__ = __version__ + VERSION[3]
+
+if VERSION[4] != "":
+    __version__ = __version__ + "." + ".".join(VERSION[4:])

__package_name__ = "nemo_curator"
__contact_names__ = "NVIDIA"
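The motivation is easiest to see with concrete tuples: the old one-liner appended "." + ".".join(VERSION[4:]) unconditionally, leaving a trailing dot whenever DEV was empty. A small sketch with hypothetical values, mirroring (not importing) the module logic:

def build_version(version):
    # Mirror of the assembly above for a (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV) tuple.
    full = ".".join(map(str, version[:3]))
    if version[3] != "":
        full = full + version[3]
    if version[4] != "":
        full = full + "." + ".".join(version[4:])
    return full

assert build_version((0, 6, 0, "", "")) == "0.6.0"  # no trailing "." anymore
assert build_version((0, 6, 0, "rc0", "")) == "0.6.0rc0"
assert build_version((0, 7, 0, "rc0", "dev0")) == "0.7.0rc0.dev0"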