From 72230e220a08b7450719dc3de20d85b5408803bf Mon Sep 17 00:00:00 2001
From: Ryan Wolf
Date: Tue, 26 Mar 2024 10:47:37 -0700
Subject: [PATCH] Add citation
Signed-off-by: Ryan Wolf
---
.github/workflows/test.yml | 42 +
.pre-commit-config.yaml | 47 +
.style.yapf | 3 -
CITATION.cff | 25 +
CONTRIBUTING.md | 2 +-
README.md | 12 +-
SECURITY.md | 2 +-
config/arxiv_builder.yaml | 4 +-
config/cc_warc_builder.yaml | 2 +-
config/heuristic_filter_code.yaml | 2 +-
config/heuristic_filter_en.yaml | 18 +-
config/heuristic_filter_non-en.yaml | 18 +-
config/lm_tasks.yaml | 2 +-
config/pii_config.yaml | 2 +-
config/wikipedia_builder.yaml | 2 +-
conftest.py | 15 +
docs/user-guide/CPUvsGPU.rst | 2 +-
.../DistributedDataClassification.rst | 4 +-
docs/user-guide/DocumentDataset.rst | 8 +-
docs/user-guide/Download.rst | 29 +-
docs/user-guide/GpuDeduplication.rst | 11 +-
...anguageIdentificationUnicodeFormatting.rst | 6 +-
...bleInformationIdentificationAndRemoval.rst | 1 -
docs/user-guide/QualityFiltering.rst | 20 +-
docs/user-guide/TaskDecontamination.rst | 2 +-
docs/user-guide/index.rst | 6 +-
examples/classifier_filtering.py | 42 +-
.../domain_api_example.py | 16 +-
.../quality_api_example.py | 12 +-
examples/download_arxiv.py | 10 +-
examples/download_common_crawl.py | 14 +-
examples/download_wikipedia.py | 14 +-
examples/exact_deduplication.py | 9 +-
examples/find_pii_and_deidentify.py | 17 +-
.../identify_languages_and_fix_unicode.py | 36 +-
examples/raw_download_common_crawl.py | 16 +-
examples/slurm/container-entrypoint.sh | 2 +-
examples/slurm/start-slurm.sh | 2 +-
examples/task_decontamination.py | 49 +-
nemo_curator/__init__.py | 2 +-
nemo_curator/datasets/__init__.py | 2 +-
nemo_curator/datasets/doc_dataset.py | 35 +-
.../__init__.py | 2 +-
.../generate_statistics.py | 14 +-
nemo_curator/download/__init__.py | 52 +-
nemo_curator/download/arxiv.py | 728 ++++++------
nemo_curator/download/commoncrawl.py | 516 +++++----
nemo_curator/download/doc_builder.py | 283 +++--
nemo_curator/download/wikipedia.py | 476 ++++----
nemo_curator/filters/__init__.py | 81 +-
nemo_curator/filters/classifier_filter.py | 193 ++--
nemo_curator/filters/code.py | 483 ++++----
nemo_curator/filters/doc_filter.py | 82 +-
nemo_curator/filters/heuristic_filter.py | 1013 ++++++++---------
.../gpu_deduplication/connected_component.py | 414 ++++---
nemo_curator/gpu_deduplication/ioutils.py | 112 +-
.../gpu_deduplication/jaccard_compute.py | 192 ++--
.../gpu_deduplication/jaccard_map_buckets.py | 213 ++--
.../gpu_deduplication/jaccard_shuffle.py | 574 +++++-----
.../jaccard_utils/__init__.py | 2 +-
.../jaccard_utils/batch_shuffle_utils.py | 4 +-
.../jaccard_utils/doc_id_mapping.py | 1 +
.../jaccard_utils/get_anchor_utils.py | 1 +
.../jaccard_utils/get_output_map_utils.py | 1 +
.../jaccard_utils/io_utils.py | 5 +-
.../gpu_deduplication/prepare_fuzzy_ids.py | 112 +-
nemo_curator/gpu_deduplication/utils.py | 324 +++---
.../verify_all_pairs_jaccard.py | 222 ++--
.../write_deduped_result_with_text.py | 83 +-
nemo_curator/log.py | 91 +-
nemo_curator/modifiers/__init__.py | 9 +-
nemo_curator/modifiers/c4.py | 117 +-
nemo_curator/modifiers/doc_modifier.py | 4 +-
nemo_curator/modifiers/fasttext.py | 5 +-
nemo_curator/modifiers/pii_modifier.py | 49 +-
nemo_curator/modifiers/unicode_reformatter.py | 6 +-
nemo_curator/modules/__init__.py | 2 +-
nemo_curator/modules/add_id.py | 23 +-
.../modules/distributed_data_classifier.py | 14 +-
nemo_curator/modules/exact_dedup.py | 4 +-
nemo_curator/modules/filter.py | 194 ++--
nemo_curator/modules/fuzzy_dedup.py | 5 +-
nemo_curator/modules/meta.py | 5 +-
nemo_curator/modules/modify.py | 11 +-
nemo_curator/modules/task.py | 146 ++-
nemo_curator/pii/algorithm.py | 131 +--
.../pii/custom_batch_analyzer_engine.py | 21 +-
nemo_curator/pii/custom_nlp_engine.py | 20 +-
nemo_curator/sample_dataframe.py | 8 +-
nemo_curator/scripts/__init__.py | 2 +-
nemo_curator/scripts/add_id.py | 190 ++--
nemo_curator/scripts/connected_components.py | 8 +-
nemo_curator/scripts/download_and_extract.py | 228 ++--
nemo_curator/scripts/filter_documents.py | 482 ++++----
nemo_curator/scripts/find_exact_duplicates.py | 158 +--
nemo_curator/scripts/find_matching_ngrams.py | 152 +--
.../scripts/find_pii_and_deidentify.py | 82 +-
nemo_curator/scripts/get_common_crawl_urls.py | 135 ++-
nemo_curator/scripts/get_wikipedia_urls.py | 66 +-
nemo_curator/scripts/jaccard_compute.py | 13 +-
nemo_curator/scripts/jaccard_shuffle.py | 11 +-
nemo_curator/scripts/make_data_shards.py | 96 +-
nemo_curator/scripts/map_buckets.py | 8 +-
nemo_curator/scripts/minhash_lsh.py | 6 +-
.../scripts/prepare_fasttext_training_data.py | 184 +--
nemo_curator/scripts/prepare_task_data.py | 87 +-
.../scripts/remove_matching_ngrams.py | 272 ++---
nemo_curator/scripts/separate_by_metadata.py | 182 +--
nemo_curator/scripts/text_cleaning.py | 150 +--
nemo_curator/scripts/train_fasttext.py | 342 +++---
nemo_curator/tasks/__init__.py | 64 +-
nemo_curator/tasks/downstream_task.py | 62 +-
nemo_curator/tasks/metrics.py | 794 ++++++-------
nemo_curator/utils/__init__.py | 2 +-
nemo_curator/utils/code_meta.csv | 2 +-
nemo_curator/utils/config_utils.py | 114 +-
nemo_curator/utils/constants.py | 50 +-
nemo_curator/utils/distributed_utils.py | 20 +-
nemo_curator/utils/download_utils.py | 198 ++--
nemo_curator/utils/file_utils.py | 94 +-
.../utils/fuzzy_dedup_utils/__init__.py | 2 +-
.../utils/fuzzy_dedup_utils/id_mapping.py | 1 +
.../utils/fuzzy_dedup_utils/io_utils.py | 4 +-
.../fuzzy_dedup_utils/output_map_utils.py | 6 +-
.../utils/fuzzy_dedup_utils/shuffle_utils.py | 4 +-
nemo_curator/utils/gpu_utils.py | 2 +-
nemo_curator/utils/script_utils.py | 3 +-
nemo_curator/utils/text_utils.py | 247 ++--
pyproject.toml | 27 +
pytest.ini | 4 -
setup.py | 146 +--
tests/__init__.py | 2 +-
tests/pii_data/address.txt | 2 +-
tests/pii_data/birthdates.txt | 2 +-
tests/pii_data/card_no.txt | 2 +-
tests/pii_data/emails.txt | 2 +-
tests/pii_data/ip_address.txt | 2 +-
tests/pii_data/multiple.txt | 2 +-
tests/pii_data/names.txt | 2 +-
tests/pii_data/phone_numbers.txt | 2 +-
tests/pii_data/ssn.txt | 2 +-
tests/test_add_id.py | 78 +-
tests/test_exact_dedup.py | 1 +
tests/test_filters.py | 469 ++++++--
tests/test_pii_accuracy.py | 13 +-
tests/test_task_decontamination.py | 319 ++++--
tests/test_unicode_reformatter.py | 26 +-
tutorials/tinystories/README.md | 13 +
tutorials/tinystories/helpers.py | 5 +-
tutorials/tinystories/main.py | 5 +-
150 files changed, 7353 insertions(+), 5909 deletions(-)
create mode 100644 .github/workflows/test.yml
create mode 100644 .pre-commit-config.yaml
delete mode 100644 .style.yapf
create mode 100644 CITATION.cff
create mode 100644 conftest.py
create mode 100644 pyproject.toml
delete mode 100644 pytest.ini
create mode 100644 tutorials/tinystories/README.md
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 000000000..58442c7b9
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,42 @@
+name: Test Python package
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ workflow_dispatch:
+
+# When this workflow is queued, automatically cancel any previous running
+# or pending jobs from the same branch
+concurrency:
+ group: test-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ build_and_test:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest]
+ python-version: ["3.10"]
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install NeMo-Curator and pytest
+ # TODO: Remove pytest when optional test dependencies are added to setup.py
+
+ # Installing wheel beforehand due to fasttext issue:
+ # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
+ # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
+ run: |
+ pip install wheel cython
+ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com .
+ pip install pytest
+ - name: Run tests
+ # TODO: Remove env variable when gpu dependencies are optional
+ run: |
+ RAPIDS_NO_INITIALIZE=1 python -m pytest -v --cpu
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..6f550fc95
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,47 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+default_language_version:
+ python: python3
+
+ci:
+ autofix_prs: true
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+ autoupdate_schedule: quarterly
+
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: check-added-large-files
+ args: ['--maxkb=1000']
+ - id: check-case-conflict
+ - id: check-yaml
+ - id: detect-private-key
+ - id: end-of-file-fixer
+ - id: requirements-txt-fixer
+ - id: trailing-whitespace
+
+ - repo: https://github.com/psf/black
+ rev: 24.3.0
+ hooks:
+ - id: black
+ name: Format code
+
+ - repo: https://github.com/PyCQA/isort
+ rev: 5.13.2
+ hooks:
+ - id: isort
+ name: Format imports
+ exclude: docs/
diff --git a/.style.yapf b/.style.yapf
deleted file mode 100644
index 4861cafe6..000000000
--- a/.style.yapf
+++ /dev/null
@@ -1,3 +0,0 @@
-[style]
-based_on_style = google
-indent_width = 2
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 000000000..4aa0bbce0
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,25 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+title: "NeMo-Curator: a toolkit for data curation"
+repository-code: https://github.com/NVIDIA/NeMo-Curator
+authors:
+ - family-names: Jennings
+ given-names: Joseph
+ - family-names: Patwary
+ given-names: Mostofa
+ - family-names: Subramanian
+ given-names: Sandeep
+ - family-names: Prabhumoye
+ given-names: Shrimai
+ - family-names: Dattagupta
+ given-names: Ayush
+ - family-names: Jawa
+ given-names: Vibhu
+ - family-names: Liu
+ given-names: Jiwei
+ - family-names: Wolf
+ given-names: Ryan
+ - family-names: Yurick
+ given-names: Sarah
+ - family-names: Singh
+ given-names: Varun
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3bd14fc1e..b8ba733ff 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -52,7 +52,7 @@ We use ``black`` as our style guide. To fix your format run `pip install pre-com
1. Minimize the use of ``**kwargs``.
1. ``RaiseError`` is preferred to ``assert``. Write: ```if X: raise Error``` instead of ```assert X```.
1. Classes are preferred to standalone methods.
-1. Methods should be atomic. A method shouldn't be longer than 75 lines, e.g. can be fit into the computer screen without scrolling.
+1. Methods should be atomic. A method shouldn't be longer than 88 lines, i.e. it should fit on the computer screen without scrolling.
1. If a method has arguments that don't fit into one line, each argument should be in its own line for readability.
1. Add ``__init__.py`` for every folder.
1. F-strings are prefered to formatted strings.
diff --git a/README.md b/README.md
index 60911adec..6eec31385 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ We currently support the following data-curation modules. For more details on ea
- [Text reformatting and cleaning](docs/user-guide/LanguageIdentificationUnicodeFormatting.rst)
- Fix unicode decoding errors via [ftfy](https://ftfy.readthedocs.io/en/latest/)
- [Quality filtering](docs/user-guide/QualityFiltering.rst)
- - Multilingual heuristic-based filtering
+ - Multilingual heuristic-based filtering
- Classifier-based filtering via [fastText](https://fasttext.cc/)
- [Document-level deduplication](docs/user-guide/GpuDeduplication.rst)
- Both exact and fuzzy deduplication are accelerated using cuDF and Dask.
@@ -43,7 +43,9 @@ NeMo Curator can be installed manually by cloning the repository and installing
```
pip install --extra-index-url https://pypi.nvidia.com .
```
-NeMo Curator is available in the [NeMo Framework Container](https://registry.ngc.nvidia.com/orgs/ea-bignlp/teams/ga-participants/containers/nemofw-training) which can be applied for [here](https://developer.nvidia.com/nemo-framework). It comes preinstalled in the container.
+### NeMo Framework Container
+
+NeMo Curator is available in the [NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo). The NeMo Framework Container provides an end-to-end platform for development of custom generative AI models anywhere. The latest release of NeMo Curator comes preinstalled in the container.
## Usage
@@ -77,7 +79,7 @@ Note: This is not the only way to run NeMo Curator on SLURM. There are example s
## Module Ablation and Compute Performance
-The modules within NeMo Curator were in large part designed to curate high-quality documents from Common Crawl snapshots and to be able to do so
+The modules within NeMo Curator were in large part designed to curate high-quality documents from Common Crawl snapshots and to be able to do so
in a scalable manner. In order to assess the quality of the Common Crawl documents curated by the modules in NeMo Curator, we performed a series
of ablation experiments in which we trained a 357M-parameter GPT-style model on the datasets resulting from the different stages of our data curation
pipeline implemented in NeMo Curator. The figure below demonstrates that the different data curation modules implemented within NeMo Curator
@@ -87,7 +89,7 @@ lead to improved model zero-shot downstream task performance.
-In terms of scalability and compute performance, using the RAPIDS + Dask fuzzy deduplication, we are able to deduplicate the 1.1 Trillion token Red Pajama dataset in 1.8 hours using 64 A100s.
+In terms of scalability and compute performance, using the RAPIDS + Dask fuzzy deduplication, we are able to deduplicate the 1.1 Trillion token Red Pajama dataset in 1.8 hours using 64 A100s.
Additionally, using the CPU-based modules the table below shows the time required and resulting data size reduction of each step of processing the [Common Crawl snapshot from November/December of 2020](https://commoncrawl.org/2020/12/nov-dec-2020-crawl-archive-now-available/) using 30 CPU nodes (with hardware similar to the `c5.24xlarge` [Amazon AWS C5 instance](https://aws.amazon.com/ec2/instance-types/c5/)):
@@ -126,4 +128,4 @@ Additionally, using the CPU-based modules the table below shows the time require
As mentioned above, the modules within NeMo Curator enable users to scale data-mining and NLP processing tasks to many nodes within a compute cluster.
The modules accomplish this using [Dask](https://www.dask.org/) with [cuDF](https://docs.rapids.ai/api/cudf/nightly/user_guide/10min/) (for the GPU-accelerated modules).
-At the core of the NeMo Curator, `DocumentDataset` (the main dataset class) is just a simple wrapper around a Dask dataframe. Dask allows NeMo Curator to scale to arbitrary cluster sizes, and it supports a variety of distributed computing platforms. It supports reading and writing to different file formats, and it can balance these operations among nodes in the cluster. Importantly, Dask also supports the RAPIDS cuDF library for GPU-acclerated exact and fuzzy deduplication.
\ No newline at end of file
+At the core of NeMo Curator, `DocumentDataset` (the main dataset class) is just a simple wrapper around a Dask dataframe. Dask allows NeMo Curator to scale to arbitrary cluster sizes, and it supports a variety of distributed computing platforms. It supports reading and writing to different file formats, and it can balance these operations among nodes in the cluster. Importantly, Dask also supports the RAPIDS cuDF library for GPU-accelerated exact and fuzzy deduplication.
diff --git a/SECURITY.md b/SECURITY.md
index 2be787ab1..34137c329 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -21,4 +21,4 @@ While NVIDIA currently does not have a bug bounty program, we do offer acknowled
## NVIDIA Product Security
-For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security
\ No newline at end of file
+For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security
diff --git a/config/arxiv_builder.yaml b/config/arxiv_builder.yaml
index 007566d65..7bc5cb664 100644
--- a/config/arxiv_builder.yaml
+++ b/config/arxiv_builder.yaml
@@ -1,11 +1,11 @@
download_module: nemo_curator.download.arxiv.ArxivDownloader
download_params: {}
iterator_module: nemo_curator.download.arxiv.ArxivIterator
-iterator_params:
+iterator_params:
log_frequency: 1000
extract_module: nemo_curator.download.arxiv.ArxivExtractor
extract_params: {}
format:
text: str
id: str
- source_id: str
\ No newline at end of file
+ source_id: str
diff --git a/config/cc_warc_builder.yaml b/config/cc_warc_builder.yaml
index 3e6a8ed98..7975ab807 100644
--- a/config/cc_warc_builder.yaml
+++ b/config/cc_warc_builder.yaml
@@ -9,4 +9,4 @@ format:
language: str
url: str
warc_id: str
- source_id: str
\ No newline at end of file
+ source_id: str
diff --git a/config/heuristic_filter_code.yaml b/config/heuristic_filter_code.yaml
index 7d6b36e48..897ce4d0e 100644
--- a/config/heuristic_filter_code.yaml
+++ b/config/heuristic_filter_code.yaml
@@ -1,7 +1,7 @@
input_field: text
filters:
# The filters below define a chain of heuristic filters to be applied to each document in a corpus.
- # This particular cascade of filters is intended to filter Python code data.
+ # This particular cascade of filters is intended to filter Python code data.
# The filter listed at the top will be applied first, and the following filters will be applied in
# the order they appear in this file. Each filter can be removed and re-ordered as desired.
# Change this based on the language of the data
diff --git a/config/heuristic_filter_en.yaml b/config/heuristic_filter_en.yaml
index 4e3bbb79b..d4c05f978 100644
--- a/config/heuristic_filter_en.yaml
+++ b/config/heuristic_filter_en.yaml
@@ -1,7 +1,7 @@
input_field: text
filters:
# The filters below define a chain of heuristic filters to be applied to each document in a corpus.
- # This particular cascade of filters is intended to filter English language data.
+ # This particular cascade of filters is intended to filter English language data.
# The filter listed at the top will be applied first, and the following filters will be applied in
# the order they appear in this file. Each filter can be removed and re-ordered as desired.
- name: nemo_curator.filters.heuristic_filter.NonAlphaNumericFilter
@@ -14,16 +14,16 @@ filters:
params:
max_number_to_text_ratio: 0.15
- name: nemo_curator.filters.heuristic_filter.UrlsFilter
- params:
+ params:
max_url_to_text_ratio: 0.2
- name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter
- params:
+ params:
max_white_space_ratio: 0.25
- name: nemo_curator.filters.heuristic_filter.ParenthesesFilter
- params:
+ params:
max_parentheses_ratio: 0.1
- name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter
- params:
+ params:
remove_if_at_top_or_bottom: True
max_boilerplate_string_ratio: 0.4
- name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter
@@ -46,7 +46,7 @@ filters:
params:
max_num_sentences_without_endmark_ratio: 0.85
- name: nemo_curator.filters.heuristic_filter.WordsWithoutAlphabetsFilter
- params:
+ params:
min_words_with_alphabets: 0.8
- name: nemo_curator.filters.heuristic_filter.CommonEnglishWordsFilter
params:
@@ -54,10 +54,10 @@ filters:
stop_at_false: True
- name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter
params:
- max_mean_word_length: 10
+ max_mean_word_length: 10
min_mean_word_length: 3
- name: nemo_curator.filters.heuristic_filter.LongWordFilter
- params:
+ params:
max_word_length: 1000
- name: nemo_curator.filters.heuristic_filter.EllipsisFilter
params:
@@ -102,4 +102,4 @@ filters:
max_repeating_duplicate_ngram_ratio: 0.10
- name: nemo_curator.filters.heuristic_filter.BulletsFilter
params:
- max_bullet_lines_ratio: 0.9
\ No newline at end of file
+ max_bullet_lines_ratio: 0.9
diff --git a/config/heuristic_filter_non-en.yaml b/config/heuristic_filter_non-en.yaml
index 783d0e541..7c456fb2d 100644
--- a/config/heuristic_filter_non-en.yaml
+++ b/config/heuristic_filter_non-en.yaml
@@ -1,7 +1,7 @@
input_field: text
filters:
# The filters below define a chain of heuristic filters to be applied to each document in a corpus.
- # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words.
+ # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words.
# The filter listed at the top will be applied first, and the following filters will be applied in
# the order they appear in this file. Each filter can be removed and re-ordered as desired.
- name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter
@@ -11,16 +11,16 @@ filters:
params:
max_number_to_text_ratio: 0.15
- name: nemo_curator.filters.heuristic_filter.UrlsFilter
- params:
+ params:
max_url_to_text_ratio: 0.2
- name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter
- params:
+ params:
max_white_space_ratio: 0.25
- name: nemo_curator.filters.heuristic_filter.ParenthesesFilter
- params:
+ params:
max_parentheses_ratio: 0.1
- name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter
- params:
+ params:
remove_if_at_top_or_bottom: True
max_boilerplate_string_ratio: 0.4
- name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter
@@ -39,17 +39,17 @@ filters:
params:
min_words: 50
max_words: 100000
- # NOTE: This filter tends to remove many documents and will need to
+ # NOTE: This filter tends to remove many documents and will need to
# be tuned per language
- name: nemo_curator.filters.heuristic_filter.PunctuationFilter
params:
max_num_sentences_without_endmark_ratio: 0.85
- name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter
params:
- max_mean_word_length: 10
+ max_mean_word_length: 10
min_mean_word_length: 3
- name: nemo_curator.filters.heuristic_filter.LongWordFilter
- params:
+ params:
max_word_length: 1000
- name: nemo_curator.filters.heuristic_filter.EllipsisFilter
params:
@@ -94,4 +94,4 @@ filters:
max_repeating_duplicate_ngram_ratio: 0.10
- name: nemo_curator.filters.heuristic_filter.BulletsFilter
params:
- max_bullet_lines_ratio: 0.9
\ No newline at end of file
+ max_bullet_lines_ratio: 0.9
diff --git a/config/lm_tasks.yaml b/config/lm_tasks.yaml
index 3d38ec6f3..d108ee2de 100644
--- a/config/lm_tasks.yaml
+++ b/config/lm_tasks.yaml
@@ -1,6 +1,6 @@
tasks:
# The Python modules below define language model downstream evaluation
- # task data. If one of the below tasks is specified, N-grams will
+ # task data. If one of the below tasks is specified, N-grams will
# be constructed from the documents that make up the task data
# using the script prepare_task_data.
# find_matching_ngrams will then search for these N-grams
diff --git a/config/pii_config.yaml b/config/pii_config.yaml
index 725fde30d..a693fa783 100644
--- a/config/pii_config.yaml
+++ b/config/pii_config.yaml
@@ -13,4 +13,4 @@ pii_config:
#type: 'hash'
#hash_type: 'sha256'
- #type: 'redact'
\ No newline at end of file
+ #type: 'redact'
diff --git a/config/wikipedia_builder.yaml b/config/wikipedia_builder.yaml
index 478315375..872155017 100644
--- a/config/wikipedia_builder.yaml
+++ b/config/wikipedia_builder.yaml
@@ -12,4 +12,4 @@ format:
id: str
url: str
language: str
- source_id: str
\ No newline at end of file
+ source_id: str
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 000000000..451ae5af8
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+
+
+def pytest_addoption(parser):
+ parser.addoption(
+ "--cpu", action="store_true", default=False, help="Run tests without gpu marker"
+ )
+
+
+def pytest_collection_modifyitems(config, items):
+ if config.getoption("--cpu"):
+ skip_gpu = pytest.mark.skip(reason="Skipping GPU tests")
+ for item in items:
+ if "gpu" in item.keywords:
+ item.add_marker(skip_gpu)
diff --git a/docs/user-guide/CPUvsGPU.rst b/docs/user-guide/CPUvsGPU.rst
index c3159b215..5fd901d19 100644
--- a/docs/user-guide/CPUvsGPU.rst
+++ b/docs/user-guide/CPUvsGPU.rst
@@ -95,4 +95,4 @@ Every SLURM cluster is different, so make sure you understand how your SLURM clu
``start-slurm.sh`` calls ``containter-entrypoint.sh`` which sets up a Dask scheduler and workers across the cluster.
Our Python examples are designed to work such that they can be run locally on their own, or easily substituted into the ``start-slurm.sh`` to run on multiple nodes.
-You can adapt your scripts easily too by simply following the pattern of adding ``get_client`` with ``add_distributed_args``.
\ No newline at end of file
+You can adapt your scripts easily too by simply following the pattern of adding ``get_client`` with ``add_distributed_args``.
diff --git a/docs/user-guide/DistributedDataClassification.rst b/docs/user-guide/DistributedDataClassification.rst
index b7a99a20f..f2bf098d3 100644
--- a/docs/user-guide/DistributedDataClassification.rst
+++ b/docs/user-guide/DistributedDataClassification.rst
@@ -8,7 +8,7 @@ Background
When preparing text data to be used in training a large language model (LLM), it is useful to classify
text documents in various ways, to enhance the LLM's performance by making it able to produce more
-contextually appropriate and accurate language across various subjects. NeMo Curator provides this module to
+contextually appropriate and accurate language across various subjects. NeMo Curator provides this module to
help a user run inference with pre-trained models on large amounts of text documents. We achieve
this by chunking the datasets across multiple computing nodes, each equipped with multiple GPUs, to
accelerate the classification task in a distributed way. In other words, because the classification of
@@ -68,4 +68,4 @@ The key differences is that it operates on the GPU instead of the CPU.
Therefore, the Dask cluster must be started as a GPU one.
And, ``DomainClassifier`` requires ``DocumentDataset`` to be on the GPU (i.e., have ``backend=cudf``).
It is easy to extend ``DistributedDataClassifier`` to your own model.
-Check out ``nemo_curator.modules.distributed_data_classifier.py`` for reference.
\ No newline at end of file
+Check out ``nemo_curator.modules.distributed_data_classifier.py`` for reference.
diff --git a/docs/user-guide/DocumentDataset.rst b/docs/user-guide/DocumentDataset.rst
index 8711227aa..351e41a95 100644
--- a/docs/user-guide/DocumentDataset.rst
+++ b/docs/user-guide/DocumentDataset.rst
@@ -48,7 +48,7 @@ You could read, filter the dataset, and write it using the following methods
text_field="text",
score_field="word_count",
)
-
+
long_books = filter_step(books)
long_books.to_json("long_books/", write_to_filename=True)
@@ -106,7 +106,7 @@ Consider a modified version of the code above:
text_field="text",
score_field="word_count",
)
-
+
long_books = filter_step(books)
long_books.to_json("long_books/", write_to_filename=True)
@@ -130,10 +130,10 @@ In these cases, we recommend processing the input dataset in batches using a sim
text_field="text",
score_field="word_count",
)
-
+
long_books = filter_step(books)
long_books.to_json("long_books/", write_to_filename=True)
This will read in 64 shards at a time, process them, and write them back to disk.
-Like ``get_remaining_files``, it only includes files that are in the input directory and not in the output directory.
\ No newline at end of file
+Like ``get_remaining_files``, it only includes files that are in the input directory and not in the output directory.
diff --git a/docs/user-guide/Download.rst b/docs/user-guide/Download.rst
index 66a344637..e2142de74 100644
--- a/docs/user-guide/Download.rst
+++ b/docs/user-guide/Download.rst
@@ -91,7 +91,7 @@ datasets. In general, it can be called as follows in order to download and extra
--builder-config-file= \
--output-json-dir=
-This utility takes as input a list of URLs that point to files that contain prepared, unextracted data (e.g., pre-crawled web pages from Common Crawl), a config file that describes how to download and extract the data, and the output directory to where the extracted text will be written in jsonl format (one json written to each document per line). For each URL provided in the list of URLs, a corresponding jsonl file will be written to the output directory.
+This utility takes as input a list of URLs that point to files that contain prepared, unextracted data (e.g., pre-crawled web pages from Common Crawl), a config file that describes how to download and extract the data, and the output directory where the extracted text will be written in jsonl format (one JSON document per line). For each URL provided in the list of URLs, a corresponding jsonl file will be written to the output directory.
The config file that must be provided at runtime, should take the following form
@@ -133,7 +133,7 @@ If you would prefer to use this over `wget `
Downloading and Extracting Common Crawl
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-As described in the first section of this document, the first step towards using the :code:`download_and_extract` for Common Crawl will be to create a list of URLs that point to the location of the WARC files hosted by Common Crawl.
+As described in the first section of this document, the first step towards using the :code:`download_and_extract` for Common Crawl will be to create a list of URLs that point to the location of the WARC files hosted by Common Crawl.
Within NeMo Curator, we provide the utility :code:`get_common_crawl_urls` to obtain these urls. This utility can be run as follows
.. code-block:: bash
@@ -144,9 +144,9 @@ Within NeMo Curator, we provide the utility :code:`get_common_crawl_urls` to obt
--ending-snapshot="2020-50" \
--output-warc-url-file=./url_data/warc_urls_cc_2020_50.txt
-This script pulls the Common Crawl index from `https://index.commoncrawl.org` and stores the index to the file
-specified by the argument :code:`--cc-snapshot-index-file`. It then retrieves all WARC urls between the
-dates specified by the arguments :code:`--starting-snapshot` and :code:`--ending-snapshot`.
+This script pulls the Common Crawl index from `https://index.commoncrawl.org` and stores the index to the file
+specified by the argument :code:`--cc-snapshot-index-file`. It then retrieves all WARC urls between the
+dates specified by the arguments :code:`--starting-snapshot` and :code:`--ending-snapshot`.
Finally, it writes all WARC urls to the text file :code:`--output-warc-urls`. This file is a simple text file
with the following format::
@@ -175,16 +175,15 @@ example of a single line of an output `.jsonl` file extracted from a WARC record
.. code-block:: json
- {"text": "커뮤니티\n\n어린이 요리 교실은 평소 조리와 제과 제빵에 관심이 있는 초등학생을 대상으로 나이프스킬, 한식, 중식, 양식, 제과, 제빵, 디저트,
- 생활요리 등 요리 기초부터 시작해 다양한 요리에 대해 배우고, 경험할 수 있도록 구성되었다.\n\n요즘 부모들의 자녀 요리 교육에 대한 관심이 높아지고
- 있는데, 어린이 요리교실은 자녀들이 어디서 어떻게 요리를 처음 시작할지 막막하고 어려워 고민하는 이들을 위해 만들어졌다.\n\n그 뿐만 아니라 학생들이
- 식재료를 다루는 과정에서 손으로 만지고 느끼는 것이 감각을 자극하여 두뇌발달에 도움을 주며, 조리를 통해 자신의 감정을 자연스럽게 표현할 수
- 있고 이를 통해 정서적 안정을 얻을 수 있다. 또한, 다양한 사물을 만져 보면서 차이점을 구별하고 사물의 특징에 대해 인지할 수 있으므로 인지 능력 향상에
- 도움이 되며, 만지고 느끼고 비교하는 과정에서 감각 기능을 향상시킬 수 있다.\n\n방과 후 시간이 되지 않는 초등학생들을 위해 평일반 뿐만 아니라 주말반도
- 운영하고 있으며 두 분의 선생님들의 안전적인 지도하에 수업이 진행된다. 한국조리예술학원은 젊은 감각과 학생들과의 소통을 통해 자발적인 교육을 가르친다.
- 자세한 학원 문의는 한국조리예술학원 홈페이지나 대표 전화, 카카오톡 플러스친구를 통해 가능하다.", "id": "a515a7b6-b6ec-4bed-998b-8be2f86f8eac",
- "source_id": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2020-50/segments/1606141163411.0/warc/CC-MAIN-20201123153826-20201123183826-00000.warc.gz",
+ {"text": "커뮤니티\n\n어린이 요리 교실은 평소 조리와 제과 제빵에 관심이 있는 초등학생을 대상으로 나이프스킬, 한식, 중식, 양식, 제과, 제빵, 디저트,
+ 생활요리 등 요리 기초부터 시작해 다양한 요리에 대해 배우고, 경험할 수 있도록 구성되었다.\n\n요즘 부모들의 자녀 요리 교육에 대한 관심이 높아지고
+ 있는데, 어린이 요리교실은 자녀들이 어디서 어떻게 요리를 처음 시작할지 막막하고 어려워 고민하는 이들을 위해 만들어졌다.\n\n그 뿐만 아니라 학생들이
+ 식재료를 다루는 과정에서 손으로 만지고 느끼는 것이 감각을 자극하여 두뇌발달에 도움을 주며, 조리를 통해 자신의 감정을 자연스럽게 표현할 수
+ 있고 이를 통해 정서적 안정을 얻을 수 있다. 또한, 다양한 사물을 만져 보면서 차이점을 구별하고 사물의 특징에 대해 인지할 수 있으므로 인지 능력 향상에
+ 도움이 되며, 만지고 느끼고 비교하는 과정에서 감각 기능을 향상시킬 수 있다.\n\n방과 후 시간이 되지 않는 초등학생들을 위해 평일반 뿐만 아니라 주말반도
+ 운영하고 있으며 두 분의 선생님들의 안전적인 지도하에 수업이 진행된다. 한국조리예술학원은 젊은 감각과 학생들과의 소통을 통해 자발적인 교육을 가르친다.
+ 자세한 학원 문의는 한국조리예술학원 홈페이지나 대표 전화, 카카오톡 플러스친구를 통해 가능하다.", "id": "a515a7b6-b6ec-4bed-998b-8be2f86f8eac",
+ "source_id": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2020-50/segments/1606141163411.0/warc/CC-MAIN-20201123153826-20201123183826-00000.warc.gz",
"url": "http://hanjowon.co.kr/web/home.php?mid=70&go=pds.list&pds_type=1&start=20&num=67&s_key1=&s_que=", "language": "KOREAN"}
Once all records have been processed within a WARC file, it is by default deleted from disk.
-
diff --git a/docs/user-guide/GpuDeduplication.rst b/docs/user-guide/GpuDeduplication.rst
index d23e8ee77..d8b54811b 100644
--- a/docs/user-guide/GpuDeduplication.rst
+++ b/docs/user-guide/GpuDeduplication.rst
@@ -10,7 +10,7 @@ Background
-----------------------------------------
Training on randomly selected documents for many epochs can be sub-optimal to downstream performance for language models.
-For more information on when this is harmful, please see `Muennighoff et al., 2023 `_ and `Tirumala et al., 2023 `_.
+For more information on when this is harmful, please see `Muennighoff et al., 2023 `_ and `Tirumala et al., 2023 `_.
The exact and fuzzy document-level deduplication module in the NeMo Curator aims at reducing the occurrence of duplicate and
near-duplicate documents in the dataset. Exact deduplication refers to removing identical (i.e., document strings are equal)
documents from the dataset, while fuzzy deduplication refers to removing near-identical (e.g., an excerpt of a document is used in another document)
@@ -27,7 +27,7 @@ As exact deduplication is a much less involved procedure and requires significan
we typically will first run exact deduplication before fuzzy deduplication. Also, from our experience in
deduplicating Common Crawl snapshots, a significant portion of the duplicates are in fact exact duplicates.
-When removing near-duplicates within the corpus we perform fuzzy deduplication at the document level in order to remove documents that
+When removing near-duplicates within the corpus we perform fuzzy deduplication at the document level in order to remove documents that
have high Jaccard similarity. Our approach closely resembles the approach described in `Smith et al., 2020 `_. This
approach can essentially be split into two conceptual stages. The first stage involves computing MinHash signatures on
documents and then performing Locality Sensitive Hashing (LSH) to find candidate duplicates. Due to the approximate nature of the bucketing via MinHash + LSH
@@ -35,11 +35,11 @@ documents and then performing Locality Sensitive Hashing (LSH) to find candidate
-Before running either of these modules, users should assign a unique document ID to each document in the corpus.
+Before running either of these modules, users should assign a unique document ID to each document in the corpus.
This can be accomplished using the :code:`add_id` module within the NeMo Curator:
.. code-block:: bash
-
+
add_id \
--input-data-dir= \
--log-dir=./log/add_id
@@ -47,7 +47,7 @@ This can be accomplished using the :code:`add_id` module within the NeMo Curator
By default, this will create a new field named :code:`adlr_id` within each json document which will have the form "doc_prefix-000001".
If the dataset already has a unique ID this step can be skipped.
-**Note**: Fuzzy deduplication only works with numeric ID's or the specific ID format generated by the :code:`add_id` script. If the
+**Note**: Fuzzy deduplication only works with numeric IDs or the specific ID format generated by the :code:`add_id` script. If the
dataset does not contain IDs in this format, it's recommended to convert to an integer-based ID or an ID created by the :code:`add_id` script.
Once a unique ID has been added to each document, users can proceed with exact and fuzzy deduplication which roughly require the following
@@ -80,4 +80,3 @@ steps (all scripts are included in the :code:`nemo_curator/scripts/` subdirector
In addition to the scripts, there are examples in the `examples` directory that showcase using the python module
directly in your own code. It also has examples on how to remove documents from the corpus using the list of duplicate IDs generated from exact or fuzzy
deduplication.
-
diff --git a/docs/user-guide/LanguageIdentificationUnicodeFormatting.rst b/docs/user-guide/LanguageIdentificationUnicodeFormatting.rst
index ddd107bf0..3e61f8f7d 100644
--- a/docs/user-guide/LanguageIdentificationUnicodeFormatting.rst
+++ b/docs/user-guide/LanguageIdentificationUnicodeFormatting.rst
@@ -40,7 +40,7 @@ Here is the implementation of the ``UnicodeReformatter`` modifier:
class UnicodeReformatter(DocumentModifier):
def __init__(self):
super().__init__()
-
+
def modify_document(self, text: str) -> str:
return ftfy.fix_text(text)
@@ -51,7 +51,7 @@ Related Scripts
-----------------------------------------
To perform the language identification, we can use the config file provided in the `config` directory
-and provide the path to a local copy of the `lid.176.bin` language identification fastText model. Then, with the general purpose
+and provide the path to a local copy of the `lid.176.bin` language identification fastText model. Then, with the general purpose
:code:`filter_documents` tool, we can compute language scores and codes for each document in the corpus as follows
.. code-block:: bash
@@ -77,7 +77,7 @@ within that file. Below is an example run command for :code:`separate_by_metadat
--input-metadata-field=language \
--output-data-dir=