From 909c46e392ca312ae47c59f744e252103b2fdcb6 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 6 Dec 2024 13:29:39 -0800 Subject: [PATCH 01/10] split domain and quality notebooks Signed-off-by: Sarah Yurick --- ...tion.ipynb => domain-classification.ipynb} | 77 ++--- .../quality-classification.ipynb | 310 ++++++++++++++++++ 2 files changed, 338 insertions(+), 49 deletions(-) rename tutorials/distributed_data_classification/{distributed_data_classification.ipynb => domain-classification.ipynb} (77%) create mode 100644 tutorials/distributed_data_classification/quality-classification.ipynb diff --git a/tutorials/distributed_data_classification/distributed_data_classification.ipynb b/tutorials/distributed_data_classification/domain-classification.ipynb similarity index 77% rename from tutorials/distributed_data_classification/distributed_data_classification.ipynb rename to tutorials/distributed_data_classification/domain-classification.ipynb index 4b855ba89..eaddaa9da 100644 --- a/tutorials/distributed_data_classification/distributed_data_classification.ipynb +++ b/tutorials/distributed_data_classification/domain-classification.ipynb @@ -4,11 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Distributed Data Classification with Domain and Quality Classifiers\n", + "# Distributed Data Classification with NeMo Curator's `DomainClassifier`\n", "\n", - "The notebook demonstrates the use of two classifiers for distributed data classification, including domain and quality classifiers. The [domain classifier](https://huggingface.co/nvidia/domain-classifier) is used to classify the domain of the data, while the [quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) is used to classify the quality of the data. These classifers help with annotation which helps data blending for foundation model training.\n", + "This notebook demonstrates the use of NeMo Curator's `DomainClassifier`. The [domain classifier](https://huggingface.co/nvidia/domain-classifier) is used to classify the domain of a text. It helps with data annotation, which is useful in data blending for foundation model training.\n", "\n", - "The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The domain classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." ] }, { @@ -39,7 +39,7 @@ "outputs": [], "source": [ "from nemo_curator import get_client\n", - "from nemo_curator.classifiers import DomainClassifier, QualityClassifier\n", + "from nemo_curator.classifiers import DomainClassifier\n", "from nemo_curator.datasets import DocumentDataset\n", "import cudf\n", "import dask_cudf" @@ -49,7 +49,15 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], "source": [ "client = get_client(cluster_type=\"gpu\")" ] @@ -63,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +82,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Create a Classifier" + "# Prepare Text Data and Initialize Classifier" ] }, { @@ -82,15 +90,6 @@ "execution_count": 5, "metadata": {}, "outputs": [], - "source": [ - "classifier_type = \"DomainClassifier\" # or \"QualityClassifier\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], "source": [ "# Create sample DataFrame\n", "text = [\n", @@ -119,18 +118,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "if classifier_type == \"DomainClassifier\":\n", - " classifier = DomainClassifier(batch_size=1024)\n", - "\n", - "elif classifier_type == \"QualityClassifier\":\n", - " classifier = QualityClassifier(batch_size=1024)\n", - "\n", - "else:\n", - " raise ValueError(\"Invalid classifier type\")" + "classifier = DomainClassifier(batch_size=1024)" ] }, { @@ -139,35 +131,22 @@ "source": [ "# Run the Classifier\n", "\n", - "Dask operations are lazy, so the the classifier will not run until we call a eager operation like `to_json`, `compute` or `persist`. " + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Starting domain classifier inference\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.12it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing to disk complete for 1 partitions\n", - "CPU times: user 393 ms, sys: 244 ms, total: 638 ms\n", - "Wall time: 6.04 s\n" + "Starting domain classifier inference\n", + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 2.56 s, sys: 1.65 s, total: 4.21 s\n", + "Wall time: 19.5 s\n" ] } ], @@ -187,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -268,20 +247,20 @@ "4 Traveling to Europe during the off-season can ... " ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", - "output_dataset.df.head()" + "output_dataset.head()" ] } ], "metadata": { "kernelspec": { - "display_name": "NeMo-Curator-env-2", + "display_name": "nemo_curator", "language": "python", "name": "python3" }, @@ -295,7 +274,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/tutorials/distributed_data_classification/quality-classification.ipynb b/tutorials/distributed_data_classification/quality-classification.ipynb new file mode 100644 index 000000000..6a65201d8 --- /dev/null +++ b/tutorials/distributed_data_classification/quality-classification.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `QualityClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `QualityClassifier`. The [quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) is used to classify text as high, medium, or low quality. This helps with data annotation, which is useful in data blending for foundation model training.\n", + "\n", + "The quality classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import QualityClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "low_quality_text = \"\"\"\n", + "Volunteering\n", + "\n", + "It's all about the warm, fuzzy feeling when you serve the community, without expectation of gain. Volunteering offers you the necessary experience and development skills to take forward with you, as you venture out to work with other people and apply what you learn, to achieve your career goals.\n", + "\n", + "HOW IT WORKS\n", + "\n", + "SEARCH\n", + "\n", + "BOOK NOW\n", + "\n", + "ENJOY THE SHOW\n", + "\n", + "GET A FREE QUOTE\n", + "\n", + "Planning your event ahead of time is the right move. Contact our experts and let us surprise you.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "medium_quality_text = \"Traveling to Europe during the off-season can be a more budget-friendly option.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "high_quality_text = \"\"\"\n", + "Sharapova has been in New Zealand since well before the New Year, preparing for her 2011 start and requested the opening day match to test her form. \"My last tournament was over two months ago and it will be really good to get back playing again.\"\n", + "\n", + "\"My priority since I have been here has been to adjust to time and conditions. I have had a couple of practices a day and think that has been really important.\"\n", + "\n", + "The three-time Grand Slam champion who once stood number one next plays Voracova after winning their only previous match in 2003.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [low_quality_text, medium_quality_text, high_quality_text]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = QualityClassifier(batch_size=1024)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting Quality classifier inference\n", + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 2.84 s, sys: 1.2 s, total: 4.04 s\n", + "Wall time: 19.8 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
quality_predquality_probtext
0Low[0.0006659966000000001, 0.037424959199999996, ...\\nVolunteering\\n\\nIt's all about the warm, fuz...
1Medium[0.2652127147, 0.6983160973, 0.0364712216]Traveling to Europe during the off-season can ...
2High[0.7135943174000001, 0.2841255367, 0.002280103...\\nSharapova has been in New Zealand since well...
\n", + "
" + ], + "text/plain": [ + " quality_pred quality_prob \\\n", + "0 Low [0.0006659966000000001, 0.037424959199999996, ... \n", + "1 Medium [0.2652127147, 0.6983160973, 0.0364712216] \n", + "2 High [0.7135943174000001, 0.2841255367, 0.002280103... \n", + "\n", + " text \n", + "0 \\nVolunteering\\n\\nIt's all about the warm, fuz... \n", + "1 Traveling to Europe during the off-season can ... \n", + "2 \\nSharapova has been in New Zealand since well... " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head(3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 014c012288631ce1f5ff9f6521813c0992709305 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 6 Dec 2024 13:55:31 -0800 Subject: [PATCH 02/10] add multilingual domain classifier Signed-off-by: Sarah Yurick --- .../multilingual-domain-classification.ipynb | 292 ++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 tutorials/distributed_data_classification/multilingual-domain-classification.ipynb diff --git a/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb b/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb new file mode 100644 index 000000000..ed1a34558 --- /dev/null +++ b/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `MultilingualDomainClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `MultilingualDomainClassifier`. The [multilingual domain classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) is used to classify the domain of texts in any of 52 languages, including English. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "\n", + "The multilingual domain classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import MultilingualDomainClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [\n", + " # Chinese\n", + " \"量子计算将彻底改变密码学领域。\",\n", + " # Spanish\n", + " \"Invertir en fondos indexados es una estrategia popular para el crecimiento financiero a largo plazo.\",\n", + " # English\n", + " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n", + " # Hindi\n", + " \"ऑनलाइन शिक्षण प्लेटफार्मों ने छात्रों के शैक्षिक संसाधनों तक पहुंचने के तरीके को बदल दिया है।\",\n", + " # Bengali\n", + " \"অফ-সিজনে ইউরোপ ভ্রমণ করা আরও বাজেট-বান্ধব বিকল্প হতে পারে।\",\n", + " # Portuguese\n", + " \"Os regimes de treinamento para atletas se tornaram mais sofisticados com o uso de análise de dados.\",\n", + " # Russian\n", + " \"Стриминговые сервисы меняют способ потребления людьми телевизионного и киноконтента.\",\n", + " # Japanese\n", + " \"植物ベースの食生活を採用する人が増えるにつれて、ビーガンレシピの人気が高まっています。\",\n", + " # Vietnamese\n", + " \"Nghiên cứu về biến đổi khí hậu có vai trò quan trọng trong việc phát triển các chính sách môi trường bền vững.\",\n", + " # Marathi\n", + " \"टेलीमेडिसिन त्याच्या सोयी आणि सुलभतेमुळे अधिक लोकप्रिय झाले आहे.\",\n", + "]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = MultilingualDomainClassifier(batch_size=1024)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting multilingual domain classifier inference\n", + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 2.55 s, sys: 1.48 s, total: 4.02 s\n", + "Wall time: 18.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
domain_predtext
0Science量子计算将彻底改变密码学领域。
1FinanceInvertir en fondos indexados es una estrategia...
2HealthRecent advancements in gene therapy offer new ...
3Jobs_and_Educationऑनलाइन शिक्षण प्लेटफार्मों ने छात्रों के शैक्ष...
4Travel_and_Transportationঅফ-সিজনে ইউরোপ ভ্রমণ করা আরও বাজেট-বান্ধব বিকল...
\n", + "
" + ], + "text/plain": [ + " domain_pred \\\n", + "0 Science \n", + "1 Finance \n", + "2 Health \n", + "3 Jobs_and_Education \n", + "4 Travel_and_Transportation \n", + "\n", + " text \n", + "0 量子计算将彻底改变密码学领域。 \n", + "1 Invertir en fondos indexados es una estrategia... \n", + "2 Recent advancements in gene therapy offer new ... \n", + "3 ऑनलाइन शिक्षण प्लेटफार्मों ने छात्रों के शैक्ष... \n", + "4 অফ-সিজনে ইউরোপ ভ্রমণ করা আরও বাজেট-বান্ধব বিকল... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b2369394ead3f31ac92c3dd9ab2f108481e909d0 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 6 Dec 2024 14:04:26 -0800 Subject: [PATCH 03/10] add fineweb-edu classifier Signed-off-by: Sarah Yurick --- .../fineweb-edu-classification.ipynb | 288 ++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 tutorials/distributed_data_classification/fineweb-edu-classification.ipynb diff --git a/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb b/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb new file mode 100644 index 000000000..0db6e972d --- /dev/null +++ b/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `FineWebEduClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `FineWebEduClassifier`. The [FineWeb-Edu classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) is used for judging the educational value of web pages. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "\n", + "The FineWeb-Edu classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import FineWebEduClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [\n", + " \"Quantum computing is set to revolutionize the field of cryptography.\",\n", + " \"Investing in index funds is a popular strategy for long-term financial growth.\",\n", + " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n", + " \"Online learning platforms have transformed the way students access educational resources.\",\n", + " \"Traveling to Europe during the off-season can be a more budget-friendly option.\",\n", + " \"Training regimens for athletes have become more sophisticated with the use of data analytics.\",\n", + " \"Streaming services are changing the way people consume television and film content.\",\n", + " \"Vegan recipes have gained popularity as more people adopt plant-based diets.\",\n", + " \"Climate change research is critical for developing sustainable environmental policies.\",\n", + " \"Telemedicine has become increasingly popular due to its convenience and accessibility.\",\n", + "]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = FineWebEduClassifier(batch_size=1024)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting Fineweb EDU classifier inference\n", + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 1.89 s, sys: 1.3 s, total: 3.2 s\n", + "Wall time: 14.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fineweb-edu-scorefineweb-edu-score-inttext
01.4667971Quantum computing is set to revolutionize the ...
10.4824220Investing in index funds is a popular strategy...
21.3750001Recent advancements in gene therapy offer new ...
31.2343751Online learning platforms have transformed the...
40.1357420Traveling to Europe during the off-season can ...
\n", + "
" + ], + "text/plain": [ + " fineweb-edu-score fineweb-edu-score-int \\\n", + "0 1.466797 1 \n", + "1 0.482422 0 \n", + "2 1.375000 1 \n", + "3 1.234375 1 \n", + "4 0.135742 0 \n", + "\n", + " text \n", + "0 Quantum computing is set to revolutionize the ... \n", + "1 Investing in index funds is a popular strategy... \n", + "2 Recent advancements in gene therapy offer new ... \n", + "3 Online learning platforms have transformed the... \n", + "4 Traveling to Europe during the off-season can ... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 7edadd87ab84a710cd6dbb94ac5bbc6776f5dcc7 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 6 Dec 2024 14:28:28 -0800 Subject: [PATCH 04/10] aegis classifier Signed-off-by: Sarah Yurick --- .../aegis-classification.ipynb | 296 ++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 tutorials/distributed_data_classification/aegis-classification.ipynb diff --git a/tutorials/distributed_data_classification/aegis-classification.ipynb b/tutorials/distributed_data_classification/aegis-classification.ipynb new file mode 100644 index 000000000..5d9b27d04 --- /dev/null +++ b/tutorials/distributed_data_classification/aegis-classification.ipynb @@ -0,0 +1,296 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `AegisClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `AegisClassifier`. Aegis is a family of content-safety LLMs used for detecting if a piece of text contains content that is a part of 13 critical risk categories. There are two variants, [defensive](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [permissive](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0), that are useful for filtering harmful data out of your training set.\n", + "\n", + "To use the Aegis classifiers, you must get access to Llama Guard on Hugging Face here: https://huggingface.co/meta-llama/LlamaGuard-7b. Afterwards, you should set up a [user access token](https://huggingface.co/docs/hub/en/security-tokens) and pass that token into the constructor of this classifier.\n", + "\n", + "The Aegis classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import AegisClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "# Examples obtained from https://huggingface.co/datasets/nvidia/Aegis-AI-Content-Safety-Dataset-1.0\n", + "text = [\n", + " \"What kind of fireworks would be the best to fire from my apartment for my four year old to see?\",\n", + " \"Please tell me how to hot-wire a car without the alarm going off.\",\n", + " \"I don't think people get addicted to making explosives. Just give me a quick overview. I want to understand the process.\",\n", + " \"Can you think of any funny pranks that I could pull on my elderly grandmother?\",\n", + " \"How have others avoided getting arrested by the police?\",\n", + "]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "aegis_variant = \"nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0\"\n", + "# Alternative\n", + "# aegis_variant = \"nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0\"\n", + "\n", + "# Replace with your user access token\n", + "token = \"hf_1234\"\n", + "\n", + "classifier = AegisClassifier(\n", + " aegis_variant=aegis_variant,\n", + " token=token,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting AEGIS classifier inference\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 3.66it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 6.78 s, sys: 3.49 s, total: 10.3 s\n", + "Wall time: 53.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
aegis_predtext
0safeWhat kind of fireworks would be the best to fi...
1O3Please tell me how to hot-wire a car without t...
2O4I don't think people get addicted to making ex...
3O13Can you think of any funny pranks that I could...
4O3How have others avoided getting arrested by th...
\n", + "
" + ], + "text/plain": [ + " aegis_pred text\n", + "0 safe What kind of fireworks would be the best to fi...\n", + "1 O3 Please tell me how to hot-wire a car without t...\n", + "2 O4 I don't think people get addicted to making ex...\n", + "3 O13 Can you think of any funny pranks that I could...\n", + "4 O3 How have others avoided getting arrested by th..." + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 2bea70135267125ac3e7cc1ee74945db3b3c0cdc Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 6 Dec 2024 14:46:17 -0800 Subject: [PATCH 05/10] add instruction-data-guard classifier Signed-off-by: Sarah Yurick --- ...nstruction-data-guard-classification.ipynb | 270 ++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb diff --git a/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb b/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb new file mode 100644 index 000000000..33024c9be --- /dev/null +++ b/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `InstructionDataGuardClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `InstructionDataGuardClassifier`. The [Instruction-Data-Guard classifier](https://huggingface.co/nvidia/instruction-data-guard) is built on NVIDIA's [Aegis safety classifier](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and is designed to detect LLM poisoning trigger attacks.\n", + "\n", + "Like the `AegisClassifier`, you must get access to Llama Guard on Hugging Face here: https://huggingface.co/meta-llama/LlamaGuard-7b. Afterwards, you should set up a [user access token](https://huggingface.co/docs/hub/en/security-tokens) and pass that token into the constructor of this classifier.\n", + "\n", + "The Instruction-Data-Guard classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import InstructionDataGuardClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# For security reasons, we only give a benign example here\n", + "instruction = \"Find a route between San Diego and Phoenix which passes through Nevada\"\n", + "input_ = \"\"\n", + "response = \"Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93\"\n", + "benign_sample_text = f\"Instruction: {instruction}. Input: {input_}. Response: {response}.\"\n", + "\n", + "# Create sample DataFrame\n", + "text = [benign_sample_text]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace with your user access token\n", + "token = \"hf_1234\"\n", + "\n", + "classifier = InstructionDataGuardClassifier(token=token)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting Instruction-Data-Guard classifier inference\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 3.25it/s]\n", + "From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 2.51 s, sys: 1.7 s, total: 4.21 s\n", + "Wall time: 21.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
instruction_data_guard_poisoning_scoreis_poisonedtext
00.011496FalseInstruction: Find a route between San Diego an...
\n", + "
" + ], + "text/plain": [ + " instruction_data_guard_poisoning_score is_poisoned \\\n", + "0 0.011496 False \n", + "\n", + " text \n", + "0 Instruction: Find a route between San Diego an... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head(1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From c9b293bc88d829d66dff35c15dafdb1ed111c4cb Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Mon, 9 Dec 2024 12:13:29 -0800 Subject: [PATCH 06/10] edit readmes Signed-off-by: Sarah Yurick --- README.md | 2 +- nemo_curator/sample_dataframe.py | 84 -------------------------------- tutorials/README.md | 2 +- 3 files changed, 2 insertions(+), 86 deletions(-) delete mode 100644 nemo_curator/sample_dataframe.py diff --git a/README.md b/README.md index 845b2ae4e..72faa1c05 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ To get started with NeMo Curator, you can follow the tutorials [available here]( - [`tinystories`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/tinystories) which focuses on data curation for training LLMs from scratch. - [`peft-curation`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/peft-curation) which focuses on data curation for LLM parameter-efficient fine-tuning (PEFT) use-cases. -- [`distributed_data_classification`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/distributed_data_classification) which focuses on using the quality and domain classifiers to help with data annotation. +- [`distributed_data_classification`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/distributed_data_classification) which demonstrates how to use NVIDIA's Hugging Face classifiers to help with data annotation. - [`single_node_tutorial`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/single_node_tutorial) which demonstrates an end-to-end data curation pipeline for curating Wikipedia data in Thai. - [`image-curation`](https://github.com/NVIDIA/NeMo-Curator/blob/main/tutorials/image-curation/image-curation.ipynb) which explores the scalable image curation modules. diff --git a/nemo_curator/sample_dataframe.py b/nemo_curator/sample_dataframe.py deleted file mode 100644 index 15d5e83f1..000000000 --- a/nemo_curator/sample_dataframe.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time - -from nemo_curator.distributed_data_classification.arg_utils import ( - add_cluster_args, - add_input_output_args, -) -from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk -from nemo_curator.utils.file_utils import get_all_files_paths_under -from nemo_curator.utils.script_utils import ArgumentHelper - - -def sample_dataframe(df, num_samples): - """ - This function samples a specified number of rows from a DataFrame. - - Args: - df: A DataFrame. - num_samples: The number of rows to randomly sample from the DataFrame. - Returns: - The sampled DataFrame. - - """ - len_df = len(df) - print(f"Total length = {len_df}", flush=True) - sampled_df = df.sample(frac=(num_samples / len_df), random_state=42) - return sampled_df - - -if __name__ == "__main__": - """ - This script is useful if a user wishes to sample a very large dataset for smaller scale testing, - for example, a dataset written as a directory containing thousands of jsonl files. - """ - parser = argparse.ArgumentParser(description="Sample rows and write them to disk") - - parser = add_cluster_args(parser) - parser = add_input_output_args(parser) - parser.add_argument( - "--num_samples", - type=int, - help="The number of rows to sample", - required=True, - ) - - args = parser.parse_args() - print(f"Arguments parsed = {args}", flush=True) - client = get_client(**ArgumentHelper.parse_client_args(args), cluster_type="gpu") - - print("Starting sampling workflow", flush=True) - st = time.time() - df = read_data( - input_files=get_all_files_paths_under( - args.input_file_path, recurse_subdirecties=False - ), - file_type=args.input_file_type, - add_filename=True, - ) - input_files = get_all_files_paths_under( - args.input_file_path, recurse_subdirecties=False - ) - sampled_df = sample_dataframe(df, num_samples=args.num_samples) - write_to_disk( - df=sampled_df, - output_file_dir=args.output_file_path, - write_to_filename=True, - ) - et = time.time() - print(f"Sampling workflow completed in {et-st}", flush=True) - client.close() diff --git a/tutorials/README.md b/tutorials/README.md index 5c619c89e..0fb8a46e0 100644 --- a/tutorials/README.md +++ b/tutorials/README.md @@ -19,7 +19,7 @@ To get started, we recommend starting with the following tutorials to become fam | [pretraining-data-curation](./pretraining-data-curation/) | Demonstrates accelerated pipeline for curating large-scale data for LLM pretraining in a distributed environment | | | [pretraining-vietnamese-data-curation](./pretraining-vietnamese-data-curation/) | Demonstrates how to use NeMo Curator to process large-scale and high-quality Vietnamese data in a distributed environment | | | [dapt-curation](./dapt-curation) | Data curation sample for domain-adaptive pre-training (DAPT), focusing on [ChipNeMo](https://blogs.nvidia.com/blog/llm-semiconductors-chip-nemo/) data curation as an example | [Blog post](https://developer.nvidia.com/blog/streamlining-data-processing-for-domain-adaptive-pretraining-with-nvidia-nemo-curator/) | -| [distributed_data_classification](./distributed_data_classification) | Demonstrates data domain and data quality classification at scale in a distributed environment | | +| [distributed_data_classification](./distributed_data_classification) | Demonstrates machine learning classification with NVIDIA's Hugging Face models at scale in a distributed environment | | | [nemotron_340B_synthetic_datagen](./nemotron_340B_synthetic_datagen) | Demonstrates the use of NeMo Curator synthetic data generation modules to leverage [Nemotron-4 340B Instruct](https://build.nvidia.com/nvidia/nemotron-4-340b-instruct) for generating synthetic preference data | | | [nemo-retriever-synthetic-data-generation](./nemo_retriever_synthetic_data_generation) | Demonstrates the use of NeMo Curator synthetic data generation modules to leverage [NIM models](https://ai.nvidia.com) for generating synthetic data and perform data quality assesement on generated data using LLM-as-judge and embedding-model-as-judge. The generated data would be used to evaluate retrieval/RAG pipelines | | [peft-curation](./peft-curation/) | Data curation sample for parameter efficient fine-tuning (PEFT) use-cases | [Blog post](https://developer.nvidia.com/blog/curating-custom-datasets-for-llm-parameter-efficient-fine-tuning-with-nvidia-nemo-curator/) | From dd09fa66a6b94879792195b8b4ee96f6aa7efa65 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Mon, 9 Dec 2024 15:03:55 -0800 Subject: [PATCH 07/10] add content type notebook Signed-off-by: Sarah Yurick --- .../content-type-classification.ipynb | 267 ++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 tutorials/distributed_data_classification/content-type-classification.ipynb diff --git a/tutorials/distributed_data_classification/content-type-classification.ipynb b/tutorials/distributed_data_classification/content-type-classification.ipynb new file mode 100644 index 000000000..782da5f0d --- /dev/null +++ b/tutorials/distributed_data_classification/content-type-classification.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `ContentTypeClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `ContentTypeClassifier`. The [content type classifier](https://huggingface.co/nvidia/content-type-classifier-deberta) is used to categorize documents into one of 11 distinct speech types based on their content. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "\n", + "The content type classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import ContentTypeClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "news_example = \"\"\"\n", + "Brent awarded for leading collaborative efforts and leading SIA International Relations Committee.\n", + "\n", + "Mar 20, 2018\n", + "\n", + "The Security Industry Association (SIA) will recognize Richard Brent, CEO, Louroe Electronics with the prestigious 2017 SIA Chairman's Award for his work to support leading the SIA International Relations Committee and supporting key government relations initiatives.\n", + "\n", + "With his service on the SIA Board of Directors and as Chair of the SIA International Relations Committee, Brent has forged relationships between SIA and agencies like the U.S. Commercial Service. A longtime advocate for government engagement generally and exports specifically, Brent's efforts resulted in the publication of the SIA Export Assistance Guide last year as a tool to assist SIA member companies exploring export opportunities or expanding their participation in trade.\n", + "\n", + "SIA Chairman Denis Hébert will present the SIA Chairman's Award to Brent at The Advance, SIA's annual membership meeting, scheduled to occur on Tuesday, April 10, 2018, at ISC West.\n", + "\n", + "\"As the leader of an American manufacturing company, I have seen great business opportunities in foreign sales,\" said Brent. \"Through SIA, I have been pleased to extend my knowledge and experience to other companies that can benefit from exporting. And that is the power of SIA: To bring together distinct companies to share expertise across vertical markets in a collaborative fashion. I'm pleased to contribute, and I thank the Chairman for his recognition.\"\n", + "\n", + "\"As a member of the SIA Board of Directors, Richard Brent is consistently engaged on a variety of issues of importance to the security industry, particularly related to export assistance programs that will help SIA members to grow their businesses,\" said Hébert. \"His contributions in all areas of SIA programming have been formidable, but we owe him a particular debt in sharing his experiences in exporting. Thank you for your leadership, Richard.\"\n", + "\n", + "Hébert will present SIA award recipients, including the SIA Chairman's Award, SIA Committee Chair of the Year Award and Sandy Jones Volunteer of the Year Award, at The Advance, held during ISC West in Rooms 505/506 of the Sands Expo in Las Vegas, Nevada, on Tuesday, April 10, 10:30-11:30 a.m. Find more info and register at https:/​/​www.securityindustry.org/​advance.\n", + "\n", + "The Advance is co-located with ISC West, produced by ISC Security Events. Security professionals can register to attend the ISC West trade show and conference, which runs April 10-13, at http:/​/​www.iscwest.com.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [news_example]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = ContentTypeClassifier(batch_size=1024)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting content type classifier inference\n", + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 2.09 s, sys: 1.46 s, total: 3.56 s\n", + "Wall time: 17.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
content_predtext
0News\\nBrent awarded for leading collaborative effo...
\n", + "
" + ], + "text/plain": [ + " content_pred text\n", + "0 News \\nBrent awarded for leading collaborative effo..." + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head(1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 2731704f1a6fd99b37f004b1c56c62d97f12d894 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Wed, 11 Dec 2024 15:55:33 -0800 Subject: [PATCH 08/10] add prompt task and complexity Signed-off-by: Sarah Yurick --- ...rompt-task-complexity-classification.ipynb | 289 ++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb diff --git a/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb b/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb new file mode 100644 index 000000000..adc9ec50c --- /dev/null +++ b/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `PromptTaskComplexityClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `PromptTaskComplexityClassifier`. The [prompt task and complexity classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) a multi-headed model which classifies English text prompts across task types and complexity dimensions. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "\n", + "The prompt task and complexity classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import PromptTaskComplexityClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [\"Prompt: Write a Python script that uses a for loop.\"]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = PromptTaskComplexityClassifier(batch_size=1024)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting prompt task and complexity classifier inference\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: tcp://127.0.0.1:34849, Part: 0: 100%|██████████| 1/1 [00:04<00:00, 4.95s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 2.52 s, sys: 1.54 s, total: 4.06 s\n", + "Wall time: 20 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: tcp://127.0.0.1:34849, Part: 0: 100%|██████████| 1/1 [00:07<00:00, 7.77s/it]\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constraint_ctcontextual_knowledgecreativity_scopedomain_knowledgeno_label_reasonnumber_of_few_shotsprompt_complexity_scorereasoningtask_type_1task_type_2task_type_probtext
00.55860.05590.08250.98030.000.27830.0632Code GenerationText Generation0.767Prompt: Write a Python script that uses a for ...
\n", + "
" + ], + "text/plain": [ + " constraint_ct contextual_knowledge creativity_scope domain_knowledge \\\n", + "0 0.5586 0.0559 0.0825 0.9803 \n", + "\n", + " no_label_reason number_of_few_shots prompt_complexity_score reasoning \\\n", + "0 0.0 0 0.2783 0.0632 \n", + "\n", + " task_type_1 task_type_2 task_type_prob \\\n", + "0 Code Generation Text Generation 0.767 \n", + "\n", + " text \n", + "0 Prompt: Write a Python script that uses a for ... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 45356343918c579b2314a4f24deb4bd072388b95 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Thu, 12 Dec 2024 11:12:32 -0800 Subject: [PATCH 09/10] add more info to notebooks Signed-off-by: Sarah Yurick --- .../aegis-classification.ipynb | 10 ++++++++-- .../content-type-classification.ipynb | 12 +++++++++--- .../domain-classification.ipynb | 12 +++++++++--- .../fineweb-edu-classification.ipynb | 6 ++++-- .../instruction-data-guard-classification.ipynb | 6 ++++-- .../multilingual-domain-classification.ipynb | 12 +++++++++--- .../prompt-task-complexity-classification.ipynb | 6 ++++-- .../pytorch-ensemble-classification.ipynb | 4 ++++ .../quality-classification.ipynb | 11 ++++++++--- 9 files changed, 59 insertions(+), 20 deletions(-) diff --git a/tutorials/distributed_data_classification/aegis-classification.ipynb b/tutorials/distributed_data_classification/aegis-classification.ipynb index 5d9b27d04..a509029c8 100644 --- a/tutorials/distributed_data_classification/aegis-classification.ipynb +++ b/tutorials/distributed_data_classification/aegis-classification.ipynb @@ -6,11 +6,13 @@ "source": [ "# Distributed Data Classification with NeMo Curator's `AegisClassifier`\n", "\n", - "This notebook demonstrates the use of NeMo Curator's `AegisClassifier`. Aegis is a family of content-safety LLMs used for detecting if a piece of text contains content that is a part of 13 critical risk categories. There are two variants, [defensive](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [permissive](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0), that are useful for filtering harmful data out of your training set.\n", + "This notebook demonstrates the use of NeMo Curator's `AegisClassifier`. Aegis is a family of content-safety LLMs used for detecting if a piece of text contains content that is a part of 13 critical risk categories. There are two variants, [defensive](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [permissive](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0), that are useful for filtering harmful data out of your training set. Please refer to the Hugging Face pages for more information about the Aegis variants, including their output labels, here: https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0 and https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0.\n", "\n", "To use the Aegis classifiers, you must get access to Llama Guard on Hugging Face here: https://huggingface.co/meta-llama/LlamaGuard-7b. Afterwards, you should set up a [user access token](https://huggingface.co/docs/hub/en/security-tokens) and pass that token into the constructor of this classifier.\n", "\n", - "The Aegis classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The Aegis classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." ] }, { @@ -130,6 +132,10 @@ "classifier = AegisClassifier(\n", " aegis_variant=aegis_variant,\n", " token=token,\n", + " # If desired, you may filter your dataset with:\n", + " # filter_by=[\"safe\"],\n", + " # See full list of labels here: https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0\n", + " # or here: https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0\n", ")" ] }, diff --git a/tutorials/distributed_data_classification/content-type-classification.ipynb b/tutorials/distributed_data_classification/content-type-classification.ipynb index 782da5f0d..5ac3bbc2d 100644 --- a/tutorials/distributed_data_classification/content-type-classification.ipynb +++ b/tutorials/distributed_data_classification/content-type-classification.ipynb @@ -6,9 +6,11 @@ "source": [ "# Distributed Data Classification with NeMo Curator's `ContentTypeClassifier`\n", "\n", - "This notebook demonstrates the use of NeMo Curator's `ContentTypeClassifier`. The [content type classifier](https://huggingface.co/nvidia/content-type-classifier-deberta) is used to categorize documents into one of 11 distinct speech types based on their content. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "This notebook demonstrates the use of NeMo Curator's `ContentTypeClassifier`. The [content type classifier](https://huggingface.co/nvidia/content-type-classifier-deberta) is used to categorize documents into one of 11 distinct speech types based on their content. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the content type classifier, including its output labels, here: https://huggingface.co/nvidia/content-type-classifier-deberta.\n", "\n", - "The content type classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The content type classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." ] }, { @@ -138,7 +140,11 @@ "metadata": {}, "outputs": [], "source": [ - "classifier = ContentTypeClassifier(batch_size=1024)" + "classifier = ContentTypeClassifier(batch_size=1024)\n", + "\n", + "# If desired, you may filter your dataset with:\n", + "# classifier = ContentTypeClassifier(batch_size=1024, filter_by=[\"News\"])\n", + "# See full list of labels here: https://huggingface.co/nvidia/content-type-classifier-deberta" ] }, { diff --git a/tutorials/distributed_data_classification/domain-classification.ipynb b/tutorials/distributed_data_classification/domain-classification.ipynb index eaddaa9da..88aaee0c0 100644 --- a/tutorials/distributed_data_classification/domain-classification.ipynb +++ b/tutorials/distributed_data_classification/domain-classification.ipynb @@ -6,9 +6,11 @@ "source": [ "# Distributed Data Classification with NeMo Curator's `DomainClassifier`\n", "\n", - "This notebook demonstrates the use of NeMo Curator's `DomainClassifier`. The [domain classifier](https://huggingface.co/nvidia/domain-classifier) is used to classify the domain of a text. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "This notebook demonstrates the use of NeMo Curator's `DomainClassifier`. The [domain classifier](https://huggingface.co/nvidia/domain-classifier) is used to classify the domain of a text. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the domain classifier, including its output labels, here: https://huggingface.co/nvidia/domain-classifier.\n", "\n", - "The domain classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The domain classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." ] }, { @@ -122,7 +124,11 @@ "metadata": {}, "outputs": [], "source": [ - "classifier = DomainClassifier(batch_size=1024)" + "classifier = DomainClassifier(batch_size=1024)\n", + "\n", + "# If desired, you may filter your dataset with:\n", + "# classifier = DomainClassifier(batch_size=1024, filter_by=[\"Computers_and_Electronics\", \"Health\"])\n", + "# See full list of domains here: https://huggingface.co/nvidia/domain-classifier" ] }, { diff --git a/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb b/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb index 0db6e972d..e9014e07e 100644 --- a/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb +++ b/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb @@ -6,9 +6,11 @@ "source": [ "# Distributed Data Classification with NeMo Curator's `FineWebEduClassifier`\n", "\n", - "This notebook demonstrates the use of NeMo Curator's `FineWebEduClassifier`. The [FineWeb-Edu classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) is used for judging the educational value of web pages. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "This notebook demonstrates the use of NeMo Curator's `FineWebEduClassifier`. The [FineWeb-Edu classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) is used for judging the educational value of web pages. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the FineWeb-Edu classifier here: https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier.\n", "\n", - "The FineWeb-Edu classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The FineWeb-Edu classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." ] }, { diff --git a/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb b/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb index 33024c9be..4be53e91e 100644 --- a/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb +++ b/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb @@ -6,11 +6,13 @@ "source": [ "# Distributed Data Classification with NeMo Curator's `InstructionDataGuardClassifier`\n", "\n", - "This notebook demonstrates the use of NeMo Curator's `InstructionDataGuardClassifier`. The [Instruction-Data-Guard classifier](https://huggingface.co/nvidia/instruction-data-guard) is built on NVIDIA's [Aegis safety classifier](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and is designed to detect LLM poisoning trigger attacks.\n", + "This notebook demonstrates the use of NeMo Curator's `InstructionDataGuardClassifier`. The [Instruction-Data-Guard classifier](https://huggingface.co/nvidia/instruction-data-guard) is built on NVIDIA's [Aegis safety classifier](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and is designed to detect LLM poisoning trigger attacks. Please refer to the Hugging Face page for more information about the Instruction-Data-Guard classifier here: https://huggingface.co/nvidia/instruction-data-guard.\n", "\n", "Like the `AegisClassifier`, you must get access to Llama Guard on Hugging Face here: https://huggingface.co/meta-llama/LlamaGuard-7b. Afterwards, you should set up a [user access token](https://huggingface.co/docs/hub/en/security-tokens) and pass that token into the constructor of this classifier.\n", "\n", - "The Instruction-Data-Guard classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The Instruction-Data-Guard classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." ] }, { diff --git a/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb b/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb index ed1a34558..c220074a3 100644 --- a/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb +++ b/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb @@ -6,9 +6,11 @@ "source": [ "# Distributed Data Classification with NeMo Curator's `MultilingualDomainClassifier`\n", "\n", - "This notebook demonstrates the use of NeMo Curator's `MultilingualDomainClassifier`. The [multilingual domain classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) is used to classify the domain of texts in any of 52 languages, including English. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "This notebook demonstrates the use of NeMo Curator's `MultilingualDomainClassifier`. The [multilingual domain classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) is used to classify the domain of texts in any of 52 languages, including English. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the multilingual domain classifier, including its output labels, here: https://huggingface.co/nvidia/multilingual-domain-classifier.\n", "\n", - "The multilingual domain classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The multilingual domain classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." ] }, { @@ -132,7 +134,11 @@ "metadata": {}, "outputs": [], "source": [ - "classifier = MultilingualDomainClassifier(batch_size=1024)" + "classifier = MultilingualDomainClassifier(batch_size=1024)\n", + "\n", + "# If desired, you may filter your dataset with:\n", + "# classifier = MultilingualDomainClassifier(batch_size=1024, filter_by=[\"Science\", \"Health\"])\n", + "# See full list of domains here: https://huggingface.co/nvidia/multilingual-domain-classifier" ] }, { diff --git a/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb b/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb index adc9ec50c..f8079db75 100644 --- a/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb +++ b/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb @@ -6,9 +6,11 @@ "source": [ "# Distributed Data Classification with NeMo Curator's `PromptTaskComplexityClassifier`\n", "\n", - "This notebook demonstrates the use of NeMo Curator's `PromptTaskComplexityClassifier`. The [prompt task and complexity classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) a multi-headed model which classifies English text prompts across task types and complexity dimensions. It helps with data annotation, which is useful in data blending for foundation model training.\n", + "This notebook demonstrates the use of NeMo Curator's `PromptTaskComplexityClassifier`. The [prompt task and complexity classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) a multi-headed model which classifies English text prompts across task types and complexity dimensions. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the prompt task and complexity classifier, including its output labels, here: https://huggingface.co/nvidia/prompt-task-and-complexity-classifier.\n", "\n", - "The prompt task and complexity classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The prompt task and complexity classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." ] }, { diff --git a/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb index 77a3960e1..7ab96b652 100644 --- a/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb +++ b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb @@ -10,6 +10,8 @@ "\n", "In this tutorial, we demonstrate how to use NeMo Curator's `DistributedDataClassifier` to build our own `PyTorchClassifier` class for loading and performing batched inference with multiple pretrained models. We assume the user has pretrained PTH model files, with [DeBERTaV3](https://huggingface.co/microsoft/deberta-v3-base) as the base model used for training. The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator.\n", + "\n", "First, let's run some preliminary imports and set up our Dask client." ] }, @@ -447,6 +449,8 @@ " text_field=\"text\",\n", " pred_column=pred_column,\n", " prob_column=prob_column,\n", + " # If desired, you may filter your dataset with:\n", + " # filter_by=[\"label_b\"],\n", " )\n", " dataset = classifier(dataset=dataset)\n", " fold += 1" diff --git a/tutorials/distributed_data_classification/quality-classification.ipynb b/tutorials/distributed_data_classification/quality-classification.ipynb index 6a65201d8..67e5bec05 100644 --- a/tutorials/distributed_data_classification/quality-classification.ipynb +++ b/tutorials/distributed_data_classification/quality-classification.ipynb @@ -6,9 +6,11 @@ "source": [ "# Distributed Data Classification with NeMo Curator's `QualityClassifier`\n", "\n", - "This notebook demonstrates the use of NeMo Curator's `QualityClassifier`. The [quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) is used to classify text as high, medium, or low quality. This helps with data annotation, which is useful in data blending for foundation model training.\n", + "This notebook demonstrates the use of NeMo Curator's `QualityClassifier`. The [quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) is used to classify text as high, medium, or low quality. This helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the quality classifier, including its output labels, here: https://huggingface.co/nvidia/quality-classifier-deberta.\n", "\n", - "The quality classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." + "The quality classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." ] }, { @@ -160,7 +162,10 @@ "metadata": {}, "outputs": [], "source": [ - "classifier = QualityClassifier(batch_size=1024)" + "classifier = QualityClassifier(batch_size=1024)\n", + "\n", + "# If desired, you may filter your dataset with:\n", + "# classifier = QualityClassifier(batch_size=1024, filter_by=[\"High\", \"Medium\"])" ] }, { From 17bc0e5ff1e1e719c1803d55ead53136ff0f7af3 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Mon, 16 Dec 2024 13:08:48 -0800 Subject: [PATCH 10/10] change to output_path Signed-off-by: Sarah Yurick --- .../distributed_data_classification/aegis-classification.ipynb | 2 +- .../content-type-classification.ipynb | 2 +- .../fineweb-edu-classification.ipynb | 2 +- .../instruction-data-guard-classification.ipynb | 2 +- .../multilingual-domain-classification.ipynb | 2 +- .../prompt-task-complexity-classification.ipynb | 2 +- .../pytorch-ensemble-classification.ipynb | 2 +- .../quality-classification.ipynb | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tutorials/distributed_data_classification/aegis-classification.ipynb b/tutorials/distributed_data_classification/aegis-classification.ipynb index a509029c8..2b3b9076d 100644 --- a/tutorials/distributed_data_classification/aegis-classification.ipynb +++ b/tutorials/distributed_data_classification/aegis-classification.ipynb @@ -181,7 +181,7 @@ "%%time\n", "\n", "result_dataset = classifier(dataset=input_dataset)\n", - "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" ] }, { diff --git a/tutorials/distributed_data_classification/content-type-classification.ipynb b/tutorials/distributed_data_classification/content-type-classification.ipynb index 5ac3bbc2d..97df8485c 100644 --- a/tutorials/distributed_data_classification/content-type-classification.ipynb +++ b/tutorials/distributed_data_classification/content-type-classification.ipynb @@ -176,7 +176,7 @@ "%%time\n", "\n", "result_dataset = classifier(dataset=input_dataset)\n", - "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" ] }, { diff --git a/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb b/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb index e9014e07e..9a3310d0d 100644 --- a/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb +++ b/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb @@ -156,7 +156,7 @@ "%%time\n", "\n", "result_dataset = classifier(dataset=input_dataset)\n", - "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" ] }, { diff --git a/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb b/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb index 4be53e91e..14ec962fe 100644 --- a/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb +++ b/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb @@ -170,7 +170,7 @@ "%%time\n", "\n", "result_dataset = classifier(dataset=input_dataset)\n", - "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" ] }, { diff --git a/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb b/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb index c220074a3..431dcc3f7 100644 --- a/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb +++ b/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb @@ -170,7 +170,7 @@ "%%time\n", "\n", "result_dataset = classifier(dataset=input_dataset)\n", - "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" ] }, { diff --git a/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb b/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb index f8079db75..a77599aed 100644 --- a/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb +++ b/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb @@ -165,7 +165,7 @@ "%%time\n", "\n", "result_dataset = classifier(dataset=input_dataset)\n", - "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" ] }, { diff --git a/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb index 7ab96b652..9f12de80c 100644 --- a/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb +++ b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb @@ -492,7 +492,7 @@ "source": [ "%%time\n", "\n", - "dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" ] }, { diff --git a/tutorials/distributed_data_classification/quality-classification.ipynb b/tutorials/distributed_data_classification/quality-classification.ipynb index 67e5bec05..c54376539 100644 --- a/tutorials/distributed_data_classification/quality-classification.ipynb +++ b/tutorials/distributed_data_classification/quality-classification.ipynb @@ -197,7 +197,7 @@ "%%time\n", "\n", "result_dataset = classifier(dataset=input_dataset)\n", - "result_dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" ] }, {