diff --git a/nemo_curator/filters/__init__.py b/nemo_curator/filters/__init__.py index 4eb800992..5ca7a2a2d 100644 --- a/nemo_curator/filters/__init__.py +++ b/nemo_curator/filters/__init__.py @@ -48,6 +48,7 @@ WordCountFilter, WordsWithoutAlphabetsFilter, ) +from .synthetic import AnswerabilityFilter, EasinessFilter __all__ = [ "DocumentFilter", @@ -84,4 +85,6 @@ "AlphaFilter", "HTMLBoilerplateFilter", "PerExtensionFilter", + "AnswerabilityFilter", + "EasinessFilter", ] diff --git a/nemo_curator/filters/synthetic.py b/nemo_curator/filters/synthetic.py new file mode 100644 index 000000000..c54a32a5b --- /dev/null +++ b/nemo_curator/filters/synthetic.py @@ -0,0 +1,219 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ----------------------------------------------------------------------------80
# ------------------------ EASINESS FILTER -------------------------------------
# ----------------------------------------------------------------------------80
class EasinessFilter(DocumentFilter):
    """
    Discards questions that are deemed easy to retrieve by retriever models.

    Every (context, question) pair is scored with the dot product of the
    context's "passage" embedding and the question's "query" embedding, both
    requested from an embeddings endpoint through the OpenAI client. Pairs
    whose score is above the configured percentile of the scores in the
    batch are dropped.

    Args:
        base_url: Base URL handed to the OpenAI client.
        api_key: API key handed to the OpenAI client.
        model: Name of the embedding model to query.
        percentile: Passed unchanged as ``q`` to ``numpy.percentile``, which
            expects a value in [0, 100].
            NOTE(review): the default of 0.7 therefore selects the 0.7th
            percentile, not the 70th -- confirm the intended scale.
        truncate: Value sent as ``truncate`` in the request's ``extra_body``.
            A falsy value falls back to "NONE".
        batch_size: Stored on the instance; not used by this class itself.
        text_fields: Two column names, ``[context_column, question_column]``.
            Defaults to ``["text", "question"]``.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
        model: str,
        percentile: float = 0.7,
        truncate: str = "NONE",
        batch_size: int = 1,
        text_fields: Union[List[str], None] = None,
    ):

        self._name = "easiness_filter"
        self.base_url = base_url
        self.api_key = api_key
        self.nim_model = model
        self.percentile = percentile
        # Always define the attribute: it is read on every embedding request,
        # so leaving it unset for a falsy argument made every request fail.
        self.truncate = truncate if truncate else "NONE"
        # Define the attribute up front so a failed construction leaves a
        # well-defined (None) client instead of a missing attribute.
        self.client = None
        try:
            self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)
        except Exception as e:
            print(f"Error accessing NIM model: {e}")
        self.batch_size = batch_size
        # Copy so the instance never aliases a list owned by the caller.
        self.text_fields = (
            ["text", "question"] if text_fields is None else list(text_fields)
        )

    @batched
    def score_document(self, df: pd.DataFrame):
        """
        Score every row of ``df`` by context/question embedding similarity.

        Returns:
            A Series of similarity scores aligned with ``df.index``.
        """
        document_score = self._calc_similarity_nim(
            df[self.text_fields[0]].to_list(), df[self.text_fields[1]].to_list()
        )
        return pd.Series(document_score, index=df.index)

    @batched
    def keep_document(self, scores: pd.Series):
        """
        Keep the rows whose score is at or below the percentile threshold.

        The threshold is computed from the ``scores`` passed in, so it is
        relative to that batch.

        Returns:
            A boolean Series, True for rows to keep.
        """
        if len(scores) == 0:
            # Nothing to rank; avoid asking numpy for a percentile of nothing.
            return pd.Series([], index=scores.index, dtype=bool, name=scores.name)
        filter_threshold = np.percentile(scores, self.percentile)
        return scores <= filter_threshold

    def _get_nim_embedding(self, text, input_type):
        """
        Request embeddings for ``text`` from the embedding model.

        Args:
            text: A single string or a list of strings.
            input_type: Sent as ``input_type`` in ``extra_body``; this class
                uses "passage" for contexts and "query" for questions.

        Returns:
            For a list input, a list with one embedding per item; for a
            string input, that string's embedding. An empty list is returned
            when the input type is unsupported, the input list is empty, or
            the request fails.
        """
        if isinstance(text, list):
            input_ = text
        elif isinstance(text, str):
            input_ = [text]
        else:
            # Previously this surfaced as an unbound-local error that was
            # swallowed by the request's except clause; report it explicitly
            # and keep the same "no embeddings" result.
            print(f"Error: unsupported input type {type(text).__name__}")
            return []

        if not input_:
            # No texts, no embeddings; skip the round trip.
            return []

        try:
            response = self.client.embeddings.create(
                input=input_,
                model=self.nim_model,
                encoding_format="float",
                extra_body={"input_type": input_type, "truncate": self.truncate},
            )
        except Exception as e:
            # Best effort: callers treat an empty result as "no score".
            print(f"Error: {e}")
            response = None

        if not response:
            return []
        if isinstance(text, list):
            return [r.embedding for r in response.data]
        return response.data[0].embedding

    def _calc_similarity_nim(self, context, question):
        """
        Compute the dot-product similarity between contexts and questions.

        This is a plain dot product; it equals cosine similarity only if the
        model returns unit-length embeddings (not verified here).

        Returns:
            For list inputs, an array with one score per pair (all zeros when
            either embedding request failed). Otherwise a single score, 0.0
            on failure.
        """
        doc_embed = self._get_nim_embedding(text=context, input_type="passage")
        q_embed = self._get_nim_embedding(text=question, input_type="query")
        have_both = bool(doc_embed) and bool(q_embed)

        if isinstance(context, list) and isinstance(question, list):
            if not have_both:
                return np.zeros(len(context))
            # Row-wise dot products: same values as the diagonal of
            # doc @ query.T without materialising the full N x N matrix.
            return np.einsum("ij,ij->i", np.asarray(doc_embed), np.asarray(q_embed))

        return np.dot(doc_embed, q_embed) if have_both else 0.0

    def __dask_tokenize__(self):
        # The token is derived from the class only, not from instance state.
        return normalize_token(EasinessFilter)


# ----------------------------------------------------------------------------80
# ----------------------- Answerability Filter ---------------------------------
# ----------------------------------------------------------------------------80


class AnswerabilityFilter(DocumentFilter):
    """
    Discards questions that are not answerable by content present in the
    context document.

    A chat model is asked to judge each (context, question) pair. Its reply
    is expected to be a JSON object with keys ``criterion_1`` ...
    ``criterion_<num_criteria>``; a pair is kept only when every one of them
    equals "Y". Replies that cannot be parsed keep the pair.

    Args:
        base_url: Base URL handed to the OpenAI client.
        api_key: API key handed to the OpenAI client.
        model: Name of the chat model used as judge.
        answerability_system_prompt: Instructions placed before the user
            prompt in the request.
        answerability_user_prompt_template: Template formatted with
            ``context`` and ``question``.
        num_criteria: Number of ``criterion_<i>`` keys checked in the reply.
        text_fields: Two column names, ``[context_column, question_column]``.
            Defaults to ``["text", "question"]``.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
        model: str,
        answerability_system_prompt: str,
        answerability_user_prompt_template: str,
        num_criteria: int,
        text_fields: Union[List[str], None] = None,
    ):

        self._name = "answerability_filter"
        self.base_url = base_url
        self.api_key = api_key
        self.model_name = model
        self.system_prompt = answerability_system_prompt
        self.user_prompt_template = answerability_user_prompt_template
        self.num_criteria = num_criteria

        # Define the attribute up front so a failed construction leaves a
        # well-defined (None) client instead of a missing attribute.
        self.client = None
        try:
            self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)
        except Exception as e:
            print(f"Error accessing NIM model: {e}")

        # Copy so the instance never aliases a list owned by the caller.
        self.text_fields = (
            ["text", "question"] if text_fields is None else list(text_fields)
        )

    @batched
    def score_document(self, df: pd.DataFrame):
        """
        Ask the judge model about every row of ``df``.

        Returns:
            A Series aligned with ``df.index`` holding the raw reply for each
            row, or None where the request failed.
        """
        if df.empty:
            # DataFrame.apply(axis=1) on an empty frame hands back a
            # DataFrame; return an empty Series so the result type is stable.
            return pd.Series([], index=df.index, dtype=object)
        return df.apply(
            lambda row: self._llm_as_judge(
                row[self.text_fields[0]], row[self.text_fields[1]]
            ),
            axis=1,
        )

    # ----------------------------------------------------------------------------80
    @batched
    def keep_document(self, scores: pd.Series):
        """
        Decide from each judge reply whether the row is kept.

        Returns:
            A Series of booleans, True for rows to keep. A row is dropped
            only when its reply parses and some checked criterion is not "Y".
        """

        def _keep_document(score: str):
            try:
                json_ans = json.loads(score)
                # Stops at the first failing criterion, as any failure
                # is enough to filter the row out.
                return all(
                    json_ans[f"criterion_{i + 1}"] == "Y"
                    for i in range(self.num_criteria)
                )
            except (TypeError, ValueError, KeyError) as e:
                # Missing reply (None), invalid JSON, unexpected JSON shape or
                # a missing criterion key: keep the document, but say so.
                print(f"Parse error {e}")
                return True

        return scores.apply(_keep_document)

    def _llm_as_judge(self, context: str, question: str):
        """
        Send one (context, question) pair to the judge model.

        The system prompt and the formatted user prompt are concatenated and
        sent as a single "user" message.

        Returns:
            The text of the first completion choice, or None when the
            request fails.
        """
        user_query = self.system_prompt + "\n\n"
        user_query += self.user_prompt_template.format(
            context=context, question=question
        )

        try:
            completion = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": user_query}],
                temperature=0.5,
                top_p=1,
                max_tokens=1024,
            )

            generation = completion.choices[0].message.content

        except Exception as e:
            # Best effort: keep_document treats a missing reply as "keep".
            print(f"API call error {e}")
            return None  # generation

        return generation

    def __dask_tokenize__(self):
        # The token is derived from the class only, not from instance state.
        return normalize_token(AnswerabilityFilter)


# ----------------------------------------------------------------------------80
class SyntheticDataGenerator(ABC):
    """
    Abstract template for a synthetic data generation pipeline.

    Concrete pipelines derive from this class and must provide both
    ``generate`` and ``parse_response``; the class cannot be instantiated
    until they do.
    """

    def __init__(self):
        super().__init__()
        # The instance name is the name of the concrete subclass.
        self._name = type(self).__name__

    @abstractmethod
    def generate(self, llm_prompt: Union[str, List[str]]) -> Union[str, List[str]]:
        """Produce output for one prompt or a list of prompts (subclass hook)."""

    @abstractmethod
    def parse_response(self, llm_response: Union[str, List[str]]) -> Any:
        """Turn one response or a list of responses into parsed data (subclass hook)."""
# The tutorial directory name contains hyphens, so its config module cannot be
# reached with a regular import statement.
config_module = importlib.import_module(
    "tutorials.nemo-retriever-synthetic-data-generation.config.config"
)


@pytest.fixture
def get_original_data():
    """Two raw documents (id, text, title) wrapped in a DocumentDataset."""
    eiffel_tower = {
        "_id": "930220d64a44c223df83e0caf09013fffdf4c19c1f501f035862984979928b29",
        "text": "The Eiffel Tower is an iconic landmark of Paris, France. It was designed by the engineer Gustave Eiffel and built for the 1889 Exposition Universelle (World's Fair) to celebrate the 100th anniversary of the French Revolution.",
        "title": "Eiffel Tower - A French Icon",
    }
    great_wall = {
        "_id": "5cdca9fa81b6c4d8a1a1159610c98b2bffae498dad36c90639413bf22e5a4154",
        "text": "The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China to protect the Chinese states and empires against raids and invasions from various nomadic groups.",
        "title": "The Great Wall of China - Ancient Protection",
    }
    frame = pd.DataFrame([eiffel_tower, great_wall])
    return DocumentDataset.from_pandas(frame)


@pytest.fixture
def get_generated_data():
    """Three documents, each paired with a generated question and answer."""
    eiffel_tower = {
        "_id": "930220d64a44c223df83e0caf09013fffdf4c19c1f501f035862984979928b29",
        "text": "The Eiffel Tower is an iconic landmark of Paris, France. It was designed by the engineer Gustave Eiffel and built for the 1889 Exposition Universelle (World's Fair) to celebrate the 100th anniversary of the French Revolution.",
        "title": "Eiffel Tower - A French Icon",
        "question-id": "d9be8cb0693a354b2ba8ddd1e86c9df57db97f33e03cb33c972f8efed4084f8b",
        "question": "What is the significance of the Eiffel Tower in relation to the French Revolution?",
        "answer": "The Eiffel Tower was built to celebrate the 100th anniversary of the French Revolution.",
    }
    great_wall = {
        "_id": "5cdca9fa81b6c4d8a1a1159610c98b2bffae498dad36c90639413bf22e5a4154",
        "text": "The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China to protect the Chinese states and empires against raids and invasions from various nomadic groups.",
        "title": "The Great Wall of China - Ancient Protection",
        "question-id": "e1f6e179a883a7f108d566a582a159322b2eb2b8e0d51fd78cafc72373e4be2b",
        "question": "What is the purpose of the Great Wall of China?",
        "answer": "The purpose of the Great Wall of China is to protect the Chinese states and empires against raids and invasions from various nomadic groups.",
    }
    machu_picchu = {
        "_id": "35f822b0b38de133b815139affac94d57d0c7d35de6e11d0e52a69d416c1d248",
        "text": "Machu Picchu is a 15th-century Inca citadel situated on a mountain ridge above the Sacred Valley in Peru. It is the most famous icon of Inca civilization, known for its sophisticated dry-stone walls that fuse huge blocks without the use of mortar.",
        "title": "Machu Picchu - Lost City of the Incas",
        "question-id": "1a60e50066c938784db3d49c41e470c197b3fc30afa07957575a1d8a34a34230",
        "question": "What is Machu Picchu renowned for in terms of its architecture?",
        "answer": "Machu Picchu is renowned for its sophisticated dry-stone walls that fuse huge blocks without the use of mortar.",
    }
    frame = pd.DataFrame([eiffel_tower, great_wall, machu_picchu])
    return DocumentDataset.from_pandas(frame)


@pytest.fixture
def get_config():
    """Tutorial config loaded from YAML, with the API key read from NVIDIA_API_KEY."""
    config = config_module.RetrieverEvalSDGConfig.from_yaml(
        "./tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml"
    )
    config.api_key = os.environ.get("NVIDIA_API_KEY")
    return config


class TestSDGFilterModule:
    """Runs each synthetic-data filter through ScoreFilter on the generated sample."""

    @staticmethod
    def _assert_filter_output(score_filter, dataset, score_field):
        # Shared checks: the score column is present and no rows were added.
        rows_before = dataset.df.compute().shape[0]
        filtered_df = score_filter(dataset).df.compute()
        assert score_field in filtered_df
        assert rows_before >= filtered_df.shape[0]

    def test_easiness_filter(self, get_generated_data, get_config):
        easiness = EasinessFilter(
            base_url=get_config.base_url,
            api_key=get_config.api_key,
            model=get_config.easiness_filter,
            percentile=get_config.percentile,
            truncate=get_config.truncate,
            batch_size=get_config.batch_size,
        )
        score_filter = ScoreFilter(
            easiness, text_field=["text", "question"], score_field="easiness_scores"
        )
        self._assert_filter_output(score_filter, get_generated_data, "easiness_scores")

    def test_answerability_filter(self, get_generated_data, get_config):
        answerability = AnswerabilityFilter(
            base_url=get_config.base_url,
            api_key=get_config.api_key,
            model=get_config.answerability_filter,
            answerability_system_prompt=get_config.answerability_system_prompt,
            answerability_user_prompt_template=get_config.answerability_user_prompt_template,
            num_criteria=get_config.num_criteria,
        )
        score_filter = ScoreFilter(
            answerability,
            text_field=["text", "question"],
            score_field="answerability_scores",
        )
        self._assert_filter_output(
            score_filter, get_generated_data, "answerability_scores"
        )
The generated data would be used to evaluate retrieval/RAG pipelines | | [peft-curation](./peft-curation/) | Data curation sample for parameter efficient fine-tuning (PEFT) use-cases | [Blog post](https://developer.nvidia.com/blog/curating-custom-datasets-for-llm-parameter-efficient-fine-tuning-with-nvidia-nemo-curator/) | | [peft-curation-with-sdg](./peft-curation/) | Demonstrates a pipeline to leverage external models such as [Nemotron-4 340B Instruct](https://build.nvidia.com/nvidia/nemotron-4-340b-instruct) for synthetic data generation, data quality annotation via [Nemotron-4 340B Reward](https://build.nvidia.com/nvidia/nemotron-4-340b-reward), as well as other data processing steps (semantic deduplication, HTML tag removal, etc.) for parameter efficient fine-tuning (PEFT) use-cases | [Use this data to fine-tune your own model](https://github.com/NVIDIA/NeMo/blob/main/tutorials/llm/llama-3/sdg-law-title-generation/llama3-sdg-lora-nemofw.ipynb) | | [single_node_tutorial](./single_node_tutorial) | A comprehensive example to demonstrate running various NeMo Curator functionalities locally | | diff --git a/tutorials/nemo-retriever-synthetic-data-generation/README.md b/tutorials/nemo-retriever-synthetic-data-generation/README.md new file mode 100644 index 000000000..595c2bcb6 --- /dev/null +++ b/tutorials/nemo-retriever-synthetic-data-generation/README.md @@ -0,0 +1,98 @@ +# NeMo Retriever Synthetic Data Generation + +NeMo Retriever Synthetic Data Generation (SDG) is designed to streamline the creation of high-quality evaluation datasets for Text QA retrieval use cases. By leveraging existing enterprise data, this pipeline enables rapid generation of relevant evaluation datasets, facilitating improved model performance. + +This version supports the generation of evaluation datasets, creating synthetic benchmark datasets compatible with commonly used evaluation frameworks such as [BEIR](https://huggingface.co/datasets/BeIR/beir). 
Synthetic training dataset generation will be supported in an upcoming version. + +NeMo Retriever SDG can be run either from the command line, or using the [notebook example](notebooks/quickstart.ipynb) provided in this repository. Check the [Prerequisites](#prerequisites) section for instructions on generating an API key and installing libraries. To get started with the notebook, follow the [Notebook Quick Start](#run-pipeline-ipython-notebook) instructions. Otherwise, follow the [CLI Quick Start](#run-pipeline-cli) section. + +![NeMo Retriever SDG](figures/sdg_pipeline.png) + +#### Key Features + +* Quickly generate complex QA datasets from existing text documents for retriever model evaluation. +* Output datasets can be formatted in [SQuAD (Stanford Question Answering Dataset)](https://huggingface.co/datasets/rajpurkar/squad) or [BEIR (Benchmarking Information Retrieval)](https://huggingface.co/datasets/BeIR/beir) format for easy integration into evaluation workflows. +* Designed to integrate seamlessly with [NVIDIA NeMo Evaluator](https://developer.nvidia.com/nemo-microservices) microservice, currently in early access. + + +## Quickstart + +### Prerequisites + +In order to use NeMo Retriever SDG, you will need access to NVIDIA’s API Catalog. Go to the [NGC Personal Key Manager](https://org.ngc.nvidia.com/setup) to generate a Personal Key that will allow you to access AI Foundation Models and Endpoints. + +To install the required libraries, navigate to the root directory of the project and run the following command in your notebook or command line: + +``` +$ pip install -r requirements.txt +``` + +Alternatively, you can use container `nvcr.io/nvidia/nemo:24.09`. 
+ +``` +$ docker pull nvcr.io/nvidia/nemo:24.09 + +$ docker run -it --rm --gpus all --ipc host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:24.09 + +/workspace# pip install -r requirements.txt +/workspace# jupyter notebook +``` + + +### Run Pipeline (iPython notebook) + +Navigate to the [quick start notebook](notebooks/quickstart.ipynb) and follow the instructions. + +### Run Pipeline (CLI) + +The pipeline can be run with datasets in rawdoc (only text, title and ids if any) format. To test the pipeline, you can use the provided example data at ```sample_data_rawdoc.jsonl``` + +Navigate to the top level of this project directory and run the following command in your command line. It will take roughly 5-10 minutes. + +- `Rawdoc format` + +To use rawdoc format, provide your data in a `.jsonl` file. The structure of the data should follow this format: `{"text": , "title": }`. Additionally, if the documents already have a document id, the input file can also contain document ids. The same ids will be persisted in the generated data as well. Another accepted format is `{"_id": <document_id>, "text": <document>, "title": <title>}`. + +In order to run the pipeline, use the script ```main.py``` +``` +python tutorials/nemo-retriever-synthetic-data-generation/main.py \ + --api-key=<API Key> \ + --input-file=tutorials/nemo-retriever-synthetic-data-generation/data/sample_data_rawdoc.jsonl \ + --pipeline-config=tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml\ + --input-format=rawdoc \ + --output-dir=tutorials/nemo-retriever-synthetic-data-generation/outputs/sample_data_rawdoc +``` + +For more information about the expected structure of the data, see the [quick start notebook](notebooks/quickstart.ipynb). + + +### Using Custom Configuration + +Edit [config.yaml](config/config.yaml) to update the configuration. Predefined configuration files can be found in [scripts/conf](config/config.yaml). 
+ + +## Quality Improvement Playbook (for Advanced Users) + + +The default config file [config.yaml](config/config.yaml) should work best to generate synthetic data. You would need to change the few-shot examples in the prompt for specific use-cases. In case you'd like to improve the quality of synthetic data and/or apply the SDG pipeline for other domains, consider applying the recipes described below. + + +### Prompt templates + +We recommend engineering the prompt templates for better synthetic data generations. Specifically, we have observed Chain-of-Thought prompting to result in better generations. We have provided additional config files ([config-nq.yaml](config/config-nq.yaml) and [config-fiqa.yaml](config/config-fiqa.yaml)) that showcase Chain-of-Thought prompting. + +Furthermore, they also showcase the use of in-context learning, wherein (passage, query) pairs were picked from datasets to be used as few-shot examples. Both methods yield good-quality results. + + +### Choice of Easiness Filter & Threshold + +We provide the embedding-model-as-a-judge as well as filter threshold value in our default configuration. The general recommendation to increase the difficulty of questions is to lower the filter threshold value and vice versa. The user can experiment with different filter threshold values to get more challenging or easier synthetic questions in their synthetic datasets. + +The choice of the embedding model is provided in the default configuration. We experimented and verified the quality of the pipeline with the default configuration on multiple datasets such as FiQA, NQ and other internal datasets. The user can also change the embedding-model-as-a-judge by choosing any embedding model from [Huggingface Model Hub](https://huggingface.co/models). + + +### Choice of Answerability Filter + +For Answerability Filter, our recommendation is to go with the choice provided in the default configuration file. 
We confirmed that the checkbox-style prompt in the default configuration worked well for valid question filtering. + +However, the framework is flexible of the choice of LLM-as-a-Judge and different LLMs with different prompt templates might work better for certain use cases. You can also experiment with Likert-scale prompting if need be. diff --git a/tutorials/nemo-retriever-synthetic-data-generation/__init__.py b/tutorials/nemo-retriever-synthetic-data-generation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tutorials/nemo-retriever-synthetic-data-generation/config/__init__.py b/tutorials/nemo-retriever-synthetic-data-generation/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tutorials/nemo-retriever-synthetic-data-generation/config/config-fiqa.yaml b/tutorials/nemo-retriever-synthetic-data-generation/config/config-fiqa.yaml new file mode 100644 index 000000000..bd4244dde --- /dev/null +++ b/tutorials/nemo-retriever-synthetic-data-generation/config/config-fiqa.yaml @@ -0,0 +1,101 @@ +# Basic config for all the NIM models +base_url: https://integrate.api.nvidia.com/v1 +api_key: "your api key here" + +# LLM Generator Module +generator_model: mistralai/mixtral-8x22b-instruct-v0.1 +temperature: 0.5 +top_p: 1.0 +max_tokens: 2048 +num_questions: 3 +squad_format: false +generator_system_prompt: | + You are a data annotator trying to generate questions and corresponding answers based on input document. Use the following guidelines: + + - Identify key phrases and entities in the document and generate questions based on those key phrases and entities in the document. + - Generate questions that could be answered by a piece of information in the input document. 
+ - Do not generate questions which requires looking at the input document to comprehend the question + - Do not use phrases like 'according to the document', 'according to the author', 'in the document', 'this document' etc + - Questions can also be in the form of key phrases in the document + - Generate questions that are relevant to the idea expressed in the input document, and the input document contains the complete answer to your question. + - Generate questions that provide specific context that can lead to the specific answer contained in the input document. + - Generate questions that are varied and different from each other. You can change up the phrasing, vocabulary, complexity, and the type of questions you ask throughout the task. + - DO NOT copy and paste exact phrasing from the test. Formulate questions in your own words. + - Generate answers to the questions as well. + - Provide an explanation as to why the generated question is good. Use the following example questions and answers for reference. + - Generated Questions should start with Question: + - Generated Answers should start with Answer: + - Generated Explanations should start with Explanation: + + Examples: + + Input document: + Just have the associate sign the back and then deposit it. It's called a third party cheque and is perfectly legal. I wouldn't be surprised if it has a longer hold period and, as always, you don't get the money if the cheque doesn't clear. Now, you may have problems if it's a large amount or you're not very well known at the bank. In that case you can have the associate go to the bank and endorse it in front of the teller with some ID. You don't even technically have to be there. Anybody can deposit money to your account if they have the account number. He could also just deposit it in his account and write a cheque to the business. + Have the check reissued to the proper payee. 
+ + Question: + How to deposit a cheque issued to an associate in my business into my business account? + + Input Document: + Sure you can. You can fill in whatever you want in the From section of a money order, so your business name and address would be fine. The price only includes the money order itself. You can hand deliver it yourself if you want, but if you want to mail it, you'll have to provide an envelope and a stamp. Note that, since you won't have a bank record of this payment, you'll want to make sure you keep other records, such as the stub of the money order. You should probably also ask the contractor to give you a receipt. + + Question: + Can I send a money order from USPS as a business? + + Input Document: + Funds earned and spent before opening a dedicated business account should be classified according to their origination. For example, if your business received income, where did that money go? If you took the money personally, it would be considered either a 'distribution' or a 'loan' to you. It is up to you which of the two options you choose. On the flip side, if your business had an expense that you paid personally, that would be considered either a 'contribution of capital' or a 'loan' from you. If you choose to record these transactions as loans, you can offset them together, so you don't need two separate accounts, loan to you and loan from you. When the bank account was opened, the initial deposit came from where? If it came from your personal funds, then it is either a 'contribution of capital' or a 'loan' from you. From the sound of your question, you deposited what remained after the preceding income/expenses. This would, in effect, return the 'loan' account back to zero, if choosing that route. The above would also be how to record any expenses you may pay personally for the business (if any) in the future. 
Because these transactions were not through a dedicated business bank account, you can't record them in Quickbooks as checks and deposits. Instead, you can use Journal Entries. For any income received, you would debit your capital/loan account and credit your income account. For any expenses, you would debit the appropriate expense account and credit your distribution/loan account. Also, if setting up a loan account, you should choose either Current Asset or Current Liability type. The capital contribution and distribution account should be Equity type. Hope this helps! + + Question: + How to account for money earned and spent prior to establishing business bank accounts? + + Input Document: + I called the IRS (click here for IRS contact info) and they said I do not need to get a new EIN. I could have just filed the appropriate employer federal tax return (940/941) and then the filing requirements would have been updated. But while I was on the phone, they just updated the filing requirements for my LLC so I am all good now (I still need to file the correct form and make the correct payments, etc. but I can use this same EIN going forward). Disclaimer: Don't trust me (or this answer) for tax advice (your situation may be different). The IRS person on the phone was very helpful so I recommend calling them if you are in a similar situation. FYI, I have found calling the IRS to always be very helpful. + + Question: + Do I need a new EIN since I am hiring employees for my LLC? + + user_prompt_template: | + Generate {num_questions} questions and corresponding answers based on Input Document. + + Input Document: + {document} + +generator_user_prompt_template: | + Generate {n_openlines} questions and corresponding answers based on Input Document. 
+ Input Document: + {document} + +#Easiness filter (embedding model as judge) +easiness_filter: nvidia/nv-embedqa-e5-v5 +truncate: "END" +percentile: 70 # Percentile for threshold calculation (float) [0, 100] +batch_size: 1 + +#Answerability filter (LLM-as-judge) +answerability_filter: "meta/llama3-70b-instruct" +num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template +answerability_system_prompt: | + You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion: + Criterion 1 - Can the question be understood and answered without needing additional context or access to external references not provided within the question itself? Questions should be self-contained, meaning they do not rely on specific documents, tables, or prior knowledge not shared within the question. + Criterion 2 - Is it clear what type of answer or information the question seeks? The question should convey its purpose without ambiguity, allowing for a direct and relevant response. + Criterion 3 - Does the content in the context contain information that can answer the question or part of the question? + Criterion 4 - Does the content in the context completely answer the question? 
+ + Provide your response in a mandatory dictionary format, and a short explanation of the rating like + { + \"criterion_1_explanation\": "<Brief explanation of why criterion_1 was satisfied or not satisfied>", + \"criterion_1\": "<Y/N>", + \"criterion_2_explanation\": "<State the purpose of the question and justify why it was satisfied or not satisfied>", + \"criterion_2\": "<Y/N>", + \"criterion_3_explanation\": "<Show what parts of the content contain relevant information to the question if this criterion is satisfied, state why the information is irrelevant if unsatisfied>", + \"criterion_3\": "<Y/N>", + \"criterion_4_explanation\": "<Extract spans from the content that help completely answer the question if criterion is satisfied, state what parts are missing if not satisfied>", + \"criterion_4\": "<Y/N>" + } + Provide only the dictionary response and nothing else. + +answerability_user_prompt_template: | + Context Passage: + {context} + Question: + {question} diff --git a/tutorials/nemo-retriever-synthetic-data-generation/config/config-nq.yaml b/tutorials/nemo-retriever-synthetic-data-generation/config/config-nq.yaml new file mode 100644 index 000000000..c5486eeb6 --- /dev/null +++ b/tutorials/nemo-retriever-synthetic-data-generation/config/config-nq.yaml @@ -0,0 +1,101 @@ +# Basic config for all the NIM models +base_url: https://integrate.api.nvidia.com/v1 +api_key: "your api key here" + +# LLM Generator Module +generator_model: mistralai/mixtral-8x22b-instruct-v0.1 +temperature: 0.5 +top_p: 1.0 +max_tokens: 2048 +num_questions: 3 +squad_format: false +generator_system_prompt: | + Generate questions that are relevant to the input document provided. + Follow these General Instructions: + - Questions must be completely answered by the input document. + - Questions must be relevant to the input document. 
+ - Do not generate questions which requires looking at the input document to comprehend the question + - Generate questions and answers to the generated questions. + - Generated Questions should start with Question: + - Generated Answers should start with Answer: + Follow this chain of thought when formulating questions: + Step 1: Identify key phrases and entities in the input document + Step 2: Generate questions based on those key phrases and entities + + Compress any compounded questions to shorter questions to sound realistic. Questions can also be in the form of short phrases. + Use the following examples as guidelines. + + Examples: + + Input document: + In November 2013, Senate Democrats led by Harry Reid used the nuclear option to eliminate the 60 - vote rule on executive branch nominations and federal judicial appointments, but not for the Supreme Court. In April 2017, Senate Republicans led by Mitch McConnell extended the nuclear option to Supreme Court and the nomination of Neil Gorsuch ending the debate. + + Question: + who changed the senate rules for supreme court nominees? + + Input Document: + The First Pan-African Conference was held in London from 23 to 25 July 1900 (just prior to the Paris Exhibition of 1900 "in order to allow tourists of African descent to attend both events \'\'). Organized primarily by the Trinidadian barrister Henry Sylvester Williams, it took place in Westminster Town Hall (now Caxton Hall) and was attended by 37 delegates and about 10 other participants and observers from Africa, the West Indies, the US and the UK, including Samuel Coleridge Taylor (the youngest delegate), John Alcindor, Dadabhai Naoroji, John Archer, Henry Francis Downing, and W.E.B. Du Bois, with Bishop Alexander Walters of the AME Zion Church taking the chair. 
Du Bois played a leading role, drafting a letter ("Address to the Nations of the World \'\') to European leaders appealing to them to struggle against racism, to grant colonies in Africa and the West Indies the right to self - government and demanding political and other rights for African Americans. + + Question: + which figure in the pan-african movement organized the first pan-african conference in 1900 apex? + + Input Document: + After the outbreak of World War I the Defence of the Realm Act was passed by Parliament in 1914. One section of the Act concerned the hours pubs could sell alcohol, as it was believed that alcohol consumption would interfere with the war effort. It restricted opening hours for licensed premises to luncheon (12: 00 to 14: 40) and supper (18: 30 to 21: 30). In the late 1980s the licensing laws in England and Wales became less restricted and allowed pubs to allow the consumption of alcohol on the premises from 11: 00 until 23: 00, although nightclubs were allowed to stay open much later. Significantly revised rules were introduced in November 2005, when hour limits were scrapped, and pubs were allowed to apply for licences as permissive as "24 hours a day \'\'. In practice, most pubs chose to apply for more restrictive licences. + + Question: + when did all day drinking start in england? + + Input Document: + The Washington Nationals are a professional baseball team based in Washington, D.C. The Nationals compete in Major League Baseball (MLB) as a member club of the National League (NL) East division. From 2005 to 2007, the team played in RFK Stadium; since 2008 their home stadium has been Nationals Park on South Capitol Street in Southeast D.C., near the Anacostia River. + + Question: + where are the washington nationals based out of? + + Input Document: + Hugs and kisses or xoxo is a term used for expressing sincerity, faith, love, or good friendship at the end of a written letter, email or SMS text message. 
+ + Question: + what is the symbol for hugs and kisses? + + +generator_user_prompt_template: | + Generate {n_openlines} questions and corresponding answers based on Input Document. + + Input Document: + {document} + + +#Easiness filter (embedding model as judge) +easiness_filter: nvidia/nv-embedqa-e5-v5 +truncate: "END" +percentile: 70 # Percentile for threshold calculation (float) [0, 100] +batch_size: 1 + +#Answerability filter (LLM-as-judge) +answerability_filter: "meta/llama3-70b-instruct" +num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template +answerability_system_prompt: | + You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion: + Criterion 1 - Can the question be understood and answered without needing additional context or access to external references not provided within the question itself? Questions should be self-contained, meaning they do not rely on specific documents, tables, or prior knowledge not shared within the question. + Criterion 2 - Is it clear what type of answer or information the question seeks? The question should convey its purpose without ambiguity, allowing for a direct and relevant response. + Criterion 3 - Does the content in the context contain information that can answer the question or part of the question? + Criterion 4 - Does the content in the context completely answer the question? 
+ + Provide your response in a mandatory dictionary format, and a short explanation of the rating like + { + \"criterion_1_explanation\": "<Brief explanation of why criterion_1 was satisfied or not satisfied>", + \"criterion_1\": "<Y/N>", + \"criterion_2_explanation\": "<State the purpose of the question and justify why it was satisfied or not satisfied>", + \"criterion_2\": "<Y/N>", + \"criterion_3_explanation\": "<Show what parts of the content contain relevant information to the question if this criterion is satisfied, state why the information is irrelevant if unsatisfied>", + \"criterion_3\": "<Y/N>", + \"criterion_4_explanation\": "<Extract spans from the content that help completely answer the question if criterion is satisfied, state what parts are missing if not satisfied>", + \"criterion_4\": "<Y/N>" + } + Provide only the dictionary response and nothing else. + +answerability_user_prompt_template: | + Context Passage: + {context} + Question: + {question} diff --git a/tutorials/nemo-retriever-synthetic-data-generation/config/config.py b/tutorials/nemo-retriever-synthetic-data-generation/config/config.py new file mode 100644 index 000000000..27e2f8f1c --- /dev/null +++ b/tutorials/nemo-retriever-synthetic-data-generation/config/config.py @@ -0,0 +1,94 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import warnings +from dataclasses import dataclass, field +from typing import List, Optional + +import yaml + +from nemo_curator.modules.config import BaseConfig + + +@dataclass +class RetrieverEvalSDGConfig(BaseConfig): + """ + Configuration for SDG pipeline for Retriever Evals + + Attributes: + + """ + + base_url: str # endpoint base URL; main.py passes it to both filters + api_key: str # credential for base_url; main.py may replace it with the --api-key argument + generator_model: str = "mistralai/mixtral-8x22b-instruct-v0.1" + generator_url: Optional[str] = None + generator_api_key: Optional[str] = None + temperature: Optional[float] = 0.5 + top_p: Optional[float] = 1.0 + num_questions: Optional[int] = 1 + max_tokens: Optional[int] = 2048 + squad_format: Optional[bool] = False + + generator_system_prompt: Optional[ + str + ] = """You are data annotator, your task is + to generate a question for the given document. Also generate answer to the generated + question.""" + + generator_user_prompt_template: Optional[ + str + ] = """Generate {n_openlines} questions and corresponding answers based on Input Document. + Input Document: + {document} + """ + + # easiness filter parameters + easiness_filter: str = None # embedding model name; main.py skips the filter when unset. NOTE(review): annotation should be Optional[str] + easiness_url: Optional[str] = None + easiness_api_key: Optional[str] = None + truncate: str = "END" + percentile: float = 70 # percentile in [0, 100] used for the threshold, per the YAML configs + batch_size: Optional[int] = 1 + + # answerability filter parameters + answerability_filter: str = None # judge LLM name; main.py skips the filter when unset. NOTE(review): annotation should be Optional[str] + answerability_url: Optional[str] = None + answerability_api_key: Optional[str] = None + num_criteria: int = 4 # must match the number of criteria in answerability_system_prompt + answerability_system_prompt: str = """You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion: + Criterion 1 - Can the question be understood and answered without needing additional context or access to external references not provided within the question itself? 
Questions should be self-contained, meaning they do not rely on specific documents, tables, or prior knowledge not shared within the question. + Criterion 2 - Is it clear what type of answer or information the question seeks? The question should convey its purpose without ambiguity, allowing for a direct and relevant response. + Criterion 3 - Does the content in the context contain information that can answer the question or part of the question? + Criterion 4 - Does the content in the context completely answer the question? + + Provide your response in a mandatory dictionary format, and a short explanation of the rating like + { + \"criterion_1_explanation\": "<Brief explanation of why criterion_1 was satisfied or not satisfied>", + \"criterion_1\": "<Y/N>", + \"criterion_2_explanation\": "<State the purpose of the question and justify why it was satisfied or not satisfied>", + \"criterion_2\": "<Y/N>", + \"criterion_3_explanation\": "<Show what parts of the content contain relevant information to the question if this criterion is satisfied, state why the information is irrelevant if unsatisfied>", + \"criterion_3\": "<Y/N>", + \"criterion_4_explanation\": "<Extract spans from the content that help completely answer the question if criterion is satisfied, state what parts are missing if not satisfied>", + \"criterion_4\": "<Y/N>" + } + Provide only the dictionary response and nothing else. 
+ """ + answerability_user_prompt_template: str = """Context Passage: + {context} + Question: + {question} + """ diff --git a/tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml b/tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml new file mode 100644 index 000000000..4f94dd051 --- /dev/null +++ b/tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml @@ -0,0 +1,92 @@ +# Basic config for all the NIM models +base_url: https://integrate.api.nvidia.com/v1 +api_key: "your api key here" + +# LLM Generator Module +generator_model: mistralai/mixtral-8x22b-instruct-v0.1 +temperature: 0.5 +top_p: 1.0 +max_tokens: 2048 +num_questions: 3 +squad_format: false +generator_system_prompt: | + You are a data annotator trying to generate questions and corresponding answers based on input document. Use the following guidelines: + + - Identify key phrases and entities in the document and generate questions based on those key phrases and entities in the document. + - Generate questions that could be answered by a piece of information in the input document. + - Do not generate questions which requires looking at the input document to comprehend the question + - Do not use phrases like 'according to the document', 'according to the author', 'in the document', 'this document' etc + - Questions can also be in the form of key phrases in the document + - Generate questions that are relevant to the idea expressed in the input document, and the input document contains the complete answer to your question. + - Generate questions that provide specific context that can lead to the specific answer contained in the input document. + - Generate questions that are varied and different from each other. You can change up the phrasing, vocabulary, complexity, and the type of questions you ask throughout the task. + - DO NOT copy and paste exact phrasing from the test. Formulate questions in your own words. + - Generate answers to the questions as well. 
+ - Provide an explanation as to why the generated question is good. Use the following example questions and answers for reference. + - Generated Questions should start with Question: + - Generated Answers should start with Answer: + - Generated Explanations should start with Explanation: + + Examples: + + Input document: + Just have the associate sign the back and then deposit it. It's called a third party cheque and is perfectly legal. I wouldn't be surprised if it has a longer hold period and, as always, you don't get the money if the cheque doesn't clear. Now, you may have problems if it's a large amount or you're not very well known at the bank. In that case you can have the associate go to the bank and endorse it in front of the teller with some ID. You don't even technically have to be there. Anybody can deposit money to your account if they have the account number. He could also just deposit it in his account and write a cheque to the business. + Have the check reissued to the proper payee. + + Question: + How to deposit a cheque issued to an associate in my business into my business account? + + Input Document: + Sure you can. You can fill in whatever you want in the From section of a money order, so your business name and address would be fine. The price only includes the money order itself. You can hand deliver it yourself if you want, but if you want to mail it, you'll have to provide an envelope and a stamp. Note that, since you won't have a bank record of this payment, you'll want to make sure you keep other records, such as the stub of the money order. You should probably also ask the contractor to give you a receipt. + + Question: + Can I send a money order from USPS as a business? + + Input Document: + Funds earned and spent before opening a dedicated business account should be classified according to their origination. For example, if your business received income, where did that money go? 
If you took the money personally, it would be considered either a 'distribution' or a 'loan' to you. It is up to you which of the two options you choose. On the flip side, if your business had an expense that you paid personally, that would be considered either a 'contribution of capital' or a 'loan' from you. If you choose to record these transactions as loans, you can offset them together, so you don't need two separate accounts, loan to you and loan from you. When the bank account was opened, the initial deposit came from where? If it came from your personal funds, then it is either a 'contribution of capital' or a 'loan' from you. From the sound of your question, you deposited what remained after the preceding income/expenses. This would, in effect, return the 'loan' account back to zero, if choosing that route. The above would also be how to record any expenses you may pay personally for the business (if any) in the future. Because these transactions were not through a dedicated business bank account, you can't record them in Quickbooks as checks and deposits. Instead, you can use Journal Entries. For any income received, you would debit your capital/loan account and credit your income account. For any expenses, you would debit the appropriate expense account and credit your distribution/loan account. Also, if setting up a loan account, you should choose either Current Asset or Current Liability type. The capital contribution and distribution account should be Equity type. Hope this helps! + + Question: + How to account for money earned and spent prior to establishing business bank accounts? + + +generator_user_prompt_template: | + Generate {n_openlines} questions and corresponding answers based on Input Document. 
+ + Input Document: + {document} + + +#Easiness filter (embedding model as judge) +easiness_filter: nvidia/nv-embedqa-e5-v5 +truncate: "END" +percentile: 70 # Percentile for threshold calculation (float) [0, 100] +batch_size: 1 + +#Answerability filter (LLM-as-judge) +answerability_filter: "meta/llama3-70b-instruct" +num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template +answerability_system_prompt: | + You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion: + Criterion 1 - Can the question be understood and answered without needing additional context or access to external references not provided within the question itself? Questions should be self-contained, meaning they do not rely on specific documents, tables, or prior knowledge not shared within the question. + Criterion 2 - Is it clear what type of answer or information the question seeks? The question should convey its purpose without ambiguity, allowing for a direct and relevant response. + Criterion 3 - Does the content in the context contain information that can answer the question or part of the question? + Criterion 4 - Does the content in the context completely answer the question? 
+ + Provide your response in a mandatory dictionary format, and a short explanation of the rating like + { + \"criterion_1_explanation\": "<Brief explanation of why criterion_1 was satisfied or not satisfied>", + \"criterion_1\": "<Y/N>", + \"criterion_2_explanation\": "<State the purpose of the question and justify why it was satisfied or not satisfied>", + \"criterion_2\": "<Y/N>", + \"criterion_3_explanation\": "<Show what parts of the content contain relevant information to the question if this criterion is satisfied, state why the information is irrelevant if unsatisfied>", + \"criterion_3\": "<Y/N>", + \"criterion_4_explanation\": "<Extract spans from the content that help completely answer the question if criterion is satisfied, state what parts are missing if not satisfied>", + \"criterion_4\": "<Y/N>" + } + Provide only the dictionary response and nothing else. + +answerability_user_prompt_template: | + Context Passage: + {context} + Question: + {question} diff --git a/tutorials/nemo-retriever-synthetic-data-generation/figures/api_key.png b/tutorials/nemo-retriever-synthetic-data-generation/figures/api_key.png new file mode 100644 index 000000000..340cc8b39 Binary files /dev/null and b/tutorials/nemo-retriever-synthetic-data-generation/figures/api_key.png differ diff --git a/tutorials/nemo-retriever-synthetic-data-generation/figures/sample_output.png b/tutorials/nemo-retriever-synthetic-data-generation/figures/sample_output.png new file mode 100644 index 000000000..46c700aeb Binary files /dev/null and b/tutorials/nemo-retriever-synthetic-data-generation/figures/sample_output.png differ diff --git a/tutorials/nemo-retriever-synthetic-data-generation/figures/sdg_pipeline.png b/tutorials/nemo-retriever-synthetic-data-generation/figures/sdg_pipeline.png new file mode 100644 index 000000000..2d07b788d Binary files /dev/null and b/tutorials/nemo-retriever-synthetic-data-generation/figures/sdg_pipeline.png differ diff --git 
a/tutorials/nemo-retriever-synthetic-data-generation/main.py b/tutorials/nemo-retriever-synthetic-data-generation/main.py
new file mode 100644
index 000000000..e940903e7
--- /dev/null
+++ b/tutorials/nemo-retriever-synthetic-data-generation/main.py
@@ -0,0 +1,211 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import importlib
+import os
+import shutil
+from typing import Any, List
+
+from retriever_evalset_generator import RetrieverEvalSetGenerator
+from tqdm.dask import TqdmCallback
+
+from config.config import RetrieverEvalSDGConfig
+from nemo_curator import AsyncOpenAIClient, ScoreFilter, Sequential
+from nemo_curator.datasets import DocumentDataset
+from nemo_curator.filters import AnswerabilityFilter, EasinessFilter
+from nemo_curator.modules.filter import Score, ScoreFilter
+
+
+def get_pipeline(args: Any) -> Any:
+    """Build the generation pipeline and the optional filtering pipeline.
+
+    Args:
+        args: Parsed command-line arguments; ``pipeline_config`` and
+            ``api_key`` are read.
+
+    Returns:
+        A ``(sdg_pipeline, filtering_pipeline)`` tuple. ``filtering_pipeline``
+        is ``None`` when the config enables neither filter.
+    """
+    cfg = RetrieverEvalSDGConfig.from_yaml(args.pipeline_config)
+    # --api-key defaults to None; only override the key loaded from the YAML
+    # config when one was actually supplied on the command line.
+    if args.api_key is not None:
+        cfg.api_key = args.api_key
+
+    sdg_pipeline = Sequential(
+        [
+            RetrieverEvalSetGenerator(cfg),
+        ]
+    )
+    filters = []
+    if cfg.easiness_filter:
+        filters.append(
+            ScoreFilter(
+                EasinessFilter(
+                    cfg.base_url,
+                    cfg.api_key,
+                    cfg.easiness_filter,
+                    cfg.percentile,
+                    cfg.truncate,
+                    cfg.batch_size,
+                ),
+                text_field=["text", "question"],
+                score_field="easiness_scores",
+            )
+        )
+    if cfg.answerability_filter:
+        filters.append(
+            ScoreFilter(
+                AnswerabilityFilter(
+                    cfg.base_url,
+                    cfg.api_key,
+                    cfg.answerability_filter,
+                    cfg.answerability_system_prompt,
+                    cfg.answerability_user_prompt_template,
+                    cfg.num_criteria,
+                ),
+                text_field=["text", "question"],
+                score_field="answerability_scores",
+            )
+        )
+
+    filtering_pipeline = Sequential(filters) if filters else None
+
+    return sdg_pipeline, filtering_pipeline
+
+
+def write_to_beir(args: Any, dataset: DocumentDataset, filtered: bool = False):
+    """Write ``dataset`` under ``<output_dir>/beir/{all,filtered}`` in BEIR layout.
+
+    Produces ``queries.jsonl``, ``corpus.jsonl`` and ``qrels/test.tsv``. For the
+    filtered split the corpus is copied from the previously written ``all``
+    split, so the unfiltered data must be written first.
+
+    Raises:
+        ValueError: If ``filtered`` is True and the ``all`` corpus is missing.
+    """
+    df = dataset.df.compute()
+
+    subset = "filtered" if filtered else "all"
+    save_dir = os.path.join(args.output_dir, "beir", subset)
+    qrels_save_dir = os.path.join(save_dir, "qrels")
+    corpus_save_path = os.path.join(save_dir, "corpus.jsonl")
+    queries_save_path = os.path.join(save_dir, "queries.jsonl")
+
+    os.makedirs(save_dir)
+    os.makedirs(qrels_save_dir)
+
+    df[["question-id", "question"]].rename(
+        columns={"question-id": "_id", "question": "text"}
+    ).to_json(queries_save_path, lines=True, orient="records")
+
+    if filtered:
+        corpus_file_path = os.path.join(args.output_dir, "beir", "all", "corpus.jsonl")
+        if os.path.exists(corpus_file_path):
+            shutil.copy(corpus_file_path, corpus_save_path)
+        else:
+            raise ValueError("Generate data first")
+    else:
+        df[["_id", "text"]].to_json(corpus_save_path, lines=True, orient="records")
+
+    df[["question-id", "_id", "score"]].rename(
+        columns={"question-id": "query-id", "_id": "corpus-id"}
+    ).to_csv(os.path.join(qrels_save_dir, "test.tsv"), sep="\t", index=False)
+
+
+def main():
+    """Parse CLI arguments, generate the dataset, filter it and write both."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input-file",
+        type=str,
+        default="",
+        help="File path of input file containing document chunks for synthetic data generation",
+    )
+    parser.add_argument(
+        "--input-format",
+        type=str,
+        default="rawdoc",
+        help="The synthetic data generation framework supports two input formats rawdoc or squad.",
+    )
+    parser.add_argument(
+        "--pipeline-config",
+        type=str,
+        default="",
+        help="Pipeline configuration yaml file path",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="",
+        help="Output dir for generated data",
+    )
+    parser.add_argument(
+        "--api-key",
+        type=str,
+        default=None,
+        help="The API key to use for the synthetic data generation LLM client.",
+    )
+    parser.add_argument(
+        "--api-timeout",
+        type=int,
+        default=120,
+        help="The timeout value for API calls in seconds.",
+    )
+    args = parser.parse_args()
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+    else:
+        raise ValueError("Output directory exists already, use a new directory!")
+
+    if args.input_format == "rawdoc":
+        input_dataset = DocumentDataset.read_json(args.input_file)
+    else:
+        raise ValueError("Error: Only rawdoc format supported")
+
+    sdg_pipeline, filtering_pipeline = get_pipeline(args)
+
+    print("Generating data ...")
+    with TqdmCallback(desc="apply"):
+        generated_dataset = sdg_pipeline(input_dataset)
+        # persist() returns a new dataset; keep it so the generation (LLM
+        # calls) is not re-executed by every later compute/write.
+        generated_dataset = generated_dataset.persist()
+    print("Writing all generated data to disk ...")
+    # saving in beir format
+    write_to_beir(args, generated_dataset, filtered=False)
+
+    if filtering_pipeline is not None:
+        print("Filtering data ...")
+        with TqdmCallback(desc="apply"):
+            filtered_dataset = filtering_pipeline(generated_dataset)
+            filtered_dataset = filtered_dataset.persist()
+        print("Writing filtered data to disk ...")
+        # saving in beir format
+        write_to_beir(args, filtered_dataset, filtered=True)
+    else:
+        # get_pipeline returns None when no filter is configured.
+        print("No filters configured, skipping filtering ...")
+
+    # saving in jsonl format
+    generated_dataset.to_json(
+        os.path.join(args.output_dir, "jsonl", "all_generated_data.jsonl")
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git
a/tutorials/nemo-retriever-synthetic-data-generation/notebooks/quickstart.ipynb b/tutorials/nemo-retriever-synthetic-data-generation/notebooks/quickstart.ipynb new file mode 100644 index 000000000..dbb10586c --- /dev/null +++ b/tutorials/nemo-retriever-synthetic-data-generation/notebooks/quickstart.ipynb @@ -0,0 +1,858 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Synthetic Evaluation Data Generation\n", + "\n", + "## Table of Contents\n", + "1. [Install required libraries](#Install-required-libraries)\n", + "2. [Prepare input data](#Prepare-input-data)\n", + "3. [Generate API key](#Generating-API-key)\n", + "4. [Loading dataset](#Loading-datasets)\n", + "5. [Reading pipeline config](#Read-pipeline-config)\n", + "6. [Data Generation](#Running-the-Synthetic-Data-Generator)\n", + "7. [Data Quality Assessment](#Data-Quality-Assessment)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install required libraries\n", + "\n", + "Please install NeMo-Curator and required dependencies following the steps for NeMo-Curator installation. Install the tutorial specific dependencies as follows:\n", + "```\n", + "$ pip install -r requirements.txt\n", + "```\n", + "\n", + "Please also see [README.md](../README.md) for environment setup including necessary library installation.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare input data\n", + "\n", + "The synthetic data generation framework currently supports a single input format, `rawdoc`. \n", + "\n", + "- `input_format=rawdoc`\n", + "\n", + "The file should be stored in a JSONL format. 
Each line contains a document in the format of `{\"text\": <document>, \"title\": <title>}`.\n", + "\n", + "```\n", + "{\"text\": \"The quick brown fox jumps over the lazy dog.\", \"title\": \"Classic Pangram\" }\n", + "{\"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n", + "...\n", + "```\n", + "Additionally, if the documents already have a document id, the input file can also contain document ids. The same ids will be persisted in the generated data as well. Another accepted format is `{\"_id\": <document_id>, \"text\": <document>, \"title\": <title>}`.\n", + "```\n", + "{\"_id\": \"5\", \"text\": \"The quick brown fox jumps over the lazy dog.\", \"title\": \"Classic Pangram\" }\n", + "{\"_id\": \"doc3\", \"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n", + "...\n", + "```\n", + "This repository contains a sample JSONL file `data/sample_data.jsonl`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from omegaconf import OmegaConf\n", + "import sys\n", + "import importlib\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "\n", + "from nemo_curator.filters.synthetic import EasinessFilter, AnswerabilityFilter\n", + "from nemo_curator.modules.filter import ScoreFilter, Score\n", + "from nemo_curator.datasets import DocumentDataset\n", + "\n", + "config = importlib.import_module(\n", + " \"tutorials.nemo-retriever-synthetic-data-generation.config.config\"\n", + ")\n", + "retriever_evalset_generator = importlib.import_module(\n", + " \"tutorials.nemo-retriever-synthetic-data-generation.retriever_evalset_generator\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generating API key\n", + "\n", + "- The SDG pipeline uses NIM models, in order to use them, you need to generate an API key.\n", + "\n", + "- Visit [this page](https://build.nvidia.com/mistralai/mixtral-8x7b-instruct) and click \"Get API Key\" to generate an API key\n", + "\n", + "![NVIDIA API Catalog](../figures/api_key.png) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading datasets\n", + "We now load a sample dataset from our data folder" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.read_json(\"../data/sample_data_rawdoc.jsonl\", lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table 
border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>text</th>\n", + " <th>title</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>The Eiffel Tower is an iconic landmark of Pari...</td>\n", + " <td>Eiffel Tower - A French Icon</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>The Taj Mahal is an ivory-white marble mausole...</td>\n", + " <td>Taj Mahal - A Symbol of Love</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Machu Picchu is a 15th-century Inca citadel si...</td>\n", + " <td>Machu Picchu - Lost City of the Incas</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>The Colosseum, also known as the Flavian Amphi...</td>\n", + " <td>The Colosseum - Ancient Roman Architecture</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " text \\\n", + "0 The Eiffel Tower is an iconic landmark of Pari... \n", + "1 The Great Wall of China is a series of fortifi... \n", + "2 The Taj Mahal is an ivory-white marble mausole... \n", + "3 Machu Picchu is a 15th-century Inca citadel si... \n", + "4 The Colosseum, also known as the Flavian Amphi... 
\n", + "\n", + " title \n", + "0 Eiffel Tower - A French Icon \n", + "1 The Great Wall of China - Ancient Protection \n", + "2 Taj Mahal - A Symbol of Love \n", + "3 Machu Picchu - Lost City of the Incas \n", + "4 The Colosseum - Ancient Roman Architecture " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read pipeline config" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "cfg = config.RetrieverEvalSDGConfig.from_yaml(\"../config/config.yaml\")\n", + "cfg.api_key = \"your api key here\"\n", + "retrieval_evalset_generator = retriever_evalset_generator.RetrieverEvalSetGenerator(cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generator model used = mistralai/mixtral-8x22b-instruct-v0.1\n" + ] + } + ], + "source": [ + "print (f\"Generator model used = {cfg.generator_model}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Running the Synthetic Data Generator\n", + "We first create the dataset object from the pandas dataframe, and pass along the dataset object through the generator and the filters. The dataset object gets transformed along the different steps of the pipeline (i.e. generator, filters)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = DocumentDataset.from_pandas(df)\n", + "generated_dataset = retrieval_evalset_generator(dataset)\n", + "generated_df = generated_dataset.df.compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Probing the generated Data\n", + "For those documents that do not have a document id, the pipeline generates a random hash as document id. 
For those that have an existing document id, the pipeline persists the same ids in the generated data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>text</th>\n", + " <th>title</th>\n", + " <th>question</th>\n", + " <th>_id</th>\n", + " <th>question-id</th>\n", + " <th>answer</th>\n", + " <th>score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>The Eiffel Tower is an iconic landmark of Pari...</td>\n", + " <td>Eiffel Tower - A French Icon</td>\n", + " <td>What is the significance of the Eiffel Tower i...</td>\n", + " <td>342d2d470596528b192b9f0a12d0ec5f4798ab1fc84090...</td>\n", + " <td>c6075864cc0c9318df5456c2b06bfb581562542205ff99...</td>\n", + " <td>The Eiffel Tower is an iconic landmark in Pari...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>The Eiffel Tower is an iconic landmark of Pari...</td>\n", + " <td>Eiffel Tower - A French Icon</td>\n", + " <td>Who was responsible for designing the Eiffel T...</td>\n", + " <td>12dcafeb731d5ef4e1903f1e6cc35bfa9d5e40f740e967...</td>\n", + " <td>003de77e8d7a0d499d75edfc5ad4633d4a2703b89c1f09...</td>\n", + " <td>The Eiffel Tower was designed by the engineer ...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>The Eiffel Tower is an iconic landmark of Pari...</td>\n", + " <td>Eiffel Tower - A French Icon</td>\n", + " <td>When was the Eiffel Tower built and for what p...</td>\n", + " 
<td>e5d22c48da4684bf5da4afe414d2d6630709e5b134b847...</td>\n", + " <td>eb5bfbf35e7d53cc2affc58146721a017c72c38344ca1d...</td>\n", + " <td>The Eiffel Tower was built in 1889 for the Exp...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " <td>What materials were used to construct the Grea...</td>\n", + " <td>dab619e293076e8119d9dd0d0ea4a69bf0fff0f526951f...</td>\n", + " <td>03c619187f0aae660725a45533184a2ccf58ebb264d92a...</td>\n", + " <td>The Great Wall of China was constructed using ...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " <td>What was the primary purpose of building the G...</td>\n", + " <td>329021930f100a10785cea69e4c1c42a965e5c1892b3ae...</td>\n", + " <td>b4d63625700e8f80dd0c42668eb1625d8c58e9716b904a...</td>\n", + " <td>The primary purpose of building the Great Wall...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " text \\\n", + "0 The Eiffel Tower is an iconic landmark of Pari... \n", + "1 The Eiffel Tower is an iconic landmark of Pari... \n", + "2 The Eiffel Tower is an iconic landmark of Pari... \n", + "3 The Great Wall of China is a series of fortifi... \n", + "4 The Great Wall of China is a series of fortifi... \n", + "\n", + " title \\\n", + "0 Eiffel Tower - A French Icon \n", + "1 Eiffel Tower - A French Icon \n", + "2 Eiffel Tower - A French Icon \n", + "3 The Great Wall of China - Ancient Protection \n", + "4 The Great Wall of China - Ancient Protection \n", + "\n", + " question \\\n", + "0 What is the significance of the Eiffel Tower i... \n", + "1 Who was responsible for designing the Eiffel T... \n", + "2 When was the Eiffel Tower built and for what p... 
\n", + "3 What materials were used to construct the Grea... \n", + "4 What was the primary purpose of building the G... \n", + "\n", + " _id \\\n", + "0 342d2d470596528b192b9f0a12d0ec5f4798ab1fc84090... \n", + "1 12dcafeb731d5ef4e1903f1e6cc35bfa9d5e40f740e967... \n", + "2 e5d22c48da4684bf5da4afe414d2d6630709e5b134b847... \n", + "3 dab619e293076e8119d9dd0d0ea4a69bf0fff0f526951f... \n", + "4 329021930f100a10785cea69e4c1c42a965e5c1892b3ae... \n", + "\n", + " question-id \\\n", + "0 c6075864cc0c9318df5456c2b06bfb581562542205ff99... \n", + "1 003de77e8d7a0d499d75edfc5ad4633d4a2703b89c1f09... \n", + "2 eb5bfbf35e7d53cc2affc58146721a017c72c38344ca1d... \n", + "3 03c619187f0aae660725a45533184a2ccf58ebb264d92a... \n", + "4 b4d63625700e8f80dd0c42668eb1625d8c58e9716b904a... \n", + "\n", + " answer score \n", + "0 The Eiffel Tower is an iconic landmark in Pari... 1 \n", + "1 The Eiffel Tower was designed by the engineer ... 1 \n", + "2 The Eiffel Tower was built in 1889 for the Exp... 1 \n", + "3 The Great Wall of China was constructed using ... 1 \n", + "4 The primary purpose of building the Great Wall... 1 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generated_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Quality Assessment\n", + "We apply two filters:\n", + "\n", + "*Answerability filer* uses LLM-as-judge in order to determine quality of questions in terms of them being answerable from content in the passage. The filter weeds out questions that are invalid and not relevant to the document chunk that was used to generate them.\n", + "\n", + "*Easiness filter* is used to filter out questions that are deemed easy for the retriever models to retrieve positive passages for the given generated question. It uses embedding model as judge. The user needs to provide threshold (number between 0 and 1) for this filter. 
The lower the threshold, the harder the questions that remain in the dataset; the higher the threshold, the more easy questions are kept. \n", + "\n", + "The filters can be applied in any order. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "ef = EasinessFilter(cfg.base_url,\n", + " cfg.api_key,\n", + " cfg.easiness_filter,\n", + " cfg.percentile,\n", + " cfg.truncate,\n", + " cfg.batch_size)\n", + "easiness_filter = ScoreFilter(ef,\n", + " text_field = [\"text\", \"question\"],\n", + " score_field = \"easiness_scores\")\n", + "af = AnswerabilityFilter(cfg.base_url,\n", + " cfg.api_key,\n", + " cfg.answerability_filter,\n", + " cfg.answerability_system_prompt,\n", + " cfg.answerability_user_prompt_template,\n", + " cfg.num_criteria)\n", + "answerability_filter = ScoreFilter(af,\n", + " text_field = [\"text\", \"question\"],\n", + " score_field = \"answerability_scores\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Easiness filter\n", + "We see an additional column being generated \"easiness_scores\". This filter removes questions that are too easy to retrieve by retriever models."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_dataset = easiness_filter(generated_dataset)\n", + "filtered_df_1 = filtered_dataset.df.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>text</th>\n", + " <th>title</th>\n", + " <th>question</th>\n", + " <th>_id</th>\n", + " <th>question-id</th>\n", + " <th>answer</th>\n", + " <th>score</th>\n", + " <th>easiness_scores</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>The Eiffel Tower is an iconic landmark of Pari...</td>\n", + " <td>Eiffel Tower - A French Icon</td>\n", + " <td>Who was the engineer behind the design of the ...</td>\n", + " <td>5b31740eab0e66fa435ac3b2d0f3ad299e9bc885da22ad...</td>\n", + " <td>985cd7b5de889c7b62eca2d45b83eac5c1ba6fa2dce681...</td>\n", + " <td>The Eiffel Tower was designed by the engineer ...</td>\n", + " <td>1</td>\n", + " <td>0.569564</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " <td>What is the purpose of the Great Wall of China?</td>\n", + " <td>2e40d9da383f39586c7f4a2e6cdc930de7ceaa1800d41c...</td>\n", + " <td>108ee53f98dcba40d2e4654df9e41dadf313000b2cfbb0...</td>\n", + " <td>The purpose of the Great Wall of China is to p...</td>\n", + " <td>1</td>\n", + " <td>0.527854</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>4</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " <td>What materials were used to build the Great Wa...</td>\n", + " <td>b05babced766cf6b65f43bc0d8c927d08a271d30423cd8...</td>\n", + " <td>a698316fccdb6facb8341372778863bb092fe71bc60357...</td>\n", + " <td>The Great Wall of China was built using materi...</td>\n", + " <td>1</td>\n", + " <td>0.550470</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " <td>What is the general direction of the Great Wal...</td>\n", + " <td>88cd9adc26f148a24a1fbde7c5dfed1033db29c7ab997f...</td>\n", + " <td>a5996280c5a2b382c206ab4fc69b8981f7588a6c216b63...</td>\n", + " <td>The Great Wall of China was generally built al...</td>\n", + " <td>1</td>\n", + " <td>0.462438</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>The Taj Mahal is an ivory-white marble mausole...</td>\n", + " <td>Taj Mahal - A Symbol of Love</td>\n", + " <td>What is the Taj Mahal primarily used for?</td>\n", + " <td>4eaff3017898dab67377f19bef2cf7bbf7ee1223a661f7...</td>\n", + " <td>1e2f14820a6f599a5d124f5cd6b0e2575a0fa601a36d5f...</td>\n", + " <td>The Taj Mahal is primarily used as a mausoleum...</td>\n", + " <td>1</td>\n", + " <td>0.444493</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " text \\\n", + "1 The Eiffel Tower is an iconic landmark of Pari... \n", + "3 The Great Wall of China is a series of fortifi... \n", + "4 The Great Wall of China is a series of fortifi... \n", + "5 The Great Wall of China is a series of fortifi... \n", + "6 The Taj Mahal is an ivory-white marble mausole... 
\n", + "\n", + " title \\\n", + "1 Eiffel Tower - A French Icon \n", + "3 The Great Wall of China - Ancient Protection \n", + "4 The Great Wall of China - Ancient Protection \n", + "5 The Great Wall of China - Ancient Protection \n", + "6 Taj Mahal - A Symbol of Love \n", + "\n", + " question \\\n", + "1 Who was the engineer behind the design of the ... \n", + "3 What is the purpose of the Great Wall of China? \n", + "4 What materials were used to build the Great Wa... \n", + "5 What is the general direction of the Great Wal... \n", + "6 What is the Taj Mahal primarily used for? \n", + "\n", + " _id \\\n", + "1 5b31740eab0e66fa435ac3b2d0f3ad299e9bc885da22ad... \n", + "3 2e40d9da383f39586c7f4a2e6cdc930de7ceaa1800d41c... \n", + "4 b05babced766cf6b65f43bc0d8c927d08a271d30423cd8... \n", + "5 88cd9adc26f148a24a1fbde7c5dfed1033db29c7ab997f... \n", + "6 4eaff3017898dab67377f19bef2cf7bbf7ee1223a661f7... \n", + "\n", + " question-id \\\n", + "1 985cd7b5de889c7b62eca2d45b83eac5c1ba6fa2dce681... \n", + "3 108ee53f98dcba40d2e4654df9e41dadf313000b2cfbb0... \n", + "4 a698316fccdb6facb8341372778863bb092fe71bc60357... \n", + "5 a5996280c5a2b382c206ab4fc69b8981f7588a6c216b63... \n", + "6 1e2f14820a6f599a5d124f5cd6b0e2575a0fa601a36d5f... \n", + "\n", + " answer score easiness_scores \n", + "1 The Eiffel Tower was designed by the engineer ... 1 0.569564 \n", + "3 The purpose of the Great Wall of China is to p... 1 0.527854 \n", + "4 The Great Wall of China was built using materi... 1 0.550470 \n", + "5 The Great Wall of China was generally built al... 1 0.462438 \n", + "6 The Taj Mahal is primarily used as a mausoleum... 
1 0.444493 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_df_1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of generated data points = 30\n", + "Total number of data points after application of easiness filter = 21\n" + ] + } + ], + "source": [ + "print (f\"Total number of generated data points = {generated_df.shape[0]}\") \n", + "print (f\"Total number of data points after application of easiness filter = {filtered_df_1.shape[0]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Answerability filter\n", + "We see additional column \"answerability scores\", which shows the rating provided by the LLM-as-judge on criteria used to judge the questions. The criteria can be found in the config. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_dataset_2 = answerability_filter(filtered_dataset)\n", + "filtered_df_2 = filtered_dataset_2.df.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>text</th>\n", + " <th>title</th>\n", + " <th>question</th>\n", + " <th>_id</th>\n", + " <th>question-id</th>\n", + " <th>answer</th>\n", + " <th>score</th>\n", + " <th>easiness_scores</th>\n", + " <th>answerability_scores</th>\n", + " </tr>\n", + " </thead>\n", + " 
<tbody>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " <td>What materials were used to construct the Grea...</td>\n", + " <td>a5b2fd08b6a424a371b12c7d07c37044abddf168427dee...</td>\n", + " <td>e5078730ce04b2f8314fced830fbb037528097bfa4c9f8...</td>\n", + " <td>The Great Wall of China was constructed using ...</td>\n", + " <td>1</td>\n", + " <td>0.553092</td>\n", + " <td>{\\n\"criterion_1_explanation\": \"The question is...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " <td>What was the primary purpose of building the G...</td>\n", + " <td>51d260dd9881d4176553b1a416d3f299a375d903fc677b...</td>\n", + " <td>4b5c0ad1ac49efb0f1d252792e4537c960566309f33eb0...</td>\n", + " <td>The primary purpose of building the Great Wall...</td>\n", + " <td>1</td>\n", + " <td>0.505319</td>\n", + " <td>{\\n\"criterion_1_explanation\": \"The question is...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>The Great Wall of China is a series of fortifi...</td>\n", + " <td>The Great Wall of China - Ancient Protection</td>\n", + " <td>Which direction was the Great Wall of China ge...</td>\n", + " <td>5bf59b2efabd4a5b0d2f179841ff1cdc41086e6598a098...</td>\n", + " <td>96456def817ac60dfa8a34f3983aba20209ec661d88535...</td>\n", + " <td>The Great Wall of China was generally built al...</td>\n", + " <td>1</td>\n", + " <td>0.545968</td>\n", + " <td>{\\n\"criterion_1_explanation\": \"The question is...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>The Taj Mahal is an ivory-white marble mausole...</td>\n", + " <td>Taj Mahal - A Symbol of Love</td>\n", + " <td>What is the Taj Mahal primarily made of?</td>\n", + " <td>7cf289552442f65170be4c4d0a950a65b9d21ffeeca05d...</td>\n", + " 
<td>53ea4d4ff9ba312d953c3a2a393bb503712f6036830aa8...</td>\n", + " <td>The Taj Mahal is primarily made of ivory-white...</td>\n", + " <td>1</td>\n", + " <td>0.422271</td>\n", + " <td>{\\n\"criterion_1_explanation\": \"The question is...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>The Taj Mahal is an ivory-white marble mausole...</td>\n", + " <td>Taj Mahal - A Symbol of Love</td>\n", + " <td>Who commissioned the construction of the Taj M...</td>\n", + " <td>3290bf8bb526a81774e70939849fd84a4ed49e708677ca...</td>\n", + " <td>914f2cc013c20a7e34737435f17db7e05a38e8da333109...</td>\n", + " <td>The Taj Mahal was commissioned by the Mughal e...</td>\n", + " <td>1</td>\n", + " <td>0.547095</td>\n", + " <td>{\\n\"criterion_1_explanation\": \"The question is...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " text \\\n", + "3 The Great Wall of China is a series of fortifi... \n", + "4 The Great Wall of China is a series of fortifi... \n", + "5 The Great Wall of China is a series of fortifi... \n", + "6 The Taj Mahal is an ivory-white marble mausole... \n", + "7 The Taj Mahal is an ivory-white marble mausole... \n", + "\n", + " title \\\n", + "3 The Great Wall of China - Ancient Protection \n", + "4 The Great Wall of China - Ancient Protection \n", + "5 The Great Wall of China - Ancient Protection \n", + "6 Taj Mahal - A Symbol of Love \n", + "7 Taj Mahal - A Symbol of Love \n", + "\n", + " question \\\n", + "3 What materials were used to construct the Grea... \n", + "4 What was the primary purpose of building the G... \n", + "5 Which direction was the Great Wall of China ge... \n", + "6 What is the Taj Mahal primarily made of? \n", + "7 Who commissioned the construction of the Taj M... \n", + "\n", + " _id \\\n", + "3 a5b2fd08b6a424a371b12c7d07c37044abddf168427dee... \n", + "4 51d260dd9881d4176553b1a416d3f299a375d903fc677b... \n", + "5 5bf59b2efabd4a5b0d2f179841ff1cdc41086e6598a098... 
\n", + "6 7cf289552442f65170be4c4d0a950a65b9d21ffeeca05d... \n", + "7 3290bf8bb526a81774e70939849fd84a4ed49e708677ca... \n", + "\n", + " question-id \\\n", + "3 e5078730ce04b2f8314fced830fbb037528097bfa4c9f8... \n", + "4 4b5c0ad1ac49efb0f1d252792e4537c960566309f33eb0... \n", + "5 96456def817ac60dfa8a34f3983aba20209ec661d88535... \n", + "6 53ea4d4ff9ba312d953c3a2a393bb503712f6036830aa8... \n", + "7 914f2cc013c20a7e34737435f17db7e05a38e8da333109... \n", + "\n", + " answer score easiness_scores \\\n", + "3 The Great Wall of China was constructed using ... 1 0.553092 \n", + "4 The primary purpose of building the Great Wall... 1 0.505319 \n", + "5 The Great Wall of China was generally built al... 1 0.545968 \n", + "6 The Taj Mahal is primarily made of ivory-white... 1 0.422271 \n", + "7 The Taj Mahal was commissioned by the Mughal e... 1 0.547095 \n", + "\n", + " answerability_scores \n", + "3 {\\n\"criterion_1_explanation\": \"The question is... \n", + "4 {\\n\"criterion_1_explanation\": \"The question is... \n", + "5 {\\n\"criterion_1_explanation\": \"The question is... \n", + "6 {\\n\"criterion_1_explanation\": \"The question is... \n", + "7 {\\n\"criterion_1_explanation\": \"The question is... " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_df_2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of data points after application of answerability filter = 19\n" + ] + } + ], + "source": [ + "print (f\"Total number of data points after application of answerability filter = {filtered_df_2.shape[0]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that upon adding the answerability filter, the number of data points further reduced. We removed unanswerable questions i.e. 
questions that can't be answered solely based on content provided in the context document." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/nemo-retriever-synthetic-data-generation/requirements.txt b/tutorials/nemo-retriever-synthetic-data-generation/requirements.txt new file mode 100644 index 000000000..000a34e62 --- /dev/null +++ b/tutorials/nemo-retriever-synthetic-data-generation/requirements.txt @@ -0,0 +1,7 @@ +beir +nltk +pandas +pydantic +sentence-transformers +tqdm +transformers diff --git a/tutorials/nemo-retriever-synthetic-data-generation/retriever_evalset_generator.py b/tutorials/nemo-retriever-synthetic-data-generation/retriever_evalset_generator.py new file mode 100644 index 000000000..0a812e5de --- /dev/null +++ b/tutorials/nemo-retriever-synthetic-data-generation/retriever_evalset_generator.py @@ -0,0 +1,183 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import asyncio
import hashlib
import importlib
import os
import re
import secrets
from abc import ABC, abstractmethod
from typing import Any

import dask.array as da
import dask.dataframe as dd
import pandas as pd
from dask.base import normalize_token, tokenize
from dask.diagnostics import ProgressBar
from dask.distributed import progress
from distributed import Client
from omegaconf import DictConfig, OmegaConf
from openai import AsyncOpenAI, OpenAI
from tqdm import tqdm
from tqdm.dask import TqdmCallback

from config.config import RetrieverEvalSDGConfig
from nemo_curator import AsyncOpenAIClient, OpenAIClient
from nemo_curator.datasets import DocumentDataset
from nemo_curator.filters.doc_filter import DocumentFilter
from nemo_curator.synthetic import AsyncNemotronGenerator, NemotronGenerator
from nemo_curator.synthetic.generator import SyntheticDataGenerator


# ----------------------------------------------------------------------------80
class RetrieverEvalSetGenerator(SyntheticDataGenerator):
    """
    Generate annotated datasets for retriever evaluation.

    For every input document chunk the generator model is asked for
    question/answer pairs. The resulting dataset holds one row per
    (passage, question, answer) triplet.
    """

    def __init__(
        self,
        pipeline_config: RetrieverEvalSDGConfig = None,
    ):
        """
        Args:
            pipeline_config: Pipeline configuration. May be omitted and
                supplied later through ``load_pipeline_config``.
        """
        super().__init__()
        self._name = self.__class__.__name__
        self.cfg = pipeline_config

        # The config is optional at construction time; the clients can only
        # be built once a config is available.
        if self.cfg is not None:
            self._init_pipeline_params()

    def load_pipeline_config(self, cfg_path: str):
        """Load the pipeline config from a YAML file and apply it."""
        self.cfg = RetrieverEvalSDGConfig.from_yaml(cfg_path)
        # Rebuild clients and prompts so the freshly loaded config is the one
        # actually used by generate().
        self._init_pipeline_params()

    def _validate_config(self):
        """Return True when a pipeline config is present."""
        # TODO: add field-level validation of the config values.
        return self.cfg is not None

    def _init_pipeline_params(self):
        """
        Build the OpenAI client and generator and cache the prompt/model
        settings read from ``self.cfg``.

        Raises:
            ValueError: if the pipeline config does not validate.
        """
        # Validate before touching any config attribute or creating clients.
        if not self._validate_config():
            raise ValueError("Validation Error: incorrect pipeline config file")

        # synchronous
        self.openai_client = OpenAI(
            base_url=self.cfg.base_url,
            api_key=self.cfg.api_key,
        )
        self.client = OpenAIClient(self.openai_client)
        self.generator = NemotronGenerator(self.client)

        self.sys_prompt = self.cfg.generator_system_prompt
        self.user_prompt_template = self.cfg.generator_user_prompt_template
        self.generator_model = self.cfg.generator_model
        self.generator_model_kwargs = {
            "temperature": self.cfg.temperature,
            "top_p": self.cfg.top_p,
            "max_tokens": self.cfg.max_tokens,
        }
        self.num_qs = self.cfg.num_questions

    # ----------------------------------------------------------------------------80

    def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
        """
        Generate question/answer pairs for every row of ``dataset``.

        Reads the ``text`` column (and ``_id`` when present) and returns a
        new dataset with the added columns ``question``, ``_id``,
        ``question-id``, ``answer`` and ``score``; one row per generated
        question. Documents for which no pair could be parsed produce no
        output rows.

        Raises:
            ValueError: if no pipeline config has been provided yet.
        """
        if self.cfg is None:
            raise ValueError(
                "No pipeline config set: pass one to the constructor or call "
                "load_pipeline_config() first"
            )

        # Column assignment rebinds the dataframe object in place, so work on
        # a copy to leave the caller's dataset untouched.
        df = dataset.df.copy()

        df["llm_response"] = df["text"].apply(
            self.generate, meta=("llm_response", "str")
        )
        df["qa_pairs"] = df["llm_response"].apply(
            self.parse_response, meta=("qa_pairs", "object")
        )

        df = df.explode("qa_pairs")
        # explode() turns an empty list into a missing value; such rows carry
        # no question/answer and would break the dict lookups below.
        df = df.dropna(subset=["qa_pairs"]).reset_index(drop=True)

        df["question"] = df["qa_pairs"].apply(
            lambda x: x["question"], meta=("question", "str")
        )

        if "_id" in df.columns:
            df["_id"] = df["_id"].apply(self._check_doc_id, meta=("_id", "str"))
        else:
            df["_id"] = df["text"].apply(self._get_random_hash, meta=("_id", "str"))

        df["question-id"] = df["question"].apply(
            self._get_random_hash, meta=("question-id", "str")
        )

        df["answer"] = df["qa_pairs"].apply(
            lambda x: x["answer"], meta=("answer", "str")
        )

        # Every generated question is marked with a constant score of 1.
        df["score"] = df["question"].apply(lambda x: 1, meta=("score", "int"))

        df = df.drop(["llm_response", "qa_pairs"], axis=1)

        return DocumentDataset(df)

    # ----------------------------------------------------------------------------80
    def parse_response(self, llm_response: str) -> Any:
        """
        Extract question/answer pairs from the raw LLM response.

        The response is split on the literal markers ``Question``,
        ``Answer`` and (optionally) ``Explanation``; the text after the first
        colon of each section is taken as its content. Malformed sections are
        reported and skipped without discarding the remaining ones.

        Returns:
            A list of ``{"question": ..., "answer": ...}`` dicts.
        """
        qa_pairs = []
        for qa in llm_response.split("Question")[1:]:
            question_part, answer_marker, answer_part = qa.partition("Answer")
            if not answer_marker:
                print("error: no 'Answer' section found for a question")
                continue

            # Any trailing explanation is not part of the answer.
            answer_part = answer_part.partition("Explanation")[0]

            # Split on the FIRST colon only, so colons inside the text
            # itself (e.g. "10:30") are preserved.
            _, q_colon, question = question_part.partition(":")
            _, a_colon, answer = answer_part.partition(":")
            if not q_colon or not a_colon:
                print("error: missing ':' separator in question/answer section")
                continue

            qa_pairs.append({"question": question.strip(), "answer": answer.strip()})
        return qa_pairs

    # ----------------------------------------------------------------------------80
    def generate(self, doc_text):
        """
        Request ``self.num_qs`` closed-QA items about ``doc_text`` from the
        generator model and return the first element of the response.
        """
        response = self.generator.generate_closed_qa_instructions(
            document=doc_text,
            prompt_template=self.sys_prompt + "\n" + self.user_prompt_template,
            n_openlines=self.num_qs,
            model=self.generator_model,
            model_kwargs=self.generator_model_kwargs,
        )
        return response[0]

    # ----------------------------------------------------------------------------80
    def _get_random_hash(self, question: str):
        """
        Generate a random hash to be used as an ID.

        The argument is not used; it only lets the method be passed to
        ``Series.apply``. The result is the SHA-256 hex digest of a fresh
        random 16-byte token.
        """
        random_string = secrets.token_hex(16)
        return hashlib.sha256(random_string.encode()).hexdigest()

    # ----------------------------------------------------------------------------80
    def _check_doc_id(self, doc_id: Any) -> str:
        """Return ``doc_id`` as a string, or a random hash when it is NaN."""
        if str(doc_id) == "nan":
            return self._get_random_hash("")
        else:
            return str(doc_id)

    def __dask_tokenize__(self):
        # The token depends only on the class, not on instance state.
        return normalize_token(RetrieverEvalSetGenerator)


# ----------------------------------------------------------------------------80