Add detailed comments to pipelines (#1)
* Add detailed comments and update paper

---------

Co-authored-by: Varun Mathur <[email protected]>
awinml and vrunm authored Aug 21, 2024
1 parent 1b2a32e commit ca3894d
Showing 22 changed files with 953 additions and 56 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -80,7 +80,10 @@ print(ranked_answers)
# ]
```

The API documentation can be found [here](src/llm_blender/README.md).
The detailed documentation can be found in the [LLM-Blender API Reference](src/llm_blender/README.md).

As the [`llm-blender` library](https://github.com/yuchenlin/LLM-Blender) lacks a stable release, the necessary code has been incorporated into this project under `src/llm_blender/llm_blender_utils`.
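
For illustration, here is a minimal usage sketch of the bundled components (the names follow the imports used in the evaluation scripts under `src/llm_blender/billsum`; exact APIs may differ between versions):

```python
# Minimal usage sketch, assuming this repository is installed as the `llm_blender` package.
from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker

# PairRM-based ranker used to ensemble candidate answers from several LLMs
ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")

# Compare generated predictions against reference outputs
evaluator = LLMBlenderEvaluator(
    preds=["a generated summary ..."],
    labels=["the reference summary ..."],
)
metrics = evaluator.compute_metrics()
print(metrics["bleurt"], metrics["bartscore"], metrics["bertscore"])
```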


## Results

Binary file modified paper/llm_blender.pdf
Binary file not shown.
49 changes: 47 additions & 2 deletions src/llm_blender/billsum/llama.py
@@ -1,3 +1,16 @@
"""Evaluation of Llama-3-8b on the BillSum dataset
This script implements a pipeline to evaluate the Llama-3-8b model on the BillSum dataset
on the BLEURT, BARTScore, and BERTScore metrics.
The pipeline performs the following steps:
1. Loads the Llama-3-8b model using the LlamaCppGenerator from Haystack.
2. Generates responses for prompts and instructions from the BillSum dataset.
3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics.
This evaluation provides a baseline for the model's performance on the BillSum dataset.
"""

from datasets import load_dataset
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

@@ -8,46 +21,78 @@ def generate_result(
generator: LlamaCppGenerator,
prompt: str = "",
) -> str:
"""
Generate a response using the LlamaCppGenerator.
The prompt and instruction are formatted to be compatible with the model.
Args:
generator (LlamaCppGenerator): The initialized LlamaCppGenerator object.
prompt (str): The main text input for the model.
Returns:
str: The generated response from the model.
"""
    # Additional instructions for the model for the summarization task
instruction = (
""" Provide a comprehensive summary of the given text. """
"""The summary should cover all the key points and main ideas presented in the original text, """
"""while also condensing the information into a concise and easy-to-understand format."""
)

# Format prompt to be compatible with meta-llama-3-8b-instruct
# This specific format is required for the model to distinguish between user input and expected output
formatted_prompt = (
"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> """
f"""{instruction} {prompt} <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
)

# Generate text
# Generate text using the LlamaCppGenerator
result = generator.run(
formatted_prompt,
generation_kwargs={"max_tokens": 500, "temperature": 0.1},
)

# Extract the generated text from the result
generated_answer = result["replies"][0]
return generated_answer


# Define the path to the model weights
model = "meta-llama-3-8b-instruct.Q4_K_M.gguf"

# Initialize the LlamaCppGenerator with the specified model and context window size
generator = LlamaCppGenerator(
model=model,
n_ctx=256,
)

# Warm up the generator (loading the model into memory)
generator.warm_up()

# Load the BillSum dataset from the Hugging Face Hub
dataset = load_dataset("billsum", split="test")

# Convert the dataset to a pandas DataFrame for easier manipulation
dataset = dataset.to_pandas()

# Generate results for each row in the dataset
# Apply the generate_result function to each row, using the 'text' column
# Store the results in the 'result' column
dataset.loc[:, "result"] = dataset.apply(
lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1
)
dataset.to_csv("output_llama.csv", index=False)

# Save the generated texts to a CSV file
dataset.to_csv("output_llama.csv", index=False)

# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"])

# Compute various metrics to evaluate the generated results against the reference outputs
metrics = evaluator.compute_metrics()

# Print the computed metrics
print("BLEURT Score", metrics["bleurt"])
print("BARTSCORE Score", metrics["bartscore"])
print("BERTSCORE Score", metrics["bertscore"])
36 changes: 35 additions & 1 deletion src/llm_blender/billsum/llm_blender_ranker_all_llms.py
@@ -1,12 +1,30 @@
"""
Evaluation of an ensemble of LLMs on the BillSum dataset using LLM Blender
This script implements a pipeline to ensemble multiple language models on the BillSum dataset. The pipeline is
evaluated on the BLEURT, BARTScore, and BERTScore metrics.
The pipeline performs the following steps:
1. Loads multiple language models (LLaMA, Phi, OpenChat, OpenHermes, Solar, Qwen, Mistral).
2. Builds prompts for each model using specific templates.
3. Generates responses for prompts from the BillSum dataset using each model.
4. Ranks the generated responses from all the models using the LLM Blender Ranker.
5. Evaluates the top-ranked response against reference outputs using multiple metrics.
The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with 7 diverse LLMs.
"""

from datasets import load_dataset
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker

# Load the BillSum dataset
dataset = load_dataset("billsum", split="test")

# Define prompt templates for each model
llama_prompt_template = (
"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """
"""text. The summary should cover all the key points and main ideas presented in the original text, while """
@@ -51,6 +69,7 @@
"""a concise and easy-to-understand format.: {{ prompt }} [/INST] """
)

# Initialize PromptBuilder for each model
llama_prompt_builder = PromptBuilder(template=llama_prompt_template)
phi_prompt_builder = PromptBuilder(template=phi_prompt_template)
openchat_prompt_builder = PromptBuilder(template=openchat_prompt_template)
@@ -59,8 +78,10 @@
qwen_prompt_builder = PromptBuilder(template=qwen_prompt_template)
mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template)

# Define model and generation parameters for all models
model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}}

# Initialize LlamaCppGenerator for each model
llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params)
phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params)
openchat_model = LlamaCppGenerator(model="models/openchat-3.5-0106.Q4_K_M.gguf", **model_params)
@@ -69,11 +90,13 @@
qwen_model = LlamaCppGenerator(model="models/qwen1_5-7b-chat-Q4_K_M.gguf", **model_params)
mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params)

# Initialize LLMBlenderRanker to ensemble multiple models
llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")


# Create the main pipeline
blender_pipeline = Pipeline()

# Add components to the pipeline
blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder")
blender_pipeline.add_component(instance=llama_model, name="llama_model")

@@ -97,6 +120,8 @@

blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker")

# Connect components in the pipeline
# Connect the prompt builders to their respective models
blender_pipeline.connect("llama_prompt_builder", "llama_model")
blender_pipeline.connect("phi_prompt_builder", "phi_model")
blender_pipeline.connect("openchat_prompt_builder", "openchat_model")
@@ -105,6 +130,7 @@
blender_pipeline.connect("qwen_prompt_builder", "qwen_model")
blender_pipeline.connect("mistral_prompt_builder", "mistral_model")

# Connect all the models to the LLMBlenderRanker for ensembling
blender_pipeline.connect("llama_model", "llm_blender_ranker")
blender_pipeline.connect("phi_model", "llm_blender_ranker")
blender_pipeline.connect("openchat_model", "llm_blender_ranker")
@@ -113,10 +139,13 @@
blender_pipeline.connect("qwen_model", "llm_blender_ranker")
blender_pipeline.connect("mistral_model", "llm_blender_ranker")

# Process the dataset and generate answers
generated_answers_labels = []
for row in dataset:
prompt = row["input"]
label = row["output"]

# Run the pipeline for each input
output = blender_pipeline.run(
        {
            "llama_prompt_builder": {"prompt": prompt},
@@ -130,16 +159,21 @@
)
generated_answers_labels.append((output["answers"], label))

# Prepare data for evaluation
preds = []
labels = []
for ranked_answers, label in generated_answers_labels:
# Use top ranked output as the answer
preds.append(ranked_answers[0].data)
labels.append(label)

# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
evaluator = LLMBlenderEvaluator(preds=preds, labels=labels)

# Compute various metrics to evaluate the generated results against the reference outputs
metrics = evaluator.compute_metrics()

# Print the evaluation metrics
print("BLEURT Score", metrics["bleurt"])
print("BARTSCORE Score", metrics["bartscore"])
print("BERTSCORE Score", metrics["bertscore"])
35 changes: 35 additions & 0 deletions src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py
@@ -1,12 +1,30 @@
"""
Evaluation of an ensemble of the best-performing LLMs on the BillSum dataset using LLM Blender
This script implements a pipeline to ensemble multiple language models on the BillSum dataset. The pipeline is
evaluated on the BLEURT, BARTScore, and BERTScore metrics.
The pipeline performs the following steps:
1. Loads the 3 top-performing LLMs: LLaMA, Phi, and Mistral.
2. Builds prompts for each model using specific templates.
3. Generates responses for prompts from the BillSum dataset using each model.
4. Ranks the generated responses from all the models using the LLM Blender Ranker.
5. Evaluates the top-ranked response against reference outputs using multiple metrics.
The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with diverse LLMs.
"""

from datasets import load_dataset
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker

# Load the BillSum dataset
dataset = load_dataset("billsum", split="test")

# Define prompt templates for each model
llama_prompt_template = (
"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """
"""text. The summary should cover all the key points and main ideas presented in the original text, while """
@@ -26,20 +44,26 @@
"""a concise and easy-to-understand format.: {{ prompt }} [/INST] """
)

# Initialize PromptBuilder for each model
llama_prompt_builder = PromptBuilder(template=llama_prompt_template)
phi_prompt_builder = PromptBuilder(template=phi_prompt_template)
mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template)

# Define model and generation parameters for all models
model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}}

# Initialize LlamaCppGenerator for each model
llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params)
phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params)
mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params)

# Initialize LLMBlenderRanker to ensemble multiple models
llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")

# Create the main pipeline
blender_pipeline = Pipeline()

# Add components to the pipeline
blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder")
blender_pipeline.add_component(instance=llama_model, name="llama_model")

@@ -51,18 +75,24 @@

blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker")

# Connect components in the pipeline
# Connect the prompt builders to their respective models
blender_pipeline.connect("llama_prompt_builder", "llama_model")
blender_pipeline.connect("phi_prompt_builder", "phi_model")
blender_pipeline.connect("mistral_prompt_builder", "mistral_model")

# Connect all the models to the LLMBlenderRanker for ensembling
blender_pipeline.connect("llama_model", "llm_blender_ranker")
blender_pipeline.connect("phi_model", "llm_blender_ranker")
blender_pipeline.connect("mistral_model", "llm_blender_ranker")

# Process the dataset and generate answers
generated_answers_labels = []
for row in dataset:
prompt = row["input"]
label = row["output"]

# Run the pipeline for each input
output = blender_pipeline.run(
        {
            "llama_prompt_builder": {"prompt": prompt},
@@ -74,16 +104,21 @@
)
generated_answers_labels.append((output["answers"], label))

# Prepare data for evaluation
preds = []
labels = []
for ranked_answers, label in generated_answers_labels:
# Use top ranked output as the answer
preds.append(ranked_answers[0].data)
labels.append(label)

# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
evaluator = LLMBlenderEvaluator(preds=preds, labels=labels)

# Compute various metrics to evaluate the generated results against the reference outputs
metrics = evaluator.compute_metrics()

# Print the evaluation metrics
print("BLEURT Score", metrics["bleurt"])
print("BARTSCORE Score", metrics["bartscore"])
print("BERTSCORE Score", metrics["bertscore"])