Add detailed comments to pipelines (#1)
* Add detailed comments and update paper

---------

Co-authored-by: Varun Mathur <[email protected]>
awinml and vrunm authored Aug 21, 2024
1 parent 1b2a32e commit ca3894d
Showing 22 changed files with 953 additions and 56 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -80,7 +80,10 @@ print(ranked_answers)
# ]
```

The API documentation can be found [here](src/llm_blender/README.md).
The detailed documentation can be found in the [LLM-Blender API Reference](src/llm_blender/README.md).

As the [`llm-blender` library](https://github.com/yuchenlin/LLM-Blender) lacks a stable release, the necessary code has been incorporated into this project under `src/llm_blender/llm_blender_utils`.
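
For illustration, here is a minimal usage sketch of the bundled components (the names follow the imports used in the evaluation scripts under `src/llm_blender/billsum`; exact APIs may differ between versions):

```python
# Minimal usage sketch, assuming this repository is installed as the `llm_blender` package.
from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker

# PairRM-based ranker used to ensemble candidate answers from several LLMs
ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")

# Compare generated predictions against reference outputs
evaluator = LLMBlenderEvaluator(
    preds=["a generated summary ..."],
    labels=["the reference summary ..."],
)
metrics = evaluator.compute_metrics()
print(metrics["bleurt"], metrics["bartscore"], metrics["bertscore"])
```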


## Results

Binary file modified paper/llm_blender.pdf
Binary file not shown.
49 changes: 47 additions & 2 deletions src/llm_blender/billsum/llama.py
@@ -1,3 +1,16 @@
"""Evaluation of Llama-3-8b on the BillSum dataset
This script implements a pipeline to evaluate the Llama-3-8b model on the BillSum dataset
on the BLEURT, BARTScore, and BERTScore metrics.
The pipeline performs the following steps:
1. Loads the Llama-3-8b model using the LlamaCppGenerator from Haystack.
2. Generates responses for prompts and instructions from the BillSum dataset.
3. Evaluates the generated responses against reference outputs using the BLEURT, BARTScore, and BERTScore metrics.
This evaluation provides a baseline for the model's performance on the BillSum dataset.
"""

from datasets import load_dataset
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

@@ -8,46 +21,78 @@ def generate_result(
generator: LlamaCppGenerator,
prompt: str = "",
) -> str:
"""
Generate a response using the LlamaCppGenerator.
The prompt and instruction are formatted to be compatible with the model.
Args:
generator (LlamaCppGenerator): The initialized LlamaCppGenerator object.
prompt (str): The main text input for the model.
Returns:
str: The generated response from the model.
"""
    # Additional instructions for the model for the summarization task
instruction = (
""" Provide a comprehensive summary of the given text. """
"""The summary should cover all the key points and main ideas presented in the original text, """
"""while also condensing the information into a concise and easy-to-understand format."""
)

# Format prompt to be compatible with meta-llama-3-8b-instruct
# This specific format is required for the model to distinguish between user input and expected output
formatted_prompt = (
"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> """
f"""{instruction} {prompt} <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
)

# Generate text
# Generate text using the LlamaCppGenerator
result = generator.run(
formatted_prompt,
generation_kwargs={"max_tokens": 500, "temperature": 0.1},
)

# Extract the generated text from the result
generated_answer = result["replies"][0]
return generated_answer


# Define the path to the model weights
model = "meta-llama-3-8b-instruct.Q4_K_M.gguf"

# Initialize the LlamaCppGenerator with the specified model and context window size
generator = LlamaCppGenerator(
model=model,
n_ctx=256,
)

# Warm up the generator (loading the model into memory)
generator.warm_up()

# Load the BillSum dataset from the Hugging Face Hub
dataset = load_dataset("billsum", split="test")

# Convert the dataset to a pandas DataFrame for easier manipulation
dataset = dataset.to_pandas()

# Generate results for each row in the dataset
# Apply the generate_result function to each row, using the 'text' column
# Store the results in the 'result' column
dataset.loc[:, "result"] = dataset.apply(
lambda row: str(generate_result(generator=generator, prompt=row["text"])), axis=1
)
dataset.to_csv("output_llama.csv", index=False)

# Save the generated texts to a CSV file
dataset.to_csv("output_llama.csv", index=False)

# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
evaluator = LLMBlenderEvaluator(preds=dataset["result"], labels=dataset["output"])

# Compute various metrics to evaluate the generated results against the reference outputs
metrics = evaluator.compute_metrics()

# Print the computed metrics
print("BLEURT Score", metrics["bleurt"])
print("BARTSCORE Score", metrics["bartscore"])
print("BERTSCORE Score", metrics["bertscore"])
36 changes: 35 additions & 1 deletion src/llm_blender/billsum/llm_blender_ranker_all_llms.py
@@ -1,12 +1,30 @@
"""
Evaluation of an ensemble of LLMs on the BillSum dataset using LLM Blender
This script implements a pipeline to ensemble multiple language models on the BillSum dataset. The pipeline is
evaluated on the BLEURT, BARTScore, and BERTScore metrics.
The pipeline performs the following steps:
1. Loads multiple language models (LLaMA, Phi, OpenChat, OpenHermes, Solar, Qwen, Mistral).
2. Builds prompts for each model using specific templates.
3. Generates responses for prompts from the BillSum dataset using each model.
4. Ranks the generated responses from all the models using the LLM Blender Ranker.
5. Evaluates the top-ranked response against reference outputs using multiple metrics.
The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with 7 diverse LLMs.
"""

from datasets import load_dataset
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker

# Load the BillSum dataset
dataset = load_dataset("billsum", split="test")

# Define prompt templates for each model
llama_prompt_template = (
"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """
"""text. The summary should cover all the key points and main ideas presented in the original text, while """
@@ -51,6 +69,7 @@
"""a concise and easy-to-understand format.: {{ prompt }} [/INST] """
)

# Initialize PromptBuilder for each model
llama_prompt_builder = PromptBuilder(template=llama_prompt_template)
phi_prompt_builder = PromptBuilder(template=phi_prompt_template)
openchat_prompt_builder = PromptBuilder(template=openchat_prompt_template)
@@ -59,8 +78,10 @@
qwen_prompt_builder = PromptBuilder(template=qwen_prompt_template)
mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template)

# Define model and generation parameters for all models
model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}}

# Initialize LlamaCppGenerator for each model
llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params)
phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params)
openchat_model = LlamaCppGenerator(model="models/openchat-3.5-0106.Q4_K_M.gguf", **model_params)
@@ -69,11 +90,13 @@
qwen_model = LlamaCppGenerator(model="models/qwen1_5-7b-chat-Q4_K_M.gguf", **model_params)
mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params)

# Initialize LLMBlenderRanker to ensemble multiple models
llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")


# Create the main pipeline
blender_pipeline = Pipeline()

# Add components to the pipeline
blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder")
blender_pipeline.add_component(instance=llama_model, name="llama_model")

@@ -97,6 +120,8 @@

blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker")

# Connect components in the pipeline
# Connect the prompt builders to their respective models
blender_pipeline.connect("llama_prompt_builder", "llama_model")
blender_pipeline.connect("phi_prompt_builder", "phi_model")
blender_pipeline.connect("openchat_prompt_builder", "openchat_model")
@@ -105,6 +130,7 @@
blender_pipeline.connect("qwen_prompt_builder", "qwen_model")
blender_pipeline.connect("mistral_prompt_builder", "mistral_model")

# Connect all the models to the LLMBlenderRanker for ensembling
blender_pipeline.connect("llama_model", "llm_blender_ranker")
blender_pipeline.connect("phi_model", "llm_blender_ranker")
blender_pipeline.connect("openchat_model", "llm_blender_ranker")
@@ -113,10 +139,13 @@
blender_pipeline.connect("qwen_model", "llm_blender_ranker")
blender_pipeline.connect("mistral_model", "llm_blender_ranker")

# Process the dataset and generate answers
generated_answers_labels = []
for row in dataset:
prompt = row["input"]
label = row["output"]

# Run the pipeline for each input
output = blender_pipeline.run(
        {
            "llama_prompt_builder": {"prompt": prompt},
@@ -130,16 +159,21 @@
)
generated_answers_labels.append((output["answers"], label))

# Prepare data for evaluation
preds = []
labels = []
for ranked_answers, label in generated_answers_labels:
# Use top ranked output as the answer
preds.append(ranked_answers[0].data)
labels.append(label)

# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
evaluator = LLMBlenderEvaluator(preds=preds, labels=labels)

# Compute various metrics to evaluate the generated results against the reference outputs
metrics = evaluator.compute_metrics()

# Print the evaluation metrics
print("BLEURT Score", metrics["bleurt"])
print("BARTSCORE Score", metrics["bartscore"])
print("BERTSCORE Score", metrics["bertscore"])
35 changes: 35 additions & 0 deletions src/llm_blender/billsum/llm_blender_ranker_top_3_llms.py
@@ -1,12 +1,30 @@
"""
Evaluation of an ensemble of the best-performing LLMs on the BillSum dataset using LLM Blender
This script implements a pipeline to ensemble multiple language models on the BillSum dataset. The pipeline is
evaluated on the BLEURT, BARTScore, and BERTScore metrics.
The pipeline performs the following steps:
1. Loads the 3 top-performing LLMs: LLaMA, Phi, and Mistral.
2. Builds prompts for each model using specific templates.
3. Generates responses for prompts from the BillSum dataset using each model.
4. Ranks the generated responses from all the models using the LLM Blender Ranker.
5. Evaluates the top-ranked response against reference outputs using multiple metrics.
The evaluation showcases the effectiveness of the ensembling approach using LLM-Blender with diverse LLMs.
"""

from datasets import load_dataset
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

from llm_blender import LLMBlenderEvaluator, LLMBlenderRanker

# Load the BillSum dataset
dataset = load_dataset("billsum", split="test")

# Define prompt templates for each model
llama_prompt_template = (
"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> Provide a comprehensive summary of the given """
"""text. The summary should cover all the key points and main ideas presented in the original text, while """
@@ -26,20 +44,26 @@
"""a concise and easy-to-understand format.: {{ prompt }} [/INST] """
)

# Initialize PromptBuilder for each model
llama_prompt_builder = PromptBuilder(template=llama_prompt_template)
phi_prompt_builder = PromptBuilder(template=phi_prompt_template)
mistral_prompt_builder = PromptBuilder(template=mistral_prompt_template)

# Define model and generation parameters for all models
model_params = {"n_ctx": 256, "generation_kwargs": {"max_tokens": 500, "temperature": 0.1}}

# Initialize LlamaCppGenerator for each model
llama_model = LlamaCppGenerator(model="models/meta-llama-3-8b-instruct.Q4_K_M.gguf", **model_params)
phi_model = LlamaCppGenerator(model="models/phi-3-mini-4k-instruct.Q4_K_M.gguf", **model_params)
mistral_model = LlamaCppGenerator(model="models/mistral-7b-Q4_K_M.gguf", **model_params)

# Initialize LLMBlenderRanker to ensemble multiple models
llm_blender_ranker = LLMBlenderRanker(model="llm-blender/PairRM", device="cpu")

# Create the main pipeline
blender_pipeline = Pipeline()

# Add components to the pipeline
blender_pipeline.add_component(instance=llama_prompt_builder, name="llama_prompt_builder")
blender_pipeline.add_component(instance=llama_model, name="llama_model")

@@ -51,18 +75,24 @@

blender_pipeline.add_component(instance=llm_blender_ranker, name="llm_blender_ranker")

# Connect components in the pipeline
# Connect the prompt builders to their respective models
blender_pipeline.connect("llama_prompt_builder", "llama_model")
blender_pipeline.connect("phi_prompt_builder", "phi_model")
blender_pipeline.connect("mistral_prompt_builder", "mistral_model")

# Connect all the models to the LLMBlenderRanker for ensembling
blender_pipeline.connect("llama_model", "llm_blender_ranker")
blender_pipeline.connect("phi_model", "llm_blender_ranker")
blender_pipeline.connect("mistral_model", "llm_blender_ranker")

# Process the dataset and generate answers
generated_answers_labels = []
for row in dataset:
prompt = row["input"]
label = row["output"]

# Run the pipeline for each input
output = blender_pipeline.run(
        {
            "llama_prompt_builder": {"prompt": prompt},
@@ -74,16 +104,21 @@
)
generated_answers_labels.append((output["answers"], label))

# Prepare data for evaluation
preds = []
labels = []
for ranked_answers, label in generated_answers_labels:
# Use top ranked output as the answer
preds.append(ranked_answers[0].data)
labels.append(label)

# Initialize the LLMBlenderEvaluator with the generated results and the reference outputs
evaluator = LLMBlenderEvaluator(preds=preds, labels=labels)

# Compute various metrics to evaluate the generated results against the reference outputs
metrics = evaluator.compute_metrics()

# Print the evaluation metrics
print("BLEURT Score", metrics["bleurt"])
print("BARTSCORE Score", metrics["bartscore"])
print("BERTSCORE Score", metrics["bertscore"])