refactor(benchmarks) Add new evaluation metrics for general NLP challenge (#4282)
adap · Oct 6, 2024 · c232936 · c232936
1 parent 7a7d912
commit c232936
Show file tree

Hide file tree

Showing 8 changed files with 371 additions and 339 deletions.
diff --git a/benchmarks/flowertune-llm/evaluation/general-nlp/README.md b/benchmarks/flowertune-llm/evaluation/general-nlp/README.md
@@ -1,8 +1,8 @@
 # Evaluation for General NLP challenge
 
-We leverage MT-Bench metric provided by [FastChat](https://github.com/lm-sys/FastChat) to evaluate fine-tuned LLMs.
-[MT-Bench](https://arxiv.org/abs/2306.05685) represents a comprehensive suite of multi-turn, open-ended questions designed to evaluate chat assistants.
-Strong LLMs, such as GPT-4, serve as judges to assess the quality of responses provided by the chat assistants under examination.
+We build a multi-task language understanding pipeline to evaluate our fine-tuned LLMs.
+The [MMLU](https://huggingface.co/datasets/lukaemon/mmlu) dataset is used for this evaluation, encompassing three categories: STEM, social sciences (SS), and humanities.
+
 
 ## Environment Setup
 
@@ -20,44 +20,22 @@ pip install -r requirements.txt
 huggingface-cli login
 ```
 
-Download data from [FastChat](https://github.com/lm-sys/FastChat):
-
-```shell
-git clone https://github.com/lm-sys/FastChat.git && cd FastChat && git checkout d561f87b24de197e25e3ddf7e09af93ced8dfe36 && mv fastchat/llm_judge/data ../data && cd .. && rm -rf FastChat
-```
-
-
-## Generate model answers from MT-bench questions
-
-```bash
-python gen_model_answer.py --peft-path=/path/to/fine-tuned-peft-model-dir/ # e.g., ./peft_1
-```
-The answers will be saved to `data/mt_bench/model_answer/[base_model_name].jsonl` in default.
-
-
-## Generate judgments using GPT-4
-
-Please follow these [instructions](https://platform.openai.com/docs/quickstart/developer-quickstart) to create a OpenAI API key.
-The estimated costs of running this evaluation is approximately USD10.
+## Generate model decision & calculate accuracy
 
 > [!NOTE]
-> If you changed the base model of your LLM project specify it to the command below via `--model-list`.
+> Please ensure that you use `quantization=4` to run the evaluation if you wish to participate in the LLM Leaderboard.
 
 ```bash
-export OPENAI_API_KEY=XXXXXX  # set the OpenAI API key
-python gen_judgement.py --model-list Mistral-7B-v0.3
+# --peft-path: directory of the fine-tuned PEFT model, e.g., ./peft_1
+# --run-name: specified name for this run
+python eval.py \
+--peft-path=/path/to/fine-tuned-peft-model-dir/ \
+--run-name=fl \
+--batch-size=16 \
+--quantization=4 \
+--category=stem,social_sciences,humanities
 ```
 
-The judgments will be saved to `data/mt_bench/model_judgment/gpt-4_single.jsonl` in default.
-
+The model answers and accuracy values will be saved to `benchmarks/generation_{dataset_name}_{category_name}_{run_name}.jsonl` and `benchmarks/acc_{dataset_name}_{category_name}_{run_name}.txt`, respectively.
 
-## Show MT-bench scores
-
-```bash
-python show_result.py --model-list Mistral-7B-v0.3
-```
-GPT-4 will give a score on a scale of 10 to the first-turn (MT-1) and second-turn (MT-2) of the conversations, along with an average value as the third score.
 
 > [!NOTE]
-> Please ensure that you provide all **three scores** when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
-
+> Please ensure that you provide all **three accuracy values (STEM, SS, Humanities)** for the three evaluation categories when submitting to the LLM Leaderboard (see the [`Make Submission`](https://github.com/adap/flower/tree/main/benchmarks/flowertune-llm/evaluation#make-submission-on-flowertune-llm-leaderboard) section).
diff --git a/benchmarks/flowertune-llm/evaluation/general-nlp/benchmarks.py b/benchmarks/flowertune-llm/evaluation/general-nlp/benchmarks.py
@@ -0,0 +1,201 @@
+import json
+
+import pandas as pd
+from sklearn.metrics import accuracy_score
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from utils import format_answer, format_example, save_results
+
+from datasets import Dataset, load_dataset
+
+# Instruction sentence placed at the top of every prompt, keyed by dataset name.
+INSTRUCTIONS = {
+    "mmlu": "Answer the following multiple choice question.",
+}
+
+# MMLU subset names grouped by evaluation category. `infer_mmlu` looks up one
+# category here and loads the test split of each listed subset from the
+# "lukaemon/mmlu" dataset.
+MMLU_CATEGORY = {
+    "stem": [
+        "abstract_algebra",
+        "anatomy",
+        "astronomy",
+        "college_biology",
+        "college_chemistry",
+        "college_computer_science",
+        "college_mathematics",
+        "college_physics",
+        "computer_security",
+        "conceptual_physics",
+        "electrical_engineering",
+        "elementary_mathematics",
+        "high_school_biology",
+        "high_school_chemistry",
+        "high_school_computer_science",
+        "high_school_mathematics",
+        "high_school_physics",
+        "high_school_statistics",
+        "machine_learning",
+    ],
+    "social_sciences": [
+        "econometrics",
+        "high_school_geography",
+        "high_school_government_and_politics",
+        "high_school_macroeconomics",
+        "high_school_microeconomics",
+        "high_school_psychology",
+        "human_sexuality",
+        "professional_psychology",
+        "public_relations",
+        "security_studies",
+        "sociology",
+        "us_foreign_policy",
+    ],
+    "humanities": [
+        "formal_logic",
+        "high_school_european_history",
+        "high_school_us_history",
+        "high_school_world_history",
+        "international_law",
+        "jurisprudence",
+        "logical_fallacies",
+        "moral_disputes",
+        "moral_scenarios",
+        "philosophy",
+        "prehistory",
+        "professional_law",
+        "world_religions",
+    ],
+    # Subsets that belong to none of the three categories above.
+    "other": [
+        "business_ethics",
+        "clinical_knowledge",
+        "college_medicine",
+        "global_facts",
+        "human_aging",
+        "management",
+        "marketing",
+        "medical_genetics",
+        "miscellaneous",
+        "nutrition",
+        "professional_accounting",
+        "professional_medicine",
+        "virology",
+    ],
+}
+
+
+def infer_mmlu(model, tokenizer, batch_size, category, run_name):
+    name = "mmlu"
+    answer_type = "mcq"
+
+    # Download dataset
+    dataframes = []
+    for subset in MMLU_CATEGORY[category]:
+        subset_data = load_dataset(
+            "lukaemon/mmlu",
+            subset,
+            split="test",
+            trust_remote_code=True,
+        )
+        subset_df = pd.DataFrame(subset_data.map(lambda x: {"subset": subset, **x}))
+        dataframes.append(subset_df)
+
+    dataset_df = pd.concat(dataframes, axis=0)
+    dataset = Dataset.from_pandas(dataset_df)
+    if "__index_level_0__" in dataset.column_names:
+        dataset = dataset.remove_columns("__index_level_0__")
+
+    # Post process
+    instruction = INSTRUCTIONS[name]
+
+    def post_process(row):
+        options = [row["A"], row["B"], row["C"], row["D"]]
+        row["prompt"] = format_example(row["input"], options)
+        row["gold"] = row["target"]
+        row["subset"] = row["subset"]
+        row["prompt"] = f"{instruction}\n{row['prompt']}\nThe answer is:\n"
+        return row
+
+    dataset = dataset.map(post_process)
+
+    # Generate results
+    generate_results(
+        name, run_name, dataset, model, tokenizer, batch_size, answer_type, category
+    )
+
+
+def generate_results(
+    name, run_name, dataset, model, tokenizer, batch_size, answer_type, category
+):
+    # Run inference
+    prediction = inference(dataset, model, tokenizer, batch_size)
+
+    # Calculate accuracy
+    acc = accuracy_compute(prediction, answer_type)
+
+    # Save results and generations
+    save_results(name, category, run_name, prediction, acc)
+
+
+def inference(dataset, model, tokenizer, batch_size):
+    columns_process = ["prompt", "gold"]
+    if "subset" in dataset.features:
+        columns_process.append("subset")
+    dataset_process = pd.DataFrame(dataset, columns=dataset.features)[columns_process]
+    dataset_process = dataset_process.assign(output="Null")
+    temperature = 1.0
+
+    inference_data = json.loads(dataset_process.to_json(orient="records"))
+    data_loader = DataLoader(inference_data, batch_size=batch_size, shuffle=False)
+
+    batch_counter = 0
+    for batch in tqdm(data_loader, total=len(data_loader), position=0, leave=True):
+        prompts = [
+            f"<|im_start|>question\n{prompt}<|im_end|>\n<|im_start|>answer\n"
+            for prompt in batch["prompt"]
+        ]
+        if batch_counter == 0:
+            print(prompts[0])
+
+        # Process tokenizer
+        stop_seq = ["###"]
+        if tokenizer.eos_token is not None:
+            stop_seq.append(tokenizer.eos_token)
+        if tokenizer.pad_token is not None:
+            stop_seq.append(tokenizer.pad_token)
+        max_new_tokens = len(
+            tokenizer(batch["gold"][0], add_special_tokens=False)["input_ids"]
+        )
+
+        outputs = []
+        for prompt in prompts:
+            input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
+            output_ids = model.generate(
+                inputs=input_ids,
+                max_new_tokens=max_new_tokens,
+                do_sample=False,
+                top_p=1.0,
+                temperature=temperature,
+                pad_token_id=tokenizer.eos_token_id,
+            )
+            output_ids = output_ids[0][len(input_ids[0]) :]
+            output = tokenizer.decode(output_ids, skip_special_tokens=True)
+            outputs.append(output)
+
+        for prompt, out in zip(batch["prompt"], outputs):
+            dataset_process.loc[dataset_process["prompt"] == prompt, "output"] = out
+        batch_counter += 1
+
+    return dataset_process
+
+
+def accuracy_compute(dataset, answer_type):
+    dataset = json.loads(dataset.to_json(orient="records"))
+    preds, golds = [], []
+    for row in dataset:
+        answer = row["gold"].lower()
+        output = row["output"].lower()
+        pred, gold = format_answer(output, answer, answer_type=answer_type)
+        preds.append(pred)
+        golds.append(gold)
+
+    accuracy = accuracy_score(preds, golds)
+
+    return accuracy
diff --git a/benchmarks/flowertune-llm/evaluation/general-nlp/eval.py b/benchmarks/flowertune-llm/evaluation/general-nlp/eval.py
@@ -0,0 +1,68 @@
+import argparse
+
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+from benchmarks import MMLU_CATEGORY, infer_mmlu
+
+# Fixed seed
+torch.manual_seed(2024)
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--base-model-name-path", type=str, default="mistralai/Mistral-7B-v0.3"
+)
+parser.add_argument("--run-name", type=str, default="fl")
+parser.add_argument("--peft-path", type=str, default=None)
+parser.add_argument(
+    "--datasets",
+    type=str,
+    default="mmlu",
+    help="The dataset to infer on",
+)
+parser.add_argument(
+    "--category",
+    type=str,
+    default=None,
+    help="The category for MMLU dataset, chosen from [stem, social_sciences, humanities, other]",
+)
+parser.add_argument("--batch-size", type=int, default=16)
+parser.add_argument("--quantization", type=int, default=4)
+args = parser.parse_args()
+
+
+# Load model and tokenizer
+if args.quantization == 4:
+    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+    torch_dtype = torch.float32
+elif args.quantization == 8:
+    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+    torch_dtype = torch.float16
+else:
+    raise ValueError(
+        f"Use 4-bit or 8-bit quantization. You passed: {args.quantization}/"
+    )
+
+model = AutoModelForCausalLM.from_pretrained(
+    args.base_model_name_path,
+    quantization_config=quantization_config,
+    torch_dtype=torch_dtype,
+)
+if args.peft_path is not None:
+    model = PeftModel.from_pretrained(
+        model, args.peft_path, torch_dtype=torch_dtype
+    ).to("cuda")
+
+tokenizer = AutoTokenizer.from_pretrained(args.base_model_name_path)
+
+# Evaluate
+for dataset in args.datasets.split(","):
+    if dataset == "mmlu":
+        for cate in args.category.split(","):
+            if cate not in MMLU_CATEGORY.keys():
+                raise ValueError("Undefined Category.")
+            else:
+                infer_mmlu(model, tokenizer, args.batch_size, cate, args.run_name)
+    else:
+        raise ValueError("Undefined Dataset.")