From 0ad331c12f39f674f29f8a35b1a70a6f7951a9d1 Mon Sep 17 00:00:00 2001
From: Mattia Sangermano
Date: Mon, 1 May 2023 18:30:39 +0200
Subject: [PATCH] First implementation of Stanford SHP reward dataset

---
 .../chatllama/artifacts/download_dataset.py  |   8 +-
 .../chatllama/chatllama/rlhf/dataset.py      | 109 +++++++++++++++++-
 2 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/optimization/chatllama/artifacts/download_dataset.py b/optimization/chatllama/artifacts/download_dataset.py
index 281c942c..330e8f29 100644
--- a/optimization/chatllama/artifacts/download_dataset.py
+++ b/optimization/chatllama/artifacts/download_dataset.py
@@ -1,7 +1,7 @@
 import argparse
 import os
 
-from chatllama.rlhf.dataset import AnthropicRLHF, StanfordNLPSHPDataset
+from chatllama.rlhf.dataset import AnthropicRLHF, StanfordNLPSHPDataset, StanfordNLPSHPRewardDataset
 
 
 if __name__ == "__main__":
@@ -15,7 +15,7 @@
     parser.add_argument(
         "dataset_name",
         help="dataset name it can be. SSHP: stanfordnlp/SHP or ",
-        choices=["SHP", "ARLHF"],
+        choices=["SHP", "ARLHF", "SHPReward"],
     )
     parser.add_argument(
         "-p",
@@ -49,3 +49,7 @@
             args.path,
             n_samples,
         )
+
+    if args.dataset_name == "SHPReward":
+        dataset = StanfordNLPSHPRewardDataset()
+        dataset.save_dataset(args.path, n_samples)
diff --git a/optimization/chatllama/chatllama/rlhf/dataset.py b/optimization/chatllama/chatllama/rlhf/dataset.py
index 8e1c7c6f..465d7272 100644
--- a/optimization/chatllama/chatllama/rlhf/dataset.py
+++ b/optimization/chatllama/chatllama/rlhf/dataset.py
@@ -2,7 +2,7 @@
 import os
 
 import numpy as np
-
+import pandas as pd
 from beartype.typing import Dict, List, Union
 from datasets import load_dataset
 from chatllama.rlhf.config import Config, ConfigActor, ConfigReward
@@ -430,3 +430,110 @@ def save_dataset(
             json.dump(conversations, f, indent=4)
 
         print("Generation Completed")
+
+
+class StanfordNLPSHPRewardDataset(BaseDataset):
+    """Reward dataset built from the Stanford NLP SHP dataset on HuggingFace"""
+
+    def __init__(
+        self,
+    ) -> None:
+        print("Download the dataset")
+        self.dataset = load_dataset("stanfordnlp/SHP")
+        print("Download Completed")
+
+    def reformat_dataset(self, data: List) -> List[Dict]:
+        """Reformat the dataset to the format required by RLHF
+
+        Args:
+            data (List): dataset from HuggingFace
+
+        Returns:
+            List[Dict]: reformatted dataset
+        """
+
+        def get_normalized_score(x):
+            # Cap the score at the post's upper whisker and rescale to [0, 5]
+            return int(
+                (
+                    min(x["score"], upper_whisker[x["post_id"]])
+                    / min(max_vote[x["post_id"]], upper_whisker[x["post_id"]])
+                ) * 5
+            )
+
+        data = data.to_pandas()
+
+        # Take both answers A and B
+        A_answers = data[["c_root_id_A", "score_A", "post_id", "history", "human_ref_A"]].copy()
+        B_answers = data[["c_root_id_B", "score_B", "post_id", "history", "human_ref_B"]].copy()
+
+        A_answers.rename(
+            columns={
+                "c_root_id_A": "c_id",
+                "score_A": "score",
+                "history": "user_input",
+                "human_ref_A": "completion",
+            },
+            inplace=True,
+        )
+        B_answers.rename(
+            columns={
+                "c_root_id_B": "c_id",
+                "score_B": "score",
+                "history": "user_input",
+                "human_ref_B": "completion",
+            },
+            inplace=True,
+        )
+        conversations = pd.concat([A_answers, B_answers], axis=0)
+
+        # Remove duplicates so that each answer is used only once
+        conversations.drop_duplicates(subset=["c_id"], inplace=True)
+
+        # Compute for each post the upper whisker as Q3 + 1.5 * IQR
+        upper_whisker = conversations.groupby(by=["post_id"]).agg(
+            {"score": lambda x: x.quantile(0.75) + 1.5 * (x.quantile(0.75) - x.quantile(0.25))}
+        )["score"]
+
+        max_vote = conversations.groupby(by=["post_id"]).agg({"score": max})["score"]
+
+        norm_score = conversations.apply(get_normalized_score, axis=1)
+
+        conversations["reward"] = norm_score
+
+        conversations = conversations[["user_input", "completion", "reward"]].rename(columns={"reward": "score"}).to_dict("records")
+
+        return conversations
+
+    def save_dataset(
+        self, dataset_folder: str, number_of_samples: int, reverse: bool = True
+    ) -> None:
+        """Save the dataset in the format required by RLHF
+
+        Args:
+            dataset_folder (str): path to the folder where the dataset
+                will be saved
+            number_of_samples (int): number of samples to take from the
+                dataset
+            reverse (bool, optional): sort the dataset in descending order.
+                Defaults to True.
+        """
+
+        print("Generate reward datasets")
+
+        # take the train and test splits to create the reward dataset
+        conversations = self.reformat_dataset(self.dataset["train"])
+        conversations.extend(self.reformat_dataset(self.dataset["test"]))
+
+        # sort conversations by length of user_input + completion
+        conversations = self.sort_conversation(conversations, reverse=reverse)
+
+        # take N samples and sort them
+        conversations = self.take_n_samples(conversations, number_of_samples)
+        conversations = self.sort_conversation(conversations, reverse=reverse)
+
+        # save reward training data
+        with open(f"{dataset_folder}/reward_training_data.json", "w") as f:
+            json.dump(conversations, f, indent=4)
+
+        print("Generation Completed")
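
Note on the scoring scheme: reformat_dataset caps each answer's raw vote count at its post's upper whisker (Q3 + 1.5 * IQR) and then rescales it to an integer reward in [0, 5], so a single runaway top answer cannot push every other answer on the same post down to a reward of 0. The snippet below is a minimal standalone sketch of that idea; the toy vote counts, the answers frame, and the normalize helper are invented for illustration and are not part of the patch.

import pandas as pd

# Toy data: one post with an outlier top answer, one ordinary post.
answers = pd.DataFrame(
    {
        "post_id": ["p1", "p1", "p1", "p1", "p1", "p1", "p2", "p2"],
        "score": [8, 10, 12, 14, 16, 500, 1, 3],
    }
)

# Upper whisker per post: Q3 + 1.5 * IQR, used to cap outlier vote counts.
upper_whisker = answers.groupby("post_id")["score"].agg(
    lambda x: x.quantile(0.75) + 1.5 * (x.quantile(0.75) - x.quantile(0.25))
)
max_vote = answers.groupby("post_id")["score"].max()


def normalize(row) -> int:
    # Same formula as get_normalized_score in the patch: cap, then rescale to [0, 5].
    capped = min(row["score"], upper_whisker[row["post_id"]])
    denominator = min(max_vote[row["post_id"]], upper_whisker[row["post_id"]])
    return int(capped / denominator * 5)


answers["reward"] = answers.apply(normalize, axis=1)
print(answers)
# p1 rewards come out as 1, 2, 2, 3, 3, 5: the 500-vote outlier is capped at
# the whisker (23.0), so the other answers keep informative rewards.

Without the whisker cap, the denominator for post p1 would be the raw maximum of 500, and every non-outlier answer would round down to a reward of 0.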