From 0ad331c12f39f674f29f8a35b1a70a6f7951a9d1 Mon Sep 17 00:00:00 2001
From: Mattia Sangermano
Date: Mon, 1 May 2023 18:30:39 +0200
Subject: [PATCH] First implementation of Stanford SHP reward dataset

---
 .../chatllama/artifacts/download_dataset.py  |   8 +-
 .../chatllama/chatllama/rlhf/dataset.py      | 109 +++++++++++++++++-
 2 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/optimization/chatllama/artifacts/download_dataset.py b/optimization/chatllama/artifacts/download_dataset.py
index 281c942c..330e8f29 100644
--- a/optimization/chatllama/artifacts/download_dataset.py
+++ b/optimization/chatllama/artifacts/download_dataset.py
@@ -1,7 +1,7 @@
 import argparse
 import os
 
-from chatllama.rlhf.dataset import AnthropicRLHF, StanfordNLPSHPDataset
+from chatllama.rlhf.dataset import AnthropicRLHF, StanfordNLPSHPDataset, StanfordNLPSHPRewardDataset
 
 
 if __name__ == "__main__":
@@ -15,7 +15,7 @@
     parser.add_argument(
         "dataset_name",
         help="dataset name it can be. SSHP: stanfordnlp/SHP or ",
-        choices=["SHP", "ARLHF"],
+        choices=["SHP", "ARLHF", "SHPReward"],
     )
     parser.add_argument(
         "-p",
@@ -49,3 +49,7 @@
             args.path,
             n_samples,
         )
+
+    if args.dataset_name == "SHPReward":
+        dataset = StanfordNLPSHPRewardDataset()
+        dataset.save_dataset(args.path, n_samples)
diff --git a/optimization/chatllama/chatllama/rlhf/dataset.py b/optimization/chatllama/chatllama/rlhf/dataset.py
index 8e1c7c6f..465d7272 100644
--- a/optimization/chatllama/chatllama/rlhf/dataset.py
+++ b/optimization/chatllama/chatllama/rlhf/dataset.py
@@ -2,7 +2,7 @@
 import os
 
 import numpy as np
-
+import pandas as pd
 from beartype.typing import Dict, List, Union
 from datasets import load_dataset
 from chatllama.rlhf.config import Config, ConfigActor, ConfigReward
@@ -430,3 +430,110 @@ def save_dataset(
             json.dump(conversations, f, indent=4)
 
         print("Generation Completed")
+
+
+class StanfordNLPSHPRewardDataset(BaseDataset):
+    """Reward dataset built from the Stanford NLP SHP dataset on HuggingFace"""
+
+    def __init__(
+        self,
+    ) -> None:
+        print("Download the dataset")
+        self.dataset = load_dataset("stanfordnlp/SHP")
+        print("Download Completed")
+
+    def reformat_dataset(self, data: List) -> List[Dict]:
+        """Reformat the dataset to the format required by RLHF
+
+        Args:
+            data (List): dataset from HuggingFace
+
+        Returns:
+            List[Dict]: reformatted dataset
+        """
+
+        def get_normalized_score(x):
+            # Cap the score at the post's upper whisker and rescale to [0, 5]
+            return int(
+                (
+                    min(x["score"], upper_whisker[x["post_id"]])
+                    / min(max_vote[x["post_id"]], upper_whisker[x["post_id"]])
+                ) * 5
+            )
+
+        data = data.to_pandas()
+
+        # Take both answers A and B
+        A_answers = data[["c_root_id_A", "score_A", "post_id", "history", "human_ref_A"]].copy()
+        B_answers = data[["c_root_id_B", "score_B", "post_id", "history", "human_ref_B"]].copy()
+
+        A_answers.rename(
+            columns={
+                "c_root_id_A": "c_id",
+                "score_A": "score",
+                "history": "user_input",
+                "human_ref_A": "completion",
+            },
+            inplace=True,
+        )
+        B_answers.rename(
+            columns={
+                "c_root_id_B": "c_id",
+                "score_B": "score",
+                "history": "user_input",
+                "human_ref_B": "completion",
+            },
+            inplace=True,
+        )
+        conversations = pd.concat([A_answers, B_answers], axis=0)
+
+        # Remove duplicates so that each answer is used only once
+        conversations.drop_duplicates(subset=["c_id"], inplace=True)
+
+        # Compute for each post the upper whisker as Q3 + 1.5 * IQR
+        upper_whisker = conversations.groupby(by=["post_id"]).agg(
+            {"score": lambda x: x.quantile(0.75) + 1.5 * (x.quantile(0.75) - x.quantile(0.25))}
+        )["score"]
+
+        max_vote = conversations.groupby(by=["post_id"]).agg({"score": max})["score"]
+
+        norm_score = conversations.apply(get_normalized_score, axis=1)
+
+        conversations["reward"] = norm_score
+
+        conversations = conversations[["user_input", "completion", "reward"]].rename(columns={"reward": "score"}).to_dict("records")
+
+        return conversations
+
+    def save_dataset(
+        self, dataset_folder: str, number_of_samples: int, reverse: bool = True
+    ) -> None:
+        """Save the dataset in the format required by RLHF
+
+        Args:
+            dataset_folder (str): path to the folder where the dataset
+                will be saved
+            number_of_samples (int): number of samples to take from the
+                dataset
+            reverse (bool, optional): sort the dataset in descending order.
+                Defaults to True.
+        """
+
+        print("Generate reward datasets")
+
+        # take the train and test splits to create the reward dataset
+        conversations = self.reformat_dataset(self.dataset["train"])
+        conversations.extend(self.reformat_dataset(self.dataset["test"]))
+
+        # sort conversations by length of user_input + completion
+        conversations = self.sort_conversation(conversations, reverse=reverse)
+
+        # take N samples and sort them
+        conversations = self.take_n_samples(conversations, number_of_samples)
+        conversations = self.sort_conversation(conversations, reverse=reverse)
+
+        # save reward training data
+        with open(f"{dataset_folder}/reward_training_data.json", "w") as f:
+            json.dump(conversations, f, indent=4)
+
+        print("Generation Completed")
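
Note on the scoring scheme: reformat_dataset caps each answer's raw vote count at its post's upper whisker (Q3 + 1.5 * IQR) and then rescales it to an integer reward in [0, 5], so a single runaway top answer cannot push every other answer on the same post down to a reward of 0. The snippet below is a minimal standalone sketch of that idea; the toy vote counts, the answers frame, and the normalize helper are invented for illustration and are not part of the patch.

import pandas as pd

# Toy data: one post with an outlier top answer, one ordinary post.
answers = pd.DataFrame(
    {
        "post_id": ["p1", "p1", "p1", "p1", "p1", "p1", "p2", "p2"],
        "score": [8, 10, 12, 14, 16, 500, 1, 3],
    }
)

# Upper whisker per post: Q3 + 1.5 * IQR, used to cap outlier vote counts.
upper_whisker = answers.groupby("post_id")["score"].agg(
    lambda x: x.quantile(0.75) + 1.5 * (x.quantile(0.75) - x.quantile(0.25))
)
max_vote = answers.groupby("post_id")["score"].max()


def normalize(row) -> int:
    # Same formula as get_normalized_score in the patch: cap, then rescale to [0, 5].
    capped = min(row["score"], upper_whisker[row["post_id"]])
    denominator = min(max_vote[row["post_id"]], upper_whisker[row["post_id"]])
    return int(capped / denominator * 5)


answers["reward"] = answers.apply(normalize, axis=1)
print(answers)
# p1 rewards come out as 1, 2, 2, 3, 3, 5: the 500-vote outlier is capped at
# the whisker (23.0), so the other answers keep informative rewards.

Without the whisker cap, the denominator for post p1 would be the raw maximum of 500, and every non-outlier answer would round down to a reward of 0.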