Commit
Merge pull request #8 from TianyiQ/main
feat: add support for sglang backend and hub models
TianyiQ authored Oct 6, 2024
2 parents d9d9181 + 0537752 commit 9c7eec4
Showing 72 changed files with 1,194 additions and 1,370 deletions.
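The functional change is small: call sites that previously passed backend="vllm" to Model.inference now pass backend="sglang". A minimal usage sketch (the import path, model name, and prompt are placeholders, not taken from this commit; the keyword arguments mirror the call sites changed below):

# Illustrative only; not part of the diff.
from ProgressGym import Model  # assumed import path; the repository's root __init__.py exports Model

model = Model("placeholder-model-name")  # constructor arguments are illustrative
results = model.inference(
    data=[{"instruction": "Summarize this document.", "input": ""}],
    result_data_name="sglang_smoke_test",
    backend="sglang",  # this PR switches call sites from backend="vllm" to backend="sglang"
)
print(results[0]["predict"])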
8 changes: 4 additions & 4 deletions __init__.py
@@ -17,9 +17,9 @@
 
 __all__ = [
     "run_benchmark",
-    "Model",
-    "Data",
-    "DataFileCollection",
+    "Model",
+    "Data",
+    "DataFileCollection",
     "JudgeBase",
     "ExamineeBase",
     "DummyJudge",
@@ -32,4 +32,4 @@
     "ExtrapolativeDPOExaminee",
     "ExtrapolativeRLHFExaminee",
     "fill_in_QA_template",
-]
+]
181 changes: 75 additions & 106 deletions algorithms/utils/rw_utils.py
@@ -206,7 +206,7 @@ def get_rw_query(
 def elicit_rw_preference(
     examinee: ExamineeBase,
     judge: JudgeBase,
-    backend: Literal["deepspeed", "trl"] = "deepspeed",
+    backend: Literal["deepspeed"] = "deepspeed",
     aligned: bool = False,
 ) -> Data:
     """Elicit preference from the judge for reward modeling."""
@@ -222,120 +222,89 @@ def elicit_rw_preference(
         examinee
     )
 
-    if backend == "trl":
-        rw_data = {"chosen": [], "rejected": []}
-        for i, dic in tqdm(enumerate(query)):
-            q = fill_in_QA_template(dic["instruction"], dic["input"])
-            try:
-                response = judge.query_from_examinee(q)
-                answer = response["predict"]
-            except Exception as e:
-                print(
-                    f"Skipping sample due to error: {type(e)} {e}. Possibly due to over-sized query."
-                )
-                continue
-
-            if "yes" in answer.lower():
-                rw_data["chosen"].append(choice1[i])
-                rw_data["rejected"].append(choice2[i])
-            elif "no" in answer.lower():
-                rw_data["rejected"].append(choice1[i])
-                rw_data["chosen"].append(choice2[i])
-            else:
-                write_log(
-                    "invalid response from judge, "
-                    + str(response)
-                    + "|| response over",
-                    log_name="rlhf",
-                )
-
-    elif backend == "deepspeed":
-        query_with_pred = judge.query_from_examinee(query)
-        assert len(query_with_pred) == len(query), "Query and response length mismatch"
-
-        # with open(f'./logs/query_with_pred_{examinee.current_timestep}_{examinee.instance_id}_{judge.instance_id}.json', 'w') as f:
-        #     json.dump([query, query_with_pred], f)
-
-        rw_data = []
-        debug_data = []
-
-        for i, dic in tqdm(enumerate(query_with_pred)):
-            assert dic["id"] == i, f"ID mismatch: {dic['id']} != {i}"
-            answer = dic["predict"]
-
-            temp = {
-                "instruction": original_instruction[i],
-                "input": original_input[i],
-                "output": [],
-            }
-
-            def get_status(answer: str) -> int:
-                answer = answer.lower()
-                if "1" in answer and "2" not in answer:
-                    return 1
-                if "2" in answer and "1" not in answer:
-                    return 2
-
-                def filter(s: str) -> str:
-                    return "".join([c for c in s if c.isalnum()]).strip().lower()
-
-                choice1_letter = (
-                    "B" if 'Response 1: """B"""' in original_instruction[i] else "A"
-                )
-                choice2_letter = "B" if choice1_letter == "A" else "A"
-                if filter(answer) == filter(choice1_letter):
-                    return 1
-                if filter(answer) == filter(choice2_letter):
-                    return 2
-
-                if len(choice1[i]) > 5 and len(choice2[i]) > 5:
-                    dist1 = Levenshtein.distance(choice1[i], answer)
-                    dist2 = Levenshtein.distance(choice2[i], answer)
-
-                    if (
-                        dist1 <= len(choice1[i]) // 3
-                        and dist2 >= len(choice2[i]) - len(choice2[i]) // 3
-                        and dist1 + 4 <= dist2
-                        and dist1 * 2 < dist2
-                    ):
-                        return 1
-
-                    if (
-                        dist2 <= len(choice2[i]) // 3
-                        and dist1 >= len(choice1[i]) - len(choice1[i]) // 3
-                        and dist2 + 4 <= dist1
-                        and dist2 * 2 < dist1
-                    ):
-                        return 2
-
-                return None
-
-            status = get_status(answer)
-
-            if status == 1:
-                temp["output"].append(choice1[i])
-                temp["output"].append(choice2[i])
-                rw_data.append(temp)
-            elif status == 2:
-                temp["output"].append(choice2[i])
-                temp["output"].append(choice1[i])
-                rw_data.append(temp)
-            else:
-                assert status is None
-                if aligned:
-                    rw_data.append([])
-                write_log(
-                    "invalid response from judge, " + str(dic) + "|| response over",
-                    log_name="rlhf",
-                )
-
-            debug_data.append([dic, temp])
-
-        with open("./logs/debug_data.json", "w") as f:
-            json.dump(debug_data, f)
-
-    else:
-        raise ValueError("backend not recognized")
+    if backend != "deepspeed":
+        raise ValueError("backend not recognized")
+
+    query_with_pred = judge.query_from_examinee(query)
+    assert len(query_with_pred) == len(query), "Query and response length mismatch"
+
+    rw_data = []
+    debug_data = []
+
+    for i, dic in tqdm(enumerate(query_with_pred)):
+        assert dic["id"] == i, f"ID mismatch: {dic['id']} != {i}"
+        answer = dic["predict"]
+
+        temp = {
+            "instruction": original_instruction[i],
+            "input": original_input[i],
+            "output": [],
+        }
+
+        def get_status(answer: str) -> int:
+            answer = answer.lower()
+            if "1" in answer and "2" not in answer:
+                return 1
+            if "2" in answer and "1" not in answer:
+                return 2
+
+            def filter(s: str) -> str:
+                return "".join([c for c in s if c.isalnum()]).strip().lower()
+
+            choice1_letter = (
+                "B" if 'Response 1: """B"""' in original_instruction[i] else "A"
+            )
+            choice2_letter = "B" if choice1_letter == "A" else "A"
+            if filter(answer) == filter(choice1_letter):
+                return 1
+            if filter(answer) == filter(choice2_letter):
+                return 2
+
+            if len(choice1[i]) > 5 and len(choice2[i]) > 5:
+                dist1 = Levenshtein.distance(choice1[i], answer)
+                dist2 = Levenshtein.distance(choice2[i], answer)
+
+                if (
+                    dist1 <= len(choice1[i]) // 3
+                    and dist2 >= len(choice2[i]) - len(choice2[i]) // 3
+                    and dist1 + 4 <= dist2
+                    and dist1 * 2 < dist2
+                ):
+                    return 1
+
+                if (
+                    dist2 <= len(choice2[i]) // 3
+                    and dist1 >= len(choice1[i]) - len(choice1[i]) // 3
+                    and dist2 + 4 <= dist1
+                    and dist2 * 2 < dist1
+                ):
+                    return 2
+
+            return None
+
+        status = get_status(answer)
+
+        if status == 1:
+            temp["output"].append(choice1[i])
+            temp["output"].append(choice2[i])
+            rw_data.append(temp)
+        elif status == 2:
+            temp["output"].append(choice2[i])
+            temp["output"].append(choice1[i])
+            rw_data.append(temp)
+        else:
+            assert status is None
+            if aligned:
+                rw_data.append([])
+            write_log(
+                "invalid response from judge, " + str(dic) + "|| response over",
+                log_name="rlhf",
+            )
+
+        debug_data.append([dic, temp])
+
+    with open("./logs/debug_data.json", "w") as f:
+        json.dump(debug_data, f)
 
     with open(save_path, "w") as f:
         json.dump(rw_data, f)
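For readers skimming the diff: the disambiguation heuristic inside get_status above can be read on its own. Below is a minimal standalone sketch, with the thresholds copied from the code above, a simplified signature, and the same Levenshtein dependency assumed; the letter-matching case is only summarized in a comment.

from typing import Optional

import Levenshtein  # same third-party dependency that rw_utils.py uses


def match_preference(answer: str, choice1: str, choice2: str) -> Optional[int]:
    """Return 1 or 2 for the response the judge preferred, or None if the reply is ambiguous."""
    low = answer.lower()

    # Case 1: the judge replied with an index ("1" or "2").
    if "1" in low and "2" not in low:
        return 1
    if "2" in low and "1" not in low:
        return 2

    # Case 2 (handled in rw_utils.py, omitted here): the judge replied with the letter
    # A/B, which is mapped back to a choice using the original prompt text.

    # Case 3: the judge echoed one of the responses. Accept only if the edit distance to
    # that response is small AND the distance to the other response is clearly larger.
    if len(choice1) > 5 and len(choice2) > 5:
        dist1 = Levenshtein.distance(choice1, answer)
        dist2 = Levenshtein.distance(choice2, answer)
        if (
            dist1 <= len(choice1) // 3
            and dist2 >= len(choice2) - len(choice2) // 3
            and dist1 + 4 <= dist2
            and dist1 * 2 < dist2
        ):
            return 1
        if (
            dist2 <= len(choice2) // 3
            and dist1 >= len(choice1) - len(choice1) // 3
            and dist2 + 4 <= dist1
            and dist2 * 2 < dist1
        ):
            return 2

    return None

For example, match_preference("Response 1 is clearly better", resp_a, resp_b) returns 1, while a reply that merely paraphrases both responses returns None and would be logged as an invalid judge response.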
18 changes: 9 additions & 9 deletions benchmark/framework.py
@@ -163,7 +163,7 @@ def query_from_examinee(
             results: str = model.inference(
                 data=[{"instruction": prompt}],
                 result_data_name=f"{self.instance_id}_query_from_examinee_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )[0]["predict"]
 
         elif isinstance(prompt, Data):
@@ -184,7 +184,7 @@
             results: Data = model.inference(
                 data=prompt,
                 result_data_name=f"{self.instance_id}_query_from_examinee_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )
 
         elif isinstance(prompt, list):
@@ -203,7 +203,7 @@
             results: List[Dict] = model.inference(
                 data=prompt,
                 result_data_name=f"{self.instance_id}_query_from_examinee_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )
 
         else:
@@ -234,7 +234,7 @@ def produce_final_result(self) -> Dict[str, Any]:
         }
 
     @abstractmethod
-    #@classmethod
+    # @classmethod
     def interpret_result(cls, result: Dict[str, Any]) -> float:
         """Given a benchmark result dictionary, calculate a single score that represents the overall performance of the examinee. HIGHER scores must mean better performance. This method is called by the leaderboard to rank the examinees."""
         raise NotImplementedError
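A hypothetical subclass sketch of the abstract interpret_result method in the hunk above (the import path follows this repository's layout, the owning class is assumed to be JudgeBase as the surrounding hunk suggests, and the "score" key is an invented example):

from typing import Any, Dict

from benchmark.framework import JudgeBase  # as laid out in this repository


class ExampleJudge(JudgeBase):
    """Hypothetical subclass; only interpret_result is sketched here."""

    @classmethod
    def interpret_result(cls, result: Dict[str, Any]) -> float:
        # The key "score" is an assumption about the result dictionary's contents;
        # the contract from the docstring is simply: return one float, higher = better.
        return float(result["score"])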
@@ -305,7 +305,7 @@ def reset(self, **kwargs) -> None:
         When `examinee_model_size` is not specified, the model will be initialized as a copy of the Judge's initial model. In that case, the examinee will be able to start from the same initial state as the judge.
         Normally, you should implement this method in your subclass to initialize the examinee as needed, after calling the base class implementation for basic setup.
         """
 
         if "model_name" not in kwargs:
             self.model_size = (
                 int(kwargs["examinee_model_size"].lower().replace("b", "").strip())
@@ -389,21 +389,21 @@ def query_from_judge(
             results: str = model.inference(
                 data=[{"instruction": prompt, "input": ""}],
                 result_data_name=f"{self.instance_id}_query_from_judge_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )[0]["predict"]
 
         elif isinstance(prompt, Data):
             results: Data = model.inference(
                 data=prompt,
                 result_data_name=f"{self.instance_id}_query_from_judge_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )
 
         elif isinstance(prompt, list):
             results: List[Dict] = model.inference(
                 data=prompt,
                 result_data_name=f"{self.instance_id}_query_from_judge_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )
 
         else:
@@ -433,7 +433,7 @@ def run(self, judge: JudgeBase) -> Iterable:
         At every iteration:
         1. The examinee learns about the latest human preference by calling the judge's query_from_examinee method.
         2. After it has updated its language model, it yields control back to the judge and allows it to evaluate it (by calling query_from_judge).
         Unless you are sure that you need to completely override this method, you should not do so. Instead, call the base class implementation at the beginning of your subclass's implementation.
         """
 
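The run() docstring above describes the judge/examinee control flow. A minimal driver sketch for illustration (only the method names run, query_from_examinee, and query_from_judge come from the code above; the driver function itself and the evaluation prompt are assumptions):

from typing import Any, List


def drive_benchmark(judge: Any, examinee: Any, eval_prompt: str) -> List[str]:
    """Illustrative only: step the examinee's run() generator and probe it after each update."""
    evaluations = []
    for _ in examinee.run(judge):
        # By the time run() yields, the examinee has queried judge.query_from_examinee(...)
        # and finished updating its language model for the current timestep.
        reply = examinee.query_from_judge(eval_prompt)  # judge-side evaluation of the updated model
        evaluations.append(reply)
    return evaluations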
6 changes: 4 additions & 2 deletions build_dataset.py
@@ -54,8 +54,10 @@ def build_pile_of_law():
 
 if __name__ == "__main__":
     tw.write_log(f"\n\n\n\n\n\n=========== NEW RUN ============\n\n")
-    print('This script is NOT meant to be run as part of the benchmarking process. Unless you would like to replicate the dataset building & model training process, you could directly run `run_benchmark.py` instead, which will automatically download the pre-built dataset and/or models on demand.')
-
+    print(
+        "This script is NOT meant to be run as part of the benchmarking process. Unless you would like to replicate the dataset building & model training process, you could directly run `run_benchmark.py` instead, which will automatically download the pre-built dataset and/or models on demand."
+    )
+
     # Checking if the user wants to proceed
     if "y" not in input("Start running this script? (y/n)").lower():
         print("Exiting.")
2 changes: 1 addition & 1 deletion challenges/coevolve.py
@@ -73,7 +73,7 @@ def update_human_proxy(
         )
         query_results: Data = influencer.inference(
             data=self.queries,
-            backend="vllm",
+            backend="sglang",
             result_data_name=f"{self.instance_id}_interact_{comment}_{self.eval_times}th",
         )
         query_results.set_key_fields(
Binary file modified doc_generation/build/doctrees/Model.doctree
Binary file modified doc_generation/build/doctrees/running.doctree
6 changes: 3 additions & 3 deletions doc_generation/source/conf.py
@@ -21,9 +21,9 @@
 extensions = ["sphinx.ext.autodoc"]
 
 autodoc_default_options = {
-    'members': True,
-    'special-members': '__init__',  # only document the constructor among special members
-    'private-members': False,  # do not document private members
+    "members": True,
+    "special-members": "__init__",  # only document the constructor among special members
+    "private-members": False,  # do not document private members
 }
 
 templates_path = ["_templates"]
10 changes: 10 additions & 0 deletions doc_generation/source/running.rst
@@ -22,6 +22,16 @@ You could install ProgressGym by cloning the codebase:
     $ cd ./ProgressGym
     $ . ./initialize.sh
+
+or
+
+.. code-block:: console
+
+    $ git clone https://github.com/PKU-Alignment/ProgressGym.git
+    $ cd ./ProgressGym
+    $ . ./initialize.sh --multinode
+
+...if you intend to run ProgressGym on a multi-node environment.
 
 This will create two new conda environments ``pa38`` and ``pa38-lf``, where the former is for the normal operations of ProgressGym, and the latter is used by ProgressGym when calling LLaMA-Factory internally.
 
 When calling entry-point scripts of ProgressGym (e.g., ``run_benchmark.py`` and ``build_dataset.py``), please make sure you are in the ``pa38`` environment, and ProgressGym will automatically handle any further environment switching.
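As a usage sketch of the instructions above (not introduced by this commit; the conda invocation is the standard one, and run_benchmark.py arguments are omitted since they depend on your configuration):

.. code-block:: console

    $ conda activate pa38
    $ python run_benchmark.py  # supply your benchmark arguments here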
(Diffs for the remaining changed files are not shown.)
