Commit
Merge pull request #8 from TianyiQ/main
feat: add support for sglang backend and hub models
TianyiQ authored Oct 6, 2024
2 parents d9d9181 + 0537752 commit 9c7eec4
Showing 72 changed files with 1,194 additions and 1,370 deletions.
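The functional change is small: call sites that previously passed backend="vllm" to Model.inference now pass backend="sglang". A minimal usage sketch (the import path, model name, and prompt are placeholders, not taken from this commit; the keyword arguments mirror the call sites changed below):

# Illustrative only; not part of the diff.
from ProgressGym import Model  # assumed import path; the repository's root __init__.py exports Model

model = Model("placeholder-model-name")  # constructor arguments are illustrative
results = model.inference(
    data=[{"instruction": "Summarize this document.", "input": ""}],
    result_data_name="sglang_smoke_test",
    backend="sglang",  # this PR switches call sites from backend="vllm" to backend="sglang"
)
print(results[0]["predict"])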
8 changes: 4 additions & 4 deletions __init__.py
@@ -17,9 +17,9 @@
 
 __all__ = [
     "run_benchmark",
-    "Model",
-    "Data",
-    "DataFileCollection",
+    "Model",
+    "Data",
+    "DataFileCollection",
     "JudgeBase",
     "ExamineeBase",
     "DummyJudge",
@@ -32,4 +32,4 @@
     "ExtrapolativeDPOExaminee",
     "ExtrapolativeRLHFExaminee",
     "fill_in_QA_template",
-]
+]
181 changes: 75 additions & 106 deletions algorithms/utils/rw_utils.py
@@ -206,7 +206,7 @@ def get_rw_query(
 def elicit_rw_preference(
     examinee: ExamineeBase,
     judge: JudgeBase,
-    backend: Literal["deepspeed", "trl"] = "deepspeed",
+    backend: Literal["deepspeed"] = "deepspeed",
     aligned: bool = False,
 ) -> Data:
     """Elicit preference from the judge for reward modeling."""
@@ -222,120 +222,89 @@ def elicit_rw_preference(
         examinee
     )
 
-    if backend == "trl":
-        rw_data = {"chosen": [], "rejected": []}
-        for i, dic in tqdm(enumerate(query)):
-            q = fill_in_QA_template(dic["instruction"], dic["input"])
-            try:
-                response = judge.query_from_examinee(q)
-                answer = response["predict"]
-            except Exception as e:
-                print(
-                    f"Skipping sample due to error: {type(e)} {e}. Possibly due to over-sized query."
-                )
-                continue
-
-            if "yes" in answer.lower():
-                rw_data["chosen"].append(choice1[i])
-                rw_data["rejected"].append(choice2[i])
-            elif "no" in answer.lower():
-                rw_data["rejected"].append(choice1[i])
-                rw_data["chosen"].append(choice2[i])
-            else:
-                write_log(
-                    "invalid response from judge, "
-                    + str(response)
-                    + "|| response over",
-                    log_name="rlhf",
-                )
-
-    elif backend == "deepspeed":
-        query_with_pred = judge.query_from_examinee(query)
-        assert len(query_with_pred) == len(query), "Query and response length mismatch"
-
-        # with open(f'./logs/query_with_pred_{examinee.current_timestep}_{examinee.instance_id}_{judge.instance_id}.json', 'w') as f:
-        #     json.dump([query, query_with_pred], f)
-
-        rw_data = []
-        debug_data = []
-
-        for i, dic in tqdm(enumerate(query_with_pred)):
-            assert dic["id"] == i, f"ID mismatch: {dic['id']} != {i}"
-            answer = dic["predict"]
-
-            temp = {
-                "instruction": original_instruction[i],
-                "input": original_input[i],
-                "output": [],
-            }
-
-            def get_status(answer: str) -> int:
-                answer = answer.lower()
-                if "1" in answer and "2" not in answer:
-                    return 1
-                if "2" in answer and "1" not in answer:
-                    return 2
-
-                def filter(s: str) -> str:
-                    return "".join([c for c in s if c.isalnum()]).strip().lower()
-
-                choice1_letter = (
-                    "B" if 'Response 1: """B"""' in original_instruction[i] else "A"
-                )
-                choice2_letter = "B" if choice1_letter == "A" else "A"
-                if filter(answer) == filter(choice1_letter):
-                    return 1
-                if filter(answer) == filter(choice2_letter):
-                    return 2
-
-                if len(choice1[i]) > 5 and len(choice2[i]) > 5:
-                    dist1 = Levenshtein.distance(choice1[i], answer)
-                    dist2 = Levenshtein.distance(choice2[i], answer)
-
-                    if (
-                        dist1 <= len(choice1[i]) // 3
-                        and dist2 >= len(choice2[i]) - len(choice2[i]) // 3
-                        and dist1 + 4 <= dist2
-                        and dist1 * 2 < dist2
-                    ):
-                        return 1
-
-                    if (
-                        dist2 <= len(choice2[i]) // 3
-                        and dist1 >= len(choice1[i]) - len(choice1[i]) // 3
-                        and dist2 + 4 <= dist1
-                        and dist2 * 2 < dist1
-                    ):
-                        return 2
-
-                return None
-
-            status = get_status(answer)
-
-            if status == 1:
-                temp["output"].append(choice1[i])
-                temp["output"].append(choice2[i])
-                rw_data.append(temp)
-            elif status == 2:
-                temp["output"].append(choice2[i])
-                temp["output"].append(choice1[i])
-                rw_data.append(temp)
-            else:
-                assert status is None
-                if aligned:
-                    rw_data.append([])
-                write_log(
-                    "invalid response from judge, " + str(dic) + "|| response over",
-                    log_name="rlhf",
-                )
-
-            debug_data.append([dic, temp])
-
-        with open("./logs/debug_data.json", "w") as f:
-            json.dump(debug_data, f)
-
-    else:
-        raise ValueError("backend not recognized")
+    if backend != "deepspeed":
+        raise ValueError("backend not recognized")
+
+    query_with_pred = judge.query_from_examinee(query)
+    assert len(query_with_pred) == len(query), "Query and response length mismatch"
+
+    rw_data = []
+    debug_data = []
+
+    for i, dic in tqdm(enumerate(query_with_pred)):
+        assert dic["id"] == i, f"ID mismatch: {dic['id']} != {i}"
+        answer = dic["predict"]
+
+        temp = {
+            "instruction": original_instruction[i],
+            "input": original_input[i],
+            "output": [],
+        }
+
+        def get_status(answer: str) -> int:
+            answer = answer.lower()
+            if "1" in answer and "2" not in answer:
+                return 1
+            if "2" in answer and "1" not in answer:
+                return 2
+
+            def filter(s: str) -> str:
+                return "".join([c for c in s if c.isalnum()]).strip().lower()
+
+            choice1_letter = (
+                "B" if 'Response 1: """B"""' in original_instruction[i] else "A"
+            )
+            choice2_letter = "B" if choice1_letter == "A" else "A"
+            if filter(answer) == filter(choice1_letter):
+                return 1
+            if filter(answer) == filter(choice2_letter):
+                return 2
+
+            if len(choice1[i]) > 5 and len(choice2[i]) > 5:
+                dist1 = Levenshtein.distance(choice1[i], answer)
+                dist2 = Levenshtein.distance(choice2[i], answer)
+
+                if (
+                    dist1 <= len(choice1[i]) // 3
+                    and dist2 >= len(choice2[i]) - len(choice2[i]) // 3
+                    and dist1 + 4 <= dist2
+                    and dist1 * 2 < dist2
+                ):
+                    return 1
+
+                if (
+                    dist2 <= len(choice2[i]) // 3
+                    and dist1 >= len(choice1[i]) - len(choice1[i]) // 3
+                    and dist2 + 4 <= dist1
+                    and dist2 * 2 < dist1
+                ):
+                    return 2
+
+            return None
+
+        status = get_status(answer)
+
+        if status == 1:
+            temp["output"].append(choice1[i])
+            temp["output"].append(choice2[i])
+            rw_data.append(temp)
+        elif status == 2:
+            temp["output"].append(choice2[i])
+            temp["output"].append(choice1[i])
+            rw_data.append(temp)
+        else:
+            assert status is None
+            if aligned:
+                rw_data.append([])
+            write_log(
+                "invalid response from judge, " + str(dic) + "|| response over",
+                log_name="rlhf",
+            )
+
+        debug_data.append([dic, temp])
+
+    with open("./logs/debug_data.json", "w") as f:
+        json.dump(debug_data, f)
 
     with open(save_path, "w") as f:
         json.dump(rw_data, f)
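For readers skimming the diff: the disambiguation heuristic inside get_status above can be read on its own. Below is a minimal standalone sketch, with the thresholds copied from the code above, a simplified signature, and the same Levenshtein dependency assumed; the letter-matching case is only summarized in a comment.

from typing import Optional

import Levenshtein  # same third-party dependency that rw_utils.py uses


def match_preference(answer: str, choice1: str, choice2: str) -> Optional[int]:
    """Return 1 or 2 for the response the judge preferred, or None if the reply is ambiguous."""
    low = answer.lower()

    # Case 1: the judge replied with an index ("1" or "2").
    if "1" in low and "2" not in low:
        return 1
    if "2" in low and "1" not in low:
        return 2

    # Case 2 (handled in rw_utils.py, omitted here): the judge replied with the letter
    # A/B, which is mapped back to a choice using the original prompt text.

    # Case 3: the judge echoed one of the responses. Accept only if the edit distance to
    # that response is small AND the distance to the other response is clearly larger.
    if len(choice1) > 5 and len(choice2) > 5:
        dist1 = Levenshtein.distance(choice1, answer)
        dist2 = Levenshtein.distance(choice2, answer)
        if (
            dist1 <= len(choice1) // 3
            and dist2 >= len(choice2) - len(choice2) // 3
            and dist1 + 4 <= dist2
            and dist1 * 2 < dist2
        ):
            return 1
        if (
            dist2 <= len(choice2) // 3
            and dist1 >= len(choice1) - len(choice1) // 3
            and dist2 + 4 <= dist1
            and dist2 * 2 < dist1
        ):
            return 2

    return None

For example, match_preference("Response 1 is clearly better", resp_a, resp_b) returns 1, while a reply that merely paraphrases both responses returns None and would be logged as an invalid judge response.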
18 changes: 9 additions & 9 deletions benchmark/framework.py
@@ -163,7 +163,7 @@ def query_from_examinee(
             results: str = model.inference(
                 data=[{"instruction": prompt}],
                 result_data_name=f"{self.instance_id}_query_from_examinee_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )[0]["predict"]
 
         elif isinstance(prompt, Data):
@@ -184,7 +184,7 @@
             results: Data = model.inference(
                 data=prompt,
                 result_data_name=f"{self.instance_id}_query_from_examinee_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )
 
         elif isinstance(prompt, list):
@@ -203,7 +203,7 @@
             results: List[Dict] = model.inference(
                 data=prompt,
                 result_data_name=f"{self.instance_id}_query_from_examinee_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )
 
         else:
@@ -234,7 +234,7 @@ def produce_final_result(self) -> Dict[str, Any]:
         }
 
     @abstractmethod
-    #@classmethod
+    # @classmethod
     def interpret_result(cls, result: Dict[str, Any]) -> float:
         """Given a benchmark result dictionary, calculate a single score that represents the overall performance of the examinee. HIGHER scores must mean better performance. This method is called by the leaderboard to rank the examinees."""
         raise NotImplementedError
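A hypothetical subclass sketch of the abstract interpret_result method in the hunk above (the import path follows this repository's layout, the owning class is assumed to be JudgeBase as the surrounding hunk suggests, and the "score" key is an invented example):

from typing import Any, Dict

from benchmark.framework import JudgeBase  # as laid out in this repository


class ExampleJudge(JudgeBase):
    """Hypothetical subclass; only interpret_result is sketched here."""

    @classmethod
    def interpret_result(cls, result: Dict[str, Any]) -> float:
        # The key "score" is an assumption about the result dictionary's contents;
        # the contract from the docstring is simply: return one float, higher = better.
        return float(result["score"])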
@@ -305,7 +305,7 @@ def reset(self, **kwargs) -> None:
         When `examinee_model_size` is not specified, the model will be initialized as a copy of the Judge's initial model. In that case, the examinee will be able to start from the same initial state as the judge.
         Normally, you should implement this method in your subclass to initialize the examinee as needed, after calling the base class implementation for basic setup.
         """
 
         if "model_name" not in kwargs:
             self.model_size = (
                 int(kwargs["examinee_model_size"].lower().replace("b", "").strip())
@@ -389,21 +389,21 @@ def query_from_judge(
             results: str = model.inference(
                 data=[{"instruction": prompt, "input": ""}],
                 result_data_name=f"{self.instance_id}_query_from_judge_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )[0]["predict"]
 
         elif isinstance(prompt, Data):
             results: Data = model.inference(
                 data=prompt,
                 result_data_name=f"{self.instance_id}_query_from_judge_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )
 
         elif isinstance(prompt, list):
             results: List[Dict] = model.inference(
                 data=prompt,
                 result_data_name=f"{self.instance_id}_query_from_judge_{self.query_times}th",
-                backend="vllm",
+                backend="sglang",
             )
 
         else:
@@ -433,7 +433,7 @@ def run(self, judge: JudgeBase) -> Iterable:
         At every iteration:
         1. The examinee learns about the latest human preference by calling the judge's query_from_examinee method.
         2. After it has updated its language model, it yields control back to the judge and allows it to evaluate it (by calling query_from_judge).
         Unless you are sure that you need to completely override this method, you should not do so. Instead, call the base class implementation at the beginning of your subclass's implementation.
         """
 
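The run() docstring above describes the judge/examinee control flow. A minimal driver sketch for illustration (only the method names run, query_from_examinee, and query_from_judge come from the code above; the driver function itself and the evaluation prompt are assumptions):

from typing import Any, List


def drive_benchmark(judge: Any, examinee: Any, eval_prompt: str) -> List[str]:
    """Illustrative only: step the examinee's run() generator and probe it after each update."""
    evaluations = []
    for _ in examinee.run(judge):
        # By the time run() yields, the examinee has queried judge.query_from_examinee(...)
        # and finished updating its language model for the current timestep.
        reply = examinee.query_from_judge(eval_prompt)  # judge-side evaluation of the updated model
        evaluations.append(reply)
    return evaluations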
6 changes: 4 additions & 2 deletions build_dataset.py
@@ -54,8 +54,10 @@ def build_pile_of_law():
 
 if __name__ == "__main__":
     tw.write_log(f"\n\n\n\n\n\n=========== NEW RUN ============\n\n")
-    print('This script is NOT meant to be run as part of the benchmarking process. Unless you would like to replicate the dataset building & model training process, you could directly run `run_benchmark.py` instead, which will automatically download the pre-built dataset and/or models on demand.')
-
+    print(
+        "This script is NOT meant to be run as part of the benchmarking process. Unless you would like to replicate the dataset building & model training process, you could directly run `run_benchmark.py` instead, which will automatically download the pre-built dataset and/or models on demand."
+    )
+
     # Checking if the user wants to proceed
     if "y" not in input("Start running this script? (y/n)").lower():
         print("Exiting.")
2 changes: 1 addition & 1 deletion challenges/coevolve.py
@@ -73,7 +73,7 @@ def update_human_proxy(
         )
         query_results: Data = influencer.inference(
             data=self.queries,
-            backend="vllm",
+            backend="sglang",
             result_data_name=f"{self.instance_id}_interact_{comment}_{self.eval_times}th",
         )
         query_results.set_key_fields(
Binary file modified doc_generation/build/doctrees/Model.doctree
Binary file modified doc_generation/build/doctrees/running.doctree
6 changes: 3 additions & 3 deletions doc_generation/source/conf.py
@@ -21,9 +21,9 @@
 extensions = ["sphinx.ext.autodoc"]
 
 autodoc_default_options = {
-    'members': True,
-    'special-members': '__init__',  # only document the constructor among special members
-    'private-members': False,  # do not document private members
+    "members": True,
+    "special-members": "__init__",  # only document the constructor among special members
+    "private-members": False,  # do not document private members
 }
 
 templates_path = ["_templates"]
10 changes: 10 additions & 0 deletions doc_generation/source/running.rst
@@ -22,6 +22,16 @@ You could install ProgressGym by cloning the codebase:
     $ cd ./ProgressGym
     $ . ./initialize.sh
+
+or
+
+.. code-block:: console
+
+    $ git clone https://github.com/PKU-Alignment/ProgressGym.git
+    $ cd ./ProgressGym
+    $ . ./initialize.sh --multinode
+
+...if you intend to run ProgressGym on a multi-node environment.
 
 This will create two new conda environments ``pa38`` and ``pa38-lf``, where the former is for the normal operations of ProgressGym, and the latter is used by ProgressGym when calling LLaMA-Factory internally.
 
 When calling entry-point scripts of ProgressGym (e.g., ``run_benchmark.py`` and ``build_dataset.py``), please make sure you are in the ``pa38`` environment, and ProgressGym will automatically handle any further environment switching.
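As a usage sketch of the instructions above (not introduced by this commit; the conda invocation is the standard one, and run_benchmark.py arguments are omitted since they depend on your configuration):

.. code-block:: console

    $ conda activate pa38
    $ python run_benchmark.py  # supply your benchmark arguments here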
(Diffs for the remaining changed files are not shown.)
