
Commit

Merge pull request #20 from TianyiQ/main
chore: support importing from outside
TianyiQ authored Nov 28, 2024
2 parents b898f21 + d60f437 commit 2116d28
Showing 42 changed files with 266 additions and 216 deletions.
3 changes: 3 additions & 0 deletions __init__.py
@@ -1,3 +1,6 @@
+import os, sys
+sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
+
 from benchmark.framework import JudgeBase, ExamineeBase
 from benchmark.dummies import DummyJudge
 from challenges.follow import FollowJudge
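These two prepended lines are what let the repository be imported as a package from an arbitrary working directory: the package's own directory is pushed to the front of sys.path, so the absolute imports below (benchmark, challenges, ...) keep resolving. A minimal usage sketch, assuming the repository is checked out under a directory named ProgressGym; the directory name and import path are assumptions, not part of this diff:

    # Hypothetical external script, living outside the repository checkout.
    import sys
    sys.path.append("/path/to/dir_containing_the_checkout")  # parent directory of ProgressGym/

    # Importing the package runs __init__.py, which prepends the package directory
    # to sys.path so its internal absolute imports (benchmark, challenges, ...) work.
    from ProgressGym import JudgeBase, ExamineeBase, DummyJudge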
5 changes: 2 additions & 3 deletions algorithms/extrapolative_dpo.py
@@ -6,10 +6,9 @@
 import pandas as pd
 import json
 import datasets
-from src.text_utils import write_log
+from src.utils.text_utils import write_log
 from benchmark import JudgeBase, ExamineeBase, PredictJudge
-from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
-from algorithms.utils.extrapolation_utils import extrapolate
+from src.utils.data_utils import elicit_rw_preference, default_rw_data, extrapolate
 import warnings
 from tqdm import tqdm
 import numpy as np
6 changes: 3 additions & 3 deletions algorithms/extrapolative_rlhf.py
@@ -6,14 +6,14 @@
 import pandas as pd
 import json
 import datasets
-from src.text_utils import write_log
+from src.utils.text_utils import write_log
 from benchmark import JudgeBase, ExamineeBase, PredictJudge
-from algorithms.utils.rw_utils import (
+from src.utils.data_utils import (
     elicit_rw_preference,
     default_rw_data,
     default_ppo_data,
+    extrapolate,
 )
-from algorithms.utils.extrapolation_utils import extrapolate
 import warnings
 from tqdm import tqdm
 from sympy import binomial
4 changes: 2 additions & 2 deletions algorithms/lifelong_dpo.py
@@ -6,9 +6,9 @@
 import pandas as pd
 import json
 import datasets
-from src.text_utils import write_log
+from src.utils.text_utils import write_log
 from benchmark import JudgeBase, ExamineeBase, PredictJudge
-from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
+from src.utils.data_utils import elicit_rw_preference, default_rw_data
 import warnings
 from tqdm import tqdm

4 changes: 2 additions & 2 deletions algorithms/lifelong_rlhf.py
@@ -6,9 +6,9 @@
 import pandas as pd
 import json
 import datasets
-from src.text_utils import write_log
+from src.utils.text_utils import write_log
 from benchmark import JudgeBase, ExamineeBase, PredictJudge
-from algorithms.utils.rw_utils import (
+from src.utils.data_utils import (
     elicit_rw_preference,
     default_rw_data,
     default_ppo_data,
34 changes: 17 additions & 17 deletions build_dataset.py
@@ -1,4 +1,5 @@
-import src.text_utils as tw
+from src.path import root
+import src.utils.text_utils as tw
 import src.cleanser.rule_based_cleanser as rb
 import src.cleanser.localllm_cleanser as llm_cleanser
 import src.model_training.train_hislm as hislm
@@ -8,7 +9,6 @@
 import os
 import time

-
 import src.eebo.download_eebo as eebo_dl
 import src.eebo.process_eebo as eebo_pc

@@ -26,10 +26,10 @@ def build_EEBO():

 def build_gutenberg():
     print("======= START BUILDING GUTENBERG DATASET =======")
-    dir = "./dataset/raw_downloads/Gutenberg/"
+    dir = f"{root}/dataset/raw_downloads/Gutenberg/"
     gtb_gd.get_data_gutenberg(dir)
     gtb_gm.gather_meta(
-        os.path.join(dir, "data/raw"), "./dataset/raw_downloads/Gutenberg_records.txt"
+        os.path.join(dir, "data/raw"), f"{root}/dataset/raw_downloads/Gutenberg_records.txt"
     )
     print("======= FINISHED BUILDING GUTENBERG DATASET =======\n\n\n")

@@ -91,8 +91,8 @@ def build_pile_of_law():
     ):
         proceed = True
         rb.cleanse(
-            "./dataset/dataset_text_sequence/",
-            "./dataset/dataset_text_sequence_rulebased_cleansed/",
+            f"{root}/dataset/dataset_text_sequence/",
+            f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
         )
         print("Finished rule-based data cleansing. Now exiting.")

@@ -102,25 +102,25 @@ def build_pile_of_law():
     ):
         proceed = True
         llm_cleanser.run_cleanser(
-            in_path="./dataset/dataset_text_sequence_rulebased_cleansed/",
-            out_path="./dataset/dataset_text_sequence_llm_cleansed/",
+            in_path=f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
+            out_path=f"{root}/dataset/dataset_text_sequence_llm_cleansed/",
         )

     # Make llm-cleansed version the official version ("dataset_text_sequence"), and move the other two versions into dataset/raw_downloads
     path = (
-        f"./dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/"
+        f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/"
     )
     os.makedirs(path)

     print(f"Moving pre-cleansing version to backup folder...")
     os.rename(
-        "./dataset/dataset_text_sequence/",
+        f"{root}/dataset/dataset_text_sequence/",
         os.path.join(path, "dataset_text_sequence_original/"),
     )

     print(f"Moving rule-cleansed version to backup folder...")
     os.rename(
-        "./dataset/dataset_text_sequence_rulebased_cleansed/",
+        f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
         os.path.join(path, "dataset_text_sequence_rulebased_cleansed/"),
     )

@@ -131,7 +131,7 @@ def build_pile_of_law():

     print(f"Copying LLM-cleansed version to backup folder...")
     os.rename(
-        "./dataset/dataset_text_sequence_llm_cleansed/",
+        f"{root}/dataset/dataset_text_sequence_llm_cleansed/",
         os.path.join(path, "dataset_text_sequence_llm_cleansed/"),
     )

@@ -148,19 +148,19 @@ def build_pile_of_law():
         proceed = True

     print(f"Removing overly small or messy subdatasets...")
-    path = f"./dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/removed/"
+    path = f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/removed/"
     os.makedirs(path)

     sub_datasets = [
         f
-        for f in os.listdir("./dataset/dataset_text_sequence/")
-        if os.path.isdir(os.path.join("./dataset/dataset_text_sequence/", f))
+        for f in os.listdir(f"{root}/dataset/dataset_text_sequence/")
+        if os.path.isdir(os.path.join(f"{root}/dataset/dataset_text_sequence/", f))
     ]
     for sub in sub_datasets:
         # Remove if size < 10MB AND century number < 13
         if (
             hislm.get_directory_size_bytes(
-                os.path.join("./dataset/dataset_text_sequence/", sub)
+                os.path.join(f"{root}/dataset/dataset_text_sequence/", sub)
             )
             < 10 * 1024 * 1024
             and int(sub.strip("C")) < 13
@@ -169,7 +169,7 @@ def build_pile_of_law():
             os.system(f"mv ./dataset/dataset_text_sequence/{sub} {path}")

     hislm.run_training(
-        "./dataset/dataset_text_sequence/", "./dataset/dataset_model_sequence/"
+        f"{root}/dataset/dataset_text_sequence/", f"{root}/dataset/dataset_model_sequence/"
     )
     print("Finished model training. Exiting.")
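Throughout the diff, hard-coded "./..." paths are replaced with f"{root}/..." using the newly imported src.path.root, so the scripts resolve the same files regardless of the caller's working directory. The src/path.py module itself is not part of the shown diff; a minimal sketch of what it presumably contains (an assumption, not the actual file):

    # src/path.py -- hypothetical sketch; the real module is not shown in this commit.
    import os

    # Absolute path of the repository root, i.e. the parent directory of the src/ package.
    root: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))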
11 changes: 6 additions & 5 deletions challenges/coevolve.py
@@ -1,11 +1,12 @@
+from src.path import root
+from src.utils.data_utils import elicit_rw_preference, default_rw_data
 from benchmark.framework import JudgeBase, ExamineeBase
 from typing import Iterable, Tuple, Dict, Union, List, Any
 from src.abstractions import Model, Data
 import numpy as np
 import scipy.spatial as sp
 import datasets
-import json, os
-from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
+import json, os, sys


 class CoevolveJudge(JudgeBase):
@@ -27,10 +28,10 @@ def reset(self, **kwargs) -> None:
         assert self.simulated_model.model_name == self.model_list[0].model_name

         if os.path.exists(
-            f"./output/benchmark_results/initial_supplementary_data.json"
+            f"{root}/output/benchmark_results/initial_supplementary_data.json"
         ):
             with open(
-                f"./output/benchmark_results/initial_supplementary_data.json", "r"
+                f"{root}/output/benchmark_results/initial_supplementary_data.json", "r"
             ) as f:
                 self.supplementary_data = json.load(f)
         else:
@@ -47,7 +48,7 @@ def reset(self, **kwargs) -> None:

         # Backup supplementary data
         with open(
-            f"./output/benchmark_results/initial_supplementary_data.json", "w"
+            f"{root}/output/benchmark_results/initial_supplementary_data.json", "w"
         ) as f:
             json.dump(self.supplementary_data, f)

3 changes: 2 additions & 1 deletion doc_generation/source/conf.py
@@ -5,10 +5,11 @@

 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+from src.path import root
 import os
 import sys

-sys.path.insert(0, os.path.abspath("../.."))
+sys.path.insert(0, os.path.abspath(root))

 project = "ProgressGym"
 copyright = "2024 PKU Alignment, Tianyi Qiu, Yang Zhang, Xuchuan Huang, Xinze Li"
3 changes: 2 additions & 1 deletion examples/abstractions/finetuning_datamanip.py
@@ -1,3 +1,4 @@
+from src.path import root
 from src.abstractions import Model, Data, DataFileCollection

 gemma2b_base = Model(
@@ -59,7 +60,7 @@ def remove_curse_words(sample_dict: dict) -> dict:
 histext_collection = DataFileCollection(  # build a collection holding json files of year 1826 to 2018
     collection_name="histext_1826_to_2018_collection",
     data_type="pretrain",
-    collection_path="./dataset/dataset_text_sequence/",
+    collection_path=f"{root}/dataset/dataset_text_sequence/",
     file_selection_func=(
         lambda path: "Y" in path and 1826 <= int(path.split("/")[-1][1:6]) <= 2018
     ),  # if this argument is omitted, all json files will be selected
3 changes: 2 additions & 1 deletion run_benchmark.py
@@ -28,6 +28,7 @@
 Note that all names are case-sensitive. Dummies are for debugging purposes only.
 """

+from src.path import root
 import pdb
 import traceback
 import argparse
@@ -97,7 +98,7 @@ def run_benchmark(
     parser.add_argument(
         "--output_dir",
         type=str,
-        default="./output/benchmark_results",
+        default=f"{root}/output/benchmark_results",
         required=False,
     )
     args, unknownargs = parser.parse_known_args()
21 changes: 11 additions & 10 deletions src/abstractions/backends.py
@@ -1,4 +1,5 @@
 # Edit flashinfer cascade.py to make it compatible with Python 3.8
+from src.path import root
 import os

 path = os.path.join(
@@ -31,16 +32,16 @@
 import random

 # create output directories
-os.makedirs("./output/benchmark_results", exist_ok=True)
-os.makedirs("./output/datasets", exist_ok=True)
-os.makedirs("./output/evaluation_results", exist_ok=True)
-os.makedirs("./output/inference_results", exist_ok=True)
-os.makedirs("./output/training_results", exist_ok=True)
-os.makedirs("./output/rlhf_results", exist_ok=True)
-os.makedirs("./output/merged_lora_results", exist_ok=True)
-os.makedirs("./output/saved/saved_model/", exist_ok=True)
-os.makedirs("./output/saved/saved_data/", exist_ok=True)
-os.makedirs("./output/downloaded", exist_ok=True)
+os.makedirs(f"{root}/output/benchmark_results", exist_ok=True)
+os.makedirs(f"{root}/output/datasets", exist_ok=True)
+os.makedirs(f"{root}/output/evaluation_results", exist_ok=True)
+os.makedirs(f"{root}/output/inference_results", exist_ok=True)
+os.makedirs(f"{root}/output/training_results", exist_ok=True)
+os.makedirs(f"{root}/output/rlhf_results", exist_ok=True)
+os.makedirs(f"{root}/output/merged_lora_results", exist_ok=True)
+os.makedirs(f"{root}/output/saved/saved_model/", exist_ok=True)
+os.makedirs(f"{root}/output/saved/saved_data/", exist_ok=True)
+os.makedirs(f"{root}/output/downloaded", exist_ok=True)

 random.seed(time.time())
 MY_USERNAME = pwd.getpwuid(os.getuid()).pw_name
27 changes: 26 additions & 1 deletion src/abstractions/configs/templates_configs.py
@@ -1,5 +1,7 @@
+from src.path import root
 from string import Template
 import json
+import os
 from typing import Dict, Any, Literal, Optional, List, Union

 bash_command_template = """PYTHONNOUSERSITE=1 MASTER_PORT=9902 conda run --no-capture-output -n %s deepspeed %s --master_port=9902 ./libs/llama_factory/src/train_bash.py \\
@@ -121,8 +123,31 @@
 """
 )

-with open("./src/abstractions/configs/abstractions_config.json", "r") as config_file:
+with open(f"{root}/src/abstractions/configs/abstractions_config.json", "r") as config_file:
     abstractions_config = json.load(config_file)

 data_search_paths: List[str] = abstractions_config["data_search_paths"]
 data_save_path: str = abstractions_config["data_save_path"]
+
+if not os.path.exists(data_save_path):
+    data_save_path = f"{root}/" + data_save_path
+    if not os.path.exists(data_save_path):
+        raise FileNotFoundError(f"Data save path {data_save_path} doesn't exist.")
+
+for i, path in enumerate(data_search_paths):
+    if not os.path.exists(path):
+        data_search_paths[i] = f"{root}/" + path
+
+model_search_paths: List[str] = abstractions_config["model_search_paths"]
+model_save_path: str = abstractions_config["model_save_path"]
+
+if not os.path.exists(model_save_path):
+    model_save_path = f"{root}/" + model_save_path
+    if not os.path.exists(model_save_path):
+        raise FileNotFoundError(f"Model save path {model_save_path} doesn't exist.")
+
+for i, path in enumerate(model_search_paths):
+    if not os.path.exists(path):
+        model_search_paths[i] = f"{root}/" + path
+
+multinode_master_addr: str = abstractions_config["multinode_master_addr"]
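The added blocks above give configuration paths a working-directory-independent fallback: any entry of abstractions_config.json that does not resolve as written is retried relative to the repository root. A compact sketch of the same idea as a reusable helper; the helper name is hypothetical, only the behaviour mirrors the code above:

    import os
    from src.path import root  # assumed to expose the absolute repository root

    def resolve_repo_path(path: str) -> str:
        # Return the path unchanged if it exists; otherwise retry it relative to the repo root.
        if os.path.exists(path):
            return path
        candidate = f"{root}/" + path
        if not os.path.exists(candidate):
            raise FileNotFoundError(f"Path {path} doesn't exist (also tried {candidate}).")
        return candidate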
(Diff view truncated; the remaining changed files are not shown here.)
