From f939b39c55a6d6d9942f6fab4b6bb51eb61f888b Mon Sep 17 00:00:00 2001 From: MenuaB Date: Wed, 27 Mar 2024 16:29:38 +0400 Subject: [PATCH] pass tokenizer path through args, add optim lr scheduler to config, add lr argpars --- chemlactica/config/default_train_config.py | 2 ++ chemlactica/custom_trainer.py | 7 +++++-- chemlactica/get_trainer.py | 8 +++----- chemlactica/train.py | 11 +++++++---- chemlactica/utils/parseargs.py | 9 +++++++++ submit_run_galactica_pre.py | 5 +++-- 6 files changed, 29 insertions(+), 13 deletions(-) diff --git a/chemlactica/config/default_train_config.py b/chemlactica/config/default_train_config.py index 1966300..1de91a0 100644 --- a/chemlactica/config/default_train_config.py +++ b/chemlactica/config/default_train_config.py @@ -21,6 +21,8 @@ class TrainConfig: max_learning_rate: float = 6.0e-4 warmup_steps: int = 500 weight_decay: float = 0.1 + optimizer: str = "adamw_torch" + lr_scheduler_type: str = "linear" @dataclass diff --git a/chemlactica/custom_trainer.py b/chemlactica/custom_trainer.py index ec24023..73e0136 100644 --- a/chemlactica/custom_trainer.py +++ b/chemlactica/custom_trainer.py @@ -27,14 +27,17 @@ class CustomArguments(TrainingArguments): ) command: str = field(default=None) experiment_name: str = field(default=None) + tokenizer_path: str = field( + default="/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/ChemLacticaTokenizer66" + ) # train_config: dict = field(default=None) class CustomTrainer(Trainer): - def __init__(self, tokenizer_path, *args, **kwargs): + def __init__(self, *args, **kwargs): # the number of samples to print when the training begins, for debugging purposes self.num_samples_to_print = 5 - self.tokenizer_path = tokenizer_path + self.tokenizer_path = kwargs["args"].tokenizer_path super().__init__(*args, **kwargs) def training_step(self, model: Module, inputs: Dict[str, Tensor | Any]) -> Tensor: diff --git a/chemlactica/get_trainer.py b/chemlactica/get_trainer.py index b3bd11d..1677793 100644 --- 
a/chemlactica/get_trainer.py +++ b/chemlactica/get_trainer.py @@ -7,13 +7,11 @@ from config.default_train_config import SFTTrainConfig -def get_trainer( - train_type, model, model_config, dataset, training_args, evaluate_only, slurm_eval -): +def get_trainer(train_type, model, dataset, training_args, evaluate_only, slurm_eval): if train_type == "pretrain": trainer = CustomTrainer( model=model, - tokenizer_path=model_config.tokenizer_path, + # tokenizer_path=model_config.tokenizer_path, args=training_args, # compute_metrics=compute_metrics, train_dataset=dataset["train"] if not evaluate_only else None, @@ -26,7 +24,7 @@ def get_trainer( elif train_type == "sft": sft_config = SFTTrainConfig() - tokenizer = get_tokenizer(model_config.tokenizer_path) + tokenizer = get_tokenizer(training_args.tokenizer_path) response_template = "[PROPERTY]activity " collator = DataCollatorForCompletionOnlyLM( response_template, tokenizer=tokenizer diff --git a/chemlactica/train.py b/chemlactica/train.py index cd2b482..be0f804 100644 --- a/chemlactica/train.py +++ b/chemlactica/train.py @@ -64,6 +64,7 @@ def train( training_data_dirs, dir_data_types, valid_data_dir, + learning_rate, scheduler_max_steps, eval_steps, save_steps, @@ -218,6 +219,7 @@ def train( slurm_eval=slurm_eval, experiment_name=experiment_name, # train_config=train_config, + tokenizer_path=model_config.tokenizer_path, do_train=not evaluate_only, output_dir=checkpoints_dir, per_device_train_batch_size=train_batch_size, @@ -228,7 +230,9 @@ def train( bf16_full_eval=True, fp16=False, logging_dir=track_dir, - learning_rate=train_config.max_learning_rate, + learning_rate=learning_rate + if learning_rate + else train_config.max_learning_rate, weight_decay=train_config.weight_decay, adam_beta1=train_config.adam_beta1, adam_beta2=train_config.adam_beta2, @@ -251,8 +255,8 @@ def train( # gradient_accumulation_steps=gradient_accumulation_steps, # save_total_limit=4, in order for offline eval to work, we keep all of them for now 
resume_from_checkpoint=resume_from_checkpoint, - lr_scheduler_type="linear", - optim="adamw_torch", + lr_scheduler_type=train_config.lr_scheduler_type, + optim=train_config.optimizer, # load_best_model=True ) @@ -271,7 +275,6 @@ def train( trainer = get_trainer( train_type, model, - model_config, dataset, training_args, evaluate_only, diff --git a/chemlactica/utils/parseargs.py b/chemlactica/utils/parseargs.py index a28d7f6..59073a1 100644 --- a/chemlactica/utils/parseargs.py +++ b/chemlactica/utils/parseargs.py @@ -53,6 +53,15 @@ def init_parser(): required=True, help="path to directory containing validation data", ) + parser.add_argument( + "--learning_rate", + type=float, + metavar="LR", + dest="learning_rate", + required=False, + default=None, + help="learning rate", + ) parser.add_argument( "--max_steps", type=int, diff --git a/submit_run_galactica_pre.py b/submit_run_galactica_pre.py index b420ee8..9421164 100644 --- a/submit_run_galactica_pre.py +++ b/submit_run_galactica_pre.py @@ -5,7 +5,7 @@ use_accelerate = True rsync_enabled = False -executor_name = "local" # options are ["slurm", "local"] +executor_name = "slurm" # options are ["slurm", "local"] root_path = "" num_gpus = 2 model_name = "galactica" @@ -40,9 +40,10 @@ "valid_data_dir": "/nfs/ap/mnt/sxtn/rdkit_computed_rel+form/valid_rdkit_computed_rel+form", "max_steps": 120000, # "num_train_epochs": 15, + # "learning_rate": 5, "eval_steps": 1000, "save_steps": 1000, - "train_batch_size": 16, + "train_batch_size": 2, # "valid_batch_size": 16, "dataloader_num_workers": 30, "experiment_name": "freesolv_30e",