# run_seq2seq_IA3.py
import os
import argparse

import numpy as np
import torch
import nltk
import evaluate
import urllib3
from datasets import load_from_disk
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    set_seed,
)
from peft import (
    IA3Config,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)

urllib3.disable_warnings()
nltk.download("punkt", quiet=True)
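
# Example launch (illustrative; paths and flag values are not prescribed by this script):
#   python run_seq2seq_IA3.py --model_id google/flan-t5-xl --dataset_path data --epochs 15
# --dataset_path must point at a directory containing "train" and "eval" splits written with
# datasets' save_to_disk(), since they are read back below via load_from_disk().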
def parse_args():
    """Parse the command-line arguments."""
    parser = argparse.ArgumentParser()
    # model id and dataset path
    parser.add_argument("--model_id", type=str, default="google/flan-t5-xl", help="Model id to use for training.")
    parser.add_argument("--dataset_path", type=str, default="data", help="Path to the already processed dataset.")
    # training hyperparameters: epochs, batch size, learning rate, and seed
    parser.add_argument("--epochs", type=int, default=15, help="Number of epochs to train for.")
    parser.add_argument("--per_device_train_batch_size", type=int, default=1, help="Batch size to use for training.")
    parser.add_argument("--per_device_eval_batch_size", type=int, default=1, help="Batch size to use for evaluation.")
    parser.add_argument("--max_position_embedding", type=int, default=1024, help="Maximum input sequence length.")
    parser.add_argument("--generation_max_length", type=int, default=512, help="Maximum length to use for generation.")
    parser.add_argument("--generation_num_beams", type=int, default=4, help="Number of beams to use for generation.")
    parser.add_argument("--lr", type=float, default=2e-4, help="Learning rate to use for training.")
    parser.add_argument("--seed", type=int, default=42, help="Seed to use for training.")
    parser.add_argument("--deepspeed", type=str, default=None, help="Path to deepspeed config file.")
    # Note: with type=bool, any non-empty value (even "False") parses as True; omit the flag to keep the default.
    parser.add_argument("--gradient_checkpointing", type=bool, default=True, help="Whether to use gradient checkpointing.")
    parser.add_argument(
        "--bf16",
        type=bool,
        default=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
        help="Whether to use bf16.",
    )
    args = parser.parse_known_args()
    return args

def training_function(args):
    # set seed
    set_seed(args.seed)
    # load dataset from disk and tokenizer
    train_dataset = load_from_disk(os.path.join(args.dataset_path, "train"))
    eval_dataset = load_from_disk(os.path.join(args.dataset_path, "eval"))
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
    # save our tokenizer alongside the adapter so the output directory is self-contained
    output_dir = "my-" + args.model_id.split("/")[-1] + "-IA3"
    tokenizer.save_pretrained(output_dir)
    # load model from the hub
    model = AutoModelForSeq2SeqLM.from_pretrained(
        args.model_id,
        use_cache=False if args.gradient_checkpointing else True,  # this is needed for gradient checkpointing
    )
    model.config.n_positions = args.max_position_embedding
    # config = IA3Config(task_type="SEQ_2_SEQ_LM", target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"])
    config = IA3Config(task_type="SEQ_2_SEQ_LM", inference_mode=False, feedforward_modules=[])
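    # (IA)^3 learns small per-activation scaling vectors instead of full weight updates. With
    # target_modules left unset, peft falls back to its built-in defaults for the model
    # architecture (for T5-style models, typically the key/value projections and the feed-forward
    # output projection); the commented-out line above shows how the targeted modules could be
    # listed explicitly instead. feedforward_modules=[] marks none of them as feed-forward layers.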
    # https://github.com/huggingface/peft/blob/c33c42f1582b13e9b14b6b9467eff9275164ea36/examples/conditional_generation/
    # prepare int-8 model for training
    # model = prepare_model_for_int8_training(model)
    # make the (frozen) input embeddings produce gradients so gradient checkpointing works with PEFT
    model.enable_input_require_grads()
    # add the (IA)^3 adapter
    model = get_peft_model(model, config)
    model.print_trainable_parameters()  # be transparent about the % of trainable params
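    # For flan-t5-xl the printed trainable fraction is typically well under 0.1% of the total
    # parameters, which is the main appeal of (IA)^3 over full fine-tuning.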
    # pad labels with -100 so that padding tokens are ignored by the loss
    label_pad_token_id = -100
    # data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8,
        max_length=args.max_position_embedding,
    )
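    # pad_to_multiple_of=8 keeps padded sequence lengths tensor-core friendly for bf16/fp16 matmuls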
    # load rouge for validation
    rouge = evaluate.load("rouge")

    def compute_metrics(pred):
        labels_ids = pred.label_ids
        pred_ids = pred.predictions
        # decode predictions and labels, dropping special tokens
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        labels_ids[labels_ids == -100] = tokenizer.pad_token_id
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rougeL"])["rougeL"]
        return {"rougeL": round(rouge_output, 4)}
    # accumulate so the effective global batch size is 64 (per_device * n_gpus * accum; radiology-GPT used 128)
    gradient_accumulation_steps = 64 // torch.cuda.device_count() // args.per_device_train_batch_size
    # Define training args
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        do_train=True,
        do_eval=False,
        evaluation_strategy="no",
        load_best_model_at_end=False,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=False,  # T5 overflows with fp16
        bf16=args.bf16,  # Use BF16 if available
        deepspeed=args.deepspeed,
        learning_rate=args.lr,
        num_train_epochs=args.epochs,
        gradient_checkpointing=args.gradient_checkpointing,
        # logging & checkpointing strategies
        logging_dir=f"{output_dir}/logs",
        logging_strategy="steps",
        logging_steps=128,
        save_strategy="epoch",
        save_total_limit=100,
        report_to="tensorboard",
    )
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # Start training
    trainer.train()
    output_dir_final_model = os.path.join(output_dir, "final_model")
    os.makedirs(output_dir_final_model, exist_ok=True)
    model.save_pretrained(output_dir_final_model)
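    # save_pretrained on a peft model writes only the small (IA)^3 adapter weights, not the base model.
    # One way to reuse them later (illustrative sketch, not part of this script):
    #   from peft import PeftModel
    #   base = AutoModelForSeq2SeqLM.from_pretrained(args.model_id)
    #   model = PeftModel.from_pretrained(base, output_dir_final_model)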

def main():
    args, _ = parse_args()
    training_function(args)


if __name__ == "__main__":
    main()