# finetune_FlanT5.py
import os

# Disable SSL certificate verification for HTTP requests (often needed behind
# proxies that intercept TLS); the --trusted-host flags below do the same for pip.
os.environ['CURL_CA_BUNDLE'] = ''

# Install dependencies at runtime.
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install deepspeed xformers==0.0.16 peft")
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install ninja accelerate --upgrade")
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install datasets evaluate")
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install -U transformers huggingface-hub tokenizers triton")
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install requests==2.27.1 numpy==1.20.3")
import pandas as pd
import nltk
import torch
import datasets
from transformers import AutoTokenizer
from tqdm import tqdm

nltk.download('punkt')
def preprocess(df_train, text_column, summary_column, model_id, encoder_max_length, decoder_max_length, save_dataset_path):
    # Hold out 2,000 rows for validation and keep only the text and summary columns.
    df_val = df_train.sample(2000, random_state=42)
    df_train = df_train.drop(df_val.index)
    df_train = df_train[[text_column, summary_column]].copy()
    df_val = df_val[[text_column, summary_column]].copy()

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def process_data_to_model_inputs(batch):
        # Tokenize the inputs and labels.
        inputs = tokenizer(batch[text_column], padding="max_length", truncation=True, max_length=encoder_max_length)
        outputs = tokenizer(batch[summary_column], padding="max_length", truncation=True, max_length=decoder_max_length)

        batch["input_ids"] = inputs.input_ids
        batch["attention_mask"] = inputs.attention_mask
        batch["decoder_input_ids"] = outputs.input_ids
        batch["decoder_attention_mask"] = outputs.attention_mask
        batch["labels"] = outputs.input_ids.copy()

        # T5 shifts the decoder inputs from the labels automatically, so the labels
        # correspond exactly to `decoder_input_ids`. Replace PAD tokens with -100 so
        # they are ignored by the loss.
        batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]
        return batch

    train_data = datasets.Dataset.from_pandas(df_train)
    train_data = train_data.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=32)
    train_data.set_format(
        type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"])

    val_data = datasets.Dataset.from_pandas(df_val)
    val_data = val_data.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=32)
    val_data.set_format(
        type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"])

    # Persist the processed splits so the training script can load them from disk.
    train_data.save_to_disk(os.path.join(save_dataset_path, "train"))
    val_data.save_to_disk(os.path.join(save_dataset_path, "eval"))
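
# Illustrative only (not part of the original pipeline): the splits saved above can
# later be reloaded with datasets.load_from_disk, e.g.:
#
#   from datasets import load_from_disk
#   train_data = load_from_disk(os.path.join("data", "train"))
#   val_data = load_from_disk(os.path.join("data", "eval"))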
if __name__ == "__main__":
    # Load the data.
    df_train = pd.read_excel('./archive/train.xlsx')

    # Clean the data: strip newlines from both columns.
    df_train['impressions'] = df_train['impressions'].apply(lambda x: x.replace('\n', ' '))
    df_train['findings_info'] = df_train['findings_info'].apply(lambda x: x.replace('\n', ' '))

    text_column = "findings_info"       # column containing the input text
    summary_column = "impressions"      # column containing the target summaries
    model_id = "google/flan-t5-large"   # Hugging Face model id
    encoder_max_length = 1024           # max length of the input text
    decoder_max_length = 512            # max length of the output text
    save_dataset_path = "data"          # local path for the processed dataset

    # Uncomment to (re)build the tokenized dataset. If encoder_max_length changes
    # (e.g. to 2048), the preprocess step must be rerun.
    # preprocess(df_train, text_column, summary_column, model_id, encoder_max_length, decoder_max_length, save_dataset_path)

    # Launch LoRA fine-tuning with DeepSpeed (single GPU, bf16, ZeRO stage 3 config).
    os.system(f'deepspeed --num_gpus=1 run_seq2seq_lora.py \
        --model_id {model_id} \
        --dataset_path {save_dataset_path} \
        --epochs 15 \
        --per_device_train_batch_size 64 \
        --lr 1e-3 \
        --max_position_embedding {encoder_max_length} \
        --generation_max_length {decoder_max_length} \
        --bf16 True \
        --master_port=25641 \
        --deepspeed configs/ds_flan_t5_z3_config_bf16.json')
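
# Note: run_seq2seq_lora.py and configs/ds_flan_t5_z3_config_bf16.json are not part of
# this file. As a minimal sketch only (an assumption, not the actual training script),
# such a script might attach LoRA adapters to Flan-T5 with peft roughly like this:
#
#   from transformers import AutoModelForSeq2SeqLM
#   from peft import LoraConfig, get_peft_model, TaskType
#
#   model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
#   lora_config = LoraConfig(
#       task_type=TaskType.SEQ_2_SEQ_LM,   # sequence-to-sequence LM task
#       r=16, lora_alpha=32, lora_dropout=0.05,
#       target_modules=["q", "v"],         # T5 attention query/value projections
#   )
#   model = get_peft_model(model, lora_config)
#   model.print_trainable_parameters()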