# predict_FlanT5.py
import os

# An empty CURL_CA_BUNDLE disables SSL certificate verification so the
# runtime pip installs below can succeed behind an intercepting proxy.
os.environ['CURL_CA_BUNDLE'] = ''
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install deepspeed xformers==0.0.16 peft")
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install ninja accelerate --upgrade")
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install datasets evaluate dill==0.3.4")
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install -U transformers huggingface-hub tokenizers triton")
os.system("pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install requests==2.27.1 numpy==1.20.3")
import datasets
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Launch with: deepspeed --num_gpus=2 predict_FlanT5.py
# Generation setup adapted from https://github.com/tloen/alpaca-lora/blob/main/generate.py
def predict_text(model_id, reports, lora_weights, savename, test_epoch):
    # The base model path is taken from the LoRA adapter config, so model_id
    # is kept only for reference.
    config = PeftConfig.from_pretrained(lora_weights)
    model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path,
                                                  load_in_8bit=False, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    # Load the LoRA adapter, then merge its weights into the base model so
    # inference runs without the PEFT wrapper.
    model = PeftModel.from_pretrained(model, lora_weights, device_map="auto")
    model = model.merge_and_unload()
    model.eval()
    encoder_max_length = 1024
    decoder_max_length = 512
    # Batched map function: generate an impression for each findings report.
    def generate_summary(batch):
        # The T5 tokenizer appends the EOS token to each input automatically.
        inputs = tokenizer(batch["findings_info"], padding="max_length", truncation=True,
                           max_length=encoder_max_length, return_tensors="pt")
        input_ids = inputs.input_ids.to("cuda")
        # Passing the full encoding via model.generate(**inputs) works as well.
        outputs = model.generate(input_ids,
                                 max_new_tokens=decoder_max_length,
                                 num_beam_groups=1,
                                 num_beams=4,
                                 do_sample=False,
                                 diversity_penalty=0.0,
                                 num_return_sequences=1,
                                 length_penalty=2.0,
                                 no_repeat_ngram_size=3,
                                 early_stopping=True)
        # Decode, dropping all special tokens from the generated ids.
        output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        batch["pred"] = output_str
        return batch
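    # datasets.Dataset.map with batched=True passes generate_summary a dict of
    # lists containing batch_size examples at a time.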
    test_data = datasets.Dataset.from_pandas(reports)
    results = test_data.map(generate_summary, batched=True, batch_size=4)
    pred_str = results["pred"]
    reports['AI_impression'] = pred_str
    # Save the predictions alongside the original reports.
    save_pred_dir = f'{savename}'
    os.makedirs(save_pred_dir, exist_ok=True)
    reports.to_excel(f'{save_pred_dir}/test_{test_epoch}.xlsx', index=False)
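
# The `evaluate` package is installed above but never used. Below is a minimal
# sketch (not part of the original pipeline; the helper name and default
# columns are illustrative) of how a saved prediction file could be scored
# against the reference impressions with ROUGE.
def score_predictions(pred_file, pred_col='AI_impression', ref_col='impressions'):
    import evaluate  # assumed available via the pip installs at the top
    df_pred = pd.read_excel(pred_file)
    rouge = evaluate.load('rouge')
    return rouge.compute(predictions=df_pred[pred_col].astype(str).tolist(),
                         references=df_pred[ref_col].astype(str).tolist())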
def filter_test_data(df):
    # Keep exams from 2018-2023 (the most recent years in the dataset).
    exam_date = df['Exam Date Time']
    exam_date_new = []
    for ii in tqdm(range(len(exam_date))):
        if '2018' in exam_date[ii] or '2019' in exam_date[ii] or '2020' in exam_date[ii] \
                or '2021' in exam_date[ii] or '2022' in exam_date[ii] or '2023' in exam_date[ii]:
            exam_date_new.append(exam_date[ii])
        else:
            exam_date_new.append('Remove')
    df['Exam Date Time'] = exam_date_new
    # Drop rows whose exam date falls outside the selected years.
    df = df[df['Exam Date Time'] != 'Remove'].reset_index(drop=True)
    df = df[df['Study Description'] == 'PET CT WHOLE BODY'].reset_index(drop=True)
    # Subsample a fixed, reproducible test set.
    df = df.sample(n=1000, random_state=716).reset_index(drop=True)
    return df
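
# A vectorized sketch equivalent to the year loop in filter_test_data,
# assuming 'Exam Date Time' is a string column (illustrative only; not
# called below):
def filter_test_data_vectorized(df):
    year_pattern = '|'.join(str(y) for y in range(2018, 2024))
    df = df[df['Exam Date Time'].astype(str).str.contains(year_pattern)].reset_index(drop=True)
    df = df[df['Study Description'] == 'PET CT WHOLE BODY'].reset_index(drop=True)
    return df.sample(n=1000, random_state=716).reset_index(drop=True)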
if __name__ == '__main__':
    # Testing
    df = pd.read_excel('./archive/test.xlsx')
    #df = filter_test_data(df)
    # Flatten newlines so each report is a single line of text.
    df['impressions'] = df['impressions'].apply(lambda x: x.replace('\n', ' '))
    df['findings_info'] = df['findings_info'].apply(lambda x: x.replace('\n', ' '))
    text_column = "findings_info"   # column holding the input text
    summary_column = "impressions"  # column holding the reference summaries
    model_id = "google/flan-t5-large"  # Hugging Face model id
    savename = 'flan_t5_large_lora'
    # One saved checkpoint every 525 training steps; index by epoch.
    steps = [525, 1050, 1575, 2100, 2625, 3150, 3675, 4200, 4725, 5250, 5775,
             6300, 6825, 7350, 7875, 8400, 8925, 9450, 9975, 10500]
    for test_epoch in [8]:
        lora_weights = f"my-flan-t5-large/checkpoint-{steps[test_epoch]}"
        predict_text(model_id, df, lora_weights, savename, test_epoch)
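        # Example use of the hypothetical score_predictions helper above:
        # print(score_predictions(f'{savename}/test_{test_epoch}.xlsx'))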