# MakeModel.py
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import compute_class_weight

# Make bert_sklearn importable from the parent directory
sys.path.append("../")
from bert_sklearn import BertTokenClassifier
# A function that flattens a 2-D list
# into a 1-D list. The source of the function
# is: https://github.com/charles9n/bert-sklearn
# Passing parameters:
# l: a 2-D list
# Return values:
# A 1-D list
def flatten(l):
    return [item for sublist in l for item in sublist]
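# For example: flatten([["a", "b"], ["c"]]) == ["a", "b", "c"]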
# A function that computes the class weights
# The source of the function is:
# https://github.com/junwang4/causal-language-use-in-science
# Passing parameters:
# labels: A list of labels
# Return values:
# class_weight: The weights for each class
def get_class_weight(labels):
    # Keyword arguments are required by scikit-learn >= 0.24
    class_weight = list(compute_class_weight("balanced", classes=np.unique(labels), y=labels))
    print('- auto-computed class weight:', class_weight)
    return class_weight
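# "balanced" weights are n_samples / (n_classes * count(class)), so rarer
# classes get larger weights. For example, get_class_weight(['O', 'O', 'B-C'])
# returns [1.5, 0.75] (classes sorted as ['B-C', 'O'] by np.unique).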
# The __main__ guard is required for BERT to train
# correctly on Windows (multiprocessing needs a safe entry point)
if __name__ == "__main__":
    # The number of epochs
    num_epochs = 15
    # The learning rate
    lr = 3e-5
    # The number of MLP hidden layers
    # set to 0 for linear
    num_mlp = 1
    # Initialize the paths to the data folder
    # and the BioBERT-large folder
    cwd = os.getcwd()
    data_folder = Path(cwd) / "Data"
    model_folder = Path(cwd) / "Models" / "biobert-large"
    bert_large = model_folder / "biobert-large-cased"
    bert_large_config = model_folder / "biobert-large-cased-config.json"
    bert_large_vocab = model_folder / "biobert-large-cased-vocab.txt"
    output_folder = Path(cwd) / "outputs"
    output_folder.mkdir(exist_ok=True)
    # The models to be used in experiments;
    # only BioBERT-large is active in this run
    model_list = [
        # 'scibert-scivocab-cased',
        # 'biobert-base-cased',
        # 'biobert-v1.0-pubmed-pmc-base-cased',
        'biobert-large-cased'
    ]
    # Read the data: the file alternates whitespace-delimited
    # token lines and label lines
    tokens = []
    labels = []
    with open(data_folder / "EntityRecognition.txt", "r") as f:
        data = f.readlines()
    for i in range(0, len(data) - 1, 2):
        tokens.append(data[i].strip().split(" "))
        labels.append(data[i + 1].strip().split(" "))
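    # Illustrative example only (the tag semantics come from the tag set below):
    # a pair of lines such as
    #   smoking causes cancer
    #   B-C O B-EF
    # becomes tokens=['smoking', 'causes', 'cancer'], labels=['B-C', 'O', 'B-EF']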
    # The tags of the data (the outside tag 'O' also appears in the labels)
    tags = ['B-C', 'B-CON', 'B-CS', 'B-EF', 'B-ES', 'B-VC', 'B-VE', 'I-CON', 'I-CS', 'I-ES']
    # Get a list of unique labels
    label_list = list(np.unique(flatten(labels)))
    # Convert the data to a dataframe
    data = {'tokens': tokens, 'labels': labels}
    dataset = pd.DataFrame(data=data)
    class_weights = get_class_weight(flatten(dataset["labels"]))
    # Train on the full dataset (no held-out split here)
    X_train, y_train = dataset.tokens, dataset.labels
    model = BertTokenClassifier(
        max_seq_length=512,
        epochs=num_epochs,
        train_batch_size=32,
        gradient_accumulation_steps=8,
        validation_fraction=0,  # no validation split; all data is used for training
        learning_rate=lr,
        bert_model=str(bert_large),
        bert_config_json=str(bert_large_config),
        bert_vocab=str(bert_large_vocab),
        class_weight=class_weights,
        eval_batch_size=32,
        num_mlp_layers=num_mlp,
        num_mlp_hiddens=500,
        label_list=label_list,
        ignore_label=['O']  # exclude the outside tag when scoring token-level f1
    )
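    # Note: following the upstream BERT convention that bert-sklearn adopts,
    # train_batch_size is split across gradient_accumulation_steps micro-batches,
    # so each forward pass sees 32 / 8 = 4 sequences while gradients accumulate
    # to an effective batch of 32.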
    # Train the model and save it to disk
    model.fit(X_train, y_train)
    model.save("bert_large-er")
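    # To reuse the saved model later, bert-sklearn provides load_model, e.g.:
    #   from bert_sklearn import load_model
    #   model = load_model("bert_large-er")
    #   predicted_tags = model.predict(new_token_lists)
    # (new_token_lists is a placeholder for tokenized input in the same format as X_train)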