train_bcms.py
import logging
import pickle

import numpy as np
import pandas as pd
from numpy.linalg import norm
from simpletransformers.classification import MultiLabelClassificationModel
R = 6371  # Earth radius in kilometres


def evaluate(c1, c2, scale_km=True):
    # Haversine distance between coordinate arrays of shape (n, 2),
    # where column 0 is latitude and column 1 is longitude, in degrees.
    d = np.radians(c2 - c1)
    a = np.sin(d[:, 0] / 2) ** 2 + np.cos(np.radians(c1[:, 0])) * np.cos(np.radians(c2[:, 0])) * np.sin(d[:, 1] / 2) ** 2
    d = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    if scale_km:
        return R * d
    else:
        return d
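# Illustrative check (not in the original script): evaluate(np.array([[0.0, 0.0]]),
# np.array([[0.0, 90.0]])) is ~10007.5 km, a quarter of the Earth's circumference.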


def mean_dist(a, b):
    # Mean great-circle error in km between scaled predictions and scaled gold labels.
    global scl
    a_tr = scl.inverse_transform(a)
    b_tr = scl.inverse_transform(b)
    d = evaluate(a_tr, b_tr)
    return np.mean(d)


def median_dist(a, b):
    # Median great-circle error in km between scaled predictions and scaled gold labels.
    global scl
    a_tr = scl.inverse_transform(a)
    b_tr = scl.inverse_transform(b)
    d = evaluate(a_tr, b_tr)
    return np.median(d)


class GlobalScaler():
    # Standardises coordinates with per-dimension means and a single shared
    # standard deviation, so the latitude/longitude aspect ratio is preserved.
    def __init__(self):
        self.means = None
        self.stddev = None

    def fit_transform(self, data):
        self.means = np.mean(data, axis=0)
        centereddata = data - self.means
        self.stddev = np.std(centereddata)
        return centereddata / self.stddev

    def transform(self, data):
        return (data - self.means) / self.stddev

    def inverse_transform(self, data):
        return (data * self.stddev) + self.means


def load_data(path, size=-1):
    # Read up to `size` tab-separated lines of "coordinate<TAB>coordinate<TAB>text"
    # and return a list of (text, (x, y)) pairs; size=-1 reads the whole file.
    data = []
    for line in open(path):
        x, y, text = line.strip().split('\t')
        x, y = float(x), float(y)
        data.append((text, (x, y)))
        if len(data) == size:
            break
    return data
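# Note: evaluate() indexes coordinate column 0 as latitude and column 1 as
# longitude, so each input line is assumed to be "lat<TAB>lon<TAB>text".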


# Preparing train and dev data
train_data = load_data('bcms/train.txt')
dev_data = load_data('bcms/dev.txt')

scl = GlobalScaler()
train_y = scl.fit_transform([e[1] for e in train_data])
with open('bcms.scaler', 'wb') as f:
    pickle.dump(scl, f)
dev_y = scl.transform([e[1] for e in dev_data])

train_df = pd.DataFrame(zip([e[0] for e in train_data], train_y))
train_df.columns = ["text", "labels"]
dev_df = pd.DataFrame(zip([e[0] for e in dev_data], dev_y))
dev_df.columns = ["text", "labels"]
model_args = {
    "regression": True,
    "num_train_epochs": 10,
    "overwrite_output_dir": True,
    "best_model_dir": "bcms_output/best_model",
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 0,
    "evaluate_during_training_verbose": True,
    "early_stopping_metric": "median_dist",
    "early_stopping_metric_minimize": True,
    "output_dir": 'bcms_output',
    "do_lower_case": True,
    "save_steps": 0,
    "learning_rate": 5e-4,
    "train_batch_size": 128,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
}
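# "median_dist" above has to match the name of the extra metric keyword passed to
# train_model() below, since custom evaluation metrics are supplied by name.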
model = MultiLabelClassificationModel(
    "electra",
    "classla/bcms-bertic",
    num_labels=2,
    loss_fct="MAELoss",
    args=model_args,
)
model.train_model(train_df, eval_df=dev_df, median_dist=median_dist)
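
# A minimal post-training sketch (an addition, not part of the original script),
# assuming simpletransformers' standard predict() interface: score the dev set
# in kilometres with the freshly trained model.
preds, _ = model.predict([e[0] for e in dev_data])
pred_coords = scl.inverse_transform(np.array(preds))
gold_coords = np.array([e[1] for e in dev_data])
print('dev median distance (km):', np.median(evaluate(gold_coords, pred_coords)))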