-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel_1_single_source.py
168 lines (129 loc) · 4.88 KB
/
model_1_single_source.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import pickle
from math import sqrt

import lightgbm
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBClassifier, XGBRegressor

from preprocessing import preprocess_data as preprocess
import utils
class Model1:
    """Load one data source at a time for the prediction tasks.

    This class is used to code the first approach in the presentation
    of using each data source for every challenge.
    """

    def __init__(self):
        pass

    def fetch_face_data(self):
        """Load the "face" data and return it after preprocessing.

        The second value returned by ``utils.load_data_from_csv`` is
        discarded.
        """
        df_face, _ = utils.load_data_from_csv(dtype="face")
        df_face = preprocess(df_face, dtype="face")
        return df_face

    def fetch_text_data(self):
        """Load the "text" data and return it after preprocessing.

        The second value returned by ``utils.load_data_from_csv`` is
        discarded.
        """
        df_text, _ = utils.load_data_from_csv(dtype="text")
        df_text = preprocess(df_text, dtype="text")
        return df_text

    def fetch_relation_data(self):
        """Return the page-like matrix merged with the output frame.

        The relation data is transformed with
        ``utils.get_transformed_relation`` (``min_likes=5``) and then
        outer-merged with the output frame on ``userid``. Missing values
        produced by the merge are replaced by the per-column mean.
        """
        df_relation, df_output = utils.load_data_from_csv(dtype="relation")

        # Getting sparse matrix based on page likes
        df_relation_matrix = utils.get_transformed_relation(df_relation, min_likes=5)
        df_relation_matrix = pd.merge(df_relation_matrix, df_output,
                                      left_on="userid",
                                      right_on="userid",
                                      how="outer")

        # Filling mean values for users with no page likes (among the pages selected).
        # numeric_only=True restricts the means to numeric columns: pandas >= 2.0
        # raises when asked for the mean of a non-numeric column (presumably
        # "userid" is one - TODO confirm), while older versions skipped such
        # columns silently, so this keeps the historical result.
        column_means = df_relation_matrix.mean(numeric_only=True)
        df_relation_matrix = df_relation_matrix.fillna(column_means)
        return df_relation_matrix

    def fetch_node2vec_data(self):
        """Placeholder for node2vec features; currently returns ``""``.

        NOTE(review): the relation data is loaded but not used yet, and no
        node2vec computation is performed in this method.
        """
        df_relation, df_output = utils.load_data_from_csv(dtype="relation")
        df_n2v = ""
        return df_n2v
def build_model_and_evaluate(data, target, classifier="XGB"):
    """Train a classifier on a single data source and score it.

    The data is split 80/20 with a fixed ``random_state`` so results are
    reproducible, the classifier is fitted on the training part and the
    accuracy is computed on the held-out part.

    Args:
        data: Data source to load: "face", "text" or "relation".
        target: Label name forwarded to ``utils.extract_data``.
        classifier: "xgb" (XGBClassifier) or "svm" (SGDClassifier). The
            name is matched case-insensitively so that the default "XGB"
            selects the XGBoost classifier.

    Returns:
        A ``(score, clf)`` tuple: the accuracy on the test split and the
        fitted classifier.

    Raises:
        ValueError: If ``data`` or ``classifier`` is not a supported name.
    """
    # Validate both arguments up front so a typo fails before any data is
    # loaded or preprocessed.
    if data not in ("face", "text", "relation"):
        raise ValueError("Incorrect data format")
    classifier_name = str(classifier).lower()
    if classifier_name not in ("xgb", "svm"):
        raise ValueError("Incorrect classifier")

    model = Model1()
    if data == "face":
        df_X = model.fetch_face_data()
    elif data == "text":
        df_X = model.fetch_text_data()
    else:
        df_X = model.fetch_relation_data()

    X, y = utils.extract_data(df_X, label=target)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=2)

    if classifier_name == "xgb":
        clf = XGBClassifier(n_estimators=200)
    else:
        clf = SGDClassifier()

    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)

    # Return the computed score (not the accuracy_score function) together
    # with the fitted model, which callers unpack and pickle.
    return score, clf
def build_model_and_evaluate_rms(data, regressor="XGB"):
    """Train a multi-output regressor for "personality" and report RMSE.

    The data is split 80/20 with a fixed ``random_state``; one RMSE value
    is computed on the held-out part for each label in
    ``utils.regressor_labels``.

    Args:
        data: Data source to load: "face", "text" or "relation".
        regressor: "xgb", "rf" or "lightgbm". The name is matched
            case-insensitively so that the default "XGB" selects the
            XGBoost regressor. "lasso" is recognised but not implemented.

    Returns:
        A ``(rmse, reg)`` tuple: the list of per-label RMSE values, in the
        order of ``utils.regressor_labels``, and the fitted regressor.

    Raises:
        ValueError: If ``data`` or ``regressor`` is not a supported name.
        NotImplementedError: If ``regressor`` is "lasso".
    """
    # Validate both arguments up front so a typo fails before any data is
    # loaded or preprocessed.
    if data not in ("face", "text", "relation"):
        raise ValueError("Incorrect data format")
    regressor_name = str(regressor).lower()
    if regressor_name == "lasso":
        # The original branch assigned an empty string and then crashed on
        # ``"".fit``; fail with an explicit message instead.
        raise NotImplementedError("The lasso regressor is not implemented yet")
    if regressor_name not in ("xgb", "rf", "lightgbm"):
        raise ValueError("Incorrect classifier")

    model = Model1()
    if data == "face":
        df_X = model.fetch_face_data()
    elif data == "text":
        df_X = model.fetch_text_data()
    else:
        df_X = model.fetch_relation_data()

    X, y = utils.extract_data(df_X, label="personality")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=2)

    if regressor_name == "xgb":
        base_estimator = XGBRegressor(n_estimators=200,
                                      max_depth=2,
                                      objective="reg:squarederror")
    elif regressor_name == "rf":
        base_estimator = RandomForestRegressor(n_estimators=100)
    else:
        base_estimator = lightgbm.LGBMRegressor(objective="regression")

    # MultiOutputRegressor fits one copy of the base estimator per target.
    reg = MultiOutputRegressor(base_estimator)
    reg = reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Calculating RMSE for all personality labels. Column i of y_pred is
    # compared against the y_test column named by the i-th label
    # (assumes y_test is indexable by label name - TODO confirm).
    rmse = [
        sqrt(mean_squared_error(y_test[label], y_pred[:, i]))
        for i, label in enumerate(utils.regressor_labels)
    ]

    # Callers unpack both the metric and the fitted model.
    return rmse, reg
if __name__ == "__main__":
    # Train one model per (data source, task) pair and pickle each fitted
    # model next to the script. The estimator name is passed explicitly in
    # lowercase, which is the spelling the builder functions compare against.

    ## Classification Tasks
    accuracies = {}
    classification_runs = (
        ("face", "age"),
        ("face", "gender"),
        ("text", "age"),
        ("text", "gender"),
        ("relation", "age"),
        ("relation", "gender"),
    )
    for data, target in classification_runs:
        accuracy, clf = build_model_and_evaluate(
            data=data,
            target=target,
            classifier="xgb")
        accuracies[(data, target)] = accuracy
        # The context manager closes the file even if pickling fails.
        with open("{}_{}.pkl".format(data, target), "wb") as model_file:
            pickle.dump(clf, model_file)

    ## Regression Tasks
    rmse_personality = {}
    for data in ("text", "face", "relation"):
        rmse, reg = build_model_and_evaluate_rms(
            data=data,
            regressor="xgb")
        rmse_personality[data] = rmse
        with open("{}_regression.pkl".format(data), "wb") as model_file:
            pickle.dump(reg, model_file)