### IMPORTS ###
# web (db and server) imports
from flask import Flask, render_template, request, url_for, jsonify, make_response
import pymysql
from pymongo import MongoClient
import urllib
# machine learning imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import time
# helpers
from collections import Counter
from datetime import datetime
# pickle is needed to store binary model files in MongoDB
import pickle
# json/bson handling
import json
from bson import ObjectId
from bson.decimal128 import Decimal128
from math import floor
import credentials
### HELPER FUNCTIONS ###

# Encoder that converts MongoDB ObjectIds to strings so query results can be
# serialised to JSON and rendered in the browser.
class JSONEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, ObjectId):
            return str(o)
        return json.JSONEncoder.default(self, o)
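# Illustrative usage (a sketch, not executed at import): a plain json.dumps
# would raise a TypeError on an ObjectId, whereas
#   JSONEncoder().encode({"_id": ObjectId(), "f1score": 0.9})
# returns an ordinary JSON string with the id rendered as text.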
# function to get the current time as a string
def get_current_time():
    # datetime object containing current date and time
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    return dt_string
username = credentials.USERNAME
password = credentials.MYSQL_PASSWORD
mongo_db_password = credentials.MONGO_PASSWORD
mongo_db_host = credentials.MONGO_DB_HOST

### MYSQL CONNECTION ###
# Connect to the MySQL database
connection = pymysql.connect(host=credentials.HOST_NAME,
                             user=username,
                             password=password,
                             db=credentials.DB_NAME,
                             charset='utf8mb4')
### MONGODB CONNECTION ###
# note: the URI option is camelCase "authSource", not "auth_source"
client = MongoClient(f'mongodb://{username}:{mongo_db_password}@{mongo_db_host}:27017/{username}?authSource={username}', ssl=True)
print('Connected to the MongoDB database :)')
# get a handle to our database
mdb = client.c2075016
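# Optional startup check (a sketch; 'ping' is a standard MongoDB admin
# command): uncomment to fail fast if the credentials or host are wrong.
# client.admin.command('ping')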
# maps the numeric labels to human-readable categories
category_map = {0: "not covid", 1: "covid"}

app = Flask(__name__)
# Adapted from: https://stackoverflow.com/questions/14853694/python-jsonify-dictionary-in-utf-8
# This allows emojis and special characters to be rendered in the browser when a JSON is returned.
app.config['JSON_AS_ASCII'] = False
# Drops the training/test views so they can be rebuilt from scratch.
def reset_views():
    print("Dropping views...")
    with connection.cursor() as cur:
        q = """DROP VIEW IF EXISTS training_set;"""
        cur.execute(q)
        q = """DROP VIEW IF EXISTS test_set;"""
        cur.execute(q)
    connection.commit()
    print('Successfully dropped training_set and test_set views')
# Creates a stored SQL function that labels a post as covid-related (1) if its
# subreddit name or description mentions covid, corona or lockdown, else 0.
def create_labelling_function():
    print("Creating labelling function...")
    with connection.cursor() as cur:
        q = """DROP FUNCTION IF EXISTS assign_label;"""
        cur.execute(q)
        q = """CREATE FUNCTION assign_label(ID_of_post INT)
        RETURNS INT
        DETERMINISTIC
        BEGIN
            DECLARE label INT;
            SET @subreddit_name = (SELECT subr_name FROM posts INNER JOIN subreddits ON subreddits.subr_ID=posts.subreddit_ID WHERE post_ID=ID_of_post);
            SET @subreddit_description = (SELECT subr_description FROM posts INNER JOIN subreddits ON subreddits.subr_ID=posts.subreddit_ID WHERE post_ID=ID_of_post);
            IF @subreddit_name LIKE '%covid%' OR @subreddit_name LIKE '%corona%' OR @subreddit_name LIKE '%lockdown%'
               OR @subreddit_description LIKE '%covid%' OR @subreddit_description LIKE '%corona%' OR @subreddit_description LIKE '%lockdown%'
            THEN
                SET label = 1;
            ELSE
                SET label = 0;
            END IF;
            RETURN label;
        END;"""
        cur.execute(q)
    print("Successfully created labelling function")
# Builds the training_set view: per class, the oldest ~80% of posts.
def create_training_view():
    print("Creating training view...")
    with connection.cursor() as cur:
        q = """SELECT COUNT(*) FROM posts;"""
        cur.execute(q)
        total_posts = cur.fetchone()[0]
        print('Total posts:', total_posts)
        q = """SELECT COUNT(*) FROM posts WHERE assign_label(post_ID)=1;"""
        cur.execute(q)
        total_covid_posts = cur.fetchone()[0]
        total_non_covid_posts = total_posts - total_covid_posts
        # 80/20 train/test split, stratified by class
        train_test_proportion = 4 / 5
        train_covid_posts = floor(train_test_proportion * total_covid_posts)
        train_non_covid_posts = floor(train_test_proportion * total_non_covid_posts)
        q = """CREATE VIEW training_set AS
        (SELECT CONCAT_WS('\n', title, selftext) AS full_text, assign_label(post_ID) AS label FROM posts
        WHERE assign_label(post_ID)=1
        ORDER BY posted_at ASC, full_text ASC
        LIMIT {0})
        UNION ALL
        (SELECT CONCAT_WS('\n', title, selftext) AS full_text, assign_label(post_ID) AS label FROM posts
        WHERE assign_label(post_ID)=0
        ORDER BY posted_at ASC, full_text ASC
        LIMIT {1});
        """
        # the LIMIT values are integers computed above, so .format() is safe here
        cur.execute(q.format(train_covid_posts, train_non_covid_posts))
    connection.commit()
    print('Successfully created training set')
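# Note on the split: per class, the training view takes the oldest ~80% of
# posts (ORDER BY posted_at ASC ... LIMIT), while the test view below takes
# the newest posts (ORDER BY ... DESC). Since the train and test counts sum to
# the class total, the two views are disjoint and together cover every post.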
# Builds the test_set view: per class, the newest ~20% of posts, i.e. the
# complement of the training view.
def create_test_view():
    print("Creating test view...")
    with connection.cursor() as cur:
        q = """SELECT COUNT(*) FROM posts;"""
        cur.execute(q)
        total_posts = cur.fetchone()[0]
        q = """SELECT COUNT(*) FROM posts WHERE assign_label(post_ID)=1;"""
        cur.execute(q)
        total_covid_posts = cur.fetchone()[0]
        total_non_covid_posts = total_posts - total_covid_posts
        train_test_proportion = 4 / 5
        train_covid_posts = floor(train_test_proportion * total_covid_posts)
        test_covid_posts = total_covid_posts - train_covid_posts
        train_non_covid_posts = floor(train_test_proportion * total_non_covid_posts)
        test_non_covid_posts = total_non_covid_posts - train_non_covid_posts
        q = """CREATE VIEW test_set AS
        (SELECT CONCAT_WS('\n', title, selftext) AS full_text, assign_label(post_ID) AS label FROM posts
        WHERE assign_label(post_ID)=1
        ORDER BY posted_at DESC, full_text DESC
        LIMIT {0})
        UNION ALL
        (SELECT CONCAT_WS('\n', title, selftext) AS full_text, assign_label(post_ID) AS label FROM posts
        WHERE assign_label(post_ID)=0
        ORDER BY posted_at DESC, full_text DESC
        LIMIT {1});
        """
        cur.execute(q.format(test_covid_posts, test_non_covid_posts))
    connection.commit()
    print('Successfully created test set')
# Prints the number of rows in each view as a sanity check.
def check_views():
    print("Checking views...")
    with connection.cursor() as cur:
        q = """SELECT COUNT(*) FROM training_set;"""
        cur.execute(q)
        training_posts = cur.fetchone()[0]
        q = """SELECT COUNT(*) FROM test_set;"""
        cur.execute(q)
        test_posts = cur.fetchone()[0]
    print('Training data size:', training_posts)
    print('Test data size:', test_posts)
@app.route('/')
def form():
    # rebuild the labelling function and the training/test views on page load
    reset_views()
    create_labelling_function()
    create_training_view()
    create_test_view()
    check_views()
    return render_template('index.html')
# Trains a classifier using the parameters input by the user and saves it to
# the MongoDB database.
@app.route('/experiment_done', methods=['POST'])
def experiment_done():
    print("Retrieving data...")
    with connection.cursor() as cur:
        q = """SELECT * FROM training_set;"""
        cur.execute(q)
        train_data = cur.fetchall()
        q = """SELECT * FROM test_set;"""
        cur.execute(q)
        test_data = cur.fetchall()
    train_full_text = [row[0] for row in train_data]
    train_labels = [row[1] for row in train_data]
    test_full_text = [row[0] for row in test_data]
    test_labels = [row[1] for row in test_data]
    print("Choosing features and vectorizing data...")
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    X = vectorizer.fit_transform(train_full_text)
    X_test = vectorizer.transform(test_full_text)
    print("Training classifier...")
    start = time.time()
    max_depth = int(request.form["max_depth"])
    n_estimators = int(request.form["n_estimators"])
    max_features = int(request.form["max_features"])
    clf = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)
    clf = clf.fit(X, train_labels)
    print("Evaluating classifier...")
    predicted_labels = clf.predict(X_test)
    accuracy = accuracy_score(test_labels, predicted_labels)
    precision = precision_score(test_labels, predicted_labels, average='macro')
    recall = recall_score(test_labels, predicted_labels, average='macro')
    f1score = f1_score(test_labels, predicted_labels, average='macro')
    end = time.time()
    print(end - start, accuracy, precision, recall, f1score)
    print("Saving model...")
    # binary files are stored in MongoDB, so two records are kept: one WITH the
    # pickled binaries (stored) and one WITHOUT them (shown in the browser)
    model_binary = pickle.dumps(clf)
    vectorizer_binary = pickle.dumps(vectorizer)
    record_to_store = {
        "model": {
            "binary": model_binary,
            "classifier": "RandomForestClassifier",
            "parameters": {"max_depth": max_depth, "n_estimators": n_estimators, "max_features": max_features},
        },
        "vectorizer": {
            "binary": vectorizer_binary,
            "type": "TfidfVectorizer",
            "parameters": {"max_features": 1000, "stop_words": "english"},
        },
        "train_test_proportion": 4 / 5,
        "evaluation_metrics": {"time": end - start, "accuracy": accuracy, "precision": precision, "recall": recall, "f1score": f1score},
    }
    record_to_show = {
        "model": {
            "classifier": "RandomForestClassifier",
            "parameters": {"max_depth": max_depth, "n_estimators": n_estimators, "max_features": max_features},
        },
        "vectorizer": {
            "type": "TfidfVectorizer",
            "parameters": {"max_features": 1000, "stop_words": "english"},
        },
        "train_test_proportion": 4 / 5,
        "evaluation_metrics": {"time": end - start, "accuracy": accuracy, "precision": precision, "recall": recall, "f1score": f1score},
    }
    mdb.results.insert_one(record_to_store)
    return JSONEncoder().encode(record_to_show)
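# Note: the pickled binaries are stored inline in the Mongo document. BSON
# caps a single document at 16 MB, so a very large forest would need GridFS
# instead (a design consideration only; not needed at this scale).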
# Retrieves the three models with the highest F1-scores from MongoDB and
# renders them on the page.
@app.route('/report', methods=['GET', 'POST'])
def retrieve_results():
    # exclude the _id and the pickled binaries so the result is JSON-friendly
    projection = {"_id": 0, "model.classifier": 1, "model.parameters": 1, "vectorizer.type": 1, "vectorizer.parameters": 1, "train_test_proportion": 1, "evaluation_metrics": 1}
    res = list(mdb.results.find({}, projection).sort("evaluation_metrics.f1score", -1).limit(3))
    return JSONEncoder().encode(res)
# Retrieves the model and vectorizer binaries for the best model by F1-score,
# deserialises them with pickle.loads, and returns a prediction for the
# "input_text" submitted in the form, rendered alongside the input itself.
@app.route('/submitted', methods=['POST'])
def submitted_form():
    projection = {"_id": 0, "model.binary": 1, "vectorizer.binary": 1}
    result = list(mdb.results.find({}, projection).sort("evaluation_metrics.f1score", -1).limit(1))[0]
    vectorizer = pickle.loads(result["vectorizer"]["binary"])
    best_clf = pickle.loads(result["model"]["binary"])
    post = request.form["input_text"]
    X = vectorizer.transform([post])
    prediction_key = best_clf.predict(X)[0]
    prediction = category_map[prediction_key]
    res = {"input_text": post, "prediction": prediction}
    return jsonify(res)
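# Example request (a sketch, assuming the app is running locally on port 8080):
#   curl -d "input_text=new lockdown rules announced today" http://localhost:8080/submitted
# expected response shape: {"input_text": "...", "prediction": "covid"}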
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080, debug=True)