emotion_predictor.py (forked from nikicc/twitter-emotion-recognition)
import html
import pickle
import re

import pandas as pd
from keras import backend as K
from keras.models import load_model
from keras.preprocessing import sequence


class EmotionPredictor:
    """Predicts emotions in tweets with pre-trained character-level Keras models."""

    def __init__(self, classification, setting, use_unison_model=True):
        """
        Args:
            classification (str): Either 'ekman', 'plutchik', 'poms'
                or 'unison'.
            setting (str): Either 'mc' (multi-class) or 'ml' (multi-label).
            use_unison_model (bool): Whether to use the unison model;
                otherwise use the single model.
        """
        if classification not in ['ekman', 'plutchik', 'poms', 'unison']:
            raise ValueError('Unknown emotion classification: {}'.format(
                classification))
        if setting not in ['mc', 'ml']:
            raise ValueError('Unknown setting: {}'.format(setting))

        self.classification = classification
        self.setting = setting
        self.use_unison_model = use_unison_model

        self.model = self._get_model()
        self.embeddings_model = self._get_embeddings_model()
        self.char_to_ind = self._get_char_mapping()
        self.class_values = self._get_class_values()
        self.max_len = self._get_max_sequence_length()

    def _get_model(self):
        self._loaded_model_filename = 'models/{}{}-{}.h5'.format(
            'unison-' if self.use_unison_model else '',
            self.classification,
            self.setting,
        )
        return load_model(self._loaded_model_filename)

    def _get_embeddings_model(self):
        # Expose the output of the model's third-to-last layer as a feature
        # extractor; learning phase 0 runs it in inference mode.
        last_layer_output = K.function([self.model.layers[0].input,
                                        K.learning_phase()],
                                       [self.model.layers[-3].output])
        return lambda x: last_layer_output([x, 0])[0]

    @staticmethod
    def _get_char_mapping():
        with open('models/allowed-chars.pkl', 'rb') as f:
            return pickle.load(f)

    def _get_class_values(self):
        if self.classification == 'ekman':
            return ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']
        elif self.classification == 'plutchik':
            return ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise',
                    'Trust', 'Anticipation']
        elif self.classification == 'poms':
            return ['Anger', 'Depression', 'Fatigue', 'Vigour', 'Tension',
                    'Confusion']

    def _get_max_sequence_length(self):
        if self.use_unison_model or self.classification == 'poms':
            return 143
        elif self.classification in ['ekman', 'plutchik']:
            return 141

    def predict_classes(self, tweets):
        indices = self._tweet_to_indices(tweets)
        predictions = self.model.predict(indices, verbose=False)
        df = pd.DataFrame({'Tweet': tweets})
        if self.setting == 'mc':
            # Multi-class: keep only the most probable emotion per tweet.
            df['Emotion'] = [self.class_values[i] for i in
                             predictions.argmax(axis=-1)]
        else:
            # Multi-label: threshold each emotion's probability at 0.5.
            predictions[predictions >= 0.5] = 1
            predictions[predictions < 0.5] = 0
            for emotion, values in zip(self.class_values, predictions.T):
                df[emotion] = values
        return df

    def predict_probabilities(self, tweets):
        indices = self._tweet_to_indices(tweets)
        predictions = self.model.predict(indices, verbose=False)
        df = pd.DataFrame({'Tweet': tweets})
        for emotion, values in zip(self.class_values, predictions.T):
            df[emotion] = values
        return df

    def embedd(self, tweets):
        # Embed each tweet with the truncated model; one Dim<i> column
        # per embedding dimension.
        indices = self._tweet_to_indices(tweets)
        embeddings = self.embeddings_model(indices)
        df = pd.DataFrame({'Tweet': tweets})
        for index, values in enumerate(embeddings.T, start=1):
            df['Dim{}'.format(index)] = values
        return df

    def _tweet_to_indices(self, tweets):
        indices = []
        for t in tweets:
            t = html.unescape(t)  # unescape HTML entities
            t = re.sub(r"http\S+", "", t)  # remove normal URLs
            t = re.sub(r"pic\.twitter\.com/\S+", "", t)  # remove pic.twitter.com URLs
            indices.append([self.char_to_ind[char] for char in t])
        return sequence.pad_sequences(indices, maxlen=self.max_len)
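

# Usage sketch (not part of the original module): a minimal example assuming
# the pre-trained weights and models/allowed-chars.pkl are available, as
# _get_model() and _get_char_mapping() expect. The tweet texts are made up.
if __name__ == '__main__':
    predictor = EmotionPredictor(classification='ekman', setting='mc')

    tweets = [
        "Finally finished the project, feeling great!",
        "Why is the train late again today?",
    ]

    # One predicted emotion per tweet (the 'mc' setting picks the argmax class).
    print(predictor.predict_classes(tweets))

    # Per-emotion probabilities for the same tweets.
    print(predictor.predict_probabilities(tweets))

    # Fixed-size embeddings taken from the model's third-to-last layer.
    print(predictor.embedd(tweets))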