from dataparser import Data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import classification_report

# Class implementing a topic-model classifier based on Latent Dirichlet Allocation (LDA).
class TopicModel:

    # Constructs a TopicModel either from a Data object holding real and troll tweets
    # (testing=False) or from pre-split training texts with labels (0 = legit, 1 = fake)
    # when testing=True.
    def __init__(self, data=None, testing=False, training_data=None, training_label=None):
        self.legit, self.fake = [], []
        if not testing:
            for tweet in data.getRealTweets():
                self.legit.append(tweet.getText())
            for tweet in data.getTrollTweets():
                self.fake.append(tweet.getText())
        else:
            assert len(training_data) == len(training_label)
            for i in range(len(training_data)):
                if training_label[i] == 0:
                    self.legit.append(training_data[i])
                else:
                    self.fake.append(training_data[i])
        self.tweets = self.legit + self.fake
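        # Build the bag-of-words vocabulary, ignoring terms that appear in more than 95%
        # of tweets, terms that appear in fewer than 2 tweets, and English stop words.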
        self.cv_tweets = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        self.cv_tweets.fit(self.tweets)
        self.df_tweets = self.cv_tweets.transform(self.tweets)
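        # Fit a 20-topic LDA model; self.doc_top holds the topic distribution of every
        # training tweet, with the legit tweets occupying the first len(self.legit) rows.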
        self.lda_tweets = LatentDirichletAllocation(n_components=20, learning_method='online', random_state=42)
        self.doc_top = self.lda_tweets.fit_transform(self.df_tweets)

    # Given a tweet string, classifies it as legit (0) or fake (1): the tweet is mapped
    # into LDA topic space and compared, by cosine similarity, against every training
    # tweet; whichever class contains the most similar training tweet wins
    # (a nearest-neighbour rule in topic space).
    def classify(self, tweet):
        tweet_vector = self.cv_tweets.transform([tweet])
        topic_vector = self.lda_tweets.transform(tweet_vector)
        # Cosine similarity between this tweet's topic vector and every training tweet's.
        similarities = cosine_similarity(topic_vector, self.doc_top)[0]
        legit_similarity = np.max(similarities[:len(self.legit)])
        fake_similarity = np.max(similarities[len(self.legit):])
        return 0 if legit_similarity > fake_similarity else 1  # 0 = legit, 1 = fake

    # Given a tweet string, returns its LDA topic distribution as a 1 x 20 document-topic vector.
    def topic_vectorize(self, tweet):
        tweet_vector = self.cv_tweets.transform([tweet])
        topic_vector = self.lda_tweets.transform(tweet_vector)
        return topic_vector


if __name__ == "__main__":
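    # Dataset paths: 2016 US election tweets and IRA (Internet Research Agency) troll tweets.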
    electionTweets = "./data/2016_US_election_tweets_100k.csv"
    electionTrolls = "./data/IRAhandle_tweets_1.csv"
    data = Data(electionTweets, electionTrolls)
    x_train, x_test, y_train, y_test = data.getSplitDataDL(0.3)
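    # Fit the topic model on the first 5,000 training tweets and their labels.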
    tc = TopicModel(testing=True, training_data=x_train[:5000], training_label=y_train[:5000])
    y_pred = []
    for x in x_test:
        y_pred.append(tc.classify(x))
    print(classification_report(y_test, y_pred))
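
    # A minimal usage sketch of the non-testing constructor path, assuming Data's
    # getRealTweets()/getTrollTweets() return tweet objects exposing getText(), exactly
    # as __init__ uses them above; the tweet text is a hypothetical placeholder:
    #
    #     tm = TopicModel(data=data)                 # fits LDA on the full Data corpora
    #     print(tm.classify("example tweet text"))   # 0 = legit, 1 = fake
    #     print(tm.topic_vectorize("example tweet text").shape)  # (1, 20)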