# ldaNMW.py
# Train a gensim LDA topic model over emoji-aware tokenized messages.
# Standard library
import csv
import pickle
import re
import string
import time
from collections import Counter

# Third-party
import emoji
from emoji import unicode_codes
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from num2words import num2words
import numpy as np
import pandas

np.random.seed(2018)
#pd = pandas.read_csv("/data/06333/aroraish/rest.csv", encoding='utf-8')
#pd3 = pandas.read_csv("/data/06333/aroraish/modifiableN.csv", encoding='utf-8', error_bad_lines=False)
emojicols = [u"\U0001f3fb", u"\U0001f3fc", u"\U0001f3fd", u"\U0001f3fe", u"\U0001f3ff"]
pattern = u'(' + u'|'.join(re.escape(u) for u in emojicols) + u')'
allCols = re.compile(pattern)
emojiss = unicode_codes.EMOJI_ALIAS_UNICODE
coloured = set()
for key in emojiss:
if(allCols.findall(emojiss[key])):
coloured.add(emojiss[key])
coloured.add(allCols.sub('',emojiss[key]))
coloured.remove(u"")
emojis = sorted(coloured, key=len,
reverse=True)
pattern2 = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
colouredRE = re.compile(pattern2)
emojis = sorted(emojiss.values(), key=len,
reverse=True)
pattern3 = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
ree = re.compile(pattern3)
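# Illustrative sanity check (an addition, not part of the original script):
# a run of emoji should come back as one match per emoji, since the pattern
# alternates over every known emoji sequence, longest first.
assert ree.findall(u"gg \U0001F602\U0001F602") == [u"\U0001F602", u"\U0001F602"]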
def pipe(message):
    # Unused end-to-end helper; `preprocess` is not defined in this file and
    # is presumably provided elsewhere in the project.
    text = preprocess(message)
    return n_all(text)
def num(token):
    # Spell out numeric tokens ("3" -> "three"); leave anything num2words
    # cannot handle untouched. (Unused in this script.)
    try:
        return num2words(token)
    except Exception:
        return token
def n_all(message):
    """Whitespace-tokenize a message, exploding emoji runs into single emoji.

    Tokens that contain emoji are replaced by just the emoji found in them;
    all other tokens are kept verbatim.
    """
    tokens = list()
    for i in message.split():
        l = ree.findall(i)
        if l:
            tokens.extend(l)
        else:
            tokens.append(i)
    return tokens
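# Example of the tokenizer's behaviour (illustrative):
#   n_all(u"so good \U0001F60D\U0001F60D")
#   -> [u'so', u'good', u'\U0001F60D', u'\U0001F60D']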
# Stream the processed messages in 1M-row chunks and tokenize each one.
# (`error_bad_lines=False` is the old pandas spelling; pandas >= 1.3 uses
# on_bad_lines='skip' instead.)
pd = pandas.read_csv("/data/06333/aroraish/modifiableN_processed.csv",
                     encoding='utf-8', usecols=['message'],
                     low_memory=False, error_bad_lines=False,
                     chunksize=1000000)
chunklist = []
for c in pd:
    chunklist.append(c[u'message'].map(n_all))
processed_docs = pandas.concat(chunklist)
# Build the vocabulary, dropping tokens seen in fewer than 15 documents or in
# more than half of them, and capping the vocabulary at 100k entries.
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words representation: one (token_id, count) list per document.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
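# Illustration (the ids are assumptions; actual ids depend on the corpus):
# a document tokenized as [u'happy', u'\U0001F602', u'\U0001F602'] might
# become [(17, 1), (42, 2)].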
# Pickle requires a binary file handle.
with open("/data/06333/aroraish/dictionaryNM2.pkl", "wb") as pf:
    pickle.dump(dictionary, pf)
# Train a 25-topic LDA model over the bag-of-words corpus.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=25,
                                       id2word=dictionary, passes=2, workers=1)

# gensim.test.utils.datapath is meant for gensim's bundled test data; an
# absolute path can be passed to save() directly.
lda_model.save("/data/06333/aroraish/lda_model_bog_NM2")
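# Reloading for later inference (a sketch, assuming the paths saved above):
#   dictionary = pickle.load(open("/data/06333/aroraish/dictionaryNM2.pkl", "rb"))
#   lda_model = gensim.models.LdaMulticore.load("/data/06333/aroraish/lda_model_bog_NM2")
#   bow = dictionary.doc2bow(n_all(u"some new message \U0001F602"))
#   print(lda_model[bow])  # (topic_id, probability) pairs for the new doc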
# Dump the topics to disk and echo them to stdout. Writing through a UTF-8
# text handle avoids the b'...' bytes repr that .encode() would produce.
with open("/data/06333/aroraish/lda_bog_NM_2.txt", 'w', encoding='utf-8') as bw:
    for idx, topic in lda_model.print_topics(-1):
        bw.write('Topic: {} \nWords: {}\n\n'.format(idx, topic))

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))