# pre_processor.py
# Libraries
import re
import nltk
import unidecode
import unicodedata
import contractions
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from googletrans import Translator
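# The NLTK resources used below must be available locally; if they are not,
# they can be downloaded once with, e.g.:
#   nltk.download('punkt')      # tokenizer models for word_tokenize / sent_tokenize
#   nltk.download('stopwords')  # English stopword list
#   nltk.download('wordnet')    # data for WordNetLemmatizer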


class PreProcessor:
    def __init__(self, regex_dict=None):
        # create the helper objects used across the class
        # stemmer
        self.sb = nltk.stem.SnowballStemmer('english')
        # lemmatizer
        self.lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
        # translator
        self.translator = Translator()
        # default regex dictionary used to normalize slang, abbreviations and
        # stretched words. The keys are regular expressions, so the ':)'
        # emoticon is escaped to keep it a valid pattern.
        self.default_regex_dict = {'goo[o]*d': 'good', '2morrow': 'tomorrow', 'b4': 'before', 'otw': 'on the way',
                                   'idk': "i don't know", r':\)': 'smile', 'bc': 'because', '2nite': 'tonight',
                                   'yeah': 'yes', 'yeshhhhhhhh': 'yes', ' yeeeee': 'yes', 'btw': 'by the way',
                                   'fyi': 'for your information', 'gr8': 'great', 'asap': 'as soon as possible',
                                   'yummmmmy': 'yummy', 'gf': 'girlfriend', 'thx': 'thanks', 'nowwwwwww': 'now',
                                   ' ppl ': ' people ', 'yeiii': 'yes'}
        # if the user passes a regex_dict, merge it with the default one;
        # otherwise use the default dictionary only
        if regex_dict:
            self.regex_dict = {**regex_dict, **self.default_regex_dict}
        else:
            self.regex_dict = self.default_regex_dict

    def translate_twt(self, pdf):
        """
        This function helps to translate a tweet from any
        language to English.
        Inputs:
            - pdf: Pandas dataframe. This dataframe must have
              the following columns:
                - lang: Tweet's language.
                - clean_tweet: Partially pre-processed tweet.
        Outputs: Translated tweet from any language available
        in the googletrans api to English.
        """
        # Check if the language of the tweet is either undefined or English
        # to avoid translation.
        if pdf["lang"] == "und" or pdf["lang"] == "en":
            pdf["translated_tweet"] = pdf["clean_tweet"]
        # Check if the tweet is in Hindi. The code of the Hindi language is
        # "hi", but Twitter has defined the code as "in".
        elif pdf["lang"] == "in":
            pdf["translated_tweet"] = self.translator.translate(pdf["clean_tweet"], src="hi", dest="en").text
        # Check if the tweet is in Chinese.
        # The api supports simplified and traditional Chinese.
        elif pdf["lang"] == "zh":
            pdf["translated_tweet"] = self.translator.translate(pdf["clean_tweet"], src="zh-cn", dest="en").text
        # For any other language the translator should work just fine, so the
        # api is called with the language detected by Twitter.
        else:
            try:
                pdf["translated_tweet"] = self.translator.translate(pdf["clean_tweet"], src=pdf["lang"],
                                                                    dest="en").text
            except (TypeError, ValueError):
                pdf["translated_tweet"] = pdf["clean_tweet"]
        return pdf["translated_tweet"]

    def removeNoise(self, pdf):
        """
        Function to remove noise from strings.
        Inputs: A pandas dataframe with raw strings of length n.
        Output: A clean string where elements such as accented
        characters, html tags, punctuation marks, and extra white
        spaces are removed (or transformed) where needed.
        """
        # to lower case
        pdf["clean_tweet"] = pdf.text.apply(lambda x: x.lower())
        # remove accents from characters
        # e.g. canción --> cancion
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: unidecode.unidecode(x))
        # remove html tags
        pdf["clean_tweet"] = pdf.clean_tweet.str.replace(r'<[^<>]*>', '', regex=True)
        # remove usernames, hashtags, links and some punctuation
        # marks (- . , : _ ;)
        # keep apostrophes (') but drop double quotes (")
        pdf["clean_tweet"] = pdf.clean_tweet.apply(
            lambda x: ' '.join(re.sub(r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([-.,:_;])|(https?://.*[\r\n]*)",
                                      "", x).split()).replace('"', ''))
        # remove white spaces at the beginning and at
        # the end of the string
        pdf['clean_tweet'] = pdf.clean_tweet.apply(lambda x: x.lstrip(' '))
        pdf['clean_tweet'] = pdf.clean_tweet.apply(lambda x: x.rstrip(' '))
        # translate the tweet (row-wise) whenever clean_tweet is not null
        pdf["clean_tweet"] = pdf.apply(lambda x: self.translate_twt(x)
                                       if pd.notnull(x.clean_tweet) else x.clean_tweet, axis=1)
        # normalize the string: apply unicode normalization (NFKC) and drop
        # any remaining non-ASCII characters
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: unicodedata.normalize('NFKC', x)
                                                   .encode('ASCII', 'ignore').decode("utf-8")
                                                   if (pd.notnull(x) and x != "") else x)
        return pdf

    def textNormalization(self, pdf):
        """
        Function to normalize a string.
        Inputs: A pandas dataframe with strings (of length n) that
        will be normalized.
        Outputs: A normalized string without noise, with words in
        their (expected) correct form and with no stopwords.
        """
        # remove noise first
        pdf = self.removeNoise(pdf)
        # expand contractions
        # e.g. don't --> do not
        pdf['clean_tweet'] = pdf.clean_tweet.apply(lambda x: contractions.fix(x))
        # normalize words using the regex dictionary
        # (regex=True so the keys are treated as patterns)
        pdf['clean_tweet'] = pdf.clean_tweet.replace(self.regex_dict, regex=True)
        # get English stopwords
        stop_words = stopwords.words('english')
        stopwords_dict = Counter(stop_words)
        # remove stopwords from the string
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: ' '.join([word for word in x.split()
                                                                       if word not in stopwords_dict]))
        return pdf

    def wordTokenize(self, pdf):
        """
        Function to tokenize a string into words. Tokenization is a way
        of separating a piece of text into smaller units called tokens.
        In this case tokens are words (but can also be characters or
        subwords).
        Inputs: A pandas dataframe with strings (of length n) that will be tokenized.
        Outputs: A list of tokenized words.
        """
        # string normalized
        pdf = self.textNormalization(pdf)
        # Use word_tokenize method to split the string
        # into individual words. By default it returns
        # a list.
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: nltk.word_tokenize(x))
        # Using isalpha() will help us to only keep
        # items from the alphabet (no punctuation
        # marks).
        #pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: [word for word in x if word.isalpha()])
        # Keep only unique elements
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: list(set(x)))
        # return list of tokenized words by row
        return pdf

    def phraseTokenize(self, pdf):
        """
        Function to tokenize a string into sentences. Tokenization is
        a way of separating a piece of text into smaller units called
        tokens. In this case tokens are phrases (but can also be words,
        characters or subwords).
        Inputs: A pandas dataframe with strings (of length n) that will be tokenized.
        Outputs: A list of tokenized sentences.
        """
        # pandas dataframe with strings normalized
        pdf = self.textNormalization(pdf)
        # Use sent_tokenize method to split the string
        # into sentences. By default it returns a list.
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: nltk.sent_tokenize(x))
        return pdf

    def stemWords(self, pdf):
        """
        Function to stem strings. Stemming is the process of reducing
        a word to its word stem, i.e. the base form obtained after
        stripping suffixes and prefixes.
        Inputs: A pandas dataframe with raw strings of length n.
        Output: Roots of each word of a given string.
        """
        # tokenize the strings into words (wordTokenize already
        # normalizes the text internally)
        pdf = self.wordTokenize(pdf)
        # reduce each word to its root
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: [self.sb.stem(word) for word in x])
        return pdf

    def lemmatizeWords(self, pdf):
        """
        Function to lemmatize strings. Lemmatization is a method
        responsible for grouping different inflected forms of a
        word into its root form, which keeps the same meaning. It
        is similar to stemming.
        Inputs: A pandas dataframe with raw strings of length n.
        Output: Roots of each word of a given string (usually with
        better results than stemming).
        """
        unw_chars = ["(", ")", "[", "]"]
        # list of tokenized words (from string).
        # Tokenizing by words rather than by sentences
        # makes it easier to find the correct root of
        # each word (wordTokenize also normalizes the
        # text internally).
        pdf = self.wordTokenize(pdf)
        # lemmatize each word from the list of tokenized words,
        # skipping unwanted characters
        #lemmatized = [self.lemmatizer.lemmatize(word) for word in tokenized]
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: [self.lemmatizer.lemmatize(word)
                                                              for word in x if word not in unw_chars])
        return pdf
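

# Minimal usage sketch: the methods above expect a pandas dataframe with
# 'text' and 'lang' columns, as used in removeNoise() and translate_twt().
# Rows tagged 'en' or 'und' skip the googletrans call, so this example runs
# without hitting the translation API. The sample tweets below are only
# illustrative.
if __name__ == "__main__":
    sample = pd.DataFrame({
        "text": ["Thx @friend, that was a gr8 movie!!",
                 "idk if I can make it 2nite, otw home now"],
        "lang": ["en", "und"],
    })
    pp = PreProcessor()
    # full pipeline: noise removal, normalization, tokenization, lemmatization
    out = pp.lemmatizeWords(sample)
    print(out[["text", "clean_tweet"]])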