-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
522 lines (407 loc) · 23.5 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
"""
This module contains utility functions that process the data extracted
from the emails into a format more suitable for use with machine
learning algorithms.
"""
import pandas as pd
import numpy as np
import re
import string
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import random
random.seed(1746)
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
"""
Text Preprocessing
"""
def check_empty(input_string):
    """
    Tell whether an email body counts as empty.

    A body is empty when nothing is left after its leading and trailing
    whitespace has been stripped, i.e. it is either the empty string or
    made up of whitespace only.

    Parameters
    ----------
    input_string : str
        The string to be checked.

    Returns
    -------
    bool
        True if the input is empty or whitespace-only, False otherwise.
    """
    # An empty string is falsy, so negating the stripped text is enough.
    return not input_string.strip()
def replace_email(input_string, replacement_string='<emailaddress>'):
    """
    Replace email addresses in a string with a defined string.

    A simple regex is used to search a string for usual patterns
    of email addresses.
    The pattern is good enough for the purposes of this project,
    but it should not be considered perfect.

    Parameters
    ----------
    input_string : str
        The string that will be searched for email addresses.
    replacement_string : str, default '<emailaddress>'
        The string that will replace email addresses. It is handed to
        ``re.sub`` as the replacement template, so backslash escapes in
        it are processed.

    Returns
    -------
    str
        The input string with replaced email addresses.
    """
    # The TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a
    # literal '|', which glued "a@b.com|c@d.org" into one broken match.
    return re.sub(
        r'\b[A-Za-z0-9._%+-]+@([A-Za-z0-9.-]+\.[A-Za-z]{2,})\b',
        replacement_string,  # honour the caller's choice instead of a hard-coded value
        input_string,
    )
def replace_url(input_string, replacement_string='<urladdress>'):
    """
    Replace URLs in a string with a defined string.

    A regex is used to search a string for usual patterns
    of URLs. It tries to detect as many URL formats
    as possible, like example.com or https://www.example.com
    and even when there are added parameters, like
    http://example.com/subpage?param1=1&param2=2#service
    This can result in some false positives.

    The biggest weakness is that it detects domains of email
    addresses (for example it will replace example.com
    in user@example.com), but this can be solved by replacing
    the emails first.

    In order to avoid detecting strings such as document.pdf,
    it uses a list of TLDs from IANA to distinguish those
    from the file extensions. This also means that the regex
    is unable to match IP addresses.

    Parameters
    ----------
    input_string : str
        The string that will be searched for URLs.
    replacement_string : str, default '<urladdress>'
        The string that will replace URLs.

    Returns
    -------
    str
        The input string with replaced URLs.

    See Also
    --------
    replace_email : Replace email addresses in a string with a defined string.
    """
    # Pattern layout: optional scheme, host name ending in a known TLD
    # (the long alternation), then an optional path/query/fragment tail.
    # The literal must stay on ONE physical line: a raw string cannot span
    # lines, and a break inside the alternation corrupts the TLD it splits.
    output_string = re.sub(
        r'((http|ftp|https)\:\/\/)?([\w_\.-]+(?:\.(zw|zuerich|zone|zm|zip|zero|zara|zappos|za|yun|yt|youtube|you|yokohama|yoga|yodobashi|ye|yandex|yamaxun|yahoo|yachts|xyz|xxx|xn--zfr164b|xn--ygbi2ammx|xn--yfro4i67o|xn--y9a3aq|xn--xkc2dl3a5ee0h|xn--xkc2al3hye2a|xn--xhq521b|xn--wgbl6a|xn--wgbh1c|xn--w4rs40l|xn--w4r85el8fhu5dnra|xn--vuq861b|xn--vhquv|xn--vermgensberatung-pwb|xn--vermgensberater-ctb|xn--unup4y|xn--tiq49xqyj|xn--tckwe|xn--t60b56a|xn--ses554g|xn--s9brj9c|xn--rvc1e0am3e|xn--rovu88b|xn--rhqv96g|xn--qxam|xn--qxa6a|xn--qcka1pmc|xn--q9jyb4c|xn--q7ce6a|xn--pssy2u|xn--pgbs0dh|xn--p1ai|xn--p1acf|xn--otu796d|xn--ogbpf8fl|xn--o3cw4h|xn--nyqy26a|xn--nqv7fs00ema|xn--nqv7f|xn--node|xn--ngbrx|xn--ngbe9e0a|xn--ngbc5azd|xn--mxtq1m|xn--mk1bu44c|xn--mix891f|xn--mgbx4cd0ab|xn--mgbtx2b|xn--mgbt3dhd|xn--mgbpl2fh|xn--mgbi4ecexp|xn--mgbgu82a|xn--mgberp4a5d4ar|xn--mgbcpq6gpa1a|xn--mgbca7dzdo|xn--mgbc0a9azcg|xn--mgbbh1a71e|xn--mgbbh1a|xn--mgbayh7gpa|xn--mgbai9azgqp6j|xn--mgbah1a3hjkrd|xn--mgbab2bd|xn--mgbaam7a8h|xn--mgbaakc7dvf|xn--mgba7c0bbn0a|xn--mgba3a4f16a|xn--mgba3a3ejt|xn--mgb9awbf|xn--lgbbat1ad8j|xn--l1acc|xn--kput3i|xn--kpry57d|xn--kprw13d|xn--kcrx77d1x4a|xn--jvr189m|xn--jlq61u9w7b|xn--jlq480n2rg|xn--j6w193g|xn--j1amh|xn--j1aef|xn--io0a7i|xn--imr513n|xn--i1b6b1a6a2e|xn--hxt814e|xn--h2brj9c8c|xn--h2brj9c|xn--h2breg3eve|xn--gk3at1e|xn--gecrj9c|xn--gckr3f0f|xn--g2xx48c|xn--fzys8d69uvgm|xn--fzc2c9e2c|xn--fpcrj9c3d|xn--flw351e|xn--fjq720a|xn--fiqz9s|xn--fiqs8s|xn--fiq64b|xn--fiq228c5hs|xn--fhbei|xn--fct429k|xn--efvy88h|xn--eckvdtc9d|xn--e1a4c|xn--d1alf|xn--d1acj3b|xn--czru2d|xn--czrs0t|xn--czr694b|xn--clchc0ea0b2g2a9gcd|xn--cg4bki|xn--cckwcxetd|xn--cck2b3b|xn--c2br7g|xn--c1avg|xn--bck1b9a5dre4c|xn--b4w605ferd|xn--9krt00a|xn--9et52u|xn--9dbq2a|xn--90ais|xn--90ae|xn--90a3ac|xn--8y0a063a|xn--80aswg|xn--80asehdb|xn--80aqecdr1a|xn--80ao21a|xn--80adxhks|xn--6qq986b3xl|xn--6frz82g|xn--5tzm5g|xn--5su34j936bgsg|xn--55qx5d|xn--55qw42g|xn--54b7fta0cc|xn--4gbrim|xn--4dbrk0ce|xn--45q11c|xn--45brj9c|xn--45br5cyl|xn--42c2d9a|xn--3pxu8k|xn--3hcrj9c|xn--3e0b707e|xn--3ds443g|xn--3bst00m|xn--30rr7y|xn--2scrj9c|xn--1qqw23a|xn--1ck2e1b|xn--11b4c3d|xin|xihuan|xfinity|xerox|xbox|wtf|wtc|ws|wow|world|works|work|woodside|wolterskluwer|wme|winners|wine|windows|win|williamhill|wiki|wien|whoswho|wf|weir|weibo|wedding|wed|website|weber|webcam|weatherchannel|weather|watches|watch|wanggou|wang|walter|walmart|wales|vuelos|vu|voyage|voto|voting|vote|volvo|volkswagen|vodka|vn|vlaanderen|vivo|viva|vision|visa|virgin|vip|vin|villas|viking|vig|video|viajes|vi|vg|vet|versicherung|verisign|ventures|vegas|ve|vc|vanguard|vana|vacations|va|uz|uy|us|ups|uol|uno|university|unicom|uk|ug|ubs|ubank|ua|tz|tw|tvs|tv|tushu|tunes|tui|tube|tt|trv|trust|travelersinsurance|travelers|travelchannel|travel|training|trading|trade|tr|toys|toyota|town|tours|total|toshiba|toray|top|tools|tokyo|today|to|tn|tmall|tm|tl|tkmaxx|tk|tjx|tjmaxx|tj|tirol|tires|tips|tiffany|tienda|tickets|tiaa|theatre|theater|thd|th|tg|tf|teva|tennis|temasek|tel|technology|tech|team|tdk|td|tci|tc|taxi|tax|tattoo|tatar|tatamotors|target|taobao|talk|taipei|tab|sz|systems|sydney|sy|sx|swiss|swatch|sv|suzuki|surgery|surf|support|supply|supplies|sucks|su|style|study|studio|stream|store|storage|stockholm|stcgroup|stc|statefarm|statebank|star|staples|stada|st|ss|srl|sr|spot|sport|space|spa|soy|sony|song|solutions|solar|sohu|software|softbank|social|soccer|so|sncf|sn|smile|smart|sm|sling|sl|skype|sky|skin|ski|sk|sj|site|singles|sina|silk|si|showtime|show|shouji|shopping|shop|shoes|shiksha|shia|shell|shaw|sharp|shangrila|sh|sg|sfr|sexy|sex|sew|seven|ses|services|sener|select|seek|security|secure|seat|search|se|sd|scot|science|schwarz|schule|school|scholarships|schmidt|schaeffler|scb|sca|sc|sbs|sbi|sb|saxo|save|sas|sarl|sap|sanofi|sandvikcoromant|sandvik|samsung|samsclub|salon|sale|sakura|safety|safe|saarland|sa|ryukyu|rwe|rw|run|ruhr|rugby|ru|rsvp|rs|room|rogers|rodeo|rocks|rocher|ro|rip|rio|ril|ricoh|richardli|rich|rexroth|reviews|review|restaurant|rest|republican|report|repair|rentals|rent|ren|reliance|reit|reisen|reise|rehab|redumbrella|redstone|red|recipes|realty|realtor|realestate|read|re|radio|racing|quest|quebec|qpon|qa|py|pwc|pw|pub|pt|ps|prudential|pru|protection|property|properties|promo|progressive|prof|productions|prod|pro|prime|press|praxi|pramerica|pr|post|porn|politie|poker|pohl|pnc|pn|pm|plus|plumbing|playstation|play|place|pl|pk|pizza|pioneer|pink|ping|pin|pid|pictures|pictet|pics|physio|photos|photography|photo|phone|philips|phd|pharmacy|ph|pg|pfizer|pf|pet|pe|pccw|pay|passagens|party|parts|partners|pars|paris|panasonic|page|pa|ovh|ott|otsuka|osaka|origins|organic|org|orange|oracle|open|ooo|online|onl|ong|one|omega|om|ollo|oldnavy|olayangroup|olayan|okinawa|office|observer|obi|nz|nyc|nu|ntt|nrw|nra|nr|np|nowtv|nowruz|now|norton|northwesternmutual|nokia|no|nl|nissay|nissan|ninja|nikon|nike|nico|ni|nhk|ngo|ng|nfl|nf|nexus|nextdirect|next|news|new|neustar|network|netflix|netbank|net|nec|ne|nc|nba|navy|natura|name|nagoya|nab|na|mz|my|mx|mw|mv|mutual|music|museum|mu|mtr|mtn|mt|msd|ms|mr|mq|mp|movie|mov|motorcycles|moto|moscow|mortgage|mormon|monster|money|monash|mom|moi|moe|moda|mobile|mobi|mo|mn|mma|mm|mls|mlb|ml|mk|mitsubishi|mit|mint|mini|mil|microsoft|miami|mh|mg|merckmsd|menu|men|memorial|meme|melbourne|meet|media|med|me|md|mckinsey|mc|mba|mattel|maserati|marshalls|marriott|markets|marketing|market|map|mango|management|man|makeup|maison|maif|madrid|macys|ma|ly|lv|luxury|luxe|lundbeck|lu|ltda|ltd|lt|ls|lr|lplfinancial|lpl|love|lotto|lotte|london|lol|loft|locus|locker|loans|loan|llp|llc|lk|living|live|lipsy|link|linde|lincoln|limo|limited|lilly|like|lighting|lifestyle|lifeinsurance|life|lidl|li|lgbt|lexus|lego|legal|lefrak|leclerc|lease|lds|lc|lb|lawyer|law|latrobe|latino|lat|lasalle|lanxess|landrover|land|lancia|lancaster|lamer|lamborghini|lacaixa|la|kz|kyoto|ky|kw|kuokgroup|kred|krd|kr|kpn|kpmg|kp|kosher|komatsu|koeln|kn|km|kiwi|kitchen|kindle|kinder|kim|kids|kia|ki|kh|kg|kfh|kerryproperties|kerrylogistics|kerryhotels|ke|kddi|kaufen|juniper|juegos|jprs|jpmorgan|jp|joy|jot|joburg|jobs|jo|jnj|jmp|jm|jll|jio|jewelry|jetzt|jeep|je|jcb|java|jaguar|itv|itau|it|istanbul|ist|ismaili|is|irish|ir|iq|ipiranga|io|investments|intuit|international|int|insure|insurance|institute|ink|ing|info|infiniti|industries|inc|in|immobilien|immo|imdb|imamat|im|il|ikano|ifm|ieee|ie|id|icu|ice|icbc|ibm|hyundai|hyatt|hughes|hu|ht|hsbc|hr|how|house|hotmail|hotels|hoteles|hot|hosting|host|hospital|horse|honda|homesense|homes|homegoods|homedepot|holiday|holdings|hockey|hn|hm|hkt|hk|hiv|hitachi|hisamitsu|hiphop|hgtv|hermes|here|helsinki|help|healthcare|health|hdfcbank|hdfc|hbo|haus|hangout|hamburg|hair|gy|gw|guru|guitars|guide|guge|gucci|guardian|gu|gt|gs|group|grocery|gripe|green|gratis|graphics|grainger|gr|gq|gp|gov|got|gop|google|goog|goodyear|goo|golf|goldpoint|gold|godaddy|gn|gmx|gmo|gmbh|gmail|gm|globo|global|gle|glass|gl|giving|gives|gifts|gift|gi|gh|ggee|gg|gf|george|genting|gent|gea|ge|gdn|gd|gbiz|gb|gay|garden|gap|games|game|gallup|gallo|gallery|gal|ga|fyi|futbol|furniture|fund|fun|fujitsu|ftr|frontier|frontdoor|frogans|frl|fresenius|free|fr|fox|foundation|forum|forsale|forex|ford|football|foodnetwork|food|foo|fo|fm|fly|flowers|florist|flir|flights|flickr|fk|fj|fitness|fit|fishing|fish|firmdale|firestone|fire|financial|finance|final|film|fido|fidelity|fiat|fi|ferrero|ferrari|feedback|fedex|fast|fashion|farmers|farm|fans|fan|family|faith|fairwinds|fail|fage|extraspace|express|exposed|expert|exchange|events|eus|eurovision|eu|etisalat|et|estate|esq|es|erni|ericsson|er|equipment|epson|enterprises|engineering|engineer|energy|emerck|email|eg|ee|education|edu|edeka|eco|ec|eat|earth|dz|dvr|dvag|durban|dupont|dunlop|dubai|dtv|drive|download|dot|domains|dog|doctor|docs|do|dnp|dm|dk|dj|diy|dish|discover|discount|directory|direct|digital|diet|diamonds|dhl|dev|design|desi|dentist|dental|democrat|delta|deloitte|dell|delivery|degree|deals|dealer|deal|de|dds|dclk|day|datsun|dating|date|data|dance|dad|dabur|cz|cyou|cymru|cy|cx|cw|cv|cuisinella|cu|cruises|cruise|crs|crown|cricket|creditunion|creditcard|credit|cr|cpa|courses|coupons|coupon|country|corsica|coop|cool|cookingchannel|cooking|contractors|contact|consulting|construction|condos|comsec|computer|compare|company|community|commbank|comcast|com|cologne|college|coffee|codes|coach|co|cn|cm|clubmed|club|cloud|clothing|clinique|clinic|click|cleaning|claims|cl|ck|cityeats|city|citic|citi|citadel|cisco|circle|cipriani|ci|church|chrome|christmas|chintai|cheap|chat|chase|charity|channel|chanel|ch|cg|cfd|cfa|cf|cern|ceo|center|cd|cc|cbs|cbre|cbn|cba|catholic|catering|cat|casino|cash|case|casa|cars|careers|career|care|cards|caravan|car|capitalone|capital|capetown|canon|cancerresearch|camp|camera|cam|calvinklein|call|cal|cafe|cab|ca|bzh|bz|by|bw|bv|buzz|buy|business|builders|build|bugatti|bt|bs|brussels|brother|broker|broadway|bridgestone|bradesco|br|box|boutique|bot|boston|bostik|bosch|booking|book|boo|bond|bom|bofa|boehringer|boats|bo|bnpparibas|bn|bmw|bms|bm|blue|bloomberg|blog|blockbuster|blackfriday|black|bj|biz|bio|bingo|bing|bike|bid|bible|bi|bharti|bh|bg|bf|bet|bestbuy|best|berlin|bentley|beer|beauty|beats|be|bd|bcn|bcg|bbva|bbt|bbc|bb|bayern|bauhaus|basketball|baseball|bargains|barefoot|barclays|barclaycard|barcelona|bar|bank|band|bananarepublic|banamex|baidu|baby|ba|azure|az|axa|ax|aws|aw|avianca|autos|auto|author|auspost|audio|audible|audi|auction|au|attorney|athleta|at|associates|asia|asda|as|arte|art|arpa|army|archi|aramco|arab|ar|aquarelle|aq|apple|app|apartments|aol|ao|anz|anquan|android|analytics|amsterdam|amica|amfam|amex|americanfamily|americanexpress|amazon|am|alstom|alsace|ally|allstate|allfinanz|alipay|alibaba|alfaromeo|al|akdn|airtel|airforce|airbus|aig|ai|agency|agakhan|ag|africa|afl|af|aetna|aero|aeg|ae|adult|ads|adac|ad|actor|aco|accountants|accountant|accenture|academy|ac|abudhabi|abogado|able|abc|abbvie|abbott|abb|abarth|aarp|aaa)\b))([\w.,?^=%&@:/~+#-]*[\w?^=%&@/~+#-])?',
        replacement_string, input_string)
    return output_string
def tokenize(input_text):
    """
    Split a string into a list of lowercase word tokens.

    The text is lowercased and handed to NLTK's word tokenizer, which
    also emits punctuation marks and other special-character tokens.
    Those are discarded by a regex filter: a token is kept only when it
    starts with an ASCII letter or digit and otherwise consists of word
    characters (letters, digits, underscores) and dashes, so that tokens
    like "e-mail" survive.

    Parameters
    ----------
    input_text : str
        The string that will be tokenized.

    Returns
    -------
    list of str
        The list that contains the cleaned-up tokens.
    """
    candidates = nltk.word_tokenize(input_text.lower())
    kept_tokens = []
    for candidate in candidates:
        # The pattern is anchored at both ends, so it matches the whole
        # token at most once.
        if re.search(r'^[a-zA-Z0-9]+-?[\w-]*$', candidate) is not None:
            kept_tokens.append(candidate)
    return kept_tokens
def remove_stopwords(tokenized_text):
    """
    Remove stopwords from a list of words.

    The words are compared as-is against NLTK's English stopword list.

    Parameters
    ----------
    tokenized_text : list
        The list of words to process.

    Returns
    -------
    list
        The word list without the stopwords, in the original order.
    """
    # Build the set once: membership tests against the raw list would
    # rescan it for every single token.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokenized_text if word not in stop_words]
def get_wordnet_pos(treebank_tag):
    """
    Map a Treebank part-of-speech tag to its WordNet counterpart.

    Only the first letter of the tag is inspected: 'J' maps to
    adjective, 'V' to verb, 'N' to noun and 'R' to adverb.

    Parameters
    ----------
    treebank_tag : str
        The Treebank POS tag.

    Returns
    -------
    str or None
        A wordnet POS constant, or None if no mapping was found.
    """
    constant_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    constant_name = constant_by_prefix.get(treebank_tag[:1])
    if constant_name is None:
        return None
    # Look the constant up only for a recognised tag, so unknown tags
    # never touch the wordnet object.
    return getattr(wordnet, constant_name)
def lemmatize(token_list):
    """
    Lemmatize a list of tokens with the WordNet lemmatizer.

    Every token is POS-tagged first. When get_wordnet_pos() can map the
    tag to a WordNet POS, that POS is passed to the lemmatizer;
    otherwise the lemmatizer is called without a POS hint.

    Parameters
    ----------
    token_list : list of str
        The list that contains the tokenized words.

    Returns
    -------
    list of str
        A list with the lemmas of the input words.

    See Also
    --------
    get_wordnet_pos : Converts a Treebank part-of-speech tag to WordNet.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in nltk.pos_tag(token_list):
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos is None:
            lemmas.append(lemmatizer.lemmatize(token))
        else:
            lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmas
def remove_punctuation(text):
    """
    Delete every ASCII punctuation character from the input text.

    The characters removed are exactly those in ``string.punctuation``.
    Nothing else is touched: whitespace, line breaks and all other
    characters are returned unchanged.

    Parameters
    ----------
    text : str
        The input string to be processed.

    Returns
    -------
    str
        The input string without punctuation characters.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)
def remove_non_letters_and_extra_spaces(text):
    """
    Keep only the ASCII letters of a text, separated by single spaces.

    Every character that is not an ASCII letter (a-z or A-Z) is turned
    into a space. The result is then split on whitespace and re-joined
    with single spaces, which also removes leading and trailing spaces.

    Parameters
    ----------
    text : str
        The input string to be processed.

    Returns
    -------
    str
        The runs of letters found in the input, joined by single spaces.
    """
    letters_and_spaces = re.compile("[^a-zA-Z]").sub(" ", text)
    words = letters_and_spaces.split()
    return " ".join(words)
def sanitize_whitespace(input_string):
    """
    Remove some extra whitespace and fix stray dots.

    A line that consists of a single dot is removed, and the dot is
    appended to the closest preceding non-empty line. If there is no
    such line (the dot comes before any text), the dot is dropped.
    Finally, leading and trailing whitespace is stripped from the output.

    Parameters
    ----------
    input_string : str
        The string to be sanitized.

    Returns
    -------
    str
        The sanitized string.
    """
    # Build a new list instead of removing items from the one being
    # iterated: in-place removal skips the element after each dot line.
    kept_lines = []
    last_text_index = None  # position in kept_lines of the latest non-empty line
    for line in input_string.split('\n'):
        if line == '.':
            # Attach only to an earlier line; never wrap around to the
            # end of the text when the dot precedes all content.
            if last_text_index is not None:
                kept_lines[last_text_index] += '.'
            continue
        kept_lines.append(line)
        if line != '':
            last_text_index = len(kept_lines) - 1
    return '\n'.join(kept_lines).strip()
def sanitize_addresses(input_string):
    """
    Clean up address markers left in a text.

    The following substitutions are applied in order: runs of '<' are
    collapsed to a single '<', runs of '>' are collapsed to a single
    '>', every '<emailaddress>' placeholder is deleted, and every
    occurrence of 'Enron' or 'enron' is deleted.

    Parameters
    ----------
    input_string : str
        The string to be sanitized.

    Returns
    -------
    str
        The sanitized string.
    """
    # Order matters: the brackets must be collapsed before the
    # placeholder is searched for, so '<<emailaddress>>' is caught too.
    substitutions = (
        (r'<+', '<'),
        (r'>+', '>'),
        (r'<emailaddress>', ''),
        (r'[Ee]nron', ''),
    )
    cleaned = input_string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def sanitize_url(input_string):
    """
    Delete every occurrence of the text 'urladdress' from a string.

    Parameters
    ----------
    input_string : str
        The string to be sanitized.

    Returns
    -------
    str
        The input string without any 'urladdress' occurrences.
    """
    # NOTE(review): only the bare word is removed, so a '<urladdress>'
    # placeholder leaves '<>' behind -- confirm this is intended.
    return input_string.replace('urladdress', '')
def dataset_split(dataset, percent=0.2, keep_index=False):
    """
    Split a dataset into train and test sets.

    It is a wrapper for sklearn's train_test_split() that also resets
    the index. The split uses a fixed random_state, so it is
    reproducible.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The dataset to split.
    percent : int or float, default 0.2
        The share of the original dataset to use as the test set.
        A value strictly between 0 and 1 is taken as a fraction
        (0.2 means 20%); any other value is taken as a percentage
        (20 means 20%).
    keep_index : bool, default False
        If True, keep the old index in a new column, otherwise drop it.

    Returns
    -------
    train : pandas.DataFrame
        The result train set.
    test : pandas.DataFrame
        The result test set.
    """
    # The default (0.2) is a fraction while explicit callers pass a
    # percentage; dividing a fraction by 100 again would shrink the
    # test set to 0.2% of the data.
    if 0 < percent < 1:
        test_fraction = percent
    else:
        test_fraction = percent / 100
    train, test = train_test_split(dataset, test_size=test_fraction, random_state=1746)
    # reset_index(drop=False) moves the old index into a column.
    drop_old_index = not keep_index
    train = train.reset_index(drop=drop_old_index)
    test = test.reset_index(drop=drop_old_index)
    return train, test
def dataset_add_columns(dataset, columns, column_names, position=0):
    """
    Insert extra columns into a dataset.

    This is mostly used to streamline the addition of class and id
    columns to datasets that contain only features.

    Each column is inserted at the same position, so the new columns
    end up in the reverse order of the input lists. By default they are
    placed at the beginning of the DataFrame. The DataFrame is modified
    in place and also returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The dataset to add the columns.
    columns : list of pandas.Series
        The values of the columns to be added.
    column_names : list of str
        The names of the columns to be added.
    position : int, default 0
        The position on the column index to put the new columns.
        The default is at the beginning (position 0).

    Returns
    -------
    pandas.DataFrame
        The initial DataFrame with the added columns.

    Raises
    ------
    ValueError
        If columns and column_names are not the same size.
    """
    if len(column_names) != len(columns):
        raise ValueError("The lists of names and values have to be the same size.")

    for values, label in zip(columns, column_names):
        dataset.insert(position, label, values)
    return dataset
def separate_features_target(dataframe, num_cols_ignore=2, class_col_name='email_class'):
    """
    Split a DataFrame into its feature columns and its target column.

    It assumes that any non-feature columns are in the beginning of
    the dataset (right after the index), so the features are all the
    columns after the first ``num_cols_ignore`` ones. The target is
    looked up by name.

    Essentially it is the inverse of a specific case of
    dataset_add_columns().

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The DataFrame with data to split.
    num_cols_ignore : int, default 2
        The number of non-feature columns to skip.
    class_col_name : str, default 'email_class'
        The name of the target column.

    Returns
    -------
    dict
        {'features': pandas.DataFrame,
         'target': pandas.Series}
        A dictionary containing the features and target.
    """
    feature_labels = dataframe.columns[num_cols_ignore:]
    features = dataframe[feature_labels]
    target = dataframe[class_col_name]
    return {'features': features, 'target': target}
"""
Missing Values
"""
def impute_missing(train, test, strategy='mean'):
    """
    Fill the missing values of a dataset using the provided strategy.

    The sklearn SimpleImputer instance is fitted on the train data
    only and is then used to transform the test data too.

    Parameters
    ----------
    train : pandas.DataFrame
        The train set to process.
    test : pandas.DataFrame
        The test set to process.
    strategy : {'mean', 'median', 'most_frequent'}
        This parameter will be passed to SimpleImputer to determine
        the imputation method.

    Returns
    -------
    train : pandas.DataFrame
        The imputed training set.
    test : pandas.DataFrame
        The imputed test set.

    Notes
    -----
    Both results are rebuilt from the imputer output using the train
    set's column labels; the original row index is not carried over.
    """
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    column_labels = train.columns
    train = pd.DataFrame(imputer.fit_transform(train), columns=column_labels)
    test = pd.DataFrame(imputer.transform(test), columns=column_labels)
    return train, test
def word_stemming(text):
    """
    Stem every word of a tokenized text with the Porter stemmer.

    Parameters
    ----------
    text : iterable of str
        The words to stem, one item per word. Passing a plain string
        would stem it character by character.

    Returns
    -------
    list of str
        The stemmed words, in the input order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, text))