Preprocessor.py
from collections import defaultdict
import numpy as np
import operator
import pandas as pd
import re
from tqdm import tqdm
# %% Define the preprocessor
class Preprocessor:
    def __init__(self, embeddings: dict):
        self.embeddings = embeddings

    def build_tf_dict(self, sentences: list):
        """
        Build a simple TF (term frequency) dictionary for all words in the provided sentences.
        """
        tf_dict = defaultdict(int)
        for sentence in sentences:
            for word in sentence:
                tf_dict[word] += 1
        return tf_dict
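    # Example (hypothetical input): build_tf_dict([["the", "cat"], ["the"]])
    # yields {"the": 2, "cat": 1} (as a defaultdict).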

    def check_coverage(self, tf_dictionary: dict):
        """
        Build a list of the words that are not covered by the embeddings. The result can be used
        downstream to preprocess them into something known.
        """
        in_vocabulary = defaultdict(int)
        out_of_vocabulary = defaultdict(int)
        in_count = 0
        out_count = 0
        for word in tqdm(tf_dictionary):
            if word in self.embeddings:
                in_vocabulary[word] = self.embeddings[word]
                in_count += tf_dictionary[word]
            else:
                out_of_vocabulary[word] = tf_dictionary[word]
                out_count += tf_dictionary[word]
        percent_tf = len(in_vocabulary) / len(tf_dictionary)
        percent_all = in_count / (in_count + out_count)
        print('Found embeddings for {:.2%} of vocabulary and {:.2%} of all text'.format(percent_tf, percent_all))
        return sorted(out_of_vocabulary.items(), key=operator.itemgetter(1))[::-1]
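    # The return value is a list of (word, frequency) pairs sorted by descending
    # frequency, e.g. [('didnt', 421), ('colour', 58), ...] (illustrative values only).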

    def clean_punctuation(self, text: str):
        """
        Replace or strip punctuation so that more tokens match the embedding vocabulary.
        """
        result = text
        # Punctuation that usually joins words: replace with a space so both halves survive as tokens.
        for punct in "/-'":
            result = result.replace(punct, ' ')
        # '&' is isolated as its own token, since it may have an embedding of its own.
        for punct in '&':
            result = result.replace(punct, f' {punct} ')
        # Everything else is dropped outright.
        for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
            result = result.replace(punct, '')
        return result
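    # Example (hypothetical): clean_punctuation("it's a test-case!") returns "it s a test case".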

    def clean_digits(self, text: str):
        # Placeholder: digit handling is not implemented yet; the text passes through unchanged.
        result = text
        return result
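    # One common approach (an assumption, not implemented above) is to collapse long
    # digit runs into '#' placeholders, e.g. re.sub('[0-9]{5,}', '#####', text),
    # since pretrained vocabularies often contain such masked-number tokens.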

    def clean_misspelling(self, text: str):
        """
        Replace common misspellings and British spellings with forms more likely to be embedded.
        """
        mispell_dict = {'colour': 'color',
                        'centre': 'center',
                        'didnt': 'did not',
                        'doesnt': 'does not',
                        'isnt': 'is not',
                        'shouldnt': 'should not',
                        'favourite': 'favorite',
                        'travelling': 'traveling',
                        'counselling': 'counseling',
                        'theatre': 'theater',
                        'cancelled': 'canceled',
                        'labour': 'labor',
                        'organisation': 'organization',
                        'wwii': 'world war 2',
                        'citicise': 'criticize',
                        'instagram': 'social medium',
                        'whatsapp': 'social medium',
                        'snapchat': 'social medium'}

        def _get_mispell(mispell_dict):
            mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
            return mispell_dict, mispell_re

        mispellings, mispellings_re = _get_mispell(mispell_dict)

        def replace(match):
            return mispellings[match.group(0)]

        return mispellings_re.sub(replace, text)
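    # Example (hypothetical): clean_misspelling("whatsapp isnt working") returns
    # "social medium is not working". Note the pattern has no word boundaries, so
    # substrings are replaced too (e.g. "colourful" becomes "colorful").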

    def apply_cleaning_function(self, fn, texts: list, description=""):
        """
        Apply a cleaning function to every text, then report the resulting embedding coverage.
        """
        result = [fn(text) for text in tqdm(texts, desc=description)]
        sentences = [text.split() for text in result]
        tf_dict = self.build_tf_dict(sentences)
        oov = self.check_coverage(tf_dict)
        print(oov[:10])
        return result

    def preprocess_for_embeddings_coverage(self, texts: list):
        """
        Report the baseline embedding coverage, then apply each cleaning step and report how coverage changes.
        """
        result = texts
        sentences = [text.split() for text in result]
        tf_dict = self.build_tf_dict(sentences)
        oov = self.check_coverage(tf_dict)
        result = self.apply_cleaning_function(lambda x: self.clean_punctuation(x), result, "Cleaning punctuation...")
        result = self.apply_cleaning_function(lambda x: self.clean_digits(x), result, "Cleaning numbers...")
        result = self.apply_cleaning_function(lambda x: self.clean_misspelling(x), result, "Cleaning misspelled words...")
        return result
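
# %% Sanity check on toy data (a minimal sketch with made-up embeddings; the real run below uses the GloVe file)
toy_embeddings = {'hello': np.zeros(300, dtype='float32'),
                  'world': np.zeros(300, dtype='float32')}
toy_preprocessor = Preprocessor(toy_embeddings)
print(toy_preprocessor.preprocess_for_embeddings_coverage(["hello, world!", "hello world"]))
# Punctuation cleaning should lift coverage here: "world!" only matches once the '!' is stripped.
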
# %% Load the data
df_train = pd.read_csv("input/train.csv")
# %% Load the embeddings
def load_embeddings(file):
    """
    Load whitespace-separated GloVe-style vectors into a {word: np.ndarray} dict.
    """
    with open(file, encoding="utf8", errors='ignore') as f:
        def get_coefs(word, *arr):
            return word, np.asarray(arr, dtype='float32')
        embeddings = dict(get_coefs(*line.split(" ")) for line in tqdm(f))
    print('Found %s word vectors.' % len(embeddings))
    return embeddings
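# Each line of the embeddings file is expected to look like "token v1 v2 ... v300";
# tokens that themselves contain spaces would likely break this parser.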
embeddings = load_embeddings("input/embeddings/glove.840B.300d/glove.840B.300d.txt")
# %% Preprocess
preprocessor = Preprocessor(embeddings)
questions = df_train["question_text"].values
print(questions[:3])
questions = preprocessor.preprocess_for_embeddings_coverage(questions)
print(questions[:3])
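
# %% A possible next step (an assumption, not part of the original script):
# keep the cleaned text alongside the raw questions for later tokenization.
df_train["question_text_clean"] = questions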