# tokenizers.py
# tokenize_dataset(sql="select id,details->>'title' as title,details->>'text' as body from amazon_users_reviews limit 400 --where added_at<='2020-03-06 19:58:34.422863'",name='amazon_users_reviews_till_2020-03-06 19_58_34.422863',size=1000,exp_length=19_400_000)
# ----------------------------------------------------------------------------------------------------------------------------
# LOGGING
# ----------------------------------------------------------------------------------------------------------------------------
import logging
logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------------------------------------------------------------
# Packages
# ----------------------------------------------------------------------------------------------------------------------------
from .python import ensure_installed
ensure_installed("psycopg2 nltk spacy")
# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
# ----------------------------------------------------------------------------------------------------------------------------
from typing import *
import psycopg2
import psycopg2.extras  # NamedTupleCursor lives in psycopg2.extras; it is not pulled in by "import psycopg2" alone
from . import db
from .strings import *
from collections import defaultdict
import nltk
import spacy
nlp = None
vars = "NUM_AS_SEPARATE_WORD,NUM_OCCS,NUM_FIRSTLETTER_CAPITAL,NUM_ALLLETTERS_CAPITAL,INWORD_ABSOLUTE_POSITION,INWORD_RELATIVE_POSITION,NUM_FIRSTWORD_INSENTENCE,NUM_LASTWORD_INSENTENCE,INSENTENCE_ABSOLUTE_POSITION,INSENTENCE_RELATIVE_POSITION,NUM_PREV_WORDS,NUM_PREV_SENTENCE_WORDS".split(
","
)
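# Attribute names of the per-morpheme statistics accumulated by AdvancedTokenizer below;
# save_tokens_to_file() and load_tokens() iterate over this list to decide which attributes to persist and restore.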
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm
class AdvancedTokenizer:
    def __init__(self, language_model="en_core_web_sm"):
        global nlp
        if nlp is None:
            nlp = spacy.load(language_model)
        self.NUM_AS_SEPARATE_WORD = defaultdict(int)
        self.NUM_OCCS = defaultdict(int)
        self.NUM_FIRSTLETTER_CAPITAL = defaultdict(int)
        self.NUM_ALLLETTERS_CAPITAL = defaultdict(int)
        self.INWORD_ABSOLUTE_POSITION = defaultdict(int)
        self.INWORD_RELATIVE_POSITION = defaultdict(int)
        self.NUM_FIRSTWORD_INSENTENCE = defaultdict(int)
        self.NUM_LASTWORD_INSENTENCE = defaultdict(int)
        self.INSENTENCE_ABSOLUTE_POSITION = defaultdict(int)
        self.INSENTENCE_RELATIVE_POSITION = defaultdict(int)
        self.NUM_PREV_WORDS = dict()  # defaultdict(lambda: defaultdict(int))
        self.NUM_PREV_SENTENCE_WORDS = dict()  # defaultdict(lambda: defaultdict(int))
    def tokenize(self, text):
        # --------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # 1) Get the full list of unique genuine words along with their frequencies. Genuine words should not contain punctuation (except -)
        #    and are usually separated by a space, comma, semicolon, or hyphen (with spaces on each side).
        #    Also count how many times each word starts with a capital letter while not being the first word of the sentence, and how many times it is written in all capitals.
        # 2) For every genuine word with a number of occurrences > 1, count its consecutive symbol sequences of length MIN_MORPHEME_LENGTH=2 to MAX_MORPHEME_LENGTH.
        # 3) After aggregating morpheme occurrences over all words in the corpus (independent of the frequencies of the words themselves - just the frequencies
        #    of the morphemes in the list of unique words!),
        #    create morphemes as the longest stable letter sequences that occur in different words. With this approach, morphemes should capture
        #    some meaning, like -ed in verbs, or ha[d], ha[s], ha[ve].
        #    Edge cases: a word that coincides with a single letter (i), a morpheme that coincides with some word or letter.
        # 4) Finally, each letter, morpheme, and word gets its own embedding.
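        # Illustrative example (not part of the original logic): for the token "has", the nested
        # i/j loops below enumerate the substrings h, ha, has, a, as, s. Aggregating such substrings
        # across many distinct words is what lets frequent affixes such as -ed or -ing stand out
        # as candidate morphemes.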
        cur_sentences = nltk.sent_tokenize(text)
        last_sentence_word = None
        for s, sent in enumerate(cur_sentences):
            stext = sent.strip()
            # print(stext)
            words = merge_punctuation_signs(nltk.word_tokenize(stext))
            k = len(words)
            last_word = None
            for w, word in enumerate(words):
                l = len(word)
                for i in range(l):
                    for j in range(1, l - i + 1):
                        morpheme = word[i : i + j]
                        # if len(morpheme)>0:
                        if j == 1:
                            if morpheme.isupper():
                                FIRSTLETTER_CAPITAL = True
                                ALLLETTERS_CAPITAL = False
                            else:
                                FIRSTLETTER_CAPITAL = False
                                ALLLETTERS_CAPITAL = False
                        else:
                            if FIRSTLETTER_CAPITAL:
                                if morpheme.isupper():
                                    ALLLETTERS_CAPITAL = True
                                else:
                                    ALLLETTERS_CAPITAL = False
                        # print(morpheme)
                        base_morpheme = morpheme.lower()
                        # stats
                        if w == 0:
                            self.NUM_FIRSTWORD_INSENTENCE[base_morpheme] += 1
                        elif w == k - 1:
                            # print(word,base_morpheme)
                            self.NUM_LASTWORD_INSENTENCE[base_morpheme] += 1
                        if FIRSTLETTER_CAPITAL:
                            self.NUM_FIRSTLETTER_CAPITAL[base_morpheme] += 1
                        if ALLLETTERS_CAPITAL:
                            self.NUM_ALLLETTERS_CAPITAL[base_morpheme] += 1
                        self.INWORD_ABSOLUTE_POSITION[base_morpheme] += i + 1
                        self.INWORD_RELATIVE_POSITION[base_morpheme] += (i + 1) / l
                        self.INSENTENCE_ABSOLUTE_POSITION[base_morpheme] += w + 1
                        self.INSENTENCE_RELATIVE_POSITION[base_morpheme] += (w + 1) / k
                        self.NUM_OCCS[base_morpheme] += 1
                        if i == 0:
                            if j == l:
                                self.NUM_AS_SEPARATE_WORD[base_morpheme] += 1
                                if last_word:
                                    if base_morpheme not in self.NUM_PREV_WORDS:
                                        self.NUM_PREV_WORDS[base_morpheme] = defaultdict(int)
                                    self.NUM_PREV_WORDS[base_morpheme][last_word] += 1
                                last_word = base_morpheme
                                if w == 0:
                                    if last_sentence_word:
                                        if last_word not in self.NUM_PREV_SENTENCE_WORDS:
                                            self.NUM_PREV_SENTENCE_WORDS[last_word] = defaultdict(int)
                                        self.NUM_PREV_SENTENCE_WORDS[last_word][last_sentence_word] += 1
                                if last_word.isalpha() or last_word.isnumeric():
                                    last_sentence_word = last_word
    def tokenize_db_reviews(self, sql: str, tokens: dict, save_as: str = None, chunk_size: int = 1000, exp_length: int = 10000, newlines=None):
        nchunks = 0
        nitems = 0
        cur = db.safe_execute(sql, cursor_factory=psycopg2.extras.NamedTupleCursor, cursor_name="test", return_cursor=True)
        pbar = tqdm(total=exp_length)
        if isinstance(cur, list):
            print(cur)
        else:
            while True:
                chunk = cur.fetchmany(size=chunk_size)
                chunksize = len(chunk)
                if chunksize == 0:
                    break
                for review in chunk:
                    res = ""
                    for text in (review.title, review.body):
                        if newlines:
                            text = text.replace(newlines, "\n")
                        text = fix_broken_sentences(remove_videos(fix_quotations(fix_spaces(fix_duplicate_tokens(fix_html(text))))))
                        if len(text) > 1:
                            # print(text)
                            res += ("" if len(res) == 0 else " ") + sentencize_text(text)
                    if len(res) > 0:
                        res = ensure_space_after_comma(res)
                        # print(res)
                        self.tokenize(res)
                # --------------------------------------------------------------------------------------------------------------------------------------------------------------------
                # Update stats
                # --------------------------------------------------------------------------------------------------------------------------------------------------------------------
                nitems = nitems + chunksize
                pbar.update(chunksize)  # advance by the chunk size; tqdm.update() expects an increment, not the running total
                if nitems > exp_length:
                    pbar.total = nitems * 1.1
                    pbar.refresh()
                nchunks = nchunks + 1
                logger.info(f"nchunks={nchunks},nitems={nitems}")
            pbar.total = nitems
            pbar.refresh()
            pbar.close()
            cur.close()
        if save_as:
            self.save_tokens_to_file(file_name=save_as)
    def save_tokens_to_file(self, file_name: str, desired_vars: list = None):
        # --------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Save computed data
        # --------------------------------------------------------------------------------------------------------------------------------------------------------------------
        import pickle, gzip
        whole = dict()
        if desired_vars is None:
            vars_to_use = vars
        else:
            vars_to_use = desired_vars
        for var in vars_to_use:
            whole[var] = getattr(self, var)
        with gzip.open(f"{file_name}.pickle", "wb") as handle:
            pickle.dump(whole, handle, protocol=pickle.HIGHEST_PROTOCOL)
    def load_tokens(self, name: str, tokens: dict):
        import pickle, gzip
        with gzip.open(f"{name}.pickle", "rb") as handle:
            whole = pickle.load(handle)
        for var in vars:
            if var in whole:
                tokens[var] = whole[var]
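# ----------------------------------------------------------------------------------------------------------------------------
# Usage sketch
# ----------------------------------------------------------------------------------------------------------------------------
# Minimal illustration of the class above, not part of the original module logic. It assumes the
# module is run as part of its package (python -m <package>.tokenizers) so the relative imports
# resolve, that the nltk "punkt" data and the spacy "en_core_web_sm" model are installed, and that
# the sample text and the "demo_tokens" file name are made-up placeholders.
if __name__ == "__main__":
    tokenizer = AdvancedTokenizer()
    tokenizer.tokenize("The camera arrived quickly. The camera works fine so far.")
    # Ten most frequent lowercased substrings seen so far.
    print(sorted(tokenizer.NUM_OCCS.items(), key=lambda kv: kv[1], reverse=True)[:10])
    # Persist the accumulated statistics, then reload them into a plain dict.
    tokenizer.save_tokens_to_file(file_name="demo_tokens")
    tokens = {}
    tokenizer.load_tokens(name="demo_tokens", tokens=tokens)
    print(list(tokens.keys()))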