# utils.py
from functools import lru_cache

import gensim
import numpy as np
from gensim.parsing import preprocessing
from nltk import PerceptronTagger, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

wnl = WordNetLemmatizer()
pos_tagger = PerceptronTagger()
# Use a set, under a name that does not shadow the nltk.corpus module, so
# membership tests during tokenization are O(1).
STOPWORDS = set(stopwords.words("english"))
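
# Note: the NLTK resources used above must be downloaded once before this
# module is imported, e.g.:
#   nltk.download("stopwords")
#   nltk.download("wordnet")
#   nltk.download("averaged_perceptron_tagger")
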
def get_top_items(dict_or_list, n=None, sort_by_index=1, ascending=False):
    """
    Return the top `n` items from a dictionary or a list of tuples,
    sorted by the element at `sort_by_index` (for a dict, index 1 is the
    value). If `n` is None or 0, all items are returned, sorted.
    """
    if not n:
        n = len(dict_or_list)
    if isinstance(dict_or_list, dict):
        return dict(
            sorted(
                dict_or_list.items(),
                key=lambda item: item[sort_by_index],
                reverse=not ascending,
            )[:n]
        )
    return sorted(
        dict_or_list, key=lambda item: item[sort_by_index], reverse=not ascending
    )[:n]
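
# A quick sanity check of get_top_items on hypothetical values:
#   get_top_items({"a": 0.2, "b": 0.5, "c": 0.1}, n=2)
#   -> {"b": 0.5, "a": 0.2}
#   get_top_items([("a", 0.2), ("b", 0.5)], n=1, ascending=True)
#   -> [("a", 0.2)]
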
def normalize(dict_or_list):
    """
    Scale the values of a dictionary or list so that they sum to 1.
    None values are excluded from the sum and preserved in the output.
    """
    if isinstance(dict_or_list, dict):
        # Cast to a float array so None becomes NaN and is ignored by
        # np.nansum instead of raising a TypeError.
        factor = 1 / np.nansum(np.array(list(dict_or_list.values()), dtype=float))
        return {
            key: val * factor if val is not None else val
            for key, val in dict_or_list.items()
        }
    factor = 1 / np.nansum(np.array(dict_or_list, dtype=float))
    return [val * factor if val is not None else val for val in dict_or_list]
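
# Example usage of normalize on hypothetical values; None entries are kept
# and excluded from the normalizing sum:
#   normalize([1.0, 3.0, None])
#   -> [0.25, 0.75, None]
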
def tokenize(
    text,
    remove_stopwords=True,
    lower=True,
    deacc=True,
    to_str=False,
    stemming=False,
    lemmatize=False,
    word_min_length=2,
):
    """
    Convert a single sentence into a list of tokens (or a string if `to_str`).
    Tokenizes to alphabetic unicode characters only and, depending on the
    flags, lowercases, removes accents and stopwords, stems and/or lemmatizes.
    """
    tokens = gensim.utils.tokenize(text, lower=lower, deacc=deacc)
    # Drop tokens that are too short, and stopwords if requested.
    tokens = filter(
        lambda x: len(x) >= word_min_length
        and (not remove_stopwords or x not in STOPWORDS),
        tokens,
    )
    filtered_text = " ".join(tokens)
    filters = []
    if stemming:
        filters.append(preprocessing.stem)
    tokens = preprocessing.preprocess_string(filtered_text, filters)
    if lemmatize:
        tokens = lemmatize_words(tokens, wnl, pos_tagger)
    return " ".join(tokens) if to_str else list(tokens)
def _get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS tag."""
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    # Fall through explicitly for tags WordNet has no POS for.
    return None

def get_word_pos(word, pos_tagger=pos_tagger):
    """Return the WordNet POS of `word`, or None if it cannot be determined."""
    if not word:
        return word
    return _get_wordnet_pos(pos_tagger.tag([word])[0][1]) if pos_tagger else None
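
# Example (indicative; the tagger sees the word without sentence context):
#   get_word_pos("running")  # tagged as VBG -> wordnet.VERB ("v")
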
@lru_cache(maxsize=100000)
def lemmatize_word(word, wnl, pos_tagger=None, pos=None):
    """Lemmatize a single word, looking up its POS tag if one is not given."""
    if not word:
        return word
    if not pos:
        pos = get_word_pos(word, pos_tagger)
    return wnl.lemmatize(word, pos) if pos else wnl.lemmatize(word)
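
# lemmatize_word is cached because tagging + lemmatization is relatively
# expensive and real corpora repeat words heavily; lru_cache requires hashable
# arguments, which the wnl/pos_tagger objects above are. For example:
#   lemmatize_word("better", wnl, pos=wordnet.ADJ)  # -> "good"
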
def lemmatize_words(words, wnl, pos_tagger=None):
    """Lazily lemmatize an iterable of words, passing falsy values through."""
    for word in words:
        if not word:
            # Pass falsy values (e.g. empty strings) through unchanged.
            yield word
            continue
        yield lemmatize_word(word, wnl, pos_tagger)
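
# Example with a hypothetical input list; the falsy value passes through:
#   list(lemmatize_words(["cats", "", "mice"], wnl, pos_tagger))
#   -> ["cat", "", "mouse"]
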
def argpartition(l, n, highest=True):
    """
    Get the indices of the `n` items with the highest (or, with
    `highest=False`, lowest) values in the list `l`. If `n` is 0 or
    `n >= len(l)`, the indices of all items are returned unsorted.
    """
    if n >= len(l) or n == 0:
        return list(range(len(l)))
    if highest:
        indices = np.argpartition(l, -n)[-n:]
    else:
        indices = np.argpartition(l, n)[:n]
    return indices
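
# Example usage of argpartition; note np.argpartition only guarantees a
# partition, so the returned indices are not themselves sorted by value:
#   argpartition([1, 5, 3, 4], 2)
#   -> the indices of 5 and 4, e.g. array([3, 1])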