-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
148 lines (104 loc) · 3.39 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# imports
import configs
from utils import load_pkl_file
from utils import SimsModel
import sys
import logging
from itertools import chain
from six.moves import cPickle as pickle
from collections import Counter
from operator import itemgetter
from random import choice
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import wordnet as wn
# setup
# Configure the root logger to emit DEBUG-level messages. This is a
# module-level statement, so it takes effect as soon as the module is imported.
logging.basicConfig(level=logging.DEBUG)
# classes:
class Frequencies(object):
    """Word-frequency table loaded from ``configs.FREQUENCIES_FILE_PATH``.

    The table is whatever ``load_pkl_file`` returns; it is only ever accessed
    by subscripting with a word (presumably a word -> count mapping; confirm
    against the code that produces the pickle).
    """

    def __init__(self):
        # Load eagerly so later lookups never touch the disk.
        self.frequencies = self.get_frequencies()

    def get_frequencies(self):
        """Load and return the frequency table from the configured pickle."""
        logging.debug('loading frequencies')
        return load_pkl_file(configs.FREQUENCIES_FILE_PATH)

    def get_frequency(self, word, default=0):
        """Return the stored frequency of ``word``.

        Args:
            word: The key to look up in the frequency table.
            default: Value returned when ``word`` is not in the table.
                Defaults to 0, i.e. an unseen word is treated as having
                never occurred.

        Returns:
            The table entry for ``word``, or ``default`` if it is missing.
        """
        # Candidate words (e.g. WordNet lemma names) are not guaranteed to be
        # in the table; a missing entry must not abort the whole run.
        try:
            return self.frequencies[word]
        except KeyError:
            return default
# functions
def get_synonyms(word):
    """Return the distinct, lower-cased WordNet lemma names related to ``word``.

    Every synset returned by ``wn.synsets(word)`` contributes its lemma names.
    Names are lower-cased *before* de-duplication and before ``word`` itself
    is removed, so case variants (e.g. ``Paris`` for ``paris``) can neither
    appear twice nor bring the original word back as its own synonym.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of unique lower-cased lemma names, excluding ``word``
        (compared case-insensitively). Empty when WordNet has no synsets for
        ``word``. Lemma names are returned as WordNet spells them.
    """
    logging.debug('getting synonyms of word: {}'.format(word))
    lemma_names = chain.from_iterable(
        synset.lemma_names() for synset in wn.synsets(word))
    synonyms = {name.lower() for name in lemma_names}
    # Drop the word itself regardless of the case WordNet stores it in.
    synonyms.discard(word.lower())
    # Sorted so the result does not depend on set iteration order.
    return sorted(synonyms)
def filter_frequencies(words, max_word_frequency, frequencies):
    """Keep only the words whose frequency does not exceed a ceiling.

    Args:
        words: Iterable of candidate words.
        max_word_frequency: Inclusive upper bound on the accepted frequency.
        frequencies: Object exposing ``get_frequency(word)``.

    Returns:
        A new list with the words whose frequency is
        ``<= max_word_frequency``, in their original order.
    """
    logging.debug('filtering frequencies')
    return [candidate
            for candidate in words
            if frequencies.get_frequency(candidate) <= max_word_frequency]
def filter_similarities(main_word, synonyms, min_similarity, model):
    """Keep the synonyms that the model rates as close enough to ``main_word``.

    Args:
        main_word: The reference word every candidate is compared against.
        synonyms: Iterable of candidate words.
        min_similarity: Inclusive lower bound on the accepted score.
        model: Object exposing ``get_similarity(main_word, candidate)``.

    Returns:
        A new list with the candidates whose score is ``>= min_similarity``,
        in their original order.
    """
    logging.debug('filtering similarities')
    close_enough = []
    for candidate in synonyms:
        score = model.get_similarity(main_word, candidate)
        if score >= min_similarity:
            close_enough.append(candidate)
    return close_enough
def convert_tag(tag):
    """Collapse a part-of-speech tag to its first character.

    For example ``'NN'`` and ``'NNS'`` both become ``'N'``.

    Args:
        tag: A non-empty tag string.

    Returns:
        The first character of ``tag``.

    Raises:
        IndexError: If ``tag`` is empty.
    """
    coarse_tag = tag[0]
    return coarse_tag
def filter_pos_tags(main_word, synonyms, main_pos_tag=None):
    """Keep the synonyms whose coarse part-of-speech tag matches the main word's.

    Each candidate is tagged on its own (as a one-token sequence) with
    ``pos_tag`` and the resulting tag is reduced with ``convert_tag``.

    Args:
        main_word: The word being replaced; only tagged here when
            ``main_pos_tag`` is not supplied.
        synonyms: Iterable of candidate words.
        main_pos_tag: Coarse tag to match against. When ``None`` it is
            computed from ``main_word`` in isolation.

    Returns:
        A new list with the candidates whose coarse tag equals the target
        tag, in their original order.
    """
    logging.debug('filtering pos tags')

    def coarse_tag_of(token):
        # Tag the token as a one-word sequence and keep only the tag part.
        tagged = pos_tag([token])
        return convert_tag(tagged[0][1])

    target_tag = main_pos_tag
    if target_tag is None:
        target_tag = coarse_tag_of(main_word)

    matching = []
    for candidate in synonyms:
        if coarse_tag_of(candidate) == target_tag:
            matching.append(candidate)
    return matching
def get_advanced_words(word, frequencies, sims_model, word_tag=None):
    """Return replacement candidates for ``word`` that pass every filter.

    The synonyms of ``word`` are narrowed in three steps, in this order:
    similarity to ``word`` (``configs.MIN_WORD2VEC_SIMILARITY``), frequency
    ceiling (``configs.MAX_WORD_FREQUENCY``), and matching part-of-speech tag.

    Args:
        word: The word to find replacements for.
        frequencies: Object exposing ``get_frequency(word)``.
        sims_model: Object exposing ``get_similarity(word, candidate)``.
        word_tag: Coarse POS tag of ``word``; forwarded to
            ``filter_pos_tags`` as ``main_pos_tag``.

    Returns:
        The list of candidates that survived all three filters.
    """
    logging.debug('getting advanced words from: {}'.format(word))
    candidates = filter_similarities(
        word, get_synonyms(word), configs.MIN_WORD2VEC_SIMILARITY, sims_model)
    candidates = filter_frequencies(
        candidates, configs.MAX_WORD_FREQUENCY, frequencies)
    return filter_pos_tags(word, candidates, main_pos_tag=word_tag)
def tag_text(text):
    """Tokenize ``text`` and pair every token with its coarse POS tag.

    Args:
        text: The raw text to process.

    Returns:
        A list of ``(token, coarse_tag)`` tuples in token order, where
        ``coarse_tag`` is the ``pos_tag`` result reduced by ``convert_tag``.
    """
    logging.debug('pos tagging text')
    tokens = word_tokenize(text)
    coarse_tagged = []
    for token, fine_tag in pos_tag(tokens):
        coarse_tagged.append((token, convert_tag(fine_tag)))
    return coarse_tagged
def is_accepted(word, tag):
    """Decide whether a token is eligible for synonym replacement.

    Currently a pass-through: every token is accepted and neither argument
    is inspected.

    Args:
        word: The token text.
        tag: The token's coarse POS tag.

    Returns:
        Always ``True``.
    """
    return True
def transform_sentence(sentence, frequencies, sims_model):
    """Rewrite ``sentence`` by swapping words for randomly chosen alternatives.

    Each token is tagged; for accepted tokens a list of replacement candidates
    is requested from ``get_advanced_words`` (using the lower-cased token).
    When candidates exist one is picked at random and title-cased if the
    original token started with an upper-case letter; otherwise the token is
    kept unchanged. Tokens are re-joined with single spaces, except that
    tokens consisting solely of ``?``, ``!``, ``.`` or ``,`` attach directly
    to the preceding token.

    Args:
        sentence: The text to transform.
        frequencies: Object exposing ``get_frequency(word)``.
        sims_model: Object exposing ``get_similarity(word, candidate)``.

    Returns:
        The transformed sentence as a single string ('' for no tokens).
    """
    logging.debug('transforming sentence')
    attaching_chars = '?!.,'
    pieces = []
    for word, tag in tag_text(sentence):
        advanced_words = None
        if is_accepted(word, tag):
            advanced_words = get_advanced_words(
                word.lower(), frequencies, sims_model, word_tag=tag)

        if advanced_words:
            new_word = choice(advanced_words)
            # Preserve the capitalization of the token being replaced.
            if word[0].isupper():
                new_word = new_word.title()
        else:
            new_word = word

        # Per-character test so that multi-character punctuation such as
        # '...' also attaches to the previous token.
        is_punctuation = all(char in attaching_chars for char in word)
        # Only separate from a preceding token; never pad the start of the
        # sentence, so nothing has to be stripped afterwards (stripping the
        # first character blindly would eat a leading punctuation token).
        if pieces and not is_punctuation:
            pieces.append(' ')
        pieces.append(new_word)
    return ''.join(pieces)
def main():
    """Command-line entry point: transform the sentence given as argv[1].

    Prints the transformed sentence to standard output.

    Raises:
        SystemExit: With a usage message (non-zero exit status) when no
            sentence argument was supplied.
    """
    logging.debug('running main')
    # Validate the command line first so a missing argument is reported
    # immediately, before Frequencies and SimsModel are constructed.
    if len(sys.argv) < 2:
        sys.exit('usage: python main.py "<sentence>"')
    sentence = sys.argv[1]

    frequencies = Frequencies()
    sims_model = SimsModel()
    transformed = transform_sentence(sentence, frequencies, sims_model)
    print(transformed)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()