sent2vec.py
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import numpy as np


def sent2vec(sentence, model, method='tfidf', **kwargs):
"""
Generic function to convert a sentence to a vector using
avg or TFIDF vecorization
"""
    ##### It is recommended to pass your own stopwords #####
    stopwords = kwargs.get('stopwords')
    if stopwords is None:
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        stopwords = ENGLISH_STOP_WORDS
    ##### It is recommended to pass your own tokenizer #####
    tokenizer = kwargs.get('tokenizer')
    if tokenizer is None:
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(sentence)  # Tokenize the sentence
    words = {each for each in words if each not in stopwords}  # Remove all the stopwords
    V = []
    for word in words:  # Collect the vector of every in-vocabulary word
        if word in model:
            V.append(model[word])
    V = np.array(V)
    # If no word was present in the model, or a blank
    # sentence was passed, return an all-zero vector
    if V.shape[0] == 0:
        # If the model produces vectors of a different size,
        # pass it via `shape`; the default is 300
        custom_shape = kwargs.get('shape', 300)
        return np.zeros(custom_shape)
    # At least one word in the sentence was vectorized properly
    if method.lower() == 'avg':
        V = V.sum(axis=0)
        return V / np.sqrt((V ** 2).sum())  # L2-normalized sum of word vectors
    elif method.lower() == 'tfidf':
        # Weighted average: sum(tfidf(w) * vec(w)) / sum(tfidf(w))
        tfidf_model = kwargs.get('tfidf_model')  # Fitted TF-IDF vectorizer
        if tfidf_model:  # If a TF-IDF model was passed
            tfidf_vec = tfidf_model.transform([sentence])  # TF-IDF weights for the sentence
            tfidfs = []
            for word in words:  # Same iteration as above, so weights align with V
                if word in model:
                    indx = tfidf_model.vocabulary_.get(word, -1)
                    if indx != -1:
                        tfidfs.append(tfidf_vec[0, indx])
                    else:
                        tfidfs.append(0.0)
            tfidfs = np.array(tfidfs)
            denominator = tfidfs.sum()
            if denominator == 0.0:  # No word is represented in both TF-IDF and word2vec
                # Better than skipping the sentence altogether
                denominator = tfidf_model.idf_.min() * 0.01
            numerator = V * tfidfs.reshape(V.shape[0], 1)
            numerator = numerator.sum(axis=0)
            return numerator / denominator
        else:
            raise ValueError('No tfidf model is present')


if __name__ == '__main__':
    text = 'Hello, how are you today?'
    model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
    vec = sent2vec(text, model, method='avg',
                   stopwords=ENGLISH_STOP_WORDS)
    print(vec.shape)
    print(vec)
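
    # --- TF-IDF weighted usage: a minimal sketch, not part of the original
    # script. It assumes a TfidfVectorizer fitted on your own corpus; the
    # two-sentence `corpus` below is only a placeholder.
    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = ['Hello, how are you today?',
              'I am doing well, thank you.']
    # lowercase=False keeps the vectorizer's vocabulary aligned with the
    # case-preserving RegexpTokenizer used inside sent2vec
    tfidf = TfidfVectorizer(lowercase=False)
    tfidf.fit(corpus)
    tfidf_vec = sent2vec(text, model, method='tfidf', tfidf_model=tfidf,
                         stopwords=ENGLISH_STOP_WORDS)
    print(tfidf_vec.shape)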