-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidf.py
32 lines (22 loc) · 781 Bytes
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from gensim.models import Word2Vec
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")

# Find the line of feeds.txt most similar to the query text in feeds1.txt,
# using TF-IDF vectors and their dot product as the similarity score.

# Candidate phrases: one per line of feeds.txt (trailing newlines are kept).
with open('feeds.txt') as my_file:
    my_phrases = my_file.readlines()

# Query phrase(s): lines of feeds1.txt. Only the first line is used as the
# query; the original code read this file into `phrases` but then
# concatenated an unrelated empty list named `phrase`, so the query text
# never reached the vectorizer.
with open('feeds1.txt') as my_file1:
    phrases = my_file1.readlines()

if not phrases:
    raise ValueError("feeds1.txt is empty: no query phrase to match")
if not my_phrases:
    raise ValueError("feeds.txt is empty: no candidate phrases to match")

# Fit on queries + candidates together so both share one vocabulary.
# Query rows come first, so candidate i lives at row len(phrases) + i.
vectorizer = TfidfVectorizer(min_df=3, stop_words='english')
all_phrases = phrases + my_phrases
my_features = vectorizer.fit_transform(all_phrases)

# Score the first query row against the candidate rows only. Slicing from
# len(phrases) (not 1) keeps the score index aligned with my_phrases even
# when feeds1.txt has more than one line.
# .dot()/.toarray() are used instead of `*`/.A so the expression is a matrix
# product for both scipy sparse matrices and sparse arrays.
first_candidate_row = len(phrases)
query_row = my_features[0, :]
candidate_rows = my_features[first_candidate_row:, :]
scores = query_row.dot(candidate_rows.T).toarray()[0]

# Index of the best-scoring candidate, directly usable on my_phrases.
best_score = np.argmax(scores)
answer = my_phrases[best_score]
print(answer)