-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSimilarity.py
86 lines (72 loc) · 2.36 KB
/
Similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from ReadDocument import ReadDocument
import spacy
class Similarity():
    """Estimate how much of one document is covered by another.

    The pipeline implemented by :meth:`similarValue` is: read both files,
    drop English stop words, stem the remaining words, then compare the
    documents sentence by sentence using spaCy vector similarity.
    """

    # Both resources are loaded once, when the class is defined, and are
    # shared by every instance. Loading them is expensive, so the methods
    # below reuse these attributes instead of loading them again per call.
    nlp = spacy.load('en_vectors_web_lg')
    stop_words = set(stopwords.words('english'))

    def toString(self, listName):
        """Concatenate words into one string, each followed by a space.

        Args:
            listName: iterable of strings.

        Returns:
            The joined string. Every word is followed by one space, so a
            non-empty input yields a trailing space (``["a", "b"]`` gives
            ``"a b "``); an empty input yields ``""``.
        """
        return "".join(word + " " for word in listName)

    def removeStopwords(self, content):
        """Return ``content`` with English stop words removed.

        The text is split on single spaces and each piece is compared
        against ``stop_words`` exactly as written (the comparison is
        case-sensitive and does not strip punctuation).

        Args:
            content: the text to filter.

        Returns:
            The remaining words joined by :meth:`toString`.
        """
        words = content.split(' ')
        keptWords = [word for word in words if word not in self.stop_words]
        return self.toString(keptWords)

    def findMeasure(self, string1, string2, threshold=0.75):
        """Return the fraction of sentences in ``string1`` matched in ``string2``.

        Both texts are split into sentences on ``'.'``. A sentence of
        ``string1`` counts as matched when its spaCy similarity to at least
        one sentence of ``string2`` is strictly greater than ``threshold``.

        Args:
            string1: text whose sentences are being looked for.
            string2: text searched for similar sentences.
            threshold: similarity a pair must exceed to count as a match.

        Returns:
            ``matched / total`` over the non-blank sentences of ``string1``,
            or ``0.0`` when ``string1`` contains no non-blank sentence.
        """
        # Splitting on '.' leaves an empty fragment after a trailing period
        # (and between consecutive periods). Such fragments can never match,
        # so counting them would deflate the score of otherwise identical
        # documents; drop them before measuring.
        sentences1 = [s for s in string1.split('.') if s.strip()]
        sentences2 = [s for s in string2.split('.') if s.strip()]
        if not sentences1:
            return 0.0

        # Parse every sentence exactly once rather than once per pair.
        docs1 = [self.nlp(sentence) for sentence in sentences1]
        docs2 = [self.nlp(sentence) for sentence in sentences2]

        # any() stops at the first sufficiently similar sentence.
        matched = sum(
            1
            for doc1 in docs1
            if any(doc1.similarity(doc2) > threshold for doc2 in docs2)
        )
        return matched / len(docs1)

    def stemWords(self, document):
        """Return ``document`` with every space-separated word stemmed.

        Args:
            document: the text to stem.

        Returns:
            The Porter-stemmed words, each preceded by one space (so the
            result starts with a space).
        """
        stemmer = PorterStemmer()
        return "".join(
            " " + stemmer.stem(word) for word in document.split(' ')
        )

    def similarValue(self, file1, file2):
        """Compute the similarity measure between two files.

        Each file is read through ``ReadDocument.readFile``, stripped of
        stop words, stemmed, and then compared with :meth:`findMeasure`.

        Args:
            file1: first file, passed as-is to ``ReadDocument.readFile``.
            file2: second file, passed as-is to ``ReadDocument.readFile``.

        Returns:
            The value returned by :meth:`findMeasure` for the two
            preprocessed texts.
        """
        readDoc = ReadDocument()
        fileContent1 = readDoc.readFile(file1)
        fileContent2 = readDoc.readFile(file2)

        file1AfterStemming = self.stemWords(self.removeStopwords(fileContent1))
        file2AfterStemming = self.stemWords(self.removeStopwords(fileContent2))

        return self.findMeasure(file1AfterStemming, file2AfterStemming)