-
Notifications
You must be signed in to change notification settings - Fork 0
/
Summarizer.py
125 lines (93 loc) · 3.23 KB
/
Summarizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import networkx as nx
import itertools
import editdistance
import nltk
import textdistance
import os
def buildGraph(nodes):
    """Build a complete undirected graph weighted by edit distance.

    Every item of ``nodes`` is added as a graph node, and every unordered
    pair of items is joined by an edge whose ``weight`` attribute is the
    value ``editdistance.eval`` returns for that pair.

    Args:
        nodes: Collection of hashable items (``summarize`` passes a list of
            sentence strings). It is read twice: once for the nodes and once
            for the pairs.

    Returns:
        The populated ``networkx.Graph``.
    """
    sentenceGraph = nx.Graph()
    sentenceGraph.add_nodes_from(nodes)
    # Feed (u, v, weight) triples straight into the graph, one per pair.
    sentenceGraph.add_weighted_edges_from(
        (left, right, editdistance.eval(left, right))
        for left, right in itertools.combinations(nodes, 2)
    )
    return sentenceGraph
def buildGraphAlt(nodes):
    """Build a complete undirected graph weighted by Hamming distance.

    Same construction as an all-pairs sentence graph: every item of
    ``nodes`` becomes a node and every unordered pair gets an edge whose
    ``weight`` attribute is ``textdistance.hamming.distance`` of the pair.

    Args:
        nodes: Collection of hashable items (``summarizeAlt`` passes a list
            of sentence strings). It is read twice: once for the nodes and
            once for the pairs.

    Returns:
        The populated ``networkx.Graph``.
    """
    sentenceGraph = nx.Graph()
    sentenceGraph.add_nodes_from(nodes)
    # See https://pypi.org/project/textdistance/ for other distance measures
    # that could be used here instead.
    measure = textdistance.hamming.distance
    for left, right in itertools.combinations(nodes, 2):
        sentenceGraph.add_edge(left, right, weight=measure(left, right))
    return sentenceGraph
def summarize(text, sentenceCount):
    """Return the highest-ranked sentences of a text file as a summary.

    The file is split into sentences with NLTK, the sentences are linked
    into the graph produced by ``buildGraph``, and they are ranked by their
    weighted PageRank score.

    Args:
        text: Path of the text file to summarise.
        sentenceCount: Maximum number of sentences to return. If the file
            has fewer sentences, all of them are returned; a value of zero
            or less yields an empty summary.

    Returns:
        The selected sentences in descending score order, each followed by
        a blank line. An empty string when nothing is selected.
    """
    # Read inside a with-block so the handle is closed promptly.
    with open(text) as sourceFile:
        contents = sourceFile.read()

    sentences = nltk.tokenize.sent_tokenize(contents)
    graph = buildGraph(sentences)
    # NOTE(review): edge weights are distances, so weighted PageRank appears
    # to favour sentences that differ most from the rest - confirm intended.
    scoredSentences = nx.pagerank(graph, weight='weight')

    # Sorting (score, sentence) tuples breaks score ties on the sentence text.
    rankedSentences = sorted(
        ((score, sentence) for sentence, score in scoredSentences.items()),
        reverse=True,
    )

    # Slice rather than index so asking for more sentences than exist does
    # not raise IndexError; max() stops a negative count from slicing
    # relative to the end of the list.
    selected = rankedSentences[:max(sentenceCount, 0)]
    return ''.join(sentence + '\n\n' for _, sentence in selected)
def summarizeAlt(text, sentenceCount):
    """Return the highest-ranked sentences of a text file as a summary.

    Variant of the PageRank summariser that links sentences with the graph
    produced by ``buildGraphAlt`` instead of ``buildGraph``.

    Args:
        text: Path of the text file to summarise.
        sentenceCount: Maximum number of sentences to return. If the file
            has fewer sentences, all of them are returned; a value of zero
            or less yields an empty summary.

    Returns:
        The selected sentences in descending score order, each followed by
        a blank line. An empty string when nothing is selected.
    """
    # Read inside a with-block so the handle is closed promptly.
    with open(text) as sourceFile:
        contents = sourceFile.read()

    sentences = nltk.tokenize.sent_tokenize(contents)
    graph = buildGraphAlt(sentences)
    scoredSentences = nx.pagerank(graph, weight='weight')

    # Sorting (score, sentence) tuples breaks score ties on the sentence text.
    rankedSentences = sorted(
        ((score, sentence) for sentence, score in scoredSentences.items()),
        reverse=True,
    )

    # Slice rather than index so asking for more sentences than exist does
    # not raise IndexError; max() stops a negative count from slicing
    # relative to the end of the list.
    selected = rankedSentences[:max(sentenceCount, 0)]
    return ''.join(sentence + '\n\n' for _, sentence in selected)
def writeSummary(summary, text):
    """Write a summary to the file ``<text> Summary.txt``.

    Args:
        summary: String to write.
        text: Path prefix of the output file; ``' Summary.txt'`` is appended
            to it. An existing file at that path is overwritten.
    """
    # The with-block closes the handle even if write() raises.
    with open(text + ' Summary.txt', 'w+') as outputFile:
        outputFile.write(summary)
def rogue(text, sentenceCount):
    """Score both summarisers against a gold summary and save the better one.

    Reads ``Gold/<text> Gold.txt``, summarises ``Text/<text>.txt`` with both
    ``summarize`` and ``summarizeAlt``, and scores each summary as the number
    of its word tokens that occur in the gold summary divided by the number
    of gold tokens. The higher-scoring summary (the ``summarizeAlt`` one on
    a tie) is written to ``Summary/<text> Summary.txt`` via ``writeSummary``.

    Args:
        text: Document name without directory or extension.
        sentenceCount: Number of sentences requested from each summariser.

    Returns:
        Dict with key ``1`` holding the ``summarize`` score and key ``2``
        holding the ``summarizeAlt`` score. Both are ``0.0`` when the gold
        summary contains no tokens.
    """
    # Read inside a with-block so the gold file handle is always closed.
    with open("Gold/" + text + ' Gold.txt') as goldFile:
        goldTokens = nltk.tokenize.word_tokenize(goldFile.read())

    sourcePath = "Text/" + text + '.txt'
    summaryA = summarize(sourcePath, sentenceCount)
    summaryB = summarizeAlt(sourcePath, sentenceCount)

    # A set gives constant-time membership tests; the denominator still uses
    # the full gold token count, duplicates included.
    goldVocabulary = set(goldTokens)
    goldLength = len(goldTokens)

    def overlapScore(summary):
        # Share of summary tokens found in the gold summary, per gold token.
        # NOTE(review): matches are not clipped to gold token frequencies, so
        # repeated words can push the score above 1 - confirm intended.
        if not goldLength:
            # An empty gold summary would otherwise divide by zero.
            return 0.0
        matches = sum(
            1
            for word in nltk.tokenize.word_tokenize(summary)
            if word in goldVocabulary
        )
        return matches / goldLength

    rogueCount = {1: overlapScore(summaryA), 2: overlapScore(summaryB)}

    bestSummary = summaryA if rogueCount[1] > rogueCount[2] else summaryB
    writeSummary(bestSummary, "Summary/" + text)
    return rogueCount
def run(pattern, count):
    """Score every document and write the results to ``Score.txt``.

    One document is processed per entry found in ``Text/``. The entries'
    own names are not used: document names are built as
    ``<pattern> 0<index>`` with ``index`` counting up from 1, and each is
    passed to ``rogue``.

    Args:
        pattern: Common prefix of the document names.
        count: Number of sentences requested for each summary.
    """
    scoreLines = []
    for index, _ in enumerate(os.listdir('Text/'), start=1):
        # NOTE(review): the fixed ' 0' prefix yields names such as '<pattern> 010'
        # from the tenth document on - confirm that matches the file naming.
        filename = pattern + ' 0' + str(index)
        scores = rogue(filename, count)
        # Report each summariser's own score; the second column previously
        # repeated the first score.
        scoreLines.append(
            "Number {} 1: {} 2: {}\n".format(index, scores[1], scores[2])
        )

    # The with-block closes the handle even if write() raises.
    with open('Score.txt', 'w+') as scoreFile:
        scoreFile.write(''.join(scoreLines))