# Main Idea:
# - Text preprocessing (removing stopwords, removing punctuation, etc.)
# - Frequency table of words / word frequency distribution - how many times each word appears in the document
# - Score each sentence depending on the words it contains and the frequency table
# - Build the summary by joining every sentence above a certain score limit
# NOTE: Might need to run 'python3 -m spacy download en' (spaCy 2.x) or
#       'python3 -m spacy download en_core_web_sm' (spaCy 3.x) to download the English model
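#
# A tiny worked example of the scoring scheme above (illustrative text, not from
# the project): for the document "Dogs are great. Dogs play. Cats sleep." the
# non-stopword counts are dogs=2, great=1, play=1, cats=1, sleep=1. Dividing by
# the max count (2) gives weights dogs=1.0 and 0.5 for the rest, so
# "Dogs are great." scores 1.0 + 0.5 = 1.5 while "Cats sleep." scores only 1.0,
# and the dog sentences are picked for the summary first.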
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import wikipedia
import random as r
from heapq import nlargest
from collections import Counter
from string import punctuation
from extern import *
r.seed(Q_RANDOM_SEED)


# Function to test the functionality of summarize.py
def _test():
wikipedia.set_lang('en')
document_1 = str(wikipedia.page(TEST_WIKI_ARTICLE).content)
    # core_summary_function requires a target handle; pass a dummy one here
    final_summary = core_summary_function(document_1, '@test')
print(final_summary)


# True if any single word makes up more than REPEAT_THRESHOLD of the sentence's words
def _is_repeat_sentence(s):
    if not isinstance(s, str):  # accept either a raw string or a spaCy Span
        s = s.text
s = s.replace('\n', ' ')
s_list = s.split(' ')
counts = Counter(s_list)
num_words = len(s_list)
for (w, c) in counts.items():
if c/num_words > REPEAT_THRESHOLD:
return True
return False
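
# For example, _is_repeat_sentence('buy buy buy buy now') sees a top word share
# of 4/5 = 0.8 and returns True whenever the project-defined REPEAT_THRESHOLD
# is below 0.8.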


# Encapsulates logic that is repeated for num_likes and num_retweets: keep every
# tweet with a positive metric, then pad with randomly chosen zero-metric tweets
# until num_for_metric tweets have been selected.
def _get_top_tweets_helper(top_arr, num_for_metric, metric):
    top_n_arr = []
    zero_idx = len(top_arr)  # default in case every tweet has a positive metric
    for i, t in enumerate(top_arr):
        if int(t[metric] if t[metric] else 0) > 0:
            top_n_arr.append(t)
        else:
            zero_idx = i
            break
    zero_occ_for_metric = top_arr[zero_idx:]
    r.shuffle(zero_occ_for_metric)
    # never pad with a negative count (more positive-metric tweets than requested)
    num_randomly_sampled_zero_metric_tweets = max(0, num_for_metric - len(top_n_arr))
    top_n_arr.extend(zero_occ_for_metric[:num_randomly_sampled_zero_metric_tweets])
    return top_n_arr
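
# For example, with metric='fav_count', tweets sorted by likes as
# [5, 3, 0, 0, 0] and num_for_metric=4, the two liked tweets are kept and two of
# the three zero-like tweets are drawn at random to fill the quota.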


# Keep the first occurrence of each distinct tweet text
def _remove_duplicates(tweets):
texts = set()
results = list()
for tweet in tweets:
if tweet['text'] in texts:
continue
texts.add(tweet['text'])
results.append(tweet)
return results


# Take the most-liked and most-retweeted tweets and merge them, dropping exact duplicates
def get_top_tweets(tweets, num_likes=100, num_retweets=100):
top_likes = sorted(tweets, key=lambda t: int(t['fav_count'] if t['fav_count'] else 0), reverse=True)
_num_likes = r.randint(len(tweets) // 2, len(tweets)) if num_likes > len(tweets) else num_likes
top_n_likes = _get_top_tweets_helper(top_likes, _num_likes, 'fav_count')
top_retweets = sorted(tweets, key=lambda t: int(t['ret_count'] if t['ret_count'] else 0), reverse=True)
_num_retweets = r.randint(len(tweets) // 2, len(tweets)) if num_retweets > len(tweets) else num_retweets
top_n_retweets = _get_top_tweets_helper(top_retweets, _num_retweets, 'ret_count')
combined = [*top_n_likes, *top_n_retweets]
return _remove_duplicates(combined)


def core_summary_function(document, target, lang='en', max_sentence_len=30):
    _target = target[1:]  # drop the leading character of the handle (e.g. '@')
    nlp = spacy.load(lang)  # spaCy 2.x shortcut link; on spaCy 3.x load e.g. 'en_core_web_sm' instead
    # raised to a high value so spaCy will process large collections of text
    nlp.max_length = NLP_DOC_LENGTH
nlp_doc = nlp(document)
    # collect hashtags so they can be excluded from the word frequency table
    hashtag_set = set()
    document_sentences = document.split('\n')
for line in document_sentences:
for w in line.split(' '):
if len(w) > 0 and '#' in w:
if w[0] == '#':
hashtag_set.add(w[1:])
else:
hashtag_set.add(w)
    ### Word frequency table
    # dictionary of words and their counts, built from non-stop words only
word_freq = {}
for word in nlp_doc:
w = word.text
w = w.strip(punctuation)
w = w.lower()
# stopword omission
if w not in STOP_WORDS and w not in hashtag_set:
if w not in word_freq:
word_freq[w] = 0
word_freq[w] += 1
    ### Maximum frequency
    max_freq = max(word_freq.values())
for (word, freq) in word_freq.items():
word_freq[word] = freq/max_freq
### Sentence scores
# scoring every sentence based on the number of words
# (non-stop words in our word freq table)
    sentence_list = list(nlp_doc.sents)
seen_sentences = set()
sent_scores = {}
for sent in sentence_list:
        # ignore very short sentences (two tokens or fewer)
        if len(sent) <= 2:
continue
# ignore sentences that repeat the same thing over and over
if _is_repeat_sentence(sent):
continue
# ignore sentences that are duplicates
if sent.text.strip() in seen_sentences:
continue
seen_sentences.add(sent.text.strip())
        for word in sent:
            # don't let occurrences of the target handle itself inflate the score
            if word.text == _target:
                continue
w = word.text.lower()
if w in word_freq:
if len(sent.text.split(' ')) < max_sentence_len:
if sent not in sent_scores:
sent_scores[sent] = 0
sent_scores[sent] += word_freq[w]
summarized_sentences = nlargest(NUM_SENTENCE_SUMMARY, sent_scores, key=sent_scores.get)
# convert spacy span to string
final_sentences = [s.text.replace('\n', ' ').replace('.', ' ').strip().capitalize() for s in summarized_sentences]
final_summary = '. '.join(final_sentences) + '.'
for pattern, replacement in REPLACE_DICT.items():
final_summary = re.sub(pattern, replacement, final_summary)
return final_summary
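
# Illustrative call (hypothetical handle, not from the project):
#   core_summary_function(document, '@dogs')
# returns up to NUM_SENTENCE_SUMMARY top-scoring sentences joined by '. ',
# post-processed through the REPLACE_DICT substitutions.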


def summarize_tweets(target, mock):
    '''Summarizes tweets passed in from zeitgeist'''
selection = sample(target)
if mock:
r.shuffle(selection)
sentences = selection[:NUM_SENTENCE_SUMMARY]
return ''.join(t['text'] for t in sentences)
log(f'Summarizing {len(selection)} tweets from {target}...')
top_n_tweets = get_top_tweets(selection,
num_likes=r.randint(100, 300),
num_retweets=r.randint(100, 300))
log(f'Selected top {len(top_n_tweets)} tweets')
    # tweet texts are joined without a separator; each text is assumed to carry its own trailing whitespace
    corpus = ''.join([row['text'] for row in top_n_tweets])
summary = core_summary_function(corpus, target)
return summary
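

# A minimal entry point sketch: _test is never invoked elsewhere in this module,
# so running it from a __main__ guard is an assumption about intended usage.
if __name__ == '__main__':
    _test()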