-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen.py
154 lines (128 loc) · 3.54 KB
/
gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/python
"""
Homework : Search Engine.
Module: gen
Author: Wael Al-Sallami
# Structure of index on disk:
{
  'users': {user-id: {'name': screen-name, 'mentions': set(user-ids)}},
  'docs': {doc-id: {'user': user-id, 'terms': [terms]}},
  'pageranks': {user-id: pagerank},
  'terms': {
    'term': {
      doc-id: tf,
      doc-id: tf,
      ...
    },
    ...
  }
}
"""
import os, re, timer, marshal, json, pr
from collections import Counter
class Index:
    """Inverted index over a tweet collection, cached on disk with marshal.

    On construction the index is loaded from ``index_name`` when that file
    exists; otherwise it is built from the given JSON-lines file and saved.

    Instance state:
      size      -- number of documents in the index
      terms     -- {term: {doc-id: term frequency}}
      docs      -- {doc-id: {'user': user-id, 'terms': [terms]}}
      users     -- {user-id: {'name': name, 'mentions': set(user-ids)}}
      pageranks -- whatever ``pr.PageRank(users).build()`` returns
    """

    size = 0
    # Class-level defaults are kept so existing attribute access on the class
    # keeps working. They must never be mutated: __init__ rebinds a fresh
    # container per instance so two Index objects do not share state.
    terms = {}
    docs = {}
    users = {}
    pageranks = {}
    tweets = []
    index_name = "index.dat"

    def __init__(self, json_file):
        """Load the index from disk, or build it from `json_file` and save it.

        `json_file` is only read when no index file exists yet.
        """
        # Per-instance containers (see note on the class-level defaults).
        self.terms = {}
        self.docs = {}
        self.users = {}
        self.pageranks = {}
        self.tweets = []

        # print(...) with a single argument produces the same output under
        # the Python 2 print statement and the Python 3 print function.
        if self.on_disk():
            print("\n> Reading index! This happens once per session, please wait ...")
            self.load()
        else:
            print("\n> Writing index! This only happens once, please wait ...")
            self.build(json_file)
            self.save()
        print('\a')  # ring

    def build(self, json_file):
        """Build the in-memory index from the tweets in `json_file`."""
        tweets = self.read_docs(json_file)
        self.size = len(tweets)
        for d in tweets:
            self.add_terms(d)
            self.add_doc(d)
            # The author must be registered before add_mentions, which
            # writes into the author's 'mentions' set.
            self.add_user(d['user'])
            self.add_mentions(d)
        self.pageranks = pr.PageRank(self.users).build()

    def add_terms(self, d):
        """Record each term of tweet `d` as terms[term][doc-id] = frequency."""
        doc_id = d['id']
        for term, tf in d['terms'].items():
            self.terms.setdefault(term, {})[doc_id] = tf

    def add_doc(self, d):
        """Cache the author id and the distinct terms of tweet `d`."""
        self.docs[d['id']] = {
            'user': d['user']['id'],
            # A real list: marshal cannot serialise a dict key view.
            'terms': list(d['terms']),
        }

    def add_user(self, user):
        """Register `user` ({'id', 'name'}) unless the id is already known."""
        if user['id'] not in self.users:
            self.users[user['id']] = {
                'name': user['name'],
                'mentions': set(),
            }

    def add_mentions(self, d):
        """Add the users mentioned in tweet `d` to its author's mention set.

        Self-mentions are ignored. Mentioned users are registered on the
        fly. The author must already be present in ``self.users``.
        """
        if not d['mentions']:
            return
        user_id = d['user']['id']
        for m in d['mentions']:
            if m['id'] == user_id:
                continue
            self.add_user({
                'id': m['id'],
                'name': m['screen_name'],
            })
            self.users[user_id]['mentions'].add(m['id'])

    def tokenize(self, text):
        """Lower-case `text` and split it on every non-word character.

        Adjacent separators yield empty-string tokens; they are returned
        as-is (not filtered) so indexed terms stay unchanged.
        """
        return re.split(r'[^\w]', text.lower(), flags=re.UNICODE)

    def read_docs(self, filename):
        """Read a JSON-lines tweet file and return a list of tweet dicts.

        Each returned dict has the keys 'id', 'terms' (a Counter of tokens),
        'user' ({'id', 'name'}) and 'mentions' (the tweet's user_mentions
        entities). Blank lines in the file are skipped.
        """
        # io.open gives universal newlines and unicode text on both Python 2
        # and 3; the 'U' mode flag no longer exists on Python 3.11+.
        import io

        tweets = []
        with io.open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                d = json.loads(line)
                tweets.append({
                    'id': d['id'],
                    'terms': Counter(self.tokenize(d['text'])),
                    'user': {
                        'id': d['user']['id'],
                        'name': d['user']['screen_name'],
                    },
                    'mentions': d['entities']['user_mentions'],
                })
        return tweets

    def save(self):
        """Write terms, pageranks, users and docs to `index_name`."""
        index = {
            'terms': self.terms,
            'pageranks': self.pageranks,
            'users': self.users,
            'docs': self.docs,
        }
        # marshal emits binary data, so the file must be opened in binary
        # mode. NOTE: the marshal format is Python-version specific; an index
        # written by one interpreter version may not load in another.
        with open(self.index_name, "wb") as index_file:
            marshal.dump(index, index_file)

    def load(self):
        """Read the index from `index_name` into memory."""
        # NOTE(review): marshal.load must only be used on trusted files.
        with open(self.index_name, "rb") as index_file:
            index = marshal.load(index_file)
        self.terms = index['terms']
        self.pageranks = index['pageranks']
        self.users = index['users']
        self.docs = index['docs']
        self.size = len(index['docs'])

    def on_disk(self):
        """Return True if the index file exists on disk, else False."""
        return os.path.exists(self.index_name)