-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathspear.py
353 lines (286 loc) · 13.4 KB
/
spear.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
"""
The reference implementation of the SPEAR ranking algorithm in Python.
The purpose of this implementation is to make the inner workings of
the algorithm easy to understand and not to distract or confuse
the reader with highly optimized code.
The SPEAR algorithm takes a list of user activities on resources
as input, and returns ranked lists of users by expertise scores
and resources by quality scores, respectively.
You can also use this library to simulate the HITS algorithm of
Jon Kleinberg. Simply supply a credit score function C(x) = 1 to
the SPEAR algorithm (see documentation of Spear.run()).
More information about the SPEAR algorithm is available at:
* http://www.spear-algorithm.org/
* "Telling Experts from Spammers: Expertise Ranking in Folksonomies"
Michael G. Noll, Ching-man Au Yeung, et al.
SIGIR 09: Proceedings of 32nd International ACM SIGIR Conference
on Research and Development in Information Retrieval, Boston, USA,
July 2009, pp. 612-619, ISBN 978-1-60558-483-6
The code is licensed to you under version 2 of the GNU General Public
License.
"""
__author__ = "Michael G. Noll"
__copyright__ = "(c) 2009-2010 Michael G. Noll and Ching-man Au Yeung"
__description__ = "The reference implementation of the SPEAR ranking algorithm."
__email__ = "michael[AT]quuxlabs[DOT]com"
__license__ = "GPLv2"
__maintainer__ = "Michael G. Noll"
__status__ = "Development"
__url__ = "http://www.quuxlabs.com/"
__version__ = "1.0"
import datetime
import sys
import unittest
try:
from scipy import sparse
"""
Note: The following program makes use of the sparse matrix module in SciPy
In general, a lil_matrix is created first as it provides more convenient indexing.
The matrix will then be converted into a csr_matrix for faster computation of multiplications.
"""
import numpy
except:
print "ERROR: could not import SciPy or NumPy Python module"
print ""
print "You can download SciPy and NumPy at"
print "http://www.scipy.org/Download"
print
raise
class Spear(object):
def __init__(self, activities):
"""
Initialize the SPEAR algorithm with input activities.
@param A: List of (timestamp, user, resource) tuples.
A (timestamp, user, resource) tuple represents that <user>
'acted on' <resource> on date/time <timestamp>.
Example:
[
(datetime.datetime(2010,7,1,9,0,0), "alice", "http://www.quuxlabs.com/"),
(datetime.datetime(2010,8,1,12,45,0), "bob", "http://www.quuxlabs.com/"),
]
@param type: list
"""
self.activities = activities
# Sort activity data by timestamp (oldest first).
self.activities.sort()
# Extract users and resources for later use.
self._users = set()
self._resources = set()
for timestamp, user, resource in activities:
self._users.add(user)
self._resources.add(resource)
# Assign IDs to users and resources.
#
# This is needed because scipy requires numerical values
# for addressing cells in matrices.
self._user2id = {}
self._id2user = {}
for id, user in enumerate(self.users):
self._user2id[user] = id
self._id2user[id] = user
self._resource2id = {}
self._id2resource = {}
for id, resource in enumerate(self.resources):
self._resource2id[resource] = id
self._id2resource[id] = resource
def get_users(self):
return self._users
users = property(fget=get_users,
doc="Returns the set of users found in activities")
def get_resources(self):
return self._resources
resources = property(fget=get_resources,
doc="Returns the set of resources found in activities")
def _get_userid(self, user):
return self._user2id[user]
def _get_user(self, userid):
return self._id2user[userid]
def _get_resourceid(self, resource):
return self._resource2id[resource]
def _get_resource(self, resourceid):
return self._id2resource[resourceid]
def _populate(self, A):
"""
Populates the adjacency matrix A for use with the discoverer-follower scheme.
@param A: Empty adjacency matrix A (a numpy matrix), which maps users
to resources.
@param type: list
@return: Populated adjacency matrix A (a numpy matrix).
"""
# Calculate the number of actions per resource.
#
# In social bookmarking scenarios, for example, the number of actions
# on a resource would be its number of bookmarks.
num_actions = {}
for timestamp, user, resource in self.activities:
num_actions[resource] = num_actions.get(resource, 0) + 1
# Number of users found so far who have acted on a resource.
current_num_users = {}
# Score to be assigned to the next user.
#
# The first user will receive a score equals to number of total actions + 1,
# and the last user will receive a score of 1.
#
current_score = {}
# Stores the previously encountered (= most recent) timestamp
# of actions on a resource
#
prev_timestamp_of_docs = {}
# Each matrix cell stores the "action position" of user i
# with regard to resource j.
#
# If there are 10 users with the follow timestamps (smaller
# int values = earlier timestamps) for a resource j:
#
# 1, 1, 1, 2, 3, 3, 4, 4, 5, 6
#
# then their scores in the respective A[i,j] will be:
#
# 10, 10, 10, 7, 6, 6, 4, 4, 2, 1
#
for timestamp, user, resource in self.activities:
# raise an error if timestamp is None because this
# means there is an error in the input data which
# we want to know about
#
assert timestamp is not None
current_num_users.setdefault(resource, 0)
if timestamp == prev_timestamp_of_docs.get(resource):
A[self._get_userid(user),self._get_resourceid(resource)] = current_score[resource]
else:
current_score[resource] = num_actions[resource] - current_num_users[resource]
A[self._get_userid(user),self._get_resourceid(resource)] = current_score[resource]
prev_timestamp_of_docs[resource] = timestamp
current_num_users[resource] += 1
return A
def _apply_credit_scores(self, A, C):
"""
Applies credit scores to the adjacency matrix A.
@param A: Populated adjacency matrix A (a numpy matrix), which maps users to resources.
@param type: list
@param C: Credit score function C().
See the documentation of C at Spear.run().
@param type: A function that takes a numeric argument and returns a float
@return: Adjacency matrix A (a numpy matrix), with credit scores applied.
"""
for i in xrange(len(A.data)):
if A.data[i]:
for j in xrange(len(A.data[i])):
A.data[i][j] = C(A.data[i][j])
return A
def run(self, iterations=250, C=lambda score: pow(score, 0.5), verbose=True):
"""
Runs the SPEAR algorithm to find the Top users (experts) and resources.
@param iterations: Number of iterations for the algorithm.
Default: 250
@param type: int
@param C: Credit score function C().
Default: the root function, i.e. C(x) = x^0.5
The default value of C is the parameter used in the SPEAR
publication "Telling Experts from Spammers" from SIGIR 2009.
If you want to simulate HITS, supply the function C(x) = 1,
for example through C=lambda x: 1.
Please note that the credit score function will ALWAYS
return a value of 0 (zero) for the input values of 0 (zero),
regardless of the function passed for the parameter C.
In other words, it is fixed that C(0) == 0.
@param type: a function that takes a numeric argument and returns a float
@param verbose: If True (default), print some status information
to STDOUT during computation.
@param type: bool
@return: A tuple of lists (expertise_results, quality_results), which
contains ranked lists of (expertise_score, user) and (quality_score, resource)
tuples, respectively. The lists are ranked, i.e. the best items are listed first.
"""
# A: user-resource matrix (adjacency matrix)
A = sparse.lil_matrix( (len(self.users), len(self.resources)) )
if verbose:
print "Step 1) Populating adjacency matrix A"
A = self._populate(A)
if verbose:
print "Step 2) Applying credit score function C() to A"
A = self._apply_credit_scores(A, C)
A = A.tocsr()
# E: expertise vector for users
#
# For the record, setting E to [1, 1, ..., 1] is not really needed
# as E will be overwritten in the very first iteration step (E = Q x A^T).
#
E = numpy.ones(len(self.users))
# Q: quality vector for resources
#
Q = numpy.ones(len(self.resources))
# Update expertise and quality iteratively = mutual reinforcement
#
if verbose:
print "Step 3) Mutual reinforcement using %d iterations ***this might take some time***" % (iterations)
for i in xrange(iterations):
# E is the y_p weight vector in the original HITS algorithm;
# the next line is HITS' "O" operation, which is based on the
# "out-degree" of Web pages; in our case, "out-degree" is not
# computed by counting the number of pages qs to which a given
# page p links to, but rather the number of resources qs a given
# user p has 'acted on'
# HITS:
# p -> {q1, q2, q3, ...}
# SPEAR:
# user -> {resource1, resource2, resource3, ...}
E = Q * A.T
# Q is the x_p weight vector in the original HITS algorithm;
# the next line is HITS' "I" operation, which is based on the
# "in-degree" of web pages; in our case, "in-degree" is not
# computed by counting the number of pages qs linking to a
# given page p, but the number of users qs who 'acted on' a
# given resource p
# HITS:
# p <- {q1, q2, q3, ...}
# SPEAR:
# resource <- {user1, user2, user3, ...}
Q = E * A
# normalization
E = E / E.sum()
Q = Q / Q.sum()
if verbose:
print "Step 4) Sorting vectors E and Q by expertise and quality scores, respectively"
expertise_results = [ (expertise_score, self._get_user(userid)) for userid, expertise_score in enumerate(E) ]
expertise_results.sort(reverse=True)
quality_results = [ (quality_score, self._get_resource(resourceid)) for resourceid, quality_score in enumerate(Q) ]
quality_results.sort(reverse=True)
return expertise_results, quality_results
class SpearTester(unittest.TestCase):
def testSpearOnSampleData(self):
USERS = ["Steve Jobs", "Bill Gates", "Sergey Brin", "Larry Page"]
RESOURCES = ["D1", "D2", "D3"]
EXPERTISE_RESULTS = [
(0.42154381, USERS[0]),
(0.32808641, USERS[1]),
(0.21227046, USERS[2]),
(0.03809933, USERS[3]),
]
QUALITY_RESULTS = [
(0.52695009, RESOURCES[1]),
(0.34629657, RESOURCES[0]),
(0.12675334, RESOURCES[2]),
]
activities = []
activities.append((datetime.datetime(2010,7,1,9,0,0), USERS[0], RESOURCES[0]))
activities.append((datetime.datetime(2010,7,2,9,0,0), USERS[1], RESOURCES[0]))
activities.append((datetime.datetime(2010,6,1,9,0,0), USERS[0], RESOURCES[1]))
activities.append((datetime.datetime(2010,6,1,10,0,0), USERS[1], RESOURCES[1]))
activities.append((datetime.datetime(2010,6,2,11,0,0), USERS[2], RESOURCES[1]))
activities.append((datetime.datetime(2010,6,10,12,0,0), USERS[2], RESOURCES[2]))
activities.append((datetime.datetime(2010,6,14,12,0,0), USERS[3], RESOURCES[2]))
spear = Spear(activities)
expertise_results, quality_results = spear.run(verbose=False)
# check expertise results
for index, (expertise_score, user) in enumerate(expertise_results):
ref_expertise_score, ref_user = EXPERTISE_RESULTS[index]
self.assertAlmostEqual(expertise_score, ref_expertise_score, places=7)
self.assertEqual(user, ref_user)
# check quality results
for index, (quality_score, resource) in enumerate(quality_results):
ref_quality_score, ref_resource = QUALITY_RESULTS[index]
self.assertAlmostEqual(quality_score, ref_quality_score, places=7)
self.assertEqual(resource, ref_resource)
if __name__ == "__main__":
unittest.main()