################################################################################
# #
# WikiGraph.py #
# #
# Author: cgearhart #
# Date: 7/17/12 #
# https://github.com/cgearhart #
# #
################################################################################
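"""
Build a graph of related Wikipedia articles with a depth-limited breadth-first
search: a page counts as a node when one of the paragraphs above its table of
contents contains one of the supplied key words, and the edges between nodes
are written out as semicolon-delimited CSV. Written for Python 2 (urllib2 and
HTMLParser were removed in Python 3).
"""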
import urllib2
from bs4 import BeautifulSoup
from HTMLParser import HTMLParseError


class Pot():
    # Wrapper/helper class for BeautifulSoup - also a graphObj for search

    def __init__(self, words, filters, baseURL=None):
        self._baseURL = baseURL
        self._nodeWords = words
        self._filters = [lambda x: x] + filters  # always want to filter None first
        self._url = None
        self.title = None
        self.node = None
        self.links = None
        self.isAtNode = None
        self.soup = None

    def _makeSoup(self, verbose=False):
        # Handle the page request and convert the page to soup.
        # TODO: Figure out why /wiki/chain_rule breaks the parser (not a BS bug)
        botHeaders = {'User-Agent': '628318'}  # Tau FTW
        req = urllib2.Request(self._url, headers=botHeaders)
        try:
            pageHandle = urllib2.urlopen(req)
            self.soup = BeautifulSoup(pageHandle.read())
            self.title = self.soup.title.string
            # Strip the " - Wikipedia, the free encyclopedia" suffix from the
            # page title (with or without the leading " - ")
            if self.title.endswith('Wikipedia, the free encyclopedia'):
                if len(self.title) > 34 and self.title[-34] == '-':
                    idx = -35
                else:
                    idx = -32
                self.title = self.title[:idx]
            if verbose:
                print '\nFinished the new batch of soup at url: {}\n'.format(self._url)
        except urllib2.HTTPError, msg:
            print '_makeSoup() error!'
            print self._url
            print msg
        except HTMLParseError, msg:
            print '_makeSoup() error!'
            print msg
            print self.node

    def _getLinks(self, verbose=False):
        # Make a list of all links found at the target URL.
        a_tags = self.soup.find_all('a')
        links = [tag.get('href') for tag in a_tags]
        for test in self._filters:
            if verbose:
                oldlinks = links[:]
            links = filter(test, links)
            if verbose:
                print '\n\n\nfilter removed-----------------------------'
                linksdiff = set(oldlinks) - set(links)
                for link in linksdiff:
                    print link
        self.links = list(set(links))  # ensure unique links

    def _isNode(self, verbose=False):
        '''Tests whether the current URL should be considered a node.

        The current URL is a node if one of the paragraphs before the TOC
        contains any of the words in self._nodeWords.
        '''
        self.isAtNode = False
        headingTag = self.soup.find(id="toctitle")
        # TODO: some pages don't have enough content to have an <h2>; this
        # should be modified to handle those cases more elegantly (like
        # "if there is no <h2>, search the whole page")
        try:
            if headingTag:
                for paragraph in headingTag.find_all_previous('p'):
                    if paragraph.find(text=self._nodeWords):
                        self.isAtNode = True
                        # Some tables in the summary section (the "part of
                        # science" sidebars) contain links to mathematics or
                        # similar topics and screw things up; excluding
                        # paragraphs nested inside a table is the best
                        # workaround found so far.
                        for parent in paragraph.parents:
                            if parent.name == 'table':
                                self.isAtNode = False  # some tables contain key words
            if self.isAtNode and verbose:
                print '\n{} is a node.\n'.format(self._url)
        except AttributeError, msg:
            print 'Error!'
            print 'Node: {}'.format(self.node)
            print msg

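    # Note: with a list argument, BeautifulSoup's find(text=[...]) matches
    # strings that are exactly equal to a list entry rather than doing a
    # substring search, so in practice the key words above are usually
    # matched through the text of links inside the paragraphs.
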
    def moveTo(self, url, verbose=False):
        self.refresh(url=url, verbose=verbose)

    def refresh(self, filters=None, nodeWords=None, url=None, verbose=False):
        # Force a refresh of the object state.
        if verbose:
            print 'Forcing Pot refresh.'
        if url:
            if verbose:
                print 'Updating the URL from the link tuple'
            self.node = url
            self._url = self._baseURL + str(url)
        else:
            if filters:
                if verbose:
                    print 'The filters changed.'
                self._filters[1:] = filters
            if nodeWords:
                if verbose:
                    print ('The key words defining graph edges have been changed '
                           'from {} to {}'.format(self._nodeWords, nodeWords))
                self._nodeWords = nodeWords
        if self._url:
            self._makeSoup(verbose)  # update the soup
            self._getLinks(verbose)  # refresh the links
            self._isNode(verbose)    # determine whether the URL is a node
        else:
            print 'This Pot instance has no _url!'

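# Note: the searcher below only relies on Pot's moveTo() method and its
# title, links, and isAtNode attributes, so any object exposing the same
# interface could stand in as the graphObj.
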

class DepthLimitedBFS():

    def __init__(self, graphObj, startingNode, depthLimit=1):
        self._stack = [startingNode]
        self._nextLayerStack = []
        self._graphObj = graphObj
        self.depthLimit = depthLimit
        self.visited = {}
        self.graph = {}
        self.depth = 0

    def search(self, verbose=False):
        # Loop over the stack to a limited depth to build a dict of nodes.
        while self._stack and self.depth <= self.depthLimit:
            possibleNode = self._stack.pop(0)
            if possibleNode not in self.visited:  # don't visit the same link twice
                self._graphObj.moveTo(possibleNode)
                self.visited[possibleNode] = self._graphObj.title
                if self._graphObj.isAtNode:
                    links = self._graphObj.links
                    self.graph[possibleNode] = links
                    self._nextLayerStack.extend(links)
                    if verbose:
                        print 'New node: {}\n'.format(possibleNode)
                elif verbose:
                    print 'Not added: {}\n'.format(possibleNode)
            elif verbose:
                print 'Repeat node: {}\n'.format(possibleNode)
            # when the current layer is exhausted, promote the next layer
            if not self._stack and self._nextLayerStack:
                self._nextLayerStack = list(set(self._nextLayerStack))
                print '###### End of layer {} ######'.format(self.depth)
                print '###### Links in next layer: {} ######'.format(len(self._nextLayerStack)) + '\n'
                self._stack.extend(self._nextLayerStack)
                self._nextLayerStack = []
                self.depth += 1

    def write(self, nodeFileName='node.csv', edgeFileName='edge.csv'):
        # The node-file writer below is disabled (left in a string literal):
        """
        # write all visited nodes to the node file as csv
        nodeFileObj = open(nodeFileName, 'w')
        nodeFileObj.write('id;label\n')
        for key, nodeID in self.visited.iteritems():
            nodeLabel = key[1].encode('ascii', 'ignore')
            thisLine = '{0};{1}\n'.format(str(nodeID), nodeLabel)
            nodeFileObj.write(thisLine)
        nodeFileObj.close()
        """
        # Write all the edges to the edge file as csv.
        edgeID = 0
        with open(edgeFileName, 'w') as edgeFileObj:
            edgeFileObj.write('id;source;target\n')
            for sourceNode, links in self.graph.iteritems():
                unicodeText = self.visited[sourceNode]
                sourceText = unicodeText.encode('ascii', 'ignore')
                for targetNode in links:
                    # only write edges that connect to things that are nodes
                    if targetNode in self.graph:
                        targetText = self.visited[targetNode].encode('ascii', 'ignore')
                        thisLine = '{0};{1};{2}\n'.format(str(edgeID), sourceText, targetText)
                        edgeFileObj.write(thisLine)
                        edgeID += 1

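# A minimal usage sketch (assuming network access and the bs4 package are
# available); depthLimit=1 keeps the crawl small:
#
#   pot = Pot(['Math', 'math'], [lambda x: x.startswith('/wiki')],
#             'http://en.wikipedia.org')
#   searcher = DepthLimitedBFS(pot, '/wiki/Calculus', depthLimit=1)
#   searcher.search(verbose=True)
#   searcher.write(edgeFileName='calculus_edges.csv')
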

def testPot():
    baseURL = 'http://en.wikipedia.org'
    threadWords = ['Math', 'Mathematics', 'math', 'mathematics']
    filters = [lambda x: x.startswith('/wiki'),  # only keep other wikipedia links; excludes foreign languages
               lambda x: x.count(':') == 0,  # exclude templates
               lambda x: x.count('#') == 0,  # exclude links to named sections
               lambda x: not x.endswith('(disambiguation)')  # obvious
               ]
    testURLs = ['/wiki/Outline_of_calculus',  # is a node
                '/wiki/Calculus',  # is a node
                '/wiki/Art'  # is NOT a node
                ]
    print "##################### LET'S TRY TESTING ##################"
    for tURL in testURLs:
        myPot = Pot(threadWords, filters, baseURL)
        print 'Attempting to use URL: {}\n'.format(tURL)
        myPot.moveTo(tURL)
        if myPot.isAtNode:
            print '\n{} is a node!!!\n'.format(tURL)
            for link in list(set(myPot.links)):
                print link
        else:
            print '\n{} is NOT a node!!!\n'.format(tURL)
    print "#### LET'S TRY CHANGING THINGS FOR A SINGLE INSTANCE ####"
    tURL = testURLs[0]
    testPot = Pot(threadWords, filters, baseURL)
    testPot.moveTo(tURL)  # give the instance a URL so refresh() has a page to work with
    print '## FORCE REFRESH'
    testPot.refresh(verbose=True)
    print '\nFinished force refresh\n'
    print '## URL'
    newURL = testURLs[1]
    testPot.refresh(url=newURL, verbose=True)
    print '\nChanged the URL\n'
    print '## NODE WORDS'
    words = ['Matt']
    testPot.refresh(nodeWords=words, verbose=True)  # this should break things
    print '\nThis should say False: {}\n'.format(testPot.isAtNode)
    testPot.refresh(nodeWords=threadWords)  # restore the original key words
    print '## FILTERS'
    filters = [lambda x: not x]  # should return zero links - might break things
    testPot.refresh(filters=filters, verbose=True)
    if not testPot.links:
        print '\nAll links removed.\n'
    print '\nBroke the links (hopefully) by removing them all.\n'


def testSearcher(verbose=False):
    baseURL = 'http://en.wikipedia.org'
    threadWords = ['Math', 'Mathematics', 'math', 'mathematics']
    filters = [lambda x: x.startswith('/wiki'),  # only keep other wikipedia links; excludes foreign languages
               lambda x: x.count(':') == 0,  # exclude templates
               lambda x: x.count('#') == 0,  # exclude links to named sections
               lambda x: not x.endswith('(disambiguation)')  # obvious
               ]
    testURL = '/wiki/Outline_of_calculus'
    myPot = Pot(threadWords, filters, baseURL)
    nodeSearcher = DepthLimitedBFS(myPot, testURL)
    nodeSearcher.search(verbose=verbose)
    print 'Test graph:\n'
    for key, value in nodeSearcher.graph.iteritems():
        print "\n\nNode: {}\n".format(key)
        print "Edges:"
        for thing in value:
            print '\t{}'.format(thing)


def testWhole(verbose=False):
    baseURL = 'http://en.wikipedia.org'
    threadWords = ['Calculus', 'calculus']
    filters = [lambda x: x.startswith('/wiki'),  # only keep other wikipedia links; excludes foreign languages
               lambda x: x.count(':') == 0,  # exclude templates
               lambda x: x.count('#') == 0,  # exclude links to named sections
               lambda x: not x.endswith('(disambiguation)')  # obvious
               ]
    tailURL = '/wiki/Outline_of_calculus'
    myPot = Pot(threadWords, filters, baseURL)
    nodeSearcher = DepthLimitedBFS(myPot, tailURL, depthLimit=7)
    nodeSearcher.search(verbose=verbose)
    nodeSearcher.write()


if __name__ == '__main__':
    testWhole()