crawler.py
#!/usr/bin/env python
# encoding: utf-8
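"""crawler.py: a small breadth-first web crawler.

Crawler walks pages level by level up to max_depth; GetLinks fetches a single
page, collects the <a href> links it contains with BeautifulSoup, and resolves
them to absolute URLs.
"""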
import urllib2
import sys
import urlparse
from cgi import escape
from bs4 import BeautifulSoup
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

class Crawler(object):
    """Breadth-first web crawler limited to a configurable link depth."""

    def __init__(self, url, max_depth=1):
        """Start crawling at `url`, following links at most `max_depth` levels deep."""
        self.url = url
        self.max_depth = max_depth

    def crawl(self):
        """Crawl level by level and return the list of pages visited."""
        tocrawl = [self.url]   # pages queued at the current depth
        crawled = []           # pages already visited
        depth = 0
        next_depth = []        # links discovered for the next depth level
        while tocrawl and depth <= self.max_depth:
            page = tocrawl.pop()
            print page
            if page not in crawled:
                union(next_depth, get_all_links(page))
                crawled.append(page)
            if not tocrawl:
                # current level exhausted: descend one level
                print 'finished depth %d' % depth
                tocrawl, next_depth = next_depth, []
                depth += 1
        return crawled

def union(list_p, list_q):
    """Append to list_p every element of list_q it does not already contain."""
    for p in list_q:
        if p not in list_p:
            list_p.append(p)


def get_all_links(url):
    """Fetch `url` and return the list of absolute links found on that page."""
    page = GetLinks(url)
    page.getlinks()
    return page.urls

class GetLinks(object):
    """Fetch a single page and collect the links it contains."""

    def __init__(self, url):
        """Prepare to fetch `url`; collected links end up in `self.urls`."""
        self.url = url
        self.urls = []

    def open(self):
        """Open the URL with a custom User-Agent; return the response, or None on error."""
        try:
            request = urllib2.Request(self.url)
            request.add_header('User-Agent', user_agent)
            response = urllib2.urlopen(request)
        except IOError:
            return None
        return response
    def getlinks(self):
        """Parse the page and store every unique absolute URL found in <a> tags."""
        response = self.open()
        if response:
            try:
                content = unicode(response.read(), "utf-8", errors="replace")
                soup = BeautifulSoup(content)
                tags = soup('a')
            except urllib2.HTTPError, error:
                if error.getcode() == 404:
                    print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
                else:
                    print >> sys.stderr, "ERROR: %s" % error
                tags = []
            except urllib2.URLError, error:
                print >> sys.stderr, "ERROR: %s" % error
                tags = []
            for tag in tags:
                href = tag.get("href")
                if href is not None:
                    # resolve relative links against the page URL
                    url = urlparse.urljoin(self.url, escape(href))
                    if url not in self.urls:
                        self.urls.append(url)

urls = get_all_links('http://www.baidu.com')
for url in urls:
    print url

# anjn = Crawler('http://anjn.info')
# print anjn.crawl()

wiki = Crawler('file:///Users/anjiannian/Dropbox/wiki/index.html')
print wiki.crawl()
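
# Example usage (the URL and depth below are illustrative, not part of the
# original script): crawl two levels deep from a start page and print every
# unique link that was discovered.
#
#   crawler = Crawler('http://example.com', max_depth=2)
#   for link in crawler.crawl():
#       print link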