simplesitescraper.py

#!/usr/bin/env python
import re
import urllib2


class SimpleSiteScraper(object):
    def __init__(self, url, recursive=False, page_depth=5):
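        """
        url: The URL to start scraping from.
        recursive: If True, follow links found on each scraped page.
        page_depth: Maximum number of pages deep to recurse when recursive
                    is True.
        """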
        # TODO: Document ;)
        self.url = url
        self.recursive = recursive
        self.page_depth = page_depth
        self.link_blacklist = None
        # By default we just match against the domain of the URL we're
        # scraping. In theory you should be able to set this to a Python regex
        # string to allow subdomains, for example.
        # TODO: Handle cases where self.url is an invalid URL. Right now that
        # will result in an uncaught exception.
        # TODO: Test various regexes.
        self.link_regex = re.match('.+?://(.+?)(?:/.*?)?$', self.url).group(1)
        # The regular expression that will be run on the HTML of every visited
        # link. For demonstration purposes, the default regex finds the 'src'
        # attribute of all 'img' tags.
        self.regex = '<img.+?src=["\'](.+?)["\'].*?/?>'
        # A callable object that will be executed on the result of self.regex.
        # It will be passed the current URL and the regular expression result.
        # For demonstration purposes, the default simply returns a list.
        self.regex_callback = lambda url, regex_result: [url, regex_result]
        # "Private" state.
        self._link_blacklist_regex = None
        self._current_depth = 0
        self._visited_links = set()

    def get_links(self, html, link_regex=''):
        """
        Return a set of href values from all 'a' elements.

        html: The HTML text to search.
        link_regex: An optional regex used to determine whether a link should
                    be followed.
        """
        # Use a set rather than a list, because a set strips out duplicates.
        # TODO: Add support for relative URLs.
        return set(re.findall('<a.+?href=["\'](http://' + link_regex + '.*?)(?:#.*?)?["\'].+?>', html))

    def _can_visit_link(self, link):
        """
        Return whether or not we 'can' visit the supplied link.

        Reasons we may not want to visit the link currently include:
        - The link matches our blacklist.
        - The link is already in our set of visited links.
        """
        if self._link_blacklist_regex and re.search(self._link_blacklist_regex, link):
            #print('Ah snap! This ish is blacklisted: %s' % link)
            return False
        if link in self._visited_links:
            #print('Damn son! I done already seen this ish: %s' % link)
            return False
        return True

    def _scrape_url(self, url):
        """
        Scrape a single URL and, if self.recursive is set, its links too.
        """
        try:
            req = urllib2.Request(url)
            res = urllib2.urlopen(req)
        except urllib2.HTTPError:
            # For now, silently swallow HTTP errors such as 404s.
            return
        html = res.read()
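        # Run our scrape regex over the fetched HTML and hand the list of
        # matches to the callback.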
        self.regex_callback(url, re.findall(self.regex, html))
        if self.recursive:
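            # Gather the links on this page and recurse into each one we're
            # allowed to visit.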
            for link in self.get_links(html, self.link_regex):
                if self._can_visit_link(link) and self._current_depth < self.page_depth:
                    self._visited_links.add(link)
                    self._current_depth += 1
                    #print "I'm %d page(s) up in this ish son!" % self._current_depth
                    self._scrape_url(link)
                    self._current_depth -= 1
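                # Once we've hit the configured page depth, back out of this
                # branch of the crawl.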
                if self._current_depth >= self.page_depth:
                    self._current_depth -= 1
                    return

    def scrape(self):
        # Generate the link blacklist regular expression.
        if self.link_blacklist:
            self._link_blacklist_regex = re.compile('|'.join(['(%s)' % u for u in self.link_blacklist]))
        self._scrape_url(self.url)


if __name__ == '__main__':
    s = SimpleSiteScraper('http://example.com')
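    # By default only links on example.com itself are followed. In theory you
    # could widen the link regex to also match subdomains, e.g. (illustrative,
    # untested):
    #s.link_regex = '(?:[^/]+\.)?example\.com'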
    # You can optionally supply a list of strings to blacklist. Each entry can
    # be a plain string, a full URL, or a Python regex string.
    #s.link_blacklist = [
    #    'http://example.com/some_file.html',
    #    'http://example.com/some_dir.*?',
    #    '.+?gallery'
    #]
    # Follow links found on each page.
    s.recursive = True
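    # You can also cap how many pages deep the scraper will recurse; the
    # default set in __init__ is 5. For example:
    #s.page_depth = 3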

    # Define a callback for SimpleSiteScraper.regex_callback. We'll just print
    # the page URL, the number of img src URLs found by our regex, and a
    # comma-separated list of those srcs.
    def print_url_and_srcs(url, regex_response):
        print('"%s",%d,"%s"' % (url, len(regex_response), ','.join(regex_response)))
    s.regex_callback = print_url_and_srcs

    # The regex_callback we set prints output as CSV rows, so let's print a
    # header row before we start our scrape.
    print('"url","num_of_imgs","srcs"')
    # Start scraping!
    s.scrape()