#!/usr/bin/env python3
"""
A simple example crawler implementation skeleton.
Used during lectures of courses in:
Department of Computer Engineering & Informatics
Information Retrieval (NE5597)
Winter Semester 2016-2017
--
Multiple improvements are expected!
Installation requirement:
pip install beautifulsoup4
Execution example:
python crawler.py -u http://www.example.com
"""
import argparse

from urllib.request import urlopen
from urllib.parse import urljoin

from bs4 import BeautifulSoup

def run_crawler(pending_links, visited_links, max_pages, visited_count):
    """
    :param list pending_links: A list of links to be visited and processed.
    :param list visited_links: A list of visited links, helping avoid revisiting.
    :param int max_pages: The maximum number of pages to visit.
    :param int visited_count: A counter of pages visited so far.
    :return: None
    """
    while visited_count < max_pages and pending_links:
        try:
            curr_url = pending_links.pop(0)
            visited_links.append(curr_url)
            print("#", visited_count, "processing:", curr_url, "\n\tFound:")
            link_parser(curr_url, pending_links, visited_links)
            visited_count += 1
        except (KeyboardInterrupt, SystemExit):
            exit(0)
        except Exception as msg:
            print("[ERROR]", msg, "in url:", curr_url)
            exit(9)
    print("DONE...")
def link_parser(url, pending_links, visited_links):
    """
    :param str url: The url to be processed and parsed for urls.
    :param list pending_links: A list of links to be visited and processed.
    :param list visited_links: A list of visited links, helping avoid revisiting.
    :return: None
    """
    try:
        # Open url with urlopen
        # More info on urllib.request.urlopen:
        # - https://docs.python.org/3/library/urllib.request.html#urllib.request.urlopen
        response = urlopen(url)
        # Check the response Content-Type header. Ignore anything other
        # than HTML documents ('text/html').
        # CSS, javascript, images, etc. are ignored.
        if 'text/html' in response.headers.get('Content-Type', ''):
            # HTTP response codes other than 200 would indicate a problem.
            # More info:
            # - https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
            if response.status == 200:
                # The HTML page of the url may now be processed. Let's read it:
                html_page = response.read().decode("utf-8")
                # Parse the html page with the BeautifulSoup HTML parser.
                # More info on BeautifulSoup:
                # - https://www.crummy.com/software/BeautifulSoup/bs4/doc/
                soup = BeautifulSoup(html_page, "html.parser")
                links = []
                # Find all anchor tags.
                # html link example: <a href="http://www.example.com">example.com link</a>
                # More info:
                # - https://www.tutorialspoint.com/html/html_text_links.htm
                # - https://en.wikipedia.org/wiki/Hyperlink
                for link in soup.find_all('a'):
                    href = link.get('href')
                    # Normalize the href url
                    href = normalize_href(url, href)
                    if href:
                        # Update pending_links and visited_links accordingly:
                        if href not in pending_links and href not in visited_links:
                            pending_links.append(href)
                            links.append(href)
                            # Print the new url:
                            print("\t", href)
    except (KeyboardInterrupt, SystemExit):
        print("Exited...")
        exit(0)
    except Exception as msg:
        print("[ERROR]", msg, "\n parsing", url)
def normalize_href(current_url, href):
    """
    Very primitive and simple href normalization.
    (HINT: needs expansion)
    More info:
    - https://en.wikipedia.org/wiki/URL_normalization
    :param current_url: The url where the href was found.
    :param href: The href value (Hypertext REFerence) to normalize.
    :return: A normalized absolute url, or None if href is missing or a mailto link.
    """
    if not href or href.startswith("mailto:"):
        return None
    if href.startswith("www"):
        href = 'http://' + href
    elif href.startswith('//'):
        href = 'http:' + href
    elif not href.startswith('http'):
        href = urljoin(current_url, href)
    return href
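
# Improvement sketch (one possible expansion of the HINT above, not part of
# the original skeleton): strip #fragments and lowercase the scheme and host,
# so that urls that address the same document compare equal and are not
# crawled twice.
def normalize_href_extended(current_url, href):
    """Hypothetical expanded normalizer built on top of normalize_href."""
    from urllib.parse import urlsplit, urlunsplit
    href = normalize_href(current_url, href)
    if not href:
        return None
    parts = urlsplit(href)
    # Fragments only address a position inside the same document, so drop
    # them; an empty path is equivalent to '/'.
    return urlunsplit((parts.scheme.lower(), parts.netloc.lower(),
                       parts.path or '/', parts.query, ''))
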
if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="A simple urllib + BeautifulSoup crawler.",
                                     epilog="Example usage: python crawler.py -u http://www.example.com")
    parser.add_argument('-u', '--url', type=str, required=True, help='The starting crawler URL')
    parser.add_argument('-m', '--maxpages', type=int, default=100, help='Maximum allowed number of crawled pages')
    args = parser.parse_args()
    # Start the crawler. We use a single url as the starting point [args.url].
    # The visited urls list starts empty: []. The maximum number of crawled
    # pages is provided by the maxpages command line argument.
    # We also initialize the crawled pages counter with 0.
    run_crawler([args.url], [], args.maxpages, 0)