web_crawler.py
import time
from threading import Thread
from MySet import MySet
from Page import Page
from Path import Path
import argparse
import xml.etree.ElementTree as ET
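
# MySet, Page and Path are the crawler's own helper modules (expected to sit next to
# this script as MySet.py, Page.py and Path.py); they are not third-party packages.
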
parser = argparse.ArgumentParser(prog="python3 web_crawler.py", description="Export shortest paths of a domain into XML, create XML sitemap, find dead links")
parser.add_argument("-d", "--domain", help="The domain url", type=str, required=True)
parser.add_argument("-f", "--firstpage", help="The home page of the domain. Defaults to /", type=str, default="/")
parser.add_argument("-so", "--sitemapout", help="The path to where the sitemap xml will be saved", type=str, default="sitemap.xml")
parser.add_argument("-po", "--pathsout", help="The path to where the shortest paths xml will be saved", type=str, default="paths.xml")
parser.add_argument("-do", "--deadout", help="The path to where the dead links xml will be saved", type=str, default="dead.xml")
arg = parser.parse_args()
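
# Example invocation (the domain below is illustrative):
#   python3 web_crawler.py -d https://example.com -f / -so sitemap.xml -po paths.xml -do dead.xml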

base_url = arg.domain
first_page = arg.firstpage
pages = []   # pages collected by the worker threads during the current create_pages() call
parsed = []  # urls that have already been handed to a worker thread
dead = []    # pages whose link turned out to be dead

def sitemap_out():
    # write every visited page into an XML sitemap (<urlset><url><loc>...</loc></url></urlset>)
    rootEl = ET.Element("urlset")
    for site in collection.visited:
        url_Elem = ET.Element("url")
        loc_Elem = ET.Element("loc")
        loc_Elem.text = site.url
        url_Elem.append(loc_Elem)
        rootEl.append(url_Elem)
    xmltree = ET.ElementTree(element=rootEl)
    xmltree.write(arg.sitemapout)

def paths_out():
    # write the shortest path found to each visited page
    rootElem = ET.Element("paths")
    for path in collection.paths:
        rootElem.append(path.to_xml())
    xmltree = ET.ElementTree(element=rootElem)
    xmltree.write(arg.pathsout)

def dead_out():
    # write the links that could not be reached (dead links)
    rootElem = ET.Element("dead")
    for page in dead:
        rootElem.append(page.to_xml())
    xmltree = ET.ElementTree(element=rootElem)
    xmltree.write(arg.deadout)
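
# Rough shape of the generated files (the child elements of paths.xml and dead.xml come
# from Path.to_xml() and Page.to_xml() in the helper modules, so their inner markup is
# only sketched here; the url is illustrative):
#   sitemap.xml  <urlset><url><loc>https://example.com/...</loc></url>...</urlset>
#   paths.xml    <paths> one child element per shortest path </paths>
#   dead.xml     <dead> one child element per dead link </dead>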

def command(url, baseurl):
    # worker run by each thread: build the Page and file it as live or dead
    page = Page(url, baseurl)
    if not page.is_dead():
        pages.append(page)
    else:
        dead.append(page)

def create_pages(urls: list):
    # fetch every url that has not been seen yet, each in its own thread,
    # and return the Page objects that turned out to be alive
    global collection
    global pages
    pages = []
    threads = []
    for url in urls:
        if collection.is_visited(url) or (url in parsed):
            continue
        else:
            parsed.append(url)
            t = Thread(target=command, args=(url, base_url))
            threads.append(t)
            t.start()
    for t in threads:
        if t.is_alive():
            t.join()
    return pages
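
# Note: create_pages() starts one thread per outgoing link, so a link-heavy page can spawn
# a lot of threads at once. A bounded variant (not part of this script) could use a worker
# pool instead, for example (urls_to_fetch stands for the filtered url list):
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor(max_workers=16) as pool:
#       pool.map(lambda u: command(u, base_url), urls_to_fetch)
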
start = time.time()
# create the custom composite data structure that implements part of the algorithm logic
# and keeps track of the bookkeeping structures the crawl needs
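# Judging from how it is used below, MySet is expected to provide at least:
#   add(path), set_min(), has_next(), pop(), is_visited(url),
#   plus the collections .visited (pages reached so far) and .paths (best path per page)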
collection = MySet()
# create the first page/node
aPage = Page(first_page, base_url)
# create a path starting from the first page
path = Path(aPage)
# add to the data structure which will update its contents
collection.add(path)
# find the next min path that will be chosen
collection.set_min()
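
# The loop below is the usual shortest-path expansion (Dijkstra-style): repeatedly take
# the cheapest unvisited path, fetch the links of its last page, and extend the path by
# one page for each new neighbor, letting the collection keep only the best path per page.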
# while there are still unvisited paths
while collection.has_next():
    # get the next unvisited cheapest path
    path = collection.pop()
    # get its last node
    node = path.last
    # get its neighbors
    raw_neighbors = create_pages(node.links)
    # get its unvisited neighbors
    neighbors = list(filter(lambda x: x not in collection.visited, raw_neighbors))
    # for each neighbor
    for n in neighbors:
        # copy the base path so we can create a new path for each neighbor
        new_path = path.copy()
        # create a path for the neighbor
        new_path.add(n)
        # let the collection decide whether to add it or not
        collection.add(new_path)
    # find the next min path
    collection.set_min()

end = time.time()
paths_out()
sitemap_out()
dead_out()
print("It took {} seconds to crawl the domain".format(end - start))