#!/usr/bin/env python3
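"""Asynchronous CDN cache warmer.

Fetches one or more XML sitemaps (sitemap indexes are expanded via the work queue),
requests every listed URL concurrently, and reports the response code, connection
setup time, meta-robots status and Cache-Control max-age for each URL.
"""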
import os
import sys
import datetime
import argparse
import functools
import asyncio
import aiohttp
import requests
import pandas as pd
from lxml import etree, html
from tabulate import tabulate
from urllib.parse import urlparse
parser = argparse.ArgumentParser(description='Asynchronous CDN Cache Warmer')
parser.add_argument('-s', '--site', action="append", dest='sites', default=None,
                    help="Sitemap (or sitemap index) URL; may be given multiple times")
parser.add_argument('-d', '--depth', action="store", dest='depth', type=int, default=None,
                    help="Stop after this many sitemap iterations")
parser.add_argument('-c', '--concurrency', action="store", dest='concurrency', type=int, default=1,
                    help="Maximum number of concurrent requests")
parser.add_argument('-o', '--output', action="store_true", default=None,
                    help="Write per-sitemap results to a CSV file in /tmp")
parser.add_argument('-q', '--quiet', action="store_true", help="Only print 40x, 50x or 200 with noindex")
args = parser.parse_args()
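# Example invocation (example.com is a placeholder sitemap URL):
#   python3 main.py -s https://example.com/sitemap.xml -c 10 -d 5 -q -o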
concurrency = args.concurrency
depth = args.depth
sites = args.sites
quiet = args.quiet
output = args.output
# Colorize #
red = '\033[0;31m'
green = '\033[0;32m'
no_color = '\033[0m'
tasks = []
tab_headers = ['URL', 'Response code', 'Time (s)', 'Meta Robots', 'Cache Control']
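# Per-URL results for the sitemap currently being warmed (reset after each iteration)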
results = pd.DataFrame(columns=['domain', 'http_code', 'time', 'robots_status', 'cache_control'])
time_array = []
dot = 0
dot_total = 0
failed_links = 0
success_links = 0
domain = ''
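# Requests go out with a Googlebot User-Agent, presumably so the warmed cache entries
# match the variant served to search crawlers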
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.googlebot.com/bot.html)'}
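
# Fetch and parse a sitemap: <sitemap> entries (sitemap indexes) are pushed back onto
# the global `sites` queue for later processing, <url> entries are returned as links.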
def get_links(mage_links):
    r = requests.get(mage_links)
    if r.status_code != 200:
        sys.exit(red + "Sitemap fetch failed for %s with %s. Exiting..." % (mage_links, r.status_code) + no_color)
    root = etree.fromstring(r.content)
    print("URLs on sitemap found: %s" % len(root))
links = []
for sitemap in root:
prefix, tag = sitemap.tag.split("}")
        children = list(sitemap)  # getchildren() is deprecated in lxml/ElementTree
if tag == 'sitemap':
print("Sitemap Index found %s" % children[0].text)
sites.append(children[0].text)
else:
links.append(children[0].text)
return links
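
# Each URL is warmed by warm_it(); the shared semaphore caps how many requests are in flight.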
async def bound_warms(sem, url):
async with sem:
await warm_it(url)
async def warm_it(url):
connection_started_time = None
connection_made_time = None
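    # The two subclasses below hook into aiohttp internals (ResponseHandler and TCPConnector)
    # to timestamp when a connection attempt starts and when the transport is established.
    # This relies on private aiohttp APIs and may break across aiohttp versions.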
class TimedResponseHandler(aiohttp.client_proto.ResponseHandler):
def connection_made(self, transport):
nonlocal connection_made_time
connection_made_time = datetime.datetime.now()
return super().connection_made(transport)
class TimedTCPConnector(aiohttp.TCPConnector):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._factory = functools.partial(TimedResponseHandler, loop=loop)
async def _create_connection(self, req, traces, timeout):
nonlocal connection_started_time
connection_started_time = datetime.datetime.now()
return await super()._create_connection(req, traces, timeout)
async with aiohttp.ClientSession(connector=TimedTCPConnector(loop=loop), headers=headers) as session:
async with session.get(url) as response:
global dot, dot_total, results
time_delta = connection_made_time - connection_started_time
            time_taken = "%.3f" % time_delta.total_seconds()
robots_status = ""
cache_control = ""
if response.status != 200:
response_output = red + str(response.status) + no_color
else:
response_output = green + str(response.status) + no_color
time_array.append(time_delta)
doc = html.fromstring(await response.text())
robots = doc.xpath("//meta[@name='robots']/@content")
            if len(robots) > 0:
                robots_status = 'NA'
                if 'noindex' in robots[0]:
                    robots_status = 'NI'
                    response_output = red + str(response.status) + no_color
                elif 'index,follow' in robots[0]:
                    robots_status = 'IF'
                elif 'index' in robots[0]:
                    robots_status = 'I'
            else:
                robots_status = 'NA'
if 'Cache-Control' in response.headers:
current_control = response.headers['Cache-Control'].split(', ')
cache_control = current_control[0].replace('max-age=', '')
            # Record the row unless quiet mode is on and this is a plain 200 (the -q help
            # also promises 200s carrying a noindex meta tag, so those are kept too)
            if (quiet is False) or (response.status != 200) or (robots_status == 'NI'):
                res = {'domain': url.replace(domain, ''),
                       'http_code': response_output,
                       'time': time_taken[:5],
                       'robots_status': robots_status,
                       'cache_control': cache_control}
                # DataFrame.append() was removed in pandas 2.0; use concat instead
                results = pd.concat([results, pd.DataFrame([res])], ignore_index=True)
            if quiet is False:
                dot += 1
                if dot == 100:
                    dot = 0
                    dot_total += 100
                    progress = ". %i\n" % dot_total
                else:
                    progress = '.'
                print(progress, end='', flush=True)
del doc, robots, response
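
# Dump the per-URL results to /tmp/<sitemap basename>.csv (the `headers` argument is currently unused)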
def write_list_to_csv(current_sites, headers, results):
a = urlparse(current_sites)
url_file = os.path.basename(a.path)
filename = url_file.split('.')
results.to_csv('/tmp/%s.csv' % filename[0], encoding='utf-8', index=False)
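
# Main driver: pop sitemaps off the queue (indexes discovered by get_links() are appended
# back onto it), warm every URL concurrently, then print a per-sitemap summary table and
# optionally write it to CSV.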
def main():
global success_links, failed_links, time_array, results, domain, tab_headers
    # -c defaults to 1, so concurrency is always set here; report the active settings
    print("Setting concurrency limit to %s Quiet: %s Output: %s" % (concurrency, quiet, output))
print("\n")
iteration = 0
while sites:
current_sites = sites.pop(0)
domain = urlparse(current_sites)
domain = domain.scheme+'://'+domain.netloc
print("SITEMAP: %s" % current_sites)
mage_links = get_links(str(current_sites))
if len(mage_links) > 0:
for i in mage_links:
task = asyncio.ensure_future(bound_warms(sem, i))
tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
if results is not None and isinstance(results, pd.DataFrame) and not results.empty:
for index, row in results.iterrows():
if "200" in row["http_code"]:
success_links += 1
else:
failed_links += 1
avg_time = str((sum([x.total_seconds() for x in time_array]))/len(time_array))
print("\n")
print("Meta Robots: I = index, IF=index/follow, F=noindex, NA = Not Found")
print(tabulate(results, showindex=True,
headers=tab_headers))
print(tabulate([[str(failed_links), str(success_links), avg_time]],
headers=['Failed', 'Successful', 'Average Time']))
if output is True:
write_list_to_csv(current_sites, tab_headers, results)
iteration += 1
        if (depth is not None) and iteration == depth:
            print("Depth level of %i reached" % iteration)
            sys.exit()
        del mage_links
        results = results.iloc[0:0]
        print("END ITERATION %i \n" % iteration)
if failed_links > 0:
sys.exit("Number of failed links: %s. Exiting..." % str(failed_links))
if __name__ == "__main__":
sem = asyncio.Semaphore(int(concurrency))
loop = asyncio.get_event_loop()
# execute only if run as a script
main()