robots_perm.py
import requests
from bs4 import BeautifulSoup as bs  # imported in the original but not used below
import re
import time
import pandas as pd
from ratelimit import limits  # unused; get_rt throttles with time.sleep instead
from multiprocessing import Pool  # unused in the original; see the sketch at the bottom
import urllib.robotparser as rp
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
def getFilename_fromCd(cd):
    """Extract a filename from a Content-Disposition header, or return None."""
    if not cd:
        return None
    fname = re.findall('filename=(.+)', cd)
    if len(fname) == 0:
        return None
    return fname[0]
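# Hedged example of the helper above; the header value is made up, not from
# the original script:
#   getFilename_fromCd('attachment; filename=robots_backup.txt')
#   -> 'robots_backup.txt'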
def get_rt(url, suffix):
    """Download http://{url}.{suffix}/robots.txt and save it as {url}.txt."""
    urll = f"http://{url}.{suffix}/robots.txt"
    r = requests.get(urll, headers=headers, allow_redirects=True)
    if r.status_code == 200:
        print(f'request for {url} successful')
    else:
        print(f'request for {url} failed')
    # Derived from the response but unused: the file is always saved as {url}.txt.
    filename = getFilename_fromCd(r.headers.get('content-disposition'))
    with open(f'{url}.txt', 'wb') as f:
        f.write(r.content)
    print(f'file for {url} saved')
    time.sleep(5)  # crude rate limiting between requests
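# Hedged usage example (hostname is a placeholder, not from the original):
#   get_rt('example', 'com')  # fetches http://example.com/robots.txt -> example.txt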
def df_loader(path):
    """Read a CSV of sites and fetch robots.txt for each row."""
    df = pd.read_csv(path)
    for i, r in df.iterrows():
        get_rt(r['Url'], r['suffix'])
# df_loader('indian_websites.csv')
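# The CSV is assumed to have 'Url' and 'suffix' columns, matching the lookups
# in df_loader; the rows below are hypothetical:
#
#   Url,suffix
#   example,com
#   wikipedia,org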
def parse_robofile(file):
    """Parse a robots.txt by URL and return its request rate for '*' (or None)."""
    # The original referenced an undefined `file` and discarded the result;
    # taking it as a parameter and returning the rate is the minimal fix.
    r = rp.RobotFileParser()
    r.set_url(file)
    r.read()
    rrate = r.request_rate("*")
    return rrate
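# request_rate returns a named tuple (requests, seconds) when the robots.txt
# declares a Request-rate, else None. Hedged example with a placeholder URL:
#   rate = parse_robofile('http://example.com/robots.txt')
#   if rate is not None:
#       print(rate.requests / rate.seconds)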
def check_robotfile(name, url):
    """Parse a previously downloaded robots.txt and test whether url may be fetched."""
    with open(f"webs_robots_files/{name}.txt") as fle:
        r = rp.RobotFileParser()
        # set_url() expects a URL and returns None, so the original
        # `r = r.set_url(fle)` discarded the parser; parse the file contents instead.
        r.parse(fle.read().splitlines())
    op = r.can_fetch("*", url)
    print(op)
    return op
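# --- Hedged end-to-end sketch (not part of the original script) ---
# Assumes the placeholder sites below and that the downloaded {name}.txt files
# are moved into webs_robots_files/ before the permission check. Uses the Pool
# import that the original left unused.
if __name__ == '__main__':
    sites = [('example', 'com'), ('wikipedia', 'org')]  # hypothetical list
    with Pool(processes=2) as pool:
        pool.starmap(get_rt, sites)  # download robots.txt files in parallel
    for name, suffix in sites:
        # Would any crawler ('*') be allowed to fetch the site root?
        check_robotfile(name, f"http://{name}.{suffix}/")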