[AUDIT][WEBSITE] Find and Parse Sitemap
Commit a2bbb1d (parent 1be7a61)
Showing 3 changed files with 143 additions and 0 deletions.
@@ -0,0 +1,6 @@
import toolkit.controller.audit.site_audit
import toolkit.controller.audit.page_audit
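These package-level imports presumably make the two audit controllers available when `toolkit.controller.audit` is imported. A hedged sketch of what that enables; the mapping of `page_audit`/`site_audit` onto the classes added below is an assumption, not stated in the diff:

# Hypothetical: assumes page_audit defines AuditPage and site_audit
# defines AuditWebsite, matching the two files added in this commit.
from toolkit.controller.audit.page_audit import AuditPage
from toolkit.controller.audit.site_audit import AuditWebsite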
@@ -0,0 +1,36 @@
from urllib.parse import urlparse
from toolkit.lib.http_tools import request_page
from bs4 import BeautifulSoup


class AuditPage:
    def __init__(self, url):
        parsed_url = urlparse(url)
        self.domain = parsed_url.netloc
        self.scheme = parsed_url.scheme
        self.path = parsed_url.path
        self.request = request_page(self.generate_url())
        self.status_code = self.request.status_code
        self.headers = self.request.headers
        self.soup = BeautifulSoup(self.request.content, 'html.parser')

    def __str__(self):
        a = "--------------------\n"
        a += "Domain: " + self.domain + "\n"
        a += "Scheme: " + self.scheme + "\n"
        a += "Path: " + self.path + "\n"
        a += "Status Code: " + str(self.status_code) + "\n"
        a += "Headers: " + str([x for x in self.headers]) + "\n"
        return a

    def generate_url(self):
        # urlparse keeps the leading "/" on the path, so no extra
        # separator is needed between the domain and the path.
        return self.scheme + "://" + self.domain + self.path
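A minimal usage sketch for `AuditPage`, assuming `toolkit.lib.http_tools.request_page` returns a `requests`-style response object; the URL is a placeholder:

# Hypothetical usage; the URL is a placeholder and request_page is
# assumed to behave like requests.get.
page = AuditPage("https://example.com/about")
print(page)             # domain, scheme, path, status code, header names
print(page.soup.title)  # parsed HTML is available through page.soup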
@@ -0,0 +1,101 @@
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup as Soup

from toolkit.lib.http_tools import request_page

class AuditWebsite:
    def __init__(self, url):
        parsed_url = urlparse(url)
        self.domain = parsed_url.netloc
        self.scheme = parsed_url.scheme
        self.path = parsed_url.path
        self.sitemap = []
        # Named to match the attribute that robots_finder() actually sets.
        self.robots_present = False
        self.populate_request()
        self.robots_finder()
        self.populate_urls()

    def populate_request(self):
        self.request = request_page(self.generate_url())
        self.status_code = self.request.status_code

    def robots_finder(self):
        request = request_page(self.generate_url() + "/robots.txt")
        if request.status_code == 200:
            self.robots_present = True
            self.find_sitemap(request.text)

    def find_sitemap(self, robots):
        # Collect every sitemap URL declared in robots.txt. Only the
        # directive name is lower-cased so the URL keeps its case, and
        # bare "Sitemap:" lines without a URL are skipped.
        self.sitemap = []
        for line in robots.split("\n"):
            parts = line.split(" ")
            if len(parts) > 1 and parts[0].lower() in ("sitemap:", "sitemaps:"):
                self.sitemap.append(parts[1])

    def populate_urls(self):
        # Merge the URLs from every discovered sitemap, skipping duplicates.
        list_urls = []
        self.urls = []
        for i in self.sitemap:
            sitemap_urls = parse_sitemap(i)
            for url in sitemap_urls:
                if url not in list_urls:
                    list_urls.append(url)
        self.urls = list_urls

    def generate_url(self):
        return self.scheme + "://" + self.domain

def parse_sitemap(url):
    resp = requests.get(url)
    # We didn't get a valid response: bail with an empty list so
    # recursive callers can safely concatenate the result.
    if resp.status_code != 200:
        return []

    # BeautifulSoup parses the document as XML
    soup = Soup(resp.content, "xml")

    # find all the <url> and <sitemap> tags in the document
    urls = soup.find_all('url')
    sitemaps = soup.find_all('sitemap')
    out_total = []

    if not urls and not sitemaps:
        return []

    # Recursive call to the function when the sitemap is an index
    # that points at further sitemaps
    if sitemaps:
        for u in sitemaps:
            loc = u.find('loc').string
            out_total += parse_sitemap(loc)

    # storage for later...
    out = []

    # Extract the <loc> value from each <url> entry
    for u in urls:
        loc = u.find("loc")
        out.append(loc.string if loc else "None")

    # returns the combined list of page URLs
    return out_total + out
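A similar hedged sketch for the site-level crawl; `example.com` and its sitemap URL are placeholders, and `request_page` is again assumed to behave like `requests.get`:

# Hypothetical usage; example.com is a placeholder domain.
site = AuditWebsite("https://example.com")
print(site.robots_present)  # True if /robots.txt answered with HTTP 200
print(len(site.urls))       # de-duplicated page URLs from every sitemap

# parse_sitemap can also be called directly on a known sitemap URL;
# it returns a flat list of page URLs, recursing through sitemap indexes.
for url in parse_sitemap("https://example.com/sitemap.xml"):
    print(url)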