diff --git a/toolkit/controller/audit/__init__.py b/toolkit/controller/audit/__init__.py
new file mode 100644
index 0000000..8153369
--- /dev/null
+++ b/toolkit/controller/audit/__init__.py
@@ -0,0 +1,2 @@
+import toolkit.controller.audit.site_audit
+import toolkit.controller.audit.page_audit
diff --git a/toolkit/controller/audit/page_audit.py b/toolkit/controller/audit/page_audit.py
new file mode 100644
index 0000000..ab6bef7
--- /dev/null
+++ b/toolkit/controller/audit/page_audit.py
@@ -0,0 +1,28 @@
+from urllib.parse import urlparse
+from toolkit.lib.http_tools import request_page
+from bs4 import BeautifulSoup
+
+
+class AuditPage():
+    def __init__(self, url):
+        parsed_url = urlparse(url)
+        self.domain = parsed_url.netloc
+        self.scheme = parsed_url.scheme
+        self.path = parsed_url.path
+        self.request = request_page(self.generate_url())
+        self.status_code = self.request.status_code
+        self.headers = self.request.headers
+        self.soup = BeautifulSoup(self.request.content, 'html.parser')
+
+    def __str__(self):
+        a = "--------------------\n"
+        a += "Domain: " + self.domain + "\n"
+        a += "Scheme: " + self.scheme + "\n"
+        a += "Path: " + self.path + "\n"
+        a += "Status Code: " + str(self.status_code) + "\n"
+        a += "Headers: " + str(list(self.headers)) + "\n"
+        return a
+
+    def generate_url(self):
+        # urlparse keeps the leading slash on the path, so no extra "/" is added.
+        return self.scheme + "://" + self.domain + self.path
diff --git a/toolkit/controller/audit/site_audit.py b/toolkit/controller/audit/site_audit.py
new file mode 100644
index 0000000..0e13bba
--- /dev/null
+++ b/toolkit/controller/audit/site_audit.py
@@ -0,0 +1,83 @@
+import requests
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+
+from toolkit.lib.http_tools import request_page
+
+
+class AuditWebsite():
+    def __init__(self, url):
+        parsed_url = urlparse(url)
+        self.domain = parsed_url.netloc
+        self.scheme = parsed_url.scheme
+        self.path = parsed_url.path
+        self.sitemap = []
+        self.robots_present = False
+        self.populate_request()
+        self.robots_finder()
+        self.populate_urls()
+
+    def populate_request(self):
+        self.request = request_page(self.generate_url())
+        self.status_code = self.request.status_code
+
+    def robots_finder(self):
+        # robots.txt is the conventional place for Sitemap: declarations.
+        request = request_page(self.generate_url() + "/robots.txt")
+        if request.status_code == 200:
+            self.robots_present = True
+            self.find_sitemap(request.text)
+
+    def find_sitemap(self, robots):
+        # Collect every sitemap URL declared in robots.txt, matching the
+        # directive case-insensitively but keeping the URL's original case.
+        self.sitemap = []
+        for line in robots.split("\n"):
+            parts = line.split()
+            if len(parts) >= 2 and parts[0].lower() in ("sitemap:", "sitemaps:"):
+                self.sitemap.append(parts[1])
+
+    def populate_urls(self):
+        # Flatten every sitemap into a de-duplicated list of page URLs.
+        list_urls = []
+        for sitemap_url in self.sitemap:
+            for url in parse_sitemap(sitemap_url):
+                if url not in list_urls:
+                    list_urls.append(url)
+        self.urls = list_urls
+
+    def generate_url(self):
+        return self.scheme + "://" + self.domain
+
+
+def parse_sitemap(url):
+    resp = requests.get(url)
+    # No valid response: return an empty list so callers can always iterate.
+    if resp.status_code != 200:
+        return []
+
+    # Parse the sitemap as XML
+    soup = BeautifulSoup(resp.content, "xml")
+
+    # Find all the <url> and <sitemap> tags in the document
+    urls = soup.find_all("url")
+    sitemaps = soup.find_all("sitemap")
+
+    if not urls and not sitemaps:
+        return []
+
+    collected = []
+
+    # Recursive call if this is a sitemap index that points to nested sitemaps
+    for sitemap in sitemaps:
+        loc = sitemap.find("loc")
+        if loc and loc.string:
+            collected += parse_sitemap(loc.string)
+
+    # Extract the <loc> of every page entry
+    for u in urls:
+        loc = u.find("loc")
+        collected.append(loc.string if loc else "None")
+
+    # Return the flat list of collected URLs
+    return collected
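
A minimal usage sketch of the two new classes (not part of the diff; it assumes toolkit.lib.http_tools.request_page behaves like requests.get, and the example URL is hypothetical):

    from toolkit.controller.audit.site_audit import AuditWebsite
    from toolkit.controller.audit.page_audit import AuditPage

    # Discover page URLs from the sitemap(s) advertised in robots.txt.
    site = AuditWebsite("https://example.com")
    print(len(site.urls), "URLs found via sitemap")

    # Audit the first few pages individually.
    for url in site.urls[:5]:
        page = AuditPage(url)
        print(page)              # domain, scheme, path, status code, header names
        print(page.soup.title)   # parsed HTML is available on page.soup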