From 7a6a4d51e27562169c9ecbe35ce554a7dd79b4bc Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Fri, 9 Feb 2024 17:30:11 -0600
Subject: [PATCH 1/7] Added BeautifulSoup Helper - Work in Progress

---
 bbot/core/helpers/web.py | 55 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py
index 983a88c96..d834f8955 100644
--- a/bbot/core/helpers/web.py
+++ b/bbot/core/helpers/web.py
@@ -566,6 +566,61 @@ def is_spider_danger(self, source_event, url):
                 return True
         return False
 
+    def beautifulsoup(
+        self,
+        markup,
+        features="html.parser",
+        builder=None,
+        parse_only=None,
+        from_encoding=None,
+        exclude_encodings=None,
+        element_classes=None,
+        **kwargs,
+    ):
+        """
+        Navigate, Search, Modify, Parse, or PrettyPrint HTML Content.
+        More information at https://beautiful-soup-4.readthedocs.io/en/latest/
+
+        Args:
+            markup: A string or a file-like object representing markup to be parsed.
+            features: Desirable features of the parser to be used.
+                This may be the name of a specific parser ("lxml",
+                "lxml-xml", "html.parser", or "html5lib") or it may be
+                the type of markup to be used ("html", "html5", "xml").
+                Defaults to 'html.parser'.
+            builder: A TreeBuilder subclass to instantiate (or instance to use)
+                instead of looking one up based on `features`.
+            parse_only: A SoupStrainer. Only parts of the document
+                matching the SoupStrainer will be considered.
+            from_encoding: A string indicating the encoding of the
+                document to be parsed.
+            exclude_encodings: A list of strings indicating
+                encodings known to be wrong.
+            element_classes: A dictionary mapping BeautifulSoup
+                classes like Tag and NavigableString, to other classes you'd
+                like to be instantiated instead as the parse tree is
+                built.
+            **kwargs: For backwards compatibility purposes.
+
+        Returns:
+            soup: An instance of the BeautifulSoup class
+
+        Todo:
+            - Write tests for this function
+
+        Examples:
+            >>> soup = BeautifulSoup(event.data["body"], "html.parser")
+            Perform an HTML parse of the 'markup' argument and return a soup instance
+
+            >>> email_type = soup.find(type="email")
+            Searches the soup instance for all occurrences of the passed-in argument
+        """
+
+        soup = BeautifulSoup(
+            markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs
+        )
+        return soup
+
     def ssl_context_noverify(self):
         if self._ssl_context_noverify is None:
             ssl_context = ssl.create_default_context()
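A note on the helper added above: all of its arguments are passed straight through to BeautifulSoup, and `parse_only` is the one most worth knowing about, since it restricts parsing to only the tags a caller cares about. A minimal standalone sketch of that pass-through behavior, using plain bs4 and invented markup (not code from this patch):

    from bs4 import BeautifulSoup, SoupStrainer

    html = '<p>intro</p><a href="https://example.com/">home</a><p>outro</p>'
    # parse_only forwards to BeautifulSoup and limits the parse tree to <a> tags
    soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer("a"))
    print([a.get("href") for a in soup.find_all("a")])  # ['https://example.com/']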
From 205697bacababa81d8412abb7e7c93f50f1d61fb Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Thu, 15 Feb 2024 12:04:48 -0600
Subject: [PATCH 2/7] Replace BeautifulSoup with the new 'helpers.beautifulsoup()'

---
 bbot/modules/dnsdumpster.py | 5 ++---
 bbot/modules/newsletters.py | 3 +--
 bbot/modules/viewdns.py     | 5 ++---
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/bbot/modules/dnsdumpster.py b/bbot/modules/dnsdumpster.py
index 8bb1fa1ed..c119857be 100644
--- a/bbot/modules/dnsdumpster.py
+++ b/bbot/modules/dnsdumpster.py
@@ -1,5 +1,4 @@
 import re
-from bs4 import BeautifulSoup
 
 from bbot.modules.templates.subdomain_enum import subdomain_enum
 
@@ -25,7 +24,7 @@ async def query(self, domain):
             return ret
         else:
             self.debug(f'Valid response code "{status_code}" from DNSDumpster')
-        html = BeautifulSoup(res1.content, "html.parser")
+        html = self.helpers.beautifulsoup(res1.content, "html.parser")
         csrftoken = None
         csrfmiddlewaretoken = None
         try:
@@ -73,7 +72,7 @@ async def query(self, domain):
             self.verbose(f'Bad response code "{status_code}" from DNSDumpster')
             return ret
 
-        html = BeautifulSoup(res2.content, "html.parser")
+        html = self.helpers.beautifulsoup(res2.content, "html.parser")
         escaped_domain = re.escape(domain)
         match_pattern = re.compile(r"^[\w\.-]+\." + escaped_domain + r"$")
         for subdomain in html.findAll(text=match_pattern):
diff --git a/bbot/modules/newsletters.py b/bbot/modules/newsletters.py
index 62ef98463..d6c990650 100644
--- a/bbot/modules/newsletters.py
+++ b/bbot/modules/newsletters.py
@@ -7,7 +7,6 @@
 
 from .base import BaseModule
 import re
-from bs4 import BeautifulSoup
 
 # Known Websites with Newsletters
 # https://futureparty.com/
@@ -37,7 +36,7 @@ def find_type(self, soup):
 
     async def handle_event(self, event):
         if event.data["status_code"] == 200:
-            soup = BeautifulSoup(event.data["body"], "html.parser")
+            soup = self.helpers.beautifulsoup(event.data["body"], "html.parser")
             result = self.find_type(soup)
             if result:
                 description = f"Found a Newsletter Submission Form that could be used for email bombing attacks"
diff --git a/bbot/modules/viewdns.py b/bbot/modules/viewdns.py
index d9a589845..bf55953a3 100644
--- a/bbot/modules/viewdns.py
+++ b/bbot/modules/viewdns.py
@@ -37,9 +37,8 @@ async def query(self, query):
             self.verbose(f"Error retrieving reverse whois results (status code: {status_code})")
 
         content = getattr(r, "content", b"")
-        from bs4 import BeautifulSoup
-
-        html = BeautifulSoup(content, "html.parser")
+
+        html = self.helpers.beautifulsoup(content, "html.parser")
         found = set()
         for table_row in html.findAll("tr"):
             table_cells = table_row.findAll("td")
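The migration above only changes how the soup is constructed; everything downstream keeps using the ordinary bs4 tree API, such as the find() call dnsdumpster uses to pull its CSRF token out of the page. A self-contained illustration of that pattern, with invented HTML:

    from bs4 import BeautifulSoup

    html = '<form><input type="hidden" name="csrfmiddlewaretoken" value="abc123"></form>'
    soup = BeautifulSoup(html, "html.parser")
    # an attribute filter plucks a single hidden form field out of the page
    token = soup.find("input", {"name": "csrfmiddlewaretoken"}).get("value")
    assert token == "abc123"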
From d00a0f8e6c6eb3e6277e1cf92251e04788673b39 Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Thu, 15 Feb 2024 12:06:24 -0600
Subject: [PATCH 3/7] black & flake8

---
 bbot/modules/viewdns.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bbot/modules/viewdns.py b/bbot/modules/viewdns.py
index bf55953a3..9b154ab63 100644
--- a/bbot/modules/viewdns.py
+++ b/bbot/modules/viewdns.py
@@ -37,7 +37,7 @@ async def query(self, query):
             self.verbose(f"Error retrieving reverse whois results (status code: {status_code})")
 
         content = getattr(r, "content", b"")
-        
+
         html = self.helpers.beautifulsoup(content, "html.parser")
         found = set()
         for table_row in html.findAll("tr"):
From 625d2ca17afa412ebb24bb4d1cb9f53992a29f16 Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Thu, 15 Feb 2024 14:24:04 -0600
Subject: [PATCH 4/7] Added BeautifulSoup Web Helper Test

---
 bbot/core/helpers/web.py          | 17 ++++++++++-------
 bbot/test/test_step_1/test_web.py | 24 ++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py
index d834f8955..c17cfb5a4 100644
--- a/bbot/core/helpers/web.py
+++ b/bbot/core/helpers/web.py
@@ -398,7 +398,7 @@ async def api_page_iter(self, url, page_size=100, json=True, next_key=None, **re
                     new_url = next_key(result)
                 except Exception as e:
                     log.debug(f"Failed to extract next page of results from {url}: {e}")
-                    log.debug(traceback.formate_exc())
+                    log.debug(traceback.format_exc())
             else:
                 new_url = url.format(page=page, page_size=page_size, offset=offset)
             result = await self.request(new_url, **requests_kwargs)
@@ -609,17 +609,20 @@ def beautifulsoup(
             - Write tests for this function
 
         Examples:
-            >>> soup = BeautifulSoup(event.data["body"], "html.parser")
+            >>> soup = self.helpers.beautifulsoup(event.data["body"], "html.parser")
             Perform an HTML parse of the 'markup' argument and return a soup instance
 
             >>> email_type = soup.find(type="email")
             Searches the soup instance for all occurrences of the passed-in argument
         """
-
-        soup = BeautifulSoup(
-            markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs
-        )
-        return soup
+        try:
+            soup = BeautifulSoup(
+                markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs
+            )
+            return soup
+        except Exception as e:
+            log.debug(f"Error parsing beautifulsoup: {e}")
+            return False
 
     def ssl_context_noverify(self):
         if self._ssl_context_noverify is None:
diff --git a/bbot/test/test_step_1/test_web.py b/bbot/test/test_step_1/test_web.py
index 13edaf725..a8ea39ae5 100644
--- a/bbot/test/test_step_1/test_web.py
+++ b/bbot/test/test_step_1/test_web.py
@@ -45,6 +45,30 @@ async def test_web_helpers(bbot_scanner, bbot_config, bbot_httpserver):
     assert filename2.is_file()
     with open(filename2) as f:
         assert f.read() == download_content
+
+    # beautifulsoup
+    download_content = """
+    <div>
+        <h1>Example Domain</h1>
+        <p>This domain is for use in illustrative examples in documents. You may use this
+        domain in literature without prior coordination or asking for permission.</p>
+        <p><a href="https://www.iana.org/domains/example">More information...</a></p>
+    </div>
+    """
+
+    path = "/test_http_helpers_beautifulsoup"
+    url = bbot_httpserver.url_for(path)
+    bbot_httpserver.expect_request(uri=path).respond_with_data(download_content, status=200)
+    webpage = await scan1.helpers.request(url)
+    assert webpage, f"Webpage is False"
+    soup = scan1.helpers.beautifulsoup(webpage, "html.parser")
+    assert soup, f"Soup is False"
+    # pretty_print = soup.prettify()
+    # assert pretty_print, f"PrettyPrint is False"
+    # scan1.helpers.log.info(f"{pretty_print}")
+    html_text = soup.find(text="Example Domain")
+    assert html_text, f"Find HTML Text is False"
+
     # 404
     path = "/test_http_helpers_download_404"
     url = bbot_httpserver.url_for(path)
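One behavioral consequence of the try/except added above: parse failures no longer raise, the helper returns False instead, so callers are expected to truth-check the result before using it (the new test's `assert soup` models exactly this). A self-contained sketch of that contract, using plain bs4 since the helper is a thin wrapper:

    from bs4 import BeautifulSoup

    def parse_or_false(markup):
        # mirrors the helper's fail-soft behavior: swallow parser errors, return False
        try:
            return BeautifulSoup(markup, "html.parser")
        except Exception:
            return False

    soup = parse_or_false("<h1>Example Domain</h1>")
    if soup:  # False on a parse error, a soup object otherwise
        print(soup.find("h1").text)  # Example Domain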
+ """ + + path = "/test_http_helpers_beautifulsoup" + url = bbot_httpserver.url_for(path) + bbot_httpserver.expect_request(uri=path).respond_with_data(download_content, status=200) + webpage = await scan1.helpers.request(url) + assert webpage, f"Webpage is False" + soup = scan1.helpers.beautifulsoup(webpage, "html.parser") + assert soup, f"Soup is False" + # pretty_print = soup.prettify() + # assert pretty_print, f"PrettyPrint is False" + # scan1.helpers.log.info(f"{pretty_print}") + html_text = soup.find(text="Example Domain") + assert html_text, f"Find HTML Text is False" + # 404 path = "/test_http_helpers_download_404" url = bbot_httpserver.url_for(path) From c51e059bdab236fee1d687438e873233e57a72a0 Mon Sep 17 00:00:00 2001 From: TheTechromancer Date: Thu, 15 Feb 2024 16:44:20 -0500 Subject: [PATCH 5/7] add socksio --- poetry.lock | 14 ++++++++++++-- pyproject.toml | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index f12b77dc7..ab85c1785 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2038,7 +2038,6 @@ optional = false python-versions = "*" files = [ {file = "requests-file-2.0.0.tar.gz", hash = "sha256:20c5931629c558fda566cacc10cfe2cd502433e628f568c34c80d96a0cc95972"}, - {file = "requests_file-2.0.0-py2.py3-none-any.whl", hash = "sha256:3e493d390adb44aa102ebea827a48717336d5268968c370eaf19abaf5cae13bf"}, ] [package.dependencies] @@ -2099,6 +2098,17 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] +[[package]] +name = "socksio" +version = "1.0.0" +description = "Sans-I/O implementation of SOCKS4, SOCKS4A, and SOCKS5." +optional = false +python-versions = ">=3.6" +files = [ + {file = "socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3"}, + {file = "socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac"}, +] + [[package]] name = "soupsieve" version = "2.5" @@ -2424,4 +2434,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4eb296ea314405bf39920f67d20eebb13cc8974254fd1643538bcb3a338976d2" +content-hash = "e9c476ba44a5968f7bd6c9759ac4c6f8e679384bd6b0dd4f128af873a68a34da" diff --git a/pyproject.toml b/pyproject.toml index 7d1b2fb32..f16540fb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ httpx = "^0.26.0" cloudcheck = "^2.1.0.181" tldextract = "^5.1.1" cachetools = "^5.3.2" +socksio = "^1.0.0" [tool.poetry.group.dev.dependencies] flake8 = "^6.0.0" From 497e5bb3932ab3c8adfc62284dd76b6ca2cf9855 Mon Sep 17 00:00:00 2001 From: Jack Ward Date: Thu, 15 Feb 2024 16:38:57 -0600 Subject: [PATCH 6/7] Merged Dev & removed 'deps_pip' --- bbot/modules/newsletters.py | 5 +---- docs/contribution.md | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/bbot/modules/newsletters.py b/bbot/modules/newsletters.py index d6c990650..a59cc30e3 100644 --- a/bbot/modules/newsletters.py +++ b/bbot/modules/newsletters.py @@ -2,8 +2,7 @@ # thanks to BBOT's sub-domain enumeration) looking for the presence of an 'email type' that also # contains a 'placeholder'. The combination of these two HTML items usually signify the presence # of an "Enter Your Email Here" type Newsletter Subscription service. 
From 497e5bb3932ab3c8adfc62284dd76b6ca2cf9855 Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Thu, 15 Feb 2024 16:38:57 -0600
Subject: [PATCH 6/7] Merged Dev & removed 'deps_pip'

---
 bbot/modules/newsletters.py | 5 +----
 docs/contribution.md        | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/bbot/modules/newsletters.py b/bbot/modules/newsletters.py
index d6c990650..a59cc30e3 100644
--- a/bbot/modules/newsletters.py
+++ b/bbot/modules/newsletters.py
@@ -2,8 +2,7 @@
 # thanks to BBOT's sub-domain enumeration) looking for the presence of an 'email type' that also
 # contains a 'placeholder'. The combination of these two HTML items usually signify the presence
 # of an "Enter Your Email Here" type Newsletter Subscription service. This module could be used
-# to find newsletters for a future email bombing attack and/or find user-input fields that could
-# be be susceptible to overflows or injections.
+# to find newsletters for a future email bombing attack.
 
 from .base import BaseModule
 import re
@@ -15,8 +14,6 @@
 # https://www.milkkarten.net/
 # https://geekout.mattnavarra.com/
 
-deps_pip = ["beautifulsoup4"]
-
 
 class newsletters(BaseModule):
     watched_events = ["HTTP_RESPONSE"]
diff --git a/docs/contribution.md b/docs/contribution.md
index 2d36cfe44..175c3e7af 100644
--- a/docs/contribution.md
+++ b/docs/contribution.md
@@ -134,7 +134,6 @@ BBOT automates module dependencies with **Ansible**. If your module relies on a
 ```python
 class MyModule(BaseModule):
     ...
-    deps_pip = ["beautifulsoup4"]
     deps_apt = ["chromium-browser"]
     deps_ansible = [
         {

From 0bf795718e0633325ebba0ef184b5284705969d6 Mon Sep 17 00:00:00 2001
From: TheTechromancer
Date: Fri, 16 Feb 2024 11:39:31 -0500
Subject: [PATCH 7/7] handle SOCKS error

---
 bbot/core/helpers/web.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py
index 880565132..1a442c7e3 100644
--- a/bbot/core/helpers/web.py
+++ b/bbot/core/helpers/web.py
@@ -11,6 +11,7 @@
 from contextlib import asynccontextmanager
 
 from httpx._models import Cookies
+from socksio.exceptions import SOCKSError
 
 from bbot.core.errors import WordlistError, CurlError
 from bbot.core.helpers.ratelimiter import RateLimiter
@@ -674,6 +675,12 @@ async def _acatch(self, url, raise_error):
             log.trace(traceback.format_exc())
             if raise_error:
                 raise httpx.RequestError(msg)
+        except SOCKSError as e:
+            msg = f"SOCKS error with request to URL: {url}: {e}"
+            log.trace(msg)
+            log.trace(traceback.format_exc())
+            if raise_error:
+                raise httpx.RequestError(msg)
         except BaseException as e:
             # don't log if the error is the result of an intentional cancellation
             if not any(
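The net effect of this last patch is that a failed SOCKS handshake surfaces as the same httpx.RequestError every other transport failure in _acatch produces, so downstream modules keep a single error path. A condensed, standalone sketch of that normalization (not the full context manager; `normalize` is an invented name for illustration):

    import httpx
    from socksio.exceptions import SOCKSError  # the same import the patch adds

    def normalize(exc, url):
        # re-wrap a low-level SOCKS failure as the error type callers already handle
        if isinstance(exc, SOCKSError):
            raise httpx.RequestError(f"SOCKS error with request to URL: {url}: {exc}")
        raise exc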