From 7a6a4d51e27562169c9ecbe35ce554a7dd79b4bc Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Fri, 9 Feb 2024 17:30:11 -0600
Subject: [PATCH 1/7] Added BeautifulSoup Helper - Work in Progress

---
 bbot/core/helpers/web.py | 55 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py
index 983a88c96..d834f8955 100644
--- a/bbot/core/helpers/web.py
+++ b/bbot/core/helpers/web.py
@@ -566,6 +566,61 @@ def is_spider_danger(self, source_event, url):
                 return True
         return False
 
+    def beautifulsoup(
+        self,
+        markup,
+        features="html.parser",
+        builder=None,
+        parse_only=None,
+        from_encoding=None,
+        exclude_encodings=None,
+        element_classes=None,
+        **kwargs,
+    ):
+        """
+        Navigate, Search, Modify, Parse, or PrettyPrint HTML Content.
+        More information at https://beautiful-soup-4.readthedocs.io/en/latest/
+
+        Args:
+            markup: A string or a file-like object representing markup to be parsed.
+            features: Desirable features of the parser to be used.
+                This may be the name of a specific parser ("lxml",
+                "lxml-xml", "html.parser", or "html5lib") or it may be
+                the type of markup to be used ("html", "html5", "xml").
+                Defaults to 'html.parser'.
+            builder: A TreeBuilder subclass to instantiate (or instance to use)
+                instead of looking one up based on `features`.
+            parse_only: A SoupStrainer. Only parts of the document
+                matching the SoupStrainer will be considered.
+            from_encoding: A string indicating the encoding of the
+                document to be parsed.
+            exclude_encodings: A list of strings indicating
+                encodings known to be wrong.
+            element_classes: A dictionary mapping BeautifulSoup
+                classes like Tag and NavigableString, to other classes you'd
+                like to be instantiated instead as the parse tree is
+                built.
+            **kwargs: For backwards compatibility purposes.
+
+        Returns:
+            soup: An instance of the BeautifulSoup class
+
+        Todo:
+            - Write tests for this function
+
+        Examples:
+            >>> soup = BeautifulSoup(event.data["body"], "html.parser")
+            Perform an HTML parse of the 'markup' argument and return a soup instance
+
+            >>> email_type = soup.find(type="email")
+            Searches the soup instance for all occurrences of the passed-in argument
+        """
+
+        soup = BeautifulSoup(
+            markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs
+        )
+        return soup
+
     def ssl_context_noverify(self):
         if self._ssl_context_noverify is None:
             ssl_context = ssl.create_default_context()
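A note on the helper added above: all of its arguments are passed straight through to BeautifulSoup, and `parse_only` is the one most worth knowing about, since it restricts parsing to only the tags a caller cares about. A minimal standalone sketch of that pass-through behavior, using plain bs4 and invented markup (not code from this patch):

    from bs4 import BeautifulSoup, SoupStrainer

    html = '<p>intro</p><a href="https://example.com/">home</a><p>outro</p>'
    # parse_only forwards to BeautifulSoup and limits the parse tree to <a> tags
    soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer("a"))
    print([a.get("href") for a in soup.find_all("a")])  # ['https://example.com/']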
From 205697bacababa81d8412abb7e7c93f50f1d61fb Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Thu, 15 Feb 2024 12:04:48 -0600
Subject: [PATCH 2/7] Replace BeautifulSoup with the new 'helpers.beautifulsoup()'

---
 bbot/modules/dnsdumpster.py | 5 ++---
 bbot/modules/newsletters.py | 3 +--
 bbot/modules/viewdns.py     | 5 ++---
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/bbot/modules/dnsdumpster.py b/bbot/modules/dnsdumpster.py
index 8bb1fa1ed..c119857be 100644
--- a/bbot/modules/dnsdumpster.py
+++ b/bbot/modules/dnsdumpster.py
@@ -1,5 +1,4 @@
 import re
-from bs4 import BeautifulSoup
 
 from bbot.modules.templates.subdomain_enum import subdomain_enum
 
@@ -25,7 +24,7 @@ async def query(self, domain):
             return ret
         else:
             self.debug(f'Valid response code "{status_code}" from DNSDumpster')
-        html = BeautifulSoup(res1.content, "html.parser")
+        html = self.helpers.beautifulsoup(res1.content, "html.parser")
         csrftoken = None
         csrfmiddlewaretoken = None
         try:
@@ -73,7 +72,7 @@ async def query(self, domain):
             self.verbose(f'Bad response code "{status_code}" from DNSDumpster')
             return ret
 
-        html = BeautifulSoup(res2.content, "html.parser")
+        html = self.helpers.beautifulsoup(res2.content, "html.parser")
         escaped_domain = re.escape(domain)
         match_pattern = re.compile(r"^[\w\.-]+\." + escaped_domain + r"$")
         for subdomain in html.findAll(text=match_pattern):
diff --git a/bbot/modules/newsletters.py b/bbot/modules/newsletters.py
index 62ef98463..d6c990650 100644
--- a/bbot/modules/newsletters.py
+++ b/bbot/modules/newsletters.py
@@ -7,7 +7,6 @@
 
 from .base import BaseModule
 import re
-from bs4 import BeautifulSoup
 
 # Known Websites with Newsletters
 # https://futureparty.com/
@@ -37,7 +36,7 @@ def find_type(self, soup):
 
     async def handle_event(self, event):
         if event.data["status_code"] == 200:
-            soup = BeautifulSoup(event.data["body"], "html.parser")
+            soup = self.helpers.beautifulsoup(event.data["body"], "html.parser")
             result = self.find_type(soup)
             if result:
                 description = f"Found a Newsletter Submission Form that could be used for email bombing attacks"
diff --git a/bbot/modules/viewdns.py b/bbot/modules/viewdns.py
index d9a589845..bf55953a3 100644
--- a/bbot/modules/viewdns.py
+++ b/bbot/modules/viewdns.py
@@ -37,9 +37,8 @@ async def query(self, query):
             self.verbose(f"Error retrieving reverse whois results (status code: {status_code})")
 
         content = getattr(r, "content", b"")
-        from bs4 import BeautifulSoup
-
-        html = BeautifulSoup(content, "html.parser")
+
+        html = self.helpers.beautifulsoup(content, "html.parser")
         found = set()
         for table_row in html.findAll("tr"):
             table_cells = table_row.findAll("td")
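The migration above only changes how the soup is constructed; everything downstream keeps using the ordinary bs4 tree API, such as the find() call dnsdumpster uses to pull its CSRF token out of the page. A self-contained illustration of that pattern, with invented HTML:

    from bs4 import BeautifulSoup

    html = '<form><input type="hidden" name="csrfmiddlewaretoken" value="abc123"></form>'
    soup = BeautifulSoup(html, "html.parser")
    # an attribute filter plucks a single hidden form field out of the page
    token = soup.find("input", {"name": "csrfmiddlewaretoken"}).get("value")
    assert token == "abc123"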
From d00a0f8e6c6eb3e6277e1cf92251e04788673b39 Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Thu, 15 Feb 2024 12:06:24 -0600
Subject: [PATCH 3/7] black & flake8

---
 bbot/modules/viewdns.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bbot/modules/viewdns.py b/bbot/modules/viewdns.py
index bf55953a3..9b154ab63 100644
--- a/bbot/modules/viewdns.py
+++ b/bbot/modules/viewdns.py
@@ -37,7 +37,7 @@ async def query(self, query):
             self.verbose(f"Error retrieving reverse whois results (status code: {status_code})")
 
         content = getattr(r, "content", b"")
-        
+
         html = self.helpers.beautifulsoup(content, "html.parser")
         found = set()
         for table_row in html.findAll("tr"):
From 625d2ca17afa412ebb24bb4d1cb9f53992a29f16 Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Thu, 15 Feb 2024 14:24:04 -0600
Subject: [PATCH 4/7] Added BeautifulSoup Web Helper Test

---
 bbot/core/helpers/web.py          | 17 ++++++++++-------
 bbot/test/test_step_1/test_web.py | 24 ++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py
index d834f8955..c17cfb5a4 100644
--- a/bbot/core/helpers/web.py
+++ b/bbot/core/helpers/web.py
@@ -398,7 +398,7 @@ async def api_page_iter(self, url, page_size=100, json=True, next_key=None, **re
                     new_url = next_key(result)
                 except Exception as e:
                     log.debug(f"Failed to extract next page of results from {url}: {e}")
-                    log.debug(traceback.formate_exc())
+                    log.debug(traceback.format_exc())
             else:
                 new_url = url.format(page=page, page_size=page_size, offset=offset)
             result = await self.request(new_url, **requests_kwargs)
@@ -609,17 +609,20 @@ def beautifulsoup(
             - Write tests for this function
 
         Examples:
-            >>> soup = BeautifulSoup(event.data["body"], "html.parser")
+            >>> soup = self.helpers.beautifulsoup(event.data["body"], "html.parser")
             Perform an HTML parse of the 'markup' argument and return a soup instance
 
             >>> email_type = soup.find(type="email")
             Searches the soup instance for all occurrences of the passed-in argument
         """
-
-        soup = BeautifulSoup(
-            markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs
-        )
-        return soup
+        try:
+            soup = BeautifulSoup(
+                markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs
+            )
+            return soup
+        except Exception as e:
+            log.debug(f"Error parsing beautifulsoup: {e}")
+            return False
 
     def ssl_context_noverify(self):
         if self._ssl_context_noverify is None:
diff --git a/bbot/test/test_step_1/test_web.py b/bbot/test/test_step_1/test_web.py
index 13edaf725..a8ea39ae5 100644
--- a/bbot/test/test_step_1/test_web.py
+++ b/bbot/test/test_step_1/test_web.py
@@ -45,6 +45,30 @@ async def test_web_helpers(bbot_scanner, bbot_config, bbot_httpserver):
     assert filename2.is_file()
     with open(filename2) as f:
         assert f.read() == download_content
+
+    # beautifulsoup
+    download_content = """
+    <div>
+        <h1>Example Domain</h1>
+        <p>This domain is for use in illustrative examples in documents. You may use this
+        domain in literature without prior coordination or asking for permission.</p>
+        <p><a href="https://www.iana.org/domains/example">More information...</a></p>
+    </div>
+    """
+
+    path = "/test_http_helpers_beautifulsoup"
+    url = bbot_httpserver.url_for(path)
+    bbot_httpserver.expect_request(uri=path).respond_with_data(download_content, status=200)
+    webpage = await scan1.helpers.request(url)
+    assert webpage, f"Webpage is False"
+    soup = scan1.helpers.beautifulsoup(webpage, "html.parser")
+    assert soup, f"Soup is False"
+    # pretty_print = soup.prettify()
+    # assert pretty_print, f"PrettyPrint is False"
+    # scan1.helpers.log.info(f"{pretty_print}")
+    html_text = soup.find(text="Example Domain")
+    assert html_text, f"Find HTML Text is False"
+
     # 404
     path = "/test_http_helpers_download_404"
     url = bbot_httpserver.url_for(path)
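One behavioral consequence of the try/except added above: parse failures no longer raise, the helper returns False instead, so callers are expected to truth-check the result before using it (the new test's `assert soup` models exactly this). A self-contained sketch of that contract, using plain bs4 since the helper is a thin wrapper:

    from bs4 import BeautifulSoup

    def parse_or_false(markup):
        # mirrors the helper's fail-soft behavior: swallow parser errors, return False
        try:
            return BeautifulSoup(markup, "html.parser")
        except Exception:
            return False

    soup = parse_or_false("<h1>Example Domain</h1>")
    if soup:  # False on a parse error, a soup object otherwise
        print(soup.find("h1").text)  # Example Domain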
+ """ + + path = "/test_http_helpers_beautifulsoup" + url = bbot_httpserver.url_for(path) + bbot_httpserver.expect_request(uri=path).respond_with_data(download_content, status=200) + webpage = await scan1.helpers.request(url) + assert webpage, f"Webpage is False" + soup = scan1.helpers.beautifulsoup(webpage, "html.parser") + assert soup, f"Soup is False" + # pretty_print = soup.prettify() + # assert pretty_print, f"PrettyPrint is False" + # scan1.helpers.log.info(f"{pretty_print}") + html_text = soup.find(text="Example Domain") + assert html_text, f"Find HTML Text is False" + # 404 path = "/test_http_helpers_download_404" url = bbot_httpserver.url_for(path) From c51e059bdab236fee1d687438e873233e57a72a0 Mon Sep 17 00:00:00 2001 From: TheTechromancer Date: Thu, 15 Feb 2024 16:44:20 -0500 Subject: [PATCH 5/7] add socksio --- poetry.lock | 14 ++++++++++++-- pyproject.toml | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index f12b77dc7..ab85c1785 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2038,7 +2038,6 @@ optional = false python-versions = "*" files = [ {file = "requests-file-2.0.0.tar.gz", hash = "sha256:20c5931629c558fda566cacc10cfe2cd502433e628f568c34c80d96a0cc95972"}, - {file = "requests_file-2.0.0-py2.py3-none-any.whl", hash = "sha256:3e493d390adb44aa102ebea827a48717336d5268968c370eaf19abaf5cae13bf"}, ] [package.dependencies] @@ -2099,6 +2098,17 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] +[[package]] +name = "socksio" +version = "1.0.0" +description = "Sans-I/O implementation of SOCKS4, SOCKS4A, and SOCKS5." +optional = false +python-versions = ">=3.6" +files = [ + {file = "socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3"}, + {file = "socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac"}, +] + [[package]] name = "soupsieve" version = "2.5" @@ -2424,4 +2434,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4eb296ea314405bf39920f67d20eebb13cc8974254fd1643538bcb3a338976d2" +content-hash = "e9c476ba44a5968f7bd6c9759ac4c6f8e679384bd6b0dd4f128af873a68a34da" diff --git a/pyproject.toml b/pyproject.toml index 7d1b2fb32..f16540fb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ httpx = "^0.26.0" cloudcheck = "^2.1.0.181" tldextract = "^5.1.1" cachetools = "^5.3.2" +socksio = "^1.0.0" [tool.poetry.group.dev.dependencies] flake8 = "^6.0.0" From 497e5bb3932ab3c8adfc62284dd76b6ca2cf9855 Mon Sep 17 00:00:00 2001 From: Jack Ward Date: Thu, 15 Feb 2024 16:38:57 -0600 Subject: [PATCH 6/7] Merged Dev & removed 'deps_pip' --- bbot/modules/newsletters.py | 5 +---- docs/contribution.md | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/bbot/modules/newsletters.py b/bbot/modules/newsletters.py index d6c990650..a59cc30e3 100644 --- a/bbot/modules/newsletters.py +++ b/bbot/modules/newsletters.py @@ -2,8 +2,7 @@ # thanks to BBOT's sub-domain enumeration) looking for the presence of an 'email type' that also # contains a 'placeholder'. The combination of these two HTML items usually signify the presence # of an "Enter Your Email Here" type Newsletter Subscription service. 
From 497e5bb3932ab3c8adfc62284dd76b6ca2cf9855 Mon Sep 17 00:00:00 2001
From: Jack Ward
Date: Thu, 15 Feb 2024 16:38:57 -0600
Subject: [PATCH 6/7] Merged Dev & removed 'deps_pip'

---
 bbot/modules/newsletters.py | 5 +----
 docs/contribution.md        | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/bbot/modules/newsletters.py b/bbot/modules/newsletters.py
index d6c990650..a59cc30e3 100644
--- a/bbot/modules/newsletters.py
+++ b/bbot/modules/newsletters.py
@@ -2,8 +2,7 @@
 # thanks to BBOT's sub-domain enumeration) looking for the presence of an 'email type' that also
 # contains a 'placeholder'. The combination of these two HTML items usually signify the presence
 # of an "Enter Your Email Here" type Newsletter Subscription service. This module could be used
-# to find newsletters for a future email bombing attack and/or find user-input fields that could
-# be be susceptible to overflows or injections.
+# to find newsletters for a future email bombing attack.
 
 from .base import BaseModule
 import re
@@ -15,8 +14,6 @@
 # https://www.milkkarten.net/
 # https://geekout.mattnavarra.com/
 
-deps_pip = ["beautifulsoup4"]
-
 
 class newsletters(BaseModule):
     watched_events = ["HTTP_RESPONSE"]
diff --git a/docs/contribution.md b/docs/contribution.md
index 2d36cfe44..175c3e7af 100644
--- a/docs/contribution.md
+++ b/docs/contribution.md
@@ -134,7 +134,6 @@ BBOT automates module dependencies with **Ansible**. If your module relies on a
 ```python
 class MyModule(BaseModule):
     ...
-    deps_pip = ["beautifulsoup4"]
     deps_apt = ["chromium-browser"]
     deps_ansible = [
         {

From 0bf795718e0633325ebba0ef184b5284705969d6 Mon Sep 17 00:00:00 2001
From: TheTechromancer
Date: Fri, 16 Feb 2024 11:39:31 -0500
Subject: [PATCH 7/7] handle SOCKS error

---
 bbot/core/helpers/web.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py
index 880565132..1a442c7e3 100644
--- a/bbot/core/helpers/web.py
+++ b/bbot/core/helpers/web.py
@@ -11,6 +11,7 @@
 from contextlib import asynccontextmanager
 
 from httpx._models import Cookies
+from socksio.exceptions import SOCKSError
 
 from bbot.core.errors import WordlistError, CurlError
 from bbot.core.helpers.ratelimiter import RateLimiter
@@ -674,6 +675,12 @@ async def _acatch(self, url, raise_error):
             log.trace(traceback.format_exc())
             if raise_error:
                 raise httpx.RequestError(msg)
+        except SOCKSError as e:
+            msg = f"SOCKS error with request to URL: {url}: {e}"
+            log.trace(msg)
+            log.trace(traceback.format_exc())
+            if raise_error:
+                raise httpx.RequestError(msg)
         except BaseException as e:
             # don't log if the error is the result of an intentional cancellation
             if not any(
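The net effect of this last patch is that a failed SOCKS handshake surfaces as the same httpx.RequestError every other transport failure in _acatch produces, so downstream modules keep a single error path. A condensed, standalone sketch of that normalization (not the full context manager; `normalize` is an invented name for illustration):

    import httpx
    from socksio.exceptions import SOCKSError  # the same import the patch adds

    def normalize(exc, url):
        # re-wrap a low-level SOCKS failure as the error type callers already handle
        if isinstance(exc, SOCKSError):
            raise httpx.RequestError(f"SOCKS error with request to URL: {url}: {exc}")
        raise exc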