Commit ac5ac64
merge dev
TheTechromancer committed Feb 16, 2024
2 parents 30eb11d + 25e5390 commit ac5ac64
Showing 8 changed files with 107 additions and 14 deletions.
65 changes: 65 additions & 0 deletions bbot/core/helpers/web.py
@@ -11,6 +11,7 @@
from contextlib import asynccontextmanager

from httpx._models import Cookies
+from socksio.exceptions import SOCKSError

from bbot.core.errors import WordlistError, CurlError
from bbot.core.helpers.ratelimiter import RateLimiter
@@ -566,6 +567,64 @@ def is_spider_danger(self, source_event, url):
return True
return False

+def beautifulsoup(
+    self,
+    markup,
+    features="html.parser",
+    builder=None,
+    parse_only=None,
+    from_encoding=None,
+    exclude_encodings=None,
+    element_classes=None,
+    **kwargs,
+):
"""
Naviate, Search, Modify, Parse, or PrettyPrint HTML Content.
More information at https://beautiful-soup-4.readthedocs.io/en/latest/
Args:
markup: A string or a file-like object representing markup to be parsed.
features: Desirable features of the parser to be used.
This may be the name of a specific parser ("lxml",
"lxml-xml", "html.parser", or "html5lib") or it may be
the type of markup to be used ("html", "html5", "xml").
Defaults to 'html.parser'.
builder: A TreeBuilder subclass to instantiate (or instance to use)
instead of looking one up based on `features`.
parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered.
from_encoding: A string indicating the encoding of the
document to be parsed.
exclude_encodings = A list of strings indicating
encodings known to be wrong.
element_classes = A dictionary mapping BeautifulSoup
classes like Tag and NavigableString, to other classes you'd
like to be instantiated instead as the parse tree is
built.
**kwargs = For backwards compatibility purposes.
Returns:
soup: An instance of the BeautifulSoup class
Todo:
- Write tests for this function
Examples:
>>> soup = self.helpers.beautifulsoup(event.data["body"], "html.parser")
Perform an html parse of the 'markup' argument and return a soup instance
>>> email_type = soup.find(type="email")
Searches the soup instance for all occurances of the passed in argument
"""
+    try:
+        soup = BeautifulSoup(
+            markup, features, builder, parse_only, from_encoding, exclude_encodings, element_classes, **kwargs
+        )
+        return soup
+    except Exception as e:
+        log.debug(f"Error parsing beautifulsoup: {e}")
+        return False

def ssl_context_noverify(self):
if self._ssl_context_noverify is None:
ssl_context = ssl.create_default_context()
@@ -616,6 +675,12 @@ async def _acatch(self, url, raise_error):
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
+except SOCKSError as e:
+    msg = f"SOCKS error with request to URL: {url}: {e}"
+    log.trace(msg)
+    log.trace(traceback.format_exc())
+    if raise_error:
+        raise httpx.RequestError(msg)
except BaseException as e:
# don't log if the error is the result of an intentional cancellation
if not any(
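The new helper centralizes HTML parsing behind `self.helpers` and returns `False` on a parse failure instead of raising, so call sites can guard with a simple truthiness check. A minimal sketch of that pattern from inside a module's `handle_event`, mirroring the call sites updated below:

```python
# Sketch of calling the new helper from a BBOT module (mirrors newsletters.py below).
# self.helpers.beautifulsoup returns False on a parse error rather than raising.
async def handle_event(self, event):
    soup = self.helpers.beautifulsoup(event.data["body"], "html.parser")
    if not soup:
        return  # markup could not be parsed; nothing to search
    email_field = soup.find(type="email")  # first matching element, or None
```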
5 changes: 2 additions & 3 deletions bbot/modules/dnsdumpster.py
@@ -1,5 +1,4 @@
import re
-from bs4 import BeautifulSoup

from bbot.modules.templates.subdomain_enum import subdomain_enum

@@ -25,7 +24,7 @@ async def query(self, domain):
return ret
else:
self.debug(f'Valid response code "{status_code}" from DNSDumpster')
-html = BeautifulSoup(res1.content, "html.parser")
+html = self.helpers.beautifulsoup(res1.content, "html.parser")
csrftoken = None
csrfmiddlewaretoken = None
try:
@@ -73,7 +72,7 @@ async def query(self, domain):
self.verbose(f'Bad response code "{status_code}" from DNSDumpster')
return ret

-html = BeautifulSoup(res2.content, "html.parser")
+html = self.helpers.beautifulsoup(res2.content, "html.parser")
escaped_domain = re.escape(domain)
match_pattern = re.compile(r"^[\w\.-]+\." + escaped_domain + r"$")
for subdomain in html.findAll(text=match_pattern):
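For reference, `re.escape` keeps the dots in the target domain literal, and the anchored pattern only admits strings ending in `.` plus the domain. A runnable sketch of the extraction above, with illustrative markup in place of the DNSDumpster response:

```python
# Standalone sketch of the subdomain extraction used in dnsdumpster.py.
import re
from bs4 import BeautifulSoup

domain = "example.com"
html = BeautifulSoup("<td>mail.example.com</td><td>other.org</td>", "html.parser")
escaped_domain = re.escape(domain)
match_pattern = re.compile(r"^[\w\.-]+\." + escaped_domain + r"$")
for subdomain in html.findAll(text=match_pattern):
    print(subdomain)  # prints "mail.example.com"; "other.org" does not match
```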
8 changes: 2 additions & 6 deletions bbot/modules/newsletters.py
@@ -2,12 +2,10 @@
# thanks to BBOT's sub-domain enumeration) looking for the presence of an 'email type' that also
# contains a 'placeholder'. The combination of these two HTML items usually signify the presence
# of an "Enter Your Email Here" type Newsletter Subscription service. This module could be used
-# to find newsletters for a future email bombing attack and/or find user-input fields that could
-# be be susceptible to overflows or injections.
+# to find newsletters for a future email bombing attack.

from .base import BaseModule
import re
-from bs4 import BeautifulSoup

# Known Websites with Newsletters
# https://futureparty.com/
@@ -16,8 +14,6 @@
# https://www.milkkarten.net/
# https://geekout.mattnavarra.com/

-deps_pip = ["beautifulsoup4"]


class newsletters(BaseModule):
watched_events = ["HTTP_RESPONSE"]
@@ -37,7 +33,7 @@ def find_type(self, soup):

async def handle_event(self, event):
if event.data["status_code"] == 200:
-soup = BeautifulSoup(event.data["body"], "html.parser")
+soup = self.helpers.beautifulsoup(event.data["body"], "html.parser")
result = self.find_type(soup)
if result:
description = f"Found a Newsletter Submission Form that could be used for email bombing attacks"
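The diff shows `find_type`'s signature but not its body. Based on the module comment (an email-type input that also carries a placeholder), a hypothetical reconstruction of that check:

```python
# Hypothetical sketch of find_type; the actual implementation is not shown
# in this diff. Flags an <input type="email"> that also has a placeholder.
def find_type(self, soup):
    email_input = soup.find(type="email")  # first email-type element, or None
    if email_input and email_input.get("placeholder"):
        return True
    return False
```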
3 changes: 1 addition & 2 deletions bbot/modules/viewdns.py
@@ -37,9 +37,8 @@ async def query(self, query):
self.verbose(f"Error retrieving reverse whois results (status code: {status_code})")

content = getattr(r, "content", b"")
-from bs4 import BeautifulSoup

-html = BeautifulSoup(content, "html.parser")
+html = self.helpers.beautifulsoup(content, "html.parser")
found = set()
for table_row in html.findAll("tr"):
table_cells = table_row.findAll("td")
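The surrounding loop (unchanged here) walks the reverse-whois results table row by row; the cell handling after `findAll("td")` is truncated in this diff, so collecting the first cell below is illustrative:

```python
# Self-contained sketch of the table walk in viewdns.py, with illustrative
# markup in place of the viewdns.info response.
from bs4 import BeautifulSoup

content = b"<table><tr><td>example.com</td><td>2024-01-01</td></tr></table>"
html = BeautifulSoup(content, "html.parser")
found = set()
for table_row in html.findAll("tr"):
    table_cells = table_row.findAll("td")
    if table_cells:
        found.add(table_cells[0].text.strip())
print(found)  # {'example.com'}
```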
24 changes: 24 additions & 0 deletions bbot/test/test_step_1/test_web.py
@@ -45,6 +45,30 @@ async def test_web_helpers(bbot_scanner, bbot_config, bbot_httpserver):
assert filename2.is_file()
with open(filename2) as f:
assert f.read() == download_content

+# beautifulsoup
+download_content = """
+<div>
+    <h1>Example Domain</h1>
+    <p>This domain is for use in illustrative examples in documents. You may use this
+    domain in literature without prior coordination or asking for permission.</p>
+    <p><a href="https://www.iana.org/domains/example">More information...</a></p>
+</div>
+"""

+path = "/test_http_helpers_beautifulsoup"
+url = bbot_httpserver.url_for(path)
+bbot_httpserver.expect_request(uri=path).respond_with_data(download_content, status=200)
+webpage = await scan1.helpers.request(url)
+assert webpage, f"Webpage is False"
+soup = scan1.helpers.beautifulsoup(webpage, "html.parser")
+assert soup, f"Soup is False"
+# pretty_print = soup.prettify()
+# assert pretty_print, f"PrettyPrint is False"
+# scan1.helpers.log.info(f"{pretty_print}")
+html_text = soup.find(text="Example Domain")
+assert html_text, f"Find HTML Text is False"

# 404
path = "/test_http_helpers_download_404"
url = bbot_httpserver.url_for(path)
1 change: 0 additions & 1 deletion docs/contribution.md
@@ -134,7 +134,6 @@ BBOT automates module dependencies with **Ansible**. If your module relies on a
```python
class MyModule(BaseModule):
...
-    deps_pip = ["beautifulsoup4"]
deps_apt = ["chromium-browser"]
deps_ansible = [
{
Expand Down
14 changes: 12 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -47,6 +47,7 @@ httpx = "^0.26.0"
cloudcheck = "^2.1.0.181"
tldextract = "^5.1.1"
cachetools = "^5.3.2"
+socksio = "^1.0.0"

[tool.poetry.group.dev.dependencies]
flake8 = "^6.0.0"
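`socksio` is the backend httpx uses for SOCKS proxy support, and it is the source of the `SOCKSError` now caught in `_acatch`. A minimal sketch, assuming a SOCKS5 proxy listening at 127.0.0.1:1080:

```python
# Minimal sketch of httpx with a SOCKS proxy; assumes a SOCKS5 proxy at
# 127.0.0.1:1080. socksio must be installed for httpx's SOCKS transport.
import asyncio
import httpx

async def main():
    async with httpx.AsyncClient(proxy="socks5://127.0.0.1:1080") as client:
        response = await client.get("https://example.com")
        print(response.status_code)

asyncio.run(main())
```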
