Skip to content

Commit

Permalink
Merge pull request #2104 from blacklanternsecurity/tag-attribute-regex-fix
Browse files Browse the repository at this point in the history

Fix issues with mailto links and parameter extraction
  • Loading branch information
TheTechromancer authored Dec 20, 2024
2 parents d5ce97c + 22773f7 commit 798670d
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 27 deletions.
8 changes: 1 addition & 7 deletions bbot/core/helpers/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,13 +559,12 @@ def is_port(p):
return p and p.isdigit() and 0 <= int(p) <= 65535


def is_dns_name(d, include_local=True):
def is_dns_name(d):
"""
Determines if the given string is a valid DNS name.
Args:
d (str): The string to be checked.
include_local (bool): Consider local hostnames to be valid (hostnames without periods)
Returns:
bool: True if the string is a valid DNS name, False otherwise.
Expand All @@ -575,17 +574,12 @@ def is_dns_name(d, include_local=True):
True
>>> is_dns_name('localhost')
True
>>> is_dns_name('localhost', include_local=False)
False
>>> is_dns_name('192.168.1.1')
False
"""
if is_ip(d):
return False
d = smart_decode(d)
if include_local:
if bbot_regexes.hostname_regex.match(d):
return True
if bbot_regexes.dns_name_validation_regex.match(d):
return True
return False
Expand Down
15 changes: 3 additions & 12 deletions bbot/core/helpers/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,10 @@
ip_range_regexes = [re.compile(r, re.I) for r in _ip_range_regexes]

# dns names with periods
_dns_name_regex = r"(?:\w(?:[\w-]{0,100}\w)?\.)+(?:[xX][nN]--)?[^\W_]{1,63}\.?"
_dns_name_regex = r"(?:\w(?:[\w-]{0,100}\w)?\.?)+(?:[xX][nN]--)?[^\W_]{1,63}\.?"
dns_name_extraction_regex = re.compile(_dns_name_regex, re.I)
dns_name_validation_regex = re.compile(r"^" + _dns_name_regex + r"$", re.I)

# dns names without periods
_hostname_regex = r"(?!\w*\.\w+)\w(?:[\w-]{0,100}\w)?"
hostname_regex = re.compile(r"^" + _hostname_regex + r"$", re.I)

_email_regex = r"(?:[^\W_][\w\-\.\+']{,100})@" + _dns_name_regex
email_regex = re.compile(_email_regex, re.I)

Expand All @@ -61,14 +57,12 @@

_open_port_regexes = (
_dns_name_regex + r":[0-9]{1,5}",
_hostname_regex + r":[0-9]{1,5}",
r"\[" + _ipv6_regex + r"\]:[0-9]{1,5}",
)
open_port_regexes = [re.compile(r, re.I) for r in _open_port_regexes]

_url_regexes = (
r"https?://" + _dns_name_regex + r"(?::[0-9]{1,5})?(?:(?:/|\?).*)?",
r"https?://" + _hostname_regex + r"(?::[0-9]{1,5})?(?:(?:/|\?).*)?",
r"https?://\[" + _ipv6_regex + r"\](?::[0-9]{1,5})?(?:(?:/|\?).*)?",
)
url_regexes = [re.compile(r, re.I) for r in _url_regexes]
Expand All @@ -83,10 +77,7 @@
for k, regexes in (
(
"DNS_NAME",
(
r"^" + _dns_name_regex + r"$",
r"^" + _hostname_regex + r"$",
),
(r"^" + _dns_name_regex + r"$",),
),
(
"EMAIL_ADDRESS",
Expand Down Expand Up @@ -140,7 +131,7 @@
textarea_tag_regex = re.compile(
r'<textarea[^>]*\bname=["\']?(\w+)["\']?[^>]*>(.*?)</textarea>', re.IGNORECASE | re.DOTALL
)
tag_attribute_regex = re.compile(r"<[^>]*(?:href|src)\s*=\s*[\"\']([^\"\']+)[\"\'][^>]*>")
tag_attribute_regex = re.compile(r"<[^>]*(?:href|action|src)\s*=\s*[\"\']?(?!mailto:)([^\s\'\"\>]+)[\"\']?[^>]*>")

valid_netloc = r"[^\s!@#$%^&()=/?\\'\";~`<>]+"

Expand Down
6 changes: 1 addition & 5 deletions bbot/modules/github_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,7 @@ async def validate_org(self, org):
for k, v in json.items():
if (
isinstance(v, str)
and (
self.helpers.is_dns_name(v, include_local=False)
or self.helpers.is_url(v)
or self.helpers.is_email(v)
)
and (self.helpers.is_dns_name(v) and "." in v or self.helpers.is_url(v) or self.helpers.is_email(v))
and self.scan.in_scope(v)
):
self.verbose(f'Found in-scope key "{k}": "{v}" for {org}, it appears to be in-scope')
Expand Down
7 changes: 6 additions & 1 deletion bbot/modules/internal/excavate.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,11 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
if self.excavate.helpers.validate_parameter(parameter_name, parameter_type):
if self.excavate.in_bl(parameter_name) is False:
parsed_url = urlparse(url)
if not parsed_url.hostname:
self.excavate.warning(
f"Error Parsing reconstructed URL [{url}] during parameter extraction, missing hostname"
)
continue
description = f"HTTP Extracted Parameter [{parameter_name}] ({parameterExtractorSubModule.name} Submodule)"
data = {
"host": parsed_url.hostname,
Expand Down Expand Up @@ -703,7 +708,7 @@ class URLExtractor(ExcavateRule):
"""
),
}
full_url_regex = re.compile(r"(https?)://((?:\w|\d)(?:[\d\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)")
full_url_regex = re.compile(r"(https?)://(\w(?:[\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)")
full_url_regex_strict = re.compile(r"^(https?):\/\/([\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$")
tag_attribute_regex = bbot_regexes.tag_attribute_regex

Expand Down
2 changes: 1 addition & 1 deletion bbot/test/test_step_1/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ async def test_helpers_misc(helpers, scan, bbot_scanner, bbot_httpserver):
assert not helpers.is_dns_name("evilcorp.com:80")
assert not helpers.is_dns_name("http://evilcorp.com:80")
assert helpers.is_dns_name("evilcorp")
assert not helpers.is_dns_name("evilcorp", include_local=False)
assert helpers.is_dns_name("evilcorp.")
assert helpers.is_dns_name("ドメイン.テスト")
assert not helpers.is_dns_name("127.0.0.1")
assert not helpers.is_dns_name("dead::beef")
Expand Down
2 changes: 1 addition & 1 deletion bbot/test/test_step_1/test_regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,6 @@ def test_url_regexes():
"http:///evilcorp.com",
"http:// evilcorp.com",
"http://evilcorp com",
"http://evilcorp.",
"http://.com",
"evilcorp.com",
"http://ex..ample.com",
Expand All @@ -288,6 +287,7 @@ def test_url_regexes():

good_urls = [
"https://evilcorp.com",
"http://evilcorp.",
"https://asdf.www.evilcorp.com",
"https://asdf.www-test.evilcorp.com",
"https://a.www-test.evilcorp.c",
Expand Down
27 changes: 27 additions & 0 deletions bbot/test/test_step_2/module_tests/test_module_excavate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ async def setup_before_prep(self, module_test):
# these ones should
<a href="/a_relative.txt">
<link href="/link_relative.txt">
<a href="mailto:[email protected]?subject=help">Help</a>
"""
expect_args = {"method": "GET", "uri": "/"}
respond_args = {"response_data": response_data}
Expand Down Expand Up @@ -1010,3 +1011,29 @@ def check(self, module_test, events):
assert (
"/donot_detect.js" not in url_events
), f"URL extracted from extractous text is incorrect, got {url_events}"


class TestExcavateBadURLs(ModuleTestBase):
targets = ["http://127.0.0.1:8888/"]
modules_overrides = ["excavate", "httpx", "hunt"]
config_overrides = {"interactsh_disable": True, "scope": {"report_distance": 10}}

bad_url_data = """
<a href='mailto:[email protected]?subject=help'>Help</a>
<a href='https://ssl.'>Help</a>
"""

async def setup_after_prep(self, module_test):
module_test.set_expect_requests({"uri": "/"}, {"response_data": self.bad_url_data})

def check(self, module_test, events):
log_file = module_test.scan.home / "debug.log"
log_text = log_file.read_text()
# make sure our logging is working
assert "Setting scan status to STARTING" in log_text
# make sure we don't have any URL validation errors
assert "Error Parsing reconstructed URL" not in log_text
assert "Error sanitizing event data" not in log_text

url_events = [e for e in events if e.type == "URL_UNVERIFIED"]
assert sorted([e.data for e in url_events]) == sorted(["https://ssl/", "http://127.0.0.1:8888/"])

0 comments on commit 798670d

Please sign in to comment.