Skip to content

Commit

Permalink
Merge pull request #2104 from blacklanternsecurity/tag-attribute-regex-fix
Browse files Browse the repository at this point in the history

Fix issues with mailto links and parameter extraction
  • Loading branch information
TheTechromancer authored Dec 20, 2024
2 parents d5ce97c + 22773f7 commit 798670d
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 27 deletions.
8 changes: 1 addition & 7 deletions bbot/core/helpers/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,13 +559,12 @@ def is_port(p):
return p and p.isdigit() and 0 <= int(p) <= 65535


def is_dns_name(d, include_local=True):
def is_dns_name(d):
"""
Determines if the given string is a valid DNS name.
Args:
d (str): The string to be checked.
include_local (bool): Consider local hostnames to be valid (hostnames without periods)
Returns:
bool: True if the string is a valid DNS name, False otherwise.
Expand All @@ -575,17 +574,12 @@ def is_dns_name(d, include_local=True):
True
>>> is_dns_name('localhost')
True
>>> is_dns_name('localhost', include_local=False)
False
>>> is_dns_name('192.168.1.1')
False
"""
if is_ip(d):
return False
d = smart_decode(d)
if include_local:
if bbot_regexes.hostname_regex.match(d):
return True
if bbot_regexes.dns_name_validation_regex.match(d):
return True
return False
Expand Down
15 changes: 3 additions & 12 deletions bbot/core/helpers/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,10 @@
ip_range_regexes = [re.compile(r, re.I) for r in _ip_range_regexes]

# dns names with periods
_dns_name_regex = r"(?:\w(?:[\w-]{0,100}\w)?\.)+(?:[xX][nN]--)?[^\W_]{1,63}\.?"
_dns_name_regex = r"(?:\w(?:[\w-]{0,100}\w)?\.?)+(?:[xX][nN]--)?[^\W_]{1,63}\.?"
dns_name_extraction_regex = re.compile(_dns_name_regex, re.I)
dns_name_validation_regex = re.compile(r"^" + _dns_name_regex + r"$", re.I)

# dns names without periods
_hostname_regex = r"(?!\w*\.\w+)\w(?:[\w-]{0,100}\w)?"
hostname_regex = re.compile(r"^" + _hostname_regex + r"$", re.I)

_email_regex = r"(?:[^\W_][\w\-\.\+']{,100})@" + _dns_name_regex
email_regex = re.compile(_email_regex, re.I)

Expand All @@ -61,14 +57,12 @@

_open_port_regexes = (
_dns_name_regex + r":[0-9]{1,5}",
_hostname_regex + r":[0-9]{1,5}",
r"\[" + _ipv6_regex + r"\]:[0-9]{1,5}",
)
open_port_regexes = [re.compile(r, re.I) for r in _open_port_regexes]

_url_regexes = (
r"https?://" + _dns_name_regex + r"(?::[0-9]{1,5})?(?:(?:/|\?).*)?",
r"https?://" + _hostname_regex + r"(?::[0-9]{1,5})?(?:(?:/|\?).*)?",
r"https?://\[" + _ipv6_regex + r"\](?::[0-9]{1,5})?(?:(?:/|\?).*)?",
)
url_regexes = [re.compile(r, re.I) for r in _url_regexes]
Expand All @@ -83,10 +77,7 @@
for k, regexes in (
(
"DNS_NAME",
(
r"^" + _dns_name_regex + r"$",
r"^" + _hostname_regex + r"$",
),
(r"^" + _dns_name_regex + r"$",),
),
(
"EMAIL_ADDRESS",
Expand Down Expand Up @@ -140,7 +131,7 @@
textarea_tag_regex = re.compile(
r'<textarea[^>]*\bname=["\']?(\w+)["\']?[^>]*>(.*?)</textarea>', re.IGNORECASE | re.DOTALL
)
tag_attribute_regex = re.compile(r"<[^>]*(?:href|src)\s*=\s*[\"\']([^\"\']+)[\"\'][^>]*>")
tag_attribute_regex = re.compile(r"<[^>]*(?:href|action|src)\s*=\s*[\"\']?(?!mailto:)([^\s\'\"\>]+)[\"\']?[^>]*>")

valid_netloc = r"[^\s!@#$%^&()=/?\\'\";~`<>]+"

Expand Down
6 changes: 1 addition & 5 deletions bbot/modules/github_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,7 @@ async def validate_org(self, org):
for k, v in json.items():
if (
isinstance(v, str)
and (
self.helpers.is_dns_name(v, include_local=False)
or self.helpers.is_url(v)
or self.helpers.is_email(v)
)
and (self.helpers.is_dns_name(v) and "." in v or self.helpers.is_url(v) or self.helpers.is_email(v))
and self.scan.in_scope(v)
):
self.verbose(f'Found in-scope key "{k}": "{v}" for {org}, it appears to be in-scope')
Expand Down
7 changes: 6 additions & 1 deletion bbot/modules/internal/excavate.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,11 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
if self.excavate.helpers.validate_parameter(parameter_name, parameter_type):
if self.excavate.in_bl(parameter_name) is False:
parsed_url = urlparse(url)
if not parsed_url.hostname:
self.excavate.warning(
f"Error Parsing reconstructed URL [{url}] during parameter extraction, missing hostname"
)
continue
description = f"HTTP Extracted Parameter [{parameter_name}] ({parameterExtractorSubModule.name} Submodule)"
data = {
"host": parsed_url.hostname,
Expand Down Expand Up @@ -703,7 +708,7 @@ class URLExtractor(ExcavateRule):
"""
),
}
full_url_regex = re.compile(r"(https?)://((?:\w|\d)(?:[\d\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)")
full_url_regex = re.compile(r"(https?)://(\w(?:[\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)")
full_url_regex_strict = re.compile(r"^(https?):\/\/([\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$")
tag_attribute_regex = bbot_regexes.tag_attribute_regex

Expand Down
2 changes: 1 addition & 1 deletion bbot/test/test_step_1/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ async def test_helpers_misc(helpers, scan, bbot_scanner, bbot_httpserver):
assert not helpers.is_dns_name("evilcorp.com:80")
assert not helpers.is_dns_name("http://evilcorp.com:80")
assert helpers.is_dns_name("evilcorp")
assert not helpers.is_dns_name("evilcorp", include_local=False)
assert helpers.is_dns_name("evilcorp.")
assert helpers.is_dns_name("ドメイン.テスト")
assert not helpers.is_dns_name("127.0.0.1")
assert not helpers.is_dns_name("dead::beef")
Expand Down
2 changes: 1 addition & 1 deletion bbot/test/test_step_1/test_regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,6 @@ def test_url_regexes():
"http:///evilcorp.com",
"http:// evilcorp.com",
"http://evilcorp com",
"http://evilcorp.",
"http://.com",
"evilcorp.com",
"http://ex..ample.com",
Expand All @@ -288,6 +287,7 @@ def test_url_regexes():

good_urls = [
"https://evilcorp.com",
"http://evilcorp.",
"https://asdf.www.evilcorp.com",
"https://asdf.www-test.evilcorp.com",
"https://a.www-test.evilcorp.c",
Expand Down
27 changes: 27 additions & 0 deletions bbot/test/test_step_2/module_tests/test_module_excavate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ async def setup_before_prep(self, module_test):
# these ones should
<a href="/a_relative.txt">
<link href="/link_relative.txt">
<a href="mailto:[email protected]?subject=help">Help</a>
"""
expect_args = {"method": "GET", "uri": "/"}
respond_args = {"response_data": response_data}
Expand Down Expand Up @@ -1010,3 +1011,29 @@ def check(self, module_test, events):
assert (
"/donot_detect.js" not in url_events
), f"URL extracted from extractous text is incorrect, got {url_events}"


class TestExcavateBadURLs(ModuleTestBase):
targets = ["http://127.0.0.1:8888/"]
modules_overrides = ["excavate", "httpx", "hunt"]
config_overrides = {"interactsh_disable": True, "scope": {"report_distance": 10}}

bad_url_data = """
<a href='mailto:[email protected]?subject=help'>Help</a>
<a href='https://ssl.'>Help</a>
"""

async def setup_after_prep(self, module_test):
module_test.set_expect_requests({"uri": "/"}, {"response_data": self.bad_url_data})

def check(self, module_test, events):
log_file = module_test.scan.home / "debug.log"
log_text = log_file.read_text()
# make sure our logging is working
assert "Setting scan status to STARTING" in log_text
# make sure we don't have any URL validation errors
assert "Error Parsing reconstructed URL" not in log_text
assert "Error sanitizing event data" not in log_text

url_events = [e for e in events if e.type == "URL_UNVERIFIED"]
assert sorted([e.data for e in url_events]) == sorted(["https://ssl/", "http://127.0.0.1:8888/"])

0 comments on commit 798670d

Please sign in to comment.