diff --git a/bbot/defaults.yml b/bbot/defaults.yml index 1e5a1a080..98ef4d21c 100644 --- a/bbot/defaults.yml +++ b/bbot/defaults.yml @@ -100,6 +100,7 @@ url_extension_blacklist: - woff - woff2 - ttf + - eot - sass - scss # audio diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py index 24322d06b..40067c69e 100644 --- a/bbot/modules/internal/excavate.py +++ b/bbot/modules/internal/excavate.py @@ -67,7 +67,7 @@ async def report(self, result, name, event, **kwargs): class URLExtractor(BaseExtractor): - url_path_regex = r"((?:\w|\d)(?:[\d\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]+)*/?)" + url_path_regex = r"((?:\w|\d)(?:[\d\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)" regexes = { "fulluri": r"(?i)" + r"([a-z]\w{1,15})://" + url_path_regex, "fullurl": r"(?i)" + r"(https?)://" + url_path_regex, diff --git a/bbot/test/test_step_2/module_tests/test_module_httpx.py b/bbot/test/test_step_2/module_tests/test_module_httpx.py index ebd9bbdb1..e4970e02e 100644 --- a/bbot/test/test_step_2/module_tests/test_module_httpx.py +++ b/bbot/test/test_step_2/module_tests/test_module_httpx.py @@ -98,3 +98,29 @@ def check(self, module_test, events): if e.type.startswith("DNS_NAME") and e.data == "www.evilcorp.com" and "affiliate" in e.tags ] ) + + +class TestHTTPX_URLBlacklist(ModuleTestBase): + targets = ["http://127.0.0.1:8888"] + modules_overrides = ["httpx", "speculate", "excavate"] + config_overrides = {"web_spider_distance": 10, "web_spider_depth": 10} + + async def setup_after_prep(self, module_test): + module_test.httpserver.expect_request("/").respond_with_data( + """ + + + + + """ + ) + + def check(self, module_test, events): + assert 4 == len([e for e in events if e.type == "URL_UNVERIFIED"]) + assert 3 == len([e for e in events if e.type == "HTTP_RESPONSE"]) + assert 3 == len([e for e in events if e.type == "URL"]) + assert 1 == len([e for e in events if e.type == "URL" and e.data == "http://127.0.0.1:8888/"]) + assert 1 == len([e for e in events if e.type == "URL" and e.data == "http://127.0.0.1:8888/test.aspx"]) + assert 1 == len([e for e in events if e.type == "URL" and e.data == "http://127.0.0.1:8888/test.txt"]) + assert not any([e for e in events if "URL" in e.type and ".svg" in e.data]) + assert not any([e for e in events if "URL" in e.type and ".woff" in e.data])