diff --git a/bbot/defaults.yml b/bbot/defaults.yml
index 1e5a1a080f..98ef4d21c8 100644
--- a/bbot/defaults.yml
+++ b/bbot/defaults.yml
@@ -100,6 +100,7 @@ url_extension_blacklist:
- woff
- woff2
- ttf
+ - eot
- sass
- scss
# audio
diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py
index 24322d06b1..40067c69e2 100644
--- a/bbot/modules/internal/excavate.py
+++ b/bbot/modules/internal/excavate.py
@@ -67,7 +67,7 @@ async def report(self, result, name, event, **kwargs):
class URLExtractor(BaseExtractor):
- url_path_regex = r"((?:\w|\d)(?:[\d\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]+)*/?)"
+ url_path_regex = r"((?:\w|\d)(?:[\d\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)"
regexes = {
"fulluri": r"(?i)" + r"([a-z]\w{1,15})://" + url_path_regex,
"fullurl": r"(?i)" + r"(https?)://" + url_path_regex,
diff --git a/bbot/test/test_step_2/module_tests/test_module_httpx.py b/bbot/test/test_step_2/module_tests/test_module_httpx.py
index ebd9bbdb11..e4970e02e4 100644
--- a/bbot/test/test_step_2/module_tests/test_module_httpx.py
+++ b/bbot/test/test_step_2/module_tests/test_module_httpx.py
@@ -98,3 +98,29 @@ def check(self, module_test, events):
if e.type.startswith("DNS_NAME") and e.data == "www.evilcorp.com" and "affiliate" in e.tags
]
)
+
+
+class TestHTTPX_URLBlacklist(ModuleTestBase):
+    """Links with blacklisted extensions (.svg, .woff2) must yield no URL events; other links are still visited."""
+
+    targets = ["http://127.0.0.1:8888"]
+    modules_overrides = ["httpx", "speculate", "excavate"]
+    config_overrides = {"web_spider_distance": 10, "web_spider_depth": 10}
+
+    async def setup_after_prep(self, module_test):
+        module_test.httpserver.expect_request("/").respond_with_data(
+            """
+            <a href="/test.aspx"/>
+            <a href="/test.svg"/>
+            <a href="/test.woff2"/>
+            <a href="/test.txt"/>
+            """
+        )
+
+    def check(self, module_test, events):
+        assert 4 == len([e for e in events if e.type == "URL_UNVERIFIED"])
+        assert 3 == len([e for e in events if e.type == "HTTP_RESPONSE"])
+        # Only the root page and the two non-blacklisted links may become URL events, exactly once each.
+        urls = sorted(e.data for e in events if e.type == "URL")
+        assert urls == ["http://127.0.0.1:8888/", "http://127.0.0.1:8888/test.aspx", "http://127.0.0.1:8888/test.txt"]
+        assert not any("URL" in e.type and (".svg" in e.data or ".woff" in e.data) for e in events)