From 6821954c77982415c1e2e5ea5bd684825e3cbabd Mon Sep 17 00:00:00 2001 From: TheTechromancer Date: Thu, 30 Nov 2023 13:24:33 -0500 Subject: [PATCH 1/5] better handling of non-HTTP URIs --- bbot/modules/internal/excavate.py | 14 ++--- bbot/test/conftest.py | 2 +- .../module_tests/test_module_excavate.py | 54 +++++++++++++++++-- 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py index 04d8000fa..e27e6d228 100644 --- a/bbot/modules/internal/excavate.py +++ b/bbot/modules/internal/excavate.py @@ -149,6 +149,8 @@ def report(self, result, name, event, **kwargs): host, port = self.excavate.helpers.split_host_port(parsed_uri.netloc) # Handle non-HTTP URIs (ftp, s3, etc.) if not "http" in parsed_uri.scheme.lower(): + # these findings are pretty mundane so don't bother with them if they aren't in scope + abort_if = lambda e: e.scope_distance > 0 event_data = {"host": str(host), "description": f"Non-HTTP URI: {result}"} parsed_url = getattr(event, "parsed", None) if parsed_url: @@ -157,11 +159,16 @@ def report(self, result, name, event, **kwargs): event_data, "FINDING", source=event, + abort_if=abort_if, ) + protocol_data = {"protocol": parsed_uri.scheme, "host": str(host)} + if port: + protocol_data["port"] = port self.excavate.emit_event( - {"protocol": parsed_uri.scheme, "host": str(host)}, + protocol_data, "PROTOCOL", source=event, + abort_if=abort_if, ) return @@ -340,7 +347,6 @@ async def handle_event(self, event): web_spider_distance = getattr(event, "web_spider_distance", 0) num_redirects = max(getattr(event, "num_redirects", 0), web_spider_distance) location = event.data.get("location", "") - host = event.host # if it's a redirect if location: # get the url scheme @@ -361,10 +367,6 @@ async def handle_event(self, event): self.emit_event(url_event) else: self.verbose(f"Exceeded max HTTP redirects ({self.max_redirects}): {location}") - elif scheme: - # we ran into a scheme that's not HTTP or HTTPS - data = {"host": host, "description": f"Non-standard URI scheme: {scheme}://", "url": location} - self.emit_event(data, "FINDING", event) body = self.helpers.recursive_decode(event.data.get("body", "")) # Cloud extractors diff --git a/bbot/test/conftest.py b/bbot/test/conftest.py index 4dcf8ed21..15ef7ecc2 100644 --- a/bbot/test/conftest.py +++ b/bbot/test/conftest.py @@ -18,7 +18,7 @@ def pytest_sessionfinish(session, exitstatus): logger.removeHandler(handler) # Wipe out BBOT home dir - shutil.rmtree("/tmp/.bbot_test", ignore_errors=True) + # shutil.rmtree("/tmp/.bbot_test", ignore_errors=True) yield diff --git a/bbot/test/test_step_2/module_tests/test_module_excavate.py b/bbot/test/test_step_2/module_tests/test_module_excavate.py index 2d65dde4d..4ca750b4e 100644 --- a/bbot/test/test_step_2/module_tests/test_module_excavate.py +++ b/bbot/test/test_step_2/module_tests/test_module_excavate.py @@ -150,7 +150,7 @@ def check(self, module_test, events): class TestExcavateRedirect(TestExcavate): - targets = ["http://127.0.0.1:8888/", "http://127.0.0.1:8888/relative/"] + targets = ["http://127.0.0.1:8888/", "http://127.0.0.1:8888/relative/", "http://127.0.0.1:8888/nonhttpredirect/"] config_overrides = {"scope_report_distance": 1} async def setup_before_prep(self, module_test): @@ -161,11 +161,59 @@ async def setup_before_prep(self, module_test): module_test.httpserver.expect_request("/relative/").respond_with_data( "", status=302, headers={"Location": "./owa/"} ) + module_test.httpserver.expect_request("/relative/owa/").respond_with_data( + "ftp://127.0.0.1:2121\nsmb://127.0.0.1\nssh://127.0.0.2" + ) + module_test.httpserver.expect_request("/nonhttpredirect/").respond_with_data( + "", status=302, headers={"Location": "awb://127.0.0.1:7777"} + ) module_test.httpserver.no_handler_status_code = 404 def check(self, module_test, events): - assert any(e.data == "https://www.test.notreal/yep" for e in events) - assert any(e.data == "http://127.0.0.1:8888/relative/owa/" for e in events) + assert 1 == len( + [ + e + for e in events + if e.type == "URL_UNVERIFIED" and e.data == "https://www.test.notreal/yep" and e.scope_distance == 1 + ] + ) + assert 1 == len([e for e in events if e.type == "URL" and e.data == "http://127.0.0.1:8888/relative/owa/"]) + assert 1 == len( + [ + e + for e in events + if e.type == "FINDING" and e.data["description"] == "Non-HTTP URI: awb://127.0.0.1:7777" + ] + ) + assert 1 == len( + [ + e + for e in events + if e.type == "PROTOCOL" and e.data["protocol"] == "AWB" and e.data.get("port", 0) == 7777 + ] + ) + assert 1 == len( + [ + e + for e in events + if e.type == "FINDING" and e.data["description"] == "Non-HTTP URI: ftp://127.0.0.1:2121" + ] + ) + assert 1 == len( + [ + e + for e in events + if e.type == "PROTOCOL" and e.data["protocol"] == "FTP" and e.data.get("port", 0) == 2121 + ] + ) + assert 1 == len( + [e for e in events if e.type == "FINDING" and e.data["description"] == "Non-HTTP URI: smb://127.0.0.1"] + ) + assert 1 == len( + [e for e in events if e.type == "PROTOCOL" and e.data["protocol"] == "SMB" and not "port" in e.data] + ) + assert 0 == len([e for e in events if e.type == "FINDING" and "ssh://127.0.0.1" in e.data["description"]]) + assert 0 == len([e for e in events if e.type == "PROTOCOL" and e.data["protocol"] == "SSH"]) class TestExcavateMaxLinksPerPage(TestExcavate): From ac1550340717d8a6b718828d75828d5d235cfd4c Mon Sep 17 00:00:00 2001 From: TheTechromancer Date: Thu, 30 Nov 2023 13:27:09 -0500 Subject: [PATCH 2/5] remove redundant sanitization --- bbot/core/event/base.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py index 57d2108a8..52cd2847a 100644 --- a/bbot/core/event/base.py +++ b/bbot/core/event/base.py @@ -722,12 +722,6 @@ def sanitize_data(self, data): class DictEvent(BaseEvent): - def sanitize_data(self, data): - url = data.get("url", "") - if url: - self.parsed = validators.validate_url_parsed(url) - return data - def _data_human(self): return json.dumps(self.data, sort_keys=True) From 036729ecb3a5f01ca75b2fefc22f89450c17d79a Mon Sep 17 00:00:00 2001 From: TheTechromancer Date: Thu, 30 Nov 2023 13:29:10 -0500 Subject: [PATCH 3/5] remove debugging --- bbot/test/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbot/test/conftest.py b/bbot/test/conftest.py index 15ef7ecc2..4dcf8ed21 100644 --- a/bbot/test/conftest.py +++ b/bbot/test/conftest.py @@ -18,7 +18,7 @@ def pytest_sessionfinish(session, exitstatus): logger.removeHandler(handler) # Wipe out BBOT home dir - # shutil.rmtree("/tmp/.bbot_test", ignore_errors=True) + shutil.rmtree("/tmp/.bbot_test", ignore_errors=True) yield From ff158fec88f623ee3588f21b4d3fd16553b7723b Mon Sep 17 00:00:00 2001 From: TheTechromancer Date: Thu, 30 Nov 2023 16:28:47 -0500 Subject: [PATCH 4/5] oopsie --- bbot/core/event/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py index 52cd2847a..57d2108a8 100644 --- a/bbot/core/event/base.py +++ b/bbot/core/event/base.py @@ -722,6 +722,12 @@ def sanitize_data(self, data): class DictEvent(BaseEvent): + def sanitize_data(self, data): + url = data.get("url", "") + if url: + self.parsed = validators.validate_url_parsed(url) + return data + def _data_human(self): return json.dumps(self.data, sort_keys=True) From 73d309f3434bb0dae29f558e0256a20e13f5e27d Mon Sep 17 00:00:00 2001 From: TheTechromancer Date: Fri, 15 Dec 2023 16:25:00 -0500 Subject: [PATCH 5/5] fix queue bug by implementing shufflequeue --- bbot/core/helpers/async_helpers.py | 10 ++++++++++ bbot/modules/base.py | 6 +++--- bbot/scanner/manager.py | 13 +++---------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/bbot/core/helpers/async_helpers.py b/bbot/core/helpers/async_helpers.py index f2a054892..916764e53 100644 --- a/bbot/core/helpers/async_helpers.py +++ b/bbot/core/helpers/async_helpers.py @@ -1,4 +1,5 @@ import uuid +import random import asyncio import logging import threading @@ -12,6 +13,15 @@ from .cache import CacheDict +class ShuffleQueue(asyncio.Queue): + def _put(self, item): + random_index = random.randint(0, self.qsize()) + self._queue.insert(random_index, item) + + def _get(self): + return self._queue.popleft() + + class _Lock(asyncio.Lock): def __init__(self, name): self.name = name diff --git a/bbot/modules/base.py b/bbot/modules/base.py index 6dee96745..7599289ea 100644 --- a/bbot/modules/base.py +++ b/bbot/modules/base.py @@ -6,7 +6,7 @@ from ..core.helpers.misc import get_size # noqa from ..core.errors import ValidationError -from ..core.helpers.async_helpers import TaskCounter +from ..core.helpers.async_helpers import TaskCounter, ShuffleQueue class BaseModule: @@ -1065,13 +1065,13 @@ def config(self): @property def incoming_event_queue(self): if self._incoming_event_queue is None: - self._incoming_event_queue = asyncio.PriorityQueue() + self._incoming_event_queue = ShuffleQueue() return self._incoming_event_queue @property def outgoing_event_queue(self): if self._outgoing_event_queue is None: - self._outgoing_event_queue = asyncio.PriorityQueue() + self._outgoing_event_queue = ShuffleQueue() return self._outgoing_event_queue @property diff --git a/bbot/scanner/manager.py b/bbot/scanner/manager.py index 13d46669d..1258c9e80 100644 --- a/bbot/scanner/manager.py +++ b/bbot/scanner/manager.py @@ -4,7 +4,7 @@ from contextlib import suppress from ..core.errors import ValidationError -from ..core.helpers.async_helpers import TaskCounter +from ..core.helpers.async_helpers import TaskCounter, ShuffleQueue log = logging.getLogger("bbot.scanner.manager") @@ -18,7 +18,7 @@ class ScanManager: Attributes: scan (Scan): Reference to the Scan object that instantiated the ScanManager. - incoming_event_queue (asyncio.PriorityQueue): Queue storing incoming events for processing. + incoming_event_queue (ShuffleQueue): Queue storing incoming events for processing. events_distributed (set): Set tracking globally unique events. events_accepted (set): Set tracking events accepted by individual modules. dns_resolution (bool): Flag to enable or disable DNS resolution. @@ -39,14 +39,7 @@ def __init__(self, scan): self.scan = scan - # TODO: consider reworking modules' dedupe policy (accept_dupes) - # by creating a function that decides the criteria for what is - # considered to be a duplicate (by default this would be a simple - # hash(event)), but allowing each module to override it if needed. - # If a module used the default function, its dedupe could be done - # at the manager level to save memory. If not, it would be done by the scan. - - self.incoming_event_queue = asyncio.PriorityQueue() + self.incoming_event_queue = ShuffleQueue() # track incoming duplicates module-by-module (for `suppress_dupes` attribute of modules) self.incoming_dup_tracker = set() # track outgoing duplicates (for `accept_dupes` attribute of modules)