From dd60ceaf8bd9c21400413bc93e852e1e81759439 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 6 Nov 2024 15:33:14 -0500 Subject: [PATCH] documentation, tests for blacklisting by regex --- bbot/modules/bevigil.py | 4 +- bbot/scanner/preset/path.py | 4 +- bbot/scanner/preset/preset.py | 2 +- bbot/scanner/target.py | 79 ++++++++++++++-------------- bbot/test/bbot_fixtures.py | 8 --- bbot/test/conftest.py | 8 +++ bbot/test/test_step_1/test_target.py | 63 ++++++++++++++++++++++ docs/scanning/index.md | 25 +++++++++ 8 files changed, 141 insertions(+), 52 deletions(-) diff --git a/bbot/modules/bevigil.py b/bbot/modules/bevigil.py index f3889e7fd4..8e70fe4143 100644 --- a/bbot/modules/bevigil.py +++ b/bbot/modules/bevigil.py @@ -60,14 +60,14 @@ async def request_urls(self, query): url = f"{self.base_url}/{self.helpers.quote(query)}/urls/" return await self.api_request(url) - def parse_subdomains(self, r, query=None): + async def parse_subdomains(self, r, query=None): results = set() subdomains = r.json().get("subdomains") if subdomains: results.update(subdomains) return results - def parse_urls(self, r, query=None): + async def parse_urls(self, r, query=None): results = set() urls = r.json().get("urls") if urls: diff --git a/bbot/scanner/preset/path.py b/bbot/scanner/preset/path.py index 730b16e637..9b84566124 100644 --- a/bbot/scanner/preset/path.py +++ b/bbot/scanner/preset/path.py @@ -33,7 +33,9 @@ def find(self, filename): if "/" in str(filename): if filename_path.parent not in paths_to_search: paths_to_search.append(filename_path.parent) - log.debug(f"Searching for preset in {paths_to_search}, file candidates: {file_candidates_str}") + log.debug( + f"Searching for preset in {[str(p) for p in paths_to_search]}, file candidates: {file_candidates_str}" + ) for path in paths_to_search: for candidate in file_candidates: for file in path.rglob(candidate): diff --git a/bbot/scanner/preset/preset.py b/bbot/scanner/preset/preset.py index 976053308f..7f5262a45e 100644 
--- a/bbot/scanner/preset/preset.py +++ b/bbot/scanner/preset/preset.py @@ -245,7 +245,7 @@ def __init__( # "presets" is alias to "include" if presets and include: raise ValueError( - 'Cannot use both "presets" and "include" args at the same time (presets is only an alias to include). Please pick only one :)' + 'Cannot use both "presets" and "include" args at the same time (presets is an alias to include). Please pick one or the other :)' ) if presets and not include: include = presets diff --git a/bbot/scanner/target.py b/bbot/scanner/target.py index b55d143b98..68067cee0a 100644 --- a/bbot/scanner/target.py +++ b/bbot/scanner/target.py @@ -42,23 +42,16 @@ class BaseTarget(RadixTarget): def __init__(self, *targets, scan=None, **kwargs): self.scan = scan self.events = set() - super().__init__(**kwargs) - # we preserve the raw inputs to ensure we don't lose any information - self.inputs, events = self._make_events(targets) - # sort by host size to ensure consistency - events = sorted(events, key=lambda e: (0 if not e.host else host_size_key(e.host))) - for event in events: - if event.host: - self._add(event.host, data=event) - else: - self.events.add(event) + self.inputs = set() # Register decorated methods for method in dir(self): - if callable(getattr(self, method)): + if callable(getattr(self, method, None)): func = getattr(self, method) if hasattr(func, "_regex"): self.special_target_types[func._regex] = func + super().__init__(*targets, **kwargs) + def get(self, event, single=True, **kwargs): """ Override default .get() to accept events and optionally return multiple results @@ -92,42 +85,42 @@ def make_event(self, *args, **kwargs): kwargs["tags"].update(self.tags) return make_event(*args, dummy=True, scan=self.scan, **kwargs) - def _add(self, host, data=None): - """ - Overrides the base method to enable having multiple events for the same host. - - The "data" attribute of the node is now a set of events. 
- """ - if data is None: - event = self.make_event(host) - else: - event = data - self.events.add(event) - if event.host: - try: - event_set = self.get(event.host, single=False, raise_error=True) - event_set.add(event) - except KeyError: - event_set = {event} - super()._add(event.host, data=event_set) - return event - - def _make_events(self, targets): - inputs = set() + def add(self, targets): + if not isinstance(targets, (list, set, tuple)): + targets = [targets] events = set() for target in targets: _events = [] special_target_type, _events = self.check_special_target_types(str(target)) if special_target_type: - inputs.add(str(target)) + self.inputs.add(str(target)) else: event = self.make_event(target) if event: _events = [event] for event in _events: - inputs.add(event.data) + self.inputs.add(event.data) events.add(event) - return inputs, events + + # sort by host size to ensure consistency + events = sorted(events, key=lambda e: (0 if not e.host else host_size_key(e.host))) + for event in events: + self._add(event.host, data=event) + + def _add(self, host, data): + """ + Overrides the base method to enable having multiple events for the same host. + + The "data" attribute of the node is now a set of events. 
+ """ + self.events.add(data) + if host: + try: + event_set = self.get(host, single=False, raise_error=True) + event_set.add(data) + except KeyError: + event_set = {data} + super()._add(host, data=event_set) def check_special_target_types(self, target): for regex, callback in self.special_target_types.items(): @@ -205,14 +198,20 @@ def get(self, event, **kwargs): """ event = self.make_event(event) # first, check event's host against blacklist - event_result = super().get(event, **kwargs) + try: + event_result = super().get(event, raise_error=True) + except KeyError: + event_result = None if event_result is not None: return event_result # next, check event's host against regexes host_or_url = event.host_filterable - for regex in self.blacklist_regexes: - if regex.match(host_or_url): - return event + if host_or_url: + for regex in self.blacklist_regexes: + if regex.search(str(host_or_url)): + return event + if kwargs.get("raise_error", False): + raise KeyError(f"Host not found: '{event.data}'") return None def _hash_value(self): diff --git a/bbot/test/bbot_fixtures.py b/bbot/test/bbot_fixtures.py index 0b2a0ec573..18738f4996 100644 --- a/bbot/test/bbot_fixtures.py +++ b/bbot/test/bbot_fixtures.py @@ -222,11 +222,3 @@ class bbot_events: e.scope_distance = 0 return bbot_events - - -@pytest.fixture(scope="session", autouse=True) -def install_all_python_deps(): - deps_pip = set() - for module in DEFAULT_PRESET.module_loader.preloaded().values(): - deps_pip.update(set(module.get("deps", {}).get("pip", []))) - subprocess.run([sys.executable, "-m", "pip", "install"] + list(deps_pip)) diff --git a/bbot/test/conftest.py b/bbot/test/conftest.py index c2e8b3448a..a8ce20d359 100644 --- a/bbot/test/conftest.py +++ b/bbot/test/conftest.py @@ -316,6 +316,14 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): # pragma: no # traceback.print_exc() +@pytest.fixture(scope="session", autouse=True) +def install_all_python_deps(): + deps_pip = set() + for module in 
DEFAULT_PRESET.module_loader.preloaded().values(): + deps_pip.update(set(module.get("deps", {}).get("pip", []))) + subprocess.run([sys.executable, "-m", "pip", "install"] + list(deps_pip)) + + @pytest.hookimpl(tryfirst=True, hookwrapper=True) def pytest_sessionfinish(session, exitstatus): # Remove handlers from all loggers to prevent logging errors at exit diff --git a/bbot/test/test_step_1/test_target.py b/bbot/test/test_step_1/test_target.py index 4dd4f17d7f..0890e5dfbf 100644 --- a/bbot/test/test_step_1/test_target.py +++ b/bbot/test/test_step_1/test_target.py @@ -333,3 +333,66 @@ async def test_target(bbot_scanner): events = target.get("www.evilcorp.com", single=False) assert len(events) == 2 assert set([e.data for e in events]) == {"http://evilcorp.com/", "evilcorp.com:443"} + + +@pytest.mark.asyncio +async def test_blacklist_regex(bbot_scanner, bbot_httpserver): + + from bbot.scanner.target import ScanBlacklist + + blacklist = ScanBlacklist("evilcorp.com") + assert blacklist.inputs == {"evilcorp.com"} + assert "www.evilcorp.com" in blacklist + assert "http://www.evilcorp.com" in blacklist + blacklist.add("RE:test") + assert "RE:test" in blacklist.inputs + assert set(blacklist.inputs) == {"evilcorp.com", "RE:test"} + assert blacklist.blacklist_regexes + assert next(iter(blacklist.blacklist_regexes)).pattern == "test" + result1 = blacklist.get("test.com") + assert result1.type == "DNS_NAME" + assert result1.data == "test.com" + result2 = blacklist.get("www.evilcorp.com") + assert result2.type == "DNS_NAME" + assert result2.data == "evilcorp.com" + result2 = blacklist.get("www.evil.com") + assert result2 is None + with pytest.raises(KeyError): + blacklist.get("www.evil.com", raise_error=True) + assert "test.com" in blacklist + assert "http://evilcorp.com/test.aspx" in blacklist + assert not "http://tes.com" in blacklist + + blacklist = ScanBlacklist("evilcorp.com", r"RE:[0-9]{6}\.aspx$") + assert "http://evilcorp.com" in blacklist + assert not 
"http://test.com/123456" in blacklist + assert not "http://test.com/12345.aspx?a=asdf" in blacklist + assert not "http://test.com/asdf/123456.aspx/asdf" in blacklist + assert "http://test.com/asdf/123456.aspx?a=asdf" in blacklist + assert "http://test.com/asdf/123456.aspx" in blacklist + + bbot_httpserver.expect_request(uri="/").respond_with_data("") + bbot_httpserver.expect_request(uri="/asdfevilasdf").respond_with_data("") + + # make sure URL is detected normally + scan = bbot_scanner("http://127.0.0.1:8888/", presets=["spider"], config={"excavate": True}, debug=True) + events = [e async for e in scan.async_start()] + urls = [e.data for e in events if e.type == "URL"] + assert len(urls) == 2 + assert set(urls) == {"http://127.0.0.1:8888/", "http://127.0.0.1:8888/asdfevil333asdf"} + + # same scan again but with blacklist regex + scan = bbot_scanner( + "http://127.0.0.1:8888/", + blacklist=[r"RE:evil[0-9]{3}"], + presets=["spider"], + config={"excavate": True}, + debug=True, + ) + print(scan.target.blacklist.blacklist_regexes) + assert scan.target.blacklist.blacklist_regexes + assert next(iter(scan.target.blacklist.blacklist_regexes)).pattern == "evil[0-9]{3}" + events = [e async for e in scan.async_start()] + urls = [e.data for e in events if e.type == "URL"] + assert len(urls) == 1 + assert set(urls) == {"http://127.0.0.1:8888/"} diff --git a/docs/scanning/index.md b/docs/scanning/index.md index e2e4a79211..e515653c61 100644 --- a/docs/scanning/index.md +++ b/docs/scanning/index.md @@ -178,6 +178,8 @@ Note that `--strict-scope` only applies to targets and whitelists, but not black BBOT allows precise control over scope with whitelists and blacklists. These both use the same syntax as `--target`, meaning they accept the same event types, and you can specify an unlimited number of them, via a file, the CLI, or both. +#### Whitelists + `--whitelist` enables you to override what's in scope. 
For example, if you want to run nuclei against `evilcorp.com`, but stay only inside their corporate IP range of `1.2.3.0/24`, you can accomplish this like so: ```bash @@ -185,6 +187,8 @@ BBOT allows precise control over scope with whitelists and blacklists. These bot bbot -t evilcorp.com --whitelist 1.2.3.0/24 -f subdomain-enum -m nmap nuclei --allow-deadly ``` +#### Blacklists + `--blacklist` takes ultimate precedence. Anything in the blacklist is completely excluded from the scan, even if it's in the whitelist. ```bash @@ -192,6 +196,27 @@ bbot -t evilcorp.com --whitelist 1.2.3.0/24 -f subdomain-enum -m nmap nuclei --a bbot -t evilcorp.com --blacklist internal.evilcorp.com -f subdomain-enum -m nmap nuclei --allow-deadly ``` +#### Blacklist by Regex + +Blacklists also accept regex patterns. These regexes are checked against the full URL, including the host and path. + +To specify a regex, prefix the pattern with `RE:`. For example, to exclude all events containing "signout", you could do: + +```bash +bbot -t evilcorp.com --blacklist "RE:signout" +``` + +Note that this would blacklist both of the following events: + +- `[URL] http://evilcorp.com/signout.aspx` +- `[DNS_NAME] signout.evilcorp.com` + +If you only want to blacklist the URL, you could narrow the regex like so: + +```bash +bbot -t evilcorp.com --blacklist 'RE:signout\.aspx$' +``` + ## DNS Wildcards BBOT has robust wildcard detection built-in. It can reliably detect wildcard domains, and will tag them accordingly: