From be9ddade3d0e612a53bf4b129a4017abcde3cc31 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Sun, 13 Oct 2024 22:49:56 -0400
Subject: [PATCH 1/7] Convert excavate to intercept module

---
 bbot/modules/base.py                   |  3 +-
 bbot/modules/httpx.py                  |  3 -
 bbot/modules/internal/cloudcheck.py    |  4 +-
 bbot/modules/internal/dnsresolve.py    |  4 +-
 bbot/modules/internal/excavate.py      | 56 +++++++++++++++++--
 bbot/scanner/manager.py                |  6 +-
 .../module_tests/test_module_httpx.py  |  4 +-
 7 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/bbot/modules/base.py b/bbot/modules/base.py
index 946506094..89660edb2 100644
--- a/bbot/modules/base.py
+++ b/bbot/modules/base.py
@@ -1559,7 +1559,7 @@ def critical(self, *args, trace=True, **kwargs):
         self.trace()
 
 
-class InterceptModule(BaseModule):
+class BaseInterceptModule(BaseModule):
     """
     An Intercept Module is a special type of high-priority module that gets early access to events.
 
@@ -1571,7 +1571,6 @@ class InterceptModule(BaseModule):
     """
 
     accept_dupes = True
-    suppress_dupes = False
    _intercept = True
 
     async def _worker(self):
diff --git a/bbot/modules/httpx.py b/bbot/modules/httpx.py
index deda243a0..2cd2c0504 100644
--- a/bbot/modules/httpx.py
+++ b/bbot/modules/httpx.py
@@ -172,9 +172,6 @@ async def handle_batch(self, *events):
                 httpx_ip = j.get("host", "")
                 if httpx_ip:
                     tags.append(f"ip-{httpx_ip}")
-                # detect login pages
-                if self.helpers.web.is_login_page(j.get("body", "")):
-                    tags.append("login-page")
                 # grab title
                 title = self.helpers.tagify(j.get("title", ""), maxlen=30)
                 if title:
diff --git a/bbot/modules/internal/cloudcheck.py b/bbot/modules/internal/cloudcheck.py
index 9b7b6e147..392c8e0c5 100644
--- a/bbot/modules/internal/cloudcheck.py
+++ b/bbot/modules/internal/cloudcheck.py
@@ -1,7 +1,7 @@
-from bbot.modules.base import InterceptModule
+from bbot.modules.base import BaseInterceptModule
 
 
-class CloudCheck(InterceptModule):
+class CloudCheck(BaseInterceptModule):
     watched_events = ["*"]
     meta = {"description": "Tag events by cloud provider, identify cloud resources like storage buckets"}
     scope_distance_modifier = 1
diff --git a/bbot/modules/internal/dnsresolve.py b/bbot/modules/internal/dnsresolve.py
index 5dc4acc83..53b317d9a 100644
--- a/bbot/modules/internal/dnsresolve.py
+++ b/bbot/modules/internal/dnsresolve.py
@@ -3,11 +3,11 @@
 from bbot.errors import ValidationError
 from bbot.core.helpers.dns.engine import all_rdtypes
-from bbot.modules.base import InterceptModule, BaseModule
 from bbot.core.helpers.dns.helpers import extract_targets
+from bbot.modules.base import BaseInterceptModule, BaseModule
 
 
-class DNSResolve(InterceptModule):
+class DNSResolve(BaseInterceptModule):
     watched_events = ["*"]
     _priority = 1
     scope_distance_modifier = None
diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py
index b85881d8b..d07f5feb1 100644
--- a/bbot/modules/internal/excavate.py
+++ b/bbot/modules/internal/excavate.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 from bbot.errors import ExcavateError
 import bbot.core.helpers.regexes as bbot_regexes
+from bbot.modules.base import BaseInterceptModule
 from bbot.modules.internal.base import BaseInternalModule
 from urllib.parse import urlparse, urljoin, parse_qs, urlunparse
 
@@ -279,7 +280,7 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
         await self.report(event_data, event, yara_rule_settings, discovery_context)
 
 
-class excavate(BaseInternalModule):
+class excavate(BaseInternalModule, BaseInterceptModule):
     """
     Example (simple) Excavate Rules:
 
@@ -310,6 +311,7 @@ class excavateTestRule(ExcavateRule):
         "custom_yara_rules": "Include custom Yara rules",
     }
     scope_distance_modifier = None
+    accept_dupes = False
     _module_threads = 8
 
@@ -669,8 +671,32 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
 
     class URLExtractor(ExcavateRule):
         yara_rules = {
-            "url_full": r'rule url_full { meta: tags = "spider-danger" description = "contains full URL" strings: $url_full = /https?:\/\/([\w\.-]+)(:\d{1,5})?([\/\w\.-]*)/ condition: $url_full }',
-            "url_attr": r'rule url_attr { meta: tags = "spider-danger" description = "contains tag with src or href attribute" strings: $url_attr = /<[^>]+(href|src)=["\'][^"\']*["\'][^>]*>/ condition: $url_attr }',
+            "url_full": (
+                r"""
+                rule url_full {
+                    meta:
+                        tags = "spider-danger"
+                        description = "contains full URL"
+                    strings:
+                        $url_full = /https?:\/\/([\w\.-]+)(:\d{1,5})?([\/\w\.-]*)/
+                    condition:
+                        $url_full
+                }
+                """
+            ),
+            "url_attr": (
+                r"""
+                rule url_attr {
+                    meta:
+                        tags = "spider-danger"
+                        description = "contains tag with src or href attribute"
+                    strings:
+                        $url_attr = /<[^>]+(href|src)=["\'][^"\']*["\'][^>]*>/
+                    condition:
+                        $url_attr
+                }
+                """
+            ),
         }
         full_url_regex = re.compile(r"(https?)://((?:\w|\d)(?:[\d\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)")
         full_url_regex_strict = re.compile(r"^(https?):\/\/([\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$")
@@ -749,6 +775,26 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
             for domain_str in yara_results[identifier]:
                 await self.report(domain_str, event, yara_rule_settings, discovery_context, event_type="DNS_NAME")
 
+    class LoginPageExtractor(ExcavateRule):
+        yara_rules = {
+            "login_page": r"""
+            rule login_page {
+                meta:
+                    description = "Detects login pages with username and password fields"
+                strings:
+                    $username_field = /<input[^>]+name=["']?(user|login|email)/ nocase
+                    $password_field = /<input[^>]+name=["']?passw?/ nocase
+                condition:
+                    $username_field and $password_field
+            }
+            """
+        }
+
+        async def process(self, yara_results, event, yara_rule_settings, discovery_context):
+            self.excavate.critical(f"Login page detected: {event.data['url']}")
+            if yara_results:
+                event.add_tag("login-page")
+
     def add_yara_rule(self, rule_name, rule_content, rule_instance):
         rule_instance.name = rule_name
         self.yara_rules_dict[rule_name] = rule_content
@@ -829,7 +875,9 @@ async def setup(self):
         yara_rules_combined = "\n".join(self.yara_rules_dict.values())
         try:
             self.info(f"Compiling {len(self.yara_rules_dict):,} YARA rules")
-            self.yara_rules = yara.compile(source=yara_rules_combined)
+            for rule_name, rule_content in self.yara_rules_dict.items():
+                self.info(f"Compiling YARA rule [{rule_name}]")
+                self.yara_rules = yara.compile(source=yara_rules_combined)
         except yara.SyntaxError as e:
             self.debug(yara_rules_combined)
             return False, f"Yara Rules failed to compile with error: [{e}]"
diff --git a/bbot/scanner/manager.py b/bbot/scanner/manager.py
index 70658e69d..720331625 100644
--- a/bbot/scanner/manager.py
+++ b/bbot/scanner/manager.py
@@ -1,10 +1,10 @@
 import asyncio
 from contextlib import suppress
 
-from bbot.modules.base import InterceptModule
+from bbot.modules.base import BaseInterceptModule
 
 
-class ScanIngress(InterceptModule):
+class ScanIngress(BaseInterceptModule):
     """
     This is always the first intercept module in the chain, responsible for basic scope checks
 
@@ -169,7 +169,7 @@ def is_incoming_duplicate(self, event, add=False):
         return False
 
 
-class ScanEgress(InterceptModule):
+class ScanEgress(BaseInterceptModule):
     """
     This is always the last intercept module in the chain, responsible for executing and acting on
     the `abort_if` and `on_success_callback` functions.
diff --git a/bbot/test/test_step_2/module_tests/test_module_httpx.py b/bbot/test/test_step_2/module_tests/test_module_httpx.py
index ef9744516..c05b6842d 100644
--- a/bbot/test/test_step_2/module_tests/test_module_httpx.py
+++ b/bbot/test/test_step_2/module_tests/test_module_httpx.py
@@ -1,8 +1,10 @@
 from .base import ModuleTestBase
 
 
-class TestHTTPX(ModuleTestBase):
+class TestHTTPXBase(ModuleTestBase):
     targets = ["http://127.0.0.1:8888/url", "127.0.0.1:8888"]
+    module_name = "httpx"
+    modules_overrides = ["httpx", "excavate"]
     config_overrides = {"modules": {"httpx": {"store_responses": True}}}
 
     # HTML for a page with a login form

From 8642bc3edadd2b6c96c25735b8f2973ae3879ba2 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Sun, 13 Oct 2024 22:50:32 -0400
Subject: [PATCH 2/7] remove old login page helper

---
 bbot/core/helpers/web/web.py | 47 ------------------------------------
 1 file changed, 47 deletions(-)

diff --git a/bbot/core/helpers/web/web.py b/bbot/core/helpers/web/web.py
index a49748008..28c8b5f37 100644
--- a/bbot/core/helpers/web/web.py
+++ b/bbot/core/helpers/web/web.py
@@ -464,53 +464,6 @@ def beautifulsoup(
             log.debug(f"Error parsing beautifulsoup: {e}")
             return False
 
-    user_keywords = [re.compile(r, re.I) for r in ["user", "login", "email"]]
-    pass_keywords = [re.compile(r, re.I) for r in ["pass"]]
-
-    def is_login_page(self, html):
-        """
-        TODO: convert this into an excavate YARA rule
-
-        Determines if the provided HTML content contains a login page.
-
-        This function parses the HTML to search for forms with input fields typically used for
-        authentication. If it identifies password fields or a combination of username and password
-        fields, it returns True.
-
-        Args:
-            html (str): The HTML content to analyze.
-
-        Returns:
-            bool: True if the HTML contains a login page, otherwise False.
-
-        Examples:
-            >>> is_login_page('<form><input type="text" name="username"><input type="password" name="password"></form>')
-            True
-
-            >>> is_login_page('<form><input type="text" name="search"></form>')
-            False
-        """
-        try:
-            soup = BeautifulSoup(html, "html.parser")
-        except Exception as e:
-            log.debug(f"Error parsing html: {e}")
-            return False
-
-        forms = soup.find_all("form")
-
-        # first, check for obvious password fields
-        for form in forms:
-            if form.find_all("input", {"type": "password"}):
-                return True
-
-        # next, check for forms that have both a user-like and password-like field
-        for form in forms:
-            user_fields = sum(bool(form.find_all("input", {"name": r})) for r in self.user_keywords)
-            pass_fields = sum(bool(form.find_all("input", {"name": r})) for r in self.pass_keywords)
-            if user_fields and pass_fields:
-                return True
-        return False
-
     def response_to_json(self, response):
         """
         Convert web response to JSON object, similar to the output of `httpx -irr -json`

From ac983ec788785e6ab2180fbc44f52467e579d795 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Sun, 13 Oct 2024 22:55:58 -0400
Subject: [PATCH 3/7] flaked

---
 bbot/core/helpers/web/web.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bbot/core/helpers/web/web.py b/bbot/core/helpers/web/web.py
index 28c8b5f37..b05b2d798 100644
--- a/bbot/core/helpers/web/web.py
+++ b/bbot/core/helpers/web/web.py
@@ -1,4 +1,3 @@
-import re
 import logging
 import warnings
 from pathlib import Path

From 3fc0ef762dab9e20ce1a6c6aa1bdfbce0488115f Mon Sep 17 00:00:00 2001
From: github-actions
Date: Mon, 14 Oct 2024 00:17:56 -0400
Subject: [PATCH 4/7] fix excavate tests

---
 bbot/modules/internal/excavate.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py
index d07f5feb1..ad34b6efb 100644
--- a/bbot/modules/internal/excavate.py
+++ b/bbot/modules/internal/excavate.py
@@ -875,9 +875,7 @@ async def setup(self):
         yara_rules_combined = "\n".join(self.yara_rules_dict.values())
         try:
             self.info(f"Compiling {len(self.yara_rules_dict):,} YARA rules")
-            for rule_name, rule_content in self.yara_rules_dict.items():
-                self.info(f"Compiling YARA rule [{rule_name}]")
-                self.yara_rules = yara.compile(source=yara_rules_combined)
+            self.yara_rules = yara.compile(source=yara_rules_combined)
         except yara.SyntaxError as e:
             self.debug(yara_rules_combined)
             return False, f"Yara Rules failed to compile with error: [{e}]"

From c59ab925c5bb87daa10837ae5ceaf9de666ba981 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Tue, 15 Oct 2024 00:28:34 -0400
Subject: [PATCH 5/7] fix excavate tests

---
 bbot/core/event/base.py    | 5 -----
 bbot/modules/internetdb.py | 4 +++-
 bbot/scanner/manager.py    | 8 --------
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py
index d5a9552e2..fddd0c424 100644
--- a/bbot/core/event/base.py
+++ b/bbot/core/event/base.py
@@ -440,11 +440,6 @@ def always_emit(self):
         no_host_information = not bool(self.host)
         return self._always_emit or always_emit_tags or no_host_information
 
-    @property
-    def quick_emit(self):
-        no_host_information = not bool(self.host)
-        return self._quick_emit or no_host_information
-
     @property
     def id(self):
         """
diff --git a/bbot/modules/internetdb.py b/bbot/modules/internetdb.py
index 55b613f16..52c5040b2 100644
--- a/bbot/modules/internetdb.py
+++ b/bbot/modules/internetdb.py
@@ -48,6 +48,9 @@ class internetdb(BaseModule):
         "show_open_ports": "Display OPEN_TCP_PORT events in output, even if they didn't lead to an interesting discovery"
     }
 
+    # we get lots of 404s, that's normal
+    _api_failure_abort_threshold = 9999999999
+
     _qsize = 500
 
     base_url = "https://internetdb.shodan.io"
@@ -113,7 +116,6 @@ async def _parse_response(self, data: dict, event, ip):
                 "OPEN_TCP_PORT",
                 parent=event,
                 internal=(not self.show_open_ports),
-                quick=True,
                 context=f'{{module}} queried Shodan\'s InternetDB API for "{query_host}" and found {{event.type}}: {{event.data}}',
             )
         vulns = data.get("vulns", [])
diff --git a/bbot/scanner/manager.py b/bbot/scanner/manager.py
index 720331625..3cb1f5fdf 100644
--- a/bbot/scanner/manager.py
+++ b/bbot/scanner/manager.py
@@ -115,14 +115,6 @@ async def handle_event(self, event, **kwargs):
         # nerf event's priority if it's not in scope
         event.module_priority += event.scope_distance
 
-    async def forward_event(self, event, kwargs):
-        # if a module qualifies for "quick-emit", we skip all the intermediate modules like dns and cloud
-        # and forward it straight to the egress module
-        if event.quick_emit:
-            await self.scan.egress_module.queue_event(event, kwargs)
-        else:
-            await super().forward_event(event, kwargs)
-
     @property
     def non_intercept_modules(self):
         if self._non_intercept_modules is None:

From 5ab09d10df3e22f6698ef1397611b3b34d6d3b90 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Tue, 15 Oct 2024 09:37:42 -0400
Subject: [PATCH 6/7] fix stats tests

---
 bbot/test/test_step_1/test_modules_basic.py | 23 +++++++++++----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/bbot/test/test_step_1/test_modules_basic.py b/bbot/test/test_step_1/test_modules_basic.py
index 9fbaed085..e2f55f8dc 100644
--- a/bbot/test/test_step_1/test_modules_basic.py
+++ b/bbot/test/test_step_1/test_modules_basic.py
@@ -380,13 +380,14 @@ async def handle_event(self, event):
     scan.modules["dummy"] = dummy(scan)
     events = [e async for e in scan.async_start()]
-    assert len(events) == 9
+    assert len(events) == 10
+    for e in events:
+        log.critical(e)
 
     assert 2 == len([e for e in events if e.type == "SCAN"])
-    assert 3 == len([e for e in events if e.type == "DNS_NAME"])
+    assert 4 == len([e for e in events if e.type == "DNS_NAME"])
     # one from target and one from speculate
     assert 2 == len([e for e in events if e.type == "DNS_NAME" and e.data == "evilcorp.com"])
-    # the reason we don't have a DNS_NAME for www.evilcorp.com is because FINDING.quick_emit = True
-    assert 0 == len([e for e in events if e.type == "DNS_NAME" and e.data == "www.evilcorp.com"])
+    assert 1 == len([e for e in events if e.type == "DNS_NAME" and e.data == "www.evilcorp.com"])
     assert 1 == len([e for e in events if e.type == "DNS_NAME" and e.data == "asdf.evilcorp.com"])
     assert 1 == len([e for e in events if e.type == "ORG_STUB" and e.data == "evilcorp"])
     assert 1 == len([e for e in events if e.type == "FINDING"])
@@ -394,7 +395,7 @@ async def handle_event(self, event):
 
     assert scan.stats.events_emitted_by_type == {
         "SCAN": 1,
-        "DNS_NAME": 3,
+        "DNS_NAME": 4,
         "URL": 1,
         "ORG_STUB": 1,
         "URL_UNVERIFIED": 1,
@@ -414,7 +415,7 @@ async def handle_event(self, event):
     assert dummy_stats.produced == {"FINDING": 1, "URL": 1}
     assert dummy_stats.produced_total == 2
     assert dummy_stats.consumed == {
-        "DNS_NAME": 2,
+        "DNS_NAME": 3,
         "FINDING": 1,
         "OPEN_TCP_PORT": 1,
         "ORG_STUB": 1,
@@ -422,26 +423,26 @@ async def handle_event(self, event):
         "URL": 1,
         "URL_UNVERIFIED": 1,
     }
-    assert dummy_stats.consumed_total == 8
+    assert dummy_stats.consumed_total == 9
 
     python_stats = scan.stats.module_stats["python"]
     assert python_stats.produced == {}
     assert python_stats.produced_total == 0
     assert python_stats.consumed == {
-        "DNS_NAME": 3,
+        "DNS_NAME": 4,
         "FINDING": 1,
         "ORG_STUB": 1,
         "SCAN": 1,
         "URL": 1,
         "URL_UNVERIFIED": 1,
     }
-    assert python_stats.consumed_total == 8
+    assert python_stats.consumed_total == 9
 
     speculate_stats = scan.stats.module_stats["speculate"]
     assert speculate_stats.produced == {"DNS_NAME": 1, "URL_UNVERIFIED": 1, "ORG_STUB": 1}
     assert speculate_stats.produced_total == 3
-    assert speculate_stats.consumed == {"URL": 1, "DNS_NAME": 2, "URL_UNVERIFIED": 1, "IP_ADDRESS": 2}
-    assert speculate_stats.consumed_total == 6
+    assert speculate_stats.consumed == {"URL": 1, "DNS_NAME": 3, "URL_UNVERIFIED": 1, "IP_ADDRESS": 3}
+    assert speculate_stats.consumed_total == 8
 
 
 @pytest.mark.asyncio

From e81a549e7a45307b47c256bbfef3df78f373e6aa Mon Sep 17 00:00:00 2001
From: github-actions
Date: Wed, 16 Oct 2024 12:08:10 -0400
Subject: [PATCH 7/7] fix queue bug

---
 bbot/scanner/manager.py | 4 +---
 bbot/scanner/scanner.py | 4 +++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bbot/scanner/manager.py b/bbot/scanner/manager.py
index 3cb1f5fdf..8cbe098a5 100644
--- a/bbot/scanner/manager.py
+++ b/bbot/scanner/manager.py
@@ -15,9 +15,7 @@ class ScanIngress(BaseInterceptModule):
     # accept all events regardless of scope distance
     scope_distance_modifier = None
     _name = "_scan_ingress"
-
-    # small queue size so we don't drain modules' outgoing queues
-    _qsize = 10
+    _qsize = -1
 
     @property
     def priority(self):
diff --git a/bbot/scanner/scanner.py b/bbot/scanner/scanner.py
index 35cbaf220..97550179a 100644
--- a/bbot/scanner/scanner.py
+++ b/bbot/scanner/scanner.py
@@ -288,7 +288,9 @@ async def _prep(self):
                 self.debug(
                     f"Setting intercept module {intercept_module.name}._incoming_event_queue to previous intercept module {prev_intercept_module.name}.outgoing_event_queue"
                 )
-                intercept_module._incoming_event_queue = prev_intercept_module.outgoing_event_queue
+                interqueue = asyncio.Queue()
+                intercept_module._incoming_event_queue = interqueue
+                prev_intercept_module._outgoing_event_queue = interqueue
 
         # abort if there are no output modules
         num_output_modules = len([m for m in self.modules.values() if m._type == "output"])