diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py
index 5ac1cd82a..f26d4666d 100644
--- a/bbot/core/helpers/web.py
+++ b/bbot/core/helpers/web.py
@@ -7,6 +7,7 @@ import traceback
 from pathlib import Path
 from bs4 import BeautifulSoup
+from contextlib import asynccontextmanager
 from httpx._models import Cookies
@@ -216,7 +217,7 @@ async def request(self, *args, **kwargs):
         if client_kwargs:
             client = self.AsyncClient(**client_kwargs)
 
-        try:
+        async with self._acatch(url, raise_error):
             if self.http_debug:
                 logstr = f"Web request: {str(args)}, {str(kwargs)}"
                 log.debug(logstr)
@@ -226,41 +227,6 @@ async def request(self, *args, **kwargs):
                     f"Web response from {url}: {response} (Length: {len(response.content)}) headers: {response.headers}"
                 )
             return response
-        except httpx.PoolTimeout:
-            # this block exists because of this:
-            # https://github.com/encode/httpcore/discussions/783
-            log.verbose(f"PoolTimeout to URL: {url}")
-            self.web_client = self.AsyncClient(persist_cookies=False)
-            return await self.request(*args, **kwargs)
-        except httpx.TimeoutException:
-            log.verbose(f"HTTP timeout to URL: {url}")
-            if raise_error:
-                raise
-        except httpx.ConnectError:
-            log.verbose(f"HTTP connect failed to URL: {url}")
-            if raise_error:
-                raise
-        except httpx.RequestError as e:
-            log.trace(f"Error with request to URL: {url}: {e}")
-            log.trace(traceback.format_exc())
-            if raise_error:
-                raise
-        except ssl.SSLError as e:
-            msg = f"SSL error with request to URL: {url}: {e}"
-            log.trace(msg)
-            log.trace(traceback.format_exc())
-            if raise_error:
-                raise httpx.RequestError(msg)
-        except anyio.EndOfStream as e:
-            msg = f"AnyIO error with request to URL: {url}: {e}"
-            log.trace(msg)
-            log.trace(traceback.format_exc())
-            if raise_error:
-                raise httpx.RequestError(msg)
-        except BaseException as e:
-            log.trace(f"Unhandled exception with request to URL: {url}: {e}")
-            log.trace(traceback.format_exc())
-            raise
 
     async def download(self, url, **kwargs):
         """
@@ -272,9 +238,11 @@ async def download(self, url, **kwargs):
             url (str): The URL of the file to download.
             filename (str, optional): The filename to save the downloaded file as. If not provided, will generate based on URL.
+            max_size (str or int): Maximum filesize as a string ("5MB") or integer in bytes.
             cache_hrs (float, optional): The number of hours to cache the downloaded file. A negative value disables caching. Defaults to -1.
             method (str, optional): The HTTP method to use for the request, defaults to 'GET'.
+            raise_error (bool, optional): Whether to raise exceptions for HTTP connect/timeout errors. Defaults to False.
             **kwargs: Additional keyword arguments to pass to the httpx request.
 
         Returns:
@@ -285,7 +253,15 @@
         """
         success = False
         filename = kwargs.pop("filename", self.parent_helper.cache_filename(url))
+        follow_redirects = kwargs.pop("follow_redirects", True)
+        max_size = kwargs.pop("max_size", None)
+        warn = kwargs.pop("warn", True)
+        raise_error = kwargs.pop("raise_error", False)
+        if max_size is not None:
+            max_size = self.parent_helper.human_to_bytes(max_size)
         cache_hrs = float(kwargs.pop("cache_hrs", -1))
+        total_size = 0
+        chunk_size = 8192
         log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
         if cache_hrs > 0 and self.parent_helper.is_cached(url):
             log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
@@ -293,20 +269,32 @@
         else:
             # kwargs["raise_error"] = True
             # kwargs["stream"] = True
+            kwargs["follow_redirects"] = follow_redirects
             if not "method" in kwargs:
                 kwargs["method"] = "GET"
             try:
-                async with self.AsyncClient().stream(url=url, **kwargs) as response:
+                async with self._acatch(url, raise_error), self.AsyncClient().stream(url=url, **kwargs) as response:
                     status_code = getattr(response, "status_code", 0)
                     log.debug(f"Download result: HTTP {status_code}")
                     if status_code != 0:
                         response.raise_for_status()
                     with open(filename, "wb") as f:
-                        async for chunk in response.aiter_bytes(chunk_size=8192):
+                        agen = response.aiter_bytes(chunk_size=chunk_size)
+                        async for chunk in agen:
+                            if max_size is not None and total_size + chunk_size > max_size:
+                                log.verbose(
+                                    f"Filesize of {url} exceeds {self.parent_helper.bytes_to_human(max_size)}, file will be truncated"
+                                )
+                                await agen.aclose()
+                                break
+                            total_size += chunk_size
                             f.write(chunk)
                 success = True
             except httpx.HTTPError as e:
-                log.warning(f"Failed to download {url}: {e}")
+                log_fn = log.verbose
+                if warn:
+                    log_fn = log.warning
+                log_fn(f"Failed to download {url}: {e}")
                 return
 
         if success:
@@ -574,6 +562,59 @@ def is_spider_danger(self, source_event, url):
                 return True
         return False
 
+    def ssl_context_noverify(self):
+        ssl_context = ssl.create_default_context()
+        ssl_context.check_hostname = False
+        ssl_context.verify_mode = ssl.CERT_NONE
+        ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
+        ssl_context.set_ciphers("ALL:@SECLEVEL=0")
+        ssl_context.options |= 0x4  # Add the OP_LEGACY_SERVER_CONNECT option
+        return ssl_context
+
+    @asynccontextmanager
+    async def _acatch(self, url, raise_error):
+        """
+        Asynchronous context manager to handle various httpx errors during a request.
+
+        Yields:
+            None
+
+        Note:
+            This function is internal and should generally not be used directly.
+            `url` and `raise_error` should be in the same context as this function.
+        """
+        try:
+            yield
+        except httpx.TimeoutException:
+            log.verbose(f"HTTP timeout to URL: {url}")
+            if raise_error:
+                raise
+        except httpx.ConnectError:
+            log.verbose(f"HTTP connect failed to URL: {url}")
+            if raise_error:
+                raise
+        except httpx.RequestError as e:
+            log.trace(f"Error with request to URL: {url}: {e}")
+            log.trace(traceback.format_exc())
+            if raise_error:
+                raise
+        except ssl.SSLError as e:
+            msg = f"SSL error with request to URL: {url}: {e}"
+            log.trace(msg)
+            log.trace(traceback.format_exc())
+            if raise_error:
+                raise httpx.RequestError(msg)
+        except anyio.EndOfStream as e:
+            msg = f"AnyIO error with request to URL: {url}: {e}"
+            log.trace(msg)
+            log.trace(traceback.format_exc())
+            if raise_error:
+                raise httpx.RequestError(msg)
+        except BaseException as e:
+            log.trace(f"Unhandled exception with request to URL: {url}: {e}")
+            log.trace(traceback.format_exc())
+            raise
+
 user_keywords = [re.compile(r, re.I) for r in ["user", "login", "email"]]
 pass_keywords = [re.compile(r, re.I) for r in ["pass"]]
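# ------------------------------------------------------------------------------
# Editor's sketch (not part of the diff): how a module would exercise the new
# helpers above. `self.helpers` is the standard BBOT module helper object; the
# target URL and the surrounding method are hypothetical.
#
#     async def handle_event(self, event):
#         # Cap the download at 5MB (parsed by human_to_bytes()); warn=False
#         # logs failures at verbose instead of warning. Per the docstring,
#         # the return value is the saved path on success, otherwise None.
#         filepath = await self.helpers.download(
#             "https://evilcorp.com/report.pdf", max_size="5MB", warn=False
#         )
#         if filepath:
#             self.info(f"Saved to {filepath}")
#
# ssl_context_noverify() builds a deliberately permissive context (verification
# off, legacy renegotiation allowed, SSLv2/v3 not re-disabled) for probing
# misconfigured TLS servers. httpx accepts an ssl.SSLContext directly:
#
#     import httpx
#     client = httpx.AsyncClient(verify=self.helpers.ssl_context_noverify())
# ------------------------------------------------------------------------------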
+ """ + try: + yield + except httpx.TimeoutException: + log.verbose(f"HTTP timeout to URL: {url}") + if raise_error: + raise + except httpx.ConnectError: + log.verbose(f"HTTP connect failed to URL: {url}") + if raise_error: + raise + except httpx.RequestError as e: + log.trace(f"Error with request to URL: {url}: {e}") + log.trace(traceback.format_exc()) + if raise_error: + raise + except ssl.SSLError as e: + msg = f"SSL error with request to URL: {url}: {e}" + log.trace(msg) + log.trace(traceback.format_exc()) + if raise_error: + raise httpx.RequestError(msg) + except anyio.EndOfStream as e: + msg = f"AnyIO error with request to URL: {url}: {e}" + log.trace(msg) + log.trace(traceback.format_exc()) + if raise_error: + raise httpx.RequestError(msg) + except BaseException as e: + log.trace(f"Unhandled exception with request to URL: {url}: {e}") + log.trace(traceback.format_exc()) + raise + user_keywords = [re.compile(r, re.I) for r in ["user", "login", "email"]] pass_keywords = [re.compile(r, re.I) for r in ["pass"]] diff --git a/bbot/modules/filedownload.py b/bbot/modules/filedownload.py new file mode 100644 index 000000000..8b61dbdd8 --- /dev/null +++ b/bbot/modules/filedownload.py @@ -0,0 +1,165 @@ +import json +from pathlib import Path + +from bbot.modules.base import BaseModule + + +class filedownload(BaseModule): + """ + Watch for common filetypes and download them. + + Capable of identifying interesting files even if the extension is not in the URL. + E.g. if a PDF is being served at https://evilcorp.com/mypdf, it will still be downloaded and given the proper extension. + """ + + watched_events = ["URL_UNVERIFIED", "HTTP_RESPONSE"] + produced_events = [] + flags = ["active", "safe"] + meta = {"description": "Download common filetypes such as PDF, DOCX, PPTX, etc."} + options = { + "extensions": [ + "bak", # Backup File + "bash", # Bash Script or Configuration + "bashrc", # Bash Script or Configuration + "conf", # Configuration File + "cfg", # Configuration File + "crt", # Certificate File + "csv", # Comma Separated Values File + "db", # SQLite Database File + "sqlite", # SQLite Database File + "doc", # Microsoft Word Document (Old Format) + "docx", # Microsoft Word Document + "exe", # Windows PE executable + "ica", # Citrix Independent Computing Architecture File + "indd", # Adobe InDesign Document + "ini", # Initialization File + "jar", # Java Archive + "key", # Private Key File + "pub", # Public Key File + "log", # Log File + "markdown", # Markdown File + "md", # Markdown File + "msi", # Windows setup file + "odg", # OpenDocument Graphics (LibreOffice, OpenOffice) + "odp", # OpenDocument Presentation (LibreOffice, OpenOffice) + "ods", # OpenDocument Spreadsheet (LibreOffice, OpenOffice) + "odt", # OpenDocument Text (LibreOffice, OpenOffice) + "pdf", # Adobe Portable Document Format + "pem", # Privacy Enhanced Mail (SSL certificate) + "png", # Portable Network Graphics Image + "pps", # Microsoft PowerPoint Slideshow (Old Format) + "ppsx", # Microsoft PowerPoint Slideshow + "ppt", # Microsoft PowerPoint Presentation (Old Format) + "pptx", # Microsoft PowerPoint Presentation + "ps1", # PowerShell Script + "raw", # Raw Image File Format + "rdp", # Remote Desktop Protocol File + "sh", # Shell Script + "sql", # SQL Database Dump + "swp", # Swap File (temporary file, often Vim) + "sxw", # OpenOffice.org Writer document + "tar", # Tar Archive + "tar.gz", # Gzip-Compressed Tar Archive + "zip", # Zip Archive + "txt", # Plain Text Document + "vbs", # Visual Basic Script + "wpd", # WordPerfect 
+            "xls",  # Microsoft Excel Spreadsheet (Old Format)
+            "xlsx",  # Microsoft Excel Spreadsheet
+            "xml",  # eXtensible Markup Language File
+            "yml",  # YAML Ain't Markup Language
+            "yaml",  # YAML Ain't Markup Language
+        ],
+        "max_filesize": "10MB",
+    }
+    options_desc = {
+        "extensions": "File extensions to download",
+        "max_filesize": "Cancel download if filesize is greater than this size",
+    }
+
+    scope_distance_modifier = 1
+
+    async def setup(self):
+        self.extensions = list(set([e.lower().strip(".") for e in self.options.get("extensions", [])]))
+        self.max_filesize = self.options.get("max_filesize", "10MB")
+        self.download_dir = self.scan.home / "filedownload"
+        self.helpers.mkdir(self.download_dir)
+        self.files_downloaded = set()
+        self.mime_db_file = await self.helpers.wordlist(
+            "https://raw.githubusercontent.com/jshttp/mime-db/master/db.json"
+        )
+        self.mime_db = {}
+        with open(self.mime_db_file) as f:
+            mime_db = json.load(f)
+            for content_type, attrs in mime_db.items():
+                if "extensions" in attrs and attrs["extensions"]:
+                    self.mime_db[content_type] = attrs["extensions"][0].lower()
+        return True
+
+    async def filter_event(self, event):
+        # accept file download requests from other modules
+        if "filedownload" in event.tags:
+            return True
+        h = self.hash_event(event)
+        if h in self.files_downloaded:
+            return False, f"Already processed {event}"
+        return True
+
+    def hash_event(self, event):
+        if event.type == "HTTP_RESPONSE":
+            return hash(event.data["url"])
+        return hash(event.data)
+
+    async def handle_event(self, event):
+        if event.type == "URL_UNVERIFIED":
+            url_lower = event.data.lower()
+            if any(url_lower.endswith(f".{e}") for e in self.extensions):
+                await self.download_file(event.data)
+        elif event.type == "HTTP_RESPONSE":
+            content_type = event.data["header"].get("content_type", "")
+            if content_type:
+                url = event.data["url"]
+                await self.download_file(url, content_type=content_type)
+
+    async def download_file(self, url, content_type=None):
+        orig_filename, file_destination, base_url = self.make_filename(url, content_type=content_type)
+        if orig_filename is None:
+            return
+        result = await self.helpers.download(url, warn=False, filename=file_destination, max_size=self.max_filesize)
+        if result:
+            self.info(f'Found "{orig_filename}" at "{base_url}", downloaded to {file_destination}')
+            self.files_downloaded.add(hash(url))
+
+    def make_filename(self, url, content_type=None):
+        # first, try to determine original filename
+        parsed_url = self.helpers.urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        url_path = parsed_url.path.strip("/")
+        # try to get extension from URL path
+        extension = Path(url_path).suffix.strip(".").lower()
+        if extension:
+            url_stem = url.rsplit(".", 1)[0]
+        else:
+            url_stem = str(url)
+        filename = f"{self.helpers.make_date()}_{self.helpers.tagify(url_stem)}"
+        if not url_path:
+            url_path = "unknown"
+            filename = f"{filename}-{url_path}"
+        # if that fails, try to get it from content type
+        if not extension:
+            if content_type and content_type in self.mime_db:
+                extension = self.mime_db[content_type]
+
+        if (not extension) or (extension not in self.extensions):
+            self.debug(f'Extension "{extension}" at url "{url}" not in list of watched extensions.')
+            return None, None, None
+
+        orig_filename = Path(url_path).stem
+        if extension:
+            filename = f"{filename}.{extension}"
+            orig_filename = f"{orig_filename}.{extension}"
+        return orig_filename, self.download_dir / filename, base_url
+
+    async def report(self):
+        if self.files_downloaded:
+            self.success(f"Downloaded {len(self.files_downloaded):,} file(s) to {self.download_dir}")
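# ------------------------------------------------------------------------------
# Editor's sketch (not part of the diff): the content-type fallback that lets
# filedownload name extensionless URLs, reduced to its core. Runnable on its
# own, assuming db.json is a local copy of the jshttp/mime-db dump that setup()
# fetches via self.helpers.wordlist().
import json

with open("db.json") as f:
    # keep only the first extension for each content type, as setup() does
    mime_db = {
        content_type: attrs["extensions"][0].lower()
        for content_type, attrs in json.load(f).items()
        if attrs.get("extensions")
    }

# a PDF served at https://evilcorp.com/mypdf carries no extension in the URL,
# so make_filename() falls back to the response's Content-Type header
print(mime_db.get("application/pdf"))  # -> "pdf"
# ------------------------------------------------------------------------------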
diff --git a/bbot/modules/sslcert.py b/bbot/modules/sslcert.py
index a9f269c55..de598dd73 100644
--- a/bbot/modules/sslcert.py
+++ b/bbot/modules/sslcert.py
@@ -1,4 +1,3 @@
-import ssl
 import asyncio
 from OpenSSL import crypto
 from contextlib import suppress
@@ -109,12 +108,7 @@ async def visit_host(self, host, port):
 
         # Create an SSL context
         try:
-            ssl_context = ssl.create_default_context()
-            ssl_context.check_hostname = False
-            ssl_context.verify_mode = ssl.CERT_NONE
-            ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
-            ssl_context.set_ciphers("ALL:@SECLEVEL=0")
-            ssl_context.options |= 0x4  # Add the OP_LEGACY_SERVER_CONNECT option
+            ssl_context = self.helpers.ssl_context_noverify()
         except Exception as e:
            self.warning(f"Error creating SSL context: {e}")
            return [], [], (host, port)
diff --git a/bbot/test/test_step_2/module_tests/test_module_filedownload.py b/bbot/test/test_step_2/module_tests/test_module_filedownload.py
new file mode 100644
index 000000000..e4471d159
--- /dev/null
+++ b/bbot/test/test_step_2/module_tests/test_module_filedownload.py
@@ -0,0 +1,68 @@
+from .base import ModuleTestBase
+
+
+class TestFileDownload(ModuleTestBase):
+    targets = ["http://127.0.0.1:8888"]
+    modules_overrides = ["filedownload", "httpx", "excavate", "speculate"]
+    config_overrides = {"web_spider_distance": 2, "web_spider_depth": 2}
+
+    pdf_data = """%PDF-1.
+1 0 obj<</Pages 2 0 R>>endobj
+2 0 obj<</Kids[3 0 R]/Count 1>>endobj
+3 0 obj<</MediaBox[0 0 3 3]>>endobj
+trailer<</Root 1 0 R>>"""
+
+    async def setup_before_prep(self, module_test):
+        module_test.httpx_mock.add_response(
+            url="https://raw.githubusercontent.com/jshttp/mime-db/master/db.json",
+            json={
+                "application/pdf": {"source": "iana", "compressible": False, "extensions": ["pdf"]},
+            },
+        )
+
+    async def setup_after_prep(self, module_test):
+        module_test.set_expect_requests(
+            dict(uri="/"),
+            dict(
+                response_data='<a href="/Test_File.txt"/><a href="/Test_PDF"/><a href="/test.html"/><a href="/test2"/>'
+            ),
+        )
+        module_test.set_expect_requests(
+            dict(uri="/Test_File.txt"),
+            dict(
+                response_data="juicy stuff",
+            ),
+        )
+        module_test.set_expect_requests(
+            dict(uri="/Test_PDF"),
+            dict(response_data=self.pdf_data, headers={"Content-Type": "application/pdf"}),
+        )
+        module_test.set_expect_requests(
+            dict(uri="/test.html"),
+            dict(response_data="", headers={"Content-Type": "text/html"}),
+        )
+        module_test.set_expect_requests(
+            dict(uri="/test2"),
+            dict(response_data="", headers={"Content-Type": "text/html"}),
+        )
+
+    def check(self, module_test, events):
+        download_dir = module_test.scan.home / "filedownload"
+
+        # text file
+        text_files = list(download_dir.glob("*test-file.txt"))
+        assert len(text_files) == 1, f"No text file found at {download_dir}"
+        file = text_files[0]
+        assert file.is_file(), f"File not found at {file}"
+        assert open(file).read() == "juicy stuff", f"File at {file} does not contain the correct content"
+
+        # PDF file (no extension)
+        pdf_files = list(download_dir.glob("*test-pdf.pdf"))
+        assert len(pdf_files) == 1, f"No PDF file found at {download_dir}"
+        file = pdf_files[0]
+        assert file.is_file(), f"File not found at {file}"
+        assert open(file).read() == self.pdf_data, f"File at {file} does not contain the correct content"
+
+        # we don't want html files
+        html_files = list(download_dir.glob("*.html"))
+        assert len(html_files) == 0, "HTML files were erroneously downloaded"
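# ------------------------------------------------------------------------------
# Editor's sketch (not part of the diff): why the test globs rather than
# asserting exact filenames. make_filename() builds
# "{make_date()}_{tagify(url_stem)}.{ext}", so the exact name depends on scan
# start time; the test matches on the stable tail of the name instead. The
# tagify()/make_date() stand-ins below are approximations, not the real
# BBOT helpers.
import re
from datetime import datetime

def tagify(s: str) -> str:
    # approximate: lowercase, collapse runs of non-alphanumerics to hyphens
    return re.sub(r"[^a-z0-9]+", "-", s.lower()).strip("-")

date = datetime.now().strftime("%Y%m%d_%H%M_%S")  # assumed make_date() format
print(f"{date}_{tagify('http://127.0.0.1:8888/Test_File')}.txt")
# -> e.g. 20230101_1200_00_http-127-0-0-1-8888-test-file.txt
#    which download_dir.glob("*test-file.txt") matches
# ------------------------------------------------------------------------------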