diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py
index 5ac1cd82a..f26d4666d 100644
--- a/bbot/core/helpers/web.py
+++ b/bbot/core/helpers/web.py
@@ -7,6 +7,7 @@
import traceback
from pathlib import Path
from bs4 import BeautifulSoup
+from contextlib import asynccontextmanager
from httpx._models import Cookies
@@ -216,7 +217,7 @@ async def request(self, *args, **kwargs):
if client_kwargs:
client = self.AsyncClient(**client_kwargs)
- try:
+ async with self._acatch(url, raise_error):
if self.http_debug:
logstr = f"Web request: {str(args)}, {str(kwargs)}"
log.debug(logstr)
@@ -226,41 +227,6 @@ async def request(self, *args, **kwargs):
f"Web response from {url}: {response} (Length: {len(response.content)}) headers: {response.headers}"
)
return response
- except httpx.PoolTimeout:
- # this block exists because of this:
- # https://github.com/encode/httpcore/discussions/783
- log.verbose(f"PoolTimeout to URL: {url}")
- self.web_client = self.AsyncClient(persist_cookies=False)
- return await self.request(*args, **kwargs)
- except httpx.TimeoutException:
- log.verbose(f"HTTP timeout to URL: {url}")
- if raise_error:
- raise
- except httpx.ConnectError:
- log.verbose(f"HTTP connect failed to URL: {url}")
- if raise_error:
- raise
- except httpx.RequestError as e:
- log.trace(f"Error with request to URL: {url}: {e}")
- log.trace(traceback.format_exc())
- if raise_error:
- raise
- except ssl.SSLError as e:
- msg = f"SSL error with request to URL: {url}: {e}"
- log.trace(msg)
- log.trace(traceback.format_exc())
- if raise_error:
- raise httpx.RequestError(msg)
- except anyio.EndOfStream as e:
- msg = f"AnyIO error with request to URL: {url}: {e}"
- log.trace(msg)
- log.trace(traceback.format_exc())
- if raise_error:
- raise httpx.RequestError(msg)
- except BaseException as e:
- log.trace(f"Unhandled exception with request to URL: {url}: {e}")
- log.trace(traceback.format_exc())
- raise
async def download(self, url, **kwargs):
"""
@@ -272,9 +238,11 @@ async def download(self, url, **kwargs):
url (str): The URL of the file to download.
filename (str, optional): The filename to save the downloaded file as.
If not provided, will generate based on URL.
+ max_size (str or int, optional): Maximum filesize as a string ("5MB") or integer in bytes.
cache_hrs (float, optional): The number of hours to cache the downloaded file.
A negative value disables caching. Defaults to -1.
method (str, optional): The HTTP method to use for the request, defaults to 'GET'.
+ raise_error (bool, optional): Whether to raise exceptions on HTTP connect or timeout errors. Defaults to False.
**kwargs: Additional keyword arguments to pass to the httpx request.
Returns:
@@ -285,7 +253,15 @@ async def download(self, url, **kwargs):
"""
success = False
filename = kwargs.pop("filename", self.parent_helper.cache_filename(url))
+ follow_redirects = kwargs.pop("follow_redirects", True)
+ max_size = kwargs.pop("max_size", None)
+ warn = kwargs.pop("warn", True)
+ raise_error = kwargs.pop("raise_error", False)
+ if max_size is not None:
+ max_size = self.parent_helper.human_to_bytes(max_size)
cache_hrs = float(kwargs.pop("cache_hrs", -1))
+ total_size = 0
+ chunk_size = 8192
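+ # illustrative values: max_size may be a human-readable string like "5MB" or an integer
+ # byte count like 5000000; human_to_bytes() above normalizes either form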
log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
if cache_hrs > 0 and self.parent_helper.is_cached(url):
log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
@@ -293,20 +269,32 @@ async def download(self, url, **kwargs):
else:
# kwargs["raise_error"] = True
# kwargs["stream"] = True
+ kwargs["follow_redirects"] = follow_redirects
if not "method" in kwargs:
kwargs["method"] = "GET"
try:
- async with self.AsyncClient().stream(url=url, **kwargs) as response:
+ async with self._acatch(url, raise_error), self.AsyncClient().stream(url=url, **kwargs) as response:
status_code = getattr(response, "status_code", 0)
log.debug(f"Download result: HTTP {status_code}")
if status_code != 0:
response.raise_for_status()
with open(filename, "wb") as f:
- async for chunk in response.aiter_bytes(chunk_size=8192):
+ agen = response.aiter_bytes(chunk_size=chunk_size)
+ async for chunk in agen:
+ # size is tracked in nominal chunk_size increments, so truncation errs on the early side
+ if max_size is not None and total_size + chunk_size > max_size:
+ log.verbose(
+ f"Filesize of {url} exceeds {self.parent_helper.bytes_to_human(max_size)}, file will be truncated"
+ )
+ await agen.aclose()
+ break
+ total_size += chunk_size
f.write(chunk)
success = True
except httpx.HTTPError as e:
- log.warning(f"Failed to download {url}: {e}")
+ log_fn = log.verbose
+ if warn:
+ log_fn = log.warning
+ log_fn(f"Failed to download {url}: {e}")
return
if success:
@@ -574,6 +562,59 @@ def is_spider_danger(self, source_event, url):
return True
return False
+ def ssl_context_noverify(self):
+ ssl_context = ssl.create_default_context()
+ ssl_context.check_hostname = False
+ ssl_context.verify_mode = ssl.CERT_NONE
+ ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
+ ssl_context.set_ciphers("ALL:@SECLEVEL=0")
+ ssl_context.options |= 0x4 # Add the OP_LEGACY_SERVER_CONNECT option
+ return ssl_context
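+
+ # Illustrative usage (a sketch, not part of this diff): httpx accepts an ssl.SSLContext
+ # via its `verify` argument, e.g. httpx.AsyncClient(verify=self.ssl_context_noverify());
+ # the sslcert module below consumes this helper as self.helpers.ssl_context_noverify().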
+
+ @asynccontextmanager
+ async def _acatch(self, url, raise_error):
+ """
+ Asynchronous context manager to handle various httpx errors during a request.
+
+ Yields:
+ None
+
+ Note:
+ This function is internal and should generally not be used directly.
+ `url` and `raise_error` should be passed in from the surrounding request's context.
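+
+ Example (illustrative sketch):
+ >>> async with self._acatch(url, raise_error=True):
+ ...     response = await self.web_client.get(url)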
+ """
+ try:
+ yield
+ except httpx.TimeoutException:
+ log.verbose(f"HTTP timeout to URL: {url}")
+ if raise_error:
+ raise
+ except httpx.ConnectError:
+ log.verbose(f"HTTP connect failed to URL: {url}")
+ if raise_error:
+ raise
+ except httpx.RequestError as e:
+ log.trace(f"Error with request to URL: {url}: {e}")
+ log.trace(traceback.format_exc())
+ if raise_error:
+ raise
+ except ssl.SSLError as e:
+ msg = f"SSL error with request to URL: {url}: {e}"
+ log.trace(msg)
+ log.trace(traceback.format_exc())
+ if raise_error:
+ raise httpx.RequestError(msg)
+ except anyio.EndOfStream as e:
+ msg = f"AnyIO error with request to URL: {url}: {e}"
+ log.trace(msg)
+ log.trace(traceback.format_exc())
+ if raise_error:
+ raise httpx.RequestError(msg)
+ except BaseException as e:
+ log.trace(f"Unhandled exception with request to URL: {url}: {e}")
+ log.trace(traceback.format_exc())
+ raise
+
user_keywords = [re.compile(r, re.I) for r in ["user", "login", "email"]]
pass_keywords = [re.compile(r, re.I) for r in ["pass"]]
diff --git a/bbot/modules/filedownload.py b/bbot/modules/filedownload.py
new file mode 100644
index 000000000..8b61dbdd8
--- /dev/null
+++ b/bbot/modules/filedownload.py
@@ -0,0 +1,165 @@
+import json
+from pathlib import Path
+
+from bbot.modules.base import BaseModule
+
+
+class filedownload(BaseModule):
+ """
+ Watch for common filetypes and download them.
+
+ Capable of identifying interesting files even if the extension is not in the URL.
+ E.g. if a PDF is being served at https://evilcorp.com/mypdf, it will still be downloaded and given the proper extension.
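+
+ Illustrative usage (hypothetical target):
+ bbot -t evilcorp.com -m filedownload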
+ """
+
+ watched_events = ["URL_UNVERIFIED", "HTTP_RESPONSE"]
+ produced_events = []
+ flags = ["active", "safe"]
+ meta = {"description": "Download common filetypes such as PDF, DOCX, PPTX, etc."}
+ options = {
+ "extensions": [
+ "bak", # Backup File
+ "bash", # Bash Script or Configuration
+ "bashrc", # Bash Script or Configuration
+ "conf", # Configuration File
+ "cfg", # Configuration File
+ "crt", # Certificate File
+ "csv", # Comma Separated Values File
+ "db", # SQLite Database File
+ "sqlite", # SQLite Database File
+ "doc", # Microsoft Word Document (Old Format)
+ "docx", # Microsoft Word Document
+ "exe", # Windows PE executable
+ "ica", # Citrix Independent Computing Architecture File
+ "indd", # Adobe InDesign Document
+ "ini", # Initialization File
+ "jar", # Java Archive
+ "key", # Private Key File
+ "pub", # Public Key File
+ "log", # Log File
+ "markdown", # Markdown File
+ "md", # Markdown File
+ "msi", # Windows setup file
+ "odg", # OpenDocument Graphics (LibreOffice, OpenOffice)
+ "odp", # OpenDocument Presentation (LibreOffice, OpenOffice)
+ "ods", # OpenDocument Spreadsheet (LibreOffice, OpenOffice)
+ "odt", # OpenDocument Text (LibreOffice, OpenOffice)
+ "pdf", # Adobe Portable Document Format
+ "pem", # Privacy Enhanced Mail (SSL certificate)
+ "png", # Portable Network Graphics Image
+ "pps", # Microsoft PowerPoint Slideshow (Old Format)
+ "ppsx", # Microsoft PowerPoint Slideshow
+ "ppt", # Microsoft PowerPoint Presentation (Old Format)
+ "pptx", # Microsoft PowerPoint Presentation
+ "ps1", # PowerShell Script
+ "raw", # Raw Image File Format
+ "rdp", # Remote Desktop Protocol File
+ "sh", # Shell Script
+ "sql", # SQL Database Dump
+ "swp", # Swap File (temporary file, often Vim)
+ "sxw", # OpenOffice.org Writer document
+ "tar", # Tar Archive
+ "tar.gz", # Gzip-Compressed Tar Archive
+ "zip", # Zip Archive
+ "txt", # Plain Text Document
+ "vbs", # Visual Basic Script
+ "wpd", # WordPerfect Document
+ "xls", # Microsoft Excel Spreadsheet (Old Format)
+ "xlsx", # Microsoft Excel Spreadsheet
+ "xml", # eXtensible Markup Language File
+ "yml", # YAML Ain't Markup Language
+ "yaml", # YAML Ain't Markup Language
+ ],
+ "max_filesize": "10MB",
+ }
+ options_desc = {
+ "extensions": "File extensions to download",
+ "max_filesize": "Cancel download if filesize is greater than this size",
+ }
+
+ scope_distance_modifier = 1
+
+ async def setup(self):
+ self.extensions = list(set([e.lower().strip(".") for e in self.options.get("extensions", [])]))
+ self.max_filesize = self.options.get("max_filesize", "10MB")
+ self.download_dir = self.scan.home / "filedownload"
+ self.helpers.mkdir(self.download_dir)
+ self.files_downloaded = set()
+ self.mime_db_file = await self.helpers.wordlist(
+ "https://raw.githubusercontent.com/jshttp/mime-db/master/db.json"
+ )
+ self.mime_db = {}
+ with open(self.mime_db_file) as f:
+ mime_db = json.load(f)
+ for content_type, attrs in mime_db.items():
+ if "extensions" in attrs and attrs["extensions"]:
+ self.mime_db[content_type] = attrs["extensions"][0].lower()
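+ # e.g. maps "application/pdf" -> "pdf" (the first extension listed in mime-db; illustrative)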
+ return True
+
+ async def filter_event(self, event):
+ # accept file download requests from other modules
+ if "filedownload" in event.tags:
+ return True
+ h = self.hash_event(event)
+ if h in self.files_downloaded:
+ return False, f"Already processed {event}"
+ return True
+
+ def hash_event(self, event):
+ if event.type == "HTTP_RESPONSE":
+ return hash(event.data["url"])
+ return hash(event.data)
+
+ async def handle_event(self, event):
+ if event.type == "URL_UNVERIFIED":
+ url_lower = event.data.lower()
+ if any(url_lower.endswith(f".{e}") for e in self.extensions):
+ await self.download_file(event.data)
+ elif event.type == "HTTP_RESPONSE":
+ content_type = event.data["header"].get("content_type", "")
+ if content_type:
+ url = event.data["url"]
+ await self.download_file(url, content_type=content_type)
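+ # e.g. (illustrative): a URL_UNVERIFIED like "https://evilcorp.com/report.pdf" matches by
+ # extension, while an HTTP_RESPONSE with Content-Type "application/pdf" matches by MIME type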
+
+ async def download_file(self, url, content_type=None):
+ orig_filename, file_destination, base_url = self.make_filename(url, content_type=content_type)
+ if orig_filename is None:
+ return
+ result = await self.helpers.download(url, warn=False, filename=file_destination, max_size=self.max_filesize)
+ if result:
+ self.info(f'Found "{orig_filename}" at "{base_url}", downloaded to {file_destination}')
+ self.files_downloaded.add(hash(url))
+
+ def make_filename(self, url, content_type=None):
+ # first, try to determine original filename
+ parsed_url = self.helpers.urlparse(url)
+ base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+ url_path = parsed_url.path.strip("/")
+ # try to get extension from URL path
+ extension = Path(url_path).suffix.strip(".").lower()
+ if extension:
+ url_stem = url.rsplit(".", 1)[0]
+ else:
+ url_stem = str(url)
+ filename = f"{self.helpers.make_date()}_{self.helpers.tagify(url_stem)}"
+ if not url_path:
+ url_path = "unknown"
+ filename = f"{filename}-{url_path}"
+ # if that fails, try to get it from content type
+ if not extension:
+ if content_type and content_type in self.mime_db:
+ extension = self.mime_db[content_type]
+
+ if (not extension) or (extension not in self.extensions):
+ self.debug(f'Extension "{extension}" at url "{url}" not in list of watched extensions.')
+ return None, None, None
+
+ orig_filename = Path(url_path).stem
+ if extension:
+ filename = f"{filename}.{extension}"
+ orig_filename = f"{orig_filename}.{extension}"
+ return orig_filename, self.download_dir / filename, base_url
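+ # e.g. (illustrative, hypothetical values): "http://evilcorp.com/Test_File.txt" yields
+ # orig_filename "Test_File.txt" and a dated destination like
+ # "<date>_http-evilcorp-com-test-file.txt" under self.download_dir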
+
+ async def report(self):
+ if self.files_downloaded:
+ self.success(f"Downloaded {len(self.files_downloaded):,} file(s) to {self.download_dir}")
diff --git a/bbot/modules/sslcert.py b/bbot/modules/sslcert.py
index a9f269c55..de598dd73 100644
--- a/bbot/modules/sslcert.py
+++ b/bbot/modules/sslcert.py
@@ -1,4 +1,3 @@
-import ssl
import asyncio
from OpenSSL import crypto
from contextlib import suppress
@@ -109,12 +108,7 @@ async def visit_host(self, host, port):
# Create an SSL context
try:
- ssl_context = ssl.create_default_context()
- ssl_context.check_hostname = False
- ssl_context.verify_mode = ssl.CERT_NONE
- ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
- ssl_context.set_ciphers("ALL:@SECLEVEL=0")
- ssl_context.options |= 0x4 # Add the OP_LEGACY_SERVER_CONNECT option
+ ssl_context = self.helpers.ssl_context_noverify()
except Exception as e:
self.warning(f"Error creating SSL context: {e}")
return [], [], (host, port)
diff --git a/bbot/test/test_step_2/module_tests/test_module_filedownload.py b/bbot/test/test_step_2/module_tests/test_module_filedownload.py
new file mode 100644
index 000000000..e4471d159
--- /dev/null
+++ b/bbot/test/test_step_2/module_tests/test_module_filedownload.py
@@ -0,0 +1,68 @@
+from .base import ModuleTestBase
+
+
+class TestFileDownload(ModuleTestBase):
+ targets = ["http://127.0.0.1:8888"]
+ modules_overrides = ["filedownload", "httpx", "excavate", "speculate"]
+ config_overrides = {"web_spider_distance": 2, "web_spider_depth": 2}
+
+ pdf_data = """%PDF-1.
+1 0 obj<>endobj
+2 0 obj<>endobj
+3 0 obj<>endobj
+trailer <>"""
+
+ async def setup_before_prep(self, module_test):
+ module_test.httpx_mock.add_response(
+ url="https://raw.githubusercontent.com/jshttp/mime-db/master/db.json",
+ json={
+ "application/pdf": {"source": "iana", "compressible": False, "extensions": ["pdf"]},
+ },
+ )
+
+ async def setup_after_prep(self, module_test):
+ module_test.set_expect_requests(
+ dict(uri="/"),
+ dict(
+ response_data='<a href="/Test_File.txt"/><a href="/Test_PDF"/><a href="/test.html"/><a href="/test2"/>'
+ ),
+ )
+ module_test.set_expect_requests(
+ dict(uri="/Test_File.txt"),
+ dict(
+ response_data="juicy stuff",
+ ),
+ )
+ module_test.set_expect_requests(
+ dict(uri="/Test_PDF"),
+ dict(response_data=self.pdf_data, headers={"Content-Type": "application/pdf"}),
+ )
+ module_test.set_expect_requests(
+ dict(uri="/test.html"),
+ dict(response_data="", headers={"Content-Type": "text/html"}),
+ )
+ module_test.set_expect_requests(
+ dict(uri="/test2"),
+ dict(response_data="", headers={"Content-Type": "text/html"}),
+ )
+
+ def check(self, module_test, events):
+ download_dir = module_test.scan.home / "filedownload"
+
+ # text file
+ text_files = list(download_dir.glob("*test-file.txt"))
+ assert len(text_files) == 1, f"No text file found at {download_dir}"
+ file = text_files[0]
+ assert file.is_file(), f"File not found at {file}"
+ assert open(file).read() == "juicy stuff", f"File at {file} does not contain the correct content"
+
+ # PDF file (no extension)
+ pdf_files = list(download_dir.glob("*test-pdf.pdf"))
+ assert len(pdf_files) == 1, f"No PDF file found at {download_dir}"
+ file = pdf_files[0]
+ assert file.is_file(), f"File not found at {file}"
+ assert open(file).read() == self.pdf_data, f"File at {file} does not contain the correct content"
+
+ # we don't want html files
+ html_files = list(download_dir.glob("*.html"))
+ assert len(html_files) == 0, "HTML files were erroneously downloaded"