Merge pull request #774 from blacklanternsecurity/filedownload-module-2
Filedownload Module
TheTechromancer authored Oct 13, 2023
2 parents 09f492c + 70c00c5 commit e4f2617
Showing 4 changed files with 314 additions and 46 deletions.
119 changes: 80 additions & 39 deletions bbot/core/helpers/web.py
@@ -7,6 +7,7 @@
import traceback
from pathlib import Path
from bs4 import BeautifulSoup
from contextlib import asynccontextmanager

from httpx._models import Cookies

@@ -216,7 +217,7 @@ async def request(self, *args, **kwargs):
if client_kwargs:
client = self.AsyncClient(**client_kwargs)

try:
async with self._acatch(url, raise_error):
if self.http_debug:
logstr = f"Web request: {str(args)}, {str(kwargs)}"
log.debug(logstr)
@@ -226,41 +227,6 @@
f"Web response from {url}: {response} (Length: {len(response.content)}) headers: {response.headers}"
)
return response
except httpx.PoolTimeout:
# this block exists because of this:
# https://github.com/encode/httpcore/discussions/783
log.verbose(f"PoolTimeout to URL: {url}")
self.web_client = self.AsyncClient(persist_cookies=False)
return await self.request(*args, **kwargs)
except httpx.TimeoutException:
log.verbose(f"HTTP timeout to URL: {url}")
if raise_error:
raise
except httpx.ConnectError:
log.verbose(f"HTTP connect failed to URL: {url}")
if raise_error:
raise
except httpx.RequestError as e:
log.trace(f"Error with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
if raise_error:
raise
except ssl.SSLError as e:
msg = f"SSL error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except anyio.EndOfStream as e:
msg = f"AnyIO error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except BaseException as e:
log.trace(f"Unhandled exception with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
raise

async def download(self, url, **kwargs):
"""
@@ -272,9 +238,11 @@ async def download(self, url, **kwargs):
url (str): The URL of the file to download.
filename (str, optional): The filename to save the downloaded file as.
If not provided, will generate based on URL.
max_size (str or int): Maximum filesize as a string ("5MB") or integer in bytes.
cache_hrs (float, optional): The number of hours to cache the downloaded file.
A negative value disables caching. Defaults to -1.
method (str, optional): The HTTP method to use for the request, defaults to 'GET'.
raise_error (bool, optional): Whether to raise exceptions for HTTP connect/timeout errors. Defaults to False.
**kwargs: Additional keyword arguments to pass to the httpx request.
Returns:
@@ -285,28 +253,48 @@
"""
success = False
filename = kwargs.pop("filename", self.parent_helper.cache_filename(url))
follow_redirects = kwargs.pop("follow_redirects", True)
max_size = kwargs.pop("max_size", None)
warn = kwargs.pop("warn", True)
raise_error = kwargs.pop("raise_error", False)
if max_size is not None:
max_size = self.parent_helper.human_to_bytes(max_size)
cache_hrs = float(kwargs.pop("cache_hrs", -1))
total_size = 0
chunk_size = 8192
log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
if cache_hrs > 0 and self.parent_helper.is_cached(url):
log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
success = True
else:
# kwargs["raise_error"] = True
# kwargs["stream"] = True
kwargs["follow_redirects"] = follow_redirects
if not "method" in kwargs:
kwargs["method"] = "GET"
try:
async with self.AsyncClient().stream(url=url, **kwargs) as response:
async with self._acatch(url, raise_error), self.AsyncClient().stream(url=url, **kwargs) as response:
status_code = getattr(response, "status_code", 0)
log.debug(f"Download result: HTTP {status_code}")
if status_code != 0:
response.raise_for_status()
with open(filename, "wb") as f:
async for chunk in response.aiter_bytes(chunk_size=8192):
agen = response.aiter_bytes(chunk_size=chunk_size)
async for chunk in agen:
if max_size is not None and total_size + chunk_size > max_size:
log.verbose(
f"Filesize of {url} exceeds {self.parent_helper.bytes_to_human(max_size)}, file will be truncated"
)
await agen.aclose()
break
total_size += chunk_size
f.write(chunk)
success = True
except httpx.HTTPError as e:
log.warning(f"Failed to download {url}: {e}")
log_fn = log.verbose
if warn:
log_fn = log.warning
log_fn(f"Failed to download {url}: {e}")
return

if success:
@@ -574,6 +562,59 @@ def is_spider_danger(self, source_event, url):
return True
return False
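
For reference, a minimal sketch of how a module can call the reworked download helper with the new max_size and raise_error options documented above. The URL and surrounding module context are hypothetical; the call shape mirrors the one the filedownload module makes further down in this diff.

# Hedged example, not part of this commit: `self.helpers` is the usual BBOT module
# helper handle, and the URL is a placeholder.
url = "https://evilcorp.com/report.pdf"
result = await self.helpers.download(
    url,
    max_size="5MB",      # truncate anything larger than 5MB
    cache_hrs=24,        # reuse a cached copy for up to a day
    warn=False,          # log failures at verbose instead of warning
    raise_error=False,   # swallow connect/timeout errors rather than raising
)
if result:
    self.info(f"Downloaded {url}")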

def ssl_context_noverify(self):
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
ssl_context.set_ciphers("ALL:@SECLEVEL=0")
ssl_context.options |= 0x4 # Add the OP_LEGACY_SERVER_CONNECT option
return ssl_context
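
A brief, hedged illustration of where a context like this gets used: sslcert.py below swaps its inline context construction for this helper before opening TLS connections. The connection call here is an assumption for illustration only, since the surrounding sslcert code is collapsed in this diff.

# Sketch only: open a raw TLS connection with the permissive context, as
# sslcert-style certificate grabbing requires. Host and port are placeholders.
ctx = self.helpers.ssl_context_noverify()
reader, writer = await asyncio.open_connection("evilcorp.com", 443, ssl=ctx)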

@asynccontextmanager
async def _acatch(self, url, raise_error):
"""
Asynchronous context manager to handle various httpx errors during a request.
Yields:
None
Note:
This function is internal and should generally not be used directly.
`url`, `args`, `kwargs`, and `raise_error` should be in the same context as this function.
"""
try:
yield
except httpx.TimeoutException:
log.verbose(f"HTTP timeout to URL: {url}")
if raise_error:
raise
except httpx.ConnectError:
log.verbose(f"HTTP connect failed to URL: {url}")
if raise_error:
raise
except httpx.RequestError as e:
log.trace(f"Error with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
if raise_error:
raise
except ssl.SSLError as e:
msg = f"SSL error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except anyio.EndOfStream as e:
msg = f"AnyIO error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except BaseException as e:
log.trace(f"Unhandled exception with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
raise
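
The request() and download() hunks above already show the intended call sites; condensed, the pattern is simply to wrap any httpx call in this context manager (the client and URL names below are placeholders).

# Condensed pattern, placeholders only: all httpx error handling now lives in _acatch.
async with self._acatch(url, raise_error):
    response = await client.get(url)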


user_keywords = [re.compile(r, re.I) for r in ["user", "login", "email"]]
pass_keywords = [re.compile(r, re.I) for r in ["pass"]]
165 changes: 165 additions & 0 deletions bbot/modules/filedownload.py
@@ -0,0 +1,165 @@
import json
from pathlib import Path

from bbot.modules.base import BaseModule


class filedownload(BaseModule):
"""
Watch for common filetypes and download them.
Capable of identifying interesting files even if the extension is not in the URL.
E.g. if a PDF is being served at https://evilcorp.com/mypdf, it will still be downloaded and given the proper extension.
"""

watched_events = ["URL_UNVERIFIED", "HTTP_RESPONSE"]
produced_events = []
flags = ["active", "safe"]
meta = {"description": "Download common filetypes such as PDF, DOCX, PPTX, etc."}
options = {
"extensions": [
"bak", # Backup File
"bash", # Bash Script or Configuration
"bashrc", # Bash Script or Configuration
"conf", # Configuration File
"cfg", # Configuration File
"crt", # Certificate File
"csv", # Comma Separated Values File
"db", # SQLite Database File
"sqlite", # SQLite Database File
"doc", # Microsoft Word Document (Old Format)
"docx", # Microsoft Word Document
"exe", # Windows PE executable
"ica", # Citrix Independent Computing Architecture File
"indd", # Adobe InDesign Document
"ini", # Initialization File
"jar", # Java Archive
"key", # Private Key File
"pub", # Public Key File
"log", # Log File
"markdown", # Markdown File
"md", # Markdown File
"msi", # Windows setup file
"odg", # OpenDocument Graphics (LibreOffice, OpenOffice)
"odp", # OpenDocument Presentation (LibreOffice, OpenOffice)
"ods", # OpenDocument Spreadsheet (LibreOffice, OpenOffice)
"odt", # OpenDocument Text (LibreOffice, OpenOffice)
"pdf", # Adobe Portable Document Format
"pem", # Privacy Enhanced Mail (SSL certificate)
"png", # Portable Network Graphics Image
"pps", # Microsoft PowerPoint Slideshow (Old Format)
"ppsx", # Microsoft PowerPoint Slideshow
"ppt", # Microsoft PowerPoint Presentation (Old Format)
"pptx", # Microsoft PowerPoint Presentation
"ps1", # PowerShell Script
"raw", # Raw Image File Format
"rdp", # Remote Desktop Protocol File
"sh", # Shell Script
"sql", # SQL Database Dump
"swp", # Swap File (temporary file, often Vim)
"sxw", # OpenOffice.org Writer document
"tar", # Tar Archive
"tar.gz", # Gzip-Compressed Tar Archive
"zip", # Zip Archive
"txt", # Plain Text Document
"vbs", # Visual Basic Script
"wpd", # WordPerfect Document
"xls", # Microsoft Excel Spreadsheet (Old Format)
"xlsx", # Microsoft Excel Spreadsheet
"xml", # eXtensible Markup Language File
"yml", # YAML Ain't Markup Language
"yaml", # YAML Ain't Markup Language
],
"max_filesize": "10MB",
}
options_desc = {
"extensions": "File extensions to download",
"max_filesize": "Cancel download if filesize is greater than this size",
}

scope_distance_modifier = 1

async def setup(self):
self.extensions = list(set([e.lower().strip(".") for e in self.options.get("extensions", [])]))
self.max_filesize = self.options.get("max_filesize", "10MB")
self.download_dir = self.scan.home / "filedownload"
self.helpers.mkdir(self.download_dir)
self.files_downloaded = set()
self.mime_db_file = await self.helpers.wordlist(
"https://raw.githubusercontent.com/jshttp/mime-db/master/db.json"
)
self.mime_db = {}
with open(self.mime_db_file) as f:
mime_db = json.load(f)
for content_type, attrs in mime_db.items():
if "extensions" in attrs and attrs["extensions"]:
self.mime_db[content_type] = attrs["extensions"][0].lower()
return True
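
For context, the jshttp/mime-db file is a JSON object keyed by content type, and the loop above keeps only the first listed extension for each type. A self-contained sketch of that reduction over two illustrative entries (abbreviated, not copied from the real database):

# Abbreviated mime-db-style entries (illustrative, values trimmed):
mime_db = {
    "application/pdf": {"source": "iana", "extensions": ["pdf"]},
    "text/markdown": {"source": "iana", "extensions": ["md", "markdown"]},
}
# Same reduction as setup(): content type -> first extension, lowercased.
lookup = {ct: attrs["extensions"][0].lower() for ct, attrs in mime_db.items() if attrs.get("extensions")}
assert lookup["application/pdf"] == "pdf"
assert lookup["text/markdown"] == "md"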

async def filter_event(self, event):
# accept file download requests from other modules
if "filedownload" in event.tags:
return True
h = self.hash_event(event)
if h in self.files_downloaded:
return False, f"Already processed {event}"
return True
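
The "filedownload" tag check above lets any other module request a download explicitly. A hedged sketch of what that looks like from the requesting side, assuming the non-awaitable emit_event signature BBOT modules used at the time; only the tag itself is something this commit relies on.

# Hedged sketch from a hypothetical requesting module: tagging the URL event with
# "filedownload" makes filter_event() accept it regardless of extension.
self.emit_event(
    "https://evilcorp.com/backup",  # placeholder URL
    "URL_UNVERIFIED",
    source=event,
    tags=["filedownload"],
)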

def hash_event(self, event):
if event.type == "HTTP_RESPONSE":
return hash(event.data["url"])
return hash(event.data)

async def handle_event(self, event):
if event.type == "URL_UNVERIFIED":
url_lower = event.data.lower()
if any(url_lower.endswith(f".{e}") for e in self.extensions):
await self.download_file(event.data)
elif event.type == "HTTP_RESPONSE":
content_type = event.data["header"].get("content_type", "")
if content_type:
url = event.data["url"]
await self.download_file(url, content_type=content_type)

async def download_file(self, url, content_type=None):
orig_filename, file_destination, base_url = self.make_filename(url, content_type=content_type)
if orig_filename is None:
return
result = await self.helpers.download(url, warn=False, filename=file_destination, max_size=self.max_filesize)
if result:
self.info(f'Found "{orig_filename}" at "{base_url}", downloaded to {file_destination}')
self.files_downloaded.add(hash(url))

def make_filename(self, url, content_type=None):
# first, try to determine original filename
parsed_url = self.helpers.urlparse(url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
url_path = parsed_url.path.strip("/")
# try to get extension from URL path
extension = Path(url_path).suffix.strip(".").lower()
if extension:
url_stem = url.rsplit(".", 1)[0]
else:
url_stem = str(url)
filename = f"{self.helpers.make_date()}_{self.helpers.tagify(url_stem)}"
if not url_path:
url_path = "unknown"
filename = f"{filename}-{url_path}"
# if that fails, try to get it from content type
if not extension:
if content_type and content_type in self.mime_db:
extension = self.mime_db[content_type]

if (not extension) or (extension not in self.extensions):
self.debug(f'Extension "{extension}" at url "{url}" not in list of watched extensions.')
return None, None, None

orig_filename = Path(url_path).stem
if extension:
filename = f"{filename}.{extension}"
orig_filename = f"{orig_filename}.{extension}"
return orig_filename, self.download_dir / filename, base_url
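
Tracing the docstring's example through this method: https://evilcorp.com/mypdf has no suffix in its path, so the extension comes from the Content-Type lookup instead, and the name on disk combines the date stamp, the tagified URL, the path, and that extension. The concrete helper outputs below are assumptions, shown only to make the flow visible.

# Illustrative trace, not from this commit (make_date()/tagify() outputs are assumed):
# url            = "https://evilcorp.com/mypdf"
# extension from URL path -> ""                      # no ".pdf" in the path
# extension from mime_db["application/pdf"] -> "pdf"
# orig_filename  = "mypdf.pdf"
# destination    = self.download_dir / f"{make_date()}_{tagify(url)}-mypdf.pdf"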

async def report(self):
if self.files_downloaded:
self.success(f"Downloaded {len(self.files_downloaded):,} file(s) to {self.download_dir}")
8 changes: 1 addition & 7 deletions bbot/modules/sslcert.py
@@ -1,4 +1,3 @@
import ssl
import asyncio
from OpenSSL import crypto
from contextlib import suppress
@@ -109,12 +108,7 @@ async def visit_host(self, host, port):

# Create an SSL context
try:
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
ssl_context.set_ciphers("ALL:@SECLEVEL=0")
ssl_context.options |= 0x4 # Add the OP_LEGACY_SERVER_CONNECT option
ssl_context = self.helpers.ssl_context_noverify()
except Exception as e:
self.warning(f"Error creating SSL context: {e}")
return [], [], (host, port)