Filedownload Module #774

Merged · 4 commits · Oct 13, 2023
119 changes: 80 additions & 39 deletions bbot/core/helpers/web.py
@@ -7,6 +7,7 @@
import traceback
from pathlib import Path
from bs4 import BeautifulSoup
+from contextlib import asynccontextmanager

from httpx._models import Cookies

@@ -216,7 +217,7 @@ async def request(self, *args, **kwargs):
        if client_kwargs:
            client = self.AsyncClient(**client_kwargs)

-        try:
+        async with self._acatch(url, raise_error):
            if self.http_debug:
                logstr = f"Web request: {str(args)}, {str(kwargs)}"
                log.debug(logstr)
@@ -226,41 +227,6 @@
f"Web response from {url}: {response} (Length: {len(response.content)}) headers: {response.headers}"
)
return response
except httpx.PoolTimeout:
# this block exists because of this:
# https://github.com/encode/httpcore/discussions/783
log.verbose(f"PoolTimeout to URL: {url}")
self.web_client = self.AsyncClient(persist_cookies=False)
return await self.request(*args, **kwargs)
except httpx.TimeoutException:
log.verbose(f"HTTP timeout to URL: {url}")
if raise_error:
raise
except httpx.ConnectError:
log.verbose(f"HTTP connect failed to URL: {url}")
if raise_error:
raise
except httpx.RequestError as e:
log.trace(f"Error with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
if raise_error:
raise
except ssl.SSLError as e:
msg = f"SSL error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except anyio.EndOfStream as e:
msg = f"AnyIO error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except BaseException as e:
log.trace(f"Unhandled exception with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
raise

    async def download(self, url, **kwargs):
        """
@@ -272,9 +238,11 @@ async def download(self, url, **kwargs):
            url (str): The URL of the file to download.
            filename (str, optional): The filename to save the downloaded file as.
                If not provided, will generate based on URL.
+            max_size (str or int, optional): Maximum filesize, as a human-readable string (e.g. "5MB") or an integer number of bytes.
            cache_hrs (float, optional): The number of hours to cache the downloaded file.
                A negative value disables caching. Defaults to -1.
            method (str, optional): The HTTP method to use for the request, defaults to 'GET'.
+            raise_error (bool, optional): Whether to raise exceptions for HTTP connect or timeout errors. Defaults to False.
            **kwargs: Additional keyword arguments to pass to the httpx request.

        Returns:
@@ -285,28 +253,48 @@
"""
success = False
filename = kwargs.pop("filename", self.parent_helper.cache_filename(url))
follow_redirects = kwargs.pop("follow_redirects", True)
max_size = kwargs.pop("max_size", None)
warn = kwargs.pop("warn", True)
raise_error = kwargs.pop("raise_error", False)
if max_size is not None:
max_size = self.parent_helper.human_to_bytes(max_size)
cache_hrs = float(kwargs.pop("cache_hrs", -1))
total_size = 0
chunk_size = 8192
log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
if cache_hrs > 0 and self.parent_helper.is_cached(url):
log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
success = True
else:
# kwargs["raise_error"] = True
# kwargs["stream"] = True
kwargs["follow_redirects"] = follow_redirects
if not "method" in kwargs:
kwargs["method"] = "GET"
try:
async with self.AsyncClient().stream(url=url, **kwargs) as response:
async with self._acatch(url, raise_error), self.AsyncClient().stream(url=url, **kwargs) as response:
status_code = getattr(response, "status_code", 0)
log.debug(f"Download result: HTTP {status_code}")
if status_code != 0:
response.raise_for_status()
with open(filename, "wb") as f:
async for chunk in response.aiter_bytes(chunk_size=8192):
agen = response.aiter_bytes(chunk_size=chunk_size)
async for chunk in agen:
if max_size is not None and total_size + chunk_size > max_size:
log.verbose(
f"Filesize of {url} exceeds {self.parent_helper.bytes_to_human(max_size)}, file will be truncated"
)
agen.aclose()
break
total_size += chunk_size
f.write(chunk)
success = True
except httpx.HTTPError as e:
log.warning(f"Failed to download {url}: {e}")
log_fn = log.verbose
if warn:
log_fn = log.warning
log_fn(f"Failed to download {url}: {e}")
return

        if success:
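One detail in the truncation logic above: the running total advances by the fixed `chunk_size` rather than `len(chunk)`, so `max_size` is enforced at 8 KiB granularity. A small self-contained sketch of that accounting (the loop stands in for `aiter_bytes()`; values are illustrative):

```python
# Mirror of download()'s size accounting, assuming every chunk is a full
# 8 KiB: the loop stops before the write that would exceed max_size,
# so the file on disk is truncated to a multiple of chunk_size.
chunk_size = 8192
max_size = 5 * 1024 * 1024  # 5 MiB, e.g. the result of human_to_bytes("5MB")

total_size = 0  # bytes the loop allows through
while total_size + chunk_size <= max_size:
    # download() writes the chunk here, then advances the counter
    total_size += chunk_size

assert total_size == max_size  # 5 MiB is an exact multiple of 8 KiB
```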
@@ -574,6 +562,59 @@ def is_spider_danger(self, source_event, url):
            return True
        return False

+    def ssl_context_noverify(self):
+        ssl_context = ssl.create_default_context()
+        ssl_context.check_hostname = False
+        ssl_context.verify_mode = ssl.CERT_NONE
+        ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
+        ssl_context.set_ciphers("ALL:@SECLEVEL=0")
+        ssl_context.options |= 0x4  # Add the OP_LEGACY_SERVER_CONNECT option
+        return ssl_context

+    @asynccontextmanager
+    async def _acatch(self, url, raise_error):
+        """
+        Asynchronous context manager to handle various httpx errors during a request.
+
+        Yields:
+            None
+
+        Note:
+            This function is internal and should generally not be used directly.
+            `url` and `raise_error` are supplied by the caller (e.g. `request()` or `download()`).
+        """
+        try:
+            yield
+        except httpx.TimeoutException:
+            log.verbose(f"HTTP timeout to URL: {url}")
+            if raise_error:
+                raise
+        except httpx.ConnectError:
+            log.verbose(f"HTTP connect failed to URL: {url}")
+            if raise_error:
+                raise
+        except httpx.RequestError as e:
+            log.trace(f"Error with request to URL: {url}: {e}")
+            log.trace(traceback.format_exc())
+            if raise_error:
+                raise
+        except ssl.SSLError as e:
+            msg = f"SSL error with request to URL: {url}: {e}"
+            log.trace(msg)
+            log.trace(traceback.format_exc())
+            if raise_error:
+                raise httpx.RequestError(msg)
+        except anyio.EndOfStream as e:
+            msg = f"AnyIO error with request to URL: {url}: {e}"
+            log.trace(msg)
+            log.trace(traceback.format_exc())
+            if raise_error:
+                raise httpx.RequestError(msg)
+        except BaseException as e:
+            log.trace(f"Unhandled exception with request to URL: {url}: {e}")
+            log.trace(traceback.format_exc())
+            raise


user_keywords = [re.compile(r, re.I) for r in ["user", "login", "email"]]
pass_keywords = [re.compile(r, re.I) for r in ["pass"]]
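The net effect of the web.py changes: the chain of except handlers that previously lived inline in `request()` is factored into the reusable `_acatch()` async context manager, so `request()` and the new streaming `download()` get identical error semantics from a single code path. A minimal sketch of the pattern under illustrative names (`catch_web_errors` and `fetch` are not bbot APIs):

```python
import logging
from contextlib import asynccontextmanager

import httpx

log = logging.getLogger(__name__)


@asynccontextmanager
async def catch_web_errors(url, raise_error=False):
    # Centralized error handling: callers wrap any httpx operation in this
    # context manager instead of repeating try/except around every request.
    try:
        yield
    except httpx.TimeoutException:
        # TimeoutException subclasses RequestError, so it must be caught first
        log.debug(f"HTTP timeout to URL: {url}")
        if raise_error:
            raise
    except httpx.RequestError as e:
        log.debug(f"Error with request to URL: {url}: {e}")
        if raise_error:
            raise


async def fetch(url):
    # Returns the response, or None if the error was logged and swallowed.
    async with catch_web_errors(url):
        async with httpx.AsyncClient() as client:
            return await client.get(url)
```

Because `download()` streams its response, it enters `_acatch()` and the stream in a single `async with` statement, keeping the handler wrapped around the whole transfer rather than just the initial request.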
165 changes: 165 additions & 0 deletions bbot/modules/filedownload.py
@@ -0,0 +1,165 @@
import json
from pathlib import Path

from bbot.modules.base import BaseModule


class filedownload(BaseModule):
    """
    Watch for common filetypes and download them.

    Capable of identifying interesting files even if the extension is not in the URL.
    E.g. if a PDF is being served at https://evilcorp.com/mypdf, it will still be downloaded and given the proper extension.
    """

    watched_events = ["URL_UNVERIFIED", "HTTP_RESPONSE"]
    produced_events = []
    flags = ["active", "safe"]
    meta = {"description": "Download common filetypes such as PDF, DOCX, PPTX, etc."}
    options = {
        "extensions": [
            "bak",  # Backup File
            "bash",  # Bash Script or Configuration
            "bashrc",  # Bash Script or Configuration
            "conf",  # Configuration File
            "cfg",  # Configuration File
            "crt",  # Certificate File
            "csv",  # Comma Separated Values File
            "db",  # SQLite Database File
            "sqlite",  # SQLite Database File
            "doc",  # Microsoft Word Document (Old Format)
            "docx",  # Microsoft Word Document
            "exe",  # Windows PE executable
            "ica",  # Citrix Independent Computing Architecture File
            "indd",  # Adobe InDesign Document
            "ini",  # Initialization File
            "jar",  # Java Archive
            "key",  # Private Key File
            "pub",  # Public Key File
            "log",  # Log File
            "markdown",  # Markdown File
            "md",  # Markdown File
            "msi",  # Windows setup file
            "odg",  # OpenDocument Graphics (LibreOffice, OpenOffice)
            "odp",  # OpenDocument Presentation (LibreOffice, OpenOffice)
            "ods",  # OpenDocument Spreadsheet (LibreOffice, OpenOffice)
            "odt",  # OpenDocument Text (LibreOffice, OpenOffice)
            "pdf",  # Adobe Portable Document Format
            "pem",  # Privacy Enhanced Mail (SSL certificate)
            "png",  # Portable Network Graphics Image
            "pps",  # Microsoft PowerPoint Slideshow (Old Format)
            "ppsx",  # Microsoft PowerPoint Slideshow
            "ppt",  # Microsoft PowerPoint Presentation (Old Format)
            "pptx",  # Microsoft PowerPoint Presentation
            "ps1",  # PowerShell Script
            "raw",  # Raw Image File Format
            "rdp",  # Remote Desktop Protocol File
            "sh",  # Shell Script
            "sql",  # SQL Database Dump
            "swp",  # Swap File (temporary file, often Vim)
            "sxw",  # OpenOffice.org Writer document
            "tar",  # Tar Archive
            "tar.gz",  # Gzip-Compressed Tar Archive
            "zip",  # Zip Archive
            "txt",  # Plain Text Document
            "vbs",  # Visual Basic Script
            "wpd",  # WordPerfect Document
            "xls",  # Microsoft Excel Spreadsheet (Old Format)
            "xlsx",  # Microsoft Excel Spreadsheet
            "xml",  # eXtensible Markup Language File
            "yml",  # YAML Ain't Markup Language
            "yaml",  # YAML Ain't Markup Language
        ],
        "max_filesize": "10MB",
    }
    options_desc = {
        "extensions": "File extensions to download",
        "max_filesize": "Cancel download if filesize is greater than this size",
    }

    scope_distance_modifier = 1

    async def setup(self):
        self.extensions = list(set([e.lower().strip(".") for e in self.options.get("extensions", [])]))
        self.max_filesize = self.options.get("max_filesize", "10MB")
        self.download_dir = self.scan.home / "filedownload"
        self.helpers.mkdir(self.download_dir)
        self.files_downloaded = set()
        self.mime_db_file = await self.helpers.wordlist(
            "https://raw.githubusercontent.com/jshttp/mime-db/master/db.json"
        )
        self.mime_db = {}
        with open(self.mime_db_file) as f:
            mime_db = json.load(f)
            for content_type, attrs in mime_db.items():
                if "extensions" in attrs and attrs["extensions"]:
                    self.mime_db[content_type] = attrs["extensions"][0].lower()
        return True

    async def filter_event(self, event):
        # accept file download requests from other modules
        if "filedownload" in event.tags:
            return True
        h = self.hash_event(event)
        if h in self.files_downloaded:
            return False, f"Already processed {event}"
        return True

    def hash_event(self, event):
        if event.type == "HTTP_RESPONSE":
            return hash(event.data["url"])
        return hash(event.data)

    async def handle_event(self, event):
        if event.type == "URL_UNVERIFIED":
            url_lower = event.data.lower()
            if any(url_lower.endswith(f".{e}") for e in self.extensions):
                await self.download_file(event.data)
        elif event.type == "HTTP_RESPONSE":
            content_type = event.data["header"].get("content_type", "")
            if content_type:
                url = event.data["url"]
                await self.download_file(url, content_type=content_type)

    async def download_file(self, url, content_type=None):
        orig_filename, file_destination, base_url = self.make_filename(url, content_type=content_type)
        if orig_filename is None:
            return
        result = await self.helpers.download(url, warn=False, filename=file_destination, max_size=self.max_filesize)
        if result:
            self.info(f'Found "{orig_filename}" at "{base_url}", downloaded to {file_destination}')
            self.files_downloaded.add(hash(url))

    def make_filename(self, url, content_type=None):
        # first, try to determine original filename
        parsed_url = self.helpers.urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        url_path = parsed_url.path.strip("/")
        # try to get extension from URL path
        extension = Path(url_path).suffix.strip(".").lower()
        if extension:
            url_stem = url.rsplit(".", 1)[0]
        else:
            url_stem = str(url)
        filename = f"{self.helpers.make_date()}_{self.helpers.tagify(url_stem)}"
        if not url_path:
            url_path = "unknown"
            filename = f"{filename}-{url_path}"
        # if that fails, try to get it from content type
        if not extension:
            if content_type and content_type in self.mime_db:
                extension = self.mime_db[content_type]

        if (not extension) or (extension not in self.extensions):
            self.debug(f'Extension "{extension}" at url "{url}" not in list of watched extensions.')
            return None, None, None

        orig_filename = Path(url_path).stem
        if extension:
            filename = f"{filename}.{extension}"
            orig_filename = f"{orig_filename}.{extension}"
        return orig_filename, self.download_dir / filename, base_url

    async def report(self):
        if self.files_downloaded:
            self.success(f"Downloaded {len(self.files_downloaded):,} file(s) to {self.download_dir}")
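The module's extension inference rests on the jshttp/mime-db dataset fetched in `setup()`: keys are content types, and entries may carry an `extensions` list whose first element is treated as canonical. A standalone sketch of that lookup, with a two-entry inline sample standing in for the real db.json:

```python
import json

# Inline sample mirroring the public jshttp/mime-db entry shape;
# the real db.json contains thousands of content types.
sample_db = json.loads(
    """
    {
      "application/pdf": {"source": "iana", "extensions": ["pdf"]},
      "text/markdown": {"source": "iana", "compressible": true, "extensions": ["markdown", "md"]}
    }
    """
)

mime_db = {}
for content_type, attrs in sample_db.items():
    if "extensions" in attrs and attrs["extensions"]:
        # keep only the first (canonical) extension, lowercased
        mime_db[content_type] = attrs["extensions"][0].lower()

assert mime_db["application/pdf"] == "pdf"
assert mime_db["text/markdown"] == "markdown"
```

This mapping is what lets a PDF served at https://evilcorp.com/mypdf be saved with a `.pdf` suffix: the `HTTP_RESPONSE` handler passes the Content-Type header to `download_file()`, and `make_filename()` falls back to the mime-db lookup when the URL path carries no usable extension.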
8 changes: 1 addition & 7 deletions bbot/modules/sslcert.py
@@ -1,4 +1,3 @@
-import ssl
import asyncio
from OpenSSL import crypto
from contextlib import suppress
@@ -109,12 +108,7 @@ async def visit_host(self, host, port):

        # Create an SSL context
        try:
-            ssl_context = ssl.create_default_context()
-            ssl_context.check_hostname = False
-            ssl_context.verify_mode = ssl.CERT_NONE
-            ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
-            ssl_context.set_ciphers("ALL:@SECLEVEL=0")
-            ssl_context.options |= 0x4  # Add the OP_LEGACY_SERVER_CONNECT option
+            ssl_context = self.helpers.ssl_context_noverify()
        except Exception as e:
            self.warning(f"Error creating SSL context: {e}")
            return [], [], (host, port)
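The sslcert.py change is pure deduplication: the permissive context it previously built inline now comes from the shared `ssl_context_noverify()` helper added to web.py above. A sketch of how such a context is typically consumed to grab a certificate over a raw TLS handshake (`grab_cert` and the example host are illustrative, not bbot code):

```python
import asyncio
import ssl


def ssl_context_noverify():
    # Same permissive context the PR centralizes in bbot/core/helpers/web.py:
    # no hostname check, no verification, legacy protocols and ciphers allowed.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    ctx.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
    ctx.set_ciphers("ALL:@SECLEVEL=0")
    ctx.options |= 0x4  # OP_LEGACY_SERVER_CONNECT
    return ctx


async def grab_cert(host, port=443):
    # Complete a TLS handshake without verification and return the peer
    # certificate in DER form, roughly what sslcert's visit_host() works with.
    reader, writer = await asyncio.open_connection(host, port, ssl=ssl_context_noverify())
    ssl_object = writer.get_extra_info("ssl_object")
    der_cert = ssl_object.getpeercert(binary_form=True)
    writer.close()
    await writer.wait_closed()
    return der_cert


if __name__ == "__main__":
    der = asyncio.run(grab_cert("example.com"))
    print(f"{len(der)} bytes of DER-encoded certificate")
```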