Merge pull request #774 from blacklanternsecurity/filedownload-module-2
Filedownload Module
TheTechromancer authored Oct 13, 2023
2 parents 09f492c + 70c00c5 commit e4f2617
Showing 4 changed files with 314 additions and 46 deletions.
119 changes: 80 additions & 39 deletions bbot/core/helpers/web.py
@@ -7,6 +7,7 @@
import traceback
from pathlib import Path
from bs4 import BeautifulSoup
from contextlib import asynccontextmanager

from httpx._models import Cookies

@@ -216,7 +217,7 @@ async def request(self, *args, **kwargs):
if client_kwargs:
client = self.AsyncClient(**client_kwargs)

try:
async with self._acatch(url, raise_error):
if self.http_debug:
logstr = f"Web request: {str(args)}, {str(kwargs)}"
log.debug(logstr)
@@ -226,41 +227,6 @@
f"Web response from {url}: {response} (Length: {len(response.content)}) headers: {response.headers}"
)
return response
except httpx.PoolTimeout:
# this block exists because of this:
# https://github.com/encode/httpcore/discussions/783
log.verbose(f"PoolTimeout to URL: {url}")
self.web_client = self.AsyncClient(persist_cookies=False)
return await self.request(*args, **kwargs)
except httpx.TimeoutException:
log.verbose(f"HTTP timeout to URL: {url}")
if raise_error:
raise
except httpx.ConnectError:
log.verbose(f"HTTP connect failed to URL: {url}")
if raise_error:
raise
except httpx.RequestError as e:
log.trace(f"Error with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
if raise_error:
raise
except ssl.SSLError as e:
msg = f"SSL error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except anyio.EndOfStream as e:
msg = f"AnyIO error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except BaseException as e:
log.trace(f"Unhandled exception with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
raise

async def download(self, url, **kwargs):
"""
@@ -272,9 +238,11 @@ async def download(self, url, **kwargs):
url (str): The URL of the file to download.
filename (str, optional): The filename to save the downloaded file as.
If not provided, will generate based on URL.
max_size (str or int): Maximum filesize as a string ("5MB") or integer in bytes.
cache_hrs (float, optional): The number of hours to cache the downloaded file.
A negative value disables caching. Defaults to -1.
method (str, optional): The HTTP method to use for the request, defaults to 'GET'.
raise_error (bool, optional): Whether to raise exceptions for HTTP connect/timeout errors. Defaults to False.
**kwargs: Additional keyword arguments to pass to the httpx request.
Returns:
@@ -285,28 +253,48 @@
"""
success = False
filename = kwargs.pop("filename", self.parent_helper.cache_filename(url))
follow_redirects = kwargs.pop("follow_redirects", True)
max_size = kwargs.pop("max_size", None)
warn = kwargs.pop("warn", True)
raise_error = kwargs.pop("raise_error", False)
if max_size is not None:
max_size = self.parent_helper.human_to_bytes(max_size)
cache_hrs = float(kwargs.pop("cache_hrs", -1))
total_size = 0
chunk_size = 8192
log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
if cache_hrs > 0 and self.parent_helper.is_cached(url):
log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
success = True
else:
# kwargs["raise_error"] = True
# kwargs["stream"] = True
kwargs["follow_redirects"] = follow_redirects
if not "method" in kwargs:
kwargs["method"] = "GET"
try:
async with self.AsyncClient().stream(url=url, **kwargs) as response:
async with self._acatch(url, raise_error), self.AsyncClient().stream(url=url, **kwargs) as response:
status_code = getattr(response, "status_code", 0)
log.debug(f"Download result: HTTP {status_code}")
if status_code != 0:
response.raise_for_status()
with open(filename, "wb") as f:
async for chunk in response.aiter_bytes(chunk_size=8192):
agen = response.aiter_bytes(chunk_size=chunk_size)
async for chunk in agen:
if max_size is not None and total_size + chunk_size > max_size:
log.verbose(
f"Filesize of {url} exceeds {self.parent_helper.bytes_to_human(max_size)}, file will be truncated"
)
await agen.aclose()
break
total_size += chunk_size
f.write(chunk)
success = True
except httpx.HTTPError as e:
log.warning(f"Failed to download {url}: {e}")
log_fn = log.verbose
if warn:
log_fn = log.warning
log_fn(f"Failed to download {url}: {e}")
return

if success:
@@ -574,6 +562,59 @@ def is_spider_danger(self, source_event, url):
return True
return False
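
For reference, a minimal sketch of how a module can call the reworked download helper with the new max_size and raise_error options documented above. The URL and surrounding module context are hypothetical; the call shape mirrors the one the filedownload module makes further down in this diff.

# Hedged example, not part of this commit: `self.helpers` is the usual BBOT module
# helper handle, and the URL is a placeholder.
url = "https://evilcorp.com/report.pdf"
result = await self.helpers.download(
    url,
    max_size="5MB",      # truncate anything larger than 5MB
    cache_hrs=24,        # reuse a cached copy for up to a day
    warn=False,          # log failures at verbose instead of warning
    raise_error=False,   # swallow connect/timeout errors rather than raising
)
if result:
    self.info(f"Downloaded {url}")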

def ssl_context_noverify(self):
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
ssl_context.set_ciphers("ALL:@SECLEVEL=0")
ssl_context.options |= 0x4 # Add the OP_LEGACY_SERVER_CONNECT option
return ssl_context
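
A brief, hedged illustration of where a context like this gets used: sslcert.py below swaps its inline context construction for this helper before opening TLS connections. The connection call here is an assumption for illustration only, since the surrounding sslcert code is collapsed in this diff.

# Sketch only: open a raw TLS connection with the permissive context, as
# sslcert-style certificate grabbing requires. Host and port are placeholders.
ctx = self.helpers.ssl_context_noverify()
reader, writer = await asyncio.open_connection("evilcorp.com", 443, ssl=ctx)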

@asynccontextmanager
async def _acatch(self, url, raise_error):
"""
Asynchronous context manager to handle various httpx errors during a request.
Yields:
None
Note:
This function is internal and should generally not be used directly.
`url`, `args`, `kwargs`, and `raise_error` should be in the same context as this function.
"""
try:
yield
except httpx.TimeoutException:
log.verbose(f"HTTP timeout to URL: {url}")
if raise_error:
raise
except httpx.ConnectError:
log.verbose(f"HTTP connect failed to URL: {url}")
if raise_error:
raise
except httpx.RequestError as e:
log.trace(f"Error with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
if raise_error:
raise
except ssl.SSLError as e:
msg = f"SSL error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except anyio.EndOfStream as e:
msg = f"AnyIO error with request to URL: {url}: {e}"
log.trace(msg)
log.trace(traceback.format_exc())
if raise_error:
raise httpx.RequestError(msg)
except BaseException as e:
log.trace(f"Unhandled exception with request to URL: {url}: {e}")
log.trace(traceback.format_exc())
raise
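
The request() and download() hunks above already show the intended call sites; condensed, the pattern is simply to wrap any httpx call in this context manager (the client and URL names below are placeholders).

# Condensed pattern, placeholders only: all httpx error handling now lives in _acatch.
async with self._acatch(url, raise_error):
    response = await client.get(url)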


user_keywords = [re.compile(r, re.I) for r in ["user", "login", "email"]]
pass_keywords = [re.compile(r, re.I) for r in ["pass"]]
165 changes: 165 additions & 0 deletions bbot/modules/filedownload.py
@@ -0,0 +1,165 @@
import json
from pathlib import Path

from bbot.modules.base import BaseModule


class filedownload(BaseModule):
"""
Watch for common filetypes and download them.
Capable of identifying interesting files even if the extension is not in the URL.
E.g. if a PDF is being served at https://evilcorp.com/mypdf, it will still be downloaded and given the proper extension.
"""

watched_events = ["URL_UNVERIFIED", "HTTP_RESPONSE"]
produced_events = []
flags = ["active", "safe"]
meta = {"description": "Download common filetypes such as PDF, DOCX, PPTX, etc."}
options = {
"extensions": [
"bak", # Backup File
"bash", # Bash Script or Configuration
"bashrc", # Bash Script or Configuration
"conf", # Configuration File
"cfg", # Configuration File
"crt", # Certificate File
"csv", # Comma Separated Values File
"db", # SQLite Database File
"sqlite", # SQLite Database File
"doc", # Microsoft Word Document (Old Format)
"docx", # Microsoft Word Document
"exe", # Windows PE executable
"ica", # Citrix Independent Computing Architecture File
"indd", # Adobe InDesign Document
"ini", # Initialization File
"jar", # Java Archive
"key", # Private Key File
"pub", # Public Key File
"log", # Log File
"markdown", # Markdown File
"md", # Markdown File
"msi", # Windows setup file
"odg", # OpenDocument Graphics (LibreOffice, OpenOffice)
"odp", # OpenDocument Presentation (LibreOffice, OpenOffice)
"ods", # OpenDocument Spreadsheet (LibreOffice, OpenOffice)
"odt", # OpenDocument Text (LibreOffice, OpenOffice)
"pdf", # Adobe Portable Document Format
"pem", # Privacy Enhanced Mail (SSL certificate)
"png", # Portable Network Graphics Image
"pps", # Microsoft PowerPoint Slideshow (Old Format)
"ppsx", # Microsoft PowerPoint Slideshow
"ppt", # Microsoft PowerPoint Presentation (Old Format)
"pptx", # Microsoft PowerPoint Presentation
"ps1", # PowerShell Script
"raw", # Raw Image File Format
"rdp", # Remote Desktop Protocol File
"sh", # Shell Script
"sql", # SQL Database Dump
"swp", # Swap File (temporary file, often Vim)
"sxw", # OpenOffice.org Writer document
"tar", # Tar Archive
"tar.gz", # Gzip-Compressed Tar Archive
"zip", # Zip Archive
"txt", # Plain Text Document
"vbs", # Visual Basic Script
"wpd", # WordPerfect Document
"xls", # Microsoft Excel Spreadsheet (Old Format)
"xlsx", # Microsoft Excel Spreadsheet
"xml", # eXtensible Markup Language File
"yml", # YAML Ain't Markup Language
"yaml", # YAML Ain't Markup Language
],
"max_filesize": "10MB",
}
options_desc = {
"extensions": "File extensions to download",
"max_filesize": "Cancel download if filesize is greater than this size",
}

scope_distance_modifier = 1

async def setup(self):
self.extensions = list(set([e.lower().strip(".") for e in self.options.get("extensions", [])]))
self.max_filesize = self.options.get("max_filesize", "10MB")
self.download_dir = self.scan.home / "filedownload"
self.helpers.mkdir(self.download_dir)
self.files_downloaded = set()
self.mime_db_file = await self.helpers.wordlist(
"https://raw.githubusercontent.com/jshttp/mime-db/master/db.json"
)
self.mime_db = {}
with open(self.mime_db_file) as f:
mime_db = json.load(f)
for content_type, attrs in mime_db.items():
if "extensions" in attrs and attrs["extensions"]:
self.mime_db[content_type] = attrs["extensions"][0].lower()
return True
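
For context, the jshttp/mime-db file is a JSON object keyed by content type, and the loop above keeps only the first listed extension for each type. A self-contained sketch of that reduction over two illustrative entries (abbreviated, not copied from the real database):

# Abbreviated mime-db-style entries (illustrative, values trimmed):
mime_db = {
    "application/pdf": {"source": "iana", "extensions": ["pdf"]},
    "text/markdown": {"source": "iana", "extensions": ["md", "markdown"]},
}
# Same reduction as setup(): content type -> first extension, lowercased.
lookup = {ct: attrs["extensions"][0].lower() for ct, attrs in mime_db.items() if attrs.get("extensions")}
assert lookup["application/pdf"] == "pdf"
assert lookup["text/markdown"] == "md"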

async def filter_event(self, event):
# accept file download requests from other modules
if "filedownload" in event.tags:
return True
h = self.hash_event(event)
if h in self.files_downloaded:
return False, f"Already processed {event}"
return True
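
The "filedownload" tag check above lets any other module request a download explicitly. A hedged sketch of what that looks like from the requesting side, assuming the non-awaitable emit_event signature BBOT modules used at the time; only the tag itself is something this commit relies on.

# Hedged sketch from a hypothetical requesting module: tagging the URL event with
# "filedownload" makes filter_event() accept it regardless of extension.
self.emit_event(
    "https://evilcorp.com/backup",  # placeholder URL
    "URL_UNVERIFIED",
    source=event,
    tags=["filedownload"],
)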

def hash_event(self, event):
if event.type == "HTTP_RESPONSE":
return hash(event.data["url"])
return hash(event.data)

async def handle_event(self, event):
if event.type == "URL_UNVERIFIED":
url_lower = event.data.lower()
if any(url_lower.endswith(f".{e}") for e in self.extensions):
await self.download_file(event.data)
elif event.type == "HTTP_RESPONSE":
content_type = event.data["header"].get("content_type", "")
if content_type:
url = event.data["url"]
await self.download_file(url, content_type=content_type)

async def download_file(self, url, content_type=None):
orig_filename, file_destination, base_url = self.make_filename(url, content_type=content_type)
if orig_filename is None:
return
result = await self.helpers.download(url, warn=False, filename=file_destination, max_size=self.max_filesize)
if result:
self.info(f'Found "{orig_filename}" at "{base_url}", downloaded to {file_destination}')
self.files_downloaded.add(hash(url))

def make_filename(self, url, content_type=None):
# first, try to determine original filename
parsed_url = self.helpers.urlparse(url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
url_path = parsed_url.path.strip("/")
# try to get extension from URL path
extension = Path(url_path).suffix.strip(".").lower()
if extension:
url_stem = url.rsplit(".", 1)[0]
else:
url_stem = str(url)
filename = f"{self.helpers.make_date()}_{self.helpers.tagify(url_stem)}"
if not url_path:
url_path = "unknown"
filename = f"{filename}-{url_path}"
# if that fails, try to get it from content type
if not extension:
if content_type and content_type in self.mime_db:
extension = self.mime_db[content_type]

if (not extension) or (extension not in self.extensions):
self.debug(f'Extension "{extension}" at url "{url}" not in list of watched extensions.')
return None, None, None

orig_filename = Path(url_path).stem
if extension:
filename = f"{filename}.{extension}"
orig_filename = f"{orig_filename}.{extension}"
return orig_filename, self.download_dir / filename, base_url
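
Tracing the docstring's example through this method: https://evilcorp.com/mypdf has no suffix in its path, so the extension comes from the Content-Type lookup instead, and the name on disk combines the date stamp, the tagified URL, the path, and that extension. The concrete helper outputs below are assumptions, shown only to make the flow visible.

# Illustrative trace, not from this commit (make_date()/tagify() outputs are assumed):
# url            = "https://evilcorp.com/mypdf"
# extension from URL path -> ""                      # no ".pdf" in the path
# extension from mime_db["application/pdf"] -> "pdf"
# orig_filename  = "mypdf.pdf"
# destination    = self.download_dir / f"{make_date()}_{tagify(url)}-mypdf.pdf"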

async def report(self):
if self.files_downloaded:
self.success(f"Downloaded {len(self.files_downloaded):,} file(s) to {self.download_dir}")
8 changes: 1 addition & 7 deletions bbot/modules/sslcert.py
@@ -1,4 +1,3 @@
import ssl
import asyncio
from OpenSSL import crypto
from contextlib import suppress
@@ -109,12 +108,7 @@ async def visit_host(self, host, port):

# Create an SSL context
try:
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3
ssl_context.set_ciphers("ALL:@SECLEVEL=0")
ssl_context.options |= 0x4 # Add the OP_LEGACY_SERVER_CONNECT option
ssl_context = self.helpers.ssl_context_noverify()
except Exception as e:
self.warning(f"Error creating SSL context: {e}")
return [], [], (host, port)