filedownload - support for HTTP_RESPONSE
TheTechromancer committed Sep 21, 2023
1 parent 8e9ede2 commit 8171cad
Showing 3 changed files with 116 additions and 35 deletions.
2 changes: 1 addition & 1 deletion bbot/core/helpers/web.py
@@ -266,7 +266,7 @@ async def download(self, url, **kwargs):
         cache_hrs = float(kwargs.pop("cache_hrs", -1))
         total_size = 0
         chunk_size = 8192
-        log.hugesuccess(f"Downloading file from {url} with cache_hrs={cache_hrs}")
+        log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
         if cache_hrs > 0 and self.parent_helper.is_cached(url):
             log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
             success = True
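The only functional change in web.py is demoting a stray log.hugesuccess call to log.debug. The surrounding context also shows the shape of the helper's cache check: a fresh download is skipped when cache_hrs is positive and the URL is already cached. A minimal standalone sketch of that idea, with hypothetical is_cached/cache_filename stand-ins rather than BBOT's actual helpers:

    import hashlib
    import time
    from pathlib import Path

    CACHE_DIR = Path("/tmp/download-cache-sketch")  # hypothetical cache location

    def cache_filename(url: str) -> Path:
        # Key the cache entry on a hash of the URL.
        return CACHE_DIR / hashlib.sha256(url.encode()).hexdigest()

    def is_cached(url: str, cache_hrs: float) -> bool:
        # A cached copy only counts if it is newer than cache_hrs hours.
        f = cache_filename(url)
        if not f.is_file():
            return False
        age_hrs = (time.time() - f.stat().st_mtime) / 3600
        return age_hrs < cache_hrs

    url = "https://evilcorp.com/report.pdf"
    cache_hrs = 24.0
    if cache_hrs > 0 and is_cached(url, cache_hrs):
        print(f"{url} is cached at {cache_filename(url)}")
    else:
        print(f"Downloading file from {url} with cache_hrs={cache_hrs}")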
101 changes: 74 additions & 27 deletions bbot/modules/filedownload.py
@@ -1,14 +1,18 @@
+import json
 from pathlib import Path

 from bbot.modules.base import BaseModule


 class filedownload(BaseModule):
     """
-    Watch for common filetypes and download them
+    Watch for common filetypes and download them.
+    Capable of identifying interesting files even if the extension is not in the URL.
+    E.g. if a PDF is being served at https://evilcorp.com/mypdf, it will still be downloaded and given the proper extension.
     """

-    watched_events = ["URL_UNVERIFIED"]
+    watched_events = ["URL_UNVERIFIED", "HTTP_RESPONSE"]
     produced_events = []
     flags = ["active", "safe"]
     meta = {"description": "Download common filetypes such as PDF, DOCX, PPTX, etc."}
@@ -86,37 +90,80 @@ async def setup(self):
         self.max_filesize = self.options.get("max_filesize", "10MB")
         self.download_dir = self.scan.home / "filedownload"
         self.helpers.mkdir(self.download_dir)
-        self.files_downloaded = 0
-        self.seen = set()
+        # https://raw.githubusercontent.com/jshttp/mime-db/master/db.json
+        self.files_downloaded = set()
+        self.mime_db_file = await self.helpers.wordlist(
+            "https://raw.githubusercontent.com/jshttp/mime-db/master/db.json"
+        )
+        self.mime_db = {}
+        with open(self.mime_db_file) as f:
+            mime_db = json.load(f)
+            for content_type, attrs in mime_db.items():
+                if "extensions" in attrs and attrs["extensions"]:
+                    self.mime_db[content_type] = attrs["extensions"][0].lower()
         return True

     async def filter_event(self, event):
-        h = hash(event.data)
-        if h in self.seen:
+        # accept file download requests from other modules
+        if "filedownload" in event.tags:
+            return True
+        h = self.hash_event(event)
+        if h in self.files_downloaded:
             return False, f"Already processed {event}"
-        self.seen.add(h)
         return True

+    def hash_event(self, event):
+        if event.type == "HTTP_RESPONSE":
+            return hash(event.data["url"])
+        return hash(event.data)

     async def handle_event(self, event):
-        url_lower = event.data.lower()
-        if any(url_lower.endswith(f".{e}") for e in self.extensions):
-            timestamp = self.helpers.make_date(event.timestamp)
-            filepath = Path(event.parsed.path)
-            split_url = url_lower.rsplit(".", 1)
-            url_stem = split_url[0]
-            filename = f"{timestamp}_{self.helpers.tagify(url_stem)}"
-            if len(split_url) == 2:
-                filename = f"{filename}.{split_url[-1]}"
-            file_destination = self.download_dir / filename
-            base_url = f"{event.parsed.scheme}://{event.parsed.netloc}"
-            result = await self.helpers.download(
-                event.data, warn=False, filename=file_destination, max_size=self.max_filesize
-            )
-            if result:
-                self.info(f'Found "{filepath.name}" at "{base_url}", downloaded to {file_destination}')
-                self.files_downloaded += 1
+        if event.type == "URL_UNVERIFIED":
+            url_lower = event.data.lower()
+            if any(url_lower.endswith(f".{e}") for e in self.extensions):
+                await self.download_file(event.data)
+        elif event.type == "HTTP_RESPONSE":
+            content_type = event.data["header"].get("content_type", "")
+            if content_type:
+                url = event.data["url"]
+                await self.download_file(url, content_type=content_type)

+    async def download_file(self, url, content_type=None):
+        orig_filename, file_destination, base_url = self.make_filename(url, content_type=content_type)
+        if orig_filename is None:
+            return
+        result = await self.helpers.download(url, warn=False, filename=file_destination, max_size=self.max_filesize)
+        if result:
+            self.info(f'Found "{orig_filename}" at "{base_url}", downloaded to {file_destination}')
+            self.files_downloaded.add(hash(url))

+    def make_filename(self, url, content_type=None):
+        # first, try to determine original filename
+        parsed_url = self.helpers.urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        url_path = parsed_url.path.strip("/")
+        if not url_path:
+            url_path = "unknown"
+        # try to get extension from URL path
+        extension = Path(url_path).suffix.strip(".").lower()
+        if extension:
+            url_stem = url.rsplit(".", 1)[0]
+        else:
+            url_stem = str(url)
+        filename = f"{self.helpers.make_date()}_{self.helpers.tagify(url_stem)}"
+        # if that fails, try to get it from content type
+        if not extension:
+            if content_type and content_type in self.mime_db:
+                extension = self.mime_db[content_type]
+            else:
+                self.debug(f'Extension "{extension}" at url "{url}" not in list of watched extensions.')
+                return None, None, None

+        orig_filename = Path(url_path).stem
+        if extension:
+            filename = f"{filename}.{extension}"
+            orig_filename = f"{orig_filename}.{extension}"
+        return orig_filename, self.download_dir / filename, base_url

     async def report(self):
-        if self.files_downloaded > 0:
-            self.success(f"Downloaded {self.files_downloaded:,} file(s) to {self.download_dir}")
+        if self.files_downloaded:
+            self.success(f"Downloaded {len(self.files_downloaded):,} file(s) to {self.download_dir}")
48 changes: 41 additions & 7 deletions bbot/test/test_step_2/module_tests/test_module_filedownload.py
@@ -2,17 +2,51 @@


 class TestFileDownload(ModuleTestBase):
-    targets = ["http://127.0.0.1:8888/"]
-    modules_overrides = ["filedownload", "httpx", "excavate"]
+    targets = ["http://127.0.0.1:8888/Test_PDF"]
+    modules_overrides = ["filedownload", "httpx", "excavate", "speculate"]

+    pdf_data = """%PDF-1.
+1 0 obj<</Pages 2 0 R>>endobj
+2 0 obj<</Kids[3 0 R]/Count 1>>endobj
+3 0 obj<</Parent 2 0 R>>endobj
+trailer <</Root 1 0 R>>"""

+    async def setup_before_prep(self, module_test):
+        module_test.httpx_mock.add_response(
+            url="https://raw.githubusercontent.com/jshttp/mime-db/master/db.json",
+            json={
+                "application/pdf": {"source": "iana", "compressible": False, "extensions": ["pdf"]},
+            },
+        )

     async def setup_after_prep(self, module_test):
-        module_test.set_expect_requests(dict(uri="/"), dict(response_data='<a href="/Test_File.txt"/>'))
-        module_test.set_expect_requests(dict(uri="/Test_File.txt"), dict(response_data="juicy stuff"))
+        module_test.set_expect_requests(
+            dict(uri="/"), dict(response_data='<a href="/Test_File.txt"/><a href="/Test_PDF"/>')
+        )
+        module_test.set_expect_requests(
+            dict(uri="/Test_File.txt"),
+            dict(
+                response_data="juicy stuff",
+            ),
+        )
+        module_test.set_expect_requests(
+            dict(uri="/Test_PDF"),
+            dict(response_data=self.pdf_data, headers={"Content-Type": "application/pdf"}),
+        )

     def check(self, module_test, events):
         download_dir = module_test.scan.home / "filedownload"
-        files = list(download_dir.glob("*_test-file.txt"))
-        assert len(files) == 1, f"No file found at {download_dir}"
-        file = files[0]
+
+        # text file
+        text_files = list(download_dir.glob("*test-file.txt"))
+        assert len(text_files) == 1, f"No text file found at {download_dir}"
+        file = text_files[0]
         assert file.is_file(), f"File not found at {file}"
         assert open(file).read() == "juicy stuff", f"File at {file} does not contain the correct content"
+
+        # PDF file (no extension)
+        pdf_files = list(download_dir.glob("*test-pdf.pdf"))
+        assert len(pdf_files) == 1, f"No PDF file found at {download_dir}"
+        file = pdf_files[0]
+        assert file.is_file(), f"File not found at {file}"
+        assert open(file).read() == self.pdf_data, f"File at {file} does not contain the correct content"
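The new /Test_PDF expectation exercises the HTTP_RESPONSE path end to end: the mock server returns the PDF bytes with a Content-Type header but no extension in the URL, and the module is expected to save the file as *test-pdf.pdf. The dispatch it relies on reads just two fields from the event data; a minimal sketch with a plain dict standing in for a BBOT HTTP_RESPONSE event (the lower-cased content_type header key follows what handle_event reads in this commit):

    # A plain dict standing in for the event.data of a BBOT HTTP_RESPONSE event.
    http_response_data = {
        "url": "http://127.0.0.1:8888/Test_PDF",
        "header": {"content_type": "application/pdf"},
    }

    def wants_download(event_type, data, watched_extensions=("pdf", "txt")):
        # Mirror handle_event: URL suffix check for URL_UNVERIFIED, Content-Type check for HTTP_RESPONSE.
        if event_type == "URL_UNVERIFIED":
            return any(data.lower().endswith(f".{e}") for e in watched_extensions)
        if event_type == "HTTP_RESPONSE":
            return bool(data["header"].get("content_type", ""))
        return False

    print(wants_download("HTTP_RESPONSE", http_response_data))  # True -> download_file(url, content_type=...)
    print(wants_download("URL_UNVERIFIED", "http://127.0.0.1:8888/"))  # False -> no watched extension in the URL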
