filedownload - support for HTTP_RESPONSE
TheTechromancer committed Sep 21, 2023
1 parent 8e9ede2 commit 8171cad
Showing 3 changed files with 116 additions and 35 deletions.
2 changes: 1 addition & 1 deletion bbot/core/helpers/web.py
@@ -266,7 +266,7 @@ async def download(self, url, **kwargs):
         cache_hrs = float(kwargs.pop("cache_hrs", -1))
         total_size = 0
         chunk_size = 8192
-        log.hugesuccess(f"Downloading file from {url} with cache_hrs={cache_hrs}")
+        log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
         if cache_hrs > 0 and self.parent_helper.is_cached(url):
             log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
             success = True
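The only functional change in web.py is demoting a stray log.hugesuccess call to log.debug. The surrounding context also shows the shape of the helper's cache check: a fresh download is skipped when cache_hrs is positive and the URL is already cached. A minimal standalone sketch of that idea, with hypothetical is_cached/cache_filename stand-ins rather than BBOT's actual helpers:

    import hashlib
    import time
    from pathlib import Path

    CACHE_DIR = Path("/tmp/download-cache-sketch")  # hypothetical cache location

    def cache_filename(url: str) -> Path:
        # Key the cache entry on a hash of the URL.
        return CACHE_DIR / hashlib.sha256(url.encode()).hexdigest()

    def is_cached(url: str, cache_hrs: float) -> bool:
        # A cached copy only counts if it is newer than cache_hrs hours.
        f = cache_filename(url)
        if not f.is_file():
            return False
        age_hrs = (time.time() - f.stat().st_mtime) / 3600
        return age_hrs < cache_hrs

    url = "https://evilcorp.com/report.pdf"
    cache_hrs = 24.0
    if cache_hrs > 0 and is_cached(url, cache_hrs):
        print(f"{url} is cached at {cache_filename(url)}")
    else:
        print(f"Downloading file from {url} with cache_hrs={cache_hrs}")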
101 changes: 74 additions & 27 deletions bbot/modules/filedownload.py
@@ -1,14 +1,18 @@
+import json
 from pathlib import Path

 from bbot.modules.base import BaseModule


 class filedownload(BaseModule):
     """
-    Watch for common filetypes and download them
+    Watch for common filetypes and download them.
+    Capable of identifying interesting files even if the extension is not in the URL.
+    E.g. if a PDF is being served at https://evilcorp.com/mypdf, it will still be downloaded and given the proper extension.
     """

-    watched_events = ["URL_UNVERIFIED"]
+    watched_events = ["URL_UNVERIFIED", "HTTP_RESPONSE"]
     produced_events = []
     flags = ["active", "safe"]
     meta = {"description": "Download common filetypes such as PDF, DOCX, PPTX, etc."}
@@ -86,37 +90,80 @@ async def setup(self):
         self.max_filesize = self.options.get("max_filesize", "10MB")
         self.download_dir = self.scan.home / "filedownload"
         self.helpers.mkdir(self.download_dir)
-        self.files_downloaded = 0
-        self.seen = set()
+        # https://raw.githubusercontent.com/jshttp/mime-db/master/db.json
+        self.files_downloaded = set()
+        self.mime_db_file = await self.helpers.wordlist(
+            "https://raw.githubusercontent.com/jshttp/mime-db/master/db.json"
+        )
+        self.mime_db = {}
+        with open(self.mime_db_file) as f:
+            mime_db = json.load(f)
+            for content_type, attrs in mime_db.items():
+                if "extensions" in attrs and attrs["extensions"]:
+                    self.mime_db[content_type] = attrs["extensions"][0].lower()
         return True

     async def filter_event(self, event):
-        h = hash(event.data)
-        if h in self.seen:
+        # accept file download requests from other modules
+        if "filedownload" in event.tags:
+            return True
+        h = self.hash_event(event)
+        if h in self.files_downloaded:
             return False, f"Already processed {event}"
-        self.seen.add(h)
         return True

+    def hash_event(self, event):
+        if event.type == "HTTP_RESPONSE":
+            return hash(event.data["url"])
+        return hash(event.data)

     async def handle_event(self, event):
-        url_lower = event.data.lower()
-        if any(url_lower.endswith(f".{e}") for e in self.extensions):
-            timestamp = self.helpers.make_date(event.timestamp)
-            filepath = Path(event.parsed.path)
-            split_url = url_lower.rsplit(".", 1)
-            url_stem = split_url[0]
-            filename = f"{timestamp}_{self.helpers.tagify(url_stem)}"
-            if len(split_url) == 2:
-                filename = f"{filename}.{split_url[-1]}"
-            file_destination = self.download_dir / filename
-            base_url = f"{event.parsed.scheme}://{event.parsed.netloc}"
-            result = await self.helpers.download(
-                event.data, warn=False, filename=file_destination, max_size=self.max_filesize
-            )
-            if result:
-                self.info(f'Found "{filepath.name}" at "{base_url}", downloaded to {file_destination}')
-                self.files_downloaded += 1
+        if event.type == "URL_UNVERIFIED":
+            url_lower = event.data.lower()
+            if any(url_lower.endswith(f".{e}") for e in self.extensions):
+                await self.download_file(event.data)
+        elif event.type == "HTTP_RESPONSE":
+            content_type = event.data["header"].get("content_type", "")
+            if content_type:
+                url = event.data["url"]
+                await self.download_file(url, content_type=content_type)

+    async def download_file(self, url, content_type=None):
+        orig_filename, file_destination, base_url = self.make_filename(url, content_type=content_type)
+        if orig_filename is None:
+            return
+        result = await self.helpers.download(url, warn=False, filename=file_destination, max_size=self.max_filesize)
+        if result:
+            self.info(f'Found "{orig_filename}" at "{base_url}", downloaded to {file_destination}')
+            self.files_downloaded.add(hash(url))

+    def make_filename(self, url, content_type=None):
+        # first, try to determine original filename
+        parsed_url = self.helpers.urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        url_path = parsed_url.path.strip("/")
+        if not url_path:
+            url_path = "unknown"
+        # try to get extension from URL path
+        extension = Path(url_path).suffix.strip(".").lower()
+        if extension:
+            url_stem = url.rsplit(".", 1)[0]
+        else:
+            url_stem = str(url)
+        filename = f"{self.helpers.make_date()}_{self.helpers.tagify(url_stem)}"
+        # if that fails, try to get it from content type
+        if not extension:
+            if content_type and content_type in self.mime_db:
+                extension = self.mime_db[content_type]
+            else:
+                self.debug(f'Extension "{extension}" at url "{url}" not in list of watched extensions.')
+                return None, None, None

+        orig_filename = Path(url_path).stem
+        if extension:
+            filename = f"{filename}.{extension}"
+            orig_filename = f"{orig_filename}.{extension}"
+        return orig_filename, self.download_dir / filename, base_url

     async def report(self):
-        if self.files_downloaded > 0:
-            self.success(f"Downloaded {self.files_downloaded:,} file(s) to {self.download_dir}")
+        if self.files_downloaded:
+            self.success(f"Downloaded {len(self.files_downloaded):,} file(s) to {self.download_dir}")
48 changes: 41 additions & 7 deletions bbot/test/test_step_2/module_tests/test_module_filedownload.py
@@ -2,17 +2,51 @@


 class TestFileDownload(ModuleTestBase):
-    targets = ["http://127.0.0.1:8888/"]
-    modules_overrides = ["filedownload", "httpx", "excavate"]
+    targets = ["http://127.0.0.1:8888/Test_PDF"]
+    modules_overrides = ["filedownload", "httpx", "excavate", "speculate"]

+    pdf_data = """%PDF-1.
+1 0 obj<</Pages 2 0 R>>endobj
+2 0 obj<</Kids[3 0 R]/Count 1>>endobj
+3 0 obj<</Parent 2 0 R>>endobj
+trailer <</Root 1 0 R>>"""

+    async def setup_before_prep(self, module_test):
+        module_test.httpx_mock.add_response(
+            url="https://raw.githubusercontent.com/jshttp/mime-db/master/db.json",
+            json={
+                "application/pdf": {"source": "iana", "compressible": False, "extensions": ["pdf"]},
+            },
+        )

     async def setup_after_prep(self, module_test):
-        module_test.set_expect_requests(dict(uri="/"), dict(response_data='<a href="/Test_File.txt"/>'))
-        module_test.set_expect_requests(dict(uri="/Test_File.txt"), dict(response_data="juicy stuff"))
+        module_test.set_expect_requests(
+            dict(uri="/"), dict(response_data='<a href="/Test_File.txt"/><a href="/Test_PDF"/>')
+        )
+        module_test.set_expect_requests(
+            dict(uri="/Test_File.txt"),
+            dict(
+                response_data="juicy stuff",
+            ),
+        )
+        module_test.set_expect_requests(
+            dict(uri="/Test_PDF"),
+            dict(response_data=self.pdf_data, headers={"Content-Type": "application/pdf"}),
+        )

     def check(self, module_test, events):
         download_dir = module_test.scan.home / "filedownload"
-        files = list(download_dir.glob("*_test-file.txt"))
-        assert len(files) == 1, f"No file found at {download_dir}"
-        file = files[0]
+
+        # text file
+        text_files = list(download_dir.glob("*test-file.txt"))
+        assert len(text_files) == 1, f"No text file found at {download_dir}"
+        file = text_files[0]
         assert file.is_file(), f"File not found at {file}"
         assert open(file).read() == "juicy stuff", f"File at {file} does not contain the correct content"
+
+        # PDF file (no extension)
+        pdf_files = list(download_dir.glob("*test-pdf.pdf"))
+        assert len(pdf_files) == 1, f"No PDF file found at {download_dir}"
+        file = pdf_files[0]
+        assert file.is_file(), f"File not found at {file}"
+        assert open(file).read() == self.pdf_data, f"File at {file} does not contain the correct content"
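The new /Test_PDF expectation exercises the HTTP_RESPONSE path end to end: the mock server returns the PDF bytes with a Content-Type header but no extension in the URL, and the module is expected to save the file as *test-pdf.pdf. The dispatch it relies on reads just two fields from the event data; a minimal sketch with a plain dict standing in for a BBOT HTTP_RESPONSE event (the lower-cased content_type header key follows what handle_event reads in this commit):

    # A plain dict standing in for the event.data of a BBOT HTTP_RESPONSE event.
    http_response_data = {
        "url": "http://127.0.0.1:8888/Test_PDF",
        "header": {"content_type": "application/pdf"},
    }

    def wants_download(event_type, data, watched_extensions=("pdf", "txt")):
        # Mirror handle_event: URL suffix check for URL_UNVERIFIED, Content-Type check for HTTP_RESPONSE.
        if event_type == "URL_UNVERIFIED":
            return any(data.lower().endswith(f".{e}") for e in watched_extensions)
        if event_type == "HTTP_RESPONSE":
            return bool(data["header"].get("content_type", ""))
        return False

    print(wants_download("HTTP_RESPONSE", http_response_data))  # True -> download_file(url, content_type=...)
    print(wants_download("URL_UNVERIFIED", "http://127.0.0.1:8888/"))  # False -> no watched extension in the URL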
