Skip to content

Commit 4bc355e

Browse files
Added filedownload module
1 parent cfe76ca commit 4bc355e

File tree

3 files changed

+140
-1
lines changed

3 files changed

+140
-1
lines changed

bbot/core/helpers/web.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ async def download(self, url, **kwargs):
272272
url (str): The URL of the file to download.
273273
filename (str, optional): The filename to save the downloaded file as.
274274
If not provided, will generate based on URL.
275+
max_size (str or int, optional): Maximum filesize as a string ("5MB") or integer in bytes. Downloads that exceed it are truncated. Defaults to None (no limit).
275276
cache_hrs (float, optional): The number of hours to cache the downloaded file.
276277
A negative value disables caching. Defaults to -1.
277278
method (str, optional): The HTTP method to use for the request, defaults to 'GET'.
@@ -285,7 +286,12 @@ async def download(self, url, **kwargs):
285286
"""
286287
success = False
287288
filename = kwargs.pop("filename", self.parent_helper.cache_filename(url))
289+
max_size = kwargs.pop("max_size", None)
290+
if max_size is not None:
291+
max_size = self.parent_helper.human_to_bytes(max_size)
288292
cache_hrs = float(kwargs.pop("cache_hrs", -1))
293+
total_size = 0
294+
chunk_size = 8192
289295
log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
290296
if cache_hrs > 0 and self.parent_helper.is_cached(url):
291297
log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
@@ -302,7 +308,15 @@ async def download(self, url, **kwargs):
302308
if status_code != 0:
303309
response.raise_for_status()
304310
with open(filename, "wb") as f:
305-
async for chunk in response.aiter_bytes(chunk_size=8192):
311+
agen = response.aiter_bytes(chunk_size=chunk_size)
312+
async for chunk in agen:
313+
if max_size is not None and total_size + chunk_size > max_size:
314+
log.verbose(
315+
f"Filesize of {url} exceeds {self.parent_helper.bytes_to_human(max_size)}, file will be truncated"
316+
)
317+
agen.aclose()
318+
break
319+
total_size += chunk_size
306320
f.write(chunk)
307321
success = True
308322
except httpx.HTTPError as e:

bbot/modules/filedownload.py

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
from pathlib import Path
2+
3+
from bbot.modules.base import BaseModule
4+
5+
6+
class filedownload(BaseModule):
    """
    Watch for common filetypes and download them

    Every URL_UNVERIFIED event whose URL ends in one of the configured
    extensions is downloaded into a "filedownload" directory under the
    scan's home directory. A summary of how many files were saved is
    emitted from report().
    """

    watched_events = ["URL_UNVERIFIED"]
    produced_events = []
    flags = ["active", "safe"]
    meta = {"description": "Download common filetypes such as PDF, DOCX, PPTX, etc."}
    options = {
        "extensions": [
            "bak",  #  Backup File
            "bash",  #  Bash Script or Configuration
            "bashrc",  #  Bash Script or Configuration
            "conf",  #  Configuration File
            "cfg",  #  Configuration File
            "cr2",  #  Canon RAW Image
            "crt",  #  Certificate File
            "crw",  #  Canon RAW Image (Older Format)
            "csv",  #  Comma Separated Values File
            "db",  #  SQLite Database File
            "sqlite",  #  SQLite Database File
            "doc",  #  Microsoft Word Document (Old Format)
            "docx",  #  Microsoft Word Document
            "ica",  #  Citrix Independent Computing Architecture File
            "indd",  #  Adobe InDesign Document
            "ini",  #  Initialization File
            "jar",  #  Java Archive
            "jpg",  #  JPEG Image
            "jpeg",  #  JPEG Image
            "js",  #  JavaScript File
            "json",  #  JavaScript Object Notation File
            "key",  #  Private Key File
            "pub",  #  Public Key File
            "log",  #  Log File
            "md",  #  Markdown File
            "markdown",  #  Markdown File
            "odg",  #  OpenDocument Graphics (LibreOffice, OpenOffice)
            "odp",  #  OpenDocument Presentation (LibreOffice, OpenOffice)
            "ods",  #  OpenDocument Spreadsheet (LibreOffice, OpenOffice)
            "odt",  #  OpenDocument Text (LibreOffice, OpenOffice)
            "pdf",  #  Adobe Portable Document Format
            "pem",  #  Privacy Enhanced Mail (SSL certificate)
            "png",  #  Portable Network Graphics Image
            "pps",  #  Microsoft PowerPoint Slideshow (Old Format)
            "ppsx",  #  Microsoft PowerPoint Slideshow
            "ppt",  #  Microsoft PowerPoint Presentation (Old Format)
            "pptx",  #  Microsoft PowerPoint Presentation
            "ps1",  #  PowerShell Script
            "raw",  #  Raw Image File Format
            "rdp",  #  Remote Desktop Protocol File
            "sh",  #  Shell Script
            "sql",  #  SQL Database Dump
            "svg",  #  Scalable Vector Graphics
            "svgz",  #  Compressed SVG
            "swp",  #  Swap File (temporary file, often Vim)
            "sxw",  #  OpenOffice.org Writer document
            "tar",  #  Tar Archive
            "tar.gz",  #  Gzip-Compressed Tar Archive
            "zip",  #  Zip Archive
            "txt",  #  Plain Text Document
            "vbs",  #  Visual Basic Script
            "wpd",  #  WordPerfect Document
            "xls",  #  Microsoft Excel Spreadsheet (Old Format)
            "xlsx",  #  Microsoft Excel Spreadsheet
            "xml",  #  eXtensible Markup Language File
            "yml",  #  YAML Ain't Markup Language
            "yaml",  #  YAML Ain't Markup Language
        ],
        "max_filesize": "10MB",
    }
    options_desc = {
        "extensions": "File extensions to download",
        "max_filesize": "Cancel download if filesize is greater than this size",
    }

    scope_distance_modifier = 1

    async def setup(self):
        """
        Normalize the configured options and create the download directory.

        Extensions are lowercased, stripped of leading/trailing dots and
        de-duplicated. Always returns True.
        """
        configured = self.options.get("extensions", [])
        self.extensions = list({e.lower().strip(".") for e in configured})
        self.max_filesize = self.options.get("max_filesize", "10MB")
        self.download_dir = self.scan.home / "filedownload"
        self.helpers.mkdir(self.download_dir)
        self.files_downloaded = 0
        return True

    async def handle_event(self, event):
        """
        Download the event's URL if it ends in one of the watched extensions.

        The destination filename is "<timestamp>_<tagified stem><suffix>"
        inside the download directory; the size limit is passed to the
        download helper as max_size.
        """
        url_lower = event.data.lower()
        if not any(url_lower.endswith(f".{e}") for e in self.extensions):
            return

        timestamp = self.helpers.make_date(event.timestamp)
        filepath = Path(event.parsed.path)
        filename_stem = self.helpers.tagify(filepath.stem)
        filename = f"{timestamp}_{filename_stem}{filepath.suffix}"
        file_destination = self.download_dir / filename
        base_url = f"{event.parsed.scheme}://{event.parsed.netloc}"
        self.info(f'Found "{filepath.name}" at "{base_url}", downloading to {file_destination}')
        result = await self.helpers.download(event.data, filename=file_destination, max_size=self.max_filesize)
        # The download helper catches HTTP errors instead of raising, so a
        # failed request must not be counted as a downloaded file.
        # NOTE(review): assumes download() returns a truthy path on success
        # and a falsy value (None) on failure - confirm against the helper.
        if result:
            self.files_downloaded += 1

    async def report(self):
        """Emit a success message with the number of files saved, if any."""
        if self.files_downloaded > 0:
            self.success(f"Downloaded {self.files_downloaded:,} file(s) to {self.download_dir}")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from .base import ModuleTestBase
2+
3+
4+
class TestFileDownload(ModuleTestBase):
    """Check that the filedownload module saves a linked .txt file to disk."""

    targets = ["http://127.0.0.1:8888/"]
    modules_overrides = ["filedownload", "httpx", "excavate"]

    async def setup_after_prep(self, module_test):
        # The index page links to the file; the file itself returns a known body.
        module_test.set_expect_requests(dict(uri="/"), dict(response_data='<a href="/Test_File.txt"/>'))
        module_test.set_expect_requests(dict(uri="/Test_File.txt"), dict(response_data="juicy stuff"))

    def check(self, module_test, events):
        download_dir = module_test.scan.home / "filedownload"
        # The module prefixes a timestamp and tagifies the stem, so
        # "Test_File.txt" is expected on disk as "<timestamp>_test-file.txt".
        files = list(download_dir.glob("*_test-file.txt"))
        assert len(files) == 1, f"Expected exactly 1 downloaded file in {download_dir}, found {len(files)}"
        downloaded = files[0]
        assert downloaded.is_file(), f"File not found at {downloaded}"
        # read_text() closes the handle; a bare open(...).read() leaks it.
        assert (
            downloaded.read_text() == "juicy stuff"
        ), f"File at {downloaded} does not contain the correct content"

0 commit comments

Comments
 (0)