From 8e9ede2464d9539eff2993f8e3df2605c089b826 Mon Sep 17 00:00:00 2001 From: TheTechromancer Date: Thu, 21 Sep 2023 11:54:55 -0400 Subject: [PATCH] improve ssl disablement, filedownload improvements --- bbot/core/helpers/web.py | 111 ++++++++++++++++++++++------------- bbot/modules/filedownload.py | 25 ++++++-- bbot/modules/sslcert.py | 8 +-- 3 files changed, 92 insertions(+), 52 deletions(-) diff --git a/bbot/core/helpers/web.py b/bbot/core/helpers/web.py index ff564b21c..f86fc0c5f 100644 --- a/bbot/core/helpers/web.py +++ b/bbot/core/helpers/web.py @@ -7,6 +7,7 @@ import traceback from pathlib import Path from bs4 import BeautifulSoup +from contextlib import asynccontextmanager from httpx._models import Cookies @@ -137,7 +138,11 @@ def __init__(self, parent_helper): def AsyncClient(self, *args, **kwargs): kwargs["_bbot_scan"] = self.parent_helper.scan retries = kwargs.pop("retries", self.parent_helper.config.get("http_retries", 1)) - kwargs["transport"] = httpx.AsyncHTTPTransport(retries=retries, verify=self.ssl_verify) + transport = httpx.AsyncHTTPTransport(retries=retries, verify=self.ssl_verify) + if not self.ssl_verify: + # if we don't want to verify cert validity, we REALLY don't want to verify. 
+ transport._pool._ssl_context = self.ssl_context_noverify() + kwargs["transport"] = transport kwargs["verify"] = self.ssl_verify return BBOTAsyncClient(*args, **kwargs) @@ -216,7 +221,7 @@ async def request(self, *args, **kwargs): if client_kwargs: client = self.AsyncClient(**client_kwargs) - try: + async with self._acatch(url, raise_error): if self.http_debug: logstr = f"Web request: {str(args)}, {str(kwargs)}" log.debug(logstr) @@ -226,41 +231,6 @@ async def request(self, *args, **kwargs): f"Web response from {url}: {response} (Length: {len(response.content)}) headers: {response.headers}" ) return response - except httpx.PoolTimeout: - # this block exists because of this: - # https://github.com/encode/httpcore/discussions/783 - log.verbose(f"PoolTimeout to URL: {url}") - self.web_client = self.AsyncClient(persist_cookies=False) - return await self.request(*args, **kwargs) - except httpx.TimeoutException: - log.verbose(f"HTTP timeout to URL: {url}") - if raise_error: - raise - except httpx.ConnectError: - log.verbose(f"HTTP connect failed to URL: {url}") - if raise_error: - raise - except httpx.RequestError as e: - log.trace(f"Error with request to URL: {url}: {e}") - log.trace(traceback.format_exc()) - if raise_error: - raise - except ssl.SSLError as e: - msg = f"SSL error with request to URL: {url}: {e}" - log.trace(msg) - log.trace(traceback.format_exc()) - if raise_error: - raise httpx.RequestError(msg) - except anyio.EndOfStream as e: - msg = f"AnyIO error with request to URL: {url}: {e}" - log.trace(msg) - log.trace(traceback.format_exc()) - if raise_error: - raise httpx.RequestError(msg) - except BaseException as e: - log.trace(f"Unhandled exception with request to URL: {url}: {e}") - log.trace(traceback.format_exc()) - raise async def download(self, url, **kwargs): """ @@ -276,6 +246,7 @@ async def download(self, url, **kwargs): cache_hrs (float, optional): The number of hours to cache the downloaded file. A negative value disables caching. 
Defaults to -1. method (str, optional): The HTTP method to use for the request, defaults to 'GET'. + raise_error (bool, optional): Whether to raise exceptions for HTTP connect, timeout errors. Defaults to False. **kwargs: Additional keyword arguments to pass to the httpx request. Returns: @@ -286,23 +257,27 @@ """ success = False filename = kwargs.pop("filename", self.parent_helper.cache_filename(url)) + follow_redirects = kwargs.pop("follow_redirects", True) max_size = kwargs.pop("max_size", None) + warn = kwargs.pop("warn", True) + raise_error = kwargs.pop("raise_error", False) if max_size is not None: max_size = self.parent_helper.human_to_bytes(max_size) cache_hrs = float(kwargs.pop("cache_hrs", -1)) total_size = 0 chunk_size = 8192 - log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}") + log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}") if cache_hrs > 0 and self.parent_helper.is_cached(url): log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}") success = True else: # kwargs["raise_error"] = True # kwargs["stream"] = True + kwargs["follow_redirects"] = follow_redirects if not "method" in kwargs: kwargs["method"] = "GET" try: - async with self.AsyncClient().stream(url=url, **kwargs) as response: + async with self._acatch(url, raise_error), self.AsyncClient().stream(url=url, **kwargs) as response: status_code = getattr(response, "status_code", 0) log.debug(f"Download result: HTTP {status_code}") if status_code != 0: @@ -320,7 +295,10 @@ f.write(chunk) success = True except httpx.HTTPError as e: - log.warning(f"Failed to download {url}: {e}") + log_fn = log.verbose + if warn: + log_fn = log.warning + log_fn(f"Failed to download {url}: {e}") return if success: @@ -588,6 +566,59 @@ def is_spider_danger(self, source_event, url): return True return False + def ssl_context_noverify(self): + ssl_context = 
ssl.create_default_context() + ssl_context.check_hostname = False + ssl_context.verify_mode = ssl.CERT_NONE + ssl_context.options &= ~ssl.OP_NO_SSLv2 & ~ssl.OP_NO_SSLv3 + ssl_context.set_ciphers("ALL:@SECLEVEL=0") + ssl_context.options |= 0x4 # Add the OP_LEGACY_SERVER_CONNECT option + return ssl_context + + @asynccontextmanager + async def _acatch(self, url, raise_error): + """ + Asynchronous context manager to handle various httpx errors during a request. + + Yields: + None + + Note: + This function is internal and should generally not be used directly. + `url`, `args`, `kwargs`, and `raise_error` should be in the same context as this function. + """ + try: + yield + except httpx.TimeoutException: + log.verbose(f"HTTP timeout to URL: {url}") + if raise_error: + raise + except httpx.ConnectError: + log.verbose(f"HTTP connect failed to URL: {url}") + if raise_error: + raise + except httpx.RequestError as e: + log.trace(f"Error with request to URL: {url}: {e}") + log.trace(traceback.format_exc()) + if raise_error: + raise + except ssl.SSLError as e: + msg = f"SSL error with request to URL: {url}: {e}" + log.trace(msg) + log.trace(traceback.format_exc()) + if raise_error: + raise httpx.RequestError(msg) + except anyio.EndOfStream as e: + msg = f"AnyIO error with request to URL: {url}: {e}" + log.trace(msg) + log.trace(traceback.format_exc()) + if raise_error: + raise httpx.RequestError(msg) + except BaseException as e: + log.trace(f"Unhandled exception with request to URL: {url}: {e}") + log.trace(traceback.format_exc()) + raise + user_keywords = [re.compile(r, re.I) for r in ["user", "login", "email"]] pass_keywords = [re.compile(r, re.I) for r in ["pass"]] diff --git a/bbot/modules/filedownload.py b/bbot/modules/filedownload.py index fb3631c4c..6ac7868dd 100644 --- a/bbot/modules/filedownload.py +++ b/bbot/modules/filedownload.py @@ -87,6 +87,15 @@ async def setup(self): self.download_dir = self.scan.home / "filedownload" self.helpers.mkdir(self.download_dir) 
self.files_downloaded = 0 + self.seen = set() + # https://raw.githubusercontent.com/jshttp/mime-db/master/db.json + return True + + async def filter_event(self, event): + h = hash(event.data) + if h in self.seen: + return False, f"Already processed {event}" + self.seen.add(h) return True async def handle_event(self, event): @@ -94,13 +103,19 @@ if any(url_lower.endswith(f".{e}") for e in self.extensions): timestamp = self.helpers.make_date(event.timestamp) filepath = Path(event.parsed.path) - filename_stem = self.helpers.tagify(filepath.stem) - filename = f"{timestamp}_{filename_stem}{filepath.suffix}" + split_url = url_lower.rsplit(".", 1) + url_stem = split_url[0] + filename = f"{timestamp}_{self.helpers.tagify(url_stem)}" + if len(split_url) == 2: + filename = f"{filename}.{split_url[-1]}" file_destination = self.download_dir / filename base_url = f"{event.parsed.scheme}://{event.parsed.netloc}" - self.info(f'Found "{filepath.name}" at "{base_url}", downloading to {file_destination}') - await self.helpers.download(event.data, filename=file_destination, max_size=self.max_filesize) - self.files_downloaded += 1 + result = await self.helpers.download( + event.data, warn=False, filename=file_destination, max_size=self.max_filesize + ) + if result: + self.info(f'Found "{filepath.name}" at "{base_url}", downloaded to {file_destination}') + self.files_downloaded += 1 async def report(self): if self.files_downloaded > 0: diff --git a/bbot/modules/sslcert.py b/bbot/modules/sslcert.py index a9f269c55..de598dd73 100644 --- a/bbot/modules/sslcert.py +++ b/bbot/modules/sslcert.py @@ -1,4 +1,3 @@ -import ssl import asyncio from OpenSSL import crypto from contextlib import suppress @@ -109,12 +108,7 @@ async def visit_host(self, host, port): # Create an SSL context try: - ssl_context = ssl.create_default_context() - ssl_context.check_hostname = False - ssl_context.verify_mode = ssl.CERT_NONE - ssl_context.options &= ~ssl.OP_NO_SSLv2 & 
~ssl.OP_NO_SSLv3 - ssl_context.set_ciphers("ALL:@SECLEVEL=0") - ssl_context.options |= 0x4 # Add the OP_LEGACY_SERVER_CONNECT option + ssl_context = self.helpers.ssl_context_noverify() except Exception as e: self.warning(f"Error creating SSL context: {e}") return [], [], (host, port)