From 25e971c030cad206e06c254879f6cd4a73573835 Mon Sep 17 00:00:00 2001
From: TheTechromancer
Date: Fri, 3 May 2024 10:49:18 -0400
Subject: [PATCH] more WIP web engine

---
 bbot/core/helpers/web/engine.py   | 84 ++++++++++++++++++++++++++++---
 bbot/core/helpers/web/web.py      | 62 ++++++-----------------
 bbot/test/test_step_1/test_web.py | 24 +++++++--
 3 files changed, 114 insertions(+), 56 deletions(-)

diff --git a/bbot/core/helpers/web/engine.py b/bbot/core/helpers/web/engine.py
index 9e30dbb8b..3a13bbb2d 100644
--- a/bbot/core/helpers/web/engine.py
+++ b/bbot/core/helpers/web/engine.py
@@ -1,9 +1,17 @@
 import ssl
+import anyio
 import httpx
+import asyncio
+import logging
+import traceback
 from httpx._models import Cookies
+from socksio.exceptions import SOCKSError
 from contextlib import asynccontextmanager
 
 from bbot.core.engine import EngineServer
+from bbot.core.helpers.misc import bytes_to_human, human_to_bytes, get_exception_chain
+
+log = logging.getLogger("bbot.core.helpers.web.engine")
 
 
 class DummyCookies(Cookies):
@@ -31,7 +39,6 @@ class BBOTAsyncClient(httpx.AsyncClient):
 
     def __init__(self, *args, **kwargs):
         self._config = kwargs.pop("_config")
-        web_requests_per_second = self._config.get("web_requests_per_second", 100)
 
         http_debug = self._config.get("http_debug", None)
         if http_debug:
@@ -84,6 +91,7 @@ class HTTPEngine(EngineServer):
         0: "request",
         1: "request_batch",
         2: "request_custom_batch",
+        3: "download",
         99: "_mock",
     }
 
@@ -145,14 +153,80 @@ async def request(self, *args, **kwargs):
         async with self._acatch(url, raise_error):
             if self.http_debug:
                 logstr = f"Web request: {str(args)}, {str(kwargs)}"
-                log.trace(logstr)
+                self.log.trace(logstr)
             response = await client.request(*args, **kwargs)
             if self.http_debug:
-                log.trace(
+                self.log.trace(
                     f"Web response from {url}: {response} (Length: {len(response.content)}) headers: {response.headers}"
                 )
             return response
 
+    async def request_batch(self, urls, *args, threads=10, **kwargs):
+        tasks = {}
+
+        def new_task(url):
+            task = asyncio.create_task(self.request(url, *args, **kwargs))
+            tasks[task] = url
+
+        urls = list(urls)
+        for _ in range(threads):  # Start initial batch of tasks
+            if urls:  # Ensure there are URLs to process
+                new_task(urls.pop(0))
+
+        while tasks:  # While there are tasks pending
+            # Wait for the first task to complete
+            done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+
+            for task in done:
+                results = task.result()
+                url = tasks.pop(task)
+
+                if results:
+                    yield (url, results)
+
+                if urls:  # Start a new task for each one completed, if URLs remain
+                    new_task(urls.pop(0))
+
+    async def download(self, url, **kwargs):
+        follow_redirects = kwargs.pop("follow_redirects", True)
+        filename = kwargs.pop("filename")
+        max_size = kwargs.pop("max_size", None)
+        warn = kwargs.pop("warn", True)
+        raise_error = kwargs.pop("raise_error", False)
+        if max_size is not None:
+            max_size = human_to_bytes(max_size)
+        kwargs["follow_redirects"] = follow_redirects
+        if "method" not in kwargs:
+            kwargs["method"] = "GET"
+        try:
+            total_size = 0
+            chunk_size = 8192
+
+            async with self._acatch(url, raise_error=True), self.web_client.stream(url=url, **kwargs) as response:
+                status_code = getattr(response, "status_code", 0)
+                self.log.debug(f"Download result: HTTP {status_code}")
+                if status_code != 0:
+                    response.raise_for_status()
+                with open(filename, "wb") as f:
+                    agen = response.aiter_bytes(chunk_size=chunk_size)
+                    async for chunk in agen:
+                        if max_size is not None and total_size + chunk_size > max_size:
+                            self.log.verbose(
+                                f"Filesize of {url} exceeds {bytes_to_human(max_size)}, file will be truncated"
f"Filesize of {url} exceeds {bytes_to_human(max_size)}, file will be truncated" + ) + agen.aclose() + break + total_size += chunk_size + f.write(chunk) + return True + except httpx.HTTPError as e: + log_fn = self.log.verbose + if warn: + log_fn = self.log.warning + log_fn(f"Failed to download {url}: {e}") + if raise_error: + raise + def ssl_context_noverify(self): if self._ssl_context_noverify is None: ssl_context = ssl.create_default_context() @@ -217,9 +291,7 @@ async def _acatch(self, url, raise_error): log.trace(traceback.format_exc()) except BaseException as e: # don't log if the error is the result of an intentional cancellation - if not any( - isinstance(_e, asyncio.exceptions.CancelledError) for _e in self.parent_helper.get_exception_chain(e) - ): + if not any(isinstance(_e, asyncio.exceptions.CancelledError) for _e in get_exception_chain(e)): log.trace(f"Unhandled exception with request to URL: {url}: {e}") log.trace(traceback.format_exc()) raise diff --git a/bbot/core/helpers/web/web.py b/bbot/core/helpers/web/web.py index 6d44cca61..09bc3b581 100644 --- a/bbot/core/helpers/web/web.py +++ b/bbot/core/helpers/web/web.py @@ -1,18 +1,12 @@ import re -import anyio -import httpx -import asyncio import logging import warnings import traceback from pathlib import Path from bs4 import BeautifulSoup -from socksio.exceptions import SOCKSError - from bbot.core.engine import EngineClient from bbot.errors import WordlistError, CurlError -from bbot.core.helpers.ratelimiter import RateLimiter from bs4 import MarkupResemblesLocatorWarning from bs4.builder import XMLParsedAsHTMLWarning @@ -101,9 +95,16 @@ async def request(self, *args, **kwargs): Note: If the web request fails, it will return None unless `raise_error` is `True`. """ - self.log.critical(f"CLIENT {args} / {kwargs}") return await self.run_and_return("request", *args, **kwargs) + async def request_batch(self, urls, *args, **kwargs): + async for _ in self.run_and_yield("request_batch", urls, *args, **kwargs): + yield _ + + async def request_custom_batch(self, urls_and_args): + async for _ in self.run_and_yield("request_custom_batch", urls_and_args): + yield _ + async def download(self, url, **kwargs): """ Asynchronous function for downloading files from a given URL. 
@@ -129,56 +132,21 @@ async def download(self, url, **kwargs):
         """
         success = False
         filename = kwargs.pop("filename", self.parent_helper.cache_filename(url))
-        follow_redirects = kwargs.pop("follow_redirects", True)
+        filename = Path(filename).resolve()
+        kwargs["filename"] = filename
         max_size = kwargs.pop("max_size", None)
-        warn = kwargs.pop("warn", True)
-        raise_error = kwargs.pop("raise_error", False)
         if max_size is not None:
             max_size = self.parent_helper.human_to_bytes(max_size)
+        kwargs["max_size"] = max_size
         cache_hrs = float(kwargs.pop("cache_hrs", -1))
-        total_size = 0
-        chunk_size = 8192
-
         log.debug(f"Downloading file from {url} with cache_hrs={cache_hrs}")
         if cache_hrs > 0 and self.parent_helper.is_cached(url):
             log.debug(f"{url} is cached at {self.parent_helper.cache_filename(url)}")
             success = True
         else:
-            # kwargs["raise_error"] = True
-            # kwargs["stream"] = True
-            kwargs["follow_redirects"] = follow_redirects
-            if not "method" in kwargs:
-                kwargs["method"] = "GET"
-            try:
-                async with self._acatch(url, raise_error=True), self.AsyncClient().stream(
-                    url=url, **kwargs
-                ) as response:
-                    status_code = getattr(response, "status_code", 0)
-                    log.debug(f"Download result: HTTP {status_code}")
-                    if status_code != 0:
-                        response.raise_for_status()
-                    with open(filename, "wb") as f:
-                        agen = response.aiter_bytes(chunk_size=chunk_size)
-                        async for chunk in agen:
-                            if max_size is not None and total_size + chunk_size > max_size:
-                                log.verbose(
-                                    f"Filesize of {url} exceeds {self.parent_helper.bytes_to_human(max_size)}, file will be truncated"
-                                )
-                                agen.aclose()
-                                break
-                            total_size += chunk_size
-                            f.write(chunk)
-                    success = True
-            except httpx.HTTPError as e:
-                log_fn = log.verbose
-                if warn:
-                    log_fn = log.warning
-                log_fn(f"Failed to download {url}: {e}")
-                if raise_error:
-                    raise
-            return
+            success = await self.run_and_return("download", url, **kwargs)
         if success:
-            return filename.resolve()
+            return filename
 
     async def wordlist(self, path, lines=None, **kwargs):
         """
diff --git a/bbot/test/test_step_1/test_web.py b/bbot/test/test_step_1/test_web.py
index dc9116e0f..aeac2ba2f 100644
--- a/bbot/test/test_step_1/test_web.py
+++ b/bbot/test/test_step_1/test_web.py
@@ -4,10 +4,28 @@
 
 
 @pytest.mark.asyncio
-async def test_web_engine(bbot_scanner):
+async def test_web_engine(bbot_scanner, bbot_httpserver):
+
+    url = bbot_httpserver.url_for("/test")
+    bbot_httpserver.expect_request(uri="/test").respond_with_data("hello_there")
+
     scan = bbot_scanner()
+
+    # request
+    response = await scan.helpers.request(url)
+    assert response.status_code > 0
+    assert response.text == "hello_there"
+
+    # request_batch
+    responses = [r async for r in scan.helpers.request_batch([url] * 100)]
+    assert len(responses) == 100
+    assert all([r[0] == url for r in responses])
+    assert all([r[1].status_code > 0 and r[1].text == "hello_there" for r in responses])
+
+    # download
+    filename = await scan.helpers.download(url)
+    file_content = open(filename).read()
+    assert file_content == "hello_there"
 
 
 @pytest.mark.asyncio
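
Usage sketch (not part of the patch): how the new engine-backed helpers are
expected to be called from module code, inferred from the test above. The
module context (`handle_event`, `self.helpers`) follows the usual BBOT module
API; the URLs and the `max_size` value are illustrative assumptions.

    # hypothetical BBOT module method
    async def handle_event(self, event):
        urls = ["http://example.com/1", "http://example.com/2"]
        # request_batch yields (url, response) tuples in completion order
        async for url, response in self.helpers.request_batch(urls):
            self.info(f"{url}: HTTP {response.status_code}")
        # download returns the local filename on success, or None on failure;
        # files larger than max_size are truncated at that size
        filename = await self.helpers.download("http://example.com/file.bin", max_size="2MB")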