Skip to content

Commit

Permalink
Offload wayback URL parsing into separate process
Browse files Browse the repository at this point in the history
  • Loading branch information
TheTechromancer committed Oct 31, 2023
1 parent cb76846 commit dbe9c34
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 2 deletions.
1 change: 1 addition & 0 deletions bbot/core/helpers/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ def collapse_urls(urls, threshold=10):
["http://evilcorp.com/user/11111/info"]
"""
log.verbose(f"Collapsing {len(urls):,} URLs")
url_hashes = {}
for url in urls:
try:
Expand Down
25 changes: 24 additions & 1 deletion bbot/modules/wayback.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from datetime import datetime

from bbot.modules.templates.subdomain_enum import subdomain_enum


Expand Down Expand Up @@ -49,8 +51,19 @@ async def query(self, query):
except KeyError:
continue

self.verbose(f"Found {len(urls):,} URLs for {query}")

dns_names = set()
for parsed_url in self.helpers.validators.collapse_urls(urls, threshold=self.garbage_threshold):
collapsed_urls = 0
start_time = datetime.now()
parsed_urls = await self.scan.run_in_executor_mp(
self.execute_callback,
self.helpers.validators.collapse_urls,
urls,
threshold=self.garbage_threshold,
)
for parsed_url in parsed_urls:
collapsed_urls += 1
if not self.urls:
dns_name = parsed_url.hostname
h = hash(dns_name)
Expand All @@ -59,4 +72,14 @@ async def query(self, query):
results.add((dns_name, "DNS_NAME"))
else:
results.add((parsed_url.geturl(), "URL_UNVERIFIED"))
end_time = datetime.now()
duration = self.helpers.human_timedelta(end_time - start_time)
self.verbose(f"Collapsed {len(urls):,} -> {collapsed_urls:,} URLs in {duration}")
return results

@staticmethod
def execute_callback(callback, *args, **kwargs):
    """
    Call `callback` with the supplied arguments and return its output as a list.

    This exists so that we can run our URL parsing logic in a separate process.

    Args:
        callback: Callable to invoke; it must return something iterable.
        *args: Positional arguments forwarded to `callback` unchanged.
        **kwargs: Keyword arguments forwarded to `callback` unchanged.

    Returns:
        list: Every item produced by `callback(*args, **kwargs)`, in order.
    """
    produced = callback(*args, **kwargs)
    # Consume the result fully before returning -- presumably so that a lazy
    # iterable (e.g. a generator) is turned into concrete data that can be
    # handed back from the worker process. NOTE(review): confirm intent.
    return list(produced)
2 changes: 1 addition & 1 deletion bbot/scanner/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def __init__(
mp.set_start_method("spawn")
except Exception:
self.warning(f"Failed to set multiprocessing spawn method. This may negatively affect performance.")
self.process_pool = ProcessPoolExecutor()
self.process_pool = ProcessPoolExecutor(max_tasks_per_child=10)

self._stopping = False

Expand Down

0 comments on commit dbe9c34

Please sign in to comment.