Merge pull request #1278 from blacklanternsecurity/faster-regexes
Faster Regexes
TheTechromancer authored Apr 26, 2024
2 parents ecad649 + 6feb345 commit fad1afe
Showing 40 changed files with 415 additions and 257 deletions.
8 changes: 5 additions & 3 deletions bbot/core/event/base.py
@@ -8,14 +8,14 @@
from datetime import datetime
from contextlib import suppress
from urllib.parse import urljoin
from radixtarget import RadixTarget
from pydantic import BaseModel, field_validator

from .helpers import *
from bbot.errors import *
from bbot.core.helpers import (
extract_words,
get_file_extension,
host_in_host,
is_domain,
is_subdomain,
is_ip,
@@ -93,7 +93,7 @@ class BaseEvent:
# Always emit this event type even if it's not in scope
_always_emit = False
# Always emit events with these tags even if they're not in scope
_always_emit_tags = ["affiliate"]
_always_emit_tags = ["affiliate", "target"]
# Bypass scope checking and dns resolution, distribute immediately to modules
# This is useful for "end-of-line" events like FINDING and VULNERABILITY
_quick_emit = False
@@ -580,7 +580,9 @@ def __contains__(self, other):
if self.host == other.host:
return True
# hostnames and IPs
return host_in_host(other.host, self.host)
radixtarget = RadixTarget()
radixtarget.insert(self.host)
return bool(radixtarget.search(other.host))
return False

def json(self, mode="json", siem_friendly=False):
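The __contains__ change above swaps the old host_in_host() helper for a radix-tree lookup. A minimal sketch of that pattern, assuming the radixtarget package is installed and using only the insert()/search() calls that appear in the diff (hostnames are illustrative):

```python
# Insert a host into a RadixTarget, then search for another host to see
# whether it falls underneath it (exact match or subdomain).
from radixtarget import RadixTarget

radixtarget = RadixTarget()
radixtarget.insert("evilcorp.com")

print(bool(radixtarget.search("www.evilcorp.com")))  # True  - subdomain
print(bool(radixtarget.search("evilcorp.com")))      # True  - exact match
print(bool(radixtarget.search("example.com")))       # False - unrelated host
```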
18 changes: 9 additions & 9 deletions bbot/core/helpers/dns/dns.py
@@ -2,9 +2,10 @@
import logging
import dns.exception
import dns.asyncresolver
from radixtarget import RadixTarget

from bbot.core.engine import EngineClient
from ..misc import clean_dns_record, is_ip, is_domain, is_dns_name, host_in_host
from ..misc import clean_dns_record, is_ip, is_domain, is_dns_name

from .engine import DNSEngine

@@ -63,10 +64,9 @@ def __init__(self, parent_helper):

# wildcard handling
self.wildcard_disable = self.config.get("dns_wildcard_disable", False)
self.wildcard_ignore = self.config.get("dns_wildcard_ignore", None)
if not self.wildcard_ignore:
self.wildcard_ignore = []
self.wildcard_ignore = tuple([str(d).strip().lower() for d in self.wildcard_ignore])
self.wildcard_ignore = RadixTarget()
for d in self.config.get("dns_wildcard_ignore", []):
self.wildcard_ignore.insert(d)

# copy the system's current resolvers to a text file for tool use
self.system_resolvers = dns.resolver.Resolver().nameservers
@@ -150,10 +150,10 @@ def _wildcard_prevalidation(self, host):
return False

# skip check if the query's parent domain is excluded in the config
for d in self.wildcard_ignore:
if host_in_host(host, d):
log.debug(f"Skipping wildcard detection on {host} because it is excluded in the config")
return False
wildcard_ignore = self.wildcard_ignore.search(host)
if wildcard_ignore:
log.debug(f"Skipping wildcard detection on {host} because {wildcard_ignore} is excluded in the config")
return False

return host

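Sketch of the wildcard-ignore lookup above: domains from dns_wildcard_ignore go into a RadixTarget, and search() returns the covering entry when one exists (the log message interpolates it) or a falsy value otherwise. The config values here are stand-ins:

```python
# Simulate the dns_wildcard_ignore config loaded into a RadixTarget.
from radixtarget import RadixTarget

wildcard_ignore = RadixTarget()
for d in ["evilcorp.com", "evilcorp.net"]:  # stand-in for config.get("dns_wildcard_ignore", [])
    wildcard_ignore.insert(d)

print(wildcard_ignore.search("api.www.evilcorp.com"))  # covering entry -> wildcard check skipped
print(wildcard_ignore.search("example.com"))           # falsy -> wildcard check proceeds
```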
21 changes: 14 additions & 7 deletions bbot/core/helpers/dns/engine.py
@@ -403,7 +403,8 @@ def new_task(query, rdtype):
if queries: # Start a new task for each one completed, if URLs remain
new_task(*queries.pop(0))

def extract_targets(self, record):
@staticmethod
def extract_targets(record):
"""
Extracts hostnames or IP addresses from a given DNS record.
@@ -429,24 +430,30 @@ def extract_targets(self, record):
"""
results = set()

def add_result(rdtype, _record):
cleaned = clean_dns_record(_record)
if cleaned:
results.add((rdtype, cleaned))

rdtype = str(record.rdtype.name).upper()
if rdtype in ("A", "AAAA", "NS", "CNAME", "PTR"):
results.add((rdtype, clean_dns_record(record)))
add_result(rdtype, record)
elif rdtype == "SOA":
results.add((rdtype, clean_dns_record(record.mname)))
add_result(rdtype, record.mname)
elif rdtype == "MX":
results.add((rdtype, clean_dns_record(record.exchange)))
add_result(rdtype, record.exchange)
elif rdtype == "SRV":
results.add((rdtype, clean_dns_record(record.target)))
add_result(rdtype, record.target)
elif rdtype == "TXT":
for s in record.strings:
s = smart_decode(s)
for match in dns_name_regex.finditer(s):
start, end = match.span()
host = s[start:end]
results.add((rdtype, host))
add_result(rdtype, host)
elif rdtype == "NSEC":
results.add((rdtype, clean_dns_record(record.next)))
add_result(rdtype, record.next)
else:
log.warning(f'Unknown DNS record type "{rdtype}"')
return results
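Since extract_targets() is now a @staticmethod, it can be called without a DNSEngine instance. A hedged usage sketch with dnspython rdata objects (the domain is a placeholder; the import path follows this PR's file layout):

```python
# Resolve A records with dnspython and extract (rdtype, host) tuples.
import dns.resolver
from bbot.core.helpers.dns.engine import DNSEngine

answer = dns.resolver.resolve("example.com", "A")
for rdata in answer:
    for rdtype, target in DNSEngine.extract_targets(rdata):
        print(rdtype, target)  # e.g. "A" followed by the record's IP address
```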
50 changes: 50 additions & 0 deletions bbot/core/helpers/helper.py
@@ -1,12 +1,17 @@
import os
import asyncio
import logging
from pathlib import Path
import multiprocessing as mp
from functools import partial
from cloudcheck import cloud_providers
from concurrent.futures import ProcessPoolExecutor

from . import misc
from .dns import DNSHelper
from .web import WebHelper
from .diff import HttpCompare
from .regex import RegexHelper
from .wordcloud import WordCloud
from .interactsh import Interactsh
from ...scanner.target import Target
@@ -65,8 +70,21 @@ def __init__(self, preset):
self.mkdir(self.tools_dir)
self.mkdir(self.lib_dir)

self._loop = None

# multiprocessing thread pool
start_method = mp.get_start_method()
if start_method != "spawn":
self.warning(f"Multiprocessing spawn method is set to {start_method}.")

# we spawn 1 fewer processes than cores
# this helps to avoid locking up the system or competing with the main python process for cpu time
num_processes = max(1, mp.cpu_count() - 1)
self.process_pool = ProcessPoolExecutor(max_workers=num_processes)

self.cloud = cloud_providers

self.re = RegexHelper(self)
self.dns = DNSHelper(self)
self.web = WebHelper(self)
self.depsinstaller = DepsInstaller(self)
@@ -103,6 +121,38 @@ def config(self):
def scan(self):
return self.preset.scan

@property
def loop(self):
"""
Get the current event loop
"""
if self._loop is None:
self._loop = asyncio.get_running_loop()
return self._loop

def run_in_executor(self, callback, *args, **kwargs):
"""
Run a synchronous task in the event loop's default thread pool executor
Examples:
Execute callback:
>>> result = await self.helpers.run_in_executor(callback_fn, arg1, arg2)
"""
callback = partial(callback, **kwargs)
return self.loop.run_in_executor(None, callback, *args)

def run_in_executor_mp(self, callback, *args, **kwargs):
"""
Same as run_in_executor() except with a process pool executor
Use only in cases where callback is CPU-bound
Examples:
Execute callback:
>>> result = await self.helpers.run_in_executor_mp(callback_fn, arg1, arg2)
"""
callback = partial(callback, **kwargs)
return self.loop.run_in_executor(self.process_pool, callback, *args)

@property
def in_tests(self):
return os.environ.get("BBOT_TESTING", "") == "True"
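The two helpers above follow the standard asyncio offloading pattern: bind keyword arguments with functools.partial, then hand the callable to loop.run_in_executor() with either the default thread pool or the dedicated process pool. A standalone sketch of the same pattern (function and variable names are illustrative):

```python
# Run a CPU-bound, picklable function in a ProcessPoolExecutor from async code.
import asyncio
from functools import partial
from concurrent.futures import ProcessPoolExecutor


def cpu_heavy(data, repeat=1):
    return sum(hash(data) for _ in range(repeat))


async def main():
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor(max_workers=2) as pool:
        # partial() binds keyword args, mirroring run_in_executor_mp()
        result = await loop.run_in_executor(pool, partial(cpu_heavy, repeat=3), "payload")
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
```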
73 changes: 6 additions & 67 deletions bbot/core/helpers/misc.py
@@ -1,12 +1,12 @@
import os
import re
import sys
import json
import random
import string
import asyncio
import logging
import ipaddress
import regex as re
import subprocess as sp
from pathlib import Path
from contextlib import suppress
@@ -637,7 +637,7 @@ def is_ip_type(i):
>>> is_ip_type("192.168.1.0/24")
False
"""
return isinstance(i, ipaddress._BaseV4) or isinstance(i, ipaddress._BaseV6)
return ipaddress._IPAddressBase in i.__class__.__mro__


def make_ip_type(s):
@@ -663,78 +663,17 @@ def make_ip_type(s):
>>> make_ip_type("evilcorp.com")
'evilcorp.com'
"""
if not s:
raise ValueError(f'Invalid hostname: "{s}"')
# IP address
with suppress(Exception):
return ipaddress.ip_address(str(s).strip())
return ipaddress.ip_address(s)
# IP network
with suppress(Exception):
return ipaddress.ip_network(str(s).strip(), strict=False)
return ipaddress.ip_network(s, strict=False)
return s


def host_in_host(host1, host2):
"""
Checks if host1 is included within host2, either as a subdomain, IP, or IP network.
Used for scope calculations/decisions within BBOT.
Args:
host1 (str or ipaddress.IPv4Address or ipaddress.IPv6Address or ipaddress.IPv4Network or ipaddress.IPv6Network):
The host to check for inclusion within host2.
host2 (str or ipaddress.IPv4Address or ipaddress.IPv6Address or ipaddress.IPv4Network or ipaddress.IPv6Network):
The host within which to check for the inclusion of host1.
Returns:
bool: True if host1 is included in host2, otherwise False.
Examples:
>>> host_in_host("www.evilcorp.com", "evilcorp.com")
True
>>> host_in_host("evilcorp.com", "www.evilcorp.com")
False
>>> host_in_host(ipaddress.IPv6Address('dead::beef'), ipaddress.IPv6Network('dead::/64'))
True
>>> host_in_host(ipaddress.IPv4Address('192.168.1.1'), ipaddress.IPv4Network('10.0.0.0/8'))
False
Notes:
- If checking an IP address/network, you MUST FIRST convert your IP into an ipaddress object (e.g. via `make_ip_type()`) before passing it to this function.
"""

"""
Is host1 included in host2?
"www.evilcorp.com" in "evilcorp.com"? --> True
"evilcorp.com" in "www.evilcorp.com"? --> False
IPv6Address('dead::beef') in IPv6Network('dead::/64')? --> True
IPv4Address('192.168.1.1') in IPv4Network('10.0.0.0/8')? --> False
Very important! Used throughout BBOT for scope calculations/decisions.
Works with hostnames, IPs, and IP networks.
"""

if not host1 or not host2:
return False

# check if hosts are IP types
host1_ip_type = is_ip_type(host1)
host2_ip_type = is_ip_type(host2)
# if both hosts are IP types
if host1_ip_type and host2_ip_type:
if not host1.version == host2.version:
return False
host1_net = ipaddress.ip_network(host1)
host2_net = ipaddress.ip_network(host2)
return host1_net.subnet_of(host2_net)

# else hostnames
elif not (host1_ip_type or host2_ip_type):
host2_len = len(host2.split("."))
host1_truncated = ".".join(host1.split(".")[-host2_len:])
return host1_truncated == host2

return False


def sha1(data):
"""
Computes the SHA-1 hash of the given data.
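Based on the docstring examples visible in the diff, the updated make_ip_type() parses strings into ipaddress objects where possible (and now raises ValueError on empty input), while is_ip_type() is True only for actual ipaddress objects, never for strings:

```python
>>> import ipaddress
>>> from bbot.core.helpers.misc import make_ip_type, is_ip_type
>>> make_ip_type("192.168.1.1")
IPv4Address('192.168.1.1')
>>> make_ip_type("dead::/64")
IPv6Network('dead::/64')
>>> make_ip_type("evilcorp.com")
'evilcorp.com'
>>> is_ip_type(ipaddress.ip_address("192.168.1.1"))
True
>>> is_ip_type("192.168.1.0/24")
False
```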
72 changes: 72 additions & 0 deletions bbot/core/helpers/regex.py
@@ -0,0 +1,72 @@
import regex as re
from . import misc


class RegexHelper:
"""
Class for misc CPU-intensive regex operations
Offloads regex processing to other CPU cores via GIL release + thread pool
For quick, one-off regexes, you don't need to use this helper.
Only use this helper if you're searching large bodies of text
or if your regex is CPU-intensive
"""

def __init__(self, parent_helper):
self.parent_helper = parent_helper

def ensure_compiled_regex(self, r):
"""
Make sure a regex has been compiled
"""
if not isinstance(r, re.Pattern):
raise ValueError("Regex must be compiled first!")

def compile(self, *args, **kwargs):
return re.compile(*args, **kwargs)

async def search(self, compiled_regex, *args, **kwargs):
self.ensure_compiled_regex(compiled_regex)
return await self.parent_helper.run_in_executor(compiled_regex.search, *args, **kwargs)

async def findall(self, compiled_regex, *args, **kwargs):
self.ensure_compiled_regex(compiled_regex)
return await self.parent_helper.run_in_executor(compiled_regex.findall, *args, **kwargs)

async def finditer(self, compiled_regex, *args, **kwargs):
self.ensure_compiled_regex(compiled_regex)
return await self.parent_helper.run_in_executor(self._finditer, compiled_regex, *args, **kwargs)

async def finditer_multi(self, compiled_regexes, *args, **kwargs):
"""
Same as finditer() but with multiple regexes
"""
for r in compiled_regexes:
self.ensure_compiled_regex(r)
return await self.parent_helper.run_in_executor(self._finditer_multi, compiled_regexes, *args, **kwargs)

def _finditer_multi(self, compiled_regexes, *args, **kwargs):
matches = []
for r in compiled_regexes:
for m in r.finditer(*args, **kwargs):
matches.append(m)
return matches

def _finditer(self, compiled_regex, *args, **kwargs):
return list(compiled_regex.finditer(*args, **kwargs))

async def extract_params_html(self, *args, **kwargs):
return await self.parent_helper.run_in_executor(misc.extract_params_html, *args, **kwargs)

async def extract_emails(self, *args, **kwargs):
return await self.parent_helper.run_in_executor(misc.extract_emails, *args, **kwargs)

async def search_dict_values(self, *args, **kwargs):
def _search_dict_values(*_args, **_kwargs):
return list(misc.search_dict_values(*_args, **_kwargs))

return await self.parent_helper.run_in_executor(_search_dict_values, *args, **kwargs)

async def recursive_decode(self, *args, **kwargs):
return await self.parent_helper.run_in_executor(misc.recursive_decode, *args, **kwargs)
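A self-contained sketch of driving RegexHelper from async code. The _Stub parent helper and the sample pattern are illustrative stand-ins (inside a scan, the parent helper is the scan's helper object with the run_in_executor() added above); only compile() and search() are used, matching the API in this file:

```python
import asyncio
from functools import partial

import regex as re
from bbot.core.helpers.regex import RegexHelper


class _Stub:
    # minimal stand-in: offload the callback to the default thread pool
    def run_in_executor(self, callback, *args, **kwargs):
        return asyncio.get_running_loop().run_in_executor(None, partial(callback, **kwargs), *args)


async def main():
    helper = RegexHelper(_Stub())
    pattern = helper.compile(r"\bfaster\b", re.I)  # patterns must be compiled first
    match = await helper.search(pattern, "Faster Regexes make BBOT scans faster")
    print(match)


asyncio.run(main())
```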
6 changes: 5 additions & 1 deletion bbot/core/helpers/regexes.py
@@ -1,4 +1,4 @@
import re
import regex as re
from collections import OrderedDict

# for extracting words from strings
@@ -104,3 +104,7 @@

_extract_host_regex = r"(?:[a-z0-9]{1,20}://)?(?:[^?]*@)?(" + valid_netloc + ")"
extract_host_regex = re.compile(_extract_host_regex, re.I)

# for use in recursive_decode()
encoded_regex = re.compile(r"%[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}|\\[ntrbv]")
backslash_regex = re.compile(r"(?P<slashes>\\+)(?P<char>[ntrvb])")
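The two patterns added above are used by recursive_decode(): encoded_regex spots URL-encoded bytes, \u/\U escapes, and common backslash escapes, while backslash_regex captures the run of backslashes in front of an escape character. A small demonstration (the sample string is made up):

```python
import regex as re

encoded_regex = re.compile(r"%[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}|\\[ntrbv]")
backslash_regex = re.compile(r"(?P<slashes>\\+)(?P<char>[ntrvb])")

sample = r"hello%20world\u0041\n"
print(encoded_regex.findall(sample))
# ['%20', '\\u0041', '\\n']
print([m.groupdict() for m in backslash_regex.finditer(sample)])
# [{'slashes': '\\', 'char': 'n'}]
```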
(Diffs for the remaining 33 changed files are not shown here.)