Skip to content

Commit

Permalink
don't punycode-encode non-host segments
Browse files Browse the repository at this point in the history
  • Loading branch information
TheTechromancer committed Sep 19, 2023
1 parent 5d4d434 commit 7c34499
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 70 deletions.
5 changes: 1 addition & 4 deletions bbot/core/event/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
smart_decode,
get_file_extension,
validators,
smart_encode_punycode,
tagify,
)

Expand Down Expand Up @@ -982,9 +981,7 @@ def make_event(
return data
else:
if event_type is None:
if isinstance(data, str):
data = smart_encode_punycode(data)
event_type = get_event_type(data)
event_type, data = get_event_type(data)
if not dummy:
log.debug(f'Autodetected event type "{event_type}" based on data: "{data}"')

Expand Down
11 changes: 6 additions & 5 deletions bbot/core/event/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,26 @@ def get_event_type(data):
"""
Attempt to divine event type from data
"""
data = smart_encode_punycode(smart_decode(data).strip())

# IP address
with suppress(Exception):
ipaddress.ip_address(data)
return "IP_ADDRESS"
return "IP_ADDRESS", data

# IP network
with suppress(Exception):
ipaddress.ip_network(data, strict=False)
return "IP_RANGE"
return "IP_RANGE", data

data = smart_encode_punycode(smart_decode(data).strip())

# Strict regexes
for t, regexes in event_type_regexes.items():
for r in regexes:
if r.match(data):
if t == "URL":
return "URL_UNVERIFIED"
return t
return "URL_UNVERIFIED", data
return t, data

raise ValidationError(f'Unable to autodetect event type from "{data}"')

Expand Down
80 changes: 79 additions & 1 deletion bbot/core/helpers/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import sys
import copy
import idna
import json
import atexit
import codecs
Expand Down Expand Up @@ -34,7 +35,6 @@

from .url import * # noqa F401
from .. import errors
from .punycode import * # noqa F401
from .logger import log_to_stderr
from . import regexes as bbot_regexes
from .names_generator import random_name, names, adjectives # noqa F401
Expand Down Expand Up @@ -898,10 +898,88 @@ def clean_old(d, keep=10, filter=lambda x: True, key=latest_mtime, reverse=True,


def extract_emails(s):
    """
    Yield every email address found in a body of text, lowercased.

    The input is first passed through smart_decode(), then scanned with
    the project's email regex.
    """
    decoded = smart_decode(s)
    yield from (found.lower() for found in bbot_regexes.email_regex.findall(decoded))


def extract_host(s):
    """
    Attempts to find and extract the host portion of a string.

    The string is split into three pieces so that callers can transform the
    host (e.g. punycode-encode it) and then glue the pieces back together.

    Args:
        s (str): The string from which to extract the host.

    Returns:
        tuple: A tuple containing three strings:
            (hostname (None if not found), string_before_hostname, string_after_hostname).
            When no host is found, the whole input is returned as the
            "before" part and the "after" part is empty.

    Examples:
        >>> extract_host("evilcorp.com:80")
        ("evilcorp.com", "", ":80")
        >>> extract_host("http://evilcorp.com:80/asdf.php?a=b")
        ("evilcorp.com", "http://", ":80/asdf.php?a=b")
        >>> extract_host("bob@evilcorp.com")
        ("evilcorp.com", "bob@", "")
        >>> extract_host("[dead::beef]:22")
        ("dead::beef", "[", "]:22")
        >>> extract_host("dead::beef")
        ("dead::beef", "", "")
    """
    match = bbot_regexes.extract_host_regex.search(s)

    if match:
        # group 1 is the raw "host[:port]" text, possibly a bracketed IPv6 literal
        matched = match.group(1)
        before = s[: match.start(1)]
        after = s[match.end(1) :]
        host, port = split_host_port(matched)
        if host is not None:
            hostname = str(host)
            # the port was part of the matched text, so hand it back to the "after" piece
            if port is not None:
                after = f":{port}{after}"
            # str(host) carries no brackets; restore them only if the input
            # actually had them, so before + hostname + after never gains
            # characters that were absent from an unbracketed IPv6 address
            if matched.startswith("[") and is_ip(hostname, version=6):
                before = f"{before}["
                after = f"]{after}"
            return (hostname, before, after)

    return (None, s, "")


def smart_encode_punycode(text: str) -> str:
    """
    Punycode-encode only the host portion of a string.

    Example: ドメイン.テスト --> xn--eckwd4c7c.xn--zckzah

    The pieces that extract_host() reports before and after the host are
    re-attached as-is. If no host is found the text is returned unchanged,
    and if IDNA encoding fails the host is left as it was.
    """
    hostname, prefix, suffix = extract_host(text)
    if hostname is None:
        return text

    try:
        encoded = idna.encode(hostname)
    except UnicodeError:
        # encoding failed: keep the original host
        encoded_host = hostname
    else:
        encoded_host = encoded.decode(errors="ignore")

    return prefix + encoded_host + suffix


def smart_decode_punycode(text: str) -> str:
    """
    Decode only the punycode host portion of a string back to unicode.

    Example: xn--eckwd4c7c.xn--zckzah --> ドメイン.テスト

    The pieces that extract_host() reports before and after the host are
    re-attached as-is. If no host is found the text is returned unchanged,
    and if IDNA decoding fails the host is left as it was.
    """
    hostname, prefix, suffix = extract_host(text)
    if hostname is None:
        return text

    try:
        decoded_host = idna.decode(hostname)
    except UnicodeError:
        # decoding failed: keep the original host
        decoded_host = hostname

    return prefix + decoded_host + suffix


def can_sudo_without_password():
"""
Return True if the current user can sudo without a password
Expand Down
51 changes: 0 additions & 51 deletions bbot/core/helpers/punycode.py

This file was deleted.

3 changes: 3 additions & 0 deletions bbot/core/helpers/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,6 @@
jquery_get_regex = re.compile(r"url:\s?[\"\'].+?\?(\w+)=")
jquery_post_regex = re.compile(r"\$.post\([\'\"].+[\'\"].+\{(.+)\}")
a_tag_regex = re.compile(r"<a[^>]*href=[\"\'][^\"\'?>]*\?([^&\"\'=]+)")

# Locates the host portion of a string; the host text is capture group 1.
#   1. optionally skips a leading scheme: 1-20 alphanumeric characters followed by "://"
#   2. optionally (and greedily) skips text ending in "@", as long as that text contains no "?"
#      (e.g. URL credentials or the local part of an email address)
#   3. captures the following run of characters that are not whitespace or one of !@#$%^&()=/?\
# Colons and square brackets are not excluded, so a port or a bracketed
# IPv6 literal ends up inside the captured group.
_extract_host_regex = r"(?:[a-z0-9]{1,20}://)?(?:[^?]*@)?([^\s!@#$%^&()=/?\\]+)"
# re.I: match case-insensitively
extract_host_regex = re.compile(_extract_host_regex, re.I)
3 changes: 1 addition & 2 deletions bbot/core/helpers/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

from bbot.core.helpers import regexes
from bbot.core.helpers.url import parse_url, hash_url
from bbot.core.helpers.punycode import smart_encode_punycode
from bbot.core.helpers.misc import split_host_port, make_netloc, is_ip
from bbot.core.helpers.misc import smart_encode_punycode, split_host_port, make_netloc, is_ip

log = logging.getLogger("bbot.core.helpers.validators")

Expand Down
29 changes: 28 additions & 1 deletion bbot/test/test_step_1/test_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,38 +250,52 @@ async def test_events(events, scan, helpers, bbot_config):
# japanese
assert scan.make_event("ドメイン.テスト", dummy=True).type == "DNS_NAME"
assert scan.make_event("bob@ドメイン.テスト", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("テスト@ドメイン.テスト", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("ドメイン.テスト:80", dummy=True).type == "OPEN_TCP_PORT"
assert scan.make_event("http://ドメイン.テスト:80", dummy=True).type == "URL_UNVERIFIED"
assert scan.make_event("http://ドメイン.テスト:80/テスト", dummy=True).type == "URL_UNVERIFIED"

assert scan.make_event("xn--eckwd4c7c.xn--zckzah", dummy=True).type == "DNS_NAME"
assert scan.make_event("[email protected]", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("テスト@xn--eckwd4c7c.xn--zckzah", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("xn--eckwd4c7c.xn--zckzah:80", dummy=True).type == "OPEN_TCP_PORT"
assert scan.make_event("http://xn--eckwd4c7c.xn--zckzah:80", dummy=True).type == "URL_UNVERIFIED"
assert scan.make_event("http://xn--eckwd4c7c.xn--zckzah:80/テスト", dummy=True).type == "URL_UNVERIFIED"

# thai
assert scan.make_event("เราเที่ยวด้วยกัน.com", dummy=True).type == "DNS_NAME"
assert scan.make_event("bob@เราเที่ยวด้วยกัน.com", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("ทดสอบ@เราเที่ยวด้วยกัน.com", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("เราเที่ยวด้วยกัน.com:80", dummy=True).type == "OPEN_TCP_PORT"
assert scan.make_event("http://เราเที่ยวด้วยกัน.com:80", dummy=True).type == "URL_UNVERIFIED"
assert scan.make_event("http://เราเที่ยวด้วยกัน.com:80/ทดสอบ", dummy=True).type == "URL_UNVERIFIED"

assert scan.make_event("xn--12c1bik6bbd8ab6hd1b5jc6jta.com", dummy=True).type == "DNS_NAME"
assert scan.make_event("[email protected]", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("ทดสอบ@xn--12c1bik6bbd8ab6hd1b5jc6jta.com", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80", dummy=True).type == "OPEN_TCP_PORT"
assert scan.make_event("http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80", dummy=True).type == "URL_UNVERIFIED"
assert scan.make_event("http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80/ทดสอบ", dummy=True).type == "URL_UNVERIFIED"

# punycode - encoding / decoding tests

# japanese
assert scan.make_event("xn--eckwd4c7c.xn--zckzah", dummy=True).data == "xn--eckwd4c7c.xn--zckzah"
assert scan.make_event("[email protected]", dummy=True).data == "[email protected]"
assert scan.make_event("テスト@xn--eckwd4c7c.xn--zckzah", dummy=True).data == "テスト@xn--eckwd4c7c.xn--zckzah"
assert scan.make_event("xn--eckwd4c7c.xn--zckzah:80", dummy=True).data == "xn--eckwd4c7c.xn--zckzah:80"
assert scan.make_event("http://xn--eckwd4c7c.xn--zckzah:80", dummy=True).data == "http://xn--eckwd4c7c.xn--zckzah/"
assert (
scan.make_event("http://xn--eckwd4c7c.xn--zckzah:80/テスト", dummy=True).data
== "http://xn--eckwd4c7c.xn--zckzah/テスト"
)

assert scan.make_event("ドメイン.テスト", dummy=True).data == "xn--eckwd4c7c.xn--zckzah"
assert scan.make_event("bob@ドメイン.テスト", dummy=True).data == "[email protected]"
assert scan.make_event("テスト@ドメイン.テスト", dummy=True).data == "テスト@xn--eckwd4c7c.xn--zckzah"
assert scan.make_event("ドメイン.テスト:80", dummy=True).data == "xn--eckwd4c7c.xn--zckzah:80"
assert scan.make_event("http://ドメイン.テスト:80", dummy=True).data == "http://xn--eckwd4c7c.xn--zckzah/"

assert scan.make_event("http://ドメイン.テスト:80/テスト", dummy=True).data == "http://xn--eckwd4c7c.xn--zckzah/テスト"
# thai
assert (
scan.make_event("xn--12c1bik6bbd8ab6hd1b5jc6jta.com", dummy=True).data == "xn--12c1bik6bbd8ab6hd1b5jc6jta.com"
Expand All @@ -290,6 +304,10 @@ async def test_events(events, scan, helpers, bbot_config):
scan.make_event("[email protected]", dummy=True).data
== "[email protected]"
)
assert (
scan.make_event("ทดสอบ@xn--12c1bik6bbd8ab6hd1b5jc6jta.com", dummy=True).data
== "ทดสอบ@xn--12c1bik6bbd8ab6hd1b5jc6jta.com"
)
assert (
scan.make_event("xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80", dummy=True).data
== "xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80"
Expand All @@ -298,14 +316,23 @@ async def test_events(events, scan, helpers, bbot_config):
scan.make_event("http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80", dummy=True).data
== "http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com/"
)
assert (
scan.make_event("http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80/ทดสอบ", dummy=True).data
== "http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com/ทดสอบ"
)

assert scan.make_event("เราเที่ยวด้วยกัน.com", dummy=True).data == "xn--12c1bik6bbd8ab6hd1b5jc6jta.com"
assert scan.make_event("bob@เราเที่ยวด้วยกัน.com", dummy=True).data == "[email protected]"
assert scan.make_event("ทดสอบ@เราเที่ยวด้วยกัน.com", dummy=True).data == "ทดสอบ@xn--12c1bik6bbd8ab6hd1b5jc6jta.com"
assert scan.make_event("เราเที่ยวด้วยกัน.com:80", dummy=True).data == "xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80"
assert (
scan.make_event("http://เราเที่ยวด้วยกัน.com:80", dummy=True).data
== "http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com/"
)
assert (
scan.make_event("http://เราเที่ยวด้วยกัน.com:80/ทดสอบ", dummy=True).data
== "http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com/ทดสอบ"
)

# test event serialization
from bbot.core.event import event_from_json
Expand Down
50 changes: 50 additions & 0 deletions bbot/test/test_step_1/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,54 @@ async def test_helpers_misc(helpers, scan, bbot_scanner, bbot_config, bbot_https
"[email protected]",
)

assert helpers.extract_host("evilcorp.com:80") == ("evilcorp.com", "", ":80")
assert helpers.extract_host("http://evilcorp.com:80/asdf.php?a=b") == (
"evilcorp.com",
"http://",
":80/asdf.php?a=b",
)
assert helpers.extract_host("http://evilcorp.com:80/[email protected]") == (
"evilcorp.com",
"http://",
":80/[email protected]",
)
assert helpers.extract_host("[email protected]") == ("evilcorp.com", "bob@", "")
assert helpers.extract_host("[dead::beef]:22") == ("dead::beef", "[", "]:22")
assert helpers.extract_host("scp://[dead::beef]:22") == ("dead::beef", "scp://[", "]:22")
assert helpers.extract_host("https://[dead::beef]:22?a=b") == ("dead::beef", "https://[", "]:22?a=b")
assert helpers.extract_host("https://[dead::beef]/?a=b") == ("dead::beef", "https://[", "]/?a=b")
assert helpers.extract_host("https://[dead::beef]?a=b") == ("dead::beef", "https://[", "]?a=b")
assert helpers.extract_host("ftp://username:[email protected]/my-file.csv") == (
"my-ftp.com",
"ftp://username:password@",
"/my-file.csv",
)
assert helpers.extract_host("ftp://username:p@[email protected]/my-file.csv") == (
"my-ftp.com",
"ftp://username:p@ssword@",
"/my-file.csv",
)
assert helpers.extract_host("ftp://username:password:/@my-ftp.com/my-file.csv") == (
"my-ftp.com",
"ftp://username:password:/@",
"/my-file.csv",
)
assert helpers.extract_host("ftp://username:password:/@dead::beef/my-file.csv") == (
"my-ftp.com",
"ftp://username:password:/@",
"/my-file.csv",
)
assert helpers.extract_host("ftp://username:password:/@[dead::beef]/my-file.csv") == (
"dead::beef",
"ftp://username:password:/@[",
"]/my-file.csv",
)
assert helpers.extract_host("ftp://username:password:/@[dead::beef]:22/my-file.csv") == (
"dead::beef",
"ftp://username:password:/@[",
"]:22/my-file.csv",
)

assert helpers.split_domain("www.evilcorp.co.uk") == ("www", "evilcorp.co.uk")
assert helpers.split_domain("asdf.www.test.notreal") == ("asdf.www", "test.notreal")
assert helpers.split_domain("www.test.notreal") == ("www", "test.notreal")
Expand All @@ -120,6 +168,8 @@ async def test_helpers_misc(helpers, scan, bbot_scanner, bbot_config, bbot_https
assert helpers.split_host_port("evilcorp.co.uk") == ("evilcorp.co.uk", None)
assert helpers.split_host_port("d://wat:wat") == ("wat", None)
assert helpers.split_host_port("https://[dead::beef]:8338") == (ipaddress.ip_address("dead::beef"), 8338)
assert helpers.split_host_port("[dead::beef]") == (ipaddress.ip_address("dead::beef"), None)
assert helpers.split_host_port("dead::beef") == (ipaddress.ip_address("dead::beef"), None)
extracted_words = helpers.extract_words("blacklanternsecurity")
assert "black" in extracted_words
# assert "blacklantern" in extracted_words
Expand Down
Loading

0 comments on commit 7c34499

Please sign in to comment.