Skip to content

Commit

Permalink
improve punycode support
Browse files Browse the repository at this point in the history
  • Loading branch information
TheTechromancer committed Sep 18, 2023
1 parent ba4fc61 commit 763268f
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 22 deletions.
4 changes: 2 additions & 2 deletions bbot/core/event/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
smart_decode,
get_file_extension,
validators,
smart_decode_punycode,
smart_encode_punycode,
tagify,
)

Expand Down Expand Up @@ -983,7 +983,7 @@ def make_event(
else:
if event_type is None:
if isinstance(data, str):
data = smart_decode_punycode(data)
data = smart_encode_punycode(data)
event_type = get_event_type(data)
if not dummy:
log.debug(f'Autodetected event type "{event_type}" based on data: "{data}"')
Expand Down
4 changes: 2 additions & 2 deletions bbot/core/event/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from contextlib import suppress

from bbot.core.errors import ValidationError
from bbot.core.helpers import sha1, smart_decode, smart_decode_punycode
from bbot.core.helpers import sha1, smart_decode, smart_encode_punycode
from bbot.core.helpers.regexes import event_type_regexes, event_id_regex


Expand All @@ -14,7 +14,7 @@ def get_event_type(data):
"""
Attempt to divine event type from data
"""
data = smart_decode_punycode(smart_decode(data).strip())
data = smart_encode_punycode(smart_decode(data).strip())

# IP address
with suppress(Exception):
Expand Down
14 changes: 6 additions & 8 deletions bbot/core/helpers/punycode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@
import idna


alphanum_regex = re.compile(r"([\w-]+)")
alphanum_anchored = re.compile(r"^[\w-]+$")
split_regex = re.compile(r"([/:@\[\]]+)")


def split_text(text):
# Split text into segments by special characters
# We assume that only alphanumeric segments should be encoded
# We have to split this way in order to handle URLs and email addresses
# which the idna library is not equipped to deal with
if not isinstance(text, str):
raise ValueError(f"data must be a string, not {type(text)}")
segments = alphanum_regex.split(text)
segments = split_regex.split(text)
return segments


Expand All @@ -24,7 +23,7 @@ def smart_encode_punycode(text: str) -> str:

for segment in segments:
try:
if alphanum_anchored.match(segment): # Only encode alphanumeric segments
if not split_regex.match(segment):
segment = idna.encode(segment).decode(errors="ignore")
except UnicodeError:
pass # If encoding fails, leave the segment as it is
Expand All @@ -43,8 +42,7 @@ def smart_decode_punycode(text: str) -> str:

for segment in segments:
try:
if alphanum_anchored.match(segment): # Only decode alphanumeric segments
segment = idna.decode(segment)
segment = idna.decode(segment)
except UnicodeError:
pass # If decoding fails, leave the segment as it is

Expand Down
2 changes: 1 addition & 1 deletion bbot/core/helpers/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
_ipv6_regex = r"[A-F0-9:]*:[A-F0-9:]*:[A-F0-9:]*"
ipv6_regex = re.compile(_ipv6_regex, re.I)
# dns names with periods
_dns_name_regex = r"(?:\w(?:[\w-]{0,100}\w)?\.)+[^\W_]{1,63}\.?"
_dns_name_regex = r"(?:\w(?:[\w-]{0,100}\w)?\.)+(?:[xX][nN]--)?[^\W_]{1,63}\.?"
# dns names without periods
_hostname_regex = r"(?!\w*\.\w+)\w(?:[\w-]{0,100}\w)?"
_email_regex = r"(?:[^\W_][\w\-\.\+]{,100})@" + _dns_name_regex
Expand Down
6 changes: 3 additions & 3 deletions bbot/core/helpers/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from bbot.core.helpers import regexes
from bbot.core.helpers.url import parse_url, hash_url
from bbot.core.helpers.punycode import smart_decode_punycode
from bbot.core.helpers.punycode import smart_encode_punycode
from bbot.core.helpers.misc import split_host_port, make_netloc, is_ip

log = logging.getLogger("bbot.core.helpers.validators")
Expand Down Expand Up @@ -57,7 +57,7 @@ def validate_host(host):
return str(ip)
except Exception:
# finally, try DNS_NAME
host = smart_decode_punycode(host)
host = smart_encode_punycode(host)
# clean asterisks and clinging dashes
host = host.strip("*.-").replace("*", "")
for r in regexes.event_type_regexes["DNS_NAME"]:
Expand Down Expand Up @@ -89,7 +89,7 @@ def validate_severity(severity):

@validator
def validate_email(email):
email = smart_decode_punycode(str(email).strip().lower())
email = smart_encode_punycode(str(email).strip().lower())
if any(r.match(email) for r in regexes.event_type_regexes["EMAIL_ADDRESS"]):
return email
assert False, f'Invalid email: "{email}"'
Expand Down
44 changes: 38 additions & 6 deletions bbot/test/test_step_1/test_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,22 +245,54 @@ async def test_events(events, scan, helpers, bbot_config):
{"host": "evilcorp.com", "severity": "WACK", "description": "asdf"}, "VULNERABILITY", dummy=True
)

# punycode
# punycode - event type detection

# japanese
assert scan.make_event("ドメイン.テスト", dummy=True).type == "DNS_NAME"
assert scan.make_event("bob@ドメイン.テスト", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("ドメイン.テスト:80", dummy=True).type == "OPEN_TCP_PORT"
assert scan.make_event("http://ドメイン.テスト:80", dummy=True).type == "URL_UNVERIFIED"

assert scan.make_event("xn--eckwd4c7c.xn--zckzah", dummy=True).data == "ドメイン.テスト"
assert scan.make_event("bob@xn--eckwd4c7c.xn--zckzah", dummy=True).data == "bob@ドメイン.テスト"
assert scan.make_event("xn--eckwd4c7c.xn--zckzah:80", dummy=True).data == "ドメイン.テスト:80"
assert scan.make_event("http://xn--eckwd4c7c.xn--zckzah:80", dummy=True).data == "http://ドメイン.テスト/"

assert scan.make_event("xn--eckwd4c7c.xn--zckzah", dummy=True).type == "DNS_NAME"
assert scan.make_event("bob@xn--eckwd4c7c.xn--zckzah", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("xn--eckwd4c7c.xn--zckzah:80", dummy=True).type == "OPEN_TCP_PORT"
assert scan.make_event("http://xn--eckwd4c7c.xn--zckzah:80", dummy=True).type == "URL_UNVERIFIED"

# thai
assert scan.make_event("เราเที่ยวด้วยกัน.com", dummy=True).type == "DNS_NAME"
assert scan.make_event("bob@เราเที่ยวด้วยกัน.com", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("เราเที่ยวด้วยกัน.com:80", dummy=True).type == "OPEN_TCP_PORT"
assert scan.make_event("http://เราเที่ยวด้วยกัน.com:80", dummy=True).type == "URL_UNVERIFIED"

assert scan.make_event("xn--12c1bik6bbd8ab6hd1b5jc6jta.com", dummy=True).type == "DNS_NAME"
assert scan.make_event("bob@xn--12c1bik6bbd8ab6hd1b5jc6jta.com", dummy=True).type == "EMAIL_ADDRESS"
assert scan.make_event("xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80", dummy=True).type == "OPEN_TCP_PORT"
assert scan.make_event("http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80", dummy=True).type == "URL_UNVERIFIED"

# punycode - encoding / decoding tests

# japanese
assert scan.make_event("xn--eckwd4c7c.xn--zckzah", dummy=True).data == "xn--eckwd4c7c.xn--zckzah"
assert scan.make_event("bob@xn--eckwd4c7c.xn--zckzah", dummy=True).data == "bob@xn--eckwd4c7c.xn--zckzah"
assert scan.make_event("xn--eckwd4c7c.xn--zckzah:80", dummy=True).data == "xn--eckwd4c7c.xn--zckzah:80"
assert scan.make_event("http://xn--eckwd4c7c.xn--zckzah:80", dummy=True).data == "http://xn--eckwd4c7c.xn--zckzah/"

assert scan.make_event("ドメイン.テスト", dummy=True).data == "xn--eckwd4c7c.xn--zckzah"
assert scan.make_event("bob@ドメイン.テスト", dummy=True).data == "bob@xn--eckwd4c7c.xn--zckzah"
assert scan.make_event("ドメイン.テスト:80", dummy=True).data == "xn--eckwd4c7c.xn--zckzah:80"
assert scan.make_event("http://ドメイン.テスト:80", dummy=True).data == "http://xn--eckwd4c7c.xn--zckzah/"

# thai
assert scan.make_event("xn--12c1bik6bbd8ab6hd1b5jc6jta.com", dummy=True).data == "xn--12c1bik6bbd8ab6hd1b5jc6jta.com"
assert scan.make_event("bob@xn--12c1bik6bbd8ab6hd1b5jc6jta.com", dummy=True).data == "bob@xn--12c1bik6bbd8ab6hd1b5jc6jta.com"
assert scan.make_event("xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80", dummy=True).data == "xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80"
assert scan.make_event("http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80", dummy=True).data == "http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com/"

assert scan.make_event("เราเที่ยวด้วยกัน.com", dummy=True).data == "xn--12c1bik6bbd8ab6hd1b5jc6jta.com"
assert scan.make_event("bob@เราเที่ยวด้วยกัน.com", dummy=True).data == "bob@xn--12c1bik6bbd8ab6hd1b5jc6jta.com"
assert scan.make_event("เราเที่ยวด้วยกัน.com:80", dummy=True).data == "xn--12c1bik6bbd8ab6hd1b5jc6jta.com:80"
assert scan.make_event("http://เราเที่ยวด้วยกัน.com:80", dummy=True).data == "http://xn--12c1bik6bbd8ab6hd1b5jc6jta.com/"

# test event serialization
from bbot.core.event import event_from_json

Expand Down

0 comments on commit 763268f

Please sign in to comment.