Skip to content

Commit

Permalink
improves path matching
Browse files Browse the repository at this point in the history
  • Loading branch information
huettenhain committed Nov 13, 2024
1 parent 0bb9058 commit d9f7613
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 30 deletions.
2 changes: 1 addition & 1 deletion refinery/lib/patterns/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def display(self):
_pattern_nix_path_element = R'(?:{n} ){{0,1}}{n}'.format(n=_pattern_pathpart_nospace)
_pattern_win_env_variable = R'%[a-zA-Z][a-zA-Z0-9_\-\(\)]*%'

_pattern_win_path = R'(?:{s})(?P<__pathsep>[\\\/])(?:{p}(?P=__pathsep))*{p}(?:(?P=__pathsep)|\b)'.format(
_pattern_win_path = R'(?:{s}|{p}|)(?P<__pathsep>[\\\/])(?:{p}(?P=__pathsep))*{p}(?:(?P=__pathsep)|\b)'.format(
s='|'.join([
_pattern_win_env_variable, # environment variable
R'[A-Za-z]:', # drive letter with colon
Expand Down
69 changes: 40 additions & 29 deletions refinery/units/pattern/xtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class LetterWeights(LetterWeight, Enum):
B'$%&()*+-<=>?[]{}~\t': 2,
B'.,:;#/\\|@_ ': 4,
B'0123456789': 4,
B'ABCDEFGHIJKLMNOPQRSTUVWXYZ': 5,
B'ABCDEFGHIJKLMNOPQRSTUVWXYZ': 6,
B'abcdefghijklmnopqrstuvwxyz': 8,
})

Expand Down Expand Up @@ -243,12 +243,10 @@ def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str,
}:
if self.args.filter >= 2:
if LetterWeights.IOC(value) < 0.6:
self.log_info(value)
self.log_info('excluding indicator because of low IOC score')
self.log_info(F'excluding indicator because with low score: {value}', clip=True)
return None
if name != indicators.url.name and len(value) > 0x100:
self.log_info(value)
self.log_info('excluding indicator because it is too long')
self.log_info(F'excluding indicator because it is too long: {value}', clip=True)
return None
ioc = value.decode(self.codec)
if '://' not in ioc: ioc = F'tcp://{ioc}'
Expand All @@ -257,11 +255,8 @@ def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str,
hl = host.lower()
for white, level in self._LEGITIMATE_HOSTS.items():
if self.args.filter >= level and (hl == white or hl.endswith(F'.{white}')):
self.log_info(value)
self.log_info(
F'excluding indicator because domain {hl} is whitelisted via {white}; '
F'reduce level below {level} to allow, current level is {self.args.filter}'
)
self.log_info(F'excluding indicator because domain {hl} is whitelisted via {white}: {value}', clip=True)
self.log_debug(F'reduce level below {level} to allow, current level is {self.args.filter}')
return None
if name == indicators.url.name:
scheme = parts.scheme.lower()
Expand All @@ -271,8 +266,7 @@ def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str,
value = value[pos:]
break
if any(hl == w for w in self._DOMAIN_WHITELIST):
self.log_info(value)
self.log_info(F'excluding indicator because domain {hl} is whitelisted')
self.log_info(F'excluding indicator because domain {hl} is whitelisted: {value}')
return None
if name in {
indicators.hostname.name,
Expand All @@ -284,13 +278,11 @@ def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str,
hostparts = host.split('.')
if self.args.filter >= 2:
if not all(p.isdigit() for p in hostparts) and all(len(p) < 4 for p in hostparts):
self.log_info(value)
self.log_info('excluding host with too many short parts')
self.log_info(F'excluding host with too many short parts: {value}')
return None
if self.args.filter >= 3:
if len(hostparts) <= sum(3 for p in hostparts if p != p.lower() and p != p.upper()):
self.log_info(value)
self.log_info('excluding host with too many mixed case parts')
self.log_info(F'excluding host with too many mixed case parts: {value}')
return None
# These heuristics attempt to filter out member access to variables in
# scripts which can be mistaken for domains because of the TLD inflation
Expand All @@ -300,29 +292,24 @@ def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str,
if lowercase and uppercase:
caseratio = uppercase / lowercase
if 0.1 < caseratio < 0.9:
self.log_info(value)
self.log_info('excluding indicator with too much uppercase letters')
self.log_info(F'excluding indicator with too much uppercase letters: {value}')
return None
if all(x.isidentifier() for x in hostparts):
if len(hostparts) == 2 and hostparts[0] in ('this', 'self'):
self.log_info(value)
self.log_info('excluding host that looks like a code snippet')
self.log_info(F'excluding host that looks like a code snippet: {value}')
return None
if len(hostparts[-2]) < 3:
self.log_info(value)
self.log_info('excluding host with too short root domain name')
self.log_info(F'excluding host with too short root domain name: {value}')
return None
if any(x.startswith('_') for x in hostparts):
self.log_info(value)
self.log_info('excluding host with underscores')
self.log_info(F'excluding host with underscores: {value}')
return None
if len(hostparts[-1]) > 3:
prefix = '.'.join(hostparts[:-1])
seen_before = len(set(re.findall(
R'{}(?:\.\w+)+'.format(prefix).encode('ascii'), data)))
if seen_before > 2:
self.log_debug(value)
self.log_debug('excluding indicator that was already seen')
self.log_debug(F'excluding indicator that was already seen: {value}')
return None
elif name == indicators.email.name:
at = value.find(B'@')
Expand All @@ -336,12 +323,15 @@ def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str,
indicators.nixpath.name,
):
if len(value) < 8:
self.log_info(F'excluding path because it is too short: {value}')
return None
if len(value) > 16 and len(re.findall(RB'\\x\d\d', value)) > len(value) // 10:
self.log_info(F'excluding long path containign hex: {value}', clip=True)
return None
try:
path_string = value.decode(self.codec)
except Exception:
self.log_debug(F'excluding path which did not decode: {value!r}', clip=True)
return None
try:
path = Path(path_string)
Expand All @@ -355,8 +345,18 @@ def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str,
(2, path_string.startswith('\\\\')),
(2, path_string[1:3] == ':\\'),
] if x)
if path_likeness < min(self.args.filter, 2):
if 2 + path_likeness < min(self.args.filter, 2):
self.log_info(F'excluding long path because it has no characteristic parts: {value}')
return None
bad_parts = 0
all_parts = len(path.parts)
if self.args.filter >= 1:
date_likeness = sum(1
for t in ['yyyy', 'yy', 'mm', 'dd', 'hh', 'ss']
if t in path.parts or t.upper() in path.parts)
if len(value) < 20 and date_likeness >= all_parts - 1:
self.log_info(F'excluding path that looks like a date format: {value}', clip=True)
return None
if self.args.filter >= 2:
for k, part in enumerate(path.parts):
if not k:
Expand All @@ -365,8 +365,19 @@ def _check_match(self, data: Union[memoryview, bytearray], pos: int, name: str,
continue
if part[0] == part[~0] == '%':
continue
if LetterWeights.Path(part) < 0.4 + (min(self.args.filter, 5) * 0.1):
return None
if len(part) == 1:
continue
if (
LetterWeights.Path(part) < 0.5 + (min(self.args.filter, 4) * 0.1)
or (self.args.filter >= 2 and LetterWeights.Path(part[:1]) < 0.5)
):
bad_parts += 1
self.log_debug(F'bad part {k + 1} in path: {part}')
for filter_limit in (2, 3, 4):
bad_ratio = 2 ** (filter_limit - 1)
if self.args.filter >= filter_limit and bad_parts * bad_ratio >= all_parts:
self.log_info(F'excluding path with bad parts: {value}', clip=True)
return None
return value

def process(self, data):
Expand Down
15 changes: 15 additions & 0 deletions test/units/pattern/test_xtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,18 @@ def test_strip_quote_from_url_regression(self):
def test_webDAV_paths(self):
    """
    The sample below starts with a double backslash followed by an
    address of the form 1.1.1.1@556; the path unit is expected to
    return it unchanged.
    """
    sample = B"\\\\1.1.1.1@556\\the\\finest\\binaires"
    extracted = sample | self.load('path') | bytes
    self.assertEqual(sample, extracted)

def test_registry_paths(self):
    """
    Feed a multi-line sample of backslash-separated paths through the
    path unit and verify that one specific key path, which contains
    spaces in one of its parts, is among the extracted strings.
    """
    sample = BR'''
\Thunderbird\Profiles\
%s%s\logins.json
%s%s\key4.db
SOFTWARE\Microsoft\Office\16.0\Outlook\Profiles\Outlook\9375CFF0413111d3B88A00104B2A6676\
Software\Microsoft\Windows Messaging Subsystem\Profiles\9375CFF0413111d3B88A00104B2A6676
Software\Microsoft\Windows NT\CurrentVersion\Windows Messaging Subsystem\Profiles\Outlook\9375CFF0413111d3B88A00104B2A6676
'''
    extractor = self.load('path')
    results = sample | extractor | [str]
    # Only this one entry is asserted; the other sample lines are not checked.
    expected = R'Software\Microsoft\Windows Messaging Subsystem\Profiles\9375CFF0413111d3B88A00104B2A6676'
    self.assertIn(expected, results)

0 comments on commit d9f7613

Please sign in to comment.