From 46b395fcb242c8bbd7903cfb8c89fda9a906b285 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 27 Nov 2024 11:01:29 +0200 Subject: [PATCH] Filter configurable arguments in rules config (#622) * Configurable filters * BM scores fix * Update credsweeper/rules/rule.py * Add filter for "Jira / Confluence PAT token" --- .ci/benchmark.txt | 6 +-- credsweeper/filters/filter.py | 2 +- .../filters/value_couple_keyword_check.py | 7 +-- credsweeper/rules/config.yaml | 6 ++- credsweeper/rules/rule.py | 24 ++++++++- tests/__init__.py | 2 +- tests/data/depth_3.json | 50 ++----------------- tests/data/doc.json | 6 +-- tests/data/ml_threshold.json | 6 +-- tests/data/output.json | 6 +-- .../test_value_couple_keyword_check.py | 11 ++++ tests/samples/aws_client_id | 2 + tests/samples/jira_confluence_pat | 3 +- 13 files changed, 63 insertions(+), 68 deletions(-) diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt index b05a945cb..798ac8ac4 100644 --- a/.ci/benchmark.txt +++ b/.ci/benchmark.txt @@ -223,12 +223,12 @@ FileType FileNumber ValidLines Positives Negatives Templat .zsh 6 872 12 .zsh-theme 1 97 1 TOTAL: 10232 16342283 12255 49690 5102 -credsweeper result_cnt : 11521, lost_cnt : 0, true_cnt : 11342, false_cnt : 179 +credsweeper result_cnt : 11517, lost_cnt : 0, true_cnt : 11342, false_cnt : 175 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- -------- API 130 3166 188 125 123 2 3352 7 0.000596 0.053846 0.997417 0.984000 0.946154 0.964706 AWS Client ID 168 21 0 160 160 0 21 8 0.000000 0.047619 0.957672 1.000000 0.952381 0.975610 -AWS Multi 82 10 0 88 82 5 5 0 0.500000 0.000000 0.945652 0.942529 1.000000 0.970414 +AWS Multi 82 10 0 84 82 1 9 0 0.100000 0.000000 0.989130 0.987952 1.000000 0.993939 AWS S3 Bucket 67 23 0 92 67 23 0 0 1.000000 0.000000 0.744444 0.744444 1.000000 0.853503 Atlassian Old PAT token 27 308 3 12 3 8 303 24 0.025723 0.888889 0.905325 0.272727 0.111111 0.157895 Auth 414 2739 82 390 387 3 2818 27 0.001063 0.065217 0.990726 0.992308 0.934783 0.962687 @@ -271,4 +271,4 @@ Token 643 4170 454 61 Twilio Credentials 30 39 0 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 URL Credentials 210 156 216 205 205 0 372 5 0.000000 0.023810 0.991409 1.000000 0.976190 0.987952 UUID 1069 265 0 1068 1067 1 264 2 0.003774 0.001871 0.997751 0.999064 0.998129 0.998596 - 12255 49690 5102 11528 11342 179 49511 913 0.003602 0.074500 0.982371 0.984463 0.925500 0.954071 + 12255 49690 5102 11524 11342 175 49515 913 0.003522 0.074500 0.982436 0.984805 0.925500 0.954232 diff --git a/credsweeper/filters/filter.py b/credsweeper/filters/filter.py index 0fde60bbf..be0acf51b 100644 --- a/credsweeper/filters/filter.py +++ b/credsweeper/filters/filter.py @@ -9,7 +9,7 @@ class Filter: """Base class for all filters that operates on 'line_data' objects.""" @abstractmethod - def __init__(self, config: Config): + def __init__(self, config: Config, *args): raise NotImplementedError() @abstractmethod diff --git a/credsweeper/filters/value_couple_keyword_check.py b/credsweeper/filters/value_couple_keyword_check.py index 0f97f0678..c0a9fbbc2 100644 --- a/credsweeper/filters/value_couple_keyword_check.py +++ b/credsweeper/filters/value_couple_keyword_check.py @@ -8,8 +8,9 @@ class ValueCoupleKeywordCheck(Filter): """Check value if TWO words from morphemes checklist exists in value""" - def __init__(self, config: Config = None) -> None: - pass + def __init__(self, config: Config = None, threshold=1) -> None: + # threshold - minimum morphemes number in a value + self.threshold = threshold def run(self, line_data: LineData, target: AnalysisTarget) -> bool: """Run filter checks on received credential candidate data 'line_data'. @@ -22,4 +23,4 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: True, if need to filter candidate and False if left """ - return static_keyword_checklist.check_morphemes(line_data.value.lower(), 1) + return static_keyword_checklist.check_morphemes(line_data.value.lower(), self.threshold) diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml index 4f593c1a6..b017346d9 100644 --- a/credsweeper/rules/config.yaml +++ b/credsweeper/rules/config.yaml @@ -163,7 +163,10 @@ values: - (?(ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})(?![0-9A-Za-z_-]) - (?[0-9A-Za-z/+]{35,80})(?![0-9A-Za-z_/+-]) - filter_type: GeneralPattern + filter_type: + - LineSpecificKeyCheck + - ValuePatternCheck + - ValueCoupleKeywordCheck(3) required_substrings: - A min_line_len: 20 @@ -1040,6 +1043,7 @@ - (?[NMO][ADgjQTwz][0-9A-Za-z_-]{42})(?![0-9A-Za-z_-]) filter_type: - ValueAtlassianTokenCheck + - ValueBase64PartCheck min_line_len: 44 required_substrings: - M diff --git a/credsweeper/rules/rule.py b/credsweeper/rules/rule.py index e288acf10..539224dd0 100644 --- a/credsweeper/rules/rule.py +++ b/credsweeper/rules/rule.py @@ -1,3 +1,4 @@ +import contextlib import logging import re from functools import cached_property @@ -112,6 +113,15 @@ def filters(self) -> List[Filter]: """filters getter""" return self.__filters + @staticmethod + def _get_arg(arg: str) -> Union[int, float, str]: + """Transform given string value to int, then float. In worst case - returns str""" + with contextlib.suppress(Exception): + return int(arg) + with contextlib.suppress(Exception): + return float(arg) + return str(arg) + def _init_filters(self, filter_type: Union[None, str, List[str]]) -> List[Filter]: """ filter_type: str - applies Group of filter @@ -126,9 +136,19 @@ def _init_filters(self, filter_type: Union[None, str, List[str]]) -> List[Filter elif isinstance(filter_type, list): # list type means - list of (Filter)s is applied for i in filter_type: - _filter = getattr(filters, i, None) + if '(' in i and ')' in i: + left_pos = i.find('(') + filter_parameters = [self._get_arg(x.strip()) for x in i[left_pos + 1:i.find(')')].split(',')] + filter_name = i[:left_pos].strip() + else: + filter_parameters = None + filter_name = i + _filter = getattr(filters, filter_name, None) if isinstance(_filter, type) and issubclass(_filter, Filter): - _filters.append(_filter(self.config)) + if filter_parameters: + _filters.append(_filter(self.config, *filter_parameters)) + else: + _filters.append(_filter(self.config)) else: break else: diff --git a/tests/__init__.py b/tests/__init__.py index 3e0bd43a3..a9ccc6952 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -17,7 +17,7 @@ SAMPLES_IN_DOC = 447 # archived credentials that are not found without --depth -SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 30 +SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 29 SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 53 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1 diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index b28039408..eacefc4f2 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -580,50 +580,6 @@ } ] }, - { - "api_validation": "NOT_AVAILABLE", - "ml_validation": "NOT_AVAILABLE", - "ml_probability": null, - "rule": "AWS Multi", - "severity": "high", - "confidence": "moderate", - "line_data_list": [ - { - "line": " Follow the white rabbitAKIAGIREOGIPPTX1Y45X", - "line_num": 2, - "path": "./tests/samples/aws_id.pptx", - "info": "./tests/samples/aws_id.pptx|ZIP|ppt/slides/slide1.xml|RAW", - "value": "AKIAGIREOGIPPTX1Y45X", - "value_start": 2403, - "value_end": 2423, - "variable": null, - "variable_start": -2, - "variable_end": -2, - "entropy_validation": { - "iterator": "BASE64_CHARS", - "entropy": 3.6841837197791887, - "valid": false - } - }, - { - "line": " Follow the white rabbitAKIAGIREOGIPPTX1Y45X", - "line_num": 2, - "path": "./tests/samples/aws_id.pptx", - "info": "./tests/samples/aws_id.pptx|ZIP|ppt/slides/slide1.xml|RAW", - "value": "org/officeDocument/2006/relationships", - "value_start": 179, - "value_end": 216, - "variable": null, - "variable_start": -2, - "variable_end": -2, - "entropy_validation": { - "iterator": "BASE36_CHARS", - "entropy": 3.794653677335903, - "valid": true - } - } - ] - }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -8143,13 +8099,13 @@ "confidence": "strong", "line_data_list": [ { - "line": "JIRA = \"OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ\"", + "line": "TP: https://www.example.com/api/verification/version2322/token/OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ", "line_num": 1, "path": "./tests/samples/jira_confluence_pat", "info": "./tests/samples/jira_confluence_pat|RAW", "value": "OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ", - "value_start": 8, - "value_end": 52, + "value_start": 63, + "value_end": 107, "variable": null, "variable_start": -2, "variable_end": -2, diff --git a/tests/data/doc.json b/tests/data/doc.json index 06e86d4b1..636662b12 100644 --- a/tests/data/doc.json +++ b/tests/data/doc.json @@ -12228,13 +12228,13 @@ "confidence": "strong", "line_data_list": [ { - "line": "JIRA = \"OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ\"", + "line": "TP: https://www.example.com/api/verification/version2322/token/OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ", "line_num": 1, "path": "./tests/samples/jira_confluence_pat", "info": "./tests/samples/jira_confluence_pat|RAW", "value": "OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ", - "value_start": 8, - "value_end": 52, + "value_start": 63, + "value_end": 107, "variable": null, "variable_start": -2, "variable_end": -2, diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index 5a53ef27d..70caba570 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -8733,13 +8733,13 @@ "confidence": "strong", "line_data_list": [ { - "line": "d29b86517de7225ce062bc602beb5886d1454a5255911f4c3ab140af6973cf8a", + "line": "1623edb467cc32623f35eae6186382b1b4c3c2f6d70e20f42d5ab342d5c79d03", "line_num": 1, "path": "./tests/samples/jira_confluence_pat", "info": "", "value": "728e0e362437be53ffa2f9ee605f3870000122d7d03c20ce3d7c7b8f1d733d8e", - "value_start": 8, - "value_end": 52, + "value_start": 63, + "value_end": 107, "variable": null, "variable_start": -2, "variable_end": -2, diff --git a/tests/data/output.json b/tests/data/output.json index 9e83fc5dc..4c5f0e370 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -7680,13 +7680,13 @@ "confidence": "strong", "line_data_list": [ { - "line": "JIRA = \"OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ\"", + "line": "TP: https://www.example.com/api/verification/version2322/token/OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ", "line_num": 1, "path": "./tests/samples/jira_confluence_pat", "info": "", "value": "OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ", - "value_start": 8, - "value_end": 52, + "value_start": 63, + "value_end": 107, "variable": null, "variable_start": -2, "variable_end": -2, diff --git a/tests/filters/test_value_couple_keyword_check.py b/tests/filters/test_value_couple_keyword_check.py index 4b868a71c..dab4ebf12 100644 --- a/tests/filters/test_value_couple_keyword_check.py +++ b/tests/filters/test_value_couple_keyword_check.py @@ -1,6 +1,7 @@ import pytest from credsweeper.filters import ValueCoupleKeywordCheck +from tests import AZ_STRING from tests.filters.conftest import LINE_VALUE_PATTERN, DUMMY_ANALYSIS_TARGET from tests.test_utils.dummy_line_data import get_line_data @@ -18,3 +19,13 @@ def test_value_couple_keyword_check_p(self, file_path: pytest.fixture, line: str def test_value_couple_keyword_check_n(self, file_path: pytest.fixture, line: str) -> None: line_data = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN) assert ValueCoupleKeywordCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True + + @pytest.mark.parametrize("line", [AZ_STRING]) + def test_value_couple_keyword_check_arg_n(self, file_path: pytest.fixture, line: str) -> None: + line_data = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN) + assert ValueCoupleKeywordCheck(threshold=9).run(line_data, DUMMY_ANALYSIS_TARGET) is False + + @pytest.mark.parametrize("line", [AZ_STRING]) + def test_value_couple_keyword_check_arg_p(self, file_path: pytest.fixture, line: str) -> None: + line_data = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN) + assert ValueCoupleKeywordCheck(threshold=8).run(line_data, DUMMY_ANALYSIS_TARGET) is True diff --git a/tests/samples/aws_client_id b/tests/samples/aws_client_id index b5a83275c..45215bf30 100644 --- a/tests/samples/aws_client_id +++ b/tests/samples/aws_client_id @@ -2,3 +2,5 @@ The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X the coma is necessary there ^ bariers thesting !!! must be filtered: AKIAGIREOGIAEXAMPLE7 filtered too: AKIALGSBKLIKEAREAL12 --access-key + +exchangeAuthorizationAccessTokenWithPermission # looks like a key, but FP for aws multi \ No newline at end of file diff --git a/tests/samples/jira_confluence_pat b/tests/samples/jira_confluence_pat index a5f85ddcb..94bf2daf7 100644 --- a/tests/samples/jira_confluence_pat +++ b/tests/samples/jira_confluence_pat @@ -1 +1,2 @@ -JIRA = "OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ" \ No newline at end of file +TP: https://www.example.com/api/verification/version2322/token/OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ +FP: "image/png": "iVBORx09VIskhxhCe7sh03R1dnENPiB66xQSIZjEYN13vafX/OTI2NjA3NjU1NTI2Oh2DOnASdOHoIhEGyqIuYrdkYaQZ/hZwUteHsmN+z+aoEAAAAvL+Q5FSQGyqIuYrdkYaQZuW1TvI=\n",