From 4a64d98bc6d1674837b06791e706e7825a264e15 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Tue, 23 Jul 2024 20:54:29 +0300 Subject: [PATCH 01/18] Filters optimisation --- credsweeper/credentials/line_data.py | 2 + credsweeper/filters/__init__.py | 2 - credsweeper/filters/group/group.py | 6 +- .../filters/group/url_credentials_group.py | 5 +- .../value_dictionary_value_length_check.py | 7 ++- credsweeper/filters/value_length_check.py | 26 -------- .../filters/value_pattern_length_check.py | 10 --- credsweeper/rules/config.yaml | 2 +- tests/__init__.py | 6 +- tests/data/depth_3.json | 62 +++++++++++++++++-- tests/data/ml_threshold.json | 62 +++++++++++++++++-- tests/data/output.json | 62 +++++++++++++++++-- tests/filters/test_value_length_check.py | 19 ------ tests/ml_model/test_ml_validator.py | 17 +++-- tests/samples/salt.py | 1 + tests/samples/url_cred.js | 2 + tests/test_main.py | 2 +- 17 files changed, 200 insertions(+), 93 deletions(-) delete mode 100644 credsweeper/filters/value_length_check.py delete mode 100644 credsweeper/filters/value_pattern_length_check.py delete mode 100644 tests/filters/test_value_length_check.py diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py index 06a68fb5b..d407b657a 100644 --- a/credsweeper/credentials/line_data.py +++ b/credsweeper/credentials/line_data.py @@ -193,6 +193,8 @@ def sanitize_variable(self) -> None: while self.variable and sanitized_var_len != len(self.variable): sanitized_var_len = len(self.variable) self.variable = self.variable.strip(self.variable_strip_pattern) + if self.variable.endswith('\\'): + self.variable = self.variable[:-1] if variable and len(self.variable) < len(variable) and 0 <= self.variable_start and 0 <= self.variable_end: start = variable.find(self.variable) self.variable_start += start diff --git a/credsweeper/filters/__init__.py b/credsweeper/filters/__init__.py index 66de37b97..f7cc9c58f 100644 --- a/credsweeper/filters/__init__.py +++ b/credsweeper/filters/__init__.py @@ -28,13 +28,11 @@ from credsweeper.filters.value_jfrog_token_check import ValueJfrogTokenCheck from credsweeper.filters.value_json_web_token_check import ValueJsonWebTokenCheck from credsweeper.filters.value_last_word_check import ValueLastWordCheck -from credsweeper.filters.value_length_check import ValueLengthCheck from credsweeper.filters.value_method_check import ValueMethodCheck from credsweeper.filters.value_not_allowed_pattern_check import ValueNotAllowedPatternCheck from credsweeper.filters.value_not_part_encoded_check import ValueNotPartEncodedCheck from credsweeper.filters.value_number_check import ValueNumberCheck from credsweeper.filters.value_pattern_check import ValuePatternCheck -from credsweeper.filters.value_pattern_length_check import ValuePatternLengthCheck from credsweeper.filters.value_similarity_check import ValueSimilarityCheck from credsweeper.filters.value_split_keyword_check import ValueSplitKeywordCheck from credsweeper.filters.value_string_type_check import ValueStringTypeCheck diff --git a/credsweeper/filters/group/group.py b/credsweeper/filters/group/group.py index 6ee25387d..65a245fa9 100644 --- a/credsweeper/filters/group/group.py +++ b/credsweeper/filters/group/group.py @@ -5,9 +5,9 @@ from credsweeper.config import Config from credsweeper.filters import (Filter, LineSpecificKeyCheck, ValueAllowlistCheck, ValueArrayDictionaryCheck, ValueBlocklistCheck, ValueCamelCaseCheck, ValueFilePathCheck, ValueFirstWordCheck, - ValueLastWordCheck, ValueLengthCheck, ValueMethodCheck, ValueNotAllowedPatternCheck, + ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck, ValuePatternCheck, ValueSimilarityCheck, ValueStringTypeCheck, ValueTokenCheck, - VariableNotAllowedPatternCheck, ValuePatternLengthCheck, ValueHexNumberCheck) + VariableNotAllowedPatternCheck, ValueHexNumberCheck) class Group(ABC): @@ -43,7 +43,6 @@ def get_keyword_base_filters(config: Config) -> List[Filter]: ValueFirstWordCheck(), ValueHexNumberCheck(), ValueLastWordCheck(), - ValueLengthCheck(config), ValueMethodCheck(), ValueSimilarityCheck(), ValueStringTypeCheck(config), @@ -60,5 +59,4 @@ def get_pattern_base_filters(config: Config) -> List[Filter]: return [ # LineSpecificKeyCheck(), # ValuePatternCheck(config), # - ValuePatternLengthCheck(config) ] diff --git a/credsweeper/filters/group/url_credentials_group.py b/credsweeper/filters/group/url_credentials_group.py index 9a7477191..4c4c5d6a0 100644 --- a/credsweeper/filters/group/url_credentials_group.py +++ b/credsweeper/filters/group/url_credentials_group.py @@ -2,7 +2,7 @@ from credsweeper.config import Config from credsweeper.filters import (ValueAllowlistCheck, ValueArrayDictionaryCheck, ValueBlocklistCheck, ValueCamelCaseCheck, ValueDictionaryValueLengthCheck, ValueFilePathCheck, - ValueFirstWordCheck, ValueLastWordCheck, ValueLengthCheck, ValueMethodCheck, + ValueFirstWordCheck, ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck, ValuePatternCheck, ValueStringTypeCheck, ValueTokenCheck) from credsweeper.filters.group import Group @@ -25,11 +25,10 @@ def __init__(self, config: Config) -> None: ValueFilePathCheck(), ValueFirstWordCheck(), ValueLastWordCheck(), - ValueLengthCheck(config), ValueMethodCheck(), ValueStringTypeCheck(config), ValueNotAllowedPatternCheck(), ValueTokenCheck(), - ValueDictionaryValueLengthCheck(), + ValueDictionaryValueLengthCheck(min_len=4, max_len=80), ValuePatternCheck(config) ] diff --git a/credsweeper/filters/value_dictionary_value_length_check.py b/credsweeper/filters/value_dictionary_value_length_check.py index c0b92a846..8186f8229 100644 --- a/credsweeper/filters/value_dictionary_value_length_check.py +++ b/credsweeper/filters/value_dictionary_value_length_check.py @@ -7,8 +7,9 @@ class ValueDictionaryValueLengthCheck(Filter): """Check that candidate length is between 5 and 30.""" - def __init__(self, config: Config = None) -> None: - pass + def __init__(self, config: Config = None, min_len: int = 4, max_len: int = 31) -> None: + self.min_len = min_len + self.max_len = max_len def run(self, line_data: LineData, target: AnalysisTarget) -> bool: """Run filter checks on received credential candidate data 'line_data'. @@ -21,7 +22,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: True, if need to filter candidate and False if left """ - if 4 <= len(line_data.value) <= 31: + if self.min_len <= len(line_data.value) <= self.max_len: return False else: return True diff --git a/credsweeper/filters/value_length_check.py b/credsweeper/filters/value_length_check.py deleted file mode 100644 index 57596f35e..000000000 --- a/credsweeper/filters/value_length_check.py +++ /dev/null @@ -1,26 +0,0 @@ -from credsweeper.config import Config -from credsweeper.credentials import LineData -from credsweeper.file_handler.analysis_target import AnalysisTarget -from credsweeper.filters import Filter - - -class ValueLengthCheck(Filter): - """Check if potential candidate value is not too short (longer or equal to `min_len`).""" - - def __init__(self, config: Config) -> None: - self.min_len = config.min_keyword_value_length - - def run(self, line_data: LineData, target: AnalysisTarget) -> bool: - """Run filter checks on received credential candidate data 'line_data'. - - Args: - line_data: credential candidate data - target: multiline target from which line data was obtained - - Return: - True, if need to filter candidate and False if left - - """ - if len(line_data.value) < self.min_len: - return True - return False diff --git a/credsweeper/filters/value_pattern_length_check.py b/credsweeper/filters/value_pattern_length_check.py deleted file mode 100644 index dd4531bf1..000000000 --- a/credsweeper/filters/value_pattern_length_check.py +++ /dev/null @@ -1,10 +0,0 @@ -from credsweeper.config import Config -from credsweeper.filters import ValueLengthCheck - - -class ValuePatternLengthCheck(ValueLengthCheck): - """Check if potential candidate value is not too short like ValueLengthCheck but with different min_len""" - - def __init__(self, config: Config) -> None: - super().__init__(config) - self.min_len = config.min_pattern_value_length diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml index df36dee3f..5ff2a8c92 100644 --- a/credsweeper/rules/config.yaml +++ b/credsweeper/rules/config.yaml @@ -673,7 +673,7 @@ confidence: moderate type: pattern values: - - (?P[\"'])?(?P[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}@:/]{0,80}:){1,3}(?P[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P[\"'])? + - (?P[\"'])?(?P[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}:/]{0,80}:){1,3}(?P[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P[\"'])? filter_type: UrlCredentialsGroup use_ml: true required_substrings: diff --git a/tests/__init__.py b/tests/__init__.py index 3dfc0a7b5..d83aadba8 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,11 +7,11 @@ NEGLIGIBLE_ML_THRESHOLD = 0.0001 # credentials count after scan -SAMPLES_CRED_COUNT: int = 429 -SAMPLES_CRED_LINE_COUNT: int = 446 +SAMPLES_CRED_COUNT: int = 431 +SAMPLES_CRED_LINE_COUNT: int = 448 # credentials count after post-processing -SAMPLES_POST_CRED_COUNT: int = 387 +SAMPLES_POST_CRED_COUNT: int = 389 # with option --doc SAMPLES_IN_DOC = 410 diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 448776d10..27447f7fc 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -11243,6 +11243,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.665, + "rule": "Salt", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "json_escaped = \"{\\\\\\\"salt8\\\\\\\":\\\\\\\"4b9a6d8b638eb0c6\\\\\\\"}\"", + "line_num": 5, + "path": "tests/samples/salt.py", + "info": "tests/samples/salt.py|RAW", + "value": "4b9a6d8b638eb0c6", + "value_start": 35, + "value_end": 51, + "variable": "salt8", + "variable_start": 21, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2806390622295662, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -12647,6 +12674,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.989, + "rule": "URL Credentials", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", + "line_num": 3, + "path": "tests/samples/url_cred.js", + "info": "tests/samples/url_cred.js|RAW", + "value": "FnD83JZs", + "value_start": 44, + "value_end": 52, + "variable": "smtps://", + "variable_start": 18, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -12657,7 +12711,7 @@ "line_data_list": [ { "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";", - "line_num": 3, + "line_num": 5, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "546DFS64N90P3AW7DX%2Fkeep", @@ -12684,7 +12738,7 @@ "line_data_list": [ { "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"", - "line_num": 7, + "line_num": 9, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "f45VgF8jX79o@anydata.com", @@ -12711,7 +12765,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 10, + "line_num": 12, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "3487263-2384579834-234732875-345", @@ -12738,7 +12792,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 10, + "line_num": 12, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "546DFS64N90P3AW7DX", diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index 0287a0ae5..878f82d6d 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -11007,6 +11007,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.665, + "rule": "Salt", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "json_escaped = \"{\\\\\\\"salt8\\\\\\\":\\\\\\\"4b9a6d8b638eb0c6\\\\\\\"}\"", + "line_num": 5, + "path": "tests/samples/salt.py", + "info": "", + "value": "4b9a6d8b638eb0c6", + "value_start": 35, + "value_end": 51, + "variable": "salt8", + "variable_start": 21, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2806390622295662, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -11682,6 +11709,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.989, + "rule": "URL Credentials", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", + "line_num": 3, + "path": "tests/samples/url_cred.js", + "info": "", + "value": "FnD83JZs", + "value_start": 44, + "value_end": 52, + "variable": "smtps://", + "variable_start": 18, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -11692,7 +11746,7 @@ "line_data_list": [ { "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";", - "line_num": 3, + "line_num": 5, "path": "tests/samples/url_cred.js", "info": "", "value": "546DFS64N90P3AW7DX%2Fkeep", @@ -11719,7 +11773,7 @@ "line_data_list": [ { "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"", - "line_num": 7, + "line_num": 9, "path": "tests/samples/url_cred.js", "info": "", "value": "f45VgF8jX79o@anydata.com", @@ -11746,7 +11800,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 10, + "line_num": 12, "path": "tests/samples/url_cred.js", "info": "", "value": "3487263-2384579834-234732875-345", @@ -11773,7 +11827,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 10, + "line_num": 12, "path": "tests/samples/url_cred.js", "info": "", "value": "546DFS64N90P3AW7DX", diff --git a/tests/data/output.json b/tests/data/output.json index b9d388ae9..033d703a2 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -9954,6 +9954,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.665, + "rule": "Salt", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "json_escaped = \"{\\\\\\\"salt8\\\\\\\":\\\\\\\"4b9a6d8b638eb0c6\\\\\\\"}\"", + "line_num": 5, + "path": "tests/samples/salt.py", + "info": "", + "value": "4b9a6d8b638eb0c6", + "value_start": 35, + "value_end": 51, + "variable": "salt8", + "variable_start": 21, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2806390622295662, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -10575,6 +10602,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.989, + "rule": "URL Credentials", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", + "line_num": 3, + "path": "tests/samples/url_cred.js", + "info": "", + "value": "FnD83JZs", + "value_start": 44, + "value_end": 52, + "variable": "smtps://", + "variable_start": 18, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -10585,7 +10639,7 @@ "line_data_list": [ { "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";", - "line_num": 3, + "line_num": 5, "path": "tests/samples/url_cred.js", "info": "", "value": "546DFS64N90P3AW7DX%2Fkeep", @@ -10612,7 +10666,7 @@ "line_data_list": [ { "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"", - "line_num": 7, + "line_num": 9, "path": "tests/samples/url_cred.js", "info": "", "value": "f45VgF8jX79o@anydata.com", @@ -10639,7 +10693,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 10, + "line_num": 12, "path": "tests/samples/url_cred.js", "info": "", "value": "3487263-2384579834-234732875-345", @@ -10666,7 +10720,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 10, + "line_num": 12, "path": "tests/samples/url_cred.js", "info": "", "value": "546DFS64N90P3AW7DX", diff --git a/tests/filters/test_value_length_check.py b/tests/filters/test_value_length_check.py deleted file mode 100644 index 69bd5f809..000000000 --- a/tests/filters/test_value_length_check.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest - -from credsweeper.config import Config -from credsweeper.filters import ValueLengthCheck -from tests.filters.conftest import LINE_VALUE_PATTERN, DUMMY_ANALYSIS_TARGET -from tests.test_utils.dummy_line_data import get_line_data - - -class TestValueLengthCheck: - - def test_value_length_check_p(self, file_path: pytest.fixture, config: Config, - success_line: pytest.fixture) -> None: - line_data = get_line_data(file_path, line=success_line, pattern=LINE_VALUE_PATTERN) - assert ValueLengthCheck(config).run(line_data, DUMMY_ANALYSIS_TARGET) is False - - @pytest.mark.parametrize("line", ["Cra"]) - def test_value_length_check_n(self, file_path: pytest.fixture, config: Config, line: str) -> None: - line_data = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN) - assert ValueLengthCheck(config).run(line_data, DUMMY_ANALYSIS_TARGET) is True diff --git a/tests/ml_model/test_ml_validator.py b/tests/ml_model/test_ml_validator.py index 9322eae63..fefc1eef8 100644 --- a/tests/ml_model/test_ml_validator.py +++ b/tests/ml_model/test_ml_validator.py @@ -10,7 +10,7 @@ from credsweeper.credentials import Candidate, CandidateKey from credsweeper.ml_model import MlValidator from credsweeper.utils import Util -from tests import AZ_STRING, NEGLIGIBLE_ML_THRESHOLD +from tests import NEGLIGIBLE_ML_THRESHOLD class TestMlValidator(unittest.TestCase): @@ -31,7 +31,6 @@ def setUp(self): self.config = Config(config_dict) def test_ml_validator_simple_p(self): - def validate(_candidate: Candidate) -> Tuple[bool, float]: """Validate single credential candidate.""" candidate_key = CandidateKey(_candidate.line_data_list[0]) @@ -48,22 +47,22 @@ def validate(_candidate: Candidate) -> Tuple[bool, float]: candidate.line_data_list[0].value = "Ahga%$FiQ@Ei8" decision, probability = validate(candidate) - self.assertAlmostEqual(probability, 0.9997520446777344, delta=NEGLIGIBLE_ML_THRESHOLD) + self.assertAlmostEqual(0.9997520446777344, probability, delta=NEGLIGIBLE_ML_THRESHOLD) candidate.line_data_list[0].path = "sample.py" candidate.line_data_list[0].file_type = ".yaml" decision, probability = validate(candidate) - self.assertAlmostEqual(probability, 0.9994515776634216, delta=NEGLIGIBLE_ML_THRESHOLD) + self.assertAlmostEqual(0.9994515776634216, probability, delta=NEGLIGIBLE_ML_THRESHOLD) candidate.line_data_list[0].path = "test.zip" candidate.line_data_list[0].file_type = ".zip" decision, probability = validate(candidate) - self.assertAlmostEqual(probability, 0.9994281530380249, delta=NEGLIGIBLE_ML_THRESHOLD) + self.assertAlmostEqual(0.9994281530380249, probability, delta=NEGLIGIBLE_ML_THRESHOLD) candidate.line_data_list[0].path = "other.txt" candidate.line_data_list[0].file_type = ".txt" decision, probability = validate(candidate) - self.assertAlmostEqual(probability, 0.9980608820915222, delta=NEGLIGIBLE_ML_THRESHOLD) + self.assertAlmostEqual(0.9980608820915222, probability, delta=NEGLIGIBLE_ML_THRESHOLD) def test_extract_features_p(self): candidate1 = Candidate.get_dummy_candidate(self.config, "main.py", ".py", "info") @@ -74,10 +73,10 @@ def test_extract_features_p(self): candidate1.line_data_list[0].value = "123" candidate1.rule_name = "Password" features1 = self.ml_validator.extract_features([candidate1]) - self.assertEqual(18, np.count_nonzero(features1)) + self.assertAlmostEqual(18, np.count_nonzero(features1), delta=NEGLIGIBLE_ML_THRESHOLD) candidate2 = copy.deepcopy(candidate1) features2 = self.ml_validator.extract_features([candidate1, candidate2]) - self.assertEqual(18, np.count_nonzero(features2)) + self.assertAlmostEqual(18, np.count_nonzero(features2), delta=NEGLIGIBLE_ML_THRESHOLD) candidate2.rule_name = "Secret" features3 = self.ml_validator.extract_features([candidate1, candidate2]) - self.assertEqual(19, np.count_nonzero(features3)) + self.assertAlmostEqual(19, np.count_nonzero(features3), delta=NEGLIGIBLE_ML_THRESHOLD) diff --git a/tests/samples/salt.py b/tests/samples/salt.py index 4140c4e5a..60b2fcd07 100644 --- a/tests/samples/salt.py +++ b/tests/samples/salt.py @@ -2,3 +2,4 @@ salt2 = r"""\0x12\0x3s""" salt3 = u"\u0020827634876" salt4 = {"salt5": "my124%#$@s\x04clt\0"} +json_escaped = "{\\\"salt8\\\":\\\"4b9a6d8b638eb0c6\\\"}" diff --git a/tests/samples/url_cred.js b/tests/samples/url_cred.js index 4b21bc971..db6af3e57 100644 --- a/tests/samples/url_cred.js +++ b/tests/samples/url_cred.js @@ -1,5 +1,7 @@ const connection_url = require('dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local'); +email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465"; + url = "https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut"; // note:dummyuser@example.com diff --git a/tests/test_main.py b/tests/test_main.py index d7066debf..6c774c4f9 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -749,7 +749,7 @@ def prepare(report: List[Dict[str, Any]]): tmp_file = Path(tmp_dir) / cfg["json_filename"] # apply the current path to keep equivalence in path os.chdir(TESTS_PATH.parent) - content_provider: AbstractProvider = FilesProvider(["tests/samples"]) + content_provider: AbstractProvider = FilesProvider([Path("tests") / "samples"]) # replace output report file to place in tmp_dir cfg["json_filename"] = str(tmp_file) cred_sweeper = CredSweeper(**cfg) From 49e82cee1c82d0f793f41f53041262a2e0fc52d9 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Tue, 23 Jul 2024 21:08:26 +0300 Subject: [PATCH 02/18] style --- credsweeper/filters/group/group.py | 4 ++-- credsweeper/filters/group/url_credentials_group.py | 4 ++-- tests/ml_model/test_ml_validator.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/credsweeper/filters/group/group.py b/credsweeper/filters/group/group.py index 65a245fa9..37cea6948 100644 --- a/credsweeper/filters/group/group.py +++ b/credsweeper/filters/group/group.py @@ -5,8 +5,8 @@ from credsweeper.config import Config from credsweeper.filters import (Filter, LineSpecificKeyCheck, ValueAllowlistCheck, ValueArrayDictionaryCheck, ValueBlocklistCheck, ValueCamelCaseCheck, ValueFilePathCheck, ValueFirstWordCheck, - ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck, - ValuePatternCheck, ValueSimilarityCheck, ValueStringTypeCheck, ValueTokenCheck, + ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck, ValuePatternCheck, + ValueSimilarityCheck, ValueStringTypeCheck, ValueTokenCheck, VariableNotAllowedPatternCheck, ValueHexNumberCheck) diff --git a/credsweeper/filters/group/url_credentials_group.py b/credsweeper/filters/group/url_credentials_group.py index 4c4c5d6a0..23aba1d3b 100644 --- a/credsweeper/filters/group/url_credentials_group.py +++ b/credsweeper/filters/group/url_credentials_group.py @@ -2,8 +2,8 @@ from credsweeper.config import Config from credsweeper.filters import (ValueAllowlistCheck, ValueArrayDictionaryCheck, ValueBlocklistCheck, ValueCamelCaseCheck, ValueDictionaryValueLengthCheck, ValueFilePathCheck, - ValueFirstWordCheck, ValueLastWordCheck, ValueMethodCheck, - ValueNotAllowedPatternCheck, ValuePatternCheck, ValueStringTypeCheck, ValueTokenCheck) + ValueFirstWordCheck, ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck, + ValuePatternCheck, ValueStringTypeCheck, ValueTokenCheck) from credsweeper.filters.group import Group diff --git a/tests/ml_model/test_ml_validator.py b/tests/ml_model/test_ml_validator.py index fefc1eef8..ee7083ae9 100644 --- a/tests/ml_model/test_ml_validator.py +++ b/tests/ml_model/test_ml_validator.py @@ -31,6 +31,7 @@ def setUp(self): self.config = Config(config_dict) def test_ml_validator_simple_p(self): + def validate(_candidate: Candidate) -> Tuple[bool, float]: """Validate single credential candidate.""" candidate_key = CandidateKey(_candidate.line_data_list[0]) From fdd483bd93b93bc46431f6bfbc10ec016358c8cd Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Tue, 23 Jul 2024 21:44:59 +0300 Subject: [PATCH 03/18] unicode cases in filter --- credsweeper/filters/value_not_allowed_pattern_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/credsweeper/filters/value_not_allowed_pattern_check.py b/credsweeper/filters/value_not_allowed_pattern_check.py index 944c9c34e..a0cc89aa1 100644 --- a/credsweeper/filters/value_not_allowed_pattern_check.py +++ b/credsweeper/filters/value_not_allowed_pattern_check.py @@ -10,7 +10,7 @@ class ValueNotAllowedPatternCheck(Filter): """Check that secret doesn't open or closes brackets or a new line.""" - NOT_ALLOWED = [r"[<>\[\]{}]\s+", r"^\s*\\", r"^\s*\\n\s*"] + NOT_ALLOWED = [r"[<>\[\]{}]\s+", r"\\u00(26|3c)gt;?(\s|\\+[nrt])?", r"^\s*\\", r"^\s*\\n\s*"] NOT_ALLOWED_PATTERN = re.compile( # f"{Util.get_regex_combine_or(NOT_ALLOWED)}$", # flags=re.IGNORECASE) From 21fd81fb8dcc175921d7a20153154d3d43c9abf7 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Tue, 23 Jul 2024 21:47:42 +0300 Subject: [PATCH 04/18] tmp markup loan --- cicd/benchmark.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index c8d8fa7f7..52f7b0ca0 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -223,6 +223,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .zsh 6 872 12 .zsh-theme 1 97 1 TOTAL: 10333 16988573 8377 60439 5233 +NOT FOUND WITH KEY: 1338367,b42689a1,GitHub,ac9be8d9,data/ac9be8d9/test/b42689a1.exs,445,445,F,F,68,78,F,F,,,,,0.0,0,F,F,F,Password credsweeper result_cnt : 7800, lost_cnt : 0, true_cnt : 7231, false_cnt : 569 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- From 97ef85bbd209dc242829c312806f968b54a2651c Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 24 Jul 2024 07:56:54 +0300 Subject: [PATCH 05/18] BM scores fix --- cicd/benchmark.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index 52f7b0ca0..60795a0fe 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -224,7 +224,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .zsh-theme 1 97 1 TOTAL: 10333 16988573 8377 60439 5233 NOT FOUND WITH KEY: 1338367,b42689a1,GitHub,ac9be8d9,data/ac9be8d9/test/b42689a1.exs,445,445,F,F,68,78,F,F,,,,,0.0,0,F,F,F,Password -credsweeper result_cnt : 7800, lost_cnt : 0, true_cnt : 7231, false_cnt : 569 +credsweeper result_cnt : 7808, lost_cnt : 1, true_cnt : 7237, false_cnt : 570 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- API 123 3163 185 112 109 3 3345 14 0.000896 0.113821 0.995102 0.973214 0.886179 0.927660 @@ -260,12 +260,12 @@ Key 483 8494 464 44 Nonce 83 53 0 85 79 6 47 4 0.113208 0.048193 0.926471 0.929412 0.951807 0.940476 Other 0 0 5 0 0 5 0 0.000000 1.000000 PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1823 7474 2752 1681 1614 67 10159 209 0.006552 0.114646 0.977094 0.960143 0.885354 0.921233 +Password 1823 7474 2752 1682 1614 67 10159 209 0.006552 0.114646 0.977094 0.960143 0.885354 0.921233 Salt 42 76 2 38 38 0 78 4 0.000000 0.095238 0.966667 1.000000 0.904762 0.950000 Secret 1358 28497 869 1234 1229 5 29361 129 0.000170 0.094993 0.995639 0.995948 0.905007 0.948302 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 585 3972 439 519 511 8 4403 74 0.001814 0.126496 0.983587 0.984586 0.873504 0.925725 +Token 585 3972 439 521 512 9 4402 73 0.002040 0.124786 0.983587 0.982726 0.875214 0.925859 Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 -URL Credentials 194 125 251 184 184 0 376 10 0.000000 0.051546 0.982456 1.000000 0.948454 0.973545 - 8377 60439 5233 7942 7231 569 59870 1146 0.009414 0.136803 0.975078 0.927051 0.863197 0.893985 +URL Credentials 194 125 251 189 189 0 376 5 0.000000 0.025773 0.991228 1.000000 0.974227 0.986945 + 8377 60439 5233 7950 7237 570 59869 1140 0.009431 0.136087 0.975151 0.926989 0.863913 0.894340 From f851904671619c24226fadc2048b11918a41fec0 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 24 Jul 2024 07:59:13 +0300 Subject: [PATCH 06/18] docs upd --- docs/source/credsweeper.filters.rst | 48 +++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/docs/source/credsweeper.filters.rst b/docs/source/credsweeper.filters.rst index f2da332f5..5d336e89d 100644 --- a/docs/source/credsweeper.filters.rst +++ b/docs/source/credsweeper.filters.rst @@ -20,6 +20,14 @@ credsweeper.filters.filter module :undoc-members: :show-inheritance: +credsweeper.filters.line\_git\_binary\_check module +--------------------------------------------------- + +.. automodule:: credsweeper.filters.line_git_binary_check + :members: + :undoc-members: + :show-inheritance: + credsweeper.filters.line\_specific\_key\_check module ----------------------------------------------------- @@ -132,6 +140,14 @@ credsweeper.filters.value\_dictionary\_value\_length\_check module :undoc-members: :show-inheritance: +credsweeper.filters.value\_discord\_bot\_check module +----------------------------------------------------- + +.. automodule:: credsweeper.filters.value_discord_bot_check + :members: + :undoc-members: + :show-inheritance: + credsweeper.filters.value\_entropy\_base32\_check module -------------------------------------------------------- @@ -188,6 +204,22 @@ credsweeper.filters.value\_grafana\_check module :undoc-members: :show-inheritance: +credsweeper.filters.value\_grafana\_service\_check module +--------------------------------------------------------- + +.. automodule:: credsweeper.filters.value_grafana_service_check + :members: + :undoc-members: + :show-inheritance: + +credsweeper.filters.value\_hex\_number\_check module +---------------------------------------------------- + +.. automodule:: credsweeper.filters.value_hex_number_check + :members: + :undoc-members: + :show-inheritance: + credsweeper.filters.value\_ip\_check module ------------------------------------------- @@ -220,14 +252,6 @@ credsweeper.filters.value\_last\_word\_check module :undoc-members: :show-inheritance: -credsweeper.filters.value\_length\_check module ------------------------------------------------ - -.. automodule:: credsweeper.filters.value_length_check - :members: - :undoc-members: - :show-inheritance: - credsweeper.filters.value\_method\_check module ----------------------------------------------- @@ -268,14 +292,6 @@ credsweeper.filters.value\_pattern\_check module :undoc-members: :show-inheritance: -credsweeper.filters.value\_pattern\_length\_check module --------------------------------------------------------- - -.. automodule:: credsweeper.filters.value_pattern_length_check - :members: - :undoc-members: - :show-inheritance: - credsweeper.filters.value\_similarity\_check module --------------------------------------------------- From ec48914eec0164de46f58893efb3e7e08d5cf17a Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 24 Jul 2024 12:52:04 +0300 Subject: [PATCH 07/18] removed unused filter --- .../filters/separator_unusual_check.py | 49 ------------------- 1 file changed, 49 deletions(-) delete mode 100644 credsweeper/filters/separator_unusual_check.py diff --git a/credsweeper/filters/separator_unusual_check.py b/credsweeper/filters/separator_unusual_check.py deleted file mode 100644 index b05da326b..000000000 --- a/credsweeper/filters/separator_unusual_check.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging - -from credsweeper.config import Config -from credsweeper.credentials import LineData -from credsweeper.file_handler.analysis_target import AnalysisTarget -from credsweeper.filters import Filter - -logger = logging.getLogger(__name__) - - -class SeparatorUnusualCheck(Filter): - """Check that candidate have no double symbol ops (like ++, --, <<) or comparison ops (like != or ==) as separator. - - Example: - `pwd == 'value'` - `pwd != 'value'` - `pwd << value` - - """ - - def __init__(self, config: Config = None) -> None: - pass - - def run(self, line_data: LineData, target: AnalysisTarget) -> bool: - """Run filter checks on received credential candidate data 'line_data'. - - Args: - line_data: credential candidate data - target: multiline target from which line data was obtained - - Return: - True, if need to filter candidate and False if left - - """ - if line_data.separator is None: - return True - - if 1 > line_data.separator_start: - logger.warning(f"Wrong separator start position {line_data}") - return True - - try: - if line_data.separator == line_data.line[line_data.separator_start + 1] or \ - (line_data.separator == "=" and line_data.line[line_data.separator_start - 1] == "!"): - return True - except IndexError: - return True - - return False From 898a2523fce2823723669506399d401eeb444901 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Thu, 25 Jul 2024 10:01:54 +0300 Subject: [PATCH 08/18] doc upd --- docs/source/credsweeper.filters.rst | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/source/credsweeper.filters.rst b/docs/source/credsweeper.filters.rst index 5d336e89d..94f1891be 100644 --- a/docs/source/credsweeper.filters.rst +++ b/docs/source/credsweeper.filters.rst @@ -36,14 +36,6 @@ credsweeper.filters.line\_specific\_key\_check module :undoc-members: :show-inheritance: -credsweeper.filters.separator\_unusual\_check module ----------------------------------------------------- - -.. automodule:: credsweeper.filters.separator_unusual_check - :members: - :undoc-members: - :show-inheritance: - credsweeper.filters.value\_allowlist\_check module -------------------------------------------------- From 92858b6cbd8db78b8e1b660b07dc0a3d976b4051 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Thu, 25 Jul 2024 12:31:23 +0300 Subject: [PATCH 09/18] [no ci] upd2 --- cicd/benchmark.txt | 11 +++--- credsweeper/filters/group/general_pattern.py | 2 ++ .../filters/line_specific_key_check.py | 12 +++++-- .../filters/value_useless_word_check.py | 7 ++-- tests/__init__.py | 2 +- tests/data/depth_3.json | 35 ++++++++++++++++--- tests/data/ml_threshold.json | 8 ++--- tests/data/output.json | 8 ++--- tests/filters/test_line_specific_key_check.py | 4 ++- .../filters/test_value_useless_word_check.py | 2 +- tests/samples/aws_client_id | 2 ++ tests/samples/key.hs | 2 +- 12 files changed, 67 insertions(+), 28 deletions(-) diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index 60795a0fe..1e49d192d 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -223,14 +223,15 @@ FileType FileNumber ValidLines Positives Negatives Templat .zsh 6 872 12 .zsh-theme 1 97 1 TOTAL: 10333 16988573 8377 60439 5233 -NOT FOUND WITH KEY: 1338367,b42689a1,GitHub,ac9be8d9,data/ac9be8d9/test/b42689a1.exs,445,445,F,F,68,78,F,F,,,,,0.0,0,F,F,F,Password -credsweeper result_cnt : 7808, lost_cnt : 1, true_cnt : 7237, false_cnt : 570 +UNMATCH (35, 949): 1338367,8605db08,GitHub,48fd3902,data/48fd3902/test/8605db08.kt,30,30,F,F,34,949,F,F,,,,,0.0,0,F,F,F,Token +NOT FOUND WITH KEY: 1338368,b42689a1,GitHub,ac9be8d9,data/ac9be8d9/test/b42689a1.exs,445,445,F,F,68,78,F,F,,,,,0.0,0,F,F,F,Password +credsweeper result_cnt : 7810, lost_cnt : 2, true_cnt : 7237, false_cnt : 571 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- API 123 3163 185 112 109 3 3345 14 0.000896 0.113821 0.995102 0.973214 0.886179 0.927660 AWS Client ID 168 13 0 160 160 0 13 8 0.000000 0.047619 0.955801 1.000000 0.952381 0.975610 AWS Multi 75 12 0 87 75 11 1 0 0.916667 0.000000 0.873563 0.872093 1.000000 0.931677 -AWS S3 Bucket 61 25 0 87 61 24 1 0 0.960000 0.000000 0.720930 0.717647 1.000000 0.835616 +AWS S3 Bucket 61 25 0 92 61 25 0 0 1.000000 0.000000 0.709302 0.709302 1.000000 0.829932 Atlassian Old PAT token 27 212 3 12 3 8 207 24 0.037209 0.888889 0.867769 0.272727 0.111111 0.157895 Auth 407 2725 77 372 351 21 2781 56 0.007495 0.137592 0.976005 0.943548 0.862408 0.901155 Azure Access Token 19 0 0 0 0 0 19 1.000000 0.000000 0.000000 @@ -265,7 +266,7 @@ Salt 42 76 2 3 Secret 1358 28497 869 1234 1229 5 29361 129 0.000170 0.094993 0.995639 0.995948 0.905007 0.948302 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 585 3972 439 521 512 9 4402 73 0.002040 0.124786 0.983587 0.982726 0.875214 0.925859 +Token 585 3972 439 522 512 9 4402 73 0.002040 0.124786 0.983587 0.982726 0.875214 0.925859 Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 URL Credentials 194 125 251 189 189 0 376 5 0.000000 0.025773 0.991228 1.000000 0.974227 0.986945 - 8377 60439 5233 7950 7237 570 59869 1140 0.009431 0.136087 0.975151 0.926989 0.863913 0.894340 + 8377 60439 5233 7956 7237 571 59868 1140 0.009448 0.136087 0.975137 0.926870 0.863913 0.894285 diff --git a/credsweeper/filters/group/general_pattern.py b/credsweeper/filters/group/general_pattern.py index b340055fd..96018a5f4 100644 --- a/credsweeper/filters/group/general_pattern.py +++ b/credsweeper/filters/group/general_pattern.py @@ -1,5 +1,6 @@ from credsweeper.common.constants import GroupType from credsweeper.config import Config +from credsweeper.filters import ValueUselessWordCheck from credsweeper.filters.group import Group @@ -8,3 +9,4 @@ class GeneralPattern(Group): def __init__(self, config: Config) -> None: super().__init__(config, GroupType.PATTERN) + self.filters.extend([ValueUselessWordCheck()]) diff --git a/credsweeper/filters/line_specific_key_check.py b/credsweeper/filters/line_specific_key_check.py index 8bbfa15a1..71fec9dc0 100644 --- a/credsweeper/filters/line_specific_key_check.py +++ b/credsweeper/filters/line_specific_key_check.py @@ -1,5 +1,6 @@ import re +from credsweeper.common.constants import ML_HUNK from credsweeper.config import Config from credsweeper.credentials import LineData from credsweeper.file_handler.analysis_target import AnalysisTarget @@ -10,8 +11,8 @@ class LineSpecificKeyCheck(Filter): """Check that values from list below is not in candidate line.""" - NOT_ALLOWED = [r"example", r"enc\(", r"enc\[", r"true", r"false"] - NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED)) + NOT_ALLOWED = [r"example", r"\benc[\(\[]", r"\btrue\b", r"\bfalse\b"] + NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED), re.IGNORECASE) def __init__(self, config: Config = None) -> None: pass @@ -29,8 +30,13 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: """ if line_data.line is None: return True + if 0 <= line_data.variable_start: + # variable may be defined too + sub_line_start = 0 if ML_HUNK >= line_data.variable_start else line_data.variable_start - ML_HUNK + else: + sub_line_start = 0 if ML_HUNK >= line_data.value_start else line_data.value_start - ML_HUNK - if self.NOT_ALLOWED_PATTERN.search(target.line_lower): + if self.NOT_ALLOWED_PATTERN.search(line_data.line, sub_line_start, line_data.value_end + ML_HUNK): return True return False diff --git a/credsweeper/filters/value_useless_word_check.py b/credsweeper/filters/value_useless_word_check.py index c921c937d..6182d1e3f 100644 --- a/credsweeper/filters/value_useless_word_check.py +++ b/credsweeper/filters/value_useless_word_check.py @@ -11,11 +11,10 @@ class ValueUselessWordCheck(Filter): """Check is candidate value contains sub-rows with operators (like ->).""" NOT_ALLOWED = [ - "((\\{)?(0x)+([0-9a-f]|\\%){1}.*)", # Check is contain \{0x or 0x - "(\\-\\>.*)", # Check if contain -> - "(xxxx.*)", # Check if contain xxxxx + "((\\{)?(0x)+([0-9a-f]|\\%){1})", # Check is contain \{0x or 0x + r"((\w+)?->)", # Check if contain -> + "(.*example)", # Check if contain `example` word "(\\$\\w+)", # Check whether it looks like a variable e.g. $word - "(\\s).*" # Check if contain \s ] NOT_ALLOWED_PATTERN = re.compile( # Util.get_regex_combine_or(NOT_ALLOWED), # diff --git a/tests/__init__.py b/tests/__init__.py index d83aadba8..9a68241c1 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -18,7 +18,7 @@ # archived credentials that are not found without --depth SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 25 -SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 18 +SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 19 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1 # well known string with all latin letters diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 27447f7fc..f9b502b47 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -9075,17 +9075,17 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.986, + "ml_probability": 0.999, "rule": "Secret", "severity": "medium", "confidence": "moderate", "line_data_list": [ { - "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE\"", + "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE\"", "line_num": 5, "path": "tests/samples/key.hs", "info": "tests/samples/key.hs|RAW", - "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE", + "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE", "value_start": 32, "value_end": 72, "variable": "secret_looks_like_linux_path__", @@ -9093,7 +9093,7 @@ "variable_end": 30, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.784183719779189, + "entropy": 4.8530559073332755, "valid": true } } @@ -11162,6 +11162,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.911, + "rule": "Salt", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "salt3 = \" 827634876\"; ", + "line_num": 1, + "path": "tests/samples/salt.py", + "info": "tests/samples/salt.py|STRUCT|STRUCT:2|KEYWORD:`salt3 = \" 827634876\"; `", + "value": " 827634876", + "value_start": 9, + "value_end": 19, + "variable": "salt3", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.389735285398626, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index 878f82d6d..ea8154995 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -9736,17 +9736,17 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.986, + "ml_probability": 0.999, "rule": "Secret", "severity": "medium", "confidence": "moderate", "line_data_list": [ { - "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE\"", + "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE\"", "line_num": 5, "path": "tests/samples/key.hs", "info": "", - "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE", + "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE", "value_start": 32, "value_end": 72, "variable": "secret_looks_like_linux_path__", @@ -9754,7 +9754,7 @@ "variable_end": 30, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.784183719779189, + "entropy": 4.8530559073332755, "valid": true } } diff --git a/tests/data/output.json b/tests/data/output.json index 033d703a2..e6b2d1d13 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -8818,17 +8818,17 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.986, + "ml_probability": 0.999, "rule": "Secret", "severity": "medium", "confidence": "moderate", "line_data_list": [ { - "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE\"", + "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE\"", "line_num": 5, "path": "tests/samples/key.hs", "info": "", - "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE", + "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE", "value_start": 32, "value_end": 72, "variable": "secret_looks_like_linux_path__", @@ -8836,7 +8836,7 @@ "variable_end": 30, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.784183719779189, + "entropy": 4.8530559073332755, "valid": true } } diff --git a/tests/filters/test_line_specific_key_check.py b/tests/filters/test_line_specific_key_check.py index ca123d839..c55a3cb3f 100644 --- a/tests/filters/test_line_specific_key_check.py +++ b/tests/filters/test_line_specific_key_check.py @@ -18,9 +18,11 @@ def test_line_specific_key_check_p(self, file_path: pytest.fixture, line: str) - @pytest.mark.parametrize("line", [ '"AwsAccessKey": enc("AKIAGIREOGIAWSKEY123"),', - '"AwsAccessKey": "AKIAGIREXAMPLEKEY123"', + '"AwsAccessKey as example": "AKIAGIREXAMPLEKEY123"', ]) def test_line_specific_key_check_n(self, file_path: pytest.fixture, line: str) -> None: cred_candidate = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN) + # LINE_VALUE_PATTERN does not detect a value position + cred_candidate.value_start = line.find("AKIA") target = AnalysisTarget(line_pos=0, lines=[line], line_nums=[1], descriptor=DUMMY_DESCRIPTOR) assert LineSpecificKeyCheck().run(cred_candidate, target) is True diff --git a/tests/filters/test_value_useless_word_check.py b/tests/filters/test_value_useless_word_check.py index 2cbdcf192..911f3781f 100644 --- a/tests/filters/test_value_useless_word_check.py +++ b/tests/filters/test_value_useless_word_check.py @@ -11,7 +11,7 @@ def test_value_useless_word_check_p(self, file_path: pytest.fixture, success_lin line_data = get_line_data(file_path=file_path, line=success_line, pattern=LINE_VALUE_PATTERN) assert ValueUselessWordCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False - @pytest.mark.parametrize("line", ["{0x943058439}", "0x%", "->gi_reo_gi", "xxxxxGIREOGI", " GIREOGI"]) + @pytest.mark.parametrize("line", ["{0x943058439}", "0x%", "->gi_reo_gi", "GIREOGIEXAMPLE"]) def test_value_useless_word_check_n(self, file_path: pytest.fixture, line: str) -> None: line_data = get_line_data(file_path=file_path, line=line, pattern=LINE_VALUE_PATTERN) assert ValueUselessWordCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True diff --git a/tests/samples/aws_client_id b/tests/samples/aws_client_id index 3685378f5..b5a83275c 100644 --- a/tests/samples/aws_client_id +++ b/tests/samples/aws_client_id @@ -1,2 +1,4 @@ The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X the coma is necessary there ^ bariers thesting !!! +must be filtered: AKIAGIREOGIAEXAMPLE7 +filtered too: AKIALGSBKLIKEAREAL12 --access-key diff --git a/tests/samples/key.hs b/tests/samples/key.hs index 5a13fab50..3d197e623 100644 --- a/tests/samples/key.hs +++ b/tests/samples/key.hs @@ -2,6 +2,6 @@ prKeyValid=LS0tLS1CRUdJTiBQUklWQVRFIENDcUdTTTQ5QXdFSEJHMHdhd0lCQVFRZ0ViVnpmUGWxh secret_looks_like_linux_path_1="/VnpmUGWxhQW9KQAwrL2ZYdDJPNG1PQjYxMXNPaF" secret_looks_like_linux_path_2="VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjYxMXNPF" secret_looks_like_linux_path_3="VnpmUGWxhQW/9KQAwrL2ZYdDJPNG1PQjYxMXNPF=" -secret_looks_like_linux_path__="VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE" +secret_looks_like_linux_path__="VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE" "https://example.com/api/js?key=dhd0lCQVFRZ0ViVnpmUGWxhQW9KQWwrLzZYdDJPNG1PQjYxMXNPaFJB&bug=true" From 6d0ab82c9f22f5cb240cf113f4d16c2567ad6de2 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 7 Aug 2024 12:57:30 +0300 Subject: [PATCH 10/18] test counters fix --- tests/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 4f6c5b878..41e62d375 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,18 +7,18 @@ NEGLIGIBLE_ML_THRESHOLD = 0.0001 # credentials count after scan -SAMPLES_CRED_COUNT: int = 362 -SAMPLES_CRED_LINE_COUNT: int = 379 +SAMPLES_CRED_COUNT: int = 364 +SAMPLES_CRED_LINE_COUNT: int = 381 # credentials count after post-processing -SAMPLES_POST_CRED_COUNT: int = 320 +SAMPLES_POST_CRED_COUNT: int = 322 # with option --doc SAMPLES_IN_DOC = 411 # archived credentials that are not found without --depth SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24 -SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 17 +SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 18 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1 # well known string with all latin letters From fdb5ee70ba2926b5739a3126b9c67fea29970f03 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 7 Aug 2024 13:16:48 +0300 Subject: [PATCH 11/18] reduce whitespaces during extracting subtext --- credsweeper/utils/util.py | 10 +++++++++- tests/utils/test_util.py | 16 +++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py index 3f51d18d5..1fa4d8188 100644 --- a/credsweeper/utils/util.py +++ b/credsweeper/utils/util.py @@ -4,6 +4,7 @@ import logging import math import os +import string import struct import tarfile from dataclasses import dataclass @@ -685,6 +686,13 @@ def subtext(text: str, pos: int, hunk_size: int) -> str: else: left_quota = hunk_size - pos left_pos = 0 + # skip leading whitespaces in result string + for i in range(left_pos, pos): + if text[i] in string.whitespace: + left_quota += 1 + left_pos += 1 + else: + break right_remain = len(text) - pos if hunk_size <= right_remain: right_quota = 0 @@ -698,4 +706,4 @@ def subtext(text: str, pos: int, hunk_size: int) -> str: left_pos -= right_quota if 0 > left_pos: left_pos = 0 - return text[left_pos:right_pos] + return text[left_pos:right_pos].rstrip() diff --git a/tests/utils/test_util.py b/tests/utils/test_util.py index 2d67cccde..b2cf2cca0 100644 --- a/tests/utils/test_util.py +++ b/tests/utils/test_util.py @@ -599,13 +599,19 @@ def test_get_chunks_coverage_p(self): def test_subtext_n(self): self.assertEqual("", Util.subtext("", 0, 0)) + self.assertEqual("", Util.subtext(' ' * 42, 0, 0)) def test_subtext_p(self): - # self.assertEqual(AZ_STRING, Util.subtext(AZ_STRING, 37, 40)) - self.assertEqual("The quick ", Util.subtext(AZ_STRING, 0, 5)) - self.assertEqual("The quick ", Util.subtext(AZ_STRING, 3, 5)) - self.assertEqual(" fox jumps", Util.subtext(AZ_STRING, 20, 5)) + self.assertEqual("var=value0123456789;", Util.subtext(" var=value0123456789; ", 21, 10)) + self.assertEqual(AZ_STRING, Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 1 + len(AZ_STRING) >> 1)) + self.assertEqual("x jump", Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 3)) + self.assertEqual("ox jumps", Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 4)) + self.assertEqual("fox jumps", Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 5)) + self.assertEqual("fox jumps ov", Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 6)) + self.assertEqual("The quick", Util.subtext(AZ_STRING, 0, 5)) + self.assertEqual("The quick", Util.subtext(AZ_STRING, 3, 5)) + self.assertEqual("fox jumps", Util.subtext(AZ_STRING, AZ_STRING.find("jumps"), 5)) self.assertEqual("e lazy dog", Util.subtext(AZ_STRING, len(AZ_STRING) - 2, 5)) self.assertEqual("the lazy dog", Util.subtext(AZ_STRING, len(AZ_STRING) - 2, 6)) - self.assertEqual(AZ_STRING[:40], Util.subtext(AZ_STRING, 15, 20)) + self.assertEqual(AZ_STRING[:39], Util.subtext(AZ_STRING, 15, 20)) self.assertEqual(AZ_STRING[-40:], Util.subtext(AZ_STRING, 33, 20)) From 2c2191994d5ec05486fca72bbe3fe698960dd35f Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 7 Aug 2024 13:32:36 +0300 Subject: [PATCH 12/18] aux BM ref --- .github/workflows/benchmark.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index d6017bb39..8b418ecf1 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -23,7 +23,7 @@ jobs: uses: actions/checkout@v4 with: repository: babenek/CredData - ref: jwt + ref: auxiliary - name: Markup hashing run: | @@ -74,7 +74,7 @@ jobs: uses: actions/checkout@v4 with: repository: babenek/CredData - ref: jwt + ref: auxiliary - name: Markup hashing run: | @@ -172,7 +172,7 @@ jobs: uses: actions/checkout@v4 with: repository: babenek/CredData - ref: jwt + ref: auxiliary - name: Markup hashing run: | @@ -354,7 +354,7 @@ jobs: uses: actions/checkout@v4 with: repository: babenek/CredData - ref: jwt + ref: auxiliary - name: Markup hashing run: | From d7f65c148f75c508e062d1601cb6ac5d49f2d8b1 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 7 Aug 2024 14:05:34 +0300 Subject: [PATCH 13/18] BM scores fix --- cicd/benchmark.txt | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index eb12c2dd0..3380e7756 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -1,7 +1,7 @@ -DATA: 16978521 interested lines. MARKUP: 61845 items +DATA: 16978521 interested lines. MARKUP: 61851 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- - 194 28318 64 427 89 + 194 28318 64 429 87 .1 2 641 2 5 .admx 1 26 1 .adoc 1 158 11 6 1 @@ -53,8 +53,8 @@ FileType FileNumber ValidLines Positives Negatives Templat .erb 13 323 27 .erl 4 96 8 .ex 25 4968 3 105 5 -.example 17 1838 73 37 55 -.exs 24 4842 3 188 4 +.example 17 1838 73 38 55 +.exs 24 4842 3 189 4 .ext 5 211 1 4 2 .fsproj 1 75 1 .g4 2 201 2 @@ -80,7 +80,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .ipynb 1 134 5 .j 1 241 4 .j2 30 5530 6 213 10 -.java 621 134132 314 1357 170 +.java 621 134132 314 1361 170 .jenkinsfile 1 58 1 7 .jinja2 1 64 2 .js 659 536413 521 2642 336 @@ -89,7 +89,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .jsx 7 857 19 .jwt 1 1 2 .key 83 2737 70 14 -.kt 123 20774 50 384 3 +.kt 123 20774 51 384 3 .l 1 982 1 .las 1 6656 46 .lasso 1 230 6 @@ -150,13 +150,13 @@ FileType FileNumber ValidLines Positives Negatives Templat .pug 2 193 2 .purs 1 69 4 .pxd 1 150 5 2 -.py 890 291553 618 3465 748 +.py 890 291553 618 3466 748 .pyi 4 1361 9 .pyp 1 167 1 .pyx 2 1094 21 .r 4 62 6 3 1 .rake 2 51 2 -.rb 861 131867 237 3457 615 +.rb 861 131867 237 3458 615 .re 1 31 1 .red 1 159 1 .release 1 13 4 @@ -222,8 +222,8 @@ FileType FileNumber ValidLines Positives Negatives Templat .yml 418 36162 460 916 384 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10281 16978521 7499 59954 5230 -credsweeper result_cnt : 6597, lost_cnt : 0, true_cnt : 6352, false_cnt : 245 +TOTAL: 10281 16978521 7500 59964 5228 +credsweeper result_cnt : 6594, lost_cnt : 0, true_cnt : 6346, false_cnt : 248 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- API 123 3163 185 112 109 3 3345 14 0.000896 0.113821 0.995102 0.973214 0.886179 0.927660 @@ -231,7 +231,7 @@ AWS Client ID 168 13 0 16 AWS Multi 75 12 0 87 75 11 1 0 0.916667 0.000000 0.873563 0.872093 1.000000 0.931677 AWS S3 Bucket 61 25 0 92 61 25 0 0 1.000000 0.000000 0.709302 0.709302 1.000000 0.829932 Atlassian Old PAT token 27 212 3 12 3 8 207 24 0.037209 0.888889 0.867769 0.272727 0.111111 0.157895 -Auth 407 2725 77 372 351 21 2781 56 0.007495 0.137592 0.976005 0.943548 0.862408 0.901155 +Auth 407 2728 77 371 350 21 2784 57 0.007487 0.140049 0.975716 0.943396 0.859951 0.899743 Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194 BASE64 Private Key 7 2 0 7 7 0 2 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333 @@ -253,16 +253,16 @@ Grafana Provisioned API Key 22 1 0 JSON Web Token 169 61 0 158 137 21 40 32 0.344262 0.189349 0.769565 0.867089 0.810651 0.837920 Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 Jira 2FA 14 6 0 10 10 0 6 4 0.000000 0.285714 0.800000 1.000000 0.714286 0.833333 -Key 483 8494 464 445 436 9 8949 47 0.001005 0.097308 0.994068 0.979775 0.902692 0.939655 +Key 483 8496 464 443 434 9 8951 49 0.001004 0.101449 0.993858 0.979684 0.898551 0.937365 Nonce 83 53 0 85 79 6 47 4 0.113208 0.048193 0.926471 0.929412 0.951807 0.940476 Other 0 0 5 0 0 5 0 0.000000 1.000000 PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1823 7474 2752 1682 1614 67 10159 209 0.006552 0.114646 0.977094 0.960143 0.885354 0.921233 +Password 1820 7476 2751 1681 1613 68 10159 207 0.006649 0.113736 0.977173 0.959548 0.886264 0.921451 Salt 42 76 2 38 38 0 78 4 0.000000 0.095238 0.966667 1.000000 0.904762 0.950000 Secret 1358 28497 869 1234 1229 5 29361 129 0.000170 0.094993 0.995639 0.995948 0.905007 0.948302 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 584 3973 438 519 511 8 4403 73 0.001814 0.125000 0.983784 0.984586 0.875000 0.926564 +Token 585 3975 438 512 503 9 4404 82 0.002039 0.140171 0.981793 0.982422 0.859829 0.917046 Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 -URL Credentials 194 125 251 184 184 0 376 10 0.000000 0.051546 0.982456 1.000000 0.948454 0.973545 - 7499 59954 5230 6604 6352 245 59709 1147 0.004086 0.152954 0.979363 0.962862 0.847046 0.901249 +URL Credentials 197 126 250 190 190 0 376 7 0.000000 0.035533 0.987784 1.000000 0.964467 0.981912 + 7500 59964 5228 6605 6346 248 59716 1154 0.004136 0.153867 0.979219 0.962390 0.846133 0.900525 From 95d12f3c2d2dad831c6ac037405b878a23232208 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 7 Aug 2024 19:59:53 +0300 Subject: [PATCH 14/18] Rollback BM --- .github/workflows/benchmark.yml | 12 +++---- tests/data/depth_3.json | 62 ++++++++++++++++----------------- tests/data/ml_threshold.json | 62 ++++++++++++++++----------------- tests/data/output.json | 62 ++++++++++++++++----------------- tests/samples/url_cred.js | 4 +-- 5 files changed, 99 insertions(+), 103 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 8b418ecf1..1caaa133f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -22,8 +22,7 @@ jobs: - name: Checkout CredData uses: actions/checkout@v4 with: - repository: babenek/CredData - ref: auxiliary + repository: Samsung/CredData - name: Markup hashing run: | @@ -73,8 +72,7 @@ jobs: - name: Checkout CredData uses: actions/checkout@v4 with: - repository: babenek/CredData - ref: auxiliary + repository: Samsung/CredData - name: Markup hashing run: | @@ -171,8 +169,7 @@ jobs: - name: Checkout CredData uses: actions/checkout@v4 with: - repository: babenek/CredData - ref: auxiliary + repository: Samsung/CredData - name: Markup hashing run: | @@ -353,8 +350,7 @@ jobs: - name: Checkout CredData uses: actions/checkout@v4 with: - repository: babenek/CredData - ref: auxiliary + repository: Samsung/CredData - name: Markup hashing run: | diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index d4d44afa9..01cdc64ff 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -10784,33 +10784,6 @@ } ] }, - { - "api_validation": "NOT_AVAILABLE", - "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.989, - "rule": "URL Credentials", - "severity": "high", - "confidence": "moderate", - "line_data_list": [ - { - "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", - "line_num": 3, - "path": "tests/samples/url_cred.js", - "info": "tests/samples/url_cred.js|RAW", - "value": "FnD83JZs", - "value_start": 44, - "value_end": 52, - "variable": "smtps://", - "variable_start": 18, - "variable_end": 26, - "entropy_validation": { - "iterator": "BASE64_CHARS", - "entropy": 3.0, - "valid": false - } - } - ] - }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -10821,7 +10794,7 @@ "line_data_list": [ { "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";", - "line_num": 5, + "line_num": 3, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "546DFS64N90P3AW7DX%2Fkeep", @@ -10848,7 +10821,7 @@ "line_data_list": [ { "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"", - "line_num": 9, + "line_num": 7, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "f45VgF8jX79o@anydata.com", @@ -10875,7 +10848,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 12, + "line_num": 10, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "3487263-2384579834-234732875-345", @@ -10902,7 +10875,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 12, + "line_num": 10, "path": "tests/samples/url_cred.js", "info": "tests/samples/url_cred.js|RAW", "value": "546DFS64N90P3AW7DX", @@ -10919,6 +10892,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.989, + "rule": "URL Credentials", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", + "line_num": 13, + "path": "tests/samples/url_cred.js", + "info": "tests/samples/url_cred.js|RAW", + "value": "FnD83JZs", + "value_start": 44, + "value_end": 52, + "variable": "smtps://", + "variable_start": 18, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index c0bf22718..d905cd4b5 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -9846,33 +9846,6 @@ } ] }, - { - "api_validation": "NOT_AVAILABLE", - "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.989, - "rule": "URL Credentials", - "severity": "high", - "confidence": "moderate", - "line_data_list": [ - { - "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", - "line_num": 3, - "path": "tests/samples/url_cred.js", - "info": "", - "value": "FnD83JZs", - "value_start": 44, - "value_end": 52, - "variable": "smtps://", - "variable_start": 18, - "variable_end": 26, - "entropy_validation": { - "iterator": "BASE64_CHARS", - "entropy": 3.0, - "valid": false - } - } - ] - }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -9883,7 +9856,7 @@ "line_data_list": [ { "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";", - "line_num": 5, + "line_num": 3, "path": "tests/samples/url_cred.js", "info": "", "value": "546DFS64N90P3AW7DX%2Fkeep", @@ -9910,7 +9883,7 @@ "line_data_list": [ { "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"", - "line_num": 9, + "line_num": 7, "path": "tests/samples/url_cred.js", "info": "", "value": "f45VgF8jX79o@anydata.com", @@ -9937,7 +9910,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 12, + "line_num": 10, "path": "tests/samples/url_cred.js", "info": "", "value": "3487263-2384579834-234732875-345", @@ -9964,7 +9937,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 12, + "line_num": 10, "path": "tests/samples/url_cred.js", "info": "", "value": "546DFS64N90P3AW7DX", @@ -9981,6 +9954,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.989, + "rule": "URL Credentials", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", + "line_num": 13, + "path": "tests/samples/url_cred.js", + "info": "", + "value": "FnD83JZs", + "value_start": 44, + "value_end": 52, + "variable": "smtps://", + "variable_start": 18, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/output.json b/tests/data/output.json index 24dc955c2..1d75227d5 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -8766,33 +8766,6 @@ } ] }, - { - "api_validation": "NOT_AVAILABLE", - "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.989, - "rule": "URL Credentials", - "severity": "high", - "confidence": "moderate", - "line_data_list": [ - { - "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", - "line_num": 3, - "path": "tests/samples/url_cred.js", - "info": "", - "value": "FnD83JZs", - "value_start": 44, - "value_end": 52, - "variable": "smtps://", - "variable_start": 18, - "variable_end": 26, - "entropy_validation": { - "iterator": "BASE64_CHARS", - "entropy": 3.0, - "valid": false - } - } - ] - }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -8803,7 +8776,7 @@ "line_data_list": [ { "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";", - "line_num": 5, + "line_num": 3, "path": "tests/samples/url_cred.js", "info": "", "value": "546DFS64N90P3AW7DX%2Fkeep", @@ -8830,7 +8803,7 @@ "line_data_list": [ { "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"", - "line_num": 9, + "line_num": 7, "path": "tests/samples/url_cred.js", "info": "", "value": "f45VgF8jX79o@anydata.com", @@ -8857,7 +8830,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 12, + "line_num": 10, "path": "tests/samples/url_cred.js", "info": "", "value": "3487263-2384579834-234732875-345", @@ -8884,7 +8857,7 @@ "line_data_list": [ { "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2", - "line_num": 12, + "line_num": 10, "path": "tests/samples/url_cred.js", "info": "", "value": "546DFS64N90P3AW7DX", @@ -8901,6 +8874,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.989, + "rule": "URL Credentials", + "severity": "high", + "confidence": "moderate", + "line_data_list": [ + { + "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";", + "line_num": 13, + "path": "tests/samples/url_cred.js", + "info": "", + "value": "FnD83JZs", + "value_start": 44, + "value_end": 52, + "variable": "smtps://", + "variable_start": 18, + "variable_end": 26, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/samples/url_cred.js b/tests/samples/url_cred.js index db6af3e57..d555f7233 100644 --- a/tests/samples/url_cred.js +++ b/tests/samples/url_cred.js @@ -1,7 +1,5 @@ const connection_url = require('dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local'); -email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465"; - url = "https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut"; // note:dummyuser@example.com @@ -11,3 +9,5 @@ url = "https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut /* partially line to sanitize url-like items 39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2 */ + +email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465"; From 660fc44955b83a7fba0403075afb870a03ed67dd Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Thu, 8 Aug 2024 11:10:54 +0300 Subject: [PATCH 15/18] JWT fix --- credsweeper/filters/value_json_web_token_check.py | 2 +- tests/filters/test_value_json_web_token_check.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/credsweeper/filters/value_json_web_token_check.py b/credsweeper/filters/value_json_web_token_check.py index d7265dbce..ed6a2e2e1 100644 --- a/credsweeper/filters/value_json_web_token_check.py +++ b/credsweeper/filters/value_json_web_token_check.py @@ -53,7 +53,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool: if not header_check: header_check = bool(ValueJsonWebTokenCheck.header_keys.intersection(json_keys)) # payload follows the header - if not payload_check: + elif not payload_check: payload_check = bool(ValueJsonWebTokenCheck.payload_keys.intersection(json_keys)) # any other payloads are allowed elif header_check and payload_check and not signature_check: diff --git a/tests/filters/test_value_json_web_token_check.py b/tests/filters/test_value_json_web_token_check.py index 9aa85a752..4cb701956 100644 --- a/tests/filters/test_value_json_web_token_check.py +++ b/tests/filters/test_value_json_web_token_check.py @@ -20,6 +20,9 @@ def test_value_jwt_check_p(self): self.assertTrue(ValueJsonWebTokenCheck().run( get_line_data(line="eyJhbGciOiJSUzI1NiJ9Cg.eyJleHAiOjY1NTM2fQo.AAAAAAAAAAAAAAAAAAAAAAA", pattern=LINE_VALUE_PATTERN), DUMMY_ANALYSIS_TARGET)) + self.assertTrue(ValueJsonWebTokenCheck().run( + get_line_data(line="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.x3.GFsFyGiCUIP5VHI9CEJL9thWsGjSZf1fJfarNk-LGTM", + pattern=LINE_VALUE_PATTERN), DUMMY_ANALYSIS_TARGET)) def test_value_jwt_check_n(self): self.assertFalse(ValueJsonWebTokenCheck().run( @@ -33,3 +36,8 @@ def test_value_jwt_check_n(self): "Ui1o9ndy7ckISHQVhuYFKu78l7nqC4heghK_Gw4h7EB7s8eEuUC-D6JjVtX10IyS" \ "vCRkRo7f8dWQTjFLs7mlPowjRz0cP5J-MmCoegKHYagOHZ_ArXOR91_u8jMdwmOf", pattern=LINE_VALUE_PATTERN), DUMMY_ANALYSIS_TARGET)) + self.assertFalse(ValueJsonWebTokenCheck().run( + get_line_data(line="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9." \ + "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9." \ + ".e30.GFsFyGiCUIP5VHI9CEJL9thWsGjSZf1fJfarNk-LGTM", + pattern=LINE_VALUE_PATTERN), DUMMY_ANALYSIS_TARGET)) From 536a7f394bb9d2592e89704933bacfb97a32a3a7 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Thu, 8 Aug 2024 11:31:03 +0300 Subject: [PATCH 16/18] customBMref --- .github/workflows/benchmark.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 1caaa133f..8b418ecf1 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -22,7 +22,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@v4 with: - repository: Samsung/CredData + repository: babenek/CredData + ref: auxiliary - name: Markup hashing run: | @@ -72,7 +73,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@v4 with: - repository: Samsung/CredData + repository: babenek/CredData + ref: auxiliary - name: Markup hashing run: | @@ -169,7 +171,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@v4 with: - repository: Samsung/CredData + repository: babenek/CredData + ref: auxiliary - name: Markup hashing run: | @@ -350,7 +353,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@v4 with: - repository: Samsung/CredData + repository: babenek/CredData + ref: auxiliary - name: Markup hashing run: | From 6d072d44cdf88a6b4701899c6c1d6031ece1c472 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Thu, 8 Aug 2024 11:41:34 +0300 Subject: [PATCH 17/18] JWT fix BC scor --- cicd/benchmark.txt | 52 +++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index 3380e7756..4ec884387 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -1,4 +1,4 @@ -DATA: 16978521 interested lines. MARKUP: 61851 items +DATA: 16978521 interested lines. MARKUP: 61852 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- 194 28318 64 429 87 @@ -11,7 +11,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .backup 1 62 1 1 .bash 2 2158 2 1 .bat 4 233 1 13 2 -.bats 15 2804 12 52 9 +.bats 15 2804 14 50 9 .bazel 3 424 8 .build 2 40 3 .bundle 4 1512 570 @@ -27,7 +27,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .cmd 4 401 2 3 .cnf 8 858 18 45 18 .coffee 1 585 2 -.conf 60 4945 50 74 54 +.conf 60 4945 53 71 54 .config 20 492 16 33 1 .cpp 15 5688 1 61 .creds 1 10 1 1 @@ -53,7 +53,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .erb 13 323 27 .erl 4 96 8 .ex 25 4968 3 105 5 -.example 17 1838 73 38 55 +.example 17 1838 74 38 54 .exs 24 4842 3 189 4 .ext 5 211 1 4 2 .fsproj 1 75 1 @@ -61,7 +61,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .gd 1 37 1 .gml 3 3075 26 .gni 3 5017 18 -.go 1079 566327 619 4333 742 +.go 1079 566327 621 4331 742 .golden 5 1168 1 14 29 .gradle 45 3265 4 91 100 .graphql 7 420 13 @@ -74,16 +74,16 @@ FileType FileNumber ValidLines Positives Negatives Templat .html 53 15327 14 115 18 .idl 2 777 4 .iml 6 699 36 -.in 6 2130 1 80 12 +.in 6 2130 3 78 12 .inc 2 56 2 1 .ini 11 1437 24 12 18 .ipynb 1 134 5 -.j 1 241 4 +.j 1 241 2 2 .j2 30 5530 6 213 10 .java 621 134132 314 1361 170 .jenkinsfile 1 58 1 7 .jinja2 1 64 2 -.js 659 536413 521 2642 336 +.js 659 536413 526 2637 336 .json 860 13670669 623 10948 140 .jsp 13 3202 1 42 .jsx 7 857 19 @@ -105,12 +105,12 @@ FileType FileNumber ValidLines Positives Negatives Templat .lock 24 160912 144 .log 2 199 38 52 .lua 10 1924 37 3 -.m 16 13358 11 151 3 +.m 16 13358 11 152 3 .manifest 3 102 3 .markdown 3 139 3 1 .markerb 3 12 3 .marko 1 21 2 -.md 673 149294 646 2366 671 +.md 673 149294 658 2361 664 .mdx 3 549 7 .mjml 1 18 1 .mjs 22 4424 50 343 @@ -122,7 +122,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .mqh 1 1023 2 .msg 1 26644 1 .mysql 1 36 2 -.ndjson 2 5006 34 268 2 +.ndjson 2 5006 37 266 2 .nix 4 211 12 .nolint 1 2 1 .odd 1 1281 57 @@ -132,7 +132,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .patch 4 109405 27 .pbxproj 1 941 1 .pem 48 1169 47 8 -.php 371 75710 129 1770 80 +.php 371 75710 130 1769 80 .pl 16 14727 6 47 .pm 3 744 8 .po 3 2994 15 @@ -150,7 +150,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .pug 2 193 2 .purs 1 69 4 .pxd 1 150 5 2 -.py 890 291553 618 3466 748 +.py 890 291553 626 3461 744 .pyi 4 1361 9 .pyp 1 167 1 .pyx 2 1094 21 @@ -222,16 +222,16 @@ FileType FileNumber ValidLines Positives Negatives Templat .yml 418 36162 460 916 384 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10281 16978521 7500 59964 5228 -credsweeper result_cnt : 6594, lost_cnt : 0, true_cnt : 6346, false_cnt : 248 +TOTAL: 10281 16978521 7541 59936 5216 +credsweeper result_cnt : 6566, lost_cnt : 0, true_cnt : 6348, false_cnt : 218 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- -API 123 3163 185 112 109 3 3345 14 0.000896 0.113821 0.995102 0.973214 0.886179 0.927660 +API 124 3162 185 112 109 3 3344 15 0.000896 0.120968 0.994814 0.973214 0.879032 0.923729 AWS Client ID 168 13 0 160 160 0 13 8 0.000000 0.047619 0.955801 1.000000 0.952381 0.975610 AWS Multi 75 12 0 87 75 11 1 0 0.916667 0.000000 0.873563 0.872093 1.000000 0.931677 AWS S3 Bucket 61 25 0 92 61 25 0 0 1.000000 0.000000 0.709302 0.709302 1.000000 0.829932 Atlassian Old PAT token 27 212 3 12 3 8 207 24 0.037209 0.888889 0.867769 0.272727 0.111111 0.157895 -Auth 407 2728 77 371 350 21 2784 57 0.007487 0.140049 0.975716 0.943396 0.859951 0.899743 +Auth 408 2727 77 371 350 21 2783 58 0.007489 0.142157 0.975405 0.943396 0.857843 0.898588 Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194 BASE64 Private Key 7 2 0 7 7 0 2 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333 @@ -250,19 +250,19 @@ Google API Key 12 0 0 1 Google Multi 10 2 0 11 10 1 1 0 0.500000 0.000000 0.916667 0.909091 1.000000 0.952381 Google OAuth Access Token 3 0 0 3 3 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Grafana Provisioned API Key 22 1 0 5 5 0 1 17 0.000000 0.772727 0.260870 1.000000 0.227273 0.370370 -JSON Web Token 169 61 0 158 137 21 40 32 0.344262 0.189349 0.769565 0.867089 0.810651 0.837920 +JSON Web Token 169 61 0 130 130 0 61 39 0.000000 0.230769 0.830435 1.000000 0.769231 0.869565 Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 Jira 2FA 14 6 0 10 10 0 6 4 0.000000 0.285714 0.800000 1.000000 0.714286 0.833333 -Key 483 8496 464 443 434 9 8951 49 0.001004 0.101449 0.993858 0.979684 0.898551 0.937365 -Nonce 83 53 0 85 79 6 47 4 0.113208 0.048193 0.926471 0.929412 0.951807 0.940476 +Key 493 8487 464 443 434 9 8942 59 0.001005 0.119675 0.992800 0.979684 0.880325 0.927350 +Nonce 90 46 0 85 84 1 45 6 0.021739 0.066667 0.948529 0.988235 0.933333 0.960000 Other 0 0 5 0 0 5 0 0.000000 1.000000 PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1820 7476 2751 1681 1613 68 10159 207 0.006649 0.113736 0.977173 0.959548 0.886264 0.921451 -Salt 42 76 2 38 38 0 78 4 0.000000 0.095238 0.966667 1.000000 0.904762 0.950000 -Secret 1358 28497 869 1234 1229 5 29361 129 0.000170 0.094993 0.995639 0.995948 0.905007 0.948302 +Password 1834 7472 2741 1681 1617 64 10149 217 0.006267 0.118321 0.976675 0.961927 0.881679 0.920057 +Salt 45 73 2 38 38 0 75 7 0.000000 0.155556 0.941667 1.000000 0.844444 0.915663 +Secret 1362 28494 868 1234 1229 5 29357 133 0.000170 0.097651 0.995508 0.995948 0.902349 0.946841 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 585 3975 438 512 503 9 4404 82 0.002039 0.140171 0.981793 0.982422 0.859829 0.917046 +Token 586 3974 438 512 503 9 4403 83 0.002040 0.141638 0.981593 0.982422 0.858362 0.916211 Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 -URL Credentials 197 126 250 190 190 0 376 7 0.000000 0.035533 0.987784 1.000000 0.964467 0.981912 - 7500 59964 5228 6605 6346 248 59716 1154 0.004136 0.153867 0.979219 0.962390 0.846133 0.900525 +URL Credentials 197 127 249 190 190 0 376 7 0.000000 0.035533 0.987784 1.000000 0.964467 0.981912 + 7541 59936 5216 6577 6348 218 59718 1193 0.003637 0.158202 0.979089 0.966799 0.841798 0.899979 From 23d3abb39b1df39c852fa1c38519a29c45213139 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Thu, 8 Aug 2024 16:13:46 +0300 Subject: [PATCH 18/18] BM scores fix --- cicd/benchmark.txt | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index 4ec884387..1626253d5 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -1,4 +1,4 @@ -DATA: 16978521 interested lines. MARKUP: 61852 items +DATA: 16978521 interested lines. MARKUP: 61855 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- 194 28318 64 429 87 @@ -27,7 +27,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .cmd 4 401 2 3 .cnf 8 858 18 45 18 .coffee 1 585 2 -.conf 60 4945 53 71 54 +.conf 60 4945 54 69 54 .config 20 492 16 33 1 .cpp 15 5688 1 61 .creds 1 10 1 1 @@ -54,14 +54,14 @@ FileType FileNumber ValidLines Positives Negatives Templat .erl 4 96 8 .ex 25 4968 3 105 5 .example 17 1838 74 38 54 -.exs 24 4842 3 189 4 +.exs 24 4842 3 190 4 .ext 5 211 1 4 2 .fsproj 1 75 1 .g4 2 201 2 .gd 1 37 1 .gml 3 3075 26 .gni 3 5017 18 -.go 1079 566327 621 4331 742 +.go 1079 566327 623 4329 742 .golden 5 1168 1 14 29 .gradle 45 3265 4 91 100 .graphql 7 420 13 @@ -83,8 +83,8 @@ FileType FileNumber ValidLines Positives Negatives Templat .java 621 134132 314 1361 170 .jenkinsfile 1 58 1 7 .jinja2 1 64 2 -.js 659 536413 526 2637 336 -.json 860 13670669 623 10948 140 +.js 659 536413 526 2638 336 +.json 860 13670669 624 10946 140 .jsp 13 3202 1 42 .jsx 7 857 19 .jwt 1 1 2 @@ -110,7 +110,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .markdown 3 139 3 1 .markerb 3 12 3 .marko 1 21 2 -.md 673 149294 658 2361 664 +.md 673 149294 658 2362 664 .mdx 3 549 7 .mjml 1 18 1 .mjs 22 4424 50 343 @@ -150,7 +150,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .pug 2 193 2 .purs 1 69 4 .pxd 1 150 5 2 -.py 890 291553 626 3461 744 +.py 890 291553 627 3460 744 .pyi 4 1361 9 .pyp 1 167 1 .pyx 2 1094 21 @@ -222,8 +222,8 @@ FileType FileNumber ValidLines Positives Negatives Templat .yml 418 36162 460 916 384 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10281 16978521 7541 59936 5216 -credsweeper result_cnt : 6566, lost_cnt : 0, true_cnt : 6348, false_cnt : 218 +TOTAL: 10281 16978521 7546 59932 5216 +credsweeper result_cnt : 6585, lost_cnt : 0, true_cnt : 6367, false_cnt : 218 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- API 124 3162 185 112 109 3 3344 15 0.000896 0.120968 0.994814 0.973214 0.879032 0.923729 @@ -231,7 +231,7 @@ AWS Client ID 168 13 0 16 AWS Multi 75 12 0 87 75 11 1 0 0.916667 0.000000 0.873563 0.872093 1.000000 0.931677 AWS S3 Bucket 61 25 0 92 61 25 0 0 1.000000 0.000000 0.709302 0.709302 1.000000 0.829932 Atlassian Old PAT token 27 212 3 12 3 8 207 24 0.037209 0.888889 0.867769 0.272727 0.111111 0.157895 -Auth 408 2727 77 371 350 21 2783 58 0.007489 0.142157 0.975405 0.943396 0.857843 0.898588 +Auth 408 2727 77 372 351 21 2783 57 0.007489 0.139706 0.975716 0.943548 0.860294 0.900000 Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194 BASE64 Private Key 7 2 0 7 7 0 2 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333 @@ -253,16 +253,16 @@ Grafana Provisioned API Key 22 1 0 JSON Web Token 169 61 0 130 130 0 61 39 0.000000 0.230769 0.830435 1.000000 0.769231 0.869565 Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 Jira 2FA 14 6 0 10 10 0 6 4 0.000000 0.285714 0.800000 1.000000 0.714286 0.833333 -Key 493 8487 464 443 434 9 8942 59 0.001005 0.119675 0.992800 0.979684 0.880325 0.927350 -Nonce 90 46 0 85 84 1 45 6 0.021739 0.066667 0.948529 0.988235 0.933333 0.960000 +Key 497 8483 464 448 439 9 8938 58 0.001006 0.116700 0.992906 0.979911 0.883300 0.929101 +Nonce 90 47 0 84 83 1 46 7 0.021277 0.077778 0.941606 0.988095 0.922222 0.954023 Other 0 0 5 0 0 5 0 0.000000 1.000000 PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1834 7472 2741 1681 1617 64 10149 217 0.006267 0.118321 0.976675 0.961927 0.881679 0.920057 -Salt 45 73 2 38 38 0 75 7 0.000000 0.155556 0.941667 1.000000 0.844444 0.915663 -Secret 1362 28494 868 1234 1229 5 29357 133 0.000170 0.097651 0.995508 0.995948 0.902349 0.946841 +Password 1834 7473 2741 1691 1627 64 10150 207 0.006266 0.112868 0.977507 0.962153 0.887132 0.923121 +Salt 45 73 2 39 39 0 75 6 0.000000 0.133333 0.950000 1.000000 0.866667 0.928571 +Secret 1362 28492 868 1236 1231 5 29355 131 0.000170 0.096182 0.995573 0.995955 0.903818 0.947652 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 586 3974 438 512 503 9 4403 83 0.002040 0.141638 0.981593 0.982422 0.858362 0.916211 +Token 586 3974 438 513 504 9 4403 82 0.002040 0.139932 0.981793 0.982456 0.860068 0.917197 Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 -URL Credentials 197 127 249 190 190 0 376 7 0.000000 0.035533 0.987784 1.000000 0.964467 0.981912 - 7541 59936 5216 6577 6348 218 59718 1193 0.003637 0.158202 0.979089 0.966799 0.841798 0.899979 +URL Credentials 198 127 249 190 190 0 376 8 0.000000 0.040404 0.986063 1.000000 0.959596 0.979381 + 7546 59932 5216 6596 6367 218 59714 1179 0.003637 0.156242 0.979297 0.966894 0.843758 0.901139