From 4a64d98bc6d1674837b06791e706e7825a264e15 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Tue, 23 Jul 2024 20:54:29 +0300
Subject: [PATCH 01/18] Filters optimisation

---
 credsweeper/credentials/line_data.py          |  2 +
 credsweeper/filters/__init__.py               |  2 -
 credsweeper/filters/group/group.py            |  6 +-
 .../filters/group/url_credentials_group.py    |  5 +-
 .../value_dictionary_value_length_check.py    |  7 ++-
 credsweeper/filters/value_length_check.py     | 26 --------
 .../filters/value_pattern_length_check.py     | 10 ---
 credsweeper/rules/config.yaml                 |  2 +-
 tests/__init__.py                             |  6 +-
 tests/data/depth_3.json                       | 62 +++++++++++++++++--
 tests/data/ml_threshold.json                  | 62 +++++++++++++++++--
 tests/data/output.json                        | 62 +++++++++++++++++--
 tests/filters/test_value_length_check.py      | 19 ------
 tests/ml_model/test_ml_validator.py           | 17 +++--
 tests/samples/salt.py                         |  1 +
 tests/samples/url_cred.js                     |  2 +
 tests/test_main.py                            |  2 +-
 17 files changed, 200 insertions(+), 93 deletions(-)
 delete mode 100644 credsweeper/filters/value_length_check.py
 delete mode 100644 credsweeper/filters/value_pattern_length_check.py
 delete mode 100644 tests/filters/test_value_length_check.py

diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py
index 06a68fb5b..d407b657a 100644
--- a/credsweeper/credentials/line_data.py
+++ b/credsweeper/credentials/line_data.py
@@ -193,6 +193,8 @@ def sanitize_variable(self) -> None:
         while self.variable and sanitized_var_len != len(self.variable):
             sanitized_var_len = len(self.variable)
             self.variable = self.variable.strip(self.variable_strip_pattern)
+            if self.variable.endswith('\\'):
+                self.variable = self.variable[:-1]
         if variable and len(self.variable) < len(variable) and 0 <= self.variable_start and 0 <= self.variable_end:
             start = variable.find(self.variable)
             self.variable_start += start
diff --git a/credsweeper/filters/__init__.py b/credsweeper/filters/__init__.py
index 66de37b97..f7cc9c58f 100644
--- a/credsweeper/filters/__init__.py
+++ b/credsweeper/filters/__init__.py
@@ -28,13 +28,11 @@
 from credsweeper.filters.value_jfrog_token_check import ValueJfrogTokenCheck
 from credsweeper.filters.value_json_web_token_check import ValueJsonWebTokenCheck
 from credsweeper.filters.value_last_word_check import ValueLastWordCheck
-from credsweeper.filters.value_length_check import ValueLengthCheck
 from credsweeper.filters.value_method_check import ValueMethodCheck
 from credsweeper.filters.value_not_allowed_pattern_check import ValueNotAllowedPatternCheck
 from credsweeper.filters.value_not_part_encoded_check import ValueNotPartEncodedCheck
 from credsweeper.filters.value_number_check import ValueNumberCheck
 from credsweeper.filters.value_pattern_check import ValuePatternCheck
-from credsweeper.filters.value_pattern_length_check import ValuePatternLengthCheck
 from credsweeper.filters.value_similarity_check import ValueSimilarityCheck
 from credsweeper.filters.value_split_keyword_check import ValueSplitKeywordCheck
 from credsweeper.filters.value_string_type_check import ValueStringTypeCheck
diff --git a/credsweeper/filters/group/group.py b/credsweeper/filters/group/group.py
index 6ee25387d..65a245fa9 100644
--- a/credsweeper/filters/group/group.py
+++ b/credsweeper/filters/group/group.py
@@ -5,9 +5,9 @@
 from credsweeper.config import Config
 from credsweeper.filters import (Filter, LineSpecificKeyCheck, ValueAllowlistCheck, ValueArrayDictionaryCheck,
                                  ValueBlocklistCheck, ValueCamelCaseCheck, ValueFilePathCheck, ValueFirstWordCheck,
-                                 ValueLastWordCheck, ValueLengthCheck, ValueMethodCheck, ValueNotAllowedPatternCheck,
+                                 ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck,
                                  ValuePatternCheck, ValueSimilarityCheck, ValueStringTypeCheck, ValueTokenCheck,
-                                 VariableNotAllowedPatternCheck, ValuePatternLengthCheck, ValueHexNumberCheck)
+                                 VariableNotAllowedPatternCheck, ValueHexNumberCheck)
 
 
 class Group(ABC):
@@ -43,7 +43,6 @@ def get_keyword_base_filters(config: Config) -> List[Filter]:
             ValueFirstWordCheck(),
             ValueHexNumberCheck(),
             ValueLastWordCheck(),
-            ValueLengthCheck(config),
             ValueMethodCheck(),
             ValueSimilarityCheck(),
             ValueStringTypeCheck(config),
@@ -60,5 +59,4 @@ def get_pattern_base_filters(config: Config) -> List[Filter]:
         return [  #
             LineSpecificKeyCheck(),  #
             ValuePatternCheck(config),  #
-            ValuePatternLengthCheck(config)
         ]
diff --git a/credsweeper/filters/group/url_credentials_group.py b/credsweeper/filters/group/url_credentials_group.py
index 9a7477191..4c4c5d6a0 100644
--- a/credsweeper/filters/group/url_credentials_group.py
+++ b/credsweeper/filters/group/url_credentials_group.py
@@ -2,7 +2,7 @@
 from credsweeper.config import Config
 from credsweeper.filters import (ValueAllowlistCheck, ValueArrayDictionaryCheck, ValueBlocklistCheck,
                                  ValueCamelCaseCheck, ValueDictionaryValueLengthCheck, ValueFilePathCheck,
-                                 ValueFirstWordCheck, ValueLastWordCheck, ValueLengthCheck, ValueMethodCheck,
+                                 ValueFirstWordCheck, ValueLastWordCheck, ValueMethodCheck,
                                  ValueNotAllowedPatternCheck, ValuePatternCheck, ValueStringTypeCheck, ValueTokenCheck)
 from credsweeper.filters.group import Group
 
@@ -25,11 +25,10 @@ def __init__(self, config: Config) -> None:
             ValueFilePathCheck(),
             ValueFirstWordCheck(),
             ValueLastWordCheck(),
-            ValueLengthCheck(config),
             ValueMethodCheck(),
             ValueStringTypeCheck(config),
             ValueNotAllowedPatternCheck(),
             ValueTokenCheck(),
-            ValueDictionaryValueLengthCheck(),
+            ValueDictionaryValueLengthCheck(min_len=4, max_len=80),
             ValuePatternCheck(config)
         ]
diff --git a/credsweeper/filters/value_dictionary_value_length_check.py b/credsweeper/filters/value_dictionary_value_length_check.py
index c0b92a846..8186f8229 100644
--- a/credsweeper/filters/value_dictionary_value_length_check.py
+++ b/credsweeper/filters/value_dictionary_value_length_check.py
@@ -7,8 +7,9 @@
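 class ValueDictionaryValueLengthCheck(Filter):
-    """Check that candidate length is between 5 and 30."""
+    """Check that candidate value length is within [min_len, max_len] inclusive (4 and 31 by default)."""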
 
-    def __init__(self, config: Config = None) -> None:
-        pass
+    def __init__(self, config: Config = None, min_len: int = 4, max_len: int = 31) -> None:
+        self.min_len = min_len
+        self.max_len = max_len
 
     def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         """Run filter checks on received credential candidate data 'line_data'.
@@ -21,7 +22,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
             True, if need to filter candidate and False if left
 
         """
-        if 4 <= len(line_data.value) <= 31:
+        if self.min_len <= len(line_data.value) <= self.max_len:
             return False
         else:
             return True
diff --git a/credsweeper/filters/value_length_check.py b/credsweeper/filters/value_length_check.py
deleted file mode 100644
index 57596f35e..000000000
--- a/credsweeper/filters/value_length_check.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from credsweeper.config import Config
-from credsweeper.credentials import LineData
-from credsweeper.file_handler.analysis_target import AnalysisTarget
-from credsweeper.filters import Filter
-
-
-class ValueLengthCheck(Filter):
-    """Check if potential candidate value is not too short (longer or equal to `min_len`)."""
-
-    def __init__(self, config: Config) -> None:
-        self.min_len = config.min_keyword_value_length
-
-    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
-        """Run filter checks on received credential candidate data 'line_data'.
-
-        Args:
-            line_data: credential candidate data
-            target: multiline target from which line data was obtained
-
-        Return:
-            True, if need to filter candidate and False if left
-
-        """
-        if len(line_data.value) < self.min_len:
-            return True
-        return False
diff --git a/credsweeper/filters/value_pattern_length_check.py b/credsweeper/filters/value_pattern_length_check.py
deleted file mode 100644
index dd4531bf1..000000000
--- a/credsweeper/filters/value_pattern_length_check.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from credsweeper.config import Config
-from credsweeper.filters import ValueLengthCheck
-
-
-class ValuePatternLengthCheck(ValueLengthCheck):
-    """Check if potential candidate value is not too short like ValueLengthCheck but with different min_len"""
-
-    def __init__(self, config: Config) -> None:
-        super().__init__(config)
-        self.min_len = config.min_pattern_value_length
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
index df36dee3f..5ff2a8c92 100644
--- a/credsweeper/rules/config.yaml
+++ b/credsweeper/rules/config.yaml
@@ -673,7 +673,7 @@
   confidence: moderate
   type: pattern
   values:
-    - (?P<value_leftquote>[\"'])?(?P<variable>[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}@:/]{0,80}:){1,3}(?P<value>[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P<value_rightquote>[\"'])?
+    - (?P<value_leftquote>[\"'])?(?P<variable>[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}:/]{0,80}:){1,3}(?P<value>[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P<value_rightquote>[\"'])?
   filter_type: UrlCredentialsGroup
   use_ml: true
   required_substrings:
diff --git a/tests/__init__.py b/tests/__init__.py
index 3dfc0a7b5..d83aadba8 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -7,11 +7,11 @@
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 429
-SAMPLES_CRED_LINE_COUNT: int = 446
+SAMPLES_CRED_COUNT: int = 431
+SAMPLES_CRED_LINE_COUNT: int = 448
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 387
+SAMPLES_POST_CRED_COUNT: int = 389
 
 # with option --doc
 SAMPLES_IN_DOC = 410
diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json
index 448776d10..27447f7fc 100644
--- a/tests/data/depth_3.json
+++ b/tests/data/depth_3.json
@@ -11243,6 +11243,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.665,
+        "rule": "Salt",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "json_escaped = \"{\\\\\\\"salt8\\\\\\\":\\\\\\\"4b9a6d8b638eb0c6\\\\\\\"}\"",
+                "line_num": 5,
+                "path": "tests/samples/salt.py",
+                "info": "tests/samples/salt.py|RAW",
+                "value": "4b9a6d8b638eb0c6",
+                "value_start": 35,
+                "value_end": 51,
+                "variable": "salt8",
+                "variable_start": 21,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2806390622295662,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -12647,6 +12674,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.989,
+        "rule": "URL Credentials",
+        "severity": "high",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
+                "line_num": 3,
+                "path": "tests/samples/url_cred.js",
+                "info": "tests/samples/url_cred.js|RAW",
+                "value": "FnD83JZs",
+                "value_start": 44,
+                "value_end": 52,
+                "variable": "smtps://",
+                "variable_start": 18,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.0,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -12657,7 +12711,7 @@
         "line_data_list": [
             {
                 "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";",
-                "line_num": 3,
+                "line_num": 5,
                 "path": "tests/samples/url_cred.js",
                 "info": "tests/samples/url_cred.js|RAW",
                 "value": "546DFS64N90P3AW7DX%2Fkeep",
@@ -12684,7 +12738,7 @@
         "line_data_list": [
             {
                 "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"",
-                "line_num": 7,
+                "line_num": 9,
                 "path": "tests/samples/url_cred.js",
                 "info": "tests/samples/url_cred.js|RAW",
                 "value": "f45VgF8jX79o@anydata.com",
@@ -12711,7 +12765,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 10,
+                "line_num": 12,
                 "path": "tests/samples/url_cred.js",
                 "info": "tests/samples/url_cred.js|RAW",
                 "value": "3487263-2384579834-234732875-345",
@@ -12738,7 +12792,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 10,
+                "line_num": 12,
                 "path": "tests/samples/url_cred.js",
                 "info": "tests/samples/url_cred.js|RAW",
                 "value": "546DFS64N90P3AW7DX",
diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json
index 0287a0ae5..878f82d6d 100644
--- a/tests/data/ml_threshold.json
+++ b/tests/data/ml_threshold.json
@@ -11007,6 +11007,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.665,
+        "rule": "Salt",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "json_escaped = \"{\\\\\\\"salt8\\\\\\\":\\\\\\\"4b9a6d8b638eb0c6\\\\\\\"}\"",
+                "line_num": 5,
+                "path": "tests/samples/salt.py",
+                "info": "",
+                "value": "4b9a6d8b638eb0c6",
+                "value_start": 35,
+                "value_end": 51,
+                "variable": "salt8",
+                "variable_start": 21,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2806390622295662,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -11682,6 +11709,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.989,
+        "rule": "URL Credentials",
+        "severity": "high",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
+                "line_num": 3,
+                "path": "tests/samples/url_cred.js",
+                "info": "",
+                "value": "FnD83JZs",
+                "value_start": 44,
+                "value_end": 52,
+                "variable": "smtps://",
+                "variable_start": 18,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.0,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -11692,7 +11746,7 @@
         "line_data_list": [
             {
                 "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";",
-                "line_num": 3,
+                "line_num": 5,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "546DFS64N90P3AW7DX%2Fkeep",
@@ -11719,7 +11773,7 @@
         "line_data_list": [
             {
                 "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"",
-                "line_num": 7,
+                "line_num": 9,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "f45VgF8jX79o@anydata.com",
@@ -11746,7 +11800,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 10,
+                "line_num": 12,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "3487263-2384579834-234732875-345",
@@ -11773,7 +11827,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 10,
+                "line_num": 12,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "546DFS64N90P3AW7DX",
diff --git a/tests/data/output.json b/tests/data/output.json
index b9d388ae9..033d703a2 100644
--- a/tests/data/output.json
+++ b/tests/data/output.json
@@ -9954,6 +9954,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.665,
+        "rule": "Salt",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "json_escaped = \"{\\\\\\\"salt8\\\\\\\":\\\\\\\"4b9a6d8b638eb0c6\\\\\\\"}\"",
+                "line_num": 5,
+                "path": "tests/samples/salt.py",
+                "info": "",
+                "value": "4b9a6d8b638eb0c6",
+                "value_start": 35,
+                "value_end": 51,
+                "variable": "salt8",
+                "variable_start": 21,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.2806390622295662,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
@@ -10575,6 +10602,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.989,
+        "rule": "URL Credentials",
+        "severity": "high",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
+                "line_num": 3,
+                "path": "tests/samples/url_cred.js",
+                "info": "",
+                "value": "FnD83JZs",
+                "value_start": 44,
+                "value_end": 52,
+                "variable": "smtps://",
+                "variable_start": 18,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.0,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -10585,7 +10639,7 @@
         "line_data_list": [
             {
                 "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";",
-                "line_num": 3,
+                "line_num": 5,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "546DFS64N90P3AW7DX%2Fkeep",
@@ -10612,7 +10666,7 @@
         "line_data_list": [
             {
                 "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"",
-                "line_num": 7,
+                "line_num": 9,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "f45VgF8jX79o@anydata.com",
@@ -10639,7 +10693,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 10,
+                "line_num": 12,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "3487263-2384579834-234732875-345",
@@ -10666,7 +10720,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 10,
+                "line_num": 12,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "546DFS64N90P3AW7DX",
diff --git a/tests/filters/test_value_length_check.py b/tests/filters/test_value_length_check.py
deleted file mode 100644
index 69bd5f809..000000000
--- a/tests/filters/test_value_length_check.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import pytest
-
-from credsweeper.config import Config
-from credsweeper.filters import ValueLengthCheck
-from tests.filters.conftest import LINE_VALUE_PATTERN, DUMMY_ANALYSIS_TARGET
-from tests.test_utils.dummy_line_data import get_line_data
-
-
-class TestValueLengthCheck:
-
-    def test_value_length_check_p(self, file_path: pytest.fixture, config: Config,
-                                  success_line: pytest.fixture) -> None:
-        line_data = get_line_data(file_path, line=success_line, pattern=LINE_VALUE_PATTERN)
-        assert ValueLengthCheck(config).run(line_data, DUMMY_ANALYSIS_TARGET) is False
-
-    @pytest.mark.parametrize("line", ["Cra"])
-    def test_value_length_check_n(self, file_path: pytest.fixture, config: Config, line: str) -> None:
-        line_data = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN)
-        assert ValueLengthCheck(config).run(line_data, DUMMY_ANALYSIS_TARGET) is True
diff --git a/tests/ml_model/test_ml_validator.py b/tests/ml_model/test_ml_validator.py
index 9322eae63..fefc1eef8 100644
--- a/tests/ml_model/test_ml_validator.py
+++ b/tests/ml_model/test_ml_validator.py
@@ -10,7 +10,7 @@
 from credsweeper.credentials import Candidate, CandidateKey
 from credsweeper.ml_model import MlValidator
 from credsweeper.utils import Util
-from tests import AZ_STRING, NEGLIGIBLE_ML_THRESHOLD
+from tests import NEGLIGIBLE_ML_THRESHOLD
 
 
 class TestMlValidator(unittest.TestCase):
@@ -31,7 +31,6 @@ def setUp(self):
         self.config = Config(config_dict)
 
     def test_ml_validator_simple_p(self):
-
         def validate(_candidate: Candidate) -> Tuple[bool, float]:
             """Validate single credential candidate."""
             candidate_key = CandidateKey(_candidate.line_data_list[0])
@@ -48,22 +47,22 @@ def validate(_candidate: Candidate) -> Tuple[bool, float]:
         candidate.line_data_list[0].value = "Ahga%$FiQ@Ei8"
 
         decision, probability = validate(candidate)
-        self.assertAlmostEqual(probability, 0.9997520446777344, delta=NEGLIGIBLE_ML_THRESHOLD)
+        self.assertAlmostEqual(0.9997520446777344, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
 
         candidate.line_data_list[0].path = "sample.py"
         candidate.line_data_list[0].file_type = ".yaml"
         decision, probability = validate(candidate)
-        self.assertAlmostEqual(probability, 0.9994515776634216, delta=NEGLIGIBLE_ML_THRESHOLD)
+        self.assertAlmostEqual(0.9994515776634216, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
 
         candidate.line_data_list[0].path = "test.zip"
         candidate.line_data_list[0].file_type = ".zip"
         decision, probability = validate(candidate)
-        self.assertAlmostEqual(probability, 0.9994281530380249, delta=NEGLIGIBLE_ML_THRESHOLD)
+        self.assertAlmostEqual(0.9994281530380249, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
 
         candidate.line_data_list[0].path = "other.txt"
         candidate.line_data_list[0].file_type = ".txt"
         decision, probability = validate(candidate)
-        self.assertAlmostEqual(probability, 0.9980608820915222, delta=NEGLIGIBLE_ML_THRESHOLD)
+        self.assertAlmostEqual(0.9980608820915222, probability, delta=NEGLIGIBLE_ML_THRESHOLD)
 
     def test_extract_features_p(self):
         candidate1 = Candidate.get_dummy_candidate(self.config, "main.py", ".py", "info")
@@ -74,10 +73,10 @@ def test_extract_features_p(self):
         candidate1.line_data_list[0].value = "123"
         candidate1.rule_name = "Password"
         features1 = self.ml_validator.extract_features([candidate1])
-        self.assertEqual(18, np.count_nonzero(features1))
+        self.assertAlmostEqual(18, np.count_nonzero(features1), delta=NEGLIGIBLE_ML_THRESHOLD)
         candidate2 = copy.deepcopy(candidate1)
         features2 = self.ml_validator.extract_features([candidate1, candidate2])
-        self.assertEqual(18, np.count_nonzero(features2))
+        self.assertAlmostEqual(18, np.count_nonzero(features2), delta=NEGLIGIBLE_ML_THRESHOLD)
         candidate2.rule_name = "Secret"
         features3 = self.ml_validator.extract_features([candidate1, candidate2])
-        self.assertEqual(19, np.count_nonzero(features3))
+        self.assertAlmostEqual(19, np.count_nonzero(features3), delta=NEGLIGIBLE_ML_THRESHOLD)
diff --git a/tests/samples/salt.py b/tests/samples/salt.py
index 4140c4e5a..60b2fcd07 100644
--- a/tests/samples/salt.py
+++ b/tests/samples/salt.py
@@ -2,3 +2,4 @@
 salt2 = r"""\0x12\0x3s"""
 salt3 = u"\u0020827634876"
 salt4 = {"salt5": "my124%#$@s\x04clt\0"}
+json_escaped = "{\\\"salt8\\\":\\\"4b9a6d8b638eb0c6\\\"}"
diff --git a/tests/samples/url_cred.js b/tests/samples/url_cred.js
index 4b21bc971..db6af3e57 100644
--- a/tests/samples/url_cred.js
+++ b/tests/samples/url_cred.js
@@ -1,5 +1,7 @@
 const connection_url = require('dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local');
 
+email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465";
+
 url = "https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut";
 
 // note:dummyuser@example.com
diff --git a/tests/test_main.py b/tests/test_main.py
index d7066debf..6c774c4f9 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -749,7 +749,7 @@ def prepare(report: List[Dict[str, Any]]):
                 tmp_file = Path(tmp_dir) / cfg["json_filename"]
                 # apply the current path to keep equivalence in path
                 os.chdir(TESTS_PATH.parent)
-                content_provider: AbstractProvider = FilesProvider(["tests/samples"])
+                content_provider: AbstractProvider = FilesProvider([Path("tests") / "samples"])
                 # replace output report file to place in tmp_dir
                 cfg["json_filename"] = str(tmp_file)
                 cred_sweeper = CredSweeper(**cfg)

From 49e82cee1c82d0f793f41f53041262a2e0fc52d9 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Tue, 23 Jul 2024 21:08:26 +0300
Subject: [PATCH 02/18] style

---
 credsweeper/filters/group/group.py                 | 4 ++--
 credsweeper/filters/group/url_credentials_group.py | 4 ++--
 tests/ml_model/test_ml_validator.py                | 1 +
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/credsweeper/filters/group/group.py b/credsweeper/filters/group/group.py
index 65a245fa9..37cea6948 100644
--- a/credsweeper/filters/group/group.py
+++ b/credsweeper/filters/group/group.py
@@ -5,8 +5,8 @@
 from credsweeper.config import Config
 from credsweeper.filters import (Filter, LineSpecificKeyCheck, ValueAllowlistCheck, ValueArrayDictionaryCheck,
                                  ValueBlocklistCheck, ValueCamelCaseCheck, ValueFilePathCheck, ValueFirstWordCheck,
-                                 ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck,
-                                 ValuePatternCheck, ValueSimilarityCheck, ValueStringTypeCheck, ValueTokenCheck,
+                                 ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck, ValuePatternCheck,
+                                 ValueSimilarityCheck, ValueStringTypeCheck, ValueTokenCheck,
                                  VariableNotAllowedPatternCheck, ValueHexNumberCheck)
 
 
diff --git a/credsweeper/filters/group/url_credentials_group.py b/credsweeper/filters/group/url_credentials_group.py
index 4c4c5d6a0..23aba1d3b 100644
--- a/credsweeper/filters/group/url_credentials_group.py
+++ b/credsweeper/filters/group/url_credentials_group.py
@@ -2,8 +2,8 @@
 from credsweeper.config import Config
 from credsweeper.filters import (ValueAllowlistCheck, ValueArrayDictionaryCheck, ValueBlocklistCheck,
                                  ValueCamelCaseCheck, ValueDictionaryValueLengthCheck, ValueFilePathCheck,
-                                 ValueFirstWordCheck, ValueLastWordCheck, ValueMethodCheck,
-                                 ValueNotAllowedPatternCheck, ValuePatternCheck, ValueStringTypeCheck, ValueTokenCheck)
+                                 ValueFirstWordCheck, ValueLastWordCheck, ValueMethodCheck, ValueNotAllowedPatternCheck,
+                                 ValuePatternCheck, ValueStringTypeCheck, ValueTokenCheck)
 from credsweeper.filters.group import Group
 
 
diff --git a/tests/ml_model/test_ml_validator.py b/tests/ml_model/test_ml_validator.py
index fefc1eef8..ee7083ae9 100644
--- a/tests/ml_model/test_ml_validator.py
+++ b/tests/ml_model/test_ml_validator.py
@@ -31,6 +31,7 @@ def setUp(self):
         self.config = Config(config_dict)
 
     def test_ml_validator_simple_p(self):
+
         def validate(_candidate: Candidate) -> Tuple[bool, float]:
             """Validate single credential candidate."""
             candidate_key = CandidateKey(_candidate.line_data_list[0])

From fdd483bd93b93bc46431f6bfbc10ec016358c8cd Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Tue, 23 Jul 2024 21:44:59 +0300
Subject: [PATCH 03/18] unicode cases in filter

---
 credsweeper/filters/value_not_allowed_pattern_check.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/credsweeper/filters/value_not_allowed_pattern_check.py b/credsweeper/filters/value_not_allowed_pattern_check.py
index 944c9c34e..a0cc89aa1 100644
--- a/credsweeper/filters/value_not_allowed_pattern_check.py
+++ b/credsweeper/filters/value_not_allowed_pattern_check.py
@@ -10,7 +10,7 @@
 class ValueNotAllowedPatternCheck(Filter):
     """Check that secret doesn't open or closes brackets or a new line."""
 
-    NOT_ALLOWED = [r"[<>\[\]{}]\s+", r"^\s*\\", r"^\s*\\n\s*"]
+    NOT_ALLOWED = [r"[<>\[\]{}]\s+", r"\\u00(26|3c)gt;?(\s|\\+[nrt])?", r"^\s*\\", r"^\s*\\n\s*"]
     NOT_ALLOWED_PATTERN = re.compile(  #
         f"{Util.get_regex_combine_or(NOT_ALLOWED)}$",  #
         flags=re.IGNORECASE)

From 21fd81fb8dcc175921d7a20153154d3d43c9abf7 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Tue, 23 Jul 2024 21:47:42 +0300
Subject: [PATCH 04/18] tmp markup loan

---
 cicd/benchmark.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
index c8d8fa7f7..52f7b0ca0 100644
--- a/cicd/benchmark.txt
+++ b/cicd/benchmark.txt
@@ -223,6 +223,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
 TOTAL:                  10333      16988573         8377        60439         5233
+NOT FOUND WITH KEY: 1338367,b42689a1,GitHub,ac9be8d9,data/ac9be8d9/test/b42689a1.exs,445,445,F,F,68,78,F,F,,,,,0.0,0,F,F,F,Password
 credsweeper result_cnt : 7800, lost_cnt : 0, true_cnt : 7231, false_cnt : 569
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------

From 97ef85bbd209dc242829c312806f968b54a2651c Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Wed, 24 Jul 2024 07:56:54 +0300
Subject: [PATCH 05/18] BM scores fix

---
 cicd/benchmark.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
index 52f7b0ca0..60795a0fe 100644
--- a/cicd/benchmark.txt
+++ b/cicd/benchmark.txt
@@ -224,7 +224,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .zsh-theme                  1            97                         1
 TOTAL:                  10333      16988573         8377        60439         5233
 NOT FOUND WITH KEY: 1338367,b42689a1,GitHub,ac9be8d9,data/ac9be8d9/test/b42689a1.exs,445,445,F,F,68,78,F,F,,,,,0.0,0,F,F,F,Password
-credsweeper result_cnt : 7800, lost_cnt : 0, true_cnt : 7231, false_cnt : 569
+credsweeper result_cnt : 7808, lost_cnt : 1, true_cnt : 7237, false_cnt : 570
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
 API                                     123         3163          185         112   109     3   3345    14  0.000896  0.113821  0.995102  0.973214  0.886179  0.927660
@@ -260,12 +260,12 @@ Key                                     483         8494          464         44
 Nonce                                    83           53            0          85    79     6     47     4  0.113208  0.048193  0.926471  0.929412  0.951807  0.940476
 Other                                     0            0            5                 0     0      5     0  0.000000            1.000000
 PEM Private Key                        1019         1483            0        1023  1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1823         7474         2752        1681  1614    67  10159   209  0.006552  0.114646  0.977094  0.960143  0.885354  0.921233
+Password                               1823         7474         2752        1682  1614    67  10159   209  0.006552  0.114646  0.977094  0.960143  0.885354  0.921233
 Salt                                     42           76            2          38    38     0     78     4  0.000000  0.095238  0.966667  1.000000  0.904762  0.950000
 Secret                                 1358        28497          869        1234  1229     5  29361   129  0.000170  0.094993  0.995639  0.995948  0.905007  0.948302
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   585         3972          439         519   511     8   4403    74  0.001814  0.126496  0.983587  0.984586  0.873504  0.925725
+Token                                   585         3972          439         521   512     9   4402    73  0.002040  0.124786  0.983587  0.982726  0.875214  0.925859
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
-URL Credentials                         194          125          251         184   184     0    376    10  0.000000  0.051546  0.982456  1.000000  0.948454  0.973545
-                                       8377        60439         5233        7942  7231   569  59870  1146  0.009414  0.136803  0.975078  0.927051  0.863197  0.893985
+URL Credentials                         194          125          251         189   189     0    376     5  0.000000  0.025773  0.991228  1.000000  0.974227  0.986945
+                                       8377        60439         5233        7950  7237   570  59869  1140  0.009431  0.136087  0.975151  0.926989  0.863913  0.894340

From f851904671619c24226fadc2048b11918a41fec0 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Wed, 24 Jul 2024 07:59:13 +0300
Subject: [PATCH 06/18] docs upd

---
 docs/source/credsweeper.filters.rst | 48 +++++++++++++++++++----------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/docs/source/credsweeper.filters.rst b/docs/source/credsweeper.filters.rst
index f2da332f5..5d336e89d 100644
--- a/docs/source/credsweeper.filters.rst
+++ b/docs/source/credsweeper.filters.rst
@@ -20,6 +20,14 @@ credsweeper.filters.filter module
    :undoc-members:
    :show-inheritance:
 
+credsweeper.filters.line\_git\_binary\_check module
+---------------------------------------------------
+
+.. automodule:: credsweeper.filters.line_git_binary_check
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 credsweeper.filters.line\_specific\_key\_check module
 -----------------------------------------------------
 
@@ -132,6 +140,14 @@ credsweeper.filters.value\_dictionary\_value\_length\_check module
    :undoc-members:
    :show-inheritance:
 
+credsweeper.filters.value\_discord\_bot\_check module
+-----------------------------------------------------
+
+.. automodule:: credsweeper.filters.value_discord_bot_check
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 credsweeper.filters.value\_entropy\_base32\_check module
 --------------------------------------------------------
 
@@ -188,6 +204,22 @@ credsweeper.filters.value\_grafana\_check module
    :undoc-members:
    :show-inheritance:
 
+credsweeper.filters.value\_grafana\_service\_check module
+---------------------------------------------------------
+
+.. automodule:: credsweeper.filters.value_grafana_service_check
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+credsweeper.filters.value\_hex\_number\_check module
+----------------------------------------------------
+
+.. automodule:: credsweeper.filters.value_hex_number_check
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 credsweeper.filters.value\_ip\_check module
 -------------------------------------------
 
@@ -220,14 +252,6 @@ credsweeper.filters.value\_last\_word\_check module
    :undoc-members:
    :show-inheritance:
 
-credsweeper.filters.value\_length\_check module
------------------------------------------------
-
-.. automodule:: credsweeper.filters.value_length_check
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 credsweeper.filters.value\_method\_check module
 -----------------------------------------------
 
@@ -268,14 +292,6 @@ credsweeper.filters.value\_pattern\_check module
    :undoc-members:
    :show-inheritance:
 
-credsweeper.filters.value\_pattern\_length\_check module
---------------------------------------------------------
-
-.. automodule:: credsweeper.filters.value_pattern_length_check
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 credsweeper.filters.value\_similarity\_check module
 ---------------------------------------------------
 

From ec48914eec0164de46f58893efb3e7e08d5cf17a Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Wed, 24 Jul 2024 12:52:04 +0300
Subject: [PATCH 07/18] removed unused filter

---
 .../filters/separator_unusual_check.py        | 49 -------------------
 1 file changed, 49 deletions(-)
 delete mode 100644 credsweeper/filters/separator_unusual_check.py

diff --git a/credsweeper/filters/separator_unusual_check.py b/credsweeper/filters/separator_unusual_check.py
deleted file mode 100644
index b05da326b..000000000
--- a/credsweeper/filters/separator_unusual_check.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import logging
-
-from credsweeper.config import Config
-from credsweeper.credentials import LineData
-from credsweeper.file_handler.analysis_target import AnalysisTarget
-from credsweeper.filters import Filter
-
-logger = logging.getLogger(__name__)
-
-
-class SeparatorUnusualCheck(Filter):
-    """Check that candidate have no double symbol ops (like ++, --, <<) or comparison ops (like != or ==) as separator.
-
-    Example:
-        `pwd == 'value'`
-        `pwd != 'value'`
-        `pwd << value`
-
-    """
-
-    def __init__(self, config: Config = None) -> None:
-        pass
-
-    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
-        """Run filter checks on received credential candidate data 'line_data'.
-
-        Args:
-            line_data: credential candidate data
-            target: multiline target from which line data was obtained
-
-        Return:
-            True, if need to filter candidate and False if left
-
-        """
-        if line_data.separator is None:
-            return True
-
-        if 1 > line_data.separator_start:
-            logger.warning(f"Wrong separator start position {line_data}")
-            return True
-
-        try:
-            if line_data.separator == line_data.line[line_data.separator_start + 1] or \
-                    (line_data.separator == "=" and line_data.line[line_data.separator_start - 1] == "!"):
-                return True
-        except IndexError:
-            return True
-
-        return False

From 898a2523fce2823723669506399d401eeb444901 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Thu, 25 Jul 2024 10:01:54 +0300
Subject: [PATCH 08/18] doc upd

---
 docs/source/credsweeper.filters.rst | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/docs/source/credsweeper.filters.rst b/docs/source/credsweeper.filters.rst
index 5d336e89d..94f1891be 100644
--- a/docs/source/credsweeper.filters.rst
+++ b/docs/source/credsweeper.filters.rst
@@ -36,14 +36,6 @@ credsweeper.filters.line\_specific\_key\_check module
    :undoc-members:
    :show-inheritance:
 
-credsweeper.filters.separator\_unusual\_check module
-----------------------------------------------------
-
-.. automodule:: credsweeper.filters.separator_unusual_check
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 credsweeper.filters.value\_allowlist\_check module
 --------------------------------------------------
 

From 92858b6cbd8db78b8e1b660b07dc0a3d976b4051 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Thu, 25 Jul 2024 12:31:23 +0300
Subject: [PATCH 09/18] [no ci] upd2

---
 cicd/benchmark.txt                            | 11 +++---
 credsweeper/filters/group/general_pattern.py  |  2 ++
 .../filters/line_specific_key_check.py        | 12 +++++--
 .../filters/value_useless_word_check.py       |  7 ++--
 tests/__init__.py                             |  2 +-
 tests/data/depth_3.json                       | 35 ++++++++++++++++---
 tests/data/ml_threshold.json                  |  8 ++---
 tests/data/output.json                        |  8 ++---
 tests/filters/test_line_specific_key_check.py |  4 ++-
 .../filters/test_value_useless_word_check.py  |  2 +-
 tests/samples/aws_client_id                   |  2 ++
 tests/samples/key.hs                          |  2 +-
 12 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
index 60795a0fe..1e49d192d 100644
--- a/cicd/benchmark.txt
+++ b/cicd/benchmark.txt
@@ -223,14 +223,15 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
 TOTAL:                  10333      16988573         8377        60439         5233
-NOT FOUND WITH KEY: 1338367,b42689a1,GitHub,ac9be8d9,data/ac9be8d9/test/b42689a1.exs,445,445,F,F,68,78,F,F,,,,,0.0,0,F,F,F,Password
-credsweeper result_cnt : 7808, lost_cnt : 1, true_cnt : 7237, false_cnt : 570
+UNMATCH (35, 949): 1338367,8605db08,GitHub,48fd3902,data/48fd3902/test/8605db08.kt,30,30,F,F,34,949,F,F,,,,,0.0,0,F,F,F,Token
+NOT FOUND WITH KEY: 1338368,b42689a1,GitHub,ac9be8d9,data/ac9be8d9/test/b42689a1.exs,445,445,F,F,68,78,F,F,,,,,0.0,0,F,F,F,Password
+credsweeper result_cnt : 7810, lost_cnt : 2, true_cnt : 7237, false_cnt : 571
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
 API                                     123         3163          185         112   109     3   3345    14  0.000896  0.113821  0.995102  0.973214  0.886179  0.927660
 AWS Client ID                           168           13            0         160   160     0     13     8  0.000000  0.047619  0.955801  1.000000  0.952381  0.975610
 AWS Multi                                75           12            0          87    75    11      1     0  0.916667  0.000000  0.873563  0.872093  1.000000  0.931677
-AWS S3 Bucket                            61           25            0          87    61    24      1     0  0.960000  0.000000  0.720930  0.717647  1.000000  0.835616
+AWS S3 Bucket                            61           25            0          92    61    25      0     0  1.000000  0.000000  0.709302  0.709302  1.000000  0.829932
 Atlassian Old PAT token                  27          212            3          12     3     8    207    24  0.037209  0.888889  0.867769  0.272727  0.111111  0.157895
 Auth                                    407         2725           77         372   351    21   2781    56  0.007495  0.137592  0.976005  0.943548  0.862408  0.901155
 Azure Access Token                       19            0            0                 0     0      0    19            1.000000  0.000000            0.000000
@@ -265,7 +266,7 @@ Salt                                     42           76            2          3
 Secret                                 1358        28497          869        1234  1229     5  29361   129  0.000170  0.094993  0.995639  0.995948  0.905007  0.948302
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   585         3972          439         521   512     9   4402    73  0.002040  0.124786  0.983587  0.982726  0.875214  0.925859
+Token                                   585         3972          439         522   512     9   4402    73  0.002040  0.124786  0.983587  0.982726  0.875214  0.925859
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
 URL Credentials                         194          125          251         189   189     0    376     5  0.000000  0.025773  0.991228  1.000000  0.974227  0.986945
-                                       8377        60439         5233        7950  7237   570  59869  1140  0.009431  0.136087  0.975151  0.926989  0.863913  0.894340
+                                       8377        60439         5233        7956  7237   571  59868  1140  0.009448  0.136087  0.975137  0.926870  0.863913  0.894285
diff --git a/credsweeper/filters/group/general_pattern.py b/credsweeper/filters/group/general_pattern.py
index b340055fd..96018a5f4 100644
--- a/credsweeper/filters/group/general_pattern.py
+++ b/credsweeper/filters/group/general_pattern.py
@@ -1,5 +1,6 @@
 from credsweeper.common.constants import GroupType
 from credsweeper.config import Config
+from credsweeper.filters import ValueUselessWordCheck
 from credsweeper.filters.group import Group
 
 
@@ -8,3 +9,4 @@ class GeneralPattern(Group):
 
     def __init__(self, config: Config) -> None:
         super().__init__(config, GroupType.PATTERN)
+        self.filters.extend([ValueUselessWordCheck()])
diff --git a/credsweeper/filters/line_specific_key_check.py b/credsweeper/filters/line_specific_key_check.py
index 8bbfa15a1..71fec9dc0 100644
--- a/credsweeper/filters/line_specific_key_check.py
+++ b/credsweeper/filters/line_specific_key_check.py
@@ -1,5 +1,6 @@
 import re
 
+from credsweeper.common.constants import ML_HUNK
 from credsweeper.config import Config
 from credsweeper.credentials import LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -10,8 +11,8 @@
 class LineSpecificKeyCheck(Filter):
     """Check that values from list below is not in candidate line."""
 
-    NOT_ALLOWED = [r"example", r"enc\(", r"enc\[", r"true", r"false"]
-    NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED))
+    NOT_ALLOWED = [r"example", r"\benc[\(\[]", r"\btrue\b", r"\bfalse\b"]
+    NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED), re.IGNORECASE)
 
     def __init__(self, config: Config = None) -> None:
         pass
@@ -29,8 +30,13 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         """
         if line_data.line is None:
             return True
+        if 0 <= line_data.variable_start:
+            # variable may be defined too
+            sub_line_start = 0 if ML_HUNK >= line_data.variable_start else line_data.variable_start - ML_HUNK
+        else:
+            sub_line_start = 0 if ML_HUNK >= line_data.value_start else line_data.value_start - ML_HUNK
 
-        if self.NOT_ALLOWED_PATTERN.search(target.line_lower):
+        if self.NOT_ALLOWED_PATTERN.search(line_data.line, sub_line_start, line_data.value_end + ML_HUNK):
             return True
 
         return False
diff --git a/credsweeper/filters/value_useless_word_check.py b/credsweeper/filters/value_useless_word_check.py
index c921c937d..6182d1e3f 100644
--- a/credsweeper/filters/value_useless_word_check.py
+++ b/credsweeper/filters/value_useless_word_check.py
@@ -11,11 +11,10 @@ class ValueUselessWordCheck(Filter):
     """Check is candidate value contains sub-rows with operators (like ->)."""
 
     NOT_ALLOWED = [
-        "((\\{)?(0x)+([0-9a-f]|\\%){1}.*)",  # Check is contain \{0x or 0x
-        "(\\-\\>.*)",  # Check if contain ->
-        "(xxxx.*)",  # Check if contain xxxxx
+        "((\\{)?(0x)+([0-9a-f]|\\%){1})",  # Check if it contains \{0x or 0x
+        r"((\w+)?->)",  # Check if it contains ->
+        "(.*example)",  # Check if it contains the word `example`
         "(\\$\\w+)",  # Check whether it looks like a variable e.g. $word
-        "(\\s).*"  # Check if contain \s
     ]
     NOT_ALLOWED_PATTERN = re.compile(  #
         Util.get_regex_combine_or(NOT_ALLOWED),  #
diff --git a/tests/__init__.py b/tests/__init__.py
index d83aadba8..9a68241c1 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -18,7 +18,7 @@
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 25
-SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 18
+SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 19
 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1
 
 # well known string with all latin letters
diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json
index 27447f7fc..f9b502b47 100644
--- a/tests/data/depth_3.json
+++ b/tests/data/depth_3.json
@@ -9075,17 +9075,17 @@
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.986,
+        "ml_probability": 0.999,
         "rule": "Secret",
         "severity": "medium",
         "confidence": "moderate",
         "line_data_list": [
             {
-                "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE\"",
+                "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE\"",
                 "line_num": 5,
                 "path": "tests/samples/key.hs",
                 "info": "tests/samples/key.hs|RAW",
-                "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE",
+                "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE",
                 "value_start": 32,
                 "value_end": 72,
                 "variable": "secret_looks_like_linux_path__",
@@ -9093,7 +9093,7 @@
                 "variable_end": 30,
                 "entropy_validation": {
                     "iterator": "BASE64_CHARS",
-                    "entropy": 4.784183719779189,
+                    "entropy": 4.8530559073332755,
                     "valid": true
                 }
             }
@@ -11162,6 +11162,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.911,
+        "rule": "Salt",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "salt3 = \" 827634876\"; ",
+                "line_num": 1,
+                "path": "tests/samples/salt.py",
+                "info": "tests/samples/salt.py|STRUCT|STRUCT:2|KEYWORD:`salt3 = \" 827634876\"; `",
+                "value": " 827634876",
+                "value_start": 9,
+                "value_end": 19,
+                "variable": "salt3",
+                "variable_start": 0,
+                "variable_end": 5,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 2.389735285398626,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json
index 878f82d6d..ea8154995 100644
--- a/tests/data/ml_threshold.json
+++ b/tests/data/ml_threshold.json
@@ -9736,17 +9736,17 @@
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.986,
+        "ml_probability": 0.999,
         "rule": "Secret",
         "severity": "medium",
         "confidence": "moderate",
         "line_data_list": [
             {
-                "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE\"",
+                "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE\"",
                 "line_num": 5,
                 "path": "tests/samples/key.hs",
                 "info": "",
-                "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE",
+                "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE",
                 "value_start": 32,
                 "value_end": 72,
                 "variable": "secret_looks_like_linux_path__",
@@ -9754,7 +9754,7 @@
                 "variable_end": 30,
                 "entropy_validation": {
                     "iterator": "BASE64_CHARS",
-                    "entropy": 4.784183719779189,
+                    "entropy": 4.8530559073332755,
                     "valid": true
                 }
             }
diff --git a/tests/data/output.json b/tests/data/output.json
index 033d703a2..e6b2d1d13 100644
--- a/tests/data/output.json
+++ b/tests/data/output.json
@@ -8818,17 +8818,17 @@
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.986,
+        "ml_probability": 0.999,
         "rule": "Secret",
         "severity": "medium",
         "confidence": "moderate",
         "line_data_list": [
             {
-                "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE\"",
+                "line": "secret_looks_like_linux_path__=\"VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE\"",
                 "line_num": 5,
                 "path": "tests/samples/key.hs",
                 "info": "",
-                "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE",
+                "value": "VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE",
                 "value_start": 32,
                 "value_end": 72,
                 "variable": "secret_looks_like_linux_path__",
@@ -8836,7 +8836,7 @@
                 "variable_end": 30,
                 "entropy_validation": {
                     "iterator": "BASE64_CHARS",
-                    "entropy": 4.784183719779189,
+                    "entropy": 4.8530559073332755,
                     "valid": true
                 }
             }
diff --git a/tests/filters/test_line_specific_key_check.py b/tests/filters/test_line_specific_key_check.py
index ca123d839..c55a3cb3f 100644
--- a/tests/filters/test_line_specific_key_check.py
+++ b/tests/filters/test_line_specific_key_check.py
@@ -18,9 +18,11 @@ def test_line_specific_key_check_p(self, file_path: pytest.fixture, line: str) -
 
     @pytest.mark.parametrize("line", [
         '"AwsAccessKey": enc("AKIAGIREOGIAWSKEY123"),',
-        '"AwsAccessKey": "AKIAGIREXAMPLEKEY123"',
+        '"AwsAccessKey as example": "AKIAGIREXAMPLEKEY123"',
     ])
     def test_line_specific_key_check_n(self, file_path: pytest.fixture, line: str) -> None:
         cred_candidate = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN)
+        # LINE_VALUE_PATTERN does not detect a value position
+        cred_candidate.value_start = line.find("AKIA")
         target = AnalysisTarget(line_pos=0, lines=[line], line_nums=[1], descriptor=DUMMY_DESCRIPTOR)
         assert LineSpecificKeyCheck().run(cred_candidate, target) is True
diff --git a/tests/filters/test_value_useless_word_check.py b/tests/filters/test_value_useless_word_check.py
index 2cbdcf192..911f3781f 100644
--- a/tests/filters/test_value_useless_word_check.py
+++ b/tests/filters/test_value_useless_word_check.py
@@ -11,7 +11,7 @@ def test_value_useless_word_check_p(self, file_path: pytest.fixture, success_lin
         line_data = get_line_data(file_path=file_path, line=success_line, pattern=LINE_VALUE_PATTERN)
         assert ValueUselessWordCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False
 
-    @pytest.mark.parametrize("line", ["{0x943058439}", "0x%", "->gi_reo_gi", "xxxxxGIREOGI", " GIREOGI"])
+    @pytest.mark.parametrize("line", ["{0x943058439}", "0x%", "->gi_reo_gi", "GIREOGIEXAMPLE"])
     def test_value_useless_word_check_n(self, file_path: pytest.fixture, line: str) -> None:
         line_data = get_line_data(file_path=file_path, line=line, pattern=LINE_VALUE_PATTERN)
         assert ValueUselessWordCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is True
diff --git a/tests/samples/aws_client_id b/tests/samples/aws_client_id
index 3685378f5..b5a83275c 100644
--- a/tests/samples/aws_client_id
+++ b/tests/samples/aws_client_id
@@ -1,2 +1,4 @@
 The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X
 the coma is necessary there       ^      bariers thesting !!!
+must be filtered: AKIAGIREOGIAEXAMPLE7
+filtered too: AKIALGSBKLIKEAREAL12 --access-key <xcFsdeGddSAdI/KFRS2CB/3fGCsdCYEXAMPLEKEY>
diff --git a/tests/samples/key.hs b/tests/samples/key.hs
index 5a13fab50..3d197e623 100644
--- a/tests/samples/key.hs
+++ b/tests/samples/key.hs
@@ -2,6 +2,6 @@ prKeyValid=LS0tLS1CRUdJTiBQUklWQVRFIENDcUdTTTQ5QXdFSEJHMHdhd0lCQVFRZ0ViVnpmUGWxh
 secret_looks_like_linux_path_1="/VnpmUGWxhQW9KQAwrL2ZYdDJPNG1PQjYxMXNPaF"
 secret_looks_like_linux_path_2="VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjYxMXNPF"
 secret_looks_like_linux_path_3="VnpmUGWxhQW/9KQAwrL2ZYdDJPNG1PQjYxMXNPF="
-secret_looks_like_linux_path__="VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMPLE"
+secret_looks_like_linux_path__="VnpmUGWxhQW/9KQAwrL2ZYd/DJPNG1PQjEXAMbLE"
 
 "https://example.com/api/js?key=dhd0lCQVFRZ0ViVnpmUGWxhQW9KQWwrLzZYdDJPNG1PQjYxMXNPaFJB&bug=true"

From 6d0ab82c9f22f5cb240cf113f4d16c2567ad6de2 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Wed, 7 Aug 2024 12:57:30 +0300
Subject: [PATCH 10/18] test counters fix

---
 tests/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/__init__.py b/tests/__init__.py
index 4f6c5b878..41e62d375 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -7,18 +7,18 @@
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 362
-SAMPLES_CRED_LINE_COUNT: int = 379
+SAMPLES_CRED_COUNT: int = 364
+SAMPLES_CRED_LINE_COUNT: int = 381
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 320
+SAMPLES_POST_CRED_COUNT: int = 322
 
 # with option --doc
 SAMPLES_IN_DOC = 411
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24
-SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 17
+SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 18
 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1
 
 # well known string with all latin letters

From fdb5ee70ba2926b5739a3126b9c67fea29970f03 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Wed, 7 Aug 2024 13:16:48 +0300
Subject: [PATCH 11/18] reduce whitespaces during extracting subtext

---
 credsweeper/utils/util.py | 10 +++++++++-
 tests/utils/test_util.py  | 16 +++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py
index 3f51d18d5..1fa4d8188 100644
--- a/credsweeper/utils/util.py
+++ b/credsweeper/utils/util.py
@@ -4,6 +4,7 @@
 import logging
 import math
 import os
+import string
 import struct
 import tarfile
 from dataclasses import dataclass
@@ -685,6 +686,13 @@ def subtext(text: str, pos: int, hunk_size: int) -> str:
         else:
             left_quota = hunk_size - pos
             left_pos = 0
+        # skip leading whitespaces in result string
+        for i in range(left_pos, pos):
+            if text[i] in string.whitespace:
+                left_quota += 1
+                left_pos += 1
+            else:
+                break
         right_remain = len(text) - pos
         if hunk_size <= right_remain:
             right_quota = 0
@@ -698,4 +706,4 @@ def subtext(text: str, pos: int, hunk_size: int) -> str:
             left_pos -= right_quota
             if 0 > left_pos:
                 left_pos = 0
-        return text[left_pos:right_pos]
+        return text[left_pos:right_pos].rstrip()
diff --git a/tests/utils/test_util.py b/tests/utils/test_util.py
index 2d67cccde..b2cf2cca0 100644
--- a/tests/utils/test_util.py
+++ b/tests/utils/test_util.py
@@ -599,13 +599,19 @@ def test_get_chunks_coverage_p(self):
 
     def test_subtext_n(self):
         self.assertEqual("", Util.subtext("", 0, 0))
+        self.assertEqual("", Util.subtext(' ' * 42, 0, 0))
 
     def test_subtext_p(self):
-        # self.assertEqual(AZ_STRING, Util.subtext(AZ_STRING, 37, 40))
-        self.assertEqual("The quick ", Util.subtext(AZ_STRING, 0, 5))
-        self.assertEqual("The quick ", Util.subtext(AZ_STRING, 3, 5))
-        self.assertEqual(" fox jumps", Util.subtext(AZ_STRING, 20, 5))
+        self.assertEqual("var=value0123456789;", Util.subtext("                 var=value0123456789;   ", 21, 10))
+        self.assertEqual(AZ_STRING, Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 1 + len(AZ_STRING) >> 1))
+        self.assertEqual("x jump", Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 3))
+        self.assertEqual("ox jumps", Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 4))
+        self.assertEqual("fox jumps", Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 5))
+        self.assertEqual("fox jumps ov", Util.subtext(AZ_STRING, len(AZ_STRING) >> 1, 6))
+        self.assertEqual("The quick", Util.subtext(AZ_STRING, 0, 5))
+        self.assertEqual("The quick", Util.subtext(AZ_STRING, 3, 5))
+        self.assertEqual("fox jumps", Util.subtext(AZ_STRING, AZ_STRING.find("jumps"), 5))
         self.assertEqual("e lazy dog", Util.subtext(AZ_STRING, len(AZ_STRING) - 2, 5))
         self.assertEqual("the lazy dog", Util.subtext(AZ_STRING, len(AZ_STRING) - 2, 6))
-        self.assertEqual(AZ_STRING[:40], Util.subtext(AZ_STRING, 15, 20))
+        self.assertEqual(AZ_STRING[:39], Util.subtext(AZ_STRING, 15, 20))
         self.assertEqual(AZ_STRING[-40:], Util.subtext(AZ_STRING, 33, 20))

From 2c2191994d5ec05486fca72bbe3fe698960dd35f Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Wed, 7 Aug 2024 13:32:36 +0300
Subject: [PATCH 12/18] aux BM ref

---
 .github/workflows/benchmark.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index d6017bb39..8b418ecf1 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -23,7 +23,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: jwt
+          ref: auxiliary
 
       - name: Markup hashing
         run: |
@@ -74,7 +74,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: jwt
+          ref: auxiliary
 
       - name: Markup hashing
         run: |
@@ -172,7 +172,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: jwt
+          ref: auxiliary
 
       - name: Markup hashing
         run: |
@@ -354,7 +354,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: babenek/CredData
-          ref: jwt
+          ref: auxiliary
 
       - name: Markup hashing
         run: |

From d7f65c148f75c508e062d1601cb6ac5d49f2d8b1 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Wed, 7 Aug 2024 14:05:34 +0300
Subject: [PATCH 13/18] BM scores fix

---
 cicd/benchmark.txt | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
index eb12c2dd0..3380e7756 100644
--- a/cicd/benchmark.txt
+++ b/cicd/benchmark.txt
@@ -1,7 +1,7 @@
-DATA: 16978521 interested lines. MARKUP: 61845 items
+DATA: 16978521 interested lines. MARKUP: 61851 items
 FileType           FileNumber    ValidLines    Positives    Negatives    Templates
 ---------------  ------------  ------------  -----------  -----------  -----------
-                          194         28318           64          427           89
+                          194         28318           64          429           87
 .1                          2           641            2            5
 .admx                       1            26                         1
 .adoc                       1           158           11            6            1
@@ -53,8 +53,8 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .erb                       13           323                        27
 .erl                        4            96                         8
 .ex                        25          4968            3          105            5
-.example                   17          1838           73           37           55
-.exs                       24          4842            3          188            4
+.example                   17          1838           73           38           55
+.exs                       24          4842            3          189            4
 .ext                        5           211            1            4            2
 .fsproj                     1            75                         1
 .g4                         2           201                         2
@@ -80,7 +80,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .ipynb                      1           134                         5
 .j                          1           241                         4
 .j2                        30          5530            6          213           10
-.java                     621        134132          314         1357          170
+.java                     621        134132          314         1361          170
 .jenkinsfile                1            58            1            7
 .jinja2                     1            64                         2
 .js                       659        536413          521         2642          336
@@ -89,7 +89,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .jsx                        7           857                        19
 .jwt                        1             1            2
 .key                       83          2737           70           14
-.kt                       123         20774           50          384            3
+.kt                       123         20774           51          384            3
 .l                          1           982                         1
 .las                        1          6656                        46
 .lasso                      1           230                         6
@@ -150,13 +150,13 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .pug                        2           193                         2
 .purs                       1            69                         4
 .pxd                        1           150                         5            2
-.py                       890        291553          618         3465          748
+.py                       890        291553          618         3466          748
 .pyi                        4          1361                         9
 .pyp                        1           167                         1
 .pyx                        2          1094                        21
 .r                          4            62            6            3            1
 .rake                       2            51                         2
-.rb                       861        131867          237         3457          615
+.rb                       861        131867          237         3458          615
 .re                         1            31                         1
 .red                        1           159                         1
 .release                    1            13                         4
@@ -222,8 +222,8 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .yml                      418         36162          460          916          384
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
-TOTAL:                  10281      16978521         7499        59954         5230
-credsweeper result_cnt : 6597, lost_cnt : 0, true_cnt : 6352, false_cnt : 245
+TOTAL:                  10281      16978521         7500        59964         5228
+credsweeper result_cnt : 6594, lost_cnt : 0, true_cnt : 6346, false_cnt : 248
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
 API                                     123         3163          185         112   109     3   3345    14  0.000896  0.113821  0.995102  0.973214  0.886179  0.927660
@@ -231,7 +231,7 @@ AWS Client ID                           168           13            0         16
 AWS Multi                                75           12            0          87    75    11      1     0  0.916667  0.000000  0.873563  0.872093  1.000000  0.931677
 AWS S3 Bucket                            61           25            0          92    61    25      0     0  1.000000  0.000000  0.709302  0.709302  1.000000  0.829932
 Atlassian Old PAT token                  27          212            3          12     3     8    207    24  0.037209  0.888889  0.867769  0.272727  0.111111  0.157895
-Auth                                    407         2725           77         372   351    21   2781    56  0.007495  0.137592  0.976005  0.943548  0.862408  0.901155
+Auth                                    407         2728           77         371   350    21   2784    57  0.007487  0.140049  0.975716  0.943396  0.859951  0.899743
 Azure Access Token                       19            0            0          12    12     0      0     7            0.368421  0.631579  1.000000  0.631579  0.774194
 BASE64 Private Key                        7            2            0           7     7     0      2     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5     5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
@@ -253,16 +253,16 @@ Grafana Provisioned API Key              22            1            0
 JSON Web Token                          169           61            0         158   137    21     40    32  0.344262  0.189349  0.769565  0.867089  0.810651  0.837920
 Jira / Confluence PAT token               0            4            0                 0     0      4     0  0.000000            1.000000
 Jira 2FA                                 14            6            0          10    10     0      6     4  0.000000  0.285714  0.800000  1.000000  0.714286  0.833333
-Key                                     483         8494          464         445   436     9   8949    47  0.001005  0.097308  0.994068  0.979775  0.902692  0.939655
+Key                                     483         8496          464         443   434     9   8951    49  0.001004  0.101449  0.993858  0.979684  0.898551  0.937365
 Nonce                                    83           53            0          85    79     6     47     4  0.113208  0.048193  0.926471  0.929412  0.951807  0.940476
 Other                                     0            0            5                 0     0      5     0  0.000000            1.000000
 PEM Private Key                        1019         1483            0        1023  1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1823         7474         2752        1682  1614    67  10159   209  0.006552  0.114646  0.977094  0.960143  0.885354  0.921233
+Password                               1820         7476         2751        1681  1613    68  10159   207  0.006649  0.113736  0.977173  0.959548  0.886264  0.921451
 Salt                                     42           76            2          38    38     0     78     4  0.000000  0.095238  0.966667  1.000000  0.904762  0.950000
 Secret                                 1358        28497          869        1234  1229     5  29361   129  0.000170  0.094993  0.995639  0.995948  0.905007  0.948302
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   584         3973          438         519   511     8   4403    73  0.001814  0.125000  0.983784  0.984586  0.875000  0.926564
+Token                                   585         3975          438         512   503     9   4404    82  0.002039  0.140171  0.981793  0.982422  0.859829  0.917046
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
-URL Credentials                         194          125          251         184   184     0    376    10  0.000000  0.051546  0.982456  1.000000  0.948454  0.973545
-                                       7499        59954         5230        6604  6352   245  59709  1147  0.004086  0.152954  0.979363  0.962862  0.847046  0.901249
+URL Credentials                         197          126          250         190   190     0    376     7  0.000000  0.035533  0.987784  1.000000  0.964467  0.981912
+                                       7500        59964         5228        6605  6346   248  59716  1154  0.004136  0.153867  0.979219  0.962390  0.846133  0.900525

From 95d12f3c2d2dad831c6ac037405b878a23232208 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Wed, 7 Aug 2024 19:59:53 +0300
Subject: [PATCH 14/18] Rollback BM

---
 .github/workflows/benchmark.yml | 12 +++----
 tests/data/depth_3.json         | 62 ++++++++++++++++-----------------
 tests/data/ml_threshold.json    | 62 ++++++++++++++++-----------------
 tests/data/output.json          | 62 ++++++++++++++++-----------------
 tests/samples/url_cred.js       |  4 +--
 5 files changed, 99 insertions(+), 103 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 8b418ecf1..1caaa133f 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -22,8 +22,7 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: babenek/CredData
-          ref: auxiliary
+          repository: Samsung/CredData
 
       - name: Markup hashing
         run: |
@@ -73,8 +72,7 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: babenek/CredData
-          ref: auxiliary
+          repository: Samsung/CredData
 
       - name: Markup hashing
         run: |
@@ -171,8 +169,7 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: babenek/CredData
-          ref: auxiliary
+          repository: Samsung/CredData
 
       - name: Markup hashing
         run: |
@@ -353,8 +350,7 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: babenek/CredData
-          ref: auxiliary
+          repository: Samsung/CredData
 
       - name: Markup hashing
         run: |
diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json
index d4d44afa9..01cdc64ff 100644
--- a/tests/data/depth_3.json
+++ b/tests/data/depth_3.json
@@ -10784,33 +10784,6 @@
             }
         ]
     },
-    {
-        "api_validation": "NOT_AVAILABLE",
-        "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.989,
-        "rule": "URL Credentials",
-        "severity": "high",
-        "confidence": "moderate",
-        "line_data_list": [
-            {
-                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
-                "line_num": 3,
-                "path": "tests/samples/url_cred.js",
-                "info": "tests/samples/url_cred.js|RAW",
-                "value": "FnD83JZs",
-                "value_start": 44,
-                "value_end": 52,
-                "variable": "smtps://",
-                "variable_start": 18,
-                "variable_end": 26,
-                "entropy_validation": {
-                    "iterator": "BASE64_CHARS",
-                    "entropy": 3.0,
-                    "valid": false
-                }
-            }
-        ]
-    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -10821,7 +10794,7 @@
         "line_data_list": [
             {
                 "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";",
-                "line_num": 5,
+                "line_num": 3,
                 "path": "tests/samples/url_cred.js",
                 "info": "tests/samples/url_cred.js|RAW",
                 "value": "546DFS64N90P3AW7DX%2Fkeep",
@@ -10848,7 +10821,7 @@
         "line_data_list": [
             {
                 "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"",
-                "line_num": 9,
+                "line_num": 7,
                 "path": "tests/samples/url_cred.js",
                 "info": "tests/samples/url_cred.js|RAW",
                 "value": "f45VgF8jX79o@anydata.com",
@@ -10875,7 +10848,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 12,
+                "line_num": 10,
                 "path": "tests/samples/url_cred.js",
                 "info": "tests/samples/url_cred.js|RAW",
                 "value": "3487263-2384579834-234732875-345",
@@ -10902,7 +10875,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 12,
+                "line_num": 10,
                 "path": "tests/samples/url_cred.js",
                 "info": "tests/samples/url_cred.js|RAW",
                 "value": "546DFS64N90P3AW7DX",
@@ -10919,6 +10892,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.989,
+        "rule": "URL Credentials",
+        "severity": "high",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
+                "line_num": 13,
+                "path": "tests/samples/url_cred.js",
+                "info": "tests/samples/url_cred.js|RAW",
+                "value": "FnD83JZs",
+                "value_start": 44,
+                "value_end": 52,
+                "variable": "smtps://",
+                "variable_start": 18,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.0,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json
index c0bf22718..d905cd4b5 100644
--- a/tests/data/ml_threshold.json
+++ b/tests/data/ml_threshold.json
@@ -9846,33 +9846,6 @@
             }
         ]
     },
-    {
-        "api_validation": "NOT_AVAILABLE",
-        "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.989,
-        "rule": "URL Credentials",
-        "severity": "high",
-        "confidence": "moderate",
-        "line_data_list": [
-            {
-                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
-                "line_num": 3,
-                "path": "tests/samples/url_cred.js",
-                "info": "",
-                "value": "FnD83JZs",
-                "value_start": 44,
-                "value_end": 52,
-                "variable": "smtps://",
-                "variable_start": 18,
-                "variable_end": 26,
-                "entropy_validation": {
-                    "iterator": "BASE64_CHARS",
-                    "entropy": 3.0,
-                    "valid": false
-                }
-            }
-        ]
-    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -9883,7 +9856,7 @@
         "line_data_list": [
             {
                 "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";",
-                "line_num": 5,
+                "line_num": 3,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "546DFS64N90P3AW7DX%2Fkeep",
@@ -9910,7 +9883,7 @@
         "line_data_list": [
             {
                 "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"",
-                "line_num": 9,
+                "line_num": 7,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "f45VgF8jX79o@anydata.com",
@@ -9937,7 +9910,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 12,
+                "line_num": 10,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "3487263-2384579834-234732875-345",
@@ -9964,7 +9937,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 12,
+                "line_num": 10,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "546DFS64N90P3AW7DX",
@@ -9981,6 +9954,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.989,
+        "rule": "URL Credentials",
+        "severity": "high",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
+                "line_num": 13,
+                "path": "tests/samples/url_cred.js",
+                "info": "",
+                "value": "FnD83JZs",
+                "value_start": 44,
+                "value_end": 52,
+                "variable": "smtps://",
+                "variable_start": 18,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.0,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/data/output.json b/tests/data/output.json
index 24dc955c2..1d75227d5 100644
--- a/tests/data/output.json
+++ b/tests/data/output.json
@@ -8766,33 +8766,6 @@
             }
         ]
     },
-    {
-        "api_validation": "NOT_AVAILABLE",
-        "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.989,
-        "rule": "URL Credentials",
-        "severity": "high",
-        "confidence": "moderate",
-        "line_data_list": [
-            {
-                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
-                "line_num": 3,
-                "path": "tests/samples/url_cred.js",
-                "info": "",
-                "value": "FnD83JZs",
-                "value_start": 44,
-                "value_end": 52,
-                "variable": "smtps://",
-                "variable_start": 18,
-                "variable_end": 26,
-                "entropy_validation": {
-                    "iterator": "BASE64_CHARS",
-                    "entropy": 3.0,
-                    "valid": false
-                }
-            }
-        ]
-    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -8803,7 +8776,7 @@
         "line_data_list": [
             {
                 "line": "url = \"https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut\";",
-                "line_num": 5,
+                "line_num": 3,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "546DFS64N90P3AW7DX%2Fkeep",
@@ -8830,7 +8803,7 @@
         "line_data_list": [
             {
                 "line": "// \"fp://no.host.real/any/path/to/nowhere/\",\"key\":\"f45VgF8jX79o@anydata.com\"",
-                "line_num": 9,
+                "line_num": 7,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "f45VgF8jX79o@anydata.com",
@@ -8857,7 +8830,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 12,
+                "line_num": 10,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "3487263-2384579834-234732875-345",
@@ -8884,7 +8857,7 @@
         "line_data_list": [
             {
                 "line": "39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2",
-                "line_num": 12,
+                "line_num": 10,
                 "path": "tests/samples/url_cred.js",
                 "info": "",
                 "value": "546DFS64N90P3AW7DX",
@@ -8901,6 +8874,33 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.989,
+        "rule": "URL Credentials",
+        "severity": "high",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "email_as_login = \"smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465\";",
+                "line_num": 13,
+                "path": "tests/samples/url_cred.js",
+                "info": "",
+                "value": "FnD83JZs",
+                "value_start": 44,
+                "value_end": 52,
+                "variable": "smtps://",
+                "variable_start": 18,
+                "variable_end": 26,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.0,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/samples/url_cred.js b/tests/samples/url_cred.js
index db6af3e57..d555f7233 100644
--- a/tests/samples/url_cred.js
+++ b/tests/samples/url_cred.js
@@ -1,7 +1,5 @@
 const connection_url = require('dbconnection://ad%6Din:5WdF4f2jE76a@db-host-local');
 
-email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465";
-
 url = "https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut";
 
 // note:dummyuser@example.com
@@ -11,3 +9,5 @@ url = "https://secure.com/83675/39084?Credential=546DFS64N90P3AW7DX%2Fkeep%26cut
 /* partially line to sanitize url-like items
 39084?Credential=546DFS64N90P3AW7DX&key=3487263-2384579834-234732875-345&hasToBefound=2
 */
+
+email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465";

From 660fc44955b83a7fba0403075afb870a03ed67dd Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Thu, 8 Aug 2024 11:10:54 +0300
Subject: [PATCH 15/18] JWT fix

---
 credsweeper/filters/value_json_web_token_check.py | 2 +-
 tests/filters/test_value_json_web_token_check.py  | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/credsweeper/filters/value_json_web_token_check.py b/credsweeper/filters/value_json_web_token_check.py
index d7265dbce..ed6a2e2e1 100644
--- a/credsweeper/filters/value_json_web_token_check.py
+++ b/credsweeper/filters/value_json_web_token_check.py
@@ -53,7 +53,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
                     if not header_check:
                         header_check = bool(ValueJsonWebTokenCheck.header_keys.intersection(json_keys))
                     # payload follows the header
-                    if not payload_check:
+                    elif not payload_check:
                         payload_check = bool(ValueJsonWebTokenCheck.payload_keys.intersection(json_keys))
                         # any other payloads are allowed
                 elif header_check and payload_check and not signature_check:
diff --git a/tests/filters/test_value_json_web_token_check.py b/tests/filters/test_value_json_web_token_check.py
index 9aa85a752..4cb701956 100644
--- a/tests/filters/test_value_json_web_token_check.py
+++ b/tests/filters/test_value_json_web_token_check.py
@@ -20,6 +20,9 @@ def test_value_jwt_check_p(self):
         self.assertTrue(ValueJsonWebTokenCheck().run(
             get_line_data(line="eyJhbGciOiJSUzI1NiJ9Cg.eyJleHAiOjY1NTM2fQo.AAAAAAAAAAAAAAAAAAAAAAA",
                           pattern=LINE_VALUE_PATTERN), DUMMY_ANALYSIS_TARGET))
+        self.assertTrue(ValueJsonWebTokenCheck().run(
+            get_line_data(line="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.x3.GFsFyGiCUIP5VHI9CEJL9thWsGjSZf1fJfarNk-LGTM",
+                          pattern=LINE_VALUE_PATTERN), DUMMY_ANALYSIS_TARGET))
 
     def test_value_jwt_check_n(self):
         self.assertFalse(ValueJsonWebTokenCheck().run(
@@ -33,3 +36,8 @@ def test_value_jwt_check_n(self):
                                "Ui1o9ndy7ckISHQVhuYFKu78l7nqC4heghK_Gw4h7EB7s8eEuUC-D6JjVtX10IyS" \
                                "vCRkRo7f8dWQTjFLs7mlPowjRz0cP5J-MmCoegKHYagOHZ_ArXOR91_u8jMdwmOf",
                           pattern=LINE_VALUE_PATTERN), DUMMY_ANALYSIS_TARGET))
+        self.assertFalse(ValueJsonWebTokenCheck().run(
+            get_line_data(line="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9." \
+                               "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9." \
+                               ".e30.GFsFyGiCUIP5VHI9CEJL9thWsGjSZf1fJfarNk-LGTM",
+                          pattern=LINE_VALUE_PATTERN), DUMMY_ANALYSIS_TARGET))

From 536a7f394bb9d2592e89704933bacfb97a32a3a7 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Thu, 8 Aug 2024 11:31:03 +0300
Subject: [PATCH 16/18] customBMref

---
 .github/workflows/benchmark.yml | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 1caaa133f..8b418ecf1 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -22,7 +22,8 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: Samsung/CredData
+          repository: babenek/CredData
+          ref: auxiliary
 
       - name: Markup hashing
         run: |
@@ -72,7 +73,8 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: Samsung/CredData
+          repository: babenek/CredData
+          ref: auxiliary
 
       - name: Markup hashing
         run: |
@@ -169,7 +171,8 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: Samsung/CredData
+          repository: babenek/CredData
+          ref: auxiliary
 
       - name: Markup hashing
         run: |
@@ -350,7 +353,8 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: Samsung/CredData
+          repository: babenek/CredData
+          ref: auxiliary
 
       - name: Markup hashing
         run: |

From 6d072d44cdf88a6b4701899c6c1d6031ece1c472 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Thu, 8 Aug 2024 11:41:34 +0300
Subject: [PATCH 17/18] JWT fix BC score

---
 cicd/benchmark.txt | 52 +++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
index 3380e7756..4ec884387 100644
--- a/cicd/benchmark.txt
+++ b/cicd/benchmark.txt
@@ -1,4 +1,4 @@
-DATA: 16978521 interested lines. MARKUP: 61851 items
+DATA: 16978521 interested lines. MARKUP: 61852 items
 FileType           FileNumber    ValidLines    Positives    Negatives    Templates
 ---------------  ------------  ------------  -----------  -----------  -----------
                           194         28318           64          429           87
@@ -11,7 +11,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .backup                     1            62            1            1
 .bash                       2          2158                         2            1
 .bat                        4           233            1           13            2
-.bats                      15          2804           12           52            9
+.bats                      15          2804           14           50            9
 .bazel                      3           424                         8
 .build                      2            40                         3
 .bundle                     4          1512                       570
@@ -27,7 +27,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .cmd                        4           401            2            3
 .cnf                        8           858           18           45           18
 .coffee                     1           585                         2
-.conf                      60          4945           50           74           54
+.conf                      60          4945           53           71           54
 .config                    20           492           16           33            1
 .cpp                       15          5688            1           61
 .creds                      1            10            1            1
@@ -53,7 +53,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .erb                       13           323                        27
 .erl                        4            96                         8
 .ex                        25          4968            3          105            5
-.example                   17          1838           73           38           55
+.example                   17          1838           74           38           54
 .exs                       24          4842            3          189            4
 .ext                        5           211            1            4            2
 .fsproj                     1            75                         1
@@ -61,7 +61,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .gd                         1            37                         1
 .gml                        3          3075                        26
 .gni                        3          5017                        18
-.go                      1079        566327          619         4333          742
+.go                      1079        566327          621         4331          742
 .golden                     5          1168            1           14           29
 .gradle                    45          3265            4           91          100
 .graphql                    7           420                        13
@@ -74,16 +74,16 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .html                      53         15327           14          115           18
 .idl                        2           777                         4
 .iml                        6           699                        36
-.in                         6          2130            1           80           12
+.in                         6          2130            3           78           12
 .inc                        2            56                         2            1
 .ini                       11          1437           24           12           18
 .ipynb                      1           134                         5
-.j                          1           241                         4
+.j                          1           241            2            2
 .j2                        30          5530            6          213           10
 .java                     621        134132          314         1361          170
 .jenkinsfile                1            58            1            7
 .jinja2                     1            64                         2
-.js                       659        536413          521         2642          336
+.js                       659        536413          526         2637          336
 .json                     860      13670669          623        10948          140
 .jsp                       13          3202            1           42
 .jsx                        7           857                        19
@@ -105,12 +105,12 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .lock                      24        160912                       144
 .log                        2           199           38           52
 .lua                       10          1924                        37            3
-.m                         16         13358           11          151            3
+.m                         16         13358           11          152            3
 .manifest                   3           102                         3
 .markdown                   3           139                         3            1
 .markerb                    3            12                         3
 .marko                      1            21                         2
-.md                       673        149294          646         2366          671
+.md                       673        149294          658         2361          664
 .mdx                        3           549                         7
 .mjml                       1            18                         1
 .mjs                       22          4424           50          343
@@ -122,7 +122,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .mqh                        1          1023                         2
 .msg                        1         26644                         1
 .mysql                      1            36                                      2
-.ndjson                     2          5006           34          268            2
+.ndjson                     2          5006           37          266            2
 .nix                        4           211                        12
 .nolint                     1             2                         1
 .odd                        1          1281                        57
@@ -132,7 +132,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .patch                      4        109405                        27
 .pbxproj                    1           941                         1
 .pem                       48          1169           47            8
-.php                      371         75710          129         1770           80
+.php                      371         75710          130         1769           80
 .pl                        16         14727            6           47
 .pm                         3           744                         8
 .po                         3          2994                        15
@@ -150,7 +150,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .pug                        2           193                         2
 .purs                       1            69                         4
 .pxd                        1           150                         5            2
-.py                       890        291553          618         3466          748
+.py                       890        291553          626         3461          744
 .pyi                        4          1361                         9
 .pyp                        1           167                         1
 .pyx                        2          1094                        21
@@ -222,16 +222,16 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .yml                      418         36162          460          916          384
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
-TOTAL:                  10281      16978521         7500        59964         5228
-credsweeper result_cnt : 6594, lost_cnt : 0, true_cnt : 6346, false_cnt : 248
+TOTAL:                  10281      16978521         7541        59936         5216
+credsweeper result_cnt : 6566, lost_cnt : 0, true_cnt : 6348, false_cnt : 218
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
-API                                     123         3163          185         112   109     3   3345    14  0.000896  0.113821  0.995102  0.973214  0.886179  0.927660
+API                                     124         3162          185         112   109     3   3344    15  0.000896  0.120968  0.994814  0.973214  0.879032  0.923729
 AWS Client ID                           168           13            0         160   160     0     13     8  0.000000  0.047619  0.955801  1.000000  0.952381  0.975610
 AWS Multi                                75           12            0          87    75    11      1     0  0.916667  0.000000  0.873563  0.872093  1.000000  0.931677
 AWS S3 Bucket                            61           25            0          92    61    25      0     0  1.000000  0.000000  0.709302  0.709302  1.000000  0.829932
 Atlassian Old PAT token                  27          212            3          12     3     8    207    24  0.037209  0.888889  0.867769  0.272727  0.111111  0.157895
-Auth                                    407         2728           77         371   350    21   2784    57  0.007487  0.140049  0.975716  0.943396  0.859951  0.899743
+Auth                                    408         2727           77         371   350    21   2783    58  0.007489  0.142157  0.975405  0.943396  0.857843  0.898588
 Azure Access Token                       19            0            0          12    12     0      0     7            0.368421  0.631579  1.000000  0.631579  0.774194
 BASE64 Private Key                        7            2            0           7     7     0      2     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5     5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
@@ -250,19 +250,19 @@ Google API Key                           12            0            0          1
 Google Multi                             10            2            0          11    10     1      1     0  0.500000  0.000000  0.916667  0.909091  1.000000  0.952381
 Google OAuth Access Token                 3            0            0           3     3     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Grafana Provisioned API Key              22            1            0           5     5     0      1    17  0.000000  0.772727  0.260870  1.000000  0.227273  0.370370
-JSON Web Token                          169           61            0         158   137    21     40    32  0.344262  0.189349  0.769565  0.867089  0.810651  0.837920
+JSON Web Token                          169           61            0         130   130     0     61    39  0.000000  0.230769  0.830435  1.000000  0.769231  0.869565
 Jira / Confluence PAT token               0            4            0                 0     0      4     0  0.000000            1.000000
 Jira 2FA                                 14            6            0          10    10     0      6     4  0.000000  0.285714  0.800000  1.000000  0.714286  0.833333
-Key                                     483         8496          464         443   434     9   8951    49  0.001004  0.101449  0.993858  0.979684  0.898551  0.937365
-Nonce                                    83           53            0          85    79     6     47     4  0.113208  0.048193  0.926471  0.929412  0.951807  0.940476
+Key                                     493         8487          464         443   434     9   8942    59  0.001005  0.119675  0.992800  0.979684  0.880325  0.927350
+Nonce                                    90           46            0          85    84     1     45     6  0.021739  0.066667  0.948529  0.988235  0.933333  0.960000
 Other                                     0            0            5                 0     0      5     0  0.000000            1.000000
 PEM Private Key                        1019         1483            0        1023  1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1820         7476         2751        1681  1613    68  10159   207  0.006649  0.113736  0.977173  0.959548  0.886264  0.921451
-Salt                                     42           76            2          38    38     0     78     4  0.000000  0.095238  0.966667  1.000000  0.904762  0.950000
-Secret                                 1358        28497          869        1234  1229     5  29361   129  0.000170  0.094993  0.995639  0.995948  0.905007  0.948302
+Password                               1834         7472         2741        1681  1617    64  10149   217  0.006267  0.118321  0.976675  0.961927  0.881679  0.920057
+Salt                                     45           73            2          38    38     0     75     7  0.000000  0.155556  0.941667  1.000000  0.844444  0.915663
+Secret                                 1362        28494          868        1234  1229     5  29357   133  0.000170  0.097651  0.995508  0.995948  0.902349  0.946841
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   585         3975          438         512   503     9   4404    82  0.002039  0.140171  0.981793  0.982422  0.859829  0.917046
+Token                                   586         3974          438         512   503     9   4403    83  0.002040  0.141638  0.981593  0.982422  0.858362  0.916211
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
-URL Credentials                         197          126          250         190   190     0    376     7  0.000000  0.035533  0.987784  1.000000  0.964467  0.981912
-                                       7500        59964         5228        6605  6346   248  59716  1154  0.004136  0.153867  0.979219  0.962390  0.846133  0.900525
+URL Credentials                         197          127          249         190   190     0    376     7  0.000000  0.035533  0.987784  1.000000  0.964467  0.981912
+                                       7541        59936         5216        6577  6348   218  59718  1193  0.003637  0.158202  0.979089  0.966799  0.841798  0.899979

From 23d3abb39b1df39c852fa1c38519a29c45213139 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Thu, 8 Aug 2024 16:13:46 +0300
Subject: [PATCH 18/18] BM scores fix

---
 cicd/benchmark.txt | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
index 4ec884387..1626253d5 100644
--- a/cicd/benchmark.txt
+++ b/cicd/benchmark.txt
@@ -1,4 +1,4 @@
-DATA: 16978521 interested lines. MARKUP: 61852 items
+DATA: 16978521 interested lines. MARKUP: 61855 items
 FileType           FileNumber    ValidLines    Positives    Negatives    Templates
 ---------------  ------------  ------------  -----------  -----------  -----------
                           194         28318           64          429           87
@@ -27,7 +27,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .cmd                        4           401            2            3
 .cnf                        8           858           18           45           18
 .coffee                     1           585                         2
-.conf                      60          4945           53           71           54
+.conf                      60          4945           54           69           54
 .config                    20           492           16           33            1
 .cpp                       15          5688            1           61
 .creds                      1            10            1            1
@@ -54,14 +54,14 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .erl                        4            96                         8
 .ex                        25          4968            3          105            5
 .example                   17          1838           74           38           54
-.exs                       24          4842            3          189            4
+.exs                       24          4842            3          190            4
 .ext                        5           211            1            4            2
 .fsproj                     1            75                         1
 .g4                         2           201                         2
 .gd                         1            37                         1
 .gml                        3          3075                        26
 .gni                        3          5017                        18
-.go                      1079        566327          621         4331          742
+.go                      1079        566327          623         4329          742
 .golden                     5          1168            1           14           29
 .gradle                    45          3265            4           91          100
 .graphql                    7           420                        13
@@ -83,8 +83,8 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .java                     621        134132          314         1361          170
 .jenkinsfile                1            58            1            7
 .jinja2                     1            64                         2
-.js                       659        536413          526         2637          336
-.json                     860      13670669          623        10948          140
+.js                       659        536413          526         2638          336
+.json                     860      13670669          624        10946          140
 .jsp                       13          3202            1           42
 .jsx                        7           857                        19
 .jwt                        1             1            2
@@ -110,7 +110,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .markdown                   3           139                         3            1
 .markerb                    3            12                         3
 .marko                      1            21                         2
-.md                       673        149294          658         2361          664
+.md                       673        149294          658         2362          664
 .mdx                        3           549                         7
 .mjml                       1            18                         1
 .mjs                       22          4424           50          343
@@ -150,7 +150,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .pug                        2           193                         2
 .purs                       1            69                         4
 .pxd                        1           150                         5            2
-.py                       890        291553          626         3461          744
+.py                       890        291553          627         3460          744
 .pyi                        4          1361                         9
 .pyp                        1           167                         1
 .pyx                        2          1094                        21
@@ -222,8 +222,8 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .yml                      418         36162          460          916          384
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
-TOTAL:                  10281      16978521         7541        59936         5216
-credsweeper result_cnt : 6566, lost_cnt : 0, true_cnt : 6348, false_cnt : 218
+TOTAL:                  10281      16978521         7546        59932         5216
+credsweeper result_cnt : 6585, lost_cnt : 0, true_cnt : 6367, false_cnt : 218
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
 API                                     124         3162          185         112   109     3   3344    15  0.000896  0.120968  0.994814  0.973214  0.879032  0.923729
@@ -231,7 +231,7 @@ AWS Client ID                           168           13            0         16
 AWS Multi                                75           12            0          87    75    11      1     0  0.916667  0.000000  0.873563  0.872093  1.000000  0.931677
 AWS S3 Bucket                            61           25            0          92    61    25      0     0  1.000000  0.000000  0.709302  0.709302  1.000000  0.829932
 Atlassian Old PAT token                  27          212            3          12     3     8    207    24  0.037209  0.888889  0.867769  0.272727  0.111111  0.157895
-Auth                                    408         2727           77         371   350    21   2783    58  0.007489  0.142157  0.975405  0.943396  0.857843  0.898588
+Auth                                    408         2727           77         372   351    21   2783    57  0.007489  0.139706  0.975716  0.943548  0.860294  0.900000
 Azure Access Token                       19            0            0          12    12     0      0     7            0.368421  0.631579  1.000000  0.631579  0.774194
 BASE64 Private Key                        7            2            0           7     7     0      2     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5     5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
@@ -253,16 +253,16 @@ Grafana Provisioned API Key              22            1            0
 JSON Web Token                          169           61            0         130   130     0     61    39  0.000000  0.230769  0.830435  1.000000  0.769231  0.869565
 Jira / Confluence PAT token               0            4            0                 0     0      4     0  0.000000            1.000000
 Jira 2FA                                 14            6            0          10    10     0      6     4  0.000000  0.285714  0.800000  1.000000  0.714286  0.833333
-Key                                     493         8487          464         443   434     9   8942    59  0.001005  0.119675  0.992800  0.979684  0.880325  0.927350
-Nonce                                    90           46            0          85    84     1     45     6  0.021739  0.066667  0.948529  0.988235  0.933333  0.960000
+Key                                     497         8483          464         448   439     9   8938    58  0.001006  0.116700  0.992906  0.979911  0.883300  0.929101
+Nonce                                    90           47            0          84    83     1     46     7  0.021277  0.077778  0.941606  0.988095  0.922222  0.954023
 Other                                     0            0            5                 0     0      5     0  0.000000            1.000000
 PEM Private Key                        1019         1483            0        1023  1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1834         7472         2741        1681  1617    64  10149   217  0.006267  0.118321  0.976675  0.961927  0.881679  0.920057
-Salt                                     45           73            2          38    38     0     75     7  0.000000  0.155556  0.941667  1.000000  0.844444  0.915663
-Secret                                 1362        28494          868        1234  1229     5  29357   133  0.000170  0.097651  0.995508  0.995948  0.902349  0.946841
+Password                               1834         7473         2741        1691  1627    64  10150   207  0.006266  0.112868  0.977507  0.962153  0.887132  0.923121
+Salt                                     45           73            2          39    39     0     75     6  0.000000  0.133333  0.950000  1.000000  0.866667  0.928571
+Secret                                 1362        28492          868        1236  1231     5  29355   131  0.000170  0.096182  0.995573  0.995955  0.903818  0.947652
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   586         3974          438         512   503     9   4403    83  0.002040  0.141638  0.981593  0.982422  0.858362  0.916211
+Token                                   586         3974          438         513   504     9   4403    82  0.002040  0.139932  0.981793  0.982456  0.860068  0.917197
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
-URL Credentials                         197          127          249         190   190     0    376     7  0.000000  0.035533  0.987784  1.000000  0.964467  0.981912
-                                       7541        59936         5216        6577  6348   218  59718  1193  0.003637  0.158202  0.979089  0.966799  0.841798  0.899979
+URL Credentials                         198          127          249         190   190     0    376     8  0.000000  0.040404  0.986063  1.000000  0.959596  0.979381
+                                       7546        59932         5216        6596  6367   218  59714  1179  0.003637  0.156242  0.979297  0.966894  0.843758  0.901139