From 21bef6dcf4001b8a8afaaa001a6c7f1004772baf Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Wed, 23 Aug 2023 13:51:28 +0300 Subject: [PATCH] Phone number detection (#409) * [skip actions] [phone] 2023-08-16T10:14:11+03:00 * [skip actions] [phone] 2023-08-16T10:57:52+03:00 * [skip actions] [phone] 2023-08-16T12:22:56+03:00 * [skip actions] [phone] 2023-08-16T12:41:28+03:00 * Check phone number with length * custom_benchmark * fix test samples * fix test * fix benchmark * Update .github/workflows/benchmark.yml --- cicd/benchmark.txt | 14 +- credsweeper/filters/__init__.py | 1 + credsweeper/filters/value_phone_check.py | 258 +++++++++++++++++++++++ credsweeper/rules/config.yaml | 12 ++ tests/__init__.py | 8 +- tests/data/depth_3.json | 88 ++++++++ tests/data/ml_threshold_0.json | 88 ++++++++ tests/data/output.json | 88 ++++++++ tests/samples/phones | 7 + 9 files changed, 553 insertions(+), 11 deletions(-) create mode 100644 credsweeper/filters/value_phone_check.py create mode 100644 tests/samples/phones diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index b3b06300a..785cd5331 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -1,25 +1,25 @@ -DATA: 19434458 valid lines. MARKUP: 73696 items +DATA: 19434458 valid lines. 
MARKUP: 73890 items Category Positives Negatives Template -------------------------- ----------- ----------- ---------- Authentication Key & Token 67 1 31 Generic Secret 1055 15 203 Generic Token 333 45 558 -Other 515 63135 635 +Other 708 63136 635 Password 1403 110 4170 Predefined Pattern 326 2 40 Private Key 1001 1 3 Seed, Salt, Nonce 39 4 4 -TOTAL: 4739 63313 5644 -Detected Credentials: 4744 -credsweeper result_cnt : 4227, lost_cnt : 0, true_cnt : 3857, false_cnt : 370 +TOTAL: 4932 63314 5644 +Detected Credentials: 4938 +credsweeper result_cnt : 4421, lost_cnt : 0, true_cnt : 4050, false_cnt : 371 Category TP FP TN FN FPR FNR ACC PRC RCL F1 -------------------------- ---- ---- -------- ---- ---------- --------- -------- -------- -------- -------- Authentication Key & Token 51 4 28 16 0.125 0.238806 0.79798 0.927273 0.761194 0.836066 Generic Secret 971 2 216 84 0.00917431 0.0796209 0.932443 0.997945 0.920379 0.957594 Generic Token 287 7 596 46 0.0116086 0.138138 0.943376 0.97619 0.861862 0.91547 -Other 253 237 63533 262 0.00371648 0.508738 0.992238 0.516327 0.491262 0.503483 +Other 446 238 63533 262 0.0037321 0.370057 0.992246 0.652047 0.629943 0.640805 Password 984 116 4164 419 0.0271028 0.298646 0.90586 0.894545 0.701354 0.786256 Predefined Pattern 309 2 40 17 0.0476191 0.0521472 0.94837 0.993569 0.947853 0.970173 Private Key 967 0 4 34 0.033966 0.966169 1 0.966034 0.982724 Seed, Salt, Nonce 35 2 6 4 0.25 0.102564 0.87234 0.945946 0.897436 0.921053 - 3857 370 19429349 882 1.904e-05 0.186115 0.999936 0.912467 0.813885 0.860361 + 4050 371 19429155 882 1.909e-05 0.178832 0.999936 0.916082 0.821168 0.866032 diff --git a/credsweeper/filters/__init__.py b/credsweeper/filters/__init__.py index ce16128e4..7b7076732 100644 --- a/credsweeper/filters/__init__.py +++ b/credsweeper/filters/__init__.py @@ -29,6 +29,7 @@ from credsweeper.filters.value_pattern_check import ValuePatternCheck from credsweeper.filters.value_pattern_length_check import ValuePatternLengthCheck from 
from typing import Optional

from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters import Filter


class ValuePhoneCheck(Filter):
    """Check that a value may be a phone number.

    The value is expected to be '+' followed by digits. The digits are split
    into a country calling code and a subscriber part, and the subscriber part
    length is compared with the lengths listed for that calling code.

    References:
        https://en.wikipedia.org/wiki/List_of_country_calling_codes
        https://en.wikipedia.org/wiki/List_of_mobile_telephone_prefixes_by_country
    """

    # Maps a country calling code to the accepted number of digits after it.
    # The value type selects how the length is matched (see _length_matches):
    #   int   - exactly that many digits
    #   tuple - inclusive range (min, max)
    #   list  - any one of the listed lengths
    PREFIX_LENGTH = {
        "1": 10,
        "20": 10,
        "211": 7,
        "212": 9,
        "213": 9,
        "216": 8,
        "218": 10,
        "220": 7,
        "221": 9,
        "222": 8,
        "223": 8,
        "224": 9,
        "225": 8,
        "226": 8,
        "227": 8,
        "228": 8,
        "229": 8,
        "230": 8,
        "231": (8, 9),
        "232": 8,
        "233": 9,
        "234": 8,
        "235": 6,
        "236": 8,
        "237": 9,
        "238": 7,
        "239": 7,
        "240": 9,
        "241": 7,
        "242": 9,
        "243": 7,
        "244": 9,
        "245": 9,
        "246": 7,
        "248": 7,
        "249": 7,
        "250": 9,
        "251": 9,
        "252": (8, 9),
        "253": 10,
        "254": 10,
        "255": 7,
        "256": 7,
        "257": 8,
        "258": 12,
        "260": 9,
        "261": 7,
        "262": (9, 10),
        "263": 9,
        "264": 7,
        "265": [7, 9],
        "266": 8,
        "267": 7,
        "268": 8,
        "269": 7,
        "27": 9,
        "290": 4,
        "291": 7,
        "297": 7,
        "298": 5,
        "299": 6,
        "30": 10,
        "31": 9,
        "32": 9,
        "33": 9,
        "34": 9,
        "350": 8,
        "351": 9,
        "352": 9,
        "353": 9,
        "354": 7,
        "355": 9,
        "356": 8,
        "357": 8,
        "358": (8, 11),
        "359": 9,
        "36": 9,
        "370": 8,
        "371": 8,
        "372": 8,
        "373": 8,
        "374": 6,
        "375": 9,
        "376": 6,
        "377": 8,
        "378": 10,
        "379": 10,
        "380": 9,
        "381": 9,
        "382": 8,
        "383": 8,
        "385": 9,
        "386": 9,
        "387": 8,
        "389": 8,
        "39": 10,
        "40": 10,
        "41": 9,
        "420": 9,
        "421": 9,
        "423": 7,
        "43": (10, 11),
        "44": 10,
        "45": 8,
        "46": 7,
        "47": (8, 10),
        "48": 9,
        "49": 10,
        "500": 5,
        "501": 7,
        "502": 8,
        "503": 8,
        "504": 8,
        "505": 8,
        "506": 8,
        "507": 8,
        "508": 6,
        "509": 8,
        "51": 9,
        "52": 10,
        "53": 8,
        "54": 10,
        "55": 11,
        "56": 9,
        "57": 10,
        "58": 7,
        "590": [9, 12],
        "591": 9,
        "592": 7,
        "593": 9,
        "594": 9,
        "595": 9,
        "596": 9,
        "597": (6, 7),
        "598": 8,
        "599": 7,
        "60": 7,
        "61": (9, 10),
        "62": 11,
        "63": 10,
        "64": (8, 9),
        "65": 8,
        "66": 9,
        "670": 7,
        "672": [6, 9],
        "673": 7,
        "674": 7,
        "675": 8,
        "676": 5,
        "677": 7,
        "678": 5,
        "679": 7,
        "680": 7,
        "681": 6,
        "682": 5,
        "683": 4,
        "685": 5,
        "686": 8,
        "687": 6,
        "688": 5,
        "689": 8,
        "690": 5,
        "691": 7,
        "692": 7,
        "7": 10,
        "81": 10,
        "82": (9, 10),
        "84": 9,
        "850": (3, 10),
        "852": 8,
        "853": 8,
        "855": 9,
        "856": (8, 9),
        "86": 11,
        "870": 9,
        "880": 10,
        "886": 9,
        "90": 11,
        "91": 10,
        "92": 10,
        "93": 9,
        "94": 7,
        "95": (7, 10),
        "960": 7,
        "961": (7, 8),
        "962": (8, 9),
        "963": 7,
        "964": 10,
        "965": 8,
        "966": 9,
        "967": 9,
        "968": 8,
        "970": 9,
        "971": 9,
        "972": 9,
        "973": 8,
        "974": 8,
        "975": 7,
        "976": 8,
        "977": 10,
        "98": 11,
        "992": 9,
        "993": 8,
        "994": 9,
        "995": 9,
        "996": 9,
        "998": 9,
    }

    def __init__(self, config: Optional[Config] = None) -> None:
        """Prepare the filter.

        Args:
            config: accepted for a uniform filter constructor signature; not used here

        """
        # Exclusive upper bound for range(): one more than the longest calling code.
        self.prefix_limit = 1 + max(len(prefix) for prefix in self.PREFIX_LENGTH)

    @staticmethod
    def _length_matches(expected, actual: int) -> bool:
        """Tell whether 'actual' digits satisfy one PREFIX_LENGTH entry."""
        if isinstance(expected, int):
            return actual == expected
        if isinstance(expected, list):
            return actual in expected
        if isinstance(expected, tuple):
            return expected[0] <= actual <= expected[1]
        return False

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if the value is None or no known calling code prefix leaves
            an accepted number of digits. False, if the value has a known
            calling code followed by an accepted number of digits.

        """
        if line_data.value is None:
            return True

        # The first character is dropped unconditionally: this relies on the
        # rule regex requiring '+' at the start of the value.
        digits = line_data.value[1:]
        digits_len = len(digits)

        # Calling codes have different lengths, so every prefix size is tried.
        for prefix_size in range(1, self.prefix_limit):
            expected = self.PREFIX_LENGTH.get(digits[:prefix_size])
            if expected is not None and self._length_matches(expected, digits_len - prefix_size):
                return False
        return True
} ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+82000000000 - 9 digits after prefix. valid. various length", + "line_num": 1, + "path": "tests/samples/phones", + "info": "tests/samples/phones|RAW", + "value": "+82000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.207518749639422, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+820000000000 - 10 digits after prefix. valid. various length", + "line_num": 2, + "path": "tests/samples/phones", + "info": "tests/samples/phones|RAW", + "value": "+820000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.1451104143815827, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+380000000000 - 10 digits. valid. fixed length", + "line_num": 4, + "path": "tests/samples/phones", + "info": "tests/samples/phones|RAW", + "value": "+380000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.1451104143815827, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+590000000000000 - 12 digits. valid. 
random length", + "line_num": 6, + "path": "tests/samples/phones", + "info": "tests/samples/phones|RAW", + "value": "+590000000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 0.9933927290103626, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/ml_threshold_0.json b/tests/data/ml_threshold_0.json index 392e078f0..a4f31d33a 100644 --- a/tests/data/ml_threshold_0.json +++ b/tests/data/ml_threshold_0.json @@ -1946,6 +1946,94 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+82000000000 - 9 digits after prefix. valid. various length", + "line_num": 1, + "path": "tests/samples/phones", + "info": "", + "value": "+82000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.207518749639422, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+820000000000 - 10 digits after prefix. valid. various length", + "line_num": 2, + "path": "tests/samples/phones", + "info": "", + "value": "+820000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.1451104143815827, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+380000000000 - 10 digits. valid. 
fixed length", + "line_num": 4, + "path": "tests/samples/phones", + "info": "", + "value": "+380000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.1451104143815827, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+590000000000000 - 12 digits. valid. random length", + "line_num": 6, + "path": "tests/samples/phones", + "info": "", + "value": "+590000000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 0.9933927290103626, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/output.json b/tests/data/output.json index 00db08415..3c687e543 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -1858,6 +1858,94 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+82000000000 - 9 digits after prefix. valid. various length", + "line_num": 1, + "path": "tests/samples/phones", + "info": "", + "value": "+82000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.207518749639422, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+820000000000 - 10 digits after prefix. valid. 
various length", + "line_num": 2, + "path": "tests/samples/phones", + "info": "", + "value": "+820000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.1451104143815827, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+380000000000 - 10 digits. valid. fixed length", + "line_num": 4, + "path": "tests/samples/phones", + "info": "", + "value": "+380000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 1.1451104143815827, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Phone", + "severity": "info", + "line_data_list": [ + { + "line": "+590000000000000 - 12 digits. valid. random length", + "line_num": 6, + "path": "tests/samples/phones", + "info": "", + "value": "+590000000000000", + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 0.9933927290103626, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/samples/phones b/tests/samples/phones new file mode 100644 index 000000000..78ebdc57e --- /dev/null +++ b/tests/samples/phones @@ -0,0 +1,7 @@ ++82000000000 - 9 digits after prefix. valid. various length ++820000000000 - 10 digits after prefix. valid. various length ++8212345678 - 8 digits after prefix. invalid. various length ++380000000000 - 10 digits. valid. fixed length ++3801234567890 - 11 digits. invalid. fixed length ++590000000000000 - 12 digits. valid. random length ++59000000000000 - 11 digits. invalid. random length \ No newline at end of file