From 21bef6dcf4001b8a8afaaa001a6c7f1004772baf Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@users.noreply.github.com>
Date: Wed, 23 Aug 2023 13:51:28 +0300
Subject: [PATCH] Phone number detection (#409)

* [skip actions] [phone] 2023-08-16T10:14:11+03:00

* [skip actions] [phone] 2023-08-16T10:57:52+03:00

* [skip actions] [phone] 2023-08-16T12:22:56+03:00

* [skip actions] [phone] 2023-08-16T12:41:28+03:00

* Check phone number with length

* custom_benchmark

* fix test samples

* fix test

* fix benchmark

* Update .github/workflows/benchmark.yml
---
 cicd/benchmark.txt                       |  14 +-
 credsweeper/filters/__init__.py          |   1 +
 credsweeper/filters/value_phone_check.py | 258 +++++++++++++++++++++++
 credsweeper/rules/config.yaml            |  12 ++
 tests/__init__.py                        |   8 +-
 tests/data/depth_3.json                  |  88 ++++++++
 tests/data/ml_threshold_0.json           |  88 ++++++++
 tests/data/output.json                   |  88 ++++++++
 tests/samples/phones                     |   7 +
 9 files changed, 553 insertions(+), 11 deletions(-)
 create mode 100644 credsweeper/filters/value_phone_check.py
 create mode 100644 tests/samples/phones

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
index b3b06300a..785cd5331 100644
--- a/cicd/benchmark.txt
+++ b/cicd/benchmark.txt
@@ -1,25 +1,25 @@
-DATA: 19434458 valid lines. MARKUP: 73696 items
+DATA: 19434458 valid lines. MARKUP: 73890 items
 Category                      Positives    Negatives    Template
 --------------------------  -----------  -----------  ----------
 Authentication Key & Token           67            1          31
 Generic Secret                     1055           15         203
 Generic Token                       333           45         558
-Other                               515        63135         635
+Other                               708        63136         635
 Password                           1403          110        4170
 Predefined Pattern                  326            2          40
 Private Key                        1001            1           3
 Seed, Salt, Nonce                    39            4           4
-TOTAL:                             4739        63313        5644
-Detected Credentials: 4744
-credsweeper result_cnt : 4227, lost_cnt : 0, true_cnt : 3857, false_cnt : 370
+TOTAL:                             4932        63314        5644
+Detected Credentials: 4938
+credsweeper result_cnt : 4421, lost_cnt : 0, true_cnt : 4050, false_cnt : 371
 Category                      TP    FP        TN    FN         FPR        FNR       ACC       PRC       RCL        F1
 --------------------------  ----  ----  --------  ----  ----------  ---------  --------  --------  --------  --------
 Authentication Key & Token    51     4        28    16  0.125       0.238806   0.79798   0.927273  0.761194  0.836066
 Generic Secret               971     2       216    84  0.00917431  0.0796209  0.932443  0.997945  0.920379  0.957594
 Generic Token                287     7       596    46  0.0116086   0.138138   0.943376  0.97619   0.861862  0.91547
-Other                        253   237     63533   262  0.00371648  0.508738   0.992238  0.516327  0.491262  0.503483
+Other                        446   238     63533   262  0.0037321   0.370057   0.992246  0.652047  0.629943  0.640805
 Password                     984   116      4164   419  0.0271028   0.298646   0.90586   0.894545  0.701354  0.786256
 Predefined Pattern           309     2        40    17  0.0476191   0.0521472  0.94837   0.993569  0.947853  0.970173
 Private Key                  967     0         4    34              0.033966   0.966169  1         0.966034  0.982724
 Seed, Salt, Nonce             35     2         6     4  0.25        0.102564   0.87234   0.945946  0.897436  0.921053
-                            3857   370  19429349   882  1.904e-05   0.186115   0.999936  0.912467  0.813885  0.860361
+                            4050   371  19429155   882  1.909e-05   0.178832   0.999936  0.916082  0.821168  0.866032
diff --git a/credsweeper/filters/__init__.py b/credsweeper/filters/__init__.py
index ce16128e4..7b7076732 100644
--- a/credsweeper/filters/__init__.py
+++ b/credsweeper/filters/__init__.py
@@ -29,6 +29,7 @@
 from credsweeper.filters.value_pattern_check import ValuePatternCheck
 from credsweeper.filters.value_pattern_length_check import ValuePatternLengthCheck
 from credsweeper.filters.value_pem_pattern_check import ValuePemPatternCheck
+from credsweeper.filters.value_phone_check import ValuePhoneCheck
 from credsweeper.filters.value_similarity_check import ValueSimilarityCheck
 from credsweeper.filters.value_split_keyword_check import ValueSplitKeywordCheck
 from credsweeper.filters.value_string_type_check import ValueStringTypeCheck
diff --git a/credsweeper/filters/value_phone_check.py b/credsweeper/filters/value_phone_check.py
new file mode 100644
index 000000000..55c9826bf
--- /dev/null
+++ b/credsweeper/filters/value_phone_check.py
@@ -0,0 +1,258 @@
+from credsweeper.config import Config
+from credsweeper.credentials import LineData
+from credsweeper.file_handler.analysis_target import AnalysisTarget
+from credsweeper.filters import Filter
+
+
+class ValuePhoneCheck(Filter):
+    """Check that value may be a phone number"""
+
+    PREFIX_LENGTH = {
+        "1": 10,
+        "20": 10,
+        "211": 7,
+        "212": 9,
+        "213": 9,
+        "216": 8,
+        "218": 10,
+        "220": 7,
+        "221": 9,
+        "222": 8,
+        "223": 8,
+        "224": 9,
+        "225": 8,
+        "226": 8,
+        "227": 8,
+        "228": 8,
+        "229": 8,
+        "230": 8,
+        "231": (8, 9),
+        "232": 8,
+        "233": 9,
+        "234": 8,
+        "235": 6,
+        "236": 8,
+        "237": 9,
+        "238": 7,
+        "239": 7,
+        "240": 9,
+        "241": 7,
+        "242": 9,
+        "243": 7,
+        "244": 9,
+        "245": 9,
+        "246": 7,
+        "248": 7,
+        "249": 7,
+        "250": 9,
+        "251": 9,
+        "252": (8, 9),
+        "253": 10,
+        "254": 10,
+        "255": 7,
+        "256": 7,
+        "257": 8,
+        "258": 12,
+        "260": 9,
+        "261": 7,
+        "262": (9, 10),
+        "263": 9,
+        "264": 7,
+        "265": [7, 9],
+        "266": 8,
+        "267": 7,
+        "268": 8,
+        "269": 7,
+        "27": 9,
+        "290": 4,
+        "291": 7,
+        "297": 7,
+        "298": 5,
+        "299": 6,
+        "30": 10,
+        "31": 9,
+        "32": 9,
+        "33": 9,
+        "34": 9,
+        "350": 8,
+        "351": 9,
+        "352": 9,
+        "353": 9,
+        "354": 7,
+        "355": 9,
+        "356": 8,
+        "357": 8,
+        "358": (8, 11),
+        "359": 9,
+        "36": 9,
+        "370": 8,
+        "371": 8,
+        "372": 8,
+        "373": 8,
+        "374": 6,
+        "375": 9,
+        "376": 6,
+        "377": 8,
+        "378": 10,
+        "379": 10,
+        "380": 9,
+        "381": 9,
+        "382": 8,
+        "383": 8,
+        "385": 9,
+        "386": 9,
+        "387": 8,
+        "389": 8,
+        "39": 10,
+        "40": 10,
+        "41": 9,
+        "420": 9,
+        "421": 9,
+        "423": 7,
+        "43": (10, 11),
+        "44": 10,
+        "45": 8,
+        "46": 7,
+        "47": (8, 10),
+        "48": 9,
+        "49": 10,
+        "500": 5,
+        "501": 7,
+        "502": 8,
+        "503": 8,
+        "504": 8,
+        "505": 8,
+        "506": 8,
+        "507": 8,
+        "508": 6,
+        "509": 8,
+        "51": 9,
+        "52": 10,
+        "53": 8,
+        "54": 10,
+        "55": 11,
+        "56": 9,
+        "57": 10,
+        "58": 7,
+        "590": [9, 12],
+        "591": 9,
+        "592": 7,
+        "593": 9,
+        "594": 9,
+        "595": 9,
+        "596": 9,
+        "597": (6, 7),
+        "598": 8,
+        "599": 7,
+        "60": 7,
+        "61": (9, 10),
+        "62": 11,
+        "63": 10,
+        "64": (8, 9),
+        "65": 8,
+        "66": 9,
+        "670": 7,
+        "672": [6, 9],
+        "673": 7,
+        "674": 7,
+        "675": 8,
+        "676": 5,
+        "677": 7,
+        "678": 5,
+        "679": 7,
+        "680": 7,
+        "681": 6,
+        "682": 5,
+        "683": 4,
+        "685": 5,
+        "686": 8,
+        "687": 6,
+        "688": 5,
+        "689": 8,
+        "690": 5,
+        "691": 7,
+        "692": 7,
+        "7": 10,
+        "81": 10,
+        "82": (9, 10),
+        "84": 9,
+        "850": (3, 10),
+        "852": 8,
+        "853": 8,
+        "855": 9,
+        "856": (8, 9),
+        "86": 11,
+        "870": 9,
+        "880": 10,
+        "886": 9,
+        "90": 11,
+        "91": 10,
+        "92": 10,
+        "93": 9,
+        "94": 7,
+        "95": (7, 10),
+        "960": 7,
+        "961": (7, 8),
+        "962": (8, 9),
+        "963": 7,
+        "964": 10,
+        "965": 8,
+        "966": 9,
+        "967": 9,
+        "968": 8,
+        "970": 9,
+        "971": 9,
+        "972": 9,
+        "973": 8,
+        "974": 8,
+        "975": 7,
+        "976": 8,
+        "977": 10,
+        "98": 11,
+        "992": 9,
+        "993": 8,
+        "994": 9,
+        "995": 9,
+        "996": 9,
+        "998": 9,
+    }
+
+    def __init__(self, config: Config = None) -> None:
+        self.prefix_limit = 1 + max(len(x) for x in self.PREFIX_LENGTH.keys())
+        pass
+
+    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
+        """Run filter checks on received credential candidate data 'line_data'.
+
+        Args:
+            line_data: credential candidate data
+            target: multiline target from which line data was obtained
+
+        Return:
+            True, if the value does not fit any known phone number length. False if it may be a phone number
+
+        """
+        if line_data.value is None:
+            return True
+
+        # https://en.wikipedia.org/wiki/List_of_country_calling_codes
+        # https://en.wikipedia.org/wiki/List_of_mobile_telephone_prefixes_by_country
+
+        # skip the leading '+' - valid as long as the rule regex requires it at start
+        value = line_data.value[1:]
+        value_len = len(value)
+
+        for prefix_size in range(1, self.prefix_limit):
+            key = value[:prefix_size]
+            if key in self.PREFIX_LENGTH:
+                phone_length = value_len - prefix_size
+                length = self.PREFIX_LENGTH[key]
+                if isinstance(length, int) and phone_length == length:
+                    break
+                elif isinstance(length, list) and phone_length in length:
+                    break
+                elif isinstance(length, tuple) and length[0] <= phone_length <= length[1]:
+                    break
+        else:
+            return True
+        return False
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
index 2bf5b1f7e..109c99e29 100644
--- a/credsweeper/rules/config.yaml
+++ b/credsweeper/rules/config.yaml
@@ -1,3 +1,15 @@
+- name: Phone
+  severity: info
+  type: pattern
+  values:
+    - (^|[^0-9A-Za-z])(?P<value>\+[1-9][0-9]{6,14})([^=0-9A-Za-z]|$)
+  filter_type:
+    - ValuePhoneCheck
+  min_line_len: 10
+  required_substrings:
+    - "+"
+  doc_available: false
+
 - name: VIN
   severity: info
   type: pattern
diff --git a/tests/__init__.py b/tests/__init__.py
index a71ae6d49..ce66789cc 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,14 +1,14 @@
 from pathlib import Path
 
 # total number of files in test samples
-SAMPLES_FILES_COUNT: int = 108
+SAMPLES_FILES_COUNT: int = 109
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 108
-SAMPLES_CRED_LINE_COUNT: int = 119
+SAMPLES_CRED_COUNT: int = 112
+SAMPLES_CRED_LINE_COUNT: int = 123
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 102
+SAMPLES_POST_CRED_COUNT: int = 106
 
 # with option --doc
 SAMPLES_IN_DOC = 72
diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json
index ecf4d1183..2c52d2559 100644
--- a/tests/data/depth_3.json
+++ b/tests/data/depth_3.json
@@ -3012,6 +3012,94 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+82000000000 - 9 digits after prefix. valid. various length",
+                "line_num": 1,
+                "path": "tests/samples/phones",
+                "info": "tests/samples/phones|RAW",
+                "value": "+82000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.207518749639422,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+820000000000 - 10 digits after prefix. valid. various length",
+                "line_num": 2,
+                "path": "tests/samples/phones",
+                "info": "tests/samples/phones|RAW",
+                "value": "+820000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.1451104143815827,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+380000000000 - 10 digits. valid. fixed length",
+                "line_num": 4,
+                "path": "tests/samples/phones",
+                "info": "tests/samples/phones|RAW",
+                "value": "+380000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.1451104143815827,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+590000000000000 - 12 digits. valid. random length",
+                "line_num": 6,
+                "path": "tests/samples/phones",
+                "info": "tests/samples/phones|RAW",
+                "value": "+590000000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 0.9933927290103626,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/data/ml_threshold_0.json b/tests/data/ml_threshold_0.json
index 392e078f0..a4f31d33a 100644
--- a/tests/data/ml_threshold_0.json
+++ b/tests/data/ml_threshold_0.json
@@ -1946,6 +1946,94 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+82000000000 - 9 digits after prefix. valid. various length",
+                "line_num": 1,
+                "path": "tests/samples/phones",
+                "info": "",
+                "value": "+82000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.207518749639422,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+820000000000 - 10 digits after prefix. valid. various length",
+                "line_num": 2,
+                "path": "tests/samples/phones",
+                "info": "",
+                "value": "+820000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.1451104143815827,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+380000000000 - 10 digits. valid. fixed length",
+                "line_num": 4,
+                "path": "tests/samples/phones",
+                "info": "",
+                "value": "+380000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.1451104143815827,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+590000000000000 - 12 digits. valid. random length",
+                "line_num": 6,
+                "path": "tests/samples/phones",
+                "info": "",
+                "value": "+590000000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 0.9933927290103626,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/data/output.json b/tests/data/output.json
index 00db08415..3c687e543 100644
--- a/tests/data/output.json
+++ b/tests/data/output.json
@@ -1858,6 +1858,94 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+82000000000 - 9 digits after prefix. valid. various length",
+                "line_num": 1,
+                "path": "tests/samples/phones",
+                "info": "",
+                "value": "+82000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.207518749639422,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+820000000000 - 10 digits after prefix. valid. various length",
+                "line_num": 2,
+                "path": "tests/samples/phones",
+                "info": "",
+                "value": "+820000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.1451104143815827,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+380000000000 - 10 digits. valid. fixed length",
+                "line_num": 4,
+                "path": "tests/samples/phones",
+                "info": "",
+                "value": "+380000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 1.1451104143815827,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Phone",
+        "severity": "info",
+        "line_data_list": [
+            {
+                "line": "+590000000000000 - 12 digits. valid. random length",
+                "line_num": 6,
+                "path": "tests/samples/phones",
+                "info": "",
+                "value": "+590000000000000",
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 0.9933927290103626,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "NOT_AVAILABLE",
diff --git a/tests/samples/phones b/tests/samples/phones
new file mode 100644
index 000000000..78ebdc57e
--- /dev/null
+++ b/tests/samples/phones
@@ -0,0 +1,7 @@
++82000000000 - 9 digits after prefix. valid. various length
++820000000000 - 10 digits after prefix. valid. various length
++8212345678 - 8 digits after prefix. invalid. various length
++380000000000 - 10 digits. valid. fixed length
++3801234567890 - 11 digits. invalid. fixed length
++590000000000000 - 12 digits. valid. random length
++59000000000000 - 11 digits. invalid. random length
\ No newline at end of file