Merge branch 'subhashtext' into ml

Samsung · Aug 14, 2024 · 4582016
2 parents ff705d2 + 0ce84fc
commit 4582016


Showing 14 changed files with 1,463 additions and 899 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -22,8 +22,7 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: babenek/CredData
-          ref: abspos
+          repository: Samsung/CredData
 
       - name: Markup hashing
         run: |
@@ -73,8 +72,7 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: babenek/CredData
-          ref: abspos
+          repository: Samsung/CredData
 
       - name: Markup hashing
         run: |
@@ -171,8 +169,7 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: babenek/CredData
-          ref: abspos
+          repository: Samsung/CredData
 
       - name: Markup hashing
         run: |
@@ -354,8 +351,7 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: babenek/CredData
-          ref: abspos
+          repository: Samsung/CredData
 
       - name: Markup hashing
         run: |

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
@@ -1,4 +1,6 @@
-DATA: 16348035 interested lines. MARKUP: 62633 items
+META MD5 6e26189b51be42fd388d39547ee97d9b
+DATA MD5 4d496bb06291caeab62a6eadaeeddf83
+DATA: 16345157 interested lines. MARKUP: 62633 items
 FileType           FileNumber    ValidLines    Positives    Negatives    Templates
 ---------------  ------------  ------------  -----------  -----------  -----------
                           194         28318           66          427           87
@@ -33,7 +35,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .creds                      1            10            1            1
 .crlf                       1            27            1
 .crt                        2          4979                       253
-.cs                       268         82410          158          910           94
+.cs                       268         79532          158          910           94
 .cshtml                     5           180                        12
 .csp                        3           379                        11
 .csproj                     1            14                         1
@@ -110,7 +112,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .markdown                   3           139                         3            1
 .markerb                    3            12                         3
 .marko                      1            21                         2
-.md                       674        149399          723         2370          662
+.md                       674        149399          723         2372          661
 .mdx                        3           549                         7
 .mjml                       1            18                         1
 .mjs                       22          4424           78          343
@@ -220,47 +222,47 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .yml                      418         36162          549          912          384
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
-TOTAL:                  10259      16348035         8766        59711         5180
-credsweeper result_cnt : 7827, lost_cnt : 0, true_cnt : 7644, false_cnt : 183
+TOTAL:                  10259      16345157         8766        59713         5179
+credsweeper result_cnt : 7735, lost_cnt : 0, true_cnt : 7513, false_cnt : 222
 Rules                             Positives    Negatives    Templates    Reported    TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  ----  ----  -----  ----  --------  --------  --------  --------  --------  --------
-API                                     128         3130          185         110   110     0   3315    18  0.000000  0.140625  0.994772  1.000000  0.859375  0.924370
+API                                     128         3130          185         113   111     2   3313    17  0.000603  0.132812  0.994482  0.982301  0.867188  0.921162
 AWS Client ID                           167           18            0         160   160     0     18     7  0.000000  0.041916  0.962162  1.000000  0.958084  0.978593
 AWS Multi                                75           14            0          87    75    11      3     0  0.785714  0.000000  0.876404  0.872093  1.000000  0.931677
 AWS S3 Bucket                            66           24            0          91    65    24      0     1  1.000000  0.015152  0.722222  0.730337  0.984848  0.838710
 Atlassian Old PAT token                  27          208            3          12     3     8    203    24  0.037915  0.888889  0.865546  0.272727  0.111111  0.157895
-Auth                                    412         2724           76         385   374    11   2789    38  0.003929  0.092233  0.984745  0.971429  0.907767  0.938519
+Auth                                    412         2724           76         377   358    19   2781    54  0.006786  0.131068  0.977273  0.949602  0.868932  0.907478
 Azure Access Token                       19            0            0          12    12     0      0     7            0.368421  0.631579  1.000000  0.631579  0.774194
 BASE64 Private Key                        7            2            0           7     7     0      2     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5     5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
 Bitbucket Client ID                     142         1808            9          48    28    19   1798   114  0.010457  0.802817  0.932108  0.595745  0.197183  0.296296
 Bitbucket Client Secret                 230          527           10          40    29    11    526   201  0.020484  0.873913  0.723598  0.725000  0.126087  0.214815
-Certificate                              25          466            1          26    20     6    461     5  0.012848  0.200000  0.977642  0.769231  0.800000  0.784314
+Certificate                              25          466            1          27    20     7    460     5  0.014989  0.200000  0.975610  0.740741  0.800000  0.769231
 Credential                               94          154           74          84    84     0    228    10  0.000000  0.106383  0.968944  1.000000  0.893617  0.943820
 Docker Swarm Token                        2            0            0           1     1     0      0     1            0.500000  0.500000  1.000000  0.500000  0.666667
 Dropbox App secret                       64          114            0          46    35    10    104    29  0.087719  0.453125  0.780899  0.777778  0.546875  0.642202
 Facebook Access Token                     0            1            0                 0     0      1     0  0.000000            1.000000
 Firebase Domain                           6            1            0           7     6     1      0     0  1.000000  0.000000  0.857143  0.857143  1.000000  0.923077
 Github Old Token                          1            0            0           1     1     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Gitlab Feed Token                       189          450           87          56    44    11    526   145  0.020484  0.767196  0.785124  0.800000  0.232804  0.360656
-Gitlab Incoming Email Token              37            4            0          22    19     3      1    18  0.750000  0.486486  0.487805  0.863636  0.513514  0.644068
+Gitlab Incoming Email Token              37            4            0          21    19     2      2    18  0.500000  0.486486  0.512195  0.904762  0.513514  0.655172
 Google API Key                           12            0            0          12    12     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Google Multi                             10            2            0          11    10     1      1     0  0.500000  0.000000  0.916667  0.909091  1.000000  0.952381
 Google OAuth Access Token                 3            0            0           3     3     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Grafana Provisioned API Key              22            1            0           5     5     0      1    17  0.000000  0.772727  0.260870  1.000000  0.227273  0.370370
 JSON Web Token                          170           61            0         131   131     0     61    39  0.000000  0.229412  0.831169  1.000000  0.770588  0.870432
 Jira / Confluence PAT token               0            4            0                 0     0      4     0  0.000000            1.000000
 Jira 2FA                                 15            6            0          12    12     0      6     3  0.000000  0.200000  0.857143  1.000000  0.800000  0.888889
-Key                                     539         8456          464         505   494    11   8909    45  0.001233  0.083488  0.994080  0.978218  0.916512  0.946360
-Nonce                                    91           48            0          88    88     0     48     3  0.000000  0.032967  0.978417  1.000000  0.967033  0.983240
+Key                                     539         8456          464         468   461     7   8913    78  0.000785  0.144712  0.991014  0.985043  0.855288  0.915591
+Nonce                                    91           48            0          83    81     2     46    10  0.041667  0.109890  0.913669  0.975904  0.890110  0.931034
 PEM Private Key                        1019         1483            0        1023  1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1843         7478         2722        1703  1658    45  10155   185  0.004412  0.100380  0.980902  0.973576  0.899620  0.935138
-Salt                                     45           74            2          42    42     0     76     3  0.000000  0.066667  0.975207  1.000000  0.933333  0.965517
-Secret                                 1368        28360          868        1240  1237     3  29225   131  0.000103  0.095760  0.995620  0.997581  0.904240  0.948620
+Password                               1843         7479         2722        1721  1652    69  10132   191  0.006764  0.103635  0.978412  0.959907  0.896365  0.927048
+Salt                                     45           74            2          42    41     1     75     4  0.013158  0.088889  0.958678  0.976190  0.911111  0.942529
+Secret                                 1368        28360          868        1235  1229     6  29222   139  0.000205  0.101608  0.995261  0.995142  0.898392  0.944295
 Seed                                      1            6            0                 0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4     4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   648         3952          437         593   591     2   4387    57  0.000456  0.087963  0.988287  0.996627  0.912037  0.952458
+Token                                   648         3952          437         539   533     6   4383   115  0.001367  0.177469  0.975978  0.988868  0.822531  0.898062
 Twilio API Key                            0            5            2                 0     0      7     0  0.000000            1.000000
-URL Credentials                         209          128          240         201   200     1    367     9  0.002717  0.043062  0.982669  0.995025  0.956938  0.975610
+URL Credentials                         209          129          239         196   196     0    368    13  0.000000  0.062201  0.977470  1.000000  0.937799  0.967901
 UUID                                   1069            1            0        1061  1060     1      0     9  1.000000  0.008419  0.990654  0.999057  0.991581  0.995305
-                                       8766        59711         5180        7834  7644   183  59528  1122  0.003065  0.127995  0.980943  0.976619  0.872005  0.921352
+                                       8766        59713         5179        7742  7513   222  59491  1253  0.003718  0.142939  0.978461  0.971299  0.857061  0.910611
diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -406,7 +406,7 @@ def export_results(self) -> None:
 
         if self.json_filename:
             is_exported = True
-            Util.json_dump([credential.to_json(hashed=self.hashed,subtext=self.subtext) for credential in credentials],
+            Util.json_dump([credential.to_json(hashed=self.hashed, subtext=self.subtext) for credential in credentials],
                            file_path=self.json_filename)
 
         if self.xlsx_filename:

diff --git a/credsweeper/credentials/candidate.py b/credsweeper/credentials/candidate.py
@@ -120,7 +120,7 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
             "confidence": self.confidence.value,
             "use_ml": self.use_ml,
             # put the array to end to make json more readable
-            "line_data_list": [line_data.to_json(hashed,subtext) for line_data in self.line_data_list],
+            "line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
         }
         if self.config is not None:
             reported_output = {k: v for k, v in full_output.items() if k in self.config.candidate_output}
@@ -136,7 +136,7 @@ def to_dict_list(self, hashed: bool, subtext: bool) -> List[dict]:
 
         """
         reported_output = []
-        json_output = self.to_json(hashed, subtext )
+        json_output = self.to_json(hashed, subtext)
         refined_data = copy.deepcopy(json_output)
         del refined_data["line_data_list"]
         for line_data in json_output["line_data_list"]:

diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py
@@ -83,9 +83,6 @@ def __init__(
         self.wrap = None
 
         self.initialize(match_obj)
-        if 0 <= self.value_start and self.value and not (self.value == self.line[self.value_start:self.value_end]):
-            print(line, self.value)
-            assert False
 
     def compare(self, other: 'LineData') -> bool:
         """Comparison method - skip whole line and checks only when variable and value are the same"""
@@ -305,10 +302,11 @@ def is_source_file_with_quotes(self) -> bool:
         return False
 
     @staticmethod
-    def get_hash_or_subtext(text: Optional[str],  #
-                            hashed: bool,  #
-                            cut_pos: Optional[StartEnd] = None,  #
-                            ) -> Optional[str]:
+    def get_hash_or_subtext(
+            text: Optional[str],  #
+            hashed: bool,  #
+            cut_pos: Optional[StartEnd] = None,  #
+    ) -> Optional[str]:
         """Represent not empty text with hash or a "beauty" subtext if required
 
         Args:
@@ -328,7 +326,7 @@ def get_hash_or_subtext(text: Optional[str],  #
             elif cut_pos is not None:
                 if 2 * ML_HUNK < cut_pos.end - cut_pos.start:
                     # subtext positions exceed the limit
-                    text = text[cut_pos.start: cut_pos.end]
+                    text = text[cut_pos.start:cut_pos.end]
                 else:
                     strip_text = text.strip()
                     if 2 * ML_HUNK >= len(strip_text):

diff --git a/experiment/src/data_loader.py b/experiment/src/data_loader.py
@@ -39,7 +39,7 @@ def read_detected_data(file_path: str) -> Dict[identifier, Dict]:
         line_data = deepcopy(cred["line_data_list"][0])
         line_data.pop("entropy_validation")
         line_data.pop("info")
-        line_data["line"] = None # will be read during join_label with data for ML input only
+        line_data["line"] = None  # will be read during join_label with data for ML input only
         meta_path = transform_to_meta_path(line_data["path"])
         line_data["path"] = meta_path
         line_data["RuleName"] = [rule_name]
@@ -137,6 +137,7 @@ def get_colored_line(line_data: Dict[str, Any]) -> str:
 
 def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier, Dict],
                cred_data_location: str) -> pd.DataFrame:
+
     @cache
     def read_text(path) -> list[str]:
         with open(path, "r", encoding="utf8") as f:
@@ -148,7 +149,7 @@ def read_text(path) -> list[str]:
         for i in line_data["RuleName"]:
             detected_rules.add(i)
         text = read_text(f'{cred_data_location}/{line_data["path"]}')
-        line = text[line_data["line_num"]-1]
+        line = text[line_data["line_num"] - 1]
         line_data["line"] = line
         if not line_data["value"]:
             print(f"WARNING: empty value\n{line_data}")
@@ -187,7 +188,8 @@ def read_text(path) -> list[str]:
                   f"\nsub_line:'{get_colored_line(line_data)}'")
             continue
         # check the value in detected data
-        assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"] , (line_data, line[line_data["value_start"]:line_data["value_end"]], line_data["value"])
+        assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"], (
+            line_data, line[line_data["value_start"]:line_data["value_end"]], line_data["value"])
         # todo: variable input has to be markup in meta too, or/and new feature "VariableExists" created ???
         line_data["GroundTruth"] = label
         line_data["file_type"] = Util.get_extension(line_data["path"])

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -7,18 +7,18 @@
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 362
-SAMPLES_CRED_LINE_COUNT: int = 379
+SAMPLES_CRED_COUNT: int = 364
+SAMPLES_CRED_LINE_COUNT: int = 381
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 330
+SAMPLES_POST_CRED_COUNT: int = 338
 
 # with option --doc
 SAMPLES_IN_DOC = 417
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24
-SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16
+SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 19
 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1
 
 # well known string with all latin letters

diff --git a/tests/credentials/test_line_data.py b/tests/credentials/test_line_data.py
@@ -146,7 +146,6 @@ def test_part_url_sanitize_p(self) -> None:
     def test_hash_text_n(self):
         self.assertEqual("", LineData.get_hash_or_subtext('', hashed=True))
 
-
     def test_hash_text_p(self):
         # $ echo -n "The quick brown fox jumps over the lazy dog" | sha256sum
         self.assertEqual("d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592",
@@ -156,10 +155,9 @@ def test_sub_text_n(self):
         subtext = LineData.get_hash_or_subtext(None, hashed=False, cut_pos=StartEnd(4, 9))
         self.assertIsNone(subtext)
 
-
     def test_sub_text_p(self):
         subtext = LineData.get_hash_or_subtext(AZ_STRING, hashed=False, cut_pos=StartEnd(4, 9))
         self.assertEqual(AZ_STRING, subtext)
-        text200sym=f"\t   {''.join(string.digits for _ in range(20))}"
+        text200sym = f"\t   {''.join(string.digits for _ in range(20))}"
         subtext = LineData.get_hash_or_subtext(text200sym, hashed=False, cut_pos=StartEnd(4, 9))
         self.assertEqual(''.join(string.digits for _ in range(16)), subtext)