Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix and ML retrain #597

Merged
merged 13 commits into the base branch from the source branch
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
275 changes: 138 additions & 137 deletions cicd/benchmark.txt → .ci/benchmark.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ jobs:

- name: Verify benchmark scores of the PR
run: |
diff --ignore-all-space --ignore-blank-lines temp/CredSweeper/cicd/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log
diff --unified=3 --ignore-all-space --ignore-blank-lines temp/CredSweeper/.ci/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ jobs:
- name: Check ml_model.onnx integrity
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 62d92ab2f91a18e861d846a7b8a0c3a7
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 88f37978fc0599ac8d1bf732ad40c077
md5sum --binary credsweeper/ml_model/model_config.json | grep 2b29c5e1aa199d14b788652bd542c7c0

# # # Python setup

Expand Down
2 changes: 1 addition & 1 deletion credsweeper/filters/value_string_type_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
not_comment = not line_data.is_comment()

if line_data.is_source_file_with_quotes() and not_comment and not_quoted and not line_data.is_quoted \
and '=' in line_data.separator:
and line_data.separator and '=' in line_data.separator:
# heterogeneous code e.g. YAML in Python uses colon sign instead equals
return True

Expand Down
Binary file modified credsweeper/ml_model/ml_model.onnx
Binary file not shown.
10 changes: 2 additions & 8 deletions credsweeper/ml_model/model_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -373,12 +373,12 @@
".cjs",
".cljc",
".cmd",
".cmm",
".cnf",
".coffee",
".conf",
".config",
".cpp",
".creds",
".crt",
".cs",
".csp",
Expand Down Expand Up @@ -417,7 +417,6 @@
".json",
".jsp",
".jsx",
".jwt",
".kt",
".las",
".ldif",
Expand All @@ -428,7 +427,6 @@
".log",
".lua",
".m",
".map",
".markerb",
".md",
".mdx",
Expand All @@ -445,7 +443,6 @@
".patch",
".php",
".pl",
".pm",
".po",
".pod",
".postinst",
Expand All @@ -457,7 +454,6 @@
".purs",
".pxd",
".py",
".pyi",
".pyx",
".r",
".rb",
Expand All @@ -469,6 +465,7 @@
".rs",
".rsp",
".rst",
".rules",
".sample",
".sbt",
".scala",
Expand All @@ -478,7 +475,6 @@
".sql",
".storyboard",
".strings",
".swift",
".t",
".td",
".tdf",
Expand All @@ -498,7 +494,6 @@
".vue",
".xaml",
".xib",
".xml",
".yaml",
".yml",
".zsh",
Expand All @@ -515,7 +510,6 @@
"Certificate",
"Credential",
"Github Old Token",
"JSON Web Token",
"Key",
"Nonce",
"Password",
Expand Down
4 changes: 3 additions & 1 deletion experiment/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ def main(cred_data_location: str, jobs: int) -> str:
prepare_train_data(_cred_data_location, jobs)

# detected data means which data is passed to ML validator of credsweeper after filters with RuleName
detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location)}.json")
cred_data_location_path = pathlib.Path(cred_data_location) / "data"
detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location_path)}.json")
print(f"CredSweeper detected {len(detected_data)} credentials without ML")
# all markup data
meta_data = read_metadata(f"{cred_data_location}/meta")
Expand Down Expand Up @@ -125,6 +126,7 @@ def main(cred_data_location: str, jobs: int) -> str:
x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
y_test = get_y_labels(df_test)
print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
del df_test

max_epochs = 100
# ^^^ the line is patched in GitHub action to speed-up test train
Expand Down
5 changes: 5 additions & 0 deletions experiment/src/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ def read_text(path) -> list[str]:
+ line[markup["ValueStart"]:markup["ValueEnd"]] \
+ Style.RESET_ALL \
+ line[markup["ValueEnd"]:]
elif 0 <= markup["ValueStart"]:
line = line[:markup["ValueStart"]] \
+ Fore.LIGHTGREEN_EX \
+ line[:markup["ValueStart"]] \
+ Style.RESET_ALL
print(line)
break
read_text.cache_clear()
Expand Down
13 changes: 9 additions & 4 deletions experiment/src/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ def execute_scanner(dataset_location: str, result_location_str, j):
sys.exit(error_code)


def data_checksum(cred_data_location: str) -> str:
def data_checksum(dir_path: Path) -> str:
checksum = hashlib.md5(b'').digest()
for root, dirs, files in os.walk(Path(cred_data_location) / "data"):
for root, dirs, files in os.walk(dir_path):
for file in files:
with open(os.path.join(root, file), "rb") as f:
cvs_checksum = hashlib.md5(f.read()).digest()
Expand All @@ -38,12 +38,17 @@ def prepare_train_data(cred_data_location: str, j: int):
new_rules = [x for x in rules if x.get("use_ml")]
Util.yaml_dump(new_rules, "results/train_config.yaml")

detected_data_filename = f"results/detected_data.{data_checksum(cred_data_location)}.json"
meta_checksum = data_checksum(Path(cred_data_location) / "meta")
print(f"meta checksum {meta_checksum}")

data_dir_checksum = data_checksum(Path(cred_data_location) / "data")
print(f"data checksum {data_dir_checksum}")
detected_data_filename = f"results/detected_data.{data_dir_checksum}.json"

if not os.path.exists(detected_data_filename):
print(f"Get CredSweeper results from {cred_data_location}. May take some time")
execute_scanner(cred_data_location, detected_data_filename, j)
else:
print(f"Get cached result {data_checksum(cred_data_location)}")
print(f"Get cached result {data_dir_checksum}")

print("Train data prepared!")
6 changes: 3 additions & 3 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
SAMPLES_CRED_LINE_COUNT: int = 381

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 323
SAMPLES_POST_CRED_COUNT: int = 336

# with option --doc
SAMPLES_IN_DOC = 417

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 17
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 23
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

# well known string with all latin letters
Expand Down
Loading
Loading