Custom ML model usage and ml_providers list #587

Merged — 141 commits, Aug 21, 2024

Commits
4a64d98
Filters optimisation
babenek Jul 23, 2024
49e82ce
style
babenek Jul 23, 2024
fdd483b
unicode cases in filter
babenek Jul 23, 2024
21fd81f
tmp markup loan
babenek Jul 23, 2024
97ef85b
BM scores fix
babenek Jul 24, 2024
f851904
docs upd
babenek Jul 24, 2024
ec48914
removed unused filter
babenek Jul 24, 2024
70b6b34
[skip actions] [ml] 2024-07-24T13:15:59+03:00
babenek Jul 24, 2024
9434d62
retrain
babenek Jul 24, 2024
72aed7f
BMcscor
babenek Jul 24, 2024
ec5d03b
retrain
babenek Jul 24, 2024
5122e45
custom ref
babenek Jul 24, 2024
4a39e22
testfix
babenek Jul 24, 2024
0bf1643
docfix
babenek Jul 24, 2024
898a252
doc upd
babenek Jul 25, 2024
dcd3a9e
uuid
babenek Jul 25, 2024
bbb2f3c
uuid use_ml: true to skip extra others
babenek Jul 26, 2024
680792b
[skip actions] [uuid] 2024-07-26T12:48:46+03:00
babenek Jul 26, 2024
36b6d13
[skip actions] [uuid] 2024-07-26T13:30:49+03:00
babenek Jul 26, 2024
d5ff926
[skip actions] [uuid] 2024-07-26T13:36:49+03:00
babenek Jul 26, 2024
8b3a71b
retrain
babenek Jul 26, 2024
c686b69
fix
babenek Jul 26, 2024
92858b6
[no ci] upd2
babenek Jul 25, 2024
3dbd263
Merge branch 'main' into auxiliary
babenek Aug 7, 2024
6d0ab82
test counters fix
babenek Aug 7, 2024
fdb5ee7
reduce whitespaces during extracting subtext
babenek Aug 7, 2024
7132398
Merge branch 'main' into auxiliary
babenek Aug 7, 2024
2c21919
aux BM ref
babenek Aug 7, 2024
d7f65c1
BM scores fix
babenek Aug 7, 2024
59e9c21
Merge branch 'auxiliary' into uuid
babenek Aug 7, 2024
4c8a10b
fix
babenek Aug 7, 2024
29ae9a5
[skip actions] [uuid] 2024-08-07T16:37:15+03:00
babenek Aug 7, 2024
c9cfeb9
rollback ML model
babenek Aug 7, 2024
3723700
Merge branch 'uuid' into ml
babenek Aug 7, 2024
9aa6387
extension_input_vstack
babenek Aug 7, 2024
cdfc898
retrain with file_type
babenek Aug 7, 2024
b43accd
testfix
babenek Aug 7, 2024
1d35367
Rollback BM
babenek Aug 7, 2024
87a63e8
Merge branch 'auxiliary' into uuid
babenek Aug 7, 2024
a02043a
Merge branch 'uuid' into ml
babenek Aug 7, 2024
95d12f3
Rollback BM
babenek Aug 7, 2024
9b8f3d7
[no ci] BM scor fix
babenek Aug 8, 2024
68f177d
UUID does not require ml yet
babenek Aug 8, 2024
c4f725e
retrain
babenek Aug 8, 2024
660fc44
JWT fix
babenek Aug 8, 2024
536a7f3
customBMref
babenek Aug 8, 2024
6d072d4
JWT fix BC scor
babenek Aug 8, 2024
16bb62c
Merge branch 'auxiliary' into uuid
babenek Aug 8, 2024
b478b90
[skip actions] [ml] 2024-08-08T11:52:05+03:00
babenek Aug 8, 2024
e16f060
Merge branch 'uuid' into ml
babenek Aug 8, 2024
1544ab1
[skip actions] [ml] 2024-08-08T12:48:05+03:00
babenek Aug 8, 2024
cc1563b
[skip actions] [ml] 2024-08-08T13:10:05+03:00
babenek Aug 8, 2024
6b7432a
retrain
babenek Aug 8, 2024
226004e
style
babenek Aug 8, 2024
5eac055
ML_FILE_TYPE=12
babenek Aug 8, 2024
23d3abb
BM scores fix
babenek Aug 8, 2024
b803e78
Merge branch 'main' into auxiliary
babenek Aug 9, 2024
df5fb95
Merge branch 'auxiliary' into ml
babenek Aug 9, 2024
3c4446e
[skip actions] [ml] 2024-08-09T10:05:08+03:00
babenek Aug 9, 2024
6450440
UUID pattern added
babenek Aug 9, 2024
9fbdf9b
BM scores fix
babenek Aug 9, 2024
52e4fe4
[skip actions] [uuid] 2024-08-10T10:05:05+03:00
babenek Aug 10, 2024
39c3dcd
Merge branch 'uuid' into ml
babenek Aug 10, 2024
eb0f799
[skip actions] [ml] 2024-08-11T00:01:07+03:00
babenek Aug 10, 2024
8786414
square bracket workaround in keywort regex
babenek Aug 10, 2024
672342a
path filter
babenek Aug 11, 2024
ac6ee1a
BM score fix
babenek Aug 11, 2024
1eabc31
Merge branch 'auxiliary' into ml
babenek Aug 11, 2024
54766e8
[skip actions] [ml] 2024-08-11T12:07:40+03:00
babenek Aug 11, 2024
8f4848d
[skip actions] [ml] 2024-08-11T12:09:15+03:00
babenek Aug 11, 2024
97bd8b3
[skip actions] [ml] 2024-08-11T12:28:18+03:00
babenek Aug 11, 2024
c8c5ec6
[skip actions] [ml] 2024-08-11T13:53:46+03:00
babenek Aug 11, 2024
f7c9ea0
[skip actions] [ml] 2024-08-11T21:40:36+03:00
babenek Aug 11, 2024
db5161d
[skip actions] [ml] 2024-08-11T21:41:54+03:00
babenek Aug 11, 2024
e8dea42
0.92
babenek Aug 11, 2024
3497f06
ValueStringTypeCheck workaround for heterogenous source
babenek Aug 12, 2024
6c10bf6
wrap added to filter array definitions
babenek Aug 12, 2024
abc980c
TOML format sanitizer
babenek Aug 12, 2024
ddbda1a
YAML case
babenek Aug 12, 2024
164cdfd
BM fix
babenek Aug 12, 2024
b55af99
BM scores fix
babenek Aug 12, 2024
8fed98d
[skip actions] [ml] 2024-08-12T19:08:33+03:00
babenek Aug 12, 2024
137f6b2
[skip actions] [subhashtext] 2024-08-12T21:32:30+03:00
babenek Aug 12, 2024
ea404b3
variable is hashed too
babenek Aug 12, 2024
a076394
hash & subtext test
babenek Aug 12, 2024
04a15c3
testBM
babenek Aug 12, 2024
e271544
updBMscor
babenek Aug 12, 2024
d06c8a9
refactoring
babenek Aug 13, 2024
7930ff6
skip f* in BM experiment
babenek Aug 13, 2024
9851fb2
keep 0*-3* meta for experiment
babenek Aug 13, 2024
530b16e
less repos in test
babenek Aug 13, 2024
37d386d
refactoring2
babenek Aug 13, 2024
653f10b
read_text.cache_clear()
babenek Aug 13, 2024
ea871c6
--subtext in benchmark
babenek Aug 13, 2024
3bb24a0
[skip actions] [subhashtext] 2024-08-13T12:52:11+03:00
babenek Aug 13, 2024
bf4eb64
[skip actions] [subhashtext] 2024-08-13T12:55:14+03:00
babenek Aug 13, 2024
1a09f85
fix
babenek Aug 13, 2024
0fd6924
[skip actions] [ml] 2024-08-13T13:13:43+03:00
babenek Aug 13, 2024
ae76b69
[skip actions] [ml] 2024-08-13T13:26:20+03:00
babenek Aug 13, 2024
f2f40f0
[skip actions] [ml] 2024-08-13T14:04:12+03:00
babenek Aug 13, 2024
8c6c30d
subtext
babenek Aug 13, 2024
2e20a2b
[skip actions] [ml] 2024-08-13T17:00:25+03:00
babenek Aug 13, 2024
09f813d
Merge branch 'main' into auxiliary
babenek Aug 14, 2024
1be4e7c
[skip actions] [subhashtext] 2024-08-14T07:46:29+03:00
babenek Aug 14, 2024
ff705d2
[skip actions] [ml] 2024-08-14T07:49:09+03:00
babenek Aug 14, 2024
feeefc3
experiment ml rollback
babenek Aug 14, 2024
95e0b1a
BM scores with hashes
babenek Aug 14, 2024
0ce84fc
some rollbacks
babenek Aug 14, 2024
4582016
Merge branch 'subhashtext' into ml
babenek Aug 14, 2024
f2f44f6
Merge branch 'main' into subhashtext
babenek Aug 14, 2024
e57f6cd
Merge branch 'subhashtext' into ml
babenek Aug 14, 2024
09bd176
[skip actions] [ml] 2024-08-14T14:09:10+03:00
babenek Aug 14, 2024
1cf645c
Rollback file type
babenek Aug 14, 2024
2f2633c
Merge branch 'main' into ml
babenek Aug 15, 2024
d3c2bc8
custom ref BM
babenek Aug 16, 2024
7750d43
upd
babenek Aug 16, 2024
d36c51d
Merge branch 'auxiliary' into ml
babenek Aug 16, 2024
c041f9f
[skip actions] [ml] 2024-08-16T13:49:10+03:00
babenek Aug 16, 2024
6a5ffe3
ml_model integrity
babenek Aug 16, 2024
abd2557
test fix
babenek Aug 16, 2024
997845a
after corrections
babenek Aug 16, 2024
6993571
BMscorUPD
babenek Aug 16, 2024
70f7140
[skip actions] [ml] 2024-08-16T18:44:01+03:00
babenek Aug 16, 2024
4096b41
[skip actions] [auxiliary] 2024-08-17T09:35:51+03:00
babenek Aug 17, 2024
4062caa
retrain
babenek Aug 18, 2024
730aa75
Merge branch 'auxiliary' into ml
babenek Aug 18, 2024
171e3aa
testfix
babenek Aug 18, 2024
75df1c8
style
babenek Aug 19, 2024
2d2db83
md5sum of ml model
babenek Aug 19, 2024
7674677
model config check
babenek Aug 19, 2024
4bb5de8
custom ref rollback
babenek Aug 19, 2024
8a5c59d
Scores fix
babenek Aug 19, 2024
899cd9b
Move cicd dir to .ci as not sourcecode related
babenek Aug 19, 2024
bcfb708
Missed dir added
babenek Aug 19, 2024
c163427
Merge branch 'auxiliary' into ml [no ci]
babenek Aug 19, 2024
ad34a2c
Merge branch 'main' into auxiliary
babenek Aug 19, 2024
1d5cc32
Merge branch 'auxiliary' into ml [no ci]
babenek Aug 19, 2024
90382b4
keep md5 in train log
babenek Aug 20, 2024
125968d
External ML config and model may be used
babenek Aug 20, 2024
b7391ec
replaced --azure and --cuda arguments to --ml_providers
babenek Aug 20, 2024
d517359
style
babenek Aug 20, 2024
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -422,7 +422,7 @@ jobs:
           # crc32 should be changed
           python -m credsweeper --banner
           # run quick scan
-          python -m credsweeper --log debug --path ../tests/samples --save-json
+          python -m credsweeper --ml_providers AzureExecutionProvider,CPUExecutionProvider --log debug --path ../tests/samples --save-json
           NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json)
           if [ 10 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then
               echo "Failure: found ${NEW_MODEL_FOUND_SAMPLES} credentials"
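The workflow gates the quick scan on the number of findings in the saved report via `jq '.|length'`. The same count can be taken in Python — a minimal sketch (the report name `output.json` comes from the workflow; the threshold of 10 mirrors the shell test):

```python
import json

def found_samples(json_path: str) -> int:
    # credsweeper --save-json writes a JSON array of findings,
    # so the sample count is simply the array length (jq '.|length')
    with open(json_path, "r", encoding="utf-8") as f:
        return len(json.load(f))
```

Usage mirroring `if [ 10 -gt ${NEW_MODEL_FOUND_SAMPLES} ]`: fail when `found_samples("output.json") < 10`.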
17 changes: 9 additions & 8 deletions .github/workflows/check.yml
@@ -27,6 +27,15 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.pull_request.head.sha }}

+      # # # ml_config & ml_model integrity
+
+      - name: Check ml_model.onnx integrity
+        if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
+        run: |
+          md5sum --binary credsweeper/ml_model/ml_config.json | grep 2b29c5e1aa199d14b788652bd542c7c0
+          md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 88f37978fc0599ac8d1bf732ad40c077
+
       # # # line ending

       - name: Check for text file ending
@@ -53,14 +62,6 @@
           done
           exit ${n}

-      # # # ml_model integrity
-
-      - name: Check ml_model.onnx integrity
-        if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
-        run: |
-          md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 88f37978fc0599ac8d1bf732ad40c077
-          md5sum --binary credsweeper/ml_model/model_config.json | grep 2b29c5e1aa199d14b788652bd542c7c0
-
       # # # Python setup

       - name: Set up Python
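The CI step pins the MD5 digests of `ml_config.json` and `ml_model.onnx` with `md5sum | grep`. An equivalent check in Python is a short `hashlib` sketch (the concrete paths and digest values would come from the workflow above):

```python
import hashlib
from pathlib import Path

def md5_digest(path: Path) -> str:
    """Compute the hex MD5 digest of a file, reading in chunks."""
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()

def check_integrity(expected: dict) -> bool:
    """Return True only when every file matches its pinned digest."""
    return all(md5_digest(Path(p)) == h for p, h in expected.items())
```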
38 changes: 24 additions & 14 deletions credsweeper/__main__.py
@@ -117,7 +117,6 @@ def get_arguments() -> Namespace:
                         dest="export_log_config",
                         metavar="PATH")
     parser.add_argument("--rules",
-                        nargs="?",
                         help="path of rule config file (default: credsweeper/rules/config.yaml). "
                         f"severity:{[i.value for i in Severity]} "
                         f"type:{[i.value for i in RuleType]}",
@@ -131,13 +130,11 @@
                         dest="severity",
                         type=severity_levels)
     parser.add_argument("--config",
-                        nargs="?",
                         help="use custom config (default: built-in)",
                         default=None,
                         dest="config_path",
                         metavar="PATH")
     parser.add_argument("--log_config",
-                        nargs="?",
                         help="use custom log config (default: built-in)",
                         default=None,
                         dest="log_config_path",
@@ -178,15 +175,27 @@ def get_arguments() -> Namespace:
                         default=16,
                         required=False,
                         metavar="POSITIVE_INT")
-    ml_provider_group = parser.add_mutually_exclusive_group()
-    ml_provider_group.add_argument("--azure",
-                                   help="enable AzureExecutionProvider for onnx",
-                                   dest="azure",
-                                   action="store_true")
-    ml_provider_group.add_argument("--cuda",
-                                   help="enable CUDAExecutionProvider for onnx",
-                                   dest="cuda",
-                                   action="store_true")
+    parser.add_argument("--ml_config",
+                        help="use external config for ml model",
+                        type=str,
+                        default=None,
+                        dest="ml_config",
+                        required=False,
+                        metavar="PATH")
+    parser.add_argument("--ml_model",
+                        help="use external ml model",
+                        type=str,
+                        default=None,
+                        dest="ml_model",
+                        required=False,
+                        metavar="PATH")
+    parser.add_argument("--ml_providers",
+                        help="comma separated list of providers for onnx (CPUExecutionProvider is used by default)",
+                        type=str,
+                        default=None,
+                        dest="ml_providers",
+                        required=False,
+                        metavar="STR")
     parser.add_argument("--api_validation",
                         help="add credential api validation option to credsweeper pipeline. "
                         "External API is used to reduce FP for some rule types.",
@@ -297,8 +306,9 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
                        pool_count=args.jobs,
                        ml_batch_size=args.ml_batch_size,
                        ml_threshold=args.ml_threshold,
-                       azure=args.azure,
-                       cuda=args.cuda,
+                       ml_config=args.ml_config,
+                       ml_model=args.ml_model,
+                       ml_providers=args.ml_providers,
                        find_by_ext=args.find_by_ext,
                        depth=args.depth,
                        doc=args.doc,
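The key CLI change above: the mutually exclusive boolean flags `--azure`/`--cuda` are replaced by one free-form `--ml_providers` string plus `--ml_config`/`--ml_model` paths. A reduced sketch of just these options (the `prog` name is hypothetical; defaults and metavars follow the diff):

```python
from argparse import ArgumentParser, Namespace

def get_ml_arguments(argv=None) -> Namespace:
    parser = ArgumentParser(prog="credsweeper-sketch")
    # one free-form list replaces the old mutually exclusive --azure / --cuda pair
    parser.add_argument("--ml_providers",
                        help="comma separated list of providers for onnx "
                             "(CPUExecutionProvider is used by default)",
                        type=str,
                        default=None,
                        metavar="STR")
    parser.add_argument("--ml_config", default=None, metavar="PATH",
                        help="use external config for ml model")
    parser.add_argument("--ml_model", default=None, metavar="PATH",
                        help="use external ml model")
    return parser.parse_args(argv)
```

With a plain string argument, any provider combination can be requested without adding a new flag per provider.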
20 changes: 15 additions & 5 deletions credsweeper/app.py
@@ -49,8 +49,9 @@ def __init__(self,
                  pool_count: int = 1,
                  ml_batch_size: Optional[int] = None,
                  ml_threshold: Union[float, ThresholdPreset] = ThresholdPreset.medium,
-                 azure: bool = False,
-                 cuda: bool = False,
+                 ml_config: Union[None, str, Path] = None,
+                 ml_model: Union[None, str, Path] = None,
+                 ml_providers: Optional[str] = None,
                  find_by_ext: bool = False,
                  depth: int = 0,
                  doc: bool = False,
@@ -78,6 +79,9 @@ def __init__(self,
             pool_count: int value, number of parallel processes to use
             ml_batch_size: int value, size of the batch for model inference
             ml_threshold: float or string value to specify threshold for the ml model
+            ml_config: str or Path to set custom config of ml model
+            ml_model: str or Path to set custom ml model
+            ml_providers: str - comma separated list with providers
             find_by_ext: boolean - files will be reported by extension
             depth: int - how deep container files will be scanned
             doc: boolean - document-specific scanning
@@ -113,8 +117,9 @@ def __init__(self,
         self.sort_output = sort_output
         self.ml_batch_size = ml_batch_size if ml_batch_size and 0 < ml_batch_size else 16
         self.ml_threshold = ml_threshold
-        self.azure = azure
-        self.cuda = cuda
+        self.ml_config = ml_config
+        self.ml_model = ml_model
+        self.ml_providers = ml_providers
         self.ml_validator = None
         self.__log_level = log_level

@@ -187,7 +192,12 @@ def ml_validator(self) -> MlValidator:
         """ml_validator getter"""
         from credsweeper.ml_model import MlValidator
         if not self.__ml_validator:
-            self.__ml_validator: MlValidator = MlValidator(threshold=self.ml_threshold)
+            self.__ml_validator: MlValidator = MlValidator(
+                threshold=self.ml_threshold,  #
+                ml_config=self.ml_config,  #
+                ml_model=self.ml_model,  #
+                ml_providers=self.ml_providers,  #
+            )
         assert self.__ml_validator, "self.__ml_validator was not initialized"
         return self.__ml_validator
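The `ml_validator` getter above builds the validator lazily: the ONNX session is only loaded on first access, so scans that never reach ML scoring pay nothing. A stripped-down sketch of that pattern (class name and the `init_count` instrumentation are hypothetical, added just to make the laziness observable):

```python
from typing import Optional

class LazyValidatorHolder:
    """Sketch of the lazy-getter pattern used for ml_validator:
    the heavy object is constructed on first access only."""

    def __init__(self) -> None:
        self._validator: Optional[object] = None
        self.init_count = 0  # instrumentation for the sketch

    @property
    def ml_validator(self) -> object:
        if self._validator is None:
            self.init_count += 1  # expensive MlValidator(...) construction would happen here
            self._validator = object()
        return self._validator
```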
56 changes: 39 additions & 17 deletions credsweeper/ml_model/ml_validator.py
@@ -1,7 +1,8 @@
+import hashlib
 import logging
 import os
 import string
-from typing import List, Tuple, Union
+from pathlib import Path
+from typing import List, Tuple, Union, Optional

 import numpy as np
 import onnxruntime as ort
@@ -21,35 +22,56 @@ class MlValidator:
     CHAR_INDEX = {char: index for index, char in enumerate('\0' + string.printable + NON_ASCII)}
     NUM_CLASSES = len(CHAR_INDEX)

-    def __init__(self, threshold: Union[float, ThresholdPreset], azure: bool = False, cuda: bool = False) -> None:
+    def __init__(
+            self,  #
+            threshold: Union[float, ThresholdPreset],  #
+            ml_config: Union[None, str, Path] = None,  #
+            ml_model: Union[None, str, Path] = None,  #
+            ml_providers: Optional[str] = None) -> None:
         """Init

         Args:
             threshold: decision threshold
+            ml_config: path to ml config
+            ml_model: path to ml model
+            ml_providers: coma separated list of providers https://onnxruntime.ai/docs/execution-providers/
         """
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-        model_file_path = os.path.join(dir_path, "ml_model.onnx")
-        if azure:
-            provider = "AzureExecutionProvider"
-        elif cuda:
-            provider = "CUDAExecutionProvider"
+        dir_path = Path(__file__).parent
+
+        if ml_config:
+            ml_config_path = Path(ml_config)
+        else:
+            ml_config_path = dir_path / "ml_config.json"
+        with open(ml_config_path, "rb") as f:
+            md5_config = hashlib.md5(f.read()).hexdigest()
+
+        if ml_model:
+            ml_model_path = Path(ml_model)
+        else:
+            ml_model_path = dir_path / "ml_model.onnx"
+        with open(ml_model_path, "rb") as f:
+            md5_model = hashlib.md5(f.read()).hexdigest()
+
+        if ml_providers:
+            providers = ml_providers.split(',')
         else:
-            provider = "CPUExecutionProvider"
-        self.model_session = ort.InferenceSession(model_file_path, providers=[provider])
+            providers = ["CPUExecutionProvider"]
+        self.model_session = ort.InferenceSession(ml_model_path, providers=providers)

-        model_details = Util.json_load(os.path.join(dir_path, "model_config.json"))
+        model_config = Util.json_load(ml_config_path)
         if isinstance(threshold, float):
             self.threshold = threshold
-        elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_details:
-            self.threshold = model_details["thresholds"][threshold.value]
+        elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_config:
+            self.threshold = model_config["thresholds"][threshold.value]
         else:
             self.threshold = 0.5

         self.common_feature_list = []
         self.unique_feature_list = []
-        logger.info("Init ML validator, model file path: %s", model_file_path)
-        logger.debug("ML validator details: %s", model_details)
-        for feature_definition in model_details["features"]:
+        logger.info("Init ML validator with %s provider; config:'%s' md5:%s model:'%s' md5:%s", providers,
+                    ml_config_path, md5_config, ml_model_path, md5_model)
+        logger.debug("ML validator details: %s", model_config)
+        for feature_definition in model_config["features"]:
             feature_class = feature_definition["type"]
             kwargs = feature_definition.get("kwargs", {})
             feature_constructor = getattr(features, feature_class, None)
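The new constructor turns the `--ml_providers` string into the provider list onnxruntime expects, falling back to `CPUExecutionProvider`. That resolution step isolated as a sketch — note the whitespace stripping here is an addition of this sketch, not in the PR, which uses a plain `split(',')`:

```python
from typing import List, Optional

def resolve_providers(ml_providers: Optional[str]) -> List[str]:
    """Turn a comma separated provider string into the list passed to
    ort.InferenceSession(..., providers=...), defaulting to CPU."""
    if ml_providers:
        # strip whitespace and drop empty entries for robustness (sketch-only)
        return [p.strip() for p in ml_providers.split(",") if p.strip()]
    return ["CPUExecutionProvider"]
```

onnxruntime tries the providers in order, so listing `AzureExecutionProvider,CPUExecutionProvider` keeps CPU as the fallback.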
21 changes: 13 additions & 8 deletions docs/source/guide.rst
@@ -13,9 +13,13 @@ Get all argument list:

 .. code-block:: text

-    usage: python -m credsweeper [-h] (--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH]) [--rules [PATH]] [--severity SEVERITY] [--config [PATH]]
-                                 [--log_config [PATH]] [--denylist PATH] [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR] [--ml_batch_size POSITIVE_INT]
-                                 [--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
+    usage: python -m credsweeper [-h] (--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH])
+                                 [--rules PATH] [--severity SEVERITY] [--config PATH] [--log_config PATH] [--denylist PATH]
+                                 [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR]
+                                 [--ml_batch_size POSITIVE_INT] [--ml_config PATH] [--ml_model PATH] [--ml_providers STR]
+                                 [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]]
+                                 [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL]
+                                 [--size_limit SIZE_LIMIT]
                                  [--banner] [--version]
     options:
       -h, --help show this help message and exit
@@ -27,10 +31,10 @@ Get all argument list:
                             exporting default config to file (default: config.json)
       --export_log_config [PATH]
                             exporting default logger config to file (default: log.yaml)
-      --rules [PATH]        path of rule config file (default: credsweeper/rules/config.yaml). severity:['critical', 'high', 'medium', 'low', 'info'] type:['keyword', 'pattern', 'pem_key', 'multi']
+      --rules PATH          path of rule config file (default: credsweeper/rules/config.yaml). severity:['critical', 'high', 'medium', 'low', 'info'] type:['keyword', 'pattern', 'pem_key', 'multi']
       --severity SEVERITY   set minimum level for rules to apply ['critical', 'high', 'medium', 'low', 'info'](default: 'Severity.INFO', case insensitive)
-      --config [PATH]       use custom config (default: built-in)
-      --log_config [PATH]   use custom log config (default: built-in)
+      --config PATH         use custom config (default: built-in)
+      --log_config PATH     use custom log config (default: built-in)
       --denylist PATH       path to a plain text file with lines or secrets to ignore
       --find-by-ext         find files by predefined extension
       --depth POSITIVE_INT  additional recursive search in data (experimental)
@@ -41,8 +45,9 @@ Get all argument list:
                             'highest'] (default: medium)
       --ml_batch_size POSITIVE_INT, -b POSITIVE_INT
                             batch size for model inference (default: 16)
-      --azure               enable AzureExecutionProvider for onnx
-      --cuda                enable CUDAExecutionProvider for onnx
+      --ml_config PATH      use external config for ml model
+      --ml_model PATH       use external ml model
+      --ml_providers STR    comma separated list of providers for onnx (CPUExecutionProvider is used by default)
      --api_validation      add credential api validation option to credsweeper pipeline. External API is used to reduce FP for some rule types.
      --jobs POSITIVE_INT, -j POSITIVE_INT
                            number of parallel processes to use (default: 1)
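Given the updated help text, an invocation with the new ML options can be assembled programmatically — a sketch (the helper name is hypothetical; the flags are the documented ones):

```python
from typing import List, Optional

def build_cli_command(path: str,
                      ml_providers: Optional[str] = None,
                      ml_config: Optional[str] = None,
                      ml_model: Optional[str] = None) -> List[str]:
    """Assemble a credsweeper command line with the new ML options (sketch)."""
    cmd = ["python", "-m", "credsweeper", "--path", path]
    if ml_providers:
        cmd += ["--ml_providers", ml_providers]
    if ml_config:
        cmd += ["--ml_config", ml_config]
    if ml_model:
        cmd += ["--ml_model", ml_model]
    return cmd
```

The resulting list can be handed to `subprocess.run(cmd)` without shell quoting concerns.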
7 changes: 7 additions & 0 deletions experiment/main.py
@@ -216,6 +216,13 @@ def main(cred_data_location: str, jobs: int) -> str:
     # print in last line the name
     print(f"\nYou can find your model in:\n{_model_file_name}")

+    # convert the model to onnx right now
+    command = f"{sys.executable} -m tf2onnx.convert --saved-model {_model_file_name}" \
+              f" --output {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx --verbose"
+    subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
+
+    # to keep the hash in log
+    command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
+    subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
+    command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_config.json"
+    subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
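The training script shells out to `md5sum` so the model digests land in the train log. The same record can be produced portably with `hashlib` — a sketch (the helper name is hypothetical; the two artifact names come from the diff above):

```python
import hashlib
from pathlib import Path
from typing import Dict

def log_model_digests(model_dir: Path) -> Dict[str, str]:
    """Record the md5 of the converted model artifacts without
    shelling out to md5sum (hypothetical helper)."""
    digests = {}
    for name in ("ml_model.onnx", "ml_config.json"):
        path = model_dir / name
        if path.exists():
            digests[name] = hashlib.md5(path.read_bytes()).hexdigest()
    return digests
```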
4 changes: 2 additions & 2 deletions experiment/src/model_config_preprocess.py
@@ -7,10 +7,10 @@


 def model_config_preprocess(df_all: pd.DataFrame) -> Dict[str, float]:
-    model_config_path = APP_PATH / "ml_model" / "model_config.json"
+    model_config_path = APP_PATH / "ml_model" / "ml_config.json"
     model_config = Util.json_load(model_config_path)

-    # check whether all extensions from meta are in model_config.json
+    # check whether all extensions from meta are in ml_config.json

     for x in model_config["features"]:
         if "FileExtension" == x["type"]:
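The preprocessing verifies that every extension seen in the metadata appears in the `FileExtension` feature of the renamed `ml_config.json`. A sketch of such a consistency check — note the `"extensions"` kwargs key is an assumption about the config layout, not confirmed by the diff:

```python
from typing import Dict, List

def missing_extensions(model_config: Dict, meta_extensions: List[str]) -> List[str]:
    """Return extensions present in the metadata but absent from the
    FileExtension feature of ml_config.json (layout assumed)."""
    for feature in model_config.get("features", []):
        if feature.get("type") == "FileExtension":
            known = set(feature.get("kwargs", {}).get("extensions", []))
            return sorted(set(meta_extensions) - known)
    # no FileExtension feature at all: everything is "missing"
    return sorted(set(meta_extensions))
```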