Optimizations (#415)

* [skip actions] [auxiliary] 2023-08-31T11:37:29+03:00 * optimization
Samsung · Sep 7, 2023 · 9a1d3ff · 9a1d3ff
1 parent fd24912
commit 9a1d3ff
Show file tree

Hide file tree

Showing 7 changed files with 28 additions and 15 deletions.
diff --git a/credsweeper/deep_scanner/deep_scanner.py b/credsweeper/deep_scanner/deep_scanner.py
@@ -193,6 +193,7 @@ def structure_scan(
                                                             info=f"{struct_provider.info}|STRUCT:{key}")
                 new_candidates = self.structure_scan(val_struct_provider, depth, recursive_limit_size)
                 candidates.extend(new_candidates)
+
             elif isinstance(value, bytes):
                 bytes_struct_provider = DataContentProvider(data=value,
                                                             file_path=struct_provider.file_path,

diff --git a/credsweeper/file_handler/analysis_target.py b/credsweeper/file_handler/analysis_target.py
@@ -39,6 +39,11 @@ def line_strip_len(self) -> int:
         """cached value"""
         return len(self.line_strip)
 
+    @cached_property
+    def line_strip_lower(self) -> str:
+        """cached value"""
+        return self.line_strip.lower()
+
     @cached_property
     def lines(self) -> List[str]:
         """cached value"""

diff --git a/credsweeper/filters/value_ip_check.py b/credsweeper/filters/value_ip_check.py
@@ -32,9 +32,10 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         with contextlib.suppress(Exception):
             ip = ipaddress.ip_address(line_data.value)
             if 4 == ip.version:
-                line_lower = target.line.lower()
+                # use line_strip_lower because the property should already be cached
+                line_strip_lower = target.line_strip_lower
                 for i in ValueIPCheck.FALSE_POSITIVE_MARKERS:
-                    if i in line_lower:
+                    if i in line_strip_lower:
                         return True
             if ip.is_loopback or ip.is_private or ip.is_reserved or ip.is_link_local or ip.is_multicast:
                 return True

diff --git a/credsweeper/rules/rule.py b/credsweeper/rules/rule.py
@@ -1,7 +1,7 @@
 import logging
 import re
 from functools import cached_property
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union, Set
 
 from credsweeper import validations, filters
 from credsweeper.common.constants import RuleType, Severity, MAX_LINE_LENGTH
@@ -68,7 +68,7 @@ def __init__(self, config: Config, rule_dict: Dict) -> None:
         self.__filters = self._init_filters(rule_dict.get(Rule.FILTER_TYPE))
         self.__use_ml = bool(rule_dict.get(Rule.USE_ML))
         self.__validations = self._init_validations(rule_dict.get(Rule.VALIDATIONS))
-        self.__required_substrings = [i.strip().lower() for i in rule_dict.get(Rule.REQUIRED_SUBSTRINGS, [])]
+        self.__required_substrings = set(i.strip().lower() for i in rule_dict.get(Rule.REQUIRED_SUBSTRINGS, []))
         self.__has_required_substrings = bool(self.__required_substrings)
         required_regex = rule_dict.get(Rule.REQUIRED_REGEX)
         if required_regex and not isinstance(required_regex, str):
@@ -220,7 +220,7 @@ def _assert_rule_mandatory_fields(rule_template: Dict) -> None:
             raise ValueError(f"Malformed rule config file. Contain rule with missing fields: {missing_fields}.")
 
     @cached_property
-    def required_substrings(self) -> List[str]:
+    def required_substrings(self) -> Set[str]:
         """required_substrings getter"""
         return self.__required_substrings
 

diff --git a/credsweeper/scanner/scan_type/pem_key_pattern.py b/credsweeper/scanner/scan_type/pem_key_pattern.py
@@ -100,7 +100,7 @@ def detect_pem_key(cls, config: Config, rule: Rule, target: AnalysisTarget) -> L
                         logger.debug("Filtered with entropy %f '%s'", entropy_validator.entropy, key_data)
                         return []
                     # OPENSSH format has multiple AAAAA pattern
-                    if "OPENSSH" not in target.line and cls.pem_pattern_check.equal_pattern_check(key_data):
+                    if "OPENSSH" not in target.line_strip and cls.pem_pattern_check.equal_pattern_check(key_data):
                         logger.debug("Filtered with ValuePemPatternCheck %s", target)
                         return []
                     # all OK - return line data with all lines which include PEM

diff --git a/credsweeper/scanner/scan_type/scan_type.py b/credsweeper/scanner/scan_type/scan_type.py
@@ -105,7 +105,7 @@ def _get_candidate(cls, config: Config, rule: Rule, target: AnalysisTarget) -> O
             remove current line. None otherwise
 
         """
-        if config.exclude_lines and target.line.strip() in config.exclude_lines:
+        if config.exclude_lines and target.line_strip in config.exclude_lines:
             return None
 
         line_data = cls.get_line_data(config=config, target=target, pattern=rule.patterns[0], filters=rule.filters)

diff --git a/credsweeper/scanner/scanner.py b/credsweeper/scanner/scanner.py
@@ -1,7 +1,7 @@
 import logging
 import re
 from pathlib import Path
-from typing import List, Type, Tuple, Union, Dict, Generator
+from typing import List, Type, Tuple, Union, Dict, Generator, Set
 
 from credsweeper.app import APP_PATH
 from credsweeper.common.constants import RuleType, MIN_VARIABLE_LENGTH, MIN_SEPARATOR_LENGTH, MIN_VALUE_LENGTH, \
@@ -43,6 +43,14 @@ def __init__(self, config: Config, rule_path: Union[None, str, Path]) -> None:
         self.min_len = min(self.min_pattern_len, self.min_keyword_len, self.min_pem_key_len, self.min_multi_len,
                            MIN_VARIABLE_LENGTH + MIN_SEPARATOR_LENGTH + MIN_VALUE_LENGTH)
 
+    @staticmethod
+    def _substring_check(substrings: Set[str], text: str) -> bool:
+        """Checks whether `text` contains any required substring. A set is used to reduce extra transformations."""
+        for substring in substrings:
+            if substring in text:
+                return True
+        return False
+
     def _set_rules_scanners(self, rule_path: Union[None, str, Path]) -> None:
         """Auxiliary method to fill rules, determine min_pattern_len and set scanners"""
         if rule_path is None:
@@ -125,21 +133,19 @@ def scan(self, provider: ContentProvider) -> List[Candidate]:
             matched_multi = target_line_stripped_len >= self.min_multi_len
 
             if not (matched_keyword or matched_pem_key or matched_pattern or matched_multi):
+                # the target can be skipped based on length alone, because not all rules have required_substrings
                 continue
 
             # use lower case for required substring
-            target_line_stripped_lower = target_line_stripped.lower()
+            target_line_stripped_lower = target.line_strip_lower
             # cached value to skip the same regex verifying
             matched_regex: Dict[re.Pattern, bool] = {}
 
             for rule, scanner in self.yield_rule_scanner(target_line_stripped_len, matched_pattern, matched_keyword,
                                                          matched_pem_key, matched_multi):
-                for substring in rule.required_substrings:
-                    if substring in target_line_stripped_lower:
-                        break
-                else:
-                    if rule.has_required_substrings:
-                        continue
+                if rule.has_required_substrings \
+                        and not self._substring_check(rule.required_substrings, target_line_stripped_lower):
+                    continue
 
                 # common regex might be triggered for the same target
                 if rule.required_regex: