
Commit 19ad941

fix repetitive pattern extraction (#109)
* fix repetitive pattern extraction #108
* add `--ignore_extraction_boundary` #109
* Update README.md
* adding tests
* fix subdomain-hostname must not have tld; fix issue causing any item with same boundary as the file not getting extracted by adding `\n` at start and end of file before splitting
* docs updates

Co-authored-by: David G <[email protected]>
1 parent 494c140 commit 19ad941

12 files changed: +109 −41 lines

README.md (+3 −2)

```diff
@@ -83,12 +83,13 @@ The following arguments are available:
 
 How the extractions are performed
 
-* `--use_extractions` (REQUIRED): if you only want to use certain extraction types, you can pass their slug found in either `includes/ai/config.yaml`, `includes/lookup/config.yaml` `includes/pattern/config.yaml` (e.g. `pattern_ipv4_address_only`). Default if not passed, no extractions applied.
-  * Important: if using any AI extractions, you must set an OpenAI API key in your `.env` file
+* `--use_extractions` (REQUIRED): if you only want to use certain extraction types, you can pass their slug found in either `includes/ai/config.yaml`, `includes/lookup/config.yaml` `includes/pattern/config.yaml` (e.g. `pattern_ipv4_address_only`). Default if not passed, no extractions applied. You can also pass a catch all wildcard `*` which will match all extraction paths (e.g. `pattern_*` would run all extractions starting with `pattern_`)
+  * Important: if using any AI extractions (`ai_*`), you must set an AI API key in your `.env` file
   * Important: if you are using any MITRE ATT&CK, CAPEC, CWE, ATLAS or Location extractions you must set `CTIBUTLER` or NVD CPE or CVE extractions you must set `VULMATCH` settings in your `.env` file
 * `--relationship_mode` (REQUIRED): either.
   * `ai`: AI provider must be enabled. extractions performed by either regex or AI for extractions user selected. Rich relationships created from AI provider from extractions.
   * `standard`: extractions performed by either regex or AI (AI provider must be enabled) for extractions user selected. Basic relationships created from extractions back to master Report object generated.
+* `--ignore_extraction_boundary` (OPTIONAL, default `false`, not compatible with AI extractions): in some cases the same string will create multiple extractions depending on extractions set (e.g. `https://www.google.com/file.txt` could create a url, url with file, domain, subdomain, and file). The default behaviour is for txt2stix to take the longest extraction and ignore everything else (e.g. only extract url with file, and ignore url, file, domain, subdomain, and file). If you want to override this behaviour and get all extractions in the output, set this flag to `true`.
 * `--ignore_image_refs` (default `true`): images references in documents don't usually need extracting. e.g. `<img src="https://example.com/image.png" alt="something">` you would not want domain or file extractions extracting `example.com` and `image.png`. Hence these are ignored by default (they are removed from text sent to extraction). Note, only the `img src` is ignored, all other values e.g. `alt` are considered. If you want extractions to consider this data, set it to `false`
 * `--ignore_link_refs` (default `true`): link references in documents don't usually need extracting e.g. `<a href="https://example.com/link.html" title="something">Bad Actor</a>` you would only want `Bad actor` to be considered for extraction. Hence these part of the link are ignored by default (they are removed from text sent to extraction). Note, only the `a href` is ignored, all other values e.g. `title` are considered. Setting this to `false` will also include everything inside the link tag (e.g. `example.com` would extract as a domain)
```
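The default boundary behaviour described for `--ignore_extraction_boundary` can be sketched in a few lines. This is an illustration only — `dedupe_by_boundary` and the candidate list are invented for the example, not the actual txt2stix implementation:

```python
# Illustrative sketch of the default boundary rule (invented helper, not
# the actual txt2stix code): among overlapping candidate extractions,
# keep the longest one; with ignore_boundary=True, keep everything.
def dedupe_by_boundary(candidates, ignore_boundary=False):
    if ignore_boundary:
        return sorted(candidates)
    kept = []
    # prefer longer matches first; ties broken by earlier start
    for start, value in sorted(candidates, key=lambda c: (-len(c[1]), c[0])):
        end = start + len(value)
        # keep only if it does not overlap anything already kept
        if all(end <= s or start >= s + len(v) for s, v in kept):
            kept.append((start, value))
    return sorted(kept)

# candidate extractions for "https://www.google.com/file.txt"
candidates = [
    (0, "https://www.google.com/file.txt"),  # url with file
    (0, "https://www.google.com"),           # url
    (8, "www.google.com"),                   # subdomain
    (12, "google.com"),                      # domain
    (23, "file.txt"),                        # file
]
print(dedupe_by_boundary(candidates))
# -> [(0, 'https://www.google.com/file.txt')] : longest extraction wins
print(len(dedupe_by_boundary(candidates, ignore_boundary=True)))
# -> 5 : all extractions kept
```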

tests/data/manually_generated_reports/test_extraction_boundary.txt (+1 −0)

```diff
@@ -0,0 +1 @@
+https://subdomain.google.com/file.txt
```

tests/manual-tests/cases-standard-tests.md (+30 −1)

````diff
@@ -321,6 +321,35 @@ python3 txt2stix.py \
   --report_id 8cf2590e-f7b8-40c6-99cd-4aad9fbdc8bd
 ```
 
+### extraction boundary tests
+
+Should create `pattern_url_file` extraction as boundary observed
+
+```shell
+python3 txt2stix.py \
+  --relationship_mode standard \
+  --input_file tests/data/manually_generated_reports/test_extraction_boundary.txt \
+  --name 'extraction boundary tests 1' \
+  --tlp_level clear \
+  --confidence 100 \
+  --use_extractions 'pattern_*' \
+  --report_id f6d8800b-9708-4c74-aa1b-7a59d3c79d79
+```
+
+Should create all extractions;
+
+```shell
+python3 txt2stix.py \
+  --relationship_mode standard \
+  --input_file tests/data/manually_generated_reports/test_extraction_boundary.txt \
+  --name 'extraction boundary tests 1' \
+  --tlp_level clear \
+  --confidence 100 \
+  --ignore_extraction_boundary true \
+  --use_extractions 'pattern_*' \
+  --report_id 0f5b1afd-c468-49a2-9896-6910b7f124dd
+```
+
 ### disarm demo
 
 ```shell
@@ -333,4 +362,4 @@ python3 txt2stix.py \
   --confidence 100 \
   --use_extractions lookup_disarm_name \
   --report_id 8cb2dbf0-136f-4ecb-995c-095496e22abc
-```
+```
````

txt2stix/extractions.py (+6 −1)

```diff
@@ -1,6 +1,10 @@
-from typing import Any
+from typing import Any, Type
 import yaml
 from pathlib import Path
+
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    import txt2stix.pattern.extractors.base_extractor
 from .common import NamedDict
 
 class Extractor(NamedDict):
@@ -19,6 +23,7 @@ class Extractor(NamedDict):
     prompt_negative_examples = None
     stix_mapping = None
     prompt_extraction_extra = None
+    pattern_extractor : 'Type[txt2stix.pattern.extractors.base_extractor.BaseExtractor]' = None
 
 
     def __init__(self, key, dct, include_path=None, test_cases: dict[str, list[str]]=None):
```
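The `TYPE_CHECKING` guard added above is a standard trick for annotating an attribute with a type from a module that would otherwise create a circular (or costly) import. A minimal self-contained sketch — the module and class names here are hypothetical, not the txt2stix ones:

```python
from typing import TYPE_CHECKING, Optional, Type

if TYPE_CHECKING:
    # evaluated only by static type checkers (mypy/pyright), never at
    # runtime, so a circular or heavy import here costs nothing when run
    from some.heavy.module import BaseThing  # hypothetical module

class Config:
    # the annotation is a string literal, so BaseThing is never looked up
    # at runtime; the attribute simply defaults to None
    handler: 'Optional[Type[BaseThing]]' = None

print(Config.handler)  # -> None, and some.heavy.module was never imported
```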

txt2stix/pattern/extractors/base_extractor.py (+26 −19)

```diff
@@ -19,7 +19,7 @@ class BaseExtractor:
     version = None
     stix_mapping = None
     invalid_characters = ['.', ',', '!', '`', '(', ')', '{', '}', '"', '````', ' ', '[', ']']
-    SPLITS_FINDER = re.compile(r'[\'"<\(\{\[\s].*?[\)\s\]\}\)>"\']') #split on boundary characters instead of ' ' only
+    SPLITS_FINDER = re.compile(r'[\'"<\(\{\[\s](?P<item>.*?)[\)\s\]\}\)>"\']') #split on boundary characters instead of ' ' only
 
 
     @classmethod
@@ -40,44 +40,41 @@ def extract_extraction_from_text(cls, text: str):
         start_index = 0
         if cls.extraction_regex is not None:
             if cls.extraction_regex.startswith("^") or cls.extraction_regex.endswith("$"):
-                for word in cls.split_all(text):
-                    end_index = start_index + len(word) - 1
+                for matchsplit in cls.SPLITS_FINDER.finditer(text):
+                    word = matchsplit.group('item')
+                    start_index = matchsplit.start('item')
                     match = re.match(cls.extraction_regex, word)
                     if match:
-                        extracted_observables.append((match.group(0), start_index))
+                        extracted_observables.append((match.group(0), match.start()+start_index))
                     else:
                         stripped_word = word.strip(cls.common_strip_elements)
                         match = re.match(cls.extraction_regex, stripped_word)
                         if match:
-                            extracted_observables.append((match.group(0), start_index))
-                    start_index = end_index + 2 # Adding 1 for the space and 1 for the next word's starting index
+                            extracted_observables.append((match.group(0), start_index + word.index(stripped_word)))
             else:
                 # Find regex in the entire text (including whitespace)
                 for match in re.finditer(cls.extraction_regex, text):
-                    match = match.group().strip('\n')
-                    end_index = start_index + len(match) - 1
+                    match_value = match.group().strip('\n')
+                    start_index, end_index = match.span()
 
-                    extracted_observables.append((match, start_index))
-                    start_index = end_index + 2 # Adding 1 for the space and 1 for the next word's starting index
+                    extracted_observables.append((match_value, start_index))
 
         # If extraction_function is not None, then find matches that don't throw exception when
         elif cls.extraction_function is not None:
 
             start_index = 0
 
-            words = cls.SPLITS_FINDER.findall(text)
-            for word in words:
+            for match in cls.SPLITS_FINDER.finditer(text):
+                word = match.group('item')
                 end_index = start_index + len(word) - 1
 
                 word = BaseExtractor.trim_invalid_characters(word, cls.invalid_characters)
                 try:
                     if cls.extraction_function(word):
-                        extracted_observables.append((word, start_index))
+                        extracted_observables.append((word, match.start('item')))
                 except Exception as error:
                     pass
 
-                start_index = end_index + 2 # Adding 1 for the space and 1 for the next word's starting index
-
         else:
             raise ValueError("Both extraction_regex and extraction_function can't be None.")
 
@@ -93,15 +90,25 @@ def extract_extraction_from_text(cls, text: str):
 
         response = []
 
-        for extraction, positions in string_positions.items():
+        # for extraction, positions in string_positions.items():
+        #     response.append({
+        #         "value": extraction,
+        #         "type": cls.name,
+        #         "version": cls.version,
+        #         "stix_mapping": cls.stix_mapping,
+        #         "start_index": positions,
+        #     })
+
+        for position, (string, pos) in enumerate(extracted_observables, 1):
+            if cls.filter_function and not cls.filter_function(string):
+                continue
             response.append({
-                "value": extraction,
+                "value": string,
                 "type": cls.name,
                 "version": cls.version,
                 "stix_mapping": cls.stix_mapping,
-                "start_index": positions,
+                "start_index": pos,
             })
-
         return response
 
     @staticmethod
```
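The key change in `extract_extraction_from_text` is positional: capturing each token in a named group lets `finditer()` report the token's true offset in the text, replacing the old `start_index = end_index + 2` bookkeeping, which drifted whenever tokens were separated by anything other than a single space. A simplified demonstration — the regex below is a deliberately simpler tokenizer than `SPLITS_FINDER`:

```python
import re

# simplified stand-in for SPLITS_FINDER: capture each token in a named
# group so finditer() reports its true position in the text
TOKEN = re.compile(r'(?P<item>\S+)')

text = "\nip: 10.0.0.1,   url https://example.com\n"
tokens = [(m.group('item'), m.start('item')) for m in TOKEN.finditer(text)]
print(tokens)
# -> [('ip:', 1), ('10.0.0.1,', 5), ('url', 17), ('https://example.com', 21)]
# the offsets stay correct even across the run of three spaces, where a
# "previous end + 2" counter would have drifted
```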

txt2stix/pattern/extractors/domain/hostname_extractor.py (+5 −2)

```diff
@@ -1,5 +1,8 @@
 from tld import get_tld
 
+from txt2stix.utils import validate_file_mimetype
+from ..helper import TLDs
+
 from ..base_extractor import BaseExtractor
 
 
@@ -29,5 +32,5 @@ def filter_function(domain):
         if domain.count('.') <= 1:
             tld = get_tld(domain, fix_protocol=True, fail_silently=True)
             if not tld:
-                return True
-        return False
+                return not validate_file_mimetype(domain)
+        return False
```

txt2stix/pattern/extractors/domain/sub_domain_extractor.py (+8 −1)

```diff
@@ -1,5 +1,8 @@
 from tld import get_tld
 
+from txt2stix.utils import validate_file_mimetype
+from ..helper import TLDs
+
 from ..base_extractor import BaseExtractor
 
 
@@ -39,4 +42,8 @@ def filter_function(domain):
 
 class HostNameSubdomainExtractor(SubDomainExtractor):
     name = "pattern_host_name_subdomain"
-    filter_function = lambda domain: domain.count('.') >= 2
+    filter_function = lambda domain: domain.count('.') >= 2 and get_tld(domain, fail_silently=True) not in TLDs
+
+    def filter_function(domain):
+        tld = get_tld(domain, fail_silently=True, fix_protocol=True)
+        return domain.count('.') >= 2 and not tld and not validate_file_mimetype(domain)
```

txt2stix/pattern/extractors/helper.py (+20 −7)

```diff
@@ -8,7 +8,8 @@
 from .base_extractor import ALL_EXTRACTORS
 
 from ...extractions import Extractor
-from ...utils import read_included_file
+from ...utils import FILE_EXTENSIONS, read_included_file, TLDs
+
 
 
 def read_text_file(file_path):
@@ -48,7 +49,7 @@ def check_false_positive_domain(domain):
         bool: True if the domain is not a false positive, False otherwise.
     """
     file_extension = domain.split(".")[-1]
-    if file_extension in FILE_EXTENSION:
+    if file_extension in FILE_EXTENSIONS:
         return False
     else:
         return True
@@ -65,15 +66,27 @@ def load_extractor(extractor):
     extractor.pattern_extractor.stix_mapping = extractor.stix_mapping
 
 
-def extract_all(extractors :list[Extractor], input_text):
+def extract_all(extractors :list[Extractor], input_text, ignore_extraction_boundary=False):
     logging.info("using pattern extractors")
     pattern_extracts = []
     for extractor in extractors:
         load_extractor(extractor)
         extracts = extractor.pattern_extractor().extract_extraction_from_text(input_text)
         pattern_extracts.extend(extracts)
-    return pattern_extracts
-
 
-FILE_EXTENSION = read_included_file('lookups/extensions.txt')
-TLD = read_included_file('lookups/tld.txt')
+    pattern_extracts.sort(key=lambda ex: (ex['start_index'], len(ex['value'])))
+    retval = {}
+    end = 0
+    for raw_extract in pattern_extracts:
+        start_index = raw_extract['start_index']
+        key = (raw_extract['type'], raw_extract['value'])
+        if ignore_extraction_boundary or start_index >= end:
+            extraction = retval.setdefault(key, {**raw_extract, "start_index":[start_index]})
+            if start_index not in extraction['start_index']:
+                extraction['start_index'].append(start_index)
+            end = start_index + len(raw_extract['value'])
+    return list(retval.values())
+
+
+# FILE_EXTENSION = read_included_file('lookups/extensions.txt')
+# TLD = read_included_file('lookups/tld.txt')
```

txt2stix/pattern/extractors/others/cve_extractor.py (+1 −1)

```diff
@@ -11,4 +11,4 @@ class CVEExtractor(BaseExtractor):
     """
 
     name = "pattern_cve_id"
-    extraction_regex = r'^CVE-\d{4}-(?:\d{4}|\d{5})$'
+    extraction_regex = r'^CVE-\d{4}-(?:\d{4,6})$'
```
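The tightened CVE pattern can be checked standalone. Note the behavioural change: `\d{4,6}` also accepts 6-digit sequence numbers, which the old `\d{4}|\d{5}` alternation rejected; `CVE-2021-44228` (Log4Shell) exercises the 5-digit case, and the 6-digit ID below is an invented example:

```python
import re

CVE = re.compile(r'^CVE-\d{4}-(?:\d{4,6})$')  # the new pattern from the diff

for candidate in ["CVE-2021-44228", "CVE-2024-123456", "CVE-2020-123"]:
    print(candidate, bool(CVE.match(candidate)))
# CVE-2021-44228 True   (5-digit sequence)
# CVE-2024-123456 True  (6-digit sequence, newly accepted)
# CVE-2020-123 False    (sequence too short)
```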

txt2stix/pattern/extractors/others/email_extractor.py (+2 −2)

```diff
@@ -1,5 +1,5 @@
 from ..base_extractor import BaseExtractor
-from ..helper import TLD
+from ..helper import TLDs
 
 
 class EmailAddressExtractor(BaseExtractor):
@@ -17,5 +17,5 @@ class EmailAddressExtractor(BaseExtractor):
     def filter_function(email):
         x = email.split("@")
         domain = x[-1].split(".")[-1]
-        if domain in TLD:
+        if domain in TLDs:
             return True
```

txt2stix/txt2stix.py (+5 −3)

```diff
@@ -141,6 +141,7 @@ def parse_args():
     parser.add_argument("--external_refs", type=parse_ref, help="pass additional `external_references` entry (or entries) to the report object created. e.g --external_ref author=dogesec link=https://dkjjadhdaj.net", default=[], metavar="{source_name}={external_id}", action="extend", nargs='+')
     parser.add_argument('--ignore_image_refs', default=True, type=parse_bool)
     parser.add_argument('--ignore_link_refs', default=True, type=parse_bool)
+    parser.add_argument("--ignore_extraction_boundary", default=False, type=parse_bool, help="default if not passed is `false`, but if set to `true` will ignore boundary capture logic for extractions")
 
     args = parser.parse_args()
     if not args.input_file.exists():
@@ -176,9 +177,10 @@ def log_notes(content, type):
     logging.debug(json.dumps(content, sort_keys=True, indent=4))
     logging.debug(f" ========================= {'-'*len(type)} ========================= ")
 
-def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extractors: list[BaseAIExtractor]=[]):
+def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extractors: list[BaseAIExtractor]=[], **kwargs):
     assert ai_extractors or not extractors_map.get("ai"), "There should be at least one AI extractor in ai_extractors"
 
+    text_content = "\n"+text_content+"\n"
     all_extracts = dict()
     if extractors_map.get("lookup"):
         try:
@@ -191,7 +193,7 @@ def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extra
     if extractors_map.get("pattern"):
         try:
             logging.info("using pattern extractors")
-            pattern_extracts = pattern.extract_all(extractors_map["pattern"].values(), text_content)
+            pattern_extracts = pattern.extract_all(extractors_map["pattern"].values(), text_content, ignore_extraction_boundary=kwargs.get('ignore_extraction_boundary', False))
             bundler.process_observables(pattern_extracts)
             all_extracts["pattern"] = pattern_extracts
         except BaseException as e:
@@ -256,7 +258,7 @@ def main():
     if args.relationship_mode == "ai":
         validate_token_count(int(os.environ["INPUT_TOKEN_LIMIT"]), preprocessed_text, [args.ai_settings_relationships])
 
-    all_extracts = extract_all(bundler, args.use_extractions, preprocessed_text, ai_extractors=args.ai_settings_extractions)
+    all_extracts = extract_all(bundler, args.use_extractions, preprocessed_text, ai_extractors=args.ai_settings_extractions, ignore_extraction_boundary=args.ignore_extraction_boundary)
     extracted_relationships = None
     if args.relationship_mode == "ai" and sum(map(lambda x: len(x), all_extracts.values())):
         extracted_relationships = extract_relationships_with_ai(bundler, preprocessed_text, all_extracts, args.ai_settings_relationships)
```
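The `text_content = "\n"+text_content+"\n"` line is the fix the commit message describes: boundary-based splitting needs a boundary character on both sides of a value, so a value sharing its boundary with the file itself (flush against the start or end of the input) was never captured. A self-contained illustration — the regex below is a simplified stand-in, not the real `SPLITS_FINDER`:

```python
import re

# simplified boundary splitter: a token must be enclosed by whitespace/quotes
FINDER = re.compile(r'[\s"](?P<item>\S+)[\s"]')

text = "evil.com"  # the whole file is a single indicator

print([m.group('item') for m in FINDER.finditer(text)])
# -> [] : no surrounding boundary characters, so the value is missed

print([m.group('item') for m in FINDER.finditer("\n" + text + "\n")])
# -> ['evil.com'] : padding with "\n" supplies the missing boundaries
```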

txt2stix/utils.py (+2 −2)

```diff
@@ -58,7 +58,7 @@ def read_included_file(path):
 
 def validate_tld(domain):
     extracted_domain = tldextract.extract(domain)
-    return extracted_domain.suffix in TLDS
+    return extracted_domain.suffix in TLDs
 
 def validate_reg_key(reg_key):
     reg_key = reg_key.lower()
@@ -71,6 +71,6 @@ def validate_file_mimetype(file_name):
     _, ext = os.path.splitext(file_name)
     return FILE_EXTENSIONS.get(ext)
 
-TLDS = [tld.lower() for tld in read_included_file('helpers/tlds.txt').splitlines()]
+TLDs = [tld.lower() for tld in read_included_file('helpers/tlds.txt').splitlines()]
 REGISTRY_PREFIXES = [key.lower() for key in read_included_file('helpers/windows_registry_key_prefix.txt').splitlines()]
 FILE_EXTENSIONS = dict(line.lower().split(',') for line in read_included_file('helpers/mimetype_filename_extension_list.csv').splitlines())
```
