add content-check output to data file #119 (#121)

fqrious · Fadl · web-flow · commit 6e5eaaefaea4 · 2025-02-17T14:50:19.000Z
Co-authored-by: Fadl <folawumi@dogesec.com>
diff --git a/txt2stix/txt2stix.py b/txt2stix/txt2stix.py
@@ -136,11 +136,6 @@ def parse_args():
 
     inf_arg  = parser.add_argument("--input_file", "--input-file", required=True, help="The file to be converted. Must be .txt", type=Path)
     parser.add_argument("--ai_check_content", required=False, type=parse_model, help="Use an AI model to check wether the content of the file contains threat intelligence. Paticularly useful to weed out vendor marketing.")
-    if (args := parser.parse_known_args()[0]) and args.ai_check_content:
-        model : BaseAIExtractor = args.ai_check_content
-        value = model.check_content(args.input_file.read_text())
-        print("check-content output:", value.model_dump_json())
-        exit(0)
     name_arg = parser.add_argument("--name", required=True, help="Name of the file, max 124 chars", default="stix-out")
     parser.add_argument("--created", required=False, default=datetime.now(), help="Allow user to optionally pass --created time in input, which will hardcode the time used in created times")
     parser.add_argument("--ai_settings_extractions", required=False, type=parse_model, help="(required if AI extraction enabled): passed in format provider:model e.g. openai:gpt4o. Can pass more than one value to get extractions from multiple providers.", metavar="provider[:model]", nargs='+', default=[parse_model('openai')])
@@ -266,29 +261,40 @@ def main():
         preprocessed_text = remove_links(input_text, args.ignore_image_refs, args.ignore_link_refs)
         load_env()
 
+        should_extract = True
+        content_check_output = None
+
+        if args.ai_check_content:
+            logging.info("checking content")
+            model : BaseAIExtractor = args.ai_check_content
+            content_check_output = model.check_content(args.input_file.read_text())
+            should_extract = content_check_output.describes_incident
+            logging.info("=== ai-check-content output ====")
+            logging.info(content_check_output.model_dump_json())
+
 
         bundler = txt2stixBundler(args.name, args.use_identity, args.tlp_level, input_text, args.confidence, args.all_extractors, args.labels, created=args.created, report_id=args.report_id, external_references=args.external_refs)
         log_notes(sys.argv, "Config")
         convo_str = None
 
         # ai_extractor_session = args.ai_model[0](args.ai_model[1])
-        if args.use_extractions.get("ai"):
-            validate_token_count(int(os.environ["INPUT_TOKEN_LIMIT"]), preprocessed_text, args.ai_settings_extractions)
-        if args.relationship_mode == "ai":
-            validate_token_count(int(os.environ["INPUT_TOKEN_LIMIT"]), preprocessed_text, [args.ai_settings_relationships])
-
-        all_extracts = extract_all(bundler, args.use_extractions, preprocessed_text, ai_extractors=args.ai_settings_extractions, ignore_extraction_boundary=args.ignore_extraction_boundary)
-        extracted_relationships = None
-        if args.relationship_mode == "ai" and sum(map(lambda x: len(x), all_extracts.values())):
-            extracted_relationships = extract_relationships_with_ai(bundler, preprocessed_text, all_extracts, args.ai_settings_relationships)
-            
-        # convo_str = ai_extractor_session.get_conversation() if ai_extractor_session and ai_extractor_session.initialized else ""
-        flow = None
-        if args.ai_create_attack_flow:
-            logging.info("creating attack-flow bundle")
-            ex: BaseAIExtractor = args.ai_settings_relationships
-            flow = ex.extract_attack_flow(input_text, all_extracts, extracted_relationships)
-            bundler.flow_objects = parse_flow(bundler.report, flow)
+
+        
+        all_extracts = flow = extracted_relationships = None
+        if should_extract:
+            if args.use_extractions.get("ai"):
+                validate_token_count(int(os.environ["INPUT_TOKEN_LIMIT"]), preprocessed_text, args.ai_settings_extractions)
+            if args.relationship_mode == "ai":
+                validate_token_count(int(os.environ["INPUT_TOKEN_LIMIT"]), preprocessed_text, [args.ai_settings_relationships])
+            all_extracts = extract_all(bundler, args.use_extractions, preprocessed_text, ai_extractors=args.ai_settings_extractions, ignore_extraction_boundary=args.ignore_extraction_boundary)
+            if args.relationship_mode == "ai" and sum(map(lambda x: len(x), all_extracts.values())):
+                extracted_relationships = extract_relationships_with_ai(bundler, preprocessed_text, all_extracts, args.ai_settings_relationships)
+                
+            if args.ai_create_attack_flow:
+                logging.info("creating attack-flow bundle")
+                ex: BaseAIExtractor = args.ai_settings_relationships
+                flow = ex.extract_attack_flow(input_text, all_extracts, extracted_relationships)
+                bundler.flow_objects = parse_flow(bundler.report, flow)
 
             
 
@@ -298,9 +304,10 @@ def main():
         output_path.write_text(out)
         logger.info(f"Wrote bundle output to `{output_path}`")
         data = {
+            "content-check": content_check_output and content_check_output.model_dump(),
             "extractions": all_extracts,
             "relationships": extracted_relationships,
-            "attack-flow": flow.model_dump(),
+            "attack-flow": flow and flow.model_dump(),
         }
         data_path = Path(str(output_path).replace('bundle--', 'data--'))
         data_path.write_text(json.dumps(data, indent=4))