Merge pull request #1927 from blacklanternsecurity/extractous

New module: Extractous
blacklanternsecurity · Nov 7, 2024 · 3aaccda · 3aaccda
2 parents 17a55eb + 8b9f03a
commit 3aaccda
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 128 deletions.
diff --git a/bbot/modules/unstructured.py → bbot/modules/extractous.py b/bbot/modules/unstructured.py → bbot/modules/extractous.py
@@ -1,9 +1,9 @@
-import os
+from extractous import Extractor
 
 from bbot.modules.base import BaseModule
 
 
-class unstructured(BaseModule):
+class extractous(BaseModule):
     watched_events = ["FILESYSTEM"]
     produced_events = ["RAW_TEXT"]
     flags = ["passive", "safe"]
@@ -63,15 +63,11 @@ class unstructured(BaseModule):
         "extensions": "File extensions to parse",
     }
 
-    deps_apt = ["libmagic-dev", "poppler-utils", "tesseract-ocr", "libreoffice", "pandoc"]
-    deps_pip = ["unstructured[all-docs]>=0.15.7,<1.0", "nltk>=3.9.0,<4.0"]
-
+    deps_pip = ["extractous"]
     scope_distance_modifier = 1
 
     async def setup(self):
         self.extensions = list(set([e.lower().strip(".") for e in self.config.get("extensions", [])]))
-        # Do not send user statistics to the unstructured library
-        os.environ["SCARF_NO_ANALYTICS"] = "true"
         return True
 
     async def filter_event(self, event):
@@ -94,22 +90,16 @@ async def handle_event(self, event):
             )
             await self.emit_event(raw_text_event)
 
-    async def finish(self):
-        del os.environ["SCARF_NO_ANALYTICS"]
-        return
-
 
 def extract_text(file_path):
     """
-    extract_text Extracts plaintext from a document path using unstructured.
+    extract_text Extracts plaintext from a document path using extractous.
 
     :param file_path: The path of the file to extract text from.
     :return: ASCII-encoded plaintext extracted from the document.
     """
 
-    from unstructured.partition.auto import partition
-
-    unstructured_file_types = [
+    extractable_file_types = [
         ".csv",
         ".eml",
         ".msg",
@@ -135,12 +125,21 @@ def extract_text(file_path):
         ".xml",
     ]
 
-    # If the file can be extracted with unstructured use its partition function or try and read it
-    if any(file_path.lower().endswith(file_type) for file_type in unstructured_file_types):
+    # If extractous supports the file type, extract with its Extractor; on error, fall back to a raw read
+    if any(file_path.lower().endswith(file_type) for file_type in extractable_file_types):
         try:
-            elements = partition(filename=file_path)
-            return "\n\n".join(element.text for element in elements)
-        except ValueError:
+            extractor = Extractor()
+            reader = extractor.extract_file(str(file_path))
+
+            result = ""
+            buffer = reader.read(4096)
+            while len(buffer) > 0:
+                result += buffer.decode("utf-8")
+                buffer = reader.read(4096)
+
+            return result.strip()
+
+        except Exception:
             with open(file_path, "rb") as file:
                 return file.read().decode("utf-8", errors="ignore")
     else:

diff --git a/bbot/test/test_step_2/module_tests/test_module_excavate.py b/bbot/test/test_step_2/module_tests/test_module_excavate.py
@@ -894,7 +894,7 @@ def check(self, module_test, events):
 
 class TestExcavateRAWTEXT(ModuleTestBase):
     targets = ["http://127.0.0.1:8888/", "test.notreal"]
-    modules_overrides = ["excavate", "httpx", "filedownload", "unstructured"]
+    modules_overrides = ["excavate", "httpx", "filedownload", "extractous"]
     config_overrides = {"scope": {"report_distance": 1}, "web": {"spider_distance": 2, "spider_depth": 2}}
 
     pdf_data = r"""%PDF-1.3
@@ -965,7 +965,7 @@ class TestExcavateRAWTEXT(ModuleTestBase):
 startxref
 1669
 %%EOF"""
-    unstructured_response = """This is an email [email protected]
+    extractous_response = """This is an email [email protected]
 
 An example JWT eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c
 
@@ -995,13 +995,13 @@ def check(self, module_test, events):
         raw_text_events = [e for e in events if e.type == "RAW_TEXT"]
         assert 1 == len(raw_text_events), "Failed to emit RAW_TEXT event"
         assert (
-            raw_text_events[0].data == self.unstructured_response
+            raw_text_events[0].data == self.extractous_response
         ), f"Text extracted from PDF is incorrect, got {raw_text_events[0].data}"
         email_events = [e for e in events if e.type == "EMAIL_ADDRESS"]
         assert 1 == len(email_events), "Failed to emit EMAIL_ADDRESS event"
         assert (
             email_events[0].data == "[email protected]"
-        ), f"Email extracted from unstructured text is incorrect, got {email_events[0].data}"
+        ), f"Email extracted from extractous text is incorrect, got {email_events[0].data}"
         finding_events = [e for e in events if e.type == "FINDING"]
         assert 2 == len(finding_events), "Failed to emit FINDING events"
         assert any(
@@ -1026,7 +1026,7 @@ def check(self, module_test, events):
         url_events = [e.data for e in events if e.type == "URL_UNVERIFIED"]
         assert (
             "https://www.test.notreal/about" in url_events
-        ), f"URL extracted from unstructured text is incorrect, got {url_events}"
+        ), f"URL extracted from extractous text is incorrect, got {url_events}"
         assert (
             "/donot_detect.js" not in url_events
-        ), f"URL extracted from unstructured text is incorrect, got {url_events}"
+        ), f"URL extracted from extractous text is incorrect, got {url_events}"