Skip to content

Commit

Permalink
Merge pull request #1927 from blacklanternsecurity/extractous
Browse files Browse the repository at this point in the history
New module: Extractous
  • Loading branch information
TheTechromancer authored Nov 7, 2024
2 parents 17a55eb + 8b9f03a commit 3aaccda
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 128 deletions.
39 changes: 19 additions & 20 deletions bbot/modules/unstructured.py → bbot/modules/extractous.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os
from extractous import Extractor

from bbot.modules.base import BaseModule


class unstructured(BaseModule):
class extractous(BaseModule):
watched_events = ["FILESYSTEM"]
produced_events = ["RAW_TEXT"]
flags = ["passive", "safe"]
Expand Down Expand Up @@ -63,15 +63,11 @@ class unstructured(BaseModule):
"extensions": "File extensions to parse",
}

deps_apt = ["libmagic-dev", "poppler-utils", "tesseract-ocr", "libreoffice", "pandoc"]
deps_pip = ["unstructured[all-docs]>=0.15.7,<1.0", "nltk>=3.9.0,<4.0"]

deps_pip = ["extractous"]
scope_distance_modifier = 1

async def setup(self):
self.extensions = list(set([e.lower().strip(".") for e in self.config.get("extensions", [])]))
# Do not send user statistics to the unstructured library
os.environ["SCARF_NO_ANALYTICS"] = "true"
return True

async def filter_event(self, event):
Expand All @@ -94,22 +90,16 @@ async def handle_event(self, event):
)
await self.emit_event(raw_text_event)

async def finish(self):
del os.environ["SCARF_NO_ANALYTICS"]
return


def extract_text(file_path):
"""
extract_text Extracts plaintext from a document path using unstructured.
extract_text Extracts plaintext from a document path using extractous.
:param file_path: The path of the file to extract text from.
:return: Plaintext extracted from the document, decoded as UTF-8.
"""

from unstructured.partition.auto import partition

unstructured_file_types = [
extractable_file_types = [
".csv",
".eml",
".msg",
Expand All @@ -135,12 +125,21 @@ def extract_text(file_path):
".xml",
]

# If the file can be extracted with unstructured use its partition function or try and read it
if any(file_path.lower().endswith(file_type) for file_type in unstructured_file_types):
# If the file type is supported by extractous, extract its text with an Extractor; otherwise try to read it directly
if any(file_path.lower().endswith(file_type) for file_type in extractable_file_types):
try:
elements = partition(filename=file_path)
return "\n\n".join(element.text for element in elements)
except ValueError:
extractor = Extractor()
reader = extractor.extract_file(str(file_path))

result = ""
buffer = reader.read(4096)
while len(buffer) > 0:
result += buffer.decode("utf-8")
buffer = reader.read(4096)

return result.strip()

except Exception:
with open(file_path, "rb") as file:
return file.read().decode("utf-8", errors="ignore")
else:
Expand Down
12 changes: 6 additions & 6 deletions bbot/test/test_step_2/module_tests/test_module_excavate.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,7 +894,7 @@ def check(self, module_test, events):

class TestExcavateRAWTEXT(ModuleTestBase):
targets = ["http://127.0.0.1:8888/", "test.notreal"]
modules_overrides = ["excavate", "httpx", "filedownload", "unstructured"]
modules_overrides = ["excavate", "httpx", "filedownload", "extractous"]
config_overrides = {"scope": {"report_distance": 1}, "web": {"spider_distance": 2, "spider_depth": 2}}

pdf_data = r"""%PDF-1.3
Expand Down Expand Up @@ -965,7 +965,7 @@ class TestExcavateRAWTEXT(ModuleTestBase):
startxref
1669
%%EOF"""
unstructured_response = """This is an email [email protected]
extractous_response = """This is an email [email protected]
An example JWT eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c
Expand Down Expand Up @@ -995,13 +995,13 @@ def check(self, module_test, events):
raw_text_events = [e for e in events if e.type == "RAW_TEXT"]
assert 1 == len(raw_text_events), "Failed to emit RAW_TEXT event"
assert (
raw_text_events[0].data == self.unstructured_response
raw_text_events[0].data == self.extractous_response
), f"Text extracted from PDF is incorrect, got {raw_text_events[0].data}"
email_events = [e for e in events if e.type == "EMAIL_ADDRESS"]
assert 1 == len(email_events), "Failed to emit EMAIL_ADDRESS event"
assert (
email_events[0].data == "[email protected]"
), f"Email extracted from unstructured text is incorrect, got {email_events[0].data}"
), f"Email extracted from extractous text is incorrect, got {email_events[0].data}"
finding_events = [e for e in events if e.type == "FINDING"]
assert 2 == len(finding_events), "Failed to emit FINDING events"
assert any(
Expand All @@ -1026,7 +1026,7 @@ def check(self, module_test, events):
url_events = [e.data for e in events if e.type == "URL_UNVERIFIED"]
assert (
"https://www.test.notreal/about" in url_events
), f"URL extracted from unstructured text is incorrect, got {url_events}"
), f"URL extracted from extractous text is incorrect, got {url_events}"
assert (
"/donot_detect.js" not in url_events
), f"URL extracted from unstructured text is incorrect, got {url_events}"
), f"URL extracted from extractous text is incorrect, got {url_events}"
Loading

0 comments on commit 3aaccda

Please sign in to comment.