Skip to content

Commit

Permalink
Merge pull request #1976 from blacklanternsecurity/fix-extractous
Browse files Browse the repository at this point in the history
Update Extractous with new API changes
  • Loading branch information
TheTechromancer authored Nov 18, 2024
2 parents b1d8b94 + b14bad5 commit 3945fd1
Showing 1 changed file with 17 additions and 42 deletions.
59 changes: 17 additions & 42 deletions bbot/modules/extractous.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ async def filter_event(self, event):
async def handle_event(self, event):
file_path = event.data["path"]
content = await self.scan.helpers.run_in_executor_mp(extract_text, file_path)
if isinstance(content, tuple):
error, traceback = content
self.error(f"Error extracting text from {file_path}: {error}")
self.trace(traceback)
return

if content:
raw_text_event = self.make_event(
content,
Expand All @@ -99,49 +105,18 @@ def extract_text(file_path):
:return: ASCII-encoded plaintext extracted from the document.
"""

extractable_file_types = [
".csv",
".eml",
".msg",
".epub",
".xlsx",
".xls",
".html",
".htm",
".md",
".org",
".odt",
".pdf",
".txt",
".text",
".log",
".ppt",
".pptx",
".rst",
".rtf",
".tsv",
".doc",
".docx",
".xml",
]

# If the file can be extracted with extractous use its partition function or try and read it
if any(file_path.lower().endswith(file_type) for file_type in extractable_file_types):
try:
extractor = Extractor()
reader = extractor.extract_file(str(file_path))
try:
extractor = Extractor()
reader, metadata = extractor.extract_file(str(file_path))

result = ""
result = ""
buffer = reader.read(4096)
while len(buffer) > 0:
result += buffer.decode("utf-8")
buffer = reader.read(4096)
while len(buffer) > 0:
result += buffer.decode("utf-8")
buffer = reader.read(4096)

return result.strip()
return result.strip()
except Exception as e:
import traceback

except Exception:
with open(file_path, "rb") as file:
return file.read().decode("utf-8", errors="ignore")
else:
with open(file_path, "rb") as file:
return file.read().decode("utf-8", errors="ignore")
return (str(e), traceback.format_exc())

0 comments on commit 3945fd1

Please sign in to comment.