-
-
Notifications
You must be signed in to change notification settings - Fork 626
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1927 from blacklanternsecurity/extractous
New module: Extractous
- Loading branch information
Showing
4 changed files
with
79 additions
and
128 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -894,7 +894,7 @@ def check(self, module_test, events): | |
|
||
class TestExcavateRAWTEXT(ModuleTestBase): | ||
targets = ["http://127.0.0.1:8888/", "test.notreal"] | ||
modules_overrides = ["excavate", "httpx", "filedownload", "unstructured"] | ||
modules_overrides = ["excavate", "httpx", "filedownload", "extractous"] | ||
config_overrides = {"scope": {"report_distance": 1}, "web": {"spider_distance": 2, "spider_depth": 2}} | ||
|
||
pdf_data = r"""%PDF-1.3 | ||
|
@@ -965,7 +965,7 @@ class TestExcavateRAWTEXT(ModuleTestBase): | |
startxref | ||
1669 | ||
%%EOF""" | ||
unstructured_response = """This is an email [email protected] | ||
extractous_response = """This is an email [email protected] | ||
An example JWT eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c | ||
|
@@ -995,13 +995,13 @@ def check(self, module_test, events): | |
raw_text_events = [e for e in events if e.type == "RAW_TEXT"] | ||
assert 1 == len(raw_text_events), "Failed to emit RAW_TEXT event" | ||
assert ( | ||
raw_text_events[0].data == self.unstructured_response | ||
raw_text_events[0].data == self.extractous_response | ||
), f"Text extracted from PDF is incorrect, got {raw_text_events[0].data}" | ||
email_events = [e for e in events if e.type == "EMAIL_ADDRESS"] | ||
assert 1 == len(email_events), "Failed to emit EMAIL_ADDRESS event" | ||
assert ( | ||
email_events[0].data == "[email protected]" | ||
), f"Email extracted from unstructured text is incorrect, got {email_events[0].data}" | ||
), f"Email extracted from extractous text is incorrect, got {email_events[0].data}" | ||
finding_events = [e for e in events if e.type == "FINDING"] | ||
assert 2 == len(finding_events), "Failed to emit FINDING events" | ||
assert any( | ||
|
@@ -1026,7 +1026,7 @@ def check(self, module_test, events): | |
url_events = [e.data for e in events if e.type == "URL_UNVERIFIED"] | ||
assert ( | ||
"https://www.test.notreal/about" in url_events | ||
), f"URL extracted from unstructured text is incorrect, got {url_events}" | ||
), f"URL extracted from extractous text is incorrect, got {url_events}" | ||
assert ( | ||
"/donot_detect.js" not in url_events | ||
), f"URL extracted from unstructured text is incorrect, got {url_events}" | ||
), f"URL extracted from extractous text is incorrect, got {url_events}" |
Oops, something went wrong.