-
Notifications
You must be signed in to change notification settings - Fork 43
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1068 from MichaelDecent/comm_pkg5
add TextBlob Parser swarmauri
- Loading branch information
Showing
8 changed files
with
255 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Swarmauri Example Community Package |
59 changes: 59 additions & 0 deletions
59
pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
[tool.poetry] | ||
name = "swarmauri_parser_communitytextblob" | ||
version = "0.6.0.dev1" | ||
description = "TextBlob Parser for Swarmauri." | ||
authors = ["Jacob Stewart <[email protected]>"] | ||
license = "Apache-2.0" | ||
readme = "README.md" | ||
repository = "http://github.com/swarmauri/swarmauri-sdk" | ||
classifiers = [ | ||
"License :: OSI Approved :: Apache Software License", | ||
"Programming Language :: Python :: 3.10", | ||
"Programming Language :: Python :: 3.11", | ||
"Programming Language :: Python :: 3.12" | ||
] | ||
|
||
[tool.poetry.dependencies] | ||
python = ">=3.10,<3.13" | ||
|
||
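# Swarmauri | ||
# NOTE(review): the parser modules import swarmauri_standard (Document), which is not declared in this section - confirm and add it as a dependency. | ||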
swarmauri_core = { path = "../../core" } | ||
swarmauri_base = { path = "../../base" } | ||
|
||
# Dependencies | ||
textblob = "^0.18.0" | ||
nltk = "^3.9.1" | ||
|
||
|
||
[tool.poetry.group.dev.dependencies] | ||
flake8 = "^7.0" | ||
pytest = "^8.0" | ||
pytest-asyncio = ">=0.24.0" | ||
pytest-xdist = "^3.6.1" | ||
pytest-json-report = "^1.5.0" | ||
python-dotenv = "*" | ||
requests = "^2.32.3" | ||
|
||
[build-system] | ||
requires = ["poetry-core>=1.0.0"] | ||
build-backend = "poetry.core.masonry.api" | ||
|
||
[tool.pytest.ini_options] | ||
norecursedirs = ["combined", "scripts"] | ||
|
||
markers = [ | ||
"test: standard test", | ||
"unit: Unit tests", | ||
"integration: Integration tests", | ||
"acceptance: Acceptance tests", | ||
"experimental: Experimental tests" | ||
] | ||
log_cli = true | ||
log_cli_level = "INFO" | ||
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s" | ||
log_cli_date_format = "%Y-%m-%d %H:%M:%S" | ||
asyncio_default_fixture_loop_scope = "function" | ||
|
||
[tool.poetry.plugins."swarmauri.parsers"] | ||
TextBlobNounParser = "swarmauri_parser_communitytextblob:TextBlobNounParser" | ||
TextBlobSentenceParser = "swarmauri_parser_communitytextblob:TextBlobSentenceParser" |
58 changes: 58 additions & 0 deletions
58
...armauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from textblob import TextBlob | ||
from typing import List, Union, Any, Literal | ||
from swarmauri_standard.documents.Document import Document | ||
from swarmauri_base.parsers.ParserBase import ParserBase | ||
|
||
|
||
class TextBlobNounParser(ParserBase):
    """
    Parser that extracts noun phrases from text using TextBlob.

    Each call to ``parse`` returns a single ``Document`` whose content is the
    original text and whose ``noun_phrases`` metadata entry lists the noun
    phrases TextBlob found in that text.
    """

    type: Literal["TextBlobNounParser"] = "TextBlobNounParser"

    def __init__(self, **kwargs):
        """
        Download the NLTK resources this parser relies on, then initialise the base class.

        Parameters:
        - **kwargs: Forwarded unchanged to ``ParserBase.__init__``.

        Raises:
        - RuntimeError: If importing NLTK, a download call, or the base-class
          initialisation raises; the original exception is attached as the cause.
        """
        try:
            import nltk

            # Download required NLTK data
            for resource in (
                "punkt",
                "averaged_perceptron_tagger",
                "brown",
                "wordnet",
                "punkt_tab",
            ):
                nltk.download(resource)
            super().__init__(**kwargs)
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise RuntimeError(f"Failed to initialize NLTK resources: {str(e)}") from e

    def parse(self, data: Union[str, Any]) -> List[Document]:
        """
        Extract noun phrases from the input text with TextBlob.

        Parameters:
        - data (Union[str, Any]): The input data to parse; must be a string.

        Returns:
        - List[Document]: A one-element list holding a document whose content is
          ``data`` and whose metadata maps ``"noun_phrases"`` to the extracted phrases.

        Raises:
        - ValueError: If ``data`` is not a string.
        - RuntimeError: If TextBlob processing or document creation fails; the
          original exception is attached as the cause.
        """
        # Ensure the data is a string
        if not isinstance(data, str):
            raise ValueError("TextBlobParser expects a string as input data.")

        try:
            # Noun-phrase extraction is the only TextBlob feature this parser uses.
            noun_phrases = list(TextBlob(data).noun_phrases)

            # Create document with extracted information
            return [Document(content=data, metadata={"noun_phrases": noun_phrases})]
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise RuntimeError(f"Error during text parsing: {str(e)}") from e
49 changes: 49 additions & 0 deletions
49
...uri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from textblob import TextBlob | ||
from typing import List, Union, Any, Literal | ||
from swarmauri_standard.documents.Document import Document | ||
from swarmauri_base.parsers.ParserBase import ParserBase | ||
|
||
|
||
class TextBlobSentenceParser(ParserBase):
    """
    A parser that leverages TextBlob to break text into sentences.

    Every sentence TextBlob identifies in the input becomes its own ``Document``.
    """

    type: Literal["TextBlobSentenceParser"] = "TextBlobSentenceParser"

    def __init__(self, **kwargs):
        """
        Download the NLTK ``punkt_tab`` resource, then initialise the base class.

        Args:
            **kwargs: Forwarded unchanged to ``ParserBase.__init__``.
        """
        import nltk

        nltk.download("punkt_tab")
        super().__init__(**kwargs)

    def parse(self, data: Union[str, Any]) -> List[Document]:
        """
        Parses the input text into sentence-based document chunks using TextBlob.

        Args:
            data (Union[str, Any]): The input text to be parsed. Non-string
                input is converted with ``str()`` before parsing.

        Returns:
            List[Document]: One document per sentence, in the order TextBlob
            yields them; each has the sentence text as content and
            ``{"parser": "TextBlobSentenceParser"}`` as metadata.
        """
        # Ensure the input is a string
        if not isinstance(data, str):
            data = str(data)

        # Utilize TextBlob for sentence tokenization
        sentences = TextBlob(data).sentences

        # Create a document instance for each sentence
        return [
            Document(
                content=str(sentence), metadata={"parser": "TextBlobSentenceParser"}
            )
            for sentence in sentences
        ]
16 changes: 16 additions & 0 deletions
16
...mmunity/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from .TextBlobNounParser import TextBlobNounParser | ||
from .TextBlobSentenceParser import TextBlobSentenceParser | ||
|
||
from importlib.metadata import PackageNotFoundError, version

# Read the version from the installed distribution metadata so it cannot drift
# from the value declared in pyproject.toml.
try:
    __version__ = version("swarmauri_parser_communitytextblob")
except PackageNotFoundError:
    # The distribution is not installed (e.g. running from a source checkout).
    __version__ = "0.0.0"

# Human-readable summary of the components this package provides.
__long_desc__ = """
# Swarmauri TextBlob Based Components
Components Included:
- TextBlobNounParser
- TextBlobSentenceParser
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk
"""
43 changes: 43 additions & 0 deletions
43
pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import pytest
# TextBlobNounParser is defined in the TextBlobNounParser module; importing it
# from TextBlobSentenceParser fails at import time.
from swarmauri_parser_communitytextblob.TextBlobNounParser import TextBlobNounParser as Parser
|
||
|
||
def setup_module(module):
    """Build one parser before any test runs; skip the whole module if that fails."""
    try:
        # Constructing a parser is what triggers the NLTK downloads.
        Parser()
    except Exception as exc:
        pytest.skip(f"Failed to initialize NLTK resources: {str(exc)}")
|
||
|
||
@pytest.fixture(scope="module")
def parser():
    """Module-scoped fixture: one parser instance shared by the tests in this module."""
    instance = Parser()
    return instance
|
||
|
||
@pytest.mark.unit
def test_ubc_resource(parser):
    """The parser's ``resource`` attribute is "Parser"."""
    expected_resource = "Parser"
    assert parser.resource == expected_resource
|
||
|
||
@pytest.mark.unit
def test_ubc_type(parser):
    """The parser's ``type`` attribute is "TextBlobNounParser"."""
    expected_type = "TextBlobNounParser"
    assert parser.type == expected_type
|
||
|
||
@pytest.mark.unit
def test_serialization(parser):
    """A JSON dump followed by validation yields a parser with the same ``id``."""
    serialized = parser.model_dump_json()
    restored = Parser.model_validate_json(serialized)
    assert parser.id == restored.id
|
||
|
||
@pytest.mark.unit
def test_parse(parser):
    """Parsing a sentence returns one document carrying the extracted noun phrases."""
    # No try/except wrapper: a blanket ``except Exception`` would also catch
    # AssertionError and replace pytest's detailed assertion report with a bare
    # message. Any exception raised here already fails the test.
    documents = parser.parse("One more large chapula please.")
    document = documents[0]
    assert document.resource == "Document"
    assert document.content == "One more large chapula please."
    assert document.metadata["noun_phrases"] == ["large chapula"]
23 changes: 23 additions & 0 deletions
23
...mmunity/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import pytest | ||
from swarmauri_parser_communitytextblob.TextBlobSentenceParser import TextBlobSentenceParser as Parser | ||
|
||
@pytest.mark.unit
def test_ubc_resource():
    """A freshly built parser has ``resource`` equal to 'Parser'."""
    instance = Parser()
    expected_resource = 'Parser'
    assert instance.resource == expected_resource
|
||
@pytest.mark.unit
def test_ubc_type():
    """A freshly built parser has ``type`` equal to 'TextBlobSentenceParser'."""
    instance = Parser()
    expected_type = 'TextBlobSentenceParser'
    assert instance.type == expected_type
|
||
@pytest.mark.unit
def test_serialization():
    """A JSON dump followed by validation yields a parser with the same ``id``."""
    instance = Parser()
    serialized = instance.model_dump_json()
    restored = Parser.model_validate_json(serialized)
    assert instance.id == restored.id
|
||
@pytest.mark.unit
def test_parse():
    """Parsing a one-sentence text yields a first document holding that sentence."""
    text = 'One more large chapula please.'
    first_document = Parser().parse(text)[0]
    assert first_document.resource == 'Document'
    assert first_document.content == 'One more large chapula please.'