Merge pull request #1068 from MichaelDecent/comm_pkg5

add TextBlob Parser swarmauri
swarmauri · Jan 12, 2025 · d62287a · d62287a
2 parents e9fe1eb + 72d552e
commit d62287a
Show file tree

Hide file tree

Showing 8 changed files with 255 additions and 2 deletions.
diff --git a/pkgs/community/swarmauri_community/pyproject.toml b/pkgs/community/swarmauri_community/pyproject.toml
@@ -31,8 +31,12 @@ captcha = "^0.6.0"
 #gensim = { version = "^4.3.3", optional = true }
 #gradio = { version = "^5.4.0", optional = true }
 leptonai = { version = "^0.22.0", optional = true }
+
+neo4j = { version = "^5.25.0", optional = true }
+#nltk = { version = "^3.9.1", optional = true }
+
 #neo4j = { version = "^5.25.0", optional = true }
-nltk = { version = "^3.9.1", optional = true }
+
 pandas = "^2.2.3"
 psutil = { version = "^6.1.0", optional = true }
 pygithub = { version = "^2.4.0", optional = true }
@@ -49,7 +53,7 @@ pinecone-client = { version = "^5.0.1", optional = true, extras = ["grpc"] }
 pypdf = { version = "^5.0.1", optional = true }
 pypdftk = { version = "^0.5", optional = true }
 weaviate-client = { version = "^4.9.2", optional = true }
-textblob = { version = "^0.18.0", optional = true }
+#textblob = { version = "^0.18.0", optional = true }
 torch = { version = "^2.4.1", optional = true}
 scikit-learn = { version = "^1.5.2", optional = true }
 #protobuf = { version = "^3.20.0", optional = true }

diff --git a/pkgs/community/swarmauri_parser_communitytextblob/README.md b/pkgs/community/swarmauri_parser_communitytextblob/README.md
@@ -0,0 +1 @@
+# Swarmauri TextBlob Parser Community Package
diff --git a/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml b/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml
@@ -0,0 +1,59 @@
+[tool.poetry]
+name = "swarmauri_parser_communitytextblob"
+version = "0.6.0.dev1"
+description = "TextBlob Parser for Swarmauri."
+authors = ["Jacob Stewart <[email protected]>"]
+license = "Apache-2.0"
+readme = "README.md"
+repository = "http://github.com/swarmauri/swarmauri-sdk"
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12"
+]
+
+[tool.poetry.dependencies]
+python = ">=3.10,<3.13"
+
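+# Swarmauri (NOTE(review): the parser modules also import swarmauri_standard, which is not declared here — confirm)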
+swarmauri_core = { path = "../../core" }
+swarmauri_base = { path = "../../base" }
+
+# Dependencies
+textblob = "^0.18.0"
+nltk = "^3.9.1"
+
+
+[tool.poetry.group.dev.dependencies]
+flake8 = "^7.0"
+pytest = "^8.0"
+pytest-asyncio = ">=0.24.0"
+pytest-xdist = "^3.6.1"
+pytest-json-report = "^1.5.0"
+python-dotenv = "*"
+requests = "^2.32.3"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+norecursedirs = ["combined", "scripts"]
+
+markers = [
+    "test: standard test",
+    "unit: Unit tests",
+    "integration: Integration tests",
+    "acceptance: Acceptance tests",
+    "experimental: Experimental tests"
+]
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
+asyncio_default_fixture_loop_scope = "function"
+
+[tool.poetry.plugins."swarmauri.parsers"]
+TextBlobNounParser = "swarmauri_parser_communitytextblob:TextBlobNounParser"
+TextBlobSentenceParser = "swarmauri_parser_communitytextblob:TextBlobSentenceParser"
diff --git a/...armauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py b/...armauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py
@@ -0,0 +1,58 @@
+from textblob import TextBlob
+from typing import List, Union, Any, Literal
+from swarmauri_standard.documents.Document import Document
+from swarmauri_base.parsers.ParserBase import ParserBase
+
+
+class TextBlobNounParser(ParserBase):
+    """
+    A concrete implementation of IParser using TextBlob for Natural Language Processing tasks.
+
+    This parser leverages TextBlob's functionalities such as noun phrase extraction,
+    sentiment analysis, classification, language translation, and more for parsing texts.
+    """
+
+    type: Literal["TextBlobNounParser"] = "TextBlobNounParser"
+
+    def __init__(self, **kwargs):
+        try:
+            import nltk
+
+            # Download required NLTK data
+            nltk.download("punkt")
+            nltk.download("averaged_perceptron_tagger")
+            nltk.download("brown")
+            nltk.download("wordnet")
+            nltk.download("punkt_tab")
+            super().__init__(**kwargs)
+        except Exception as e:
+            raise RuntimeError(f"Failed to initialize NLTK resources: {str(e)}")
+
+    def parse(self, data: Union[str, Any]) -> List[Document]:
+        """
+        Parses the input data using TextBlob to perform basic NLP tasks
+        and returns a list of documents with the parsed information.
+
+        Parameters:
+        - data (Union[str, Any]): The input data to parse, expected to be text data for this parser.
+
+        Returns:
+        - List[IDocument]: A list of documents with metadata generated from the parsing process.
+        """
+        # Ensure the data is a string
+        if not isinstance(data, str):
+            raise ValueError("TextBlobParser expects a string as input data.")
+
+        try:
+            # Use TextBlob for NLP tasks
+            blob = TextBlob(data)
+
+            # Extracts noun phrases to demonstrate one of TextBlob's capabilities.
+            noun_phrases = list(blob.noun_phrases)
+
+            # Create document with extracted information
+            document = Document(content=data, metadata={"noun_phrases": noun_phrases})
+
+            return [document]
+        except Exception as e:
+            raise RuntimeError(f"Error during text parsing: {str(e)}")
diff --git a/...uri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py b/...uri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py
@@ -0,0 +1,49 @@
+from textblob import TextBlob
+from typing import List, Union, Any, Literal
+from swarmauri_standard.documents.Document import Document
+from swarmauri_base.parsers.ParserBase import ParserBase
+
+
+class TextBlobSentenceParser(ParserBase):
+    """
+    A parser that leverages TextBlob to break text into sentences.
+
+    This parser uses the natural language processing capabilities of TextBlob
+    to accurately identify sentence boundaries within large blocks of text.
+    """
+
+    type: Literal["TextBlobSentenceParser"] = "TextBlobSentenceParser"
+
+    def __init__(self, **kwargs):
+        import nltk
+
+        nltk.download("punkt_tab")
+        super().__init__(**kwargs)
+
+    def parse(self, data: Union[str, Any]) -> List[Document]:
+        """
+        Parses the input text into sentence-based document chunks using TextBlob.
+
+        Args:
+            data (Union[str, Any]): The input text to be parsed.
+
+        Returns:
+            List[IDocument]: A list of IDocument instances, each representing a sentence.
+        """
+        # Ensure the input is a string
+        if not isinstance(data, str):
+            data = str(data)
+
+        # Utilize TextBlob for sentence tokenization
+        blob = TextBlob(data)
+        sentences = blob.sentences
+
+        # Create a document instance for each sentence
+        documents = [
+            Document(
+                content=str(sentence), metadata={"parser": "TextBlobSentenceParser"}
+            )
+            for index, sentence in enumerate(sentences)
+        ]
+
+        return documents
diff --git a/...mmunity/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/__init__.py b/...mmunity/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/__init__.py
@@ -0,0 +1,16 @@
+from .TextBlobNounParser import TextBlobNounParser
+from .TextBlobSentenceParser import TextBlobSentenceParser
+
+# Package version string. NOTE(review): this package's pyproject.toml declares
+# version "0.6.0.dev1" — confirm which value is intended.
+__version__ = "0.6.0.dev26"
+# Long description text listing the parser components included in this package.
+__long_desc__ = """
+
+# Swarmauri TextBlob Based Components
+
+Components Included:
+- TextBlobNounParser
+- TextBlobSentenceParser
+
+Follow us at: https://github.com/swarmauri
+Star us at: https://github.com/swarmauri/swarmauri-sdk
+
+"""
diff --git a/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py
@@ -0,0 +1,43 @@
+import pytest
+from swarmauri_parser_communitytextblob.TextBlobSentenceParser import TextBlobNounParser as Parser
+
+
+def setup_module(module):
+    """Setup any state specific to the execution of the given module."""
+    try:
+        # Initialize a parser to trigger NLTK downloads
+        Parser()
+    except Exception as e:
+        pytest.skip(f"Failed to initialize NLTK resources: {str(e)}")
+
+
+@pytest.fixture(scope="module")
+def parser():
+    """Fixture to provide a parser instance for tests."""
+    return Parser()
+
+
+@pytest.mark.unit
+def test_ubc_resource(parser):
+    assert parser.resource == "Parser"
+
+
+@pytest.mark.unit
+def test_ubc_type(parser):
+    assert parser.type == "TextBlobNounParser"
+
+
+@pytest.mark.unit
+def test_serialization(parser):
+    assert parser.id == Parser.model_validate_json(parser.model_dump_json()).id
+
+
+@pytest.mark.unit
+def test_parse(parser):
+    try:
+        documents = parser.parse("One more large chapula please.")
+        assert documents[0].resource == "Document"
+        assert documents[0].content == "One more large chapula please."
+        assert documents[0].metadata["noun_phrases"] == ["large chapula"]
+    except Exception as e:
+        pytest.fail(f"Parser failed with error: {str(e)}")
diff --git a/...mmunity/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py b/...mmunity/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py
@@ -0,0 +1,23 @@
+import pytest
+from swarmauri_parser_communitytextblob.TextBlobSentenceParser import TextBlobSentenceParser as Parser
+
+@pytest.mark.unit
+def test_ubc_resource():
+    parser = Parser()
+    assert parser.resource == 'Parser'
+
+@pytest.mark.unit
+def test_ubc_type():
+    parser = Parser()
+    assert parser.type == 'TextBlobSentenceParser'
+
+@pytest.mark.unit
+def test_serialization():
+    parser = Parser()
+    assert parser.id == Parser.model_validate_json(parser.model_dump_json()).id
+
+@pytest.mark.unit
+def test_parse():
+    documents = Parser().parse('One more large chapula please.')
+    assert documents[0].resource == 'Document'
+    assert documents[0].content == 'One more large chapula please.'