From 5716a519c191fdb1e0ef2b8b85102fe4d1ed1b26 Mon Sep 17 00:00:00 2001 From: MichaelDecent Date: Fri, 10 Jan 2025 08:21:19 +0100 Subject: [PATCH 1/5] feat: add Swarmauri TextBlob Parser --- .../README.md | 1 + .../pyproject.toml | 55 ++++++++++++++++++ .../TextBlobNounParser.py | 57 +++++++++++++++++++ .../TextBlobSentenceParser.py | 49 ++++++++++++++++ .../__init__.py | 16 ++++++ 5 files changed, 178 insertions(+) create mode 100644 pkgs/community/swarmauri_parser_communitytextblob/README.md create mode 100644 pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml create mode 100644 pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py create mode 100644 pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py create mode 100644 pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/__init__.py diff --git a/pkgs/community/swarmauri_parser_communitytextblob/README.md b/pkgs/community/swarmauri_parser_communitytextblob/README.md new file mode 100644 index 000000000..cd26902a2 --- /dev/null +++ b/pkgs/community/swarmauri_parser_communitytextblob/README.md @@ -0,0 +1 @@ +# Swarmauri Example Community Package \ No newline at end of file diff --git a/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml b/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml new file mode 100644 index 000000000..e62e8ae26 --- /dev/null +++ b/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml @@ -0,0 +1,55 @@ +[tool.poetry] +name = "swarmauri_parser_communitytextblob" +version = "0.6.0.dev1" +description = "TextBlob Parser for Swarmauri." 
+authors = ["Jacob Stewart "] +license = "Apache-2.0" +readme = "README.md" +repository = "http://github.com/swarmauri/swarmauri-sdk" +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12" +] + +[tool.poetry.dependencies] +python = ">=3.10,<3.13" + +# Swarmauri +swarmauri_core = { path = "../../core" } +swarmauri_base = { path = "../../base" } + + +[tool.poetry.group.dev.dependencies] +flake8 = "^7.0" +pytest = "^8.0" +pytest-asyncio = ">=0.24.0" +pytest-xdist = "^3.6.1" +pytest-json-report = "^1.5.0" +python-dotenv = "*" +requests = "^2.32.3" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +norecursedirs = ["combined", "scripts"] + +markers = [ + "test: standard test", + "unit: Unit tests", + "integration: Integration tests", + "acceptance: Acceptance tests", + "experimental: Experimental tests" +] +log_cli = true +log_cli_level = "INFO" +log_cli_format = "%(asctime)s [%(levelname)s] %(message)s" +log_cli_date_format = "%Y-%m-%d %H:%M:%S" +asyncio_default_fixture_loop_scope = "function" + +[tool.poetry.plugins."swarmauri.parsers"] +TextBlobNounParser = "swarmauri_parser_communitytextblob:TextBlobNounParser" +TextBlobSentenceParser = "swarmauri_parser_communitytextblob:TextBlobSentenceParser" \ No newline at end of file diff --git a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py new file mode 100644 index 000000000..68023f1aa --- /dev/null +++ b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py @@ -0,0 +1,57 @@ +from textblob import TextBlob +from typing import List, Union, Any, Literal +from 
swarmauri.documents.concrete.Document import Document +from swarmauri.parsers.base.ParserBase import ParserBase + + +class TextBlobNounParser(ParserBase): + """ + A concrete implementation of IParser using TextBlob for Natural Language Processing tasks. + + This parser uses TextBlob's noun phrase extraction and stores the extracted + phrases in the returned document's metadata under the "noun_phrases" key. + """ + + type: Literal["TextBlobNounParser"] = "TextBlobNounParser" + + def __init__(self, **kwargs): + try: + import nltk + + # Download required NLTK data + nltk.download("punkt") + nltk.download("averaged_perceptron_tagger") + nltk.download("brown") + nltk.download("wordnet") + super().__init__(**kwargs) + except Exception as e: + raise RuntimeError(f"Failed to initialize NLTK resources: {str(e)}") + + def parse(self, data: Union[str, Any]) -> List[Document]: + """ + Parses the input data using TextBlob to perform basic NLP tasks + and returns a list of documents with the parsed information. + + Parameters: + - data (Union[str, Any]): The input data to parse, expected to be text data for this parser. + + Returns: + - List[IDocument]: A list of documents with metadata generated from the parsing process. + """ + # Ensure the data is a string + if not isinstance(data, str): + raise ValueError("TextBlobParser expects a string as input data.") + + try: + # Use TextBlob for NLP tasks + blob = TextBlob(data) + + # Extracts noun phrases to demonstrate one of TextBlob's capabilities. 
+ noun_phrases = list(blob.noun_phrases) + + # Create document with extracted information + document = Document(content=data, metadata={"noun_phrases": noun_phrases}) + + return [document] + except Exception as e: + raise RuntimeError(f"Error during text parsing: {str(e)}") diff --git a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py new file mode 100644 index 000000000..b816b065c --- /dev/null +++ b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py @@ -0,0 +1,49 @@ +from textblob import TextBlob +from typing import List, Union, Any, Literal +from swarmauri.documents.concrete.Document import Document +from swarmauri.parsers.base.ParserBase import ParserBase + + +class TextBlobSentenceParser(ParserBase): + """ + A parser that leverages TextBlob to break text into sentences. + + This parser uses the natural language processing capabilities of TextBlob + to accurately identify sentence boundaries within large blocks of text. + """ + + type: Literal["TextBlobSentenceParser"] = "TextBlobSentenceParser" + + def __init__(self, **kwargs): + import nltk + + nltk.download("punkt_tab") + super().__init__(**kwargs) + + def parse(self, data: Union[str, Any]) -> List[Document]: + """ + Parses the input text into sentence-based document chunks using TextBlob. + + Args: + data (Union[str, Any]): The input text to be parsed. + + Returns: + List[IDocument]: A list of IDocument instances, each representing a sentence. 
+ """ + # Ensure the input is a string + if not isinstance(data, str): + data = str(data) + + # Utilize TextBlob for sentence tokenization + blob = TextBlob(data) + sentences = blob.sentences + + # Create a document instance for each sentence + documents = [ + Document( + content=str(sentence), metadata={"parser": "TextBlobSentenceParser"} + ) + for index, sentence in enumerate(sentences) + ] + + return documents diff --git a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/__init__.py b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/__init__.py new file mode 100644 index 000000000..a83877682 --- /dev/null +++ b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/__init__.py @@ -0,0 +1,16 @@ +from .TextBlobNounParser import TextBlobNounParser +from .TextBlobSentenceParser import TextBlobSentenceParser + +__version__ = "0.6.0.dev26" +__long_desc__ = """ + +# Swarmauri TextBlob Based Components + +Components Included: +- TextBlobNounParser +- TextBlobSentenceParser + +Follow us at: https://github.com/swarmauri +Star us at: https://github.com/swarmauri/swarmauri-sdk + +""" From dc2573f0d48760d838ad68f7efb59adf66c653aa Mon Sep 17 00:00:00 2001 From: MichaelDecent Date: Fri, 10 Jan 2025 08:34:26 +0100 Subject: [PATCH 2/5] add punkt_tab resource download for NLTK in TextBlobNounParser. 
Co-authored-by: Lavesh-Akhadkar laveshakhadkar@gmail.com --- .../swarmauri_parser_communitytextblob/TextBlobNounParser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py index 68023f1aa..c5f6cb31a 100644 --- a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py +++ b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py @@ -23,6 +23,7 @@ def __init__(self, **kwargs): nltk.download("averaged_perceptron_tagger") nltk.download("brown") nltk.download("wordnet") + nltk.download('punkt_tab') super().__init__(**kwargs) except Exception as e: raise RuntimeError(f"Failed to initialize NLTK resources: {str(e)}") From ea5b6e59e91ae2dd26eadc45f771d4c23c6d08d2 Mon Sep 17 00:00:00 2001 From: MichaelDecent Date: Fri, 10 Jan 2025 08:41:27 +0100 Subject: [PATCH 3/5] feat: add TextBlob dependency and implement unit tests for NLTK parsers --- .../pyproject.toml | 3 ++ .../TextBlobNounParser.py | 6 +-- .../TextBlobSentenceParser.py | 4 +- .../unit/TextBlobNounParser_unit_test.py | 43 +++++++++++++++++++ .../unit/TextBlobSentenceParser_unit_test.py | 23 ++++++++++ 5 files changed, 74 insertions(+), 5 deletions(-) create mode 100644 pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py create mode 100644 pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py diff --git a/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml b/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml index e62e8ae26..f2b3b8ff9 100644 --- a/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml +++ b/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml @@ -20,6 +20,9 
@@ python = ">=3.10,<3.13" swarmauri_core = { path = "../../core" } swarmauri_base = { path = "../../base" } +# Dependencies +textblob = "^0.18.0" + [tool.poetry.group.dev.dependencies] flake8 = "^7.0" diff --git a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py index c5f6cb31a..36ad70218 100644 --- a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py +++ b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobNounParser.py @@ -1,7 +1,7 @@ from textblob import TextBlob from typing import List, Union, Any, Literal -from swarmauri.documents.concrete.Document import Document -from swarmauri.parsers.base.ParserBase import ParserBase +from swarmauri_standard.documents.Document import Document +from swarmauri_base.parsers.ParserBase import ParserBase class TextBlobNounParser(ParserBase): @@ -23,7 +23,7 @@ def __init__(self, **kwargs): nltk.download("averaged_perceptron_tagger") nltk.download("brown") nltk.download("wordnet") - nltk.download('punkt_tab') + nltk.download("punkt_tab") super().__init__(**kwargs) except Exception as e: raise RuntimeError(f"Failed to initialize NLTK resources: {str(e)}") diff --git a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py index b816b065c..0b94ba9c9 100644 --- a/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py +++ b/pkgs/community/swarmauri_parser_communitytextblob/swarmauri_parser_communitytextblob/TextBlobSentenceParser.py @@ -1,7 +1,7 @@ from textblob import TextBlob from typing import List, Union, Any, Literal -from 
swarmauri.documents.concrete.Document import Document -from swarmauri.parsers.base.ParserBase import ParserBase +from swarmauri_standard.documents.Document import Document +from swarmauri_base.parsers.ParserBase import ParserBase class TextBlobSentenceParser(ParserBase): diff --git a/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py new file mode 100644 index 000000000..e5f8a550c --- /dev/null +++ b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py @@ -0,0 +1,43 @@ +import pytest +from swarmauri_community.parsers.concrete.TextBlobNounParser import TextBlobNounParser as Parser + + +def setup_module(module): + """Setup any state specific to the execution of the given module.""" + try: + # Initialize a parser to trigger NLTK downloads + Parser() + except Exception as e: + pytest.skip(f"Failed to initialize NLTK resources: {str(e)}") + + +@pytest.fixture(scope="module") +def parser(): + """Fixture to provide a parser instance for tests.""" + return Parser() + + +@pytest.mark.unit +def test_ubc_resource(parser): + assert parser.resource == "Parser" + + +@pytest.mark.unit +def test_ubc_type(parser): + assert parser.type == "TextBlobNounParser" + + +@pytest.mark.unit +def test_serialization(parser): + assert parser.id == Parser.model_validate_json(parser.model_dump_json()).id + + +@pytest.mark.unit +def test_parse(parser): + try: + documents = parser.parse("One more large chapula please.") + assert documents[0].resource == "Document" + assert documents[0].content == "One more large chapula please." 
+ assert documents[0].metadata["noun_phrases"] == ["large chapula"] + except Exception as e: + pytest.fail(f"Parser failed with error: {str(e)}") diff --git a/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py new file mode 100644 index 000000000..36c347906 --- /dev/null +++ b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py @@ -0,0 +1,23 @@ +import pytest +from swarmauri_community.parsers.concrete.TextBlobSentenceParser import TextBlobSentenceParser as Parser + +@pytest.mark.unit +def test_ubc_resource(): + parser = Parser() + assert parser.resource == 'Parser' + +@pytest.mark.unit +def test_ubc_type(): + parser = Parser() + assert parser.type == 'TextBlobSentenceParser' + +@pytest.mark.unit +def test_serialization(): + parser = Parser() + assert parser.id == Parser.model_validate_json(parser.model_dump_json()).id + +@pytest.mark.unit +def test_parse(): + documents = Parser().parse('One more large chapula please.') + assert documents[0].resource == 'Document' + assert documents[0].content == 'One more large chapula please.' 
From ef0aab4363b280c4ae17345fa49b3315474a177e Mon Sep 17 00:00:00 2001 From: MichaelDecent Date: Fri, 10 Jan 2025 08:55:16 +0100 Subject: [PATCH 4/5] fix: update import paths for TextBlobNounParser and TextBlobSentenceParser in unit tests --- .../tests/unit/TextBlobNounParser_unit_test.py | 2 +- .../tests/unit/TextBlobSentenceParser_unit_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py index e5f8a550c..df9748212 100644 --- a/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py +++ b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobNounParser_unit_test.py @@ -1,5 +1,5 @@ import pytest -from swarmauri_community.parsers.concrete.TextBlobNounParser import TextBlobNounParser as Parser +from swarmauri_parser_communitytextblob.TextBlobNounParser import TextBlobNounParser as Parser def setup_module(module): diff --git a/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py index 36c347906..75375c5cb 100644 --- a/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py +++ b/pkgs/community/swarmauri_parser_communitytextblob/tests/unit/TextBlobSentenceParser_unit_test.py @@ -1,5 +1,5 @@ import pytest -from swarmauri_community.parsers.concrete.TextBlobSentenceParser import TextBlobSentenceParser as Parser +from swarmauri_parser_communitytextblob.TextBlobSentenceParser import TextBlobSentenceParser as Parser @pytest.mark.unit def test_ubc_resource(): From b1076e145624df0740b51d9028ff3a7c89dc34df Mon Sep 17 00:00:00 2001 From: MichaelDecent Date: Fri, 10 Jan 2025 09:12:22 +0100 Subject: [PATCH 5/5] fix: update nltk and 
textblob dependencies in pyproject.toml files --- pkgs/community/swarmauri_community/pyproject.toml | 4 ++-- .../swarmauri_parser_communitytextblob/pyproject.toml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pkgs/community/swarmauri_community/pyproject.toml b/pkgs/community/swarmauri_community/pyproject.toml index dfccb914c..f0023c02c 100644 --- a/pkgs/community/swarmauri_community/pyproject.toml +++ b/pkgs/community/swarmauri_community/pyproject.toml @@ -32,7 +32,7 @@ captcha = "^0.6.0" #gradio = { version = "^5.4.0", optional = true } leptonai = { version = "^0.22.0", optional = true } neo4j = { version = "^5.25.0", optional = true } -nltk = { version = "^3.9.1", optional = true } +#nltk = { version = "^3.9.1", optional = true } pandas = "^2.2.3" psutil = { version = "^6.1.0", optional = true } pygithub = { version = "^2.4.0", optional = true } @@ -49,7 +49,7 @@ pinecone-client = { version = "^5.0.1", optional = true, extras = ["grpc"] } pypdf = { version = "^5.0.1", optional = true } pypdftk = { version = "^0.5", optional = true } weaviate-client = { version = "^4.9.2", optional = true } -textblob = { version = "^0.18.0", optional = true } +#textblob = { version = "^0.18.0", optional = true } torch = { version = "^2.4.1", optional = true} scikit-learn = { version = "^1.5.2", optional = true } #protobuf = { version = "^3.20.0", optional = true } diff --git a/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml b/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml index f2b3b8ff9..fa0369057 100644 --- a/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml +++ b/pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml @@ -22,6 +22,7 @@ swarmauri_base = { path = "../../base" } # Dependencies textblob = "^0.18.0" +nltk = "^3.9.1" [tool.poetry.group.dev.dependencies]