Skip to content

Commit

Permalink
Merge pull request #1068 from MichaelDecent/comm_pkg5
Browse files Browse the repository at this point in the history
add TextBlob Parser swarmauri
  • Loading branch information
cobycloud authored Jan 12, 2025
2 parents e9fe1eb + 72d552e commit d62287a
Show file tree
Hide file tree
Showing 8 changed files with 255 additions and 2 deletions.
8 changes: 6 additions & 2 deletions pkgs/community/swarmauri_community/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@ captcha = "^0.6.0"
#gensim = { version = "^4.3.3", optional = true }
#gradio = { version = "^5.4.0", optional = true }
leptonai = { version = "^0.22.0", optional = true }

neo4j = { version = "^5.25.0", optional = true }
#nltk = { version = "^3.9.1", optional = true }

#neo4j = { version = "^5.25.0", optional = true }
nltk = { version = "^3.9.1", optional = true }

pandas = "^2.2.3"
psutil = { version = "^6.1.0", optional = true }
pygithub = { version = "^2.4.0", optional = true }
Expand All @@ -49,7 +53,7 @@ pinecone-client = { version = "^5.0.1", optional = true, extras = ["grpc"] }
pypdf = { version = "^5.0.1", optional = true }
pypdftk = { version = "^0.5", optional = true }
weaviate-client = { version = "^4.9.2", optional = true }
textblob = { version = "^0.18.0", optional = true }
#textblob = { version = "^0.18.0", optional = true }
torch = { version = "^2.4.1", optional = true}
scikit-learn = { version = "^1.5.2", optional = true }
#protobuf = { version = "^3.20.0", optional = true }
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Swarmauri Example Community Package
59 changes: 59 additions & 0 deletions pkgs/community/swarmauri_parser_communitytextblob/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
[tool.poetry]
name = "swarmauri_parser_communitytextblob"
version = "0.6.0.dev1"
description = "TextBlob Parser for Swarmauri."
authors = ["Jacob Stewart <[email protected]>"]
license = "Apache-2.0"
readme = "README.md"
repository = "http://github.com/swarmauri/swarmauri-sdk"
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12"
]

[tool.poetry.dependencies]
python = ">=3.10,<3.13"

# Swarmauri
# NOTE(review): the parser modules in this package import
# swarmauri_standard.documents.Document, but swarmauri_standard is not
# declared below. Confirm and add it so the package installs on its own.
swarmauri_core = { path = "../../core" }
swarmauri_base = { path = "../../base" }

# Dependencies
# textblob provides the NLP calls; nltk is imported by the parsers to download data.
textblob = "^0.18.0"
nltk = "^3.9.1"


[tool.poetry.group.dev.dependencies]
flake8 = "^7.0"
pytest = "^8.0"
pytest-asyncio = ">=0.24.0"
pytest-xdist = "^3.6.1"
pytest-json-report = "^1.5.0"
python-dotenv = "*"
requests = "^2.32.3"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
norecursedirs = ["combined", "scripts"]

markers = [
"test: standard test",
"unit: Unit tests",
"integration: Integration tests",
"acceptance: Acceptance tests",
"experimental: Experimental tests"
]
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
asyncio_default_fixture_loop_scope = "function"

[tool.poetry.plugins."swarmauri.parsers"]
TextBlobNounParser = "swarmauri_parser_communitytextblob:TextBlobNounParser"
TextBlobSentenceParser = "swarmauri_parser_communitytextblob:TextBlobSentenceParser"
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from textblob import TextBlob
from typing import List, Union, Any, Literal
from swarmauri_standard.documents.Document import Document
from swarmauri_base.parsers.ParserBase import ParserBase


class TextBlobNounParser(ParserBase):
    """
    Parser that extracts noun phrases from text using TextBlob.

    ``parse`` wraps the input string in a single ``Document`` whose metadata
    carries the extracted noun phrases under the ``"noun_phrases"`` key.
    """

    type: Literal["TextBlobNounParser"] = "TextBlobNounParser"

    def __init__(self, **kwargs):
        """
        Request the NLTK data this parser relies on, then initialise the base class.

        Parameters:
        - **kwargs: Forwarded unchanged to the base class initialiser.

        Raises:
        - RuntimeError: If importing NLTK, requesting a resource, or the base
          initialisation raises. The original exception is chained as the cause.
        """
        try:
            import nltk

            # Download required NLTK data
            for resource in (
                "punkt",
                "averaged_perceptron_tagger",
                "brown",
                "wordnet",
                "punkt_tab",
            ):
                nltk.download(resource)
            super().__init__(**kwargs)
        except Exception as e:
            # Chain the cause explicitly so the underlying failure stays visible.
            raise RuntimeError(f"Failed to initialize NLTK resources: {str(e)}") from e

    def parse(self, data: Union[str, Any]) -> List[Document]:
        """
        Extract noun phrases from the input text with TextBlob.

        Parameters:
        - data (Union[str, Any]): The text to parse; must be a ``str``.

        Returns:
        - List[Document]: A one-element list. The document's content is the
          original text and its metadata holds the list of noun phrases.

        Raises:
        - ValueError: If ``data`` is not a string.
        - RuntimeError: If TextBlob processing or document creation fails; the
          original exception is chained as the cause.
        """
        # Ensure the data is a string
        if not isinstance(data, str):
            raise ValueError("TextBlobParser expects a string as input data.")

        try:
            blob = TextBlob(data)

            # Materialise the noun phrases as a plain list before storing them.
            noun_phrases = list(blob.noun_phrases)

            document = Document(content=data, metadata={"noun_phrases": noun_phrases})

            return [document]
        except Exception as e:
            raise RuntimeError(f"Error during text parsing: {str(e)}") from e
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from textblob import TextBlob
from typing import List, Union, Any, Literal
from swarmauri_standard.documents.Document import Document
from swarmauri_base.parsers.ParserBase import ParserBase


class TextBlobSentenceParser(ParserBase):
    """
    A parser that uses TextBlob to break text into sentences.

    Each sentence TextBlob identifies becomes its own ``Document``.
    """

    type: Literal["TextBlobSentenceParser"] = "TextBlobSentenceParser"

    def __init__(self, **kwargs):
        """
        Request the NLTK ``punkt_tab`` resource, then initialise the base class.

        Args:
            **kwargs: Forwarded unchanged to the base class initialiser.
        """
        import nltk

        nltk.download("punkt_tab")
        super().__init__(**kwargs)

    def parse(self, data: Union[str, Any]) -> List[Document]:
        """
        Parses the input text into sentence-based document chunks using TextBlob.

        Args:
            data (Union[str, Any]): The input text to be parsed. Non-string
                input is converted with ``str()`` before tokenization.

        Returns:
            List[Document]: One ``Document`` per sentence, each tagged with
            ``{"parser": "TextBlobSentenceParser"}`` as metadata.
        """
        # Coerce non-string input rather than rejecting it.
        if not isinstance(data, str):
            data = str(data)

        # Utilize TextBlob for sentence tokenization
        blob = TextBlob(data)

        # The sentence position is not recorded, so iterate the sentences directly.
        return [
            Document(
                content=str(sentence), metadata={"parser": "TextBlobSentenceParser"}
            )
            for sentence in blob.sentences
        ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from .TextBlobNounParser import TextBlobNounParser
from .TextBlobSentenceParser import TextBlobSentenceParser

# Version string exposed by the package at import time.
# NOTE(review): this package's pyproject.toml declares version "0.6.0.dev1"
# while this reads "0.6.0.dev26". Confirm which is intended and keep them in sync.
__version__ = "0.6.0.dev26"
# Markdown-formatted description listing the components this package exports.
__long_desc__ = """
# Swarmauri TextBlob Based Components
Components Included:
- TextBlobNounParser
- TextBlobSentenceParser
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk
"""
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pytest
# TextBlobNounParser is defined in the TextBlobNounParser module; the
# TextBlobSentenceParser module does not define it, so importing it from
# there raises ImportError when pytest collects this file.
from swarmauri_parser_communitytextblob.TextBlobNounParser import TextBlobNounParser as Parser


def setup_module(module):
    """Instantiate the parser once before the module's tests; skip if that fails."""
    try:
        # Constructing a parser is what triggers the NLTK downloads.
        Parser()
    except Exception as exc:
        reason = f"Failed to initialize NLTK resources: {str(exc)}"
        pytest.skip(reason)


@pytest.fixture(scope="module")
def parser():
    """Module-scoped fixture supplying one parser instance to the tests."""
    shared_parser = Parser()
    return shared_parser


@pytest.mark.unit
def test_ubc_resource(parser):
    """The parser's ``resource`` attribute equals ``"Parser"``."""
    expected_resource = "Parser"
    assert parser.resource == expected_resource


@pytest.mark.unit
def test_ubc_type(parser):
    """The parser's ``type`` attribute equals ``"TextBlobNounParser"``."""
    expected_type = "TextBlobNounParser"
    assert parser.type == expected_type


@pytest.mark.unit
def test_serialization(parser):
    """A JSON dump followed by validation yields a parser with the same id."""
    original_id = parser.id
    payload = parser.model_dump_json()
    restored = Parser.model_validate_json(payload)
    assert original_id == restored.id


@pytest.mark.unit
def test_parse(parser):
    """Parsing a sentence returns one document with its content and noun phrases."""
    # No try/except wrapper: letting exceptions and assertion failures propagate
    # keeps pytest's full traceback and assertion diff instead of a flat message.
    documents = parser.parse("One more large chapula please.")
    assert documents[0].resource == "Document"
    assert documents[0].content == "One more large chapula please."
    assert documents[0].metadata["noun_phrases"] == ["large chapula"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pytest
from swarmauri_parser_communitytextblob.TextBlobSentenceParser import TextBlobSentenceParser as Parser

@pytest.mark.unit
def test_ubc_resource():
    """A freshly built sentence parser has ``resource`` equal to ``'Parser'``."""
    assert Parser().resource == 'Parser'

@pytest.mark.unit
def test_ubc_type():
    """A freshly built sentence parser has ``type`` equal to ``'TextBlobSentenceParser'``."""
    assert Parser().type == 'TextBlobSentenceParser'

@pytest.mark.unit
def test_serialization():
    """A JSON dump followed by validation yields a parser with the same id."""
    original = Parser()
    original_id = original.id
    restored = Parser.model_validate_json(original.model_dump_json())
    assert original_id == restored.id

@pytest.mark.unit
def test_parse():
    """Parsing a one-sentence string gives a first document holding that sentence."""
    sentence_parser = Parser()
    results = sentence_parser.parse('One more large chapula please.')
    assert results[0].resource == 'Document'
    assert results[0].content == 'One more large chapula please.'

0 comments on commit d62287a

Please sign in to comment.