Skip to content

Commit

Permalink
Add Swarmauri PytesseractImg2Text community package
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelDecent committed Jan 14, 2025
1 parent 877c8f5 commit 58f440a
Show file tree
Hide file tree
Showing 6 changed files with 398 additions and 2 deletions.
4 changes: 2 additions & 2 deletions pkgs/community/swarmauri_community/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ captcha = "^0.6.0"
#folium = { version = "^0.18.0", optional = true }
#gensim = { version = "^4.3.3", optional = true }
#gradio = { version = "^5.4.0", optional = true }
leptonai = { version = "^0.22.0", optional = true }
#leptonai = { version = "^0.22.0", optional = true }

neo4j = { version = "^5.25.0", optional = true }
#nltk = { version = "^3.9.1", optional = true }
Expand All @@ -48,7 +48,7 @@ typing_extensions = "^4.12.2"
#tiktoken = { version = "^0.8.0", optional = true }
PyMuPDF = { version = "^1.24.12", optional = true }
#qdrant-client = { version = "^1.12.0", optional = true }
pinecone-client = { version = "^5.0.1", optional = true, extras = ["grpc"] }
#pinecone-client = { version = "^5.0.1", optional = true, extras = ["grpc"] }
pypdf = { version = "^5.0.1", optional = true }
pypdftk = { version = "^0.5", optional = true }
weaviate-client = { version = "^4.9.2", optional = true }
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Swarmauri PytesseractImg2Text Community Package
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
[tool.poetry]
name = "swarmauri_llm_communitypytesseractImg2text"
version = "0.6.0.dev1"
description = "PytesseractImg2Text Model"
authors = ["Jacob Stewart <[email protected]>"]
license = "Apache-2.0"
readme = "README.md"
repository = "http://github.com/swarmauri/swarmauri-sdk"
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12"
]

[tool.poetry.dependencies]
python = ">=3.10,<3.13"

# Swarmauri
swarmauri_core = { path = "../../core" }
swarmauri_base = { path = "../../base" }

# Dependencies
pytesseract = "^0.3.13"


[tool.poetry.group.dev.dependencies]
flake8 = "^7.0"
pytest = "^8.0"
pytest-asyncio = ">=0.24.0"
pytest-xdist = "^3.6.1"
pytest-json-report = "^1.5.0"
python-dotenv = "*"
requests = "^2.32.3"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
norecursedirs = ["combined", "scripts"]

markers = [
"test: standard test",
"unit: Unit tests",
"integration: Integration tests",
"acceptance: Acceptance tests",
"experimental: Experimental tests"
]
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
asyncio_default_fixture_loop_scope = "function"

[tool.poetry.plugins."swarmauri.agents"]
ExampleCommunityAgent = "swm_example_community_package.ExampleCommunityAgent:ExampleCommunityAgent"
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import os
import asyncio
from typing import List, Literal, Union
from pydantic import Field, ConfigDict
from PIL import Image
import pytesseract
from io import BytesIO
from swarmauri_base.llms.LLMBase import LLMBase


class PytesseractImg2TextModel(LLMBase):
    """
    OCR (Optical Character Recognition) model backed by Pytesseract.

    Accepts a file path, raw image bytes, or an already-opened PIL image and
    returns the recognised text. Synchronous, asynchronous and batch entry
    points are provided. Requires the Tesseract-OCR binary to be installed
    on the system.
    """

    # Path to the tesseract executable. Resolved from the TESSERACT_CMD
    # environment variable first, then "/usr/bin/tesseract" if that file
    # exists; otherwise None, in which case the module-level pytesseract
    # setting is left as it is.
    tesseract_cmd: Union[str, None] = Field(
        default_factory=lambda: os.environ.get(
            "TESSERACT_CMD",
            ("/usr/bin/tesseract" if os.path.exists("/usr/bin/tesseract") else None),
        )
    )
    type: Literal["PytesseractImg2TextModel"] = "PytesseractImg2TextModel"
    # Default OCR language code passed to pytesseract as `lang`.
    language: str = Field(default="eng")
    # Default custom configuration string passed to pytesseract as `config`.
    config: str = Field(default="")
    model_config = ConfigDict(protected_namespaces=())

    def __init__(self, **data):
        """Build the model and, if a command path is known, point pytesseract at it."""
        super().__init__(**data)
        # Only override the module-wide setting when we actually have a path;
        # assigning None would break every later pytesseract call.
        if self.tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = self.tesseract_cmd

    def _process_image(self, image: Union[str, bytes, Image.Image], **kwargs) -> str:
        """
        Run OCR on a single image and return the stripped text.

        Args:
            image: File path, raw image bytes, or a PIL image.
            **kwargs: Optional per-call overrides:
                - language: OCR language code (defaults to ``self.language``)
                - config: Tesseract configuration string (defaults to ``self.config``)

        Returns:
            The extracted text with leading/trailing whitespace removed.

        Raises:
            Exception: If the input type is unsupported or OCR fails; the
                original error is attached as ``__cause__``.
        """
        custom_config = kwargs.get("config", self.config)
        lang = kwargs.get("language", self.language)

        try:
            if isinstance(image, Image.Image):
                # Caller owns an already-open image; do not close it.
                text = pytesseract.image_to_string(
                    image, lang=lang, config=custom_config
                )
            elif isinstance(image, (str, bytes)):
                source = image if isinstance(image, str) else BytesIO(image)
                # Images we open ourselves are closed once OCR is done.
                with Image.open(source) as img:
                    text = pytesseract.image_to_string(
                        img, lang=lang, config=custom_config
                    )
            else:
                raise ValueError("Unsupported image format")

            return text.strip()

        except Exception as e:
            raise Exception(f"OCR processing failed: {str(e)}") from e

    def extract_text(self, image: Union[str, bytes, Image.Image], **kwargs) -> str:
        """
        Extracts text from an image.

        Args:
            image: Can be a file path, bytes, or PIL Image
            **kwargs: Additional arguments for OCR processing
                - language: OCR language (e.g., 'eng', 'fra', etc.)
                - config: Custom Tesseract configuration string

        Returns:
            Extracted text as string

        Raises:
            Exception: If OCR processing fails.
        """
        return self._process_image(image, **kwargs)

    async def aextract_text(
        self, image: Union[str, bytes, Image.Image], **kwargs
    ) -> str:
        """
        Asynchronously extracts text from an image.

        The blocking OCR call is run in the event loop's default executor.

        Args:
            image: Can be a file path, bytes, or PIL Image
            **kwargs: Same overrides as ``extract_text`` (language, config)

        Returns:
            Extracted text as string
        """
        loop = asyncio.get_running_loop()
        # run_in_executor only forwards positional arguments, so the keyword
        # overrides are bound in a closure instead of being passed through.
        return await loop.run_in_executor(
            None, lambda: self.extract_text(image, **kwargs)
        )

    def batch(
        self, images: List[Union[str, bytes, Image.Image]], **kwargs
    ) -> List[str]:
        """
        Process multiple images in batch, sequentially.

        Args:
            images: List of images (file paths, bytes, or PIL Images)
            **kwargs: Additional arguments for OCR processing

        Returns:
            List of extracted texts, in the same order as ``images``
        """
        return [self.extract_text(image=image, **kwargs) for image in images]

    async def abatch(
        self,
        images: List[Union[str, bytes, Image.Image]],
        max_concurrent: int = 5,
        **kwargs,
    ) -> List[str]:
        """
        Asynchronously process multiple images in batch.

        Args:
            images: List of images (file paths, bytes, or PIL Images)
            max_concurrent: Maximum number of concurrent operations (>= 1)
            **kwargs: Additional arguments for OCR processing

        Returns:
            List of extracted texts, in the same order as ``images``

        Raises:
            ValueError: If ``max_concurrent`` is less than 1.
        """
        # A semaphore created with 0 permits would block every task forever.
        if max_concurrent < 1:
            raise ValueError("max_concurrent must be at least 1")

        semaphore = asyncio.Semaphore(max_concurrent)

        async def process_image(image):
            async with semaphore:
                return await self.aextract_text(image=image, **kwargs)

        tasks = [process_image(image) for image in images]
        return await asyncio.gather(*tasks)

    def get_supported_languages(self) -> List[str]:
        """
        Returns a list of supported languages by executing 'tesseract --list-langs' command.

        Returns:
            List[str]: List of available language codes (e.g., ['eng', 'osd'])

        Raises:
            Exception: If the command execution fails or returns unexpected output
        """
        import subprocess

        # Fall back to the command pytesseract is configured with when this
        # instance has no explicit path.
        command = self.tesseract_cmd or pytesseract.pytesseract.tesseract_cmd

        try:
            result = subprocess.run(
                [command, "--list-langs"],
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            raise Exception(
                f"Failed to get language list from Tesseract: {e.stderr}"
            ) from e
        except Exception as e:
            raise Exception(f"Error getting supported languages: {str(e)}") from e

        # The first output line is the directory/header info, not a language;
        # blank lines are dropped.
        output_lines = result.stdout.strip().splitlines()
        return [lang.strip() for lang in output_lines[1:] if lang.strip()]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .PytesseractImg2TextModel import PytesseractImg2TextModel

# Version string exposed by the package.
# NOTE(review): the pyproject.toml added in this same commit declares
# version 0.6.0.dev1 — confirm which value is intended.
__version__ = "0.6.0.dev26"
# Markdown-formatted long description of the plugin with project links.
__long_desc__ = """
# Swarmauri PytesseractImg2Text Model Plugin
Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk
"""
Loading

0 comments on commit 58f440a

Please sign in to comment.