Skip to content

Commit

Permalink
Merge pull request geekan#1408 from seehi/feat-omniparse
Browse files Browse the repository at this point in the history
Feat omniparse
  • Loading branch information
better629 authored Aug 6, 2024
2 parents ec2c1dc + 015212d commit 22e1009
Show file tree
Hide file tree
Showing 19 changed files with 684 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
*.ico binary
*.jpeg binary
*.mp3 binary
*.mp4 binary
*.zip binary
*.bin binary

Expand Down
4 changes: 4 additions & 0 deletions config/config2.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ iflytek_api_secret: "YOUR_API_SECRET"

metagpt_tti_url: "YOUR_MODEL_URL"

omniparse:
  api_key: "YOUR_API_KEY"
  base_url: "YOUR_BASE_URL"

models:
# "YOUR_MODEL_NAME_1 or YOUR_API_TYPE_1": # model: "gpt-4-turbo" # or gpt-3.5-turbo
# api_type: "openai" # or azure / ollama / groq etc.
Expand Down
Binary file added examples/data/omniparse/test01.docx
Binary file not shown.
Binary file added examples/data/omniparse/test02.pdf
Binary file not shown.
Binary file added examples/data/omniparse/test03.mp4
Binary file not shown.
Binary file added examples/data/omniparse/test04.mp3
Binary file not shown.
64 changes: 64 additions & 0 deletions examples/rag/omniparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import asyncio

from metagpt.config2 import config
from metagpt.const import EXAMPLE_DATA_PATH
from metagpt.logs import logger
from metagpt.rag.parsers import OmniParse
from metagpt.rag.schema import OmniParseOptions, OmniParseType, ParseResultType
from metagpt.utils.omniparse_client import OmniParseClient

# Sample inputs used by the examples below; all of them live in the
# "omniparse" folder under the shared example-data directory.
OMNIPARSE_DATA_DIR = EXAMPLE_DATA_PATH / "omniparse"
TEST_DOCX = OMNIPARSE_DATA_DIR / "test01.docx"
TEST_PDF = OMNIPARSE_DATA_DIR / "test02.pdf"
TEST_VIDEO = OMNIPARSE_DATA_DIR / "test03.mp4"
TEST_AUDIO = OMNIPARSE_DATA_DIR / "test04.mp3"


async def omniparse_client_example():
    """Call each OmniParseClient parse method once and log the result.

    The DOCX sample is sent as raw bytes together with an explicit file name;
    the PDF, video and audio samples are handed over as file paths.
    """
    client = OmniParseClient(base_url=config.omniparse.base_url)

    # docx: read the file ourselves and submit the bytes
    with open(TEST_DOCX, "rb") as docx_file:
        docx_bytes = docx_file.read()
    logger.info(await client.parse_document(file_input=docx_bytes, bytes_filename="test_01.docx"))

    # pdf, video, audio: one path-based call each, in that order
    path_based_calls = (
        (client.parse_pdf, TEST_PDF),
        (client.parse_video, TEST_VIDEO),
        (client.parse_audio, TEST_AUDIO),
    )
    for parse, sample_path in path_based_calls:
        logger.info(await parse(file_input=sample_path))


async def omniparse_example():
    """Exercise the OmniParse reader through both its sync and async entry points.

    First a single PDF is parsed with `load_data`; then the parser is switched
    to DOCUMENT mode and a list of files is parsed with `aload_data`.
    """
    options = OmniParseOptions(
        parse_type=OmniParseType.PDF,
        result_type=ParseResultType.MD,
        max_timeout=120,
        num_workers=3,
    )
    parser = OmniParse(
        api_key=config.omniparse.api_key,
        base_url=config.omniparse.base_url,
        parse_options=options,
    )

    # Single file through the synchronous API.
    pdf_docs = parser.load_data(file_path=TEST_PDF)
    logger.info(pdf_docs)

    # Several files through the asynchronous API, using the generic document mode.
    parser.parse_type = OmniParseType.DOCUMENT
    multi_docs = await parser.aload_data(file_path=[TEST_DOCX, TEST_PDF])
    logger.info(multi_docs)


async def main():
    """Run the client-level example first, then the reader-level example."""
    for example in (omniparse_client_example, omniparse_example):
        await example()


# Script entry point: drive the async examples on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion examples/rag_search.py → examples/rag/rag_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import asyncio

from examples.rag_pipeline import DOC_PATH, QUESTION
from examples.rag.rag_pipeline import DOC_PATH, QUESTION
from metagpt.logs import logger
from metagpt.rag.engines import SimpleEngine
from metagpt.roles import Sales
Expand Down
4 changes: 4 additions & 0 deletions metagpt/config2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from metagpt.configs.browser_config import BrowserConfig
from metagpt.configs.embedding_config import EmbeddingConfig
from metagpt.configs.file_parser_config import OmniParseConfig
from metagpt.configs.llm_config import LLMConfig, LLMType
from metagpt.configs.mermaid_config import MermaidConfig
from metagpt.configs.redis_config import RedisConfig
Expand Down Expand Up @@ -51,6 +52,9 @@ class Config(CLIParams, YamlModel):
# RAG Embedding
embedding: EmbeddingConfig = EmbeddingConfig()

# omniparse
omniparse: OmniParseConfig = OmniParseConfig()

# Global Proxy. Will be used if llm.proxy is not set
proxy: str = ""

Expand Down
6 changes: 6 additions & 0 deletions metagpt/configs/file_parser_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from metagpt.utils.yaml_model import YamlModel


class OmniParseConfig(YamlModel):
    """Connection settings for an OmniParse service.

    Both values default to the empty string, i.e. "not configured".
    """

    # Credential forwarded to the OmniParse reader/client.
    api_key: str = ""
    # Root URL of the OmniParse service.
    base_url: str = ""
31 changes: 30 additions & 1 deletion metagpt/rag/engines/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor.types import BaseNodePostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.readers.base import BaseReader
from llama_index.core.response_synthesizers import (
BaseSynthesizer,
get_response_synthesizer,
Expand All @@ -28,6 +29,7 @@
TransformComponent,
)

from metagpt.config2 import config
from metagpt.rag.factories import (
get_index,
get_rag_embedding,
Expand All @@ -36,6 +38,7 @@
get_retriever,
)
from metagpt.rag.interface import NoEmbedding, RAGObject
from metagpt.rag.parsers import OmniParse
from metagpt.rag.retrievers.base import ModifiableRAGRetriever, PersistableRAGRetriever
from metagpt.rag.retrievers.hybrid_retriever import SimpleHybridRetriever
from metagpt.rag.schema import (
Expand All @@ -44,6 +47,9 @@
BaseRetrieverConfig,
BM25RetrieverConfig,
ObjectNode,
OmniParseOptions,
OmniParseType,
ParseResultType,
)
from metagpt.utils.common import import_class

Expand Down Expand Up @@ -100,7 +106,10 @@ def from_docs(
if not input_dir and not input_files:
raise ValueError("Must provide either `input_dir` or `input_files`.")

documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
file_extractor = cls._get_file_extractor()
documents = SimpleDirectoryReader(
input_dir=input_dir, input_files=input_files, file_extractor=file_extractor
).load_data()
cls._fix_document_metadata(documents)

transformations = transformations or cls._default_transformations()
Expand Down Expand Up @@ -301,3 +310,23 @@ def _resolve_embed_model(embed_model: BaseEmbedding = None, configs: list[Any] =
@staticmethod
def _default_transformations():
return [SentenceSplitter()]

@staticmethod
def _get_file_extractor() -> dict[str, BaseReader]:
    """Build the per-extension reader overrides for SimpleDirectoryReader.

    Currently only PDF files are routed through OmniParse, and only when
    `config.omniparse.base_url` is set. Every other document type (and PDF
    too, when OmniParse is not configured) falls back to the built-in
    readers from llama_index.

    Returns:
        A mapping from file extension (e.g. ".pdf") to the reader that
        should handle it; empty when OmniParse is not configured.
    """
    file_extractor: dict[str, BaseReader] = {}

    # An empty base_url means OmniParse is not configured: no overrides.
    if not config.omniparse.base_url:
        return file_extractor

    file_extractor[".pdf"] = OmniParse(
        api_key=config.omniparse.api_key,
        base_url=config.omniparse.base_url,
        parse_options=OmniParseOptions(parse_type=OmniParseType.PDF, result_type=ParseResultType.MD),
    )
    return file_extractor
3 changes: 3 additions & 0 deletions metagpt/rag/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from metagpt.rag.parsers.omniparse import OmniParse

__all__ = ["OmniParse"]
139 changes: 139 additions & 0 deletions metagpt/rag/parsers/omniparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import asyncio
from fileinput import FileInput
from pathlib import Path
from typing import List, Optional, Union

from llama_index.core import Document
from llama_index.core.async_utils import run_jobs
from llama_index.core.readers.base import BaseReader

from metagpt.logs import logger
from metagpt.rag.schema import OmniParseOptions, OmniParseType, ParseResultType
from metagpt.utils.async_helper import NestAsyncio
from metagpt.utils.omniparse_client import OmniParseClient


# A single parse input: a file path (str or Path) or the raw bytes of a file.
_FileInput = Union[str, bytes, Path]


class OmniParse(BaseReader):
    """llama_index reader that delegates file parsing to an OmniParse service.

    Files are sent through `OmniParseClient`; the field of the parsed result
    selected by `result_type` becomes the text of a llama_index `Document`.
    """

    def __init__(
        self, api_key: str = None, base_url: str = "http://localhost:8000", parse_options: OmniParseOptions = None
    ):
        """
        Args:
            api_key: Default None, can be used for authentication later.
            base_url: OmniParse Base URL for the API.
            parse_options: Optional settings for OmniParse. Default is OmniParseOptions with default values.
        """
        self.parse_options = parse_options or OmniParseOptions()
        self.omniparse_client = OmniParseClient(api_key, base_url, max_timeout=self.parse_options.max_timeout)

    @property
    def parse_type(self):
        """The parse type currently stored in `parse_options`."""
        return self.parse_options.parse_type

    @parse_type.setter
    def parse_type(self, parse_type: Union[str, OmniParseType]):
        # Plain strings are converted to the enum; an unknown value raises ValueError.
        if isinstance(parse_type, str):
            parse_type = OmniParseType(parse_type)
        self.parse_options.parse_type = parse_type

    @property
    def result_type(self):
        """The result type currently stored in `parse_options`."""
        return self.parse_options.result_type

    @result_type.setter
    def result_type(self, result_type: Union[str, ParseResultType]):
        # Plain strings are converted to the enum; an unknown value raises ValueError.
        if isinstance(result_type, str):
            result_type = ParseResultType(result_type)
        self.parse_options.result_type = result_type

    async def _aload_data(
        self,
        file_path: _FileInput,
        extra_info: Optional[dict] = None,
    ) -> List[Document]:
        """
        Parse a single input and wrap the result in a Document.

        Args:
            file_path: File path or file byte data.
            extra_info: Optional dictionary containing additional information. It becomes the
                Document metadata; for non-PDF parsing its "filename" entry is forwarded as the
                file name accompanying byte input.

        Returns:
            A one-element list with the parsed Document, or an empty list when parsing failed
            (the error is logged, not raised).
        """
        metadata = extra_info or {}
        try:
            if self.parse_type == OmniParseType.PDF:
                # pdf parse
                parsed_result = await self.omniparse_client.parse_pdf(file_path)
            else:
                # Every other parse type goes through omniparse_client.parse_document.
                # For byte input an additional file name is required.
                parsed_result = await self.omniparse_client.parse_document(
                    file_path, bytes_filename=metadata.get("filename")
                )

            # Pick the structured output named by result_type.
            # NOTE(review): OmniParsedResult in metagpt.rag.schema declares `markdown` and `text`
            # but no `json` field, so ParseResultType.JSON presumably cannot yield text here —
            # confirm against the type returned by OmniParseClient.
            content = getattr(parsed_result, self.result_type)
            return [Document(text=content, metadata=metadata)]
        except Exception as e:
            # Best effort: one failing file must not abort a batch, so log and return nothing.
            logger.error(f"OMNI Parse Error: {e}")
            return []

    async def aload_data(
        self,
        file_path: Union[List[_FileInput], _FileInput],
        extra_info: Optional[dict] = None,
    ) -> List[Document]:
        """
        Load data from the input file_path.

        Args:
            file_path: A file path or file byte data, or a list of them.
            extra_info: Optional dictionary containing additional information; the same
                dictionary is used for every file when a list is given.

        Notes:
            This method ultimately calls _aload_data for processing.

        Returns:
            List[Document]: the Documents of all inputs that parsed successfully. Empty when
            nothing parsed or when file_path is of an unsupported type.
        """
        if isinstance(file_path, (str, bytes, Path)):
            # Processing single file
            return await self._aload_data(file_path, extra_info)

        if isinstance(file_path, list):
            # Concurrently process multiple files, at most num_workers at a time.
            parse_jobs = [self._aload_data(file_item, extra_info) for file_item in file_path]
            per_file_docs = await run_jobs(jobs=parse_jobs, workers=self.parse_options.num_workers)
            return [doc for file_docs in per_file_docs for doc in file_docs]

        # Keep the historical "return nothing" behaviour, but make the reason visible.
        logger.warning(f"OMNI Parse: unsupported file_path type {type(file_path).__name__}, nothing parsed")
        return []

    def load_data(
        self,
        file_path: Union[List[_FileInput], _FileInput],
        extra_info: Optional[dict] = None,
    ) -> List[Document]:
        """
        Synchronous counterpart of aload_data.

        Args:
            file_path: A file path or file byte data, or a list of them.
            extra_info: Optional dictionary containing additional information.

        Notes:
            This method ultimately calls aload_data for processing.

        Returns:
            List[Document]
        """
        NestAsyncio.apply_once()  # Ensure compatibility with nested async calls
        return asyncio.run(self.aload_data(file_path, extra_info))
52 changes: 50 additions & 2 deletions metagpt/rag/schema.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""RAG schemas."""

from enum import Enum
from pathlib import Path
from typing import Any, ClassVar, Literal, Optional, Union
from typing import Any, ClassVar, List, Literal, Optional, Union

from chromadb.api.types import CollectionMetadata
from llama_index.core.embeddings import BaseEmbedding
Expand Down Expand Up @@ -214,3 +214,51 @@ def get_obj_metadata(obj: RAGObject) -> dict:
)

return metadata.model_dump()


class OmniParseType(str, Enum):
    """Parsing mode requested from OmniParse.

    Members are also plain strings, so they compare equal to their values.
    """

    # Dedicated PDF parsing.
    PDF = "PDF"
    # Generic document parsing.
    DOCUMENT = "DOCUMENT"


class ParseResultType(str, Enum):
    """Output format to take from a parse result.

    Members are also plain strings, so they compare equal to their values.
    """

    # Plain text output.
    TXT = "text"
    # Markdown output.
    MD = "markdown"
    # JSON output.
    JSON = "json"


class OmniParseOptions(BaseModel):
    """Tunable options for an OmniParse reader.

    Defaults: markdown output, generic document parsing, a timeout of 120 and
    5 concurrent requests (the worker count must be between 1 and 9).
    """

    result_type: ParseResultType = Field(
        default=ParseResultType.MD,
        description="OmniParse result_type",
    )
    parse_type: OmniParseType = Field(
        default=OmniParseType.DOCUMENT,
        description="OmniParse parse_type",
    )
    max_timeout: Optional[int] = Field(
        default=120,
        description="Maximum timeout for OmniParse service requests",
    )
    num_workers: int = Field(
        default=5,
        gt=0,
        lt=10,
        description="Number of concurrent requests for multiple files",
    )


class OminParseImage(BaseModel):
    """One image entry of a parse result: its data, its name and optional extra info."""

    image: str = Field(
        default="",
        description="image str bytes",
    )
    image_name: str = Field(
        default="",
        description="image name",
    )
    image_info: Optional[dict] = Field(
        default={},
        description="image info",
    )


class OmniParsedResult(BaseModel):
    """Structured result of a parse request: markdown, plain text, images and metadata.

    When no markdown is supplied, the plain text is used as the markdown too.
    """

    markdown: str = Field(default="", description="markdown text")
    text: str = Field(default="", description="plain text")
    images: Optional[List[OminParseImage]] = Field(default=[], description="images")
    metadata: Optional[dict] = Field(default={}, description="metadata")

    @model_validator(mode="before")
    @classmethod
    def set_markdown(cls, values):
        """Fill a missing or empty `markdown` from `text` before field validation.

        Args:
            values: The raw input being validated; only dict input is adjusted.

        Returns:
            The input to validate. A new dict is returned when markdown is
            filled in, so the caller's dict is never modified.
        """
        # Non-dict input has no `.get`; leave it to the regular validation.
        if not isinstance(values, dict):
            return values

        if not values.get("markdown"):
            # Fall back to "" rather than None when text is absent too, so the
            # declared str default still holds instead of failing validation.
            values = {**values, "markdown": values.get("text") or ""}
        return values
Loading

0 comments on commit 22e1009

Please sign in to comment.