Merge pull request geekan#1408 from seehi/feat-omniparse

Feat omniparse
iorisa · Aug 6, 2024 · 22e1009 · 22e1009
2 parents ec2c1dc + 015212d
commit 22e1009
Show file tree

Hide file tree

Showing 19 changed files with 684 additions and 5 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -14,6 +14,7 @@
 *.ico binary
 *.jpeg binary
 *.mp3 binary
+*.mp4 binary
 *.zip binary
 *.bin binary
 

diff --git a/config/config2.example.yaml b/config/config2.example.yaml
@@ -60,6 +60,10 @@ iflytek_api_secret: "YOUR_API_SECRET"
 
 metagpt_tti_url: "YOUR_MODEL_URL"
 
+omniparse:
+    api_key: "YOUR_API_KEY"
+    base_url: "YOUR_BASE_URL"
+
 models:
 #  "YOUR_MODEL_NAME_1 or YOUR_API_TYPE_1": # model: "gpt-4-turbo"  # or gpt-3.5-turbo
 #    api_type: "openai"  # or azure / ollama / groq etc.

diff --git a/examples/data/omniparse/test01.docx b/examples/data/omniparse/test01.docx
diff --git a/examples/data/omniparse/test02.pdf b/examples/data/omniparse/test02.pdf
diff --git a/examples/data/omniparse/test03.mp4 b/examples/data/omniparse/test03.mp4
diff --git a/examples/data/omniparse/test04.mp3 b/examples/data/omniparse/test04.mp3
diff --git a/examples/rag/omniparse.py b/examples/rag/omniparse.py
@@ -0,0 +1,64 @@
+import asyncio
+
+from metagpt.config2 import config
+from metagpt.const import EXAMPLE_DATA_PATH
+from metagpt.logs import logger
+from metagpt.rag.parsers import OmniParse
+from metagpt.rag.schema import OmniParseOptions, OmniParseType, ParseResultType
+from metagpt.utils.omniparse_client import OmniParseClient
+
+# Sample inputs under the example data directory, one per file kind used by the examples below.
+TEST_DOCX = EXAMPLE_DATA_PATH / "omniparse/test01.docx"
+TEST_PDF = EXAMPLE_DATA_PATH / "omniparse/test02.pdf"
+TEST_VIDEO = EXAMPLE_DATA_PATH / "omniparse/test03.mp4"
+TEST_AUDIO = EXAMPLE_DATA_PATH / "omniparse/test04.mp3"
+
+
+async def omniparse_client_example():
+    """Send one sample of each kind (docx, pdf, video, audio) through OmniParseClient and log every result."""
+    omni_client = OmniParseClient(base_url=config.omniparse.base_url)
+
+    # docx: submitted as raw bytes together with an explicit filename
+    docx_bytes = TEST_DOCX.read_bytes()
+    logger.info(await omni_client.parse_document(file_input=docx_bytes, bytes_filename="test_01.docx"))
+
+    # pdf: submitted as a path
+    logger.info(await omni_client.parse_pdf(file_input=TEST_PDF))
+
+    # video: submitted as a path
+    logger.info(await omni_client.parse_video(file_input=TEST_VIDEO))
+
+    # audio: submitted as a path
+    logger.info(await omni_client.parse_audio(file_input=TEST_AUDIO))
+
+
+async def omniparse_example():
+    """Use the OmniParse reader: a blocking load of one PDF, then an async load of a docx + pdf batch."""
+    options = OmniParseOptions(
+        parse_type=OmniParseType.PDF,
+        result_type=ParseResultType.MD,
+        max_timeout=120,
+        num_workers=3,
+    )
+    parser = OmniParse(api_key=config.omniparse.api_key, base_url=config.omniparse.base_url, parse_options=options)
+
+    # Single file through the synchronous entry point.
+    pdf_docs = parser.load_data(file_path=TEST_PDF)
+    logger.info(pdf_docs)
+
+    # Switch the reader to document parsing before submitting the mixed batch.
+    parser.parse_type = OmniParseType.DOCUMENT
+    batch_docs = await parser.aload_data(file_path=[TEST_DOCX, TEST_PDF])
+    logger.info(batch_docs)
+
+
+async def main():
+    """Run the client-level example first, then the reader-level example."""
+    await omniparse_client_example()
+    await omniparse_example()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/examples/rag_bm.py → examples/rag/rag_bm.py b/examples/rag_bm.py → examples/rag/rag_bm.py
diff --git a/examples/rag_pipeline.py → examples/rag/rag_pipeline.py b/examples/rag_pipeline.py → examples/rag/rag_pipeline.py
diff --git a/examples/rag_search.py → examples/rag/rag_search.py b/examples/rag_search.py → examples/rag/rag_search.py
@@ -2,7 +2,7 @@
 
 import asyncio
 
-from examples.rag_pipeline import DOC_PATH, QUESTION
+from examples.rag.rag_pipeline import DOC_PATH, QUESTION
 from metagpt.logs import logger
 from metagpt.rag.engines import SimpleEngine
 from metagpt.roles import Sales

diff --git a/metagpt/config2.py b/metagpt/config2.py
@@ -13,6 +13,7 @@
 
 from metagpt.configs.browser_config import BrowserConfig
 from metagpt.configs.embedding_config import EmbeddingConfig
+from metagpt.configs.file_parser_config import OmniParseConfig
 from metagpt.configs.llm_config import LLMConfig, LLMType
 from metagpt.configs.mermaid_config import MermaidConfig
 from metagpt.configs.redis_config import RedisConfig
@@ -51,6 +52,9 @@ class Config(CLIParams, YamlModel):
     # RAG Embedding
     embedding: EmbeddingConfig = EmbeddingConfig()
 
+    # omniparse
+    omniparse: OmniParseConfig = OmniParseConfig()
+
     # Global Proxy. Will be used if llm.proxy is not set
     proxy: str = ""
 

diff --git a/metagpt/configs/file_parser_config.py b/metagpt/configs/file_parser_config.py
@@ -0,0 +1,6 @@
+from metagpt.utils.yaml_model import YamlModel
+
+
+class OmniParseConfig(YamlModel):
+    """Settings for the OmniParse service; both fields default to an empty string."""
+
+    api_key: str = ""  # API key for the service
+    base_url: str = ""  # base URL of the service
diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py
@@ -14,6 +14,7 @@
 from llama_index.core.node_parser import SentenceSplitter
 from llama_index.core.postprocessor.types import BaseNodePostprocessor
 from llama_index.core.query_engine import RetrieverQueryEngine
+from llama_index.core.readers.base import BaseReader
 from llama_index.core.response_synthesizers import (
     BaseSynthesizer,
     get_response_synthesizer,
@@ -28,6 +29,7 @@
     TransformComponent,
 )
 
+from metagpt.config2 import config
 from metagpt.rag.factories import (
     get_index,
     get_rag_embedding,
@@ -36,6 +38,7 @@
     get_retriever,
 )
 from metagpt.rag.interface import NoEmbedding, RAGObject
+from metagpt.rag.parsers import OmniParse
 from metagpt.rag.retrievers.base import ModifiableRAGRetriever, PersistableRAGRetriever
 from metagpt.rag.retrievers.hybrid_retriever import SimpleHybridRetriever
 from metagpt.rag.schema import (
@@ -44,6 +47,9 @@
     BaseRetrieverConfig,
     BM25RetrieverConfig,
     ObjectNode,
+    OmniParseOptions,
+    OmniParseType,
+    ParseResultType,
 )
 from metagpt.utils.common import import_class
 
@@ -100,7 +106,10 @@ def from_docs(
         if not input_dir and not input_files:
             raise ValueError("Must provide either `input_dir` or `input_files`.")
 
-        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
+        file_extractor = cls._get_file_extractor()
+        documents = SimpleDirectoryReader(
+            input_dir=input_dir, input_files=input_files, file_extractor=file_extractor
+        ).load_data()
         cls._fix_document_metadata(documents)
 
         transformations = transformations or cls._default_transformations()
@@ -301,3 +310,23 @@ def _resolve_embed_model(embed_model: BaseEmbedding = None, configs: list[Any] =
     @staticmethod
     def _default_transformations():
         return [SentenceSplitter()]
+
+    @staticmethod
+    def _get_file_extractor() -> dict[str, BaseReader]:
+        """
+        Build the mapping from file extension to the reader used for that extension.
+
+        Currently only PDF files use OmniParse, and only when `config.omniparse.base_url` is set.
+        Other document types use the built-in readers from llama_index.
+
+        Returns:
+            dict[str, BaseReader]: Readers keyed by file extension; empty if OmniParse is not configured.
+        """
+        file_extractor: dict[str, BaseReader] = {}
+        if config.omniparse.base_url:
+            pdf_parser = OmniParse(
+                api_key=config.omniparse.api_key,
+                base_url=config.omniparse.base_url,
+                parse_options=OmniParseOptions(parse_type=OmniParseType.PDF, result_type=ParseResultType.MD),
+            )
+            file_extractor[".pdf"] = pdf_parser
+
+        return file_extractor
diff --git a/metagpt/rag/parsers/__init__.py b/metagpt/rag/parsers/__init__.py
@@ -0,0 +1,3 @@
+from metagpt.rag.parsers.omniparse import OmniParse
+
+__all__ = ["OmniParse"]
diff --git a/metagpt/rag/parsers/omniparse.py b/metagpt/rag/parsers/omniparse.py
@@ -0,0 +1,139 @@
+import asyncio
+from fileinput import FileInput
+from pathlib import Path
+from typing import List, Optional, Union
+
+from llama_index.core import Document
+from llama_index.core.async_utils import run_jobs
+from llama_index.core.readers.base import BaseReader
+
+from metagpt.logs import logger
+from metagpt.rag.schema import OmniParseOptions, OmniParseType, ParseResultType
+from metagpt.utils.async_helper import NestAsyncio
+from metagpt.utils.omniparse_client import OmniParseClient
+
+
+class OmniParse(BaseReader):
+    """Reader that sends files to an OmniParse service and wraps each parsed result in a llama_index Document."""
+
+    def __init__(
+        self, api_key: str = None, base_url: str = "http://localhost:8000", parse_options: OmniParseOptions = None
+    ):
+        """
+        Args:
+            api_key: Default None, can be used for authentication later.
+            base_url: OmniParse Base URL for the API.
+            parse_options: Optional settings for OmniParse. Default is OmniParseOptions with default values.
+        """
+        self.parse_options = parse_options or OmniParseOptions()
+        self.omniparse_client = OmniParseClient(api_key, base_url, max_timeout=self.parse_options.max_timeout)
+
+    @property
+    def parse_type(self):
+        """Parse type currently stored in `parse_options`."""
+        return self.parse_options.parse_type
+
+    @property
+    def result_type(self):
+        """Result type currently stored in `parse_options`."""
+        return self.parse_options.result_type
+
+    @parse_type.setter
+    def parse_type(self, parse_type: Union[str, OmniParseType]):
+        # Plain strings are converted so that an invalid value raises here rather than at parse time.
+        if isinstance(parse_type, str):
+            parse_type = OmniParseType(parse_type)
+        self.parse_options.parse_type = parse_type
+
+    @result_type.setter
+    def result_type(self, result_type: Union[str, ParseResultType]):
+        # Plain strings are converted so that an invalid value raises here rather than at parse time.
+        if isinstance(result_type, str):
+            result_type = ParseResultType(result_type)
+        self.parse_options.result_type = result_type
+
+    async def _aload_data(
+        self,
+        file_path: Union[str, bytes, Path],
+        extra_info: Optional[dict] = None,
+    ) -> List[Document]:
+        """
+        Parse a single input and wrap the result in a Document.
+
+        Args:
+            file_path: File path or file byte data.
+            extra_info: Optional dictionary containing additional information. It becomes the Document metadata;
+                for non-PDF parsing its "filename" entry is forwarded as the name of byte data.
+
+        Returns:
+            List[Document]: One Document on success. Any error is logged and an empty list is returned.
+        """
+        try:
+            if self.parse_type == OmniParseType.PDF:
+                # pdf parse
+                parsed_result = await self.omniparse_client.parse_pdf(file_path)
+            else:
+                # other parse use omniparse_client.parse_document
+                # For compatible byte data, additional filename is required
+                extra_info = extra_info or {}
+                filename = extra_info.get("filename")
+                parsed_result = await self.omniparse_client.parse_document(file_path, bytes_filename=filename)
+
+            # Get the specified structured data based on result_type
+            if self.result_type == ParseResultType.JSON:
+                # "json" is not a data field of the parsed result; looking it up by name yields a method, not text.
+                # Serialize the whole result instead (assumes the client returns a pydantic v2 model).
+                content = parsed_result.model_dump_json()
+            else:
+                content = getattr(parsed_result, self.result_type)
+            docs = [
+                Document(
+                    text=content,
+                    metadata=extra_info or {},
+                )
+            ]
+        except Exception as e:
+            # Deliberate best effort: a failed file must not abort the rest of a batch.
+            logger.error(f"OMNI Parse Error: {e}")
+            docs = []
+
+        return docs
+
+    async def aload_data(
+        self,
+        file_path: Union[List[Union[str, bytes, Path]], str, bytes, Path],
+        extra_info: Optional[dict] = None,
+    ) -> List[Document]:
+        """
+        Load data from the input file_path.
+
+        Args:
+            file_path: A single file path / file byte data, or a list (or tuple) of them.
+            extra_info: Optional dictionary containing additional information; shared by every input.
+
+        Notes:
+            This method ultimately calls _aload_data for processing.
+
+        Returns:
+            List[Document]: Documents of all inputs that parsed successfully; empty for unsupported input.
+        """
+        if isinstance(file_path, (str, bytes, Path)):
+            # Processing single file
+            return await self._aload_data(file_path, extra_info)
+
+        if isinstance(file_path, (list, tuple)):
+            # Concurrently process multiple files
+            parse_jobs = [self._aload_data(file_item, extra_info) for file_item in file_path]
+            doc_ret_list = await run_jobs(jobs=parse_jobs, workers=self.parse_options.num_workers)
+            return [doc for file_docs in doc_ret_list for doc in file_docs]
+
+        logger.warning(f"OMNI Parse: unsupported file_path type {type(file_path).__name__}, nothing parsed")
+        return []
+
+    def load_data(
+        self,
+        file_path: Union[List[Union[str, bytes, Path]], str, bytes, Path],
+        extra_info: Optional[dict] = None,
+    ) -> List[Document]:
+        """
+        Load data from the input file_path.
+
+        Args:
+            file_path: A single file path / file byte data, or a list (or tuple) of them.
+            extra_info: Optional dictionary containing additional information.
+
+        Notes:
+            This method ultimately calls aload_data for processing.
+
+        Returns:
+            List[Document]
+        """
+        NestAsyncio.apply_once()  # Ensure compatibility with nested async calls
+        return asyncio.run(self.aload_data(file_path, extra_info))
diff --git a/metagpt/rag/schema.py b/metagpt/rag/schema.py
@@ -1,7 +1,7 @@
 """RAG schemas."""
-
+from enum import Enum
 from pathlib import Path
-from typing import Any, ClassVar, Literal, Optional, Union
+from typing import Any, ClassVar, List, Literal, Optional, Union
 
 from chromadb.api.types import CollectionMetadata
 from llama_index.core.embeddings import BaseEmbedding
@@ -214,3 +214,51 @@ def get_obj_metadata(obj: RAGObject) -> dict:
         )
 
         return metadata.model_dump()
+
+
+class OmniParseType(str, Enum):
+    """Kinds of parsing that can be requested from OmniParse; members are also plain strings."""
+
+    PDF = "PDF"
+    DOCUMENT = "DOCUMENT"
+
+
+class ParseResultType(str, Enum):
+    """The result type for the parser; members are also plain strings."""
+
+    TXT = "text"
+    MD = "markdown"
+    JSON = "json"
+
+
+class OmniParseOptions(BaseModel):
+    """OmniParse Options config"""
+
+    # Defaults: markdown output, generic document parsing.
+    result_type: ParseResultType = Field(default=ParseResultType.MD, description="OmniParse result_type")
+    parse_type: OmniParseType = Field(default=OmniParseType.DOCUMENT, description="OmniParse parse_type")
+    # Optional: None is accepted in addition to an int.
+    max_timeout: Optional[int] = Field(default=120, description="Maximum timeout for OmniParse service requests")
+    # Bounds are exclusive (gt=0, lt=10), so valid values are 1 through 9.
+    num_workers: int = Field(
+        default=5,
+        gt=0,
+        lt=10,
+        description="Number of concurrent requests for multiple files",
+    )
+
+
+# NOTE(review): the class name reads "Omin" rather than "Omni" — looks like a typo; renaming would affect importers.
+class OminParseImage(BaseModel):
+    """One image entry of a parsed result: image data, its name and an optional info dict."""
+
+    image: str = Field(default="", description="image str bytes")
+    image_name: str = Field(default="", description="image name")
+    image_info: Optional[dict] = Field(default={}, description="image info")
+
+
+class OmniParsedResult(BaseModel):
+    """Parsed output for one input: markdown, plain text, images and metadata."""
+
+    markdown: str = Field(default="", description="markdown text")
+    text: str = Field(default="", description="plain text")
+    images: Optional[List[OminParseImage]] = Field(default=[], description="images")
+    metadata: Optional[dict] = Field(default={}, description="metadata")
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_markdown(cls, values):
+        """
+        Use `text` as `markdown` when no markdown was supplied.
+
+        Args:
+            values: Raw input to the model, before field validation.
+
+        Returns:
+            The input unchanged, or a copy with "markdown" filled in from "text".
+        """
+        # Before-validators receive the raw input, which is not guaranteed to be a dict.
+        if not isinstance(values, dict):
+            return values
+        # Only fall back when there is text to fall back to: writing None here would fail `markdown: str`
+        # validation and make a bare OmniParsedResult() unconstructible.
+        if not values.get("markdown") and values.get("text"):
+            # Copy instead of mutating the caller's dict.
+            values = {**values, "markdown": values["text"]}
+        return values
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from metagpt.rag.parsers.omniparse import OmniParse

		__all__ = ["OmniParse"]