feat(document-search): init document-search module with basic RAG capabilities on text (#3)
deepsense-ai · Sep 16, 2024 · c8ae2f9 · c8ae2f9
1 parent 25d8249
commit c8ae2f9
Show file tree

Hide file tree

Showing 29 changed files with 720 additions and 0 deletions.
diff --git a/packages/ragnarok-document-search/examples/simple_text.py b/packages/ragnarok-document-search/examples/simple_text.py
@@ -0,0 +1,33 @@
+import asyncio
+
+from ragnarok_document_search import DocumentSearch
+from ragnarok_document_search.documents.document import DocumentMeta
+from ragnarok_document_search.vector_store.in_memory import InMemoryVectorStore
+
+from ragnarok_common.embeddings.litellm import LiteLLMEmbeddings
+
+documents = [
+    DocumentMeta.create_text_document_from_literal("RIP boiled water. You will be mist."),
+    DocumentMeta.create_text_document_from_literal(
+        "Why doesn't James Bond fart in bed? Because it would blow his cover."
+    ),
+    DocumentMeta.create_text_document_from_literal(
+        "Why programmers don't like to swim? Because they're scared of the floating points."
+    ),
+]
+
+
+async def main():
+    """Run the example."""
+
+    document_search = DocumentSearch(embedder=LiteLLMEmbeddings(), vector_store=InMemoryVectorStore())
+
+    for document in documents:
+        await document_search.ingest_document(document)
+
+    results = await document_search.search("I'm boiling my water and I need a joke")
+    print(results)
+
+
+# Only start the event loop when the file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/packages/ragnarok-document-search/pyproject.toml b/packages/ragnarok-document-search/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools >= 40.9.0", "wheel"]
+build-backend = "setuptools.build_meta"
diff --git a/packages/ragnarok-document-search/setup.cfg b/packages/ragnarok-document-search/setup.cfg
@@ -0,0 +1,43 @@
+[metadata]
+name = ragnarok-document-search
+# do not change version by hand: use bump_version.sh
+version = 0.0.1
+description = "The ragstack module responsible for fetching data from unstructured data sources."
+author = deepsense.ai
+author_email = [email protected]
+license = Other/Proprietary License
+license_files = LICENSE.md
+classifiers =
+    Development Status :: 1 - Planning
+    Environment :: Console
+    Intended Audience :: Science/Research
+    License :: Other/Proprietary License
+    Natural Language :: English
+    Operating System :: OS Independent
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
+    Topic :: Scientific/Engineering :: Artificial Intelligence
+    Private :: Do Not Upload
+
+[options]
+package_dir=
+    =src
+packages=find:
+zip_safe = False
+platforms = any
+include_package_data = True
+python_requires = >=3.10
+install_requires =
+    numpy>=1.24.0
+    pydantic>=2.8.2
+
+[options.packages.find]
+where=src
+
+[bdist_wheel]
+universal = 1
+
+[aliases]
+# Alias `setup.py test` to `setup.py pytest`
+test = pytest
diff --git a/packages/ragnarok-document-search/src/py.typed b/packages/ragnarok-document-search/src/py.typed
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/__init__.py
@@ -0,0 +1,3 @@
+# Re-export the main entrypoint so that it can be imported directly from the package root.
+from ._main import DocumentSearch
+
+__all__ = ["DocumentSearch"]
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/__version__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/__version__.py
@@ -0,0 +1,3 @@
+"""Version information."""
+
+__version__ = "0.0.1"
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/_main.py b/packages/ragnarok-document-search/src/ragnarok_document_search/_main.py
@@ -0,0 +1,78 @@
+from ragnarok_document_search.documents.document import DocumentMeta
+from ragnarok_document_search.documents.element import Element
+from ragnarok_document_search.ingestion.document_processor import DocumentProcessor
+from ragnarok_document_search.retrieval.rephrasers.base import QueryRephraser
+from ragnarok_document_search.retrieval.rephrasers.noop import NoopQueryRephraser
+from ragnarok_document_search.retrieval.rerankers.base import Reranker
+from ragnarok_document_search.retrieval.rerankers.noop import NoopReranker
+from ragnarok_document_search.vector_store.base import VectorStore
+
+from ragnarok_common.embeddings.base import Embeddings
+
+
+class DocumentSearch:
+    """
+    A main entrypoint to the DocumentSearch functionality.
+
+    It provides methods for both ingestion and retrieval.
+
+    Retrieval:
+
+        1. Uses QueryRephraser to rephrase the query.
+        2. Uses VectorStore to retrieve the most relevant chunks.
+        3. Uses Reranker to rerank the chunks.
+    """
+
+    embedder: Embeddings
+
+    vector_store: VectorStore
+
+    query_rephraser: QueryRephraser
+    reranker: Reranker
+
+    def __init__(
+        self,
+        embedder: Embeddings,
+        vector_store: VectorStore,
+        query_rephraser: QueryRephraser | None = None,
+        reranker: Reranker | None = None,
+    ) -> None:
+        self.embedder = embedder
+        self.vector_store = vector_store
+        self.query_rephraser = query_rephraser or NoopQueryRephraser()
+        self.reranker = reranker or NoopReranker()
+
+    async def search(self, query: str) -> list[Element]:
+        """
+        Search for the most relevant chunks for a query.
+
+        Args:
+            query: The query to search for.
+
+        Returns:
+            A list of chunks.
+        """
+        queries = self.query_rephraser.rephrase(query)
+        chunks = []
+        for rephrased_query in queries:
+            search_vector = await self.embedder.embed_text([rephrased_query])
+            # TODO: search parameters should be configurable
+            entries = await self.vector_store.retrieve(search_vector[0], k=1)
+            chunks.extend([Element.from_vector_db_entry(entry) for entry in entries])
+
+        return self.reranker.rerank(chunks)
+
+    async def ingest_document(self, document: DocumentMeta) -> None:
+        """
+        Ingest a document.
+
+        Args:
+            document: The document to ingest.
+        """
+        # TODO: This is a placeholder implementation. It should be replaced with a real implementation.
+
+        document_processor = DocumentProcessor()
+        elements = await document_processor.process(document)
+        vectors = await self.embedder.embed_text([element.get_key() for element in elements])
+        entries = [element.to_vector_db_entry(vector) for element, vector in zip(elements, vectors)]
+        await self.vector_store.store(entries)
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/documents/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/__init__.py
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/documents/document.py b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/document.py
@@ -0,0 +1,105 @@
+import tempfile
+from enum import Enum
+from pathlib import Path
+from typing import Union
+
+from pydantic import BaseModel, Field
+from ragnarok_document_search.documents.sources import LocalFileSource
+
+
+class DocumentType(str, Enum):
+    """Types of documents that can be stored."""
+
+    # Both types are handled as text documents by Document.from_document_meta.
+    MD = "md"
+    TXT = "txt"
+
+
+class DocumentMeta(BaseModel):
+    """
+    An object representing a document metadata.
+    """
+
+    document_type: DocumentType
+    source: Union[LocalFileSource] = Field(..., discriminator="source_type")
+
+    @property
+    def id(self) -> str:
+        """
+        Get the document ID.
+
+        Returns:
+            The document ID.
+        """
+        return self.source.get_id()
+
+    async def fetch(self) -> "Document":
+        """
+        This method fetches the document from source (potentially remote) and creates an object to interface with it.
+        Based on the document type, it will return a different object.
+
+        Returns:
+            The document.
+        """
+        local_path = await self.source.fetch()
+        return Document.from_document_meta(self, local_path)
+
+    @classmethod
+    def create_text_document_from_literal(cls, content: str) -> "DocumentMeta":
+        """
+        Create a text document from a literal content.
+
+        Args:
+            content: The content of the document.
+
+        Returns:
+            The document metadata.
+        """
+        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+            temp_file.write(content.encode())
+
+        return cls(
+            document_type=DocumentType.TXT,
+            source=LocalFileSource(path=Path(temp_file.name)),
+        )
+
+
+class Document(BaseModel):
+    """
+    An object representing a document which is downloaded and stored locally.
+    """
+
+    local_path: Path
+    metadata: DocumentMeta
+
+    @classmethod
+    def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
+        """
+        Create a document from a document metadata.
+        Based on the document type, it will return a different object.
+
+        Args:
+            document_meta: The document metadata.
+            local_path: The local path to the document.
+
+        Returns:
+            The document.
+        """
+        if document_meta.document_type in [DocumentType.MD, DocumentType.TXT]:
+            return TextDocument(local_path=local_path, metadata=document_meta)
+        return cls(local_path=local_path, metadata=document_meta)
+
+
+class TextDocument(Document):
+    """
+    An object representing a text document.
+    """
+
+    @property
+    def content(self) -> str:
+        """
+        Get the content of the document.
+
+        Returns:
+            The content of the document.
+        """
+        return self.local_path.read_text()
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/documents/element.py b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/element.py
@@ -0,0 +1,86 @@
+from abc import ABC, abstractmethod
+from typing import ClassVar
+
+from pydantic import BaseModel
+from ragnarok_document_search.documents.document import DocumentMeta
+from ragnarok_document_search.vector_store.base import VectorDBEntry
+
+
+class Element(BaseModel, ABC):
+    """
+    An object representing an element in a document.
+    """
+
+    element_type: str
+    document: DocumentMeta
+
+    _elements_registry: ClassVar[dict[str, type["Element"]]] = {}
+
+    @abstractmethod
+    def get_key(self) -> str:
+        """
+        Get the key of the element which will be used to generate the vector.
+
+        Returns:
+            The key.
+        """
+
+    @classmethod
+    def __pydantic_init_subclass__(cls, **kwargs):  # pylint: disable=unused-argument
+        element_type_default = cls.model_fields["element_type"].default
+
+        if element_type_default is None:
+            raise ValueError("Element type must be defined")
+
+        Element._elements_registry[element_type_default] = cls
+
+    @classmethod
+    def from_vector_db_entry(cls, db_entry: VectorDBEntry) -> "Element":
+        """
+        Create an element from a vector database entry.
+
+        Args:
+            db_entry: The vector database entry.
+
+        Returns:
+            The element.
+        """
+        meta = db_entry.metadata
+        element_type = meta["element_type"]
+        element_cls = Element._elements_registry[element_type]
+
+        return element_cls(**meta)
+
+    def to_vector_db_entry(self, vector: list[float]) -> VectorDBEntry:
+        """
+        Create a vector database entry from the element.
+
+        Args:
+            vector: The vector.
+
+        Returns:
+            The vector database entry
+        """
+        return VectorDBEntry(
+            key=self.get_key(),
+            vector=vector,
+            metadata=self.model_dump(),
+        )
+
+
+class TextElement(Element):
+    """
+    An object representing a text element in a document.
+    """
+
+    # The default value is the key under which Element registers this class.
+    element_type: str = "text"
+    # The text of the element; it is also used as the key (see get_key).
+    content: str
+
+    def get_key(self) -> str:
+        """
+        Get the key of the element which will be used to generate the vector.
+
+        Returns:
+            The key.
+        """
+        return self.content
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from ._main import DocumentSearch

		__all__ = ["DocumentSearch"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		"""Version information."""

		__version__ = "0.0.1"