From c8ae2f98047e15e7ee131997c535312f824e55f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Hordy=C5=84ski?= <26008518+mhordynski@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:29:42 +0200 Subject: [PATCH] feat(document-search): init document-search module with basic RAG capabilities on text (#3) --- .../examples/simple_text.py | 33 ++++++ .../ragnarok-document-search/pyproject.toml | 3 + packages/ragnarok-document-search/setup.cfg | 43 +++++++ .../ragnarok-document-search/src/py.typed | 0 .../src/ragnarok_document_search/__init__.py | 3 + .../ragnarok_document_search/__version__.py | 3 + .../src/ragnarok_document_search/_main.py | 78 +++++++++++++ .../documents/__init__.py | 0 .../documents/document.py | 105 ++++++++++++++++++ .../documents/element.py | 86 ++++++++++++++ .../documents/sources.py | 56 ++++++++++ .../ingestion/__init__.py | 0 .../ingestion/document_processor.py | 35 ++++++ .../src/ragnarok_document_search/py.typed | 0 .../retrieval/__init__.py | 0 .../retrieval/rephrasers/__init__.py | 0 .../retrieval/rephrasers/base.py | 20 ++++ .../retrieval/rephrasers/noop.py | 20 ++++ .../retrieval/rerankers/__init__.py | 0 .../retrieval/rerankers/base.py | 22 ++++ .../retrieval/rerankers/noop.py | 23 ++++ .../vector_store/__init__.py | 0 .../vector_store/base.py | 42 +++++++ .../vector_store/in_memory.py | 46 ++++++++ .../tests/unit/__init__.py | 0 .../tests/unit/test_document_search.py | 24 ++++ .../tests/unit/test_documents.py | 20 ++++ .../tests/unit/test_elements.py | 30 +++++ .../tests/unit/test_simple_vector_store.py | 28 +++++ 29 files changed, 720 insertions(+) create mode 100644 packages/ragnarok-document-search/examples/simple_text.py create mode 100644 packages/ragnarok-document-search/pyproject.toml create mode 100644 packages/ragnarok-document-search/setup.cfg create mode 100644 packages/ragnarok-document-search/src/py.typed create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/__init__.py create mode 100644 
packages/ragnarok-document-search/src/ragnarok_document_search/__version__.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/_main.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/documents/__init__.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/documents/document.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/documents/element.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/documents/sources.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/ingestion/__init__.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/ingestion/document_processor.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/py.typed create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/__init__.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/__init__.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/base.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/noop.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/__init__.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/base.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/noop.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/__init__.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/base.py create mode 100644 packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/in_memory.py create mode 
100644 packages/ragnarok-document-search/tests/unit/__init__.py create mode 100644 packages/ragnarok-document-search/tests/unit/test_document_search.py create mode 100644 packages/ragnarok-document-search/tests/unit/test_documents.py create mode 100644 packages/ragnarok-document-search/tests/unit/test_elements.py create mode 100644 packages/ragnarok-document-search/tests/unit/test_simple_vector_store.py diff --git a/packages/ragnarok-document-search/examples/simple_text.py b/packages/ragnarok-document-search/examples/simple_text.py new file mode 100644 index 00000000..9fc7d2f7 --- /dev/null +++ b/packages/ragnarok-document-search/examples/simple_text.py @@ -0,0 +1,33 @@ +import asyncio + +from ragnarok_document_search import DocumentSearch +from ragnarok_document_search.documents.document import DocumentMeta +from ragnarok_document_search.vector_store.in_memory import InMemoryVectorStore + +from ragnarok_common.embeddings.litellm import LiteLLMEmbeddings + +documents = [ + DocumentMeta.create_text_document_from_literal("RIP boiled water. You will be mist."), + DocumentMeta.create_text_document_from_literal( + "Why doesn't James Bond fart in bed? Because it would blow his cover." + ), + DocumentMeta.create_text_document_from_literal( + "Why programmers don't like to swim? Because they're scared of the floating points." 
+    ),
+]
+
+
+async def main():
+    """Run the example."""
+
+    document_search = DocumentSearch(embedder=LiteLLMEmbeddings(), vector_store=InMemoryVectorStore())
+
+    for document in documents:
+        await document_search.ingest_document(document)
+
+    results = await document_search.search("I'm boiling my water and I need a joke")
+    print(results)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/packages/ragnarok-document-search/pyproject.toml b/packages/ragnarok-document-search/pyproject.toml
new file mode 100644
index 00000000..67ea20b7
--- /dev/null
+++ b/packages/ragnarok-document-search/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools >= 40.9.0", "wheel"]
+build-backend = "setuptools.build_meta"
diff --git a/packages/ragnarok-document-search/setup.cfg b/packages/ragnarok-document-search/setup.cfg
new file mode 100644
index 00000000..95a0e87b
--- /dev/null
+++ b/packages/ragnarok-document-search/setup.cfg
@@ -0,0 +1,43 @@
+[metadata]
+name = ragnarok-document-search
+# do not change version by hand: use bump_version.sh
+version = 0.0.1
+description = The ragstack module responsible for fetching data from unstructured data sources.
+author = deepsense.ai
+author_email = contact@deepsense.ai
+license = Other/Proprietary License
+license_files = LICENSE.md
+classifiers =
+    Development Status :: 1 - Planning
+    Environment :: Console
+    Intended Audience :: Science/Research
+    License :: Other/Proprietary License
+    Natural Language :: English
+    Operating System :: OS Independent
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
+    Topic :: Scientific/Engineering :: Artificial Intelligence
+    Private :: Do Not Upload
+
+[options]
+package_dir=
+    =src
+packages=find:
+zip_safe = False
+platforms = any
+include_package_data = True
+python_requires = >=3.10
+install_requires =
+    numpy>=1.24.0
+    pydantic>=2.8.2
+
+[options.packages.find]
+where=src
+
+[bdist_wheel]
+universal = 0
+
+[aliases]
+# Alias `setup.py test` to `setup.py pytest`
+test = pytest
diff --git a/packages/ragnarok-document-search/src/py.typed b/packages/ragnarok-document-search/src/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/__init__.py
new file mode 100644
index 00000000..aafe8c11
--- /dev/null
+++ b/packages/ragnarok-document-search/src/ragnarok_document_search/__init__.py
@@ -0,0 +1,3 @@
+from ._main import DocumentSearch
+
+__all__ = ["DocumentSearch"]
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/__version__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/__version__.py
new file mode 100644
index 00000000..3742a1f5
--- /dev/null
+++ b/packages/ragnarok-document-search/src/ragnarok_document_search/__version__.py
@@ -0,0 +1,3 @@
+"""Version information."""
+
+__version__ = "0.0.1"
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/_main.py b/packages/ragnarok-document-search/src/ragnarok_document_search/_main.py
new file mode 100644
index 00000000..a1642668
--- /dev/null
+++
b/packages/ragnarok-document-search/src/ragnarok_document_search/_main.py @@ -0,0 +1,78 @@ +from ragnarok_document_search.documents.document import DocumentMeta +from ragnarok_document_search.documents.element import Element +from ragnarok_document_search.ingestion.document_processor import DocumentProcessor +from ragnarok_document_search.retrieval.rephrasers.base import QueryRephraser +from ragnarok_document_search.retrieval.rephrasers.noop import NoopQueryRephraser +from ragnarok_document_search.retrieval.rerankers.base import Reranker +from ragnarok_document_search.retrieval.rerankers.noop import NoopReranker +from ragnarok_document_search.vector_store.base import VectorStore + +from ragnarok_common.embeddings.base import Embeddings + + +class DocumentSearch: + """ + A main entrypoint to the DocumentSearch functionality. + + It provides methods for both ingestion and retrieval. + + Retrieval: + + 1. Uses QueryRephraser to rephrase the query. + 2. Uses VectorStore to retrieve the most relevant chunks. + 3. Uses Reranker to rerank the chunks. + """ + + embedder: Embeddings + + vector_store: VectorStore + + query_rephraser: QueryRephraser + reranker: Reranker + + def __init__( + self, + embedder: Embeddings, + vector_store: VectorStore, + query_rephraser: QueryRephraser | None = None, + reranker: Reranker | None = None, + ) -> None: + self.embedder = embedder + self.vector_store = vector_store + self.query_rephraser = query_rephraser or NoopQueryRephraser() + self.reranker = reranker or NoopReranker() + + async def search(self, query: str) -> list[Element]: + """ + Search for the most relevant chunks for a query. + + Args: + query: The query to search for. + + Returns: + A list of chunks. 
+        """
+        queries = self.query_rephraser.rephrase(query)
+        chunks = []
+        for rephrased_query in queries:
+            search_vector = await self.embedder.embed_text([rephrased_query])
+            # TODO: search parameters should be configurable
+            entries = await self.vector_store.retrieve(search_vector[0], k=1)
+            chunks.extend([Element.from_vector_db_entry(entry) for entry in entries])
+
+        return self.reranker.rerank(chunks)
+
+    async def ingest_document(self, document: DocumentMeta) -> None:
+        """
+        Ingest a document.
+
+        Args:
+            document: The document to ingest.
+        """
+        # TODO: This is a placeholder implementation. It should be replaced with a real implementation.
+
+        document_processor = DocumentProcessor()
+        elements = await document_processor.process(document)
+        vectors = await self.embedder.embed_text([element.get_key() for element in elements])
+        entries = [element.to_vector_db_entry(vector) for element, vector in zip(elements, vectors, strict=True)]
+        await self.vector_store.store(entries)
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/documents/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/documents/document.py b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/document.py
new file mode 100644
index 00000000..7810b719
--- /dev/null
+++ b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/document.py
@@ -0,0 +1,105 @@
+import tempfile
+from enum import Enum
+from pathlib import Path
+from typing import Union
+
+from pydantic import BaseModel, Field
+from ragnarok_document_search.documents.sources import LocalFileSource
+
+
+class DocumentType(str, Enum):
+    """Types of documents that can be stored."""
+
+    MD = "md"
+    TXT = "txt"
+
+
+class DocumentMeta(BaseModel):
+    """
+    An object representing a document metadata.
+ """ + + document_type: DocumentType + source: Union[LocalFileSource] = Field(..., discriminator="source_type") + + @property + def id(self) -> str: + """ + Get the document ID. + + Returns: + The document ID. + """ + return self.source.get_id() + + async def fetch(self) -> "Document": + """ + This method fetches the document from source (potentially remote) and creates an object to interface with it. + Based on the document type, it will return a different object. + + Returns: + The document. + """ + local_path = await self.source.fetch() + return Document.from_document_meta(self, local_path) + + @classmethod + def create_text_document_from_literal(cls, content: str) -> "DocumentMeta": + """ + Create a text document from a literal content. + + Args: + content: The content of the document. + + Returns: + The document metadata. + """ + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + temp_file.write(content.encode()) + + return cls( + document_type=DocumentType.TXT, + source=LocalFileSource(path=Path(temp_file.name)), + ) + + +class Document(BaseModel): + """ + An object representing a document which is downloaded and stored locally. + """ + + local_path: Path + metadata: DocumentMeta + + @classmethod + def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document": + """ + Create a document from a document metadata. + Based on the document type, it will return a different object. + + Args: + document_meta: The document metadata. + local_path: The local path to the document. + + Returns: + The document. + """ + if document_meta.document_type in [DocumentType.MD, DocumentType.TXT]: + return TextDocument(local_path=local_path, metadata=document_meta) + return cls(local_path=local_path, metadata=document_meta) + + +class TextDocument(Document): + """ + An object representing a text document. + """ + + @property + def content(self) -> str: + """ + Get the content of the document. + + Returns: + The content of the document. 
+        """
+        return self.local_path.read_text(encoding="utf-8")
diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/documents/element.py b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/element.py
new file mode 100644
index 00000000..ea1e4aee
--- /dev/null
+++ b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/element.py
@@ -0,0 +1,86 @@
+from abc import ABC, abstractmethod
+from typing import ClassVar
+
+from pydantic import BaseModel
+from ragnarok_document_search.documents.document import DocumentMeta
+from ragnarok_document_search.vector_store.base import VectorDBEntry
+
+
+class Element(BaseModel, ABC):
+    """
+    An object representing an element in a document.
+    """
+
+    element_type: str
+    document: DocumentMeta
+
+    _elements_registry: ClassVar[dict[str, type["Element"]]] = {}
+
+    @abstractmethod
+    def get_key(self) -> str:
+        """
+        Get the key of the element which will be used to generate the vector.
+
+        Returns:
+            The key.
+        """
+
+    @classmethod
+    def __pydantic_init_subclass__(cls, **kwargs):  # pylint: disable=unused-argument
+        element_type_default = cls.model_fields["element_type"].default
+
+        if not isinstance(element_type_default, str):
+            raise ValueError("Element type must be defined")
+
+        Element._elements_registry[element_type_default] = cls
+
+    @classmethod
+    def from_vector_db_entry(cls, db_entry: VectorDBEntry) -> "Element":
+        """
+        Create an element from a vector database entry.
+
+        Args:
+            db_entry: The vector database entry.
+
+        Returns:
+            The element.
+        """
+        meta = db_entry.metadata
+        element_type = meta["element_type"]
+        element_cls = Element._elements_registry[element_type]
+
+        return element_cls(**meta)
+
+    def to_vector_db_entry(self, vector: list[float]) -> VectorDBEntry:
+        """
+        Create a vector database entry from the element.
+
+        Args:
+            vector: The vector.
+ + Returns: + The vector database entry + """ + return VectorDBEntry( + key=self.get_key(), + vector=vector, + metadata=self.model_dump(), + ) + + +class TextElement(Element): + """ + An object representing a text element in a document. + """ + + element_type: str = "text" + content: str + + def get_key(self) -> str: + """ + Get the key of the element which will be used to generate the vector. + + Returns: + The key. + """ + return self.content diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/documents/sources.py b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/sources.py new file mode 100644 index 00000000..e1fc9a4c --- /dev/null +++ b/packages/ragnarok-document-search/src/ragnarok_document_search/documents/sources.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Literal + +from pydantic import BaseModel + + +class Source(BaseModel, ABC): + """ + An object representing a source. + """ + + @abstractmethod + def get_id(self) -> str: + """ + Get the source ID. + + Returns: + The source ID. + """ + + @abstractmethod + async def fetch(self) -> Path: + """ + Load the source. + + Returns: + The path to the source. + """ + + +class LocalFileSource(Source): + """ + An object representing a local file source. + """ + + source_type: Literal["local_file"] = "local_file" + path: Path + + def get_id(self) -> str: + """ + Get unique identifier of the object in the source. + + Returns: + Unique identifier. + """ + return f"local_file:{self.path.absolute()}" + + async def fetch(self) -> Path: + """ + Fetch the source. + + Returns: + The local path to the object fetched from the source. 
+ """ + return self.path diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/ingestion/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/ingestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/ingestion/document_processor.py b/packages/ragnarok-document-search/src/ragnarok_document_search/ingestion/document_processor.py new file mode 100644 index 00000000..5f10301f --- /dev/null +++ b/packages/ragnarok-document-search/src/ragnarok_document_search/ingestion/document_processor.py @@ -0,0 +1,35 @@ +""" +TODO: This module is mocked. To be deleted and replaced with a real implementation. +""" + +from typing import List + +from ragnarok_document_search.documents.document import DocumentMeta, TextDocument +from ragnarok_document_search.documents.element import Element, TextElement + + +class DocumentProcessor: + """ + A class with an implementation of Document Processor, allowing to process documents. + + TODO: probably this one should be replaced with something more generic, + allowing for passing different processors for different document types. + """ + + async def process(self, document_meta: DocumentMeta) -> List[Element]: + """ + Process the document. + + Args: + document_meta: The document to process. + + Returns: + The processed elements. 
+ """ + document = await document_meta.fetch() + + if isinstance(document, TextDocument): + # for now just return the whole document as a single element + return [TextElement(document=document_meta, content=document.content)] + + return [] diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/py.typed b/packages/ragnarok-document-search/src/ragnarok_document_search/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/base.py b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/base.py new file mode 100644 index 00000000..a40b9f9b --- /dev/null +++ b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/base.py @@ -0,0 +1,20 @@ +import abc + + +class QueryRephraser(abc.ABC): + """ + Rephrases a query. Can provide multiple rephrased queries from one sentence / question. + """ + + @staticmethod + @abc.abstractmethod + def rephrase(query: str) -> list[str]: + """ + Rephrase a query. + + Args: + query: The query to rephrase. + + Returns: + The rephrased queries. 
+ """ diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/noop.py b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/noop.py new file mode 100644 index 00000000..f89ab39b --- /dev/null +++ b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rephrasers/noop.py @@ -0,0 +1,20 @@ +from ragnarok_document_search.retrieval.rephrasers.base import QueryRephraser + + +class NoopQueryRephraser(QueryRephraser): + """ + A no-op query paraphraser that does not change the query. + """ + + @staticmethod + def rephrase(query: str) -> list[str]: + """ + Mock implementation which outputs the same query as in input. + + Args: + query: The query to rephrase. + + Returns: + The list with non-transformed query. + """ + return [query] diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/base.py b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/base.py new file mode 100644 index 00000000..2c941d99 --- /dev/null +++ b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/base.py @@ -0,0 +1,22 @@ +import abc + +from ragnarok_document_search.documents.element import Element + + +class Reranker(abc.ABC): + """ + Reranks chunks retrieved from vector store. + """ + + @staticmethod + @abc.abstractmethod + def rerank(chunks: list[Element]) -> list[Element]: + """ + Rerank chunks. + + Args: + chunks: The chunks to rerank. + + Returns: + The reranked chunks. 
+ """ diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/noop.py b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/noop.py new file mode 100644 index 00000000..9ef73cbb --- /dev/null +++ b/packages/ragnarok-document-search/src/ragnarok_document_search/retrieval/rerankers/noop.py @@ -0,0 +1,23 @@ +from typing import List + +from ragnarok_document_search.documents.element import Element +from ragnarok_document_search.retrieval.rerankers.base import Reranker + + +class NoopReranker(Reranker): + """ + A no-op reranker that does not change the order of the chunks. + """ + + @staticmethod + def rerank(chunks: List[Element]) -> List[Element]: + """ + No reranking, returning the same chunks as in input. + + Args: + chunks: The chunks to rerank. + + Returns: + The reranked chunks. + """ + return chunks diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/__init__.py b/packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/base.py b/packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/base.py new file mode 100644 index 00000000..4d494c56 --- /dev/null +++ b/packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/base.py @@ -0,0 +1,42 @@ +import abc +from typing import List + +from pydantic import BaseModel + + +class VectorDBEntry(BaseModel): + """ + An object representing a vector database entry. + """ + + key: str + vector: list[float] + metadata: dict + + +class VectorStore(abc.ABC): + """ + A class with an implementation of Vector Store, allowing to store and retrieve vectors by similarity function. + """ + + @abc.abstractmethod + async def store(self, entries: List[VectorDBEntry]) -> None: + """ + Store entries in the vector store. 
+ + Args: + entries: The entries to store. + """ + + @abc.abstractmethod + async def retrieve(self, vector: list[float], k: int = 5) -> list[VectorDBEntry]: + """ + Retrieve entries from the vector store. + + Args: + vector: The vector to search for. + k: The number of entries to retrieve. + + Returns: + The entries. + """ diff --git a/packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/in_memory.py b/packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/in_memory.py new file mode 100644 index 00000000..babe6b1c --- /dev/null +++ b/packages/ragnarok-document-search/src/ragnarok_document_search/vector_store/in_memory.py @@ -0,0 +1,46 @@ +import numpy as np +from ragnarok_document_search.vector_store.base import VectorDBEntry, VectorStore + + +class InMemoryVectorStore(VectorStore): + """ + A simple in-memory implementation of Vector Store, storing vectors in memory. + """ + + def __init__(self): + self._storage = {} + + async def store(self, entries: list[VectorDBEntry]) -> None: + """ + Store entries in the vector store. + + Args: + entries: The entries to store. + """ + for entry in entries: + self._storage[entry.key] = entry + + async def retrieve(self, vector: list[float], k: int = 5) -> list[VectorDBEntry]: + """ + Retrieve entries from the vector store. + + Args: + vector: The vector to search for. + k: The number of entries to retrieve. + + Returns: + The entries. 
+        """
+        knn = []
+
+        for entry in self._storage.values():
+            entry_distance = self._calculate_squared_euclidean(entry.vector, vector)
+            knn.append((entry, entry_distance))
+
+        knn.sort(key=lambda x: x[1])
+
+        return [entry for entry, _ in knn[:k]]
+
+    @staticmethod
+    def _calculate_squared_euclidean(vector_x: list[float], vector_b: list[float]) -> float:
+        return float(np.sum((np.array(vector_x) - np.array(vector_b)) ** 2))
diff --git a/packages/ragnarok-document-search/tests/unit/__init__.py b/packages/ragnarok-document-search/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/packages/ragnarok-document-search/tests/unit/test_document_search.py b/packages/ragnarok-document-search/tests/unit/test_document_search.py
new file mode 100644
index 00000000..6d8b3bbb
--- /dev/null
+++ b/packages/ragnarok-document-search/tests/unit/test_document_search.py
@@ -0,0 +1,24 @@
+from unittest.mock import AsyncMock
+
+from ragnarok_document_search import DocumentSearch
+from ragnarok_document_search.documents.document import DocumentMeta
+from ragnarok_document_search.documents.element import TextElement
+from ragnarok_document_search.vector_store.in_memory import InMemoryVectorStore
+
+
+async def test_document_search():
+    embeddings_mock = AsyncMock()
+    embeddings_mock.embed_text.return_value = [[0.1, 0.1]]
+
+    document_search = DocumentSearch(embedder=embeddings_mock, vector_store=InMemoryVectorStore())
+
+    await document_search.ingest_document(
+        DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George")
+    )
+
+    results = await document_search.search("Peppa's brother")
+
+    first_result = results[0]
+
+    assert isinstance(first_result, TextElement)
+    assert first_result.content == "Name of Peppa's brother is George"
diff --git a/packages/ragnarok-document-search/tests/unit/test_documents.py b/packages/ragnarok-document-search/tests/unit/test_documents.py
new file mode 100644
index 00000000..beb2ed6a
--- /dev/null
+++
b/packages/ragnarok-document-search/tests/unit/test_documents.py @@ -0,0 +1,20 @@ +import tempfile +from pathlib import Path + +from ragnarok_document_search.documents.document import DocumentMeta, DocumentType, TextDocument +from ragnarok_document_search.documents.sources import LocalFileSource + + +async def test_loading_local_file_source(): + with tempfile.NamedTemporaryFile() as f: + f.write(b"test") + f.seek(0) + + source = LocalFileSource(path=Path(f.name)) + + document_meta = DocumentMeta(document_type=DocumentType.TXT, source=source) + + document = await document_meta.fetch() + + assert isinstance(document, TextDocument) + assert document.content == "test" diff --git a/packages/ragnarok-document-search/tests/unit/test_elements.py b/packages/ragnarok-document-search/tests/unit/test_elements.py new file mode 100644 index 00000000..05c73a32 --- /dev/null +++ b/packages/ragnarok-document-search/tests/unit/test_elements.py @@ -0,0 +1,30 @@ +from ragnarok_document_search.documents.document import DocumentType +from ragnarok_document_search.documents.element import Element +from ragnarok_document_search.vector_store.base import VectorDBEntry + + +def test_resolving_element_type(): + class MyElement(Element): + element_type: str = "custom_element" + foo: str + + def get_key(self) -> str: + return self.foo + self.foo + + element = Element.from_vector_db_entry( + db_entry=VectorDBEntry( + key="key", + vector=[0.1, 0.2], + metadata={ + "element_type": "custom_element", + "foo": "bar", + "document": {"document_type": "txt", "source": {"source_type": "local_file", "path": "/example/path"}}, + }, + ) + ) + + assert isinstance(element, MyElement) + assert element.foo == "bar" + assert element.get_key() == "barbar" + assert element.document.document_type == DocumentType.TXT + assert element.document.source.source_type == "local_file" diff --git a/packages/ragnarok-document-search/tests/unit/test_simple_vector_store.py 
b/packages/ragnarok-document-search/tests/unit/test_simple_vector_store.py new file mode 100644 index 00000000..41510683 --- /dev/null +++ b/packages/ragnarok-document-search/tests/unit/test_simple_vector_store.py @@ -0,0 +1,28 @@ +from pathlib import Path + +from ragnarok_document_search.documents.document import DocumentMeta, DocumentType +from ragnarok_document_search.documents.element import TextElement +from ragnarok_document_search.documents.sources import LocalFileSource +from ragnarok_document_search.vector_store.in_memory import InMemoryVectorStore + + +async def test_simple_vector_store(): + store = InMemoryVectorStore() + + document = DocumentMeta(document_type=DocumentType.TXT, source=LocalFileSource(path=Path("test.txt"))) + elements = [ + (TextElement(content="dog", document=document), [0.5, 0.5]), + (TextElement(content="cat", document=document), [0.6, 0.6]), + ] + + entries = [element[0].to_vector_db_entry(vector=element[1]) for element in elements] + + await store.store(entries) + + search_vector = [0.4, 0.4] + + results = await store.retrieve(search_vector, 2) + + assert len(results) == 2 + assert results[0].metadata["content"] == "dog" + assert results[1].metadata["content"] == "cat"