From 240ed227a3855335b4dec8e5faf1cb88ac4c69a6 Mon Sep 17 00:00:00 2001 From: MichaelDecent Date: Mon, 13 Jan 2025 09:32:24 +0100 Subject: [PATCH] Add Swarmauri Annoy Vector Store community package --- .../swarmauri_community/pyproject.toml | 4 +- .../vector_stores/AnnoyVectorStore_test.py | 2 +- .../README.md | 1 + .../pyproject.toml | 59 ++++ .../AnnoyVectorStore.py | 291 ++++++++++++++++++ .../__init__.py | 12 + .../tests/unit/AnnoyVectorStore_test.py | 70 +++++ .../pyproject.toml | 5 +- 8 files changed, 437 insertions(+), 7 deletions(-) create mode 100644 pkgs/community/swarmauri_vectorstore_communityannoy/README.md create mode 100644 pkgs/community/swarmauri_vectorstore_communityannoy/pyproject.toml create mode 100644 pkgs/community/swarmauri_vectorstore_communityannoy/swarmauri_vectorstore_communityannoy/AnnoyVectorStore.py create mode 100644 pkgs/community/swarmauri_vectorstore_communityannoy/swarmauri_vectorstore_communityannoy/__init__.py create mode 100644 pkgs/community/swarmauri_vectorstore_communityannoy/tests/unit/AnnoyVectorStore_test.py diff --git a/pkgs/community/swarmauri_community/pyproject.toml b/pkgs/community/swarmauri_community/pyproject.toml index cd4f9c00b..b447bdc52 100644 --- a/pkgs/community/swarmauri_community/pyproject.toml +++ b/pkgs/community/swarmauri_community/pyproject.toml @@ -54,8 +54,8 @@ pypdf = { version = "^5.0.1", optional = true } pypdftk = { version = "^0.5", optional = true } weaviate-client = { version = "^4.9.2", optional = true } #textblob = { version = "^0.18.0", optional = true } -torch = { version = "^2.4.1", optional = true} -scikit-learn = { version = "^1.5.2", optional = true } +#torch = { version = "^2.4.1", optional = true} +#scikit-learn = { version = "^1.5.2", optional = true } #protobuf = { version = "^3.20.0", optional = true } [tool.poetry.extras] diff --git a/pkgs/community/swarmauri_community/tests/unit/vector_stores/AnnoyVectorStore_test.py 
b/pkgs/community/swarmauri_community/tests/unit/vector_stores/AnnoyVectorStore_test.py index cee7afddd..d30618218 100644 --- a/pkgs/community/swarmauri_community/tests/unit/vector_stores/AnnoyVectorStore_test.py +++ b/pkgs/community/swarmauri_community/tests/unit/vector_stores/AnnoyVectorStore_test.py @@ -1,5 +1,5 @@ import pytest -from swarmauri.documents.concrete.Document import Document +from swarmauri_standard.documents.Document import Document from swarmauri_community.vector_stores.concrete.AnnoyVectorStore import AnnoyVectorStore diff --git a/pkgs/community/swarmauri_vectorstore_communityannoy/README.md b/pkgs/community/swarmauri_vectorstore_communityannoy/README.md new file mode 100644 index 000000000..cd26902a2 --- /dev/null +++ b/pkgs/community/swarmauri_vectorstore_communityannoy/README.md @@ -0,0 +1 @@ +# Swarmauri Annoy Vector Store Community Package \ No newline at end of file diff --git a/pkgs/community/swarmauri_vectorstore_communityannoy/pyproject.toml b/pkgs/community/swarmauri_vectorstore_communityannoy/pyproject.toml new file mode 100644 index 000000000..ea3bf6b87 --- /dev/null +++ b/pkgs/community/swarmauri_vectorstore_communityannoy/pyproject.toml @@ -0,0 +1,59 @@ +[tool.poetry] +name = "swarmauri_vectorstore_communityannoy" +version = "0.6.0.dev1" +description = "Swarmauri Annoy Vector Store" +authors = ["Jacob Stewart "] +license = "Apache-2.0" +readme = "README.md" +repository = "http://github.com/swarmauri/swarmauri-sdk" +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12" +] + +[tool.poetry.dependencies] +python = ">=3.10,<3.13" + +# Swarmauri +swarmauri_core = { path = "../../core" } +swarmauri_base = { path = "../../base" } +swarmauri_vectorstore_doc2vec = { path = "../../standards" } + +# Dependencies +annoy = "^1.17.3" + + + +[tool.poetry.group.dev.dependencies] +flake8 = "^7.0" +pytest = "^8.0" 
+pytest-asyncio = ">=0.24.0" +pytest-xdist = "^3.6.1" +pytest-json-report = "^1.5.0" +python-dotenv = "*" +requests = "^2.32.3" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +norecursedirs = ["combined", "scripts"] + +markers = [ + "test: standard test", + "unit: Unit tests", + "integration: Integration tests", + "acceptance: Acceptance tests", + "experimental: Experimental tests" +] +log_cli = true +log_cli_level = "INFO" +log_cli_format = "%(asctime)s [%(levelname)s] %(message)s" +log_cli_date_format = "%Y-%m-%d %H:%M:%S" +asyncio_default_fixture_loop_scope = "function" + +[tool.poetry.plugins."swarmauri.vector_stores"] +AnnoyVectorStore = "swarmauri_vectorstore_communityannoy.AnnoyVectorStore:AnnoyVectorStore" \ No newline at end of file diff --git a/pkgs/community/swarmauri_vectorstore_communityannoy/swarmauri_vectorstore_communityannoy/AnnoyVectorStore.py b/pkgs/community/swarmauri_vectorstore_communityannoy/swarmauri_vectorstore_communityannoy/AnnoyVectorStore.py new file mode 100644 index 000000000..3331e2963 --- /dev/null +++ b/pkgs/community/swarmauri_vectorstore_communityannoy/swarmauri_vectorstore_communityannoy/AnnoyVectorStore.py @@ -0,0 +1,291 @@ +from typing import List, Union, Literal, Optional +import numpy as np +from annoy import AnnoyIndex +import os + +from swarmauri_standard.documents.Document import Document +from swarmauri_vectorstore_doc2vec.Doc2VecEmbedding import Doc2VecEmbedding +from swarmauri_standard.distances.CosineDistance import CosineDistance + +from swarmauri_base.vector_stores.VectorStoreBase import VectorStoreBase +from swarmauri_base.vector_stores.VectorStoreRetrieveMixin import ( + VectorStoreRetrieveMixin, +) +from swarmauri_base.vector_stores.VectorStoreCloudMixin import VectorStoreCloudMixin +from swarmauri_base.vector_stores.VectorStoreSaveLoadMixin import ( + VectorStoreSaveLoadMixin, +) + + +class AnnoyVectorStore( + VectorStoreRetrieveMixin, 
+ VectorStoreCloudMixin, + VectorStoreSaveLoadMixin, + VectorStoreBase, +): + """ + A vector store implementation using Annoy as the backend. + + This class provides methods to interact with an Annoy index, including + adding, retrieving, and searching for documents. Note that Annoy indices + are immutable after building, so updates and deletes require rebuilding. + """ + + type: Literal["AnnoyVectorStore"] = "AnnoyVectorStore" + api_key: str = ( + "not_required" # Annoy doesn't need an API key, but base class requires it + ) + + def __init__(self, **kwargs): + """ + Initialize the AnnoyVectorStore. + Args: + **kwargs: Additional keyword arguments. + """ + # Set default api_key if not provided + if "api_key" not in kwargs: + kwargs["api_key"] = "not_required" + + super().__init__(**kwargs) + self._embedder = Doc2VecEmbedding(vector_size=self.vector_size) + self._distance = CosineDistance() + self.client = None + self._documents = ( + {} + ) # Store documents in memory since Annoy only stores vectors + self._current_index = 0 # Track the next available index + self._id_to_index = {} # Map document IDs to Annoy indices + self._index_to_id = {} # Map Annoy indices to document IDs + + def delete(self): + """ + Delete the Annoy index if it exists. + """ + try: + if os.path.exists(f"{self.collection_name}.ann"): + os.remove(f"{self.collection_name}.ann") + self.client = None + self._documents = {} + self._current_index = 0 + self._id_to_index = {} + self._index_to_id = {} + except Exception as e: + raise RuntimeError( + f"Failed to delete index {self.collection_name}: {str(e)}" + ) + + def connect(self, metric: Optional[str] = "angular", n_trees: int = 10): + """ + Connect to the Annoy index, creating it if it doesn't exist. + + Args: + metric (Optional[str]): The distance metric to use. Defaults to "angular". + n_trees (int): Number of trees for the Annoy index. More trees = better accuracy but larger index. Currently unused by this method: no build happens here, and add_documents/delete_document always build with 10 trees. 
+ """ + try: + self.client = AnnoyIndex(self.vector_size, metric) + if os.path.exists(f"{self.collection_name}.ann"): + self.client.load(f"{self.collection_name}.ann") + except Exception as e: + raise RuntimeError( + f"Failed to connect to Annoy index {self.collection_name}: {str(e)}" + ) + + def disconnect(self): + """ + Disconnect from the Annoy index. + """ + try: + self.client = None + except Exception as e: + raise RuntimeError(f"Error during disconnecting: {str(e)}") + + def _prepare_vector(self, document: Document) -> np.ndarray: + """ + Prepare a vector for insertion into the Annoy index. + + Args: + document (Document): The document to prepare. + + Returns: + np.ndarray: The prepared vector. + """ + if not document.embedding: + self._embedder.fit([document.content]) + embedding = self._embedder.transform([document.content])[0].to_numpy() + else: + embedding = np.array(document.embedding) + return embedding + + def add_document(self, document: Document, namespace: Optional[str] = "") -> None: + """ + Add a single document to the Annoy index. + Note: In Annoy, the index needs to be rebuilt after adding documents. + + Args: + document (Document): The document to add. + namespace (Optional[str]): Not used in Annoy but kept for compatibility. + """ + try: + vector = self._prepare_vector(document) + index = self._current_index + self.client.add_item(index, vector) + self._documents[document.id] = document + self._id_to_index[document.id] = index + self._index_to_id[index] = document.id + self._current_index += 1 + except Exception as e: + raise RuntimeError(f"Failed to add document {document.id}: {str(e)}") + + def add_documents( + self, + documents: List[Document], + namespace: Optional[str] = "", + batch_size: int = 200, + ) -> None: + """ + Add multiple documents to the Annoy index. + Note: The index will be built after adding all documents. + + Args: + documents (List[Document]): The list of documents to add. 
+ namespace (Optional[str]): Not used in Annoy but kept for compatibility. + batch_size (int): Not used in Annoy but kept for compatibility. + """ + try: + for document in documents: + self.add_document(document, namespace) + self.client.build(10) # Build with default 10 trees + self.client.save(f"{self.collection_name}.ann") + except Exception as e: + raise RuntimeError(f"Failed to add documents: {str(e)}") + + def get_document( + self, id: str, namespace: Optional[str] = "" + ) -> Union[Document, None]: + """ + Retrieve a single document by its ID. + + Args: + id (str): The ID of the document to retrieve. + namespace (Optional[str]): Not used in Annoy but kept for compatibility. + + Returns: + Union[Document, None]: The retrieved document, or None if not found. + """ + return self._documents.get(id) + + def get_all_documents(self, namespace: Optional[str] = "") -> List[Document]: + """ + Retrieve all documents. + + Args: + namespace (Optional[str]): Not used in Annoy but kept for compatibility. + + Returns: + List[Document]: A list of all documents. + """ + return list(self._documents.values()) + + def delete_document(self, id: str, namespace: Optional[str] = "") -> None: + """ + Delete a single document. + Note: This requires rebuilding the index; the rebuild hard-codes the 'angular' metric and 10 trees, regardless of the metric given to connect(). + + Args: + id (str): The ID of the document to delete. + namespace (Optional[str]): Not used in Annoy but kept for compatibility. 
+ """ + try: + if id in self._documents: + del self._documents[id] + index = self._id_to_index[id] + del self._id_to_index[id] + del self._index_to_id[index] + # Rebuild index with remaining documents + self.client = AnnoyIndex(self.vector_size, "angular") + for doc_id, doc in self._documents.items(): + vector = self._prepare_vector(doc) + self.client.add_item(self._id_to_index[doc_id], vector) + self.client.build(10) + self.client.save(f"{self.collection_name}.ann") + except Exception as e: + raise RuntimeError(f"Failed to delete document {id}: {str(e)}") + + def clear_documents(self, namespace: Optional[str] = "") -> None: + """ + Delete all documents. + + Args: + namespace (Optional[str]): Not used in Annoy but kept for compatibility. + """ + try: + self.delete() + self.connect() + except Exception as e: + raise RuntimeError(f"Failed to clear documents: {str(e)}") + + def update_document( + self, id: str, document: Document, namespace: Optional[str] = "" + ) -> None: + """ + Update a document. + Note: This requires rebuilding the index. + + Args: + id (str): The ID of the document to update. + document (Document): The updated document. + namespace (Optional[str]): Not used in Annoy but kept for compatibility. + """ + try: + self.delete_document(id, namespace) + self.add_document(document, namespace) + except Exception as e: + raise RuntimeError(f"Failed to update document {id}: {str(e)}") + + def document_count(self, namespace: Optional[str] = "") -> int: + """ + Get the number of documents in the index. + + Args: + namespace (Optional[str]): Not used in Annoy but kept for compatibility. + + Returns: + int: The number of documents in the index. + """ + return len(self._documents) + + def retrieve( + self, query: str, top_k: int = 5, namespace: Optional[str] = "" + ) -> List[Document]: + """ + Retrieve documents based on a query string. + + Args: + query (str): The query string to search for. + top_k (int): The number of results to return. Defaults to 5. 
+ namespace (Optional[str]): Not used in Annoy but kept for compatibility. + + Returns: + List[Document]: A list of retrieved documents. + """ + try: + query_embedding = self._embedder.infer_vector(query).value + indices, distances = self.client.get_nns_by_vector( + query_embedding, top_k, include_distances=True + ) + results = [] + for idx in indices: + doc_id = self._index_to_id.get(idx) + if doc_id: + results.append(self._documents[doc_id]) + return results + except Exception as e: + raise RuntimeError(f"Failed to retrieve documents: {str(e)}") + + def model_dump_json(self, *args, **kwargs) -> str: + """ + Override the model_dump_json method to ensure proper serialization. + """ + self.disconnect() + return super().model_dump_json(*args, **kwargs) diff --git a/pkgs/community/swarmauri_vectorstore_communityannoy/swarmauri_vectorstore_communityannoy/__init__.py b/pkgs/community/swarmauri_vectorstore_communityannoy/swarmauri_vectorstore_communityannoy/__init__.py new file mode 100644 index 000000000..fb72b0ad3 --- /dev/null +++ b/pkgs/community/swarmauri_vectorstore_communityannoy/swarmauri_vectorstore_communityannoy/__init__.py @@ -0,0 +1,12 @@ +from .AnnoyVectorStore import AnnoyVectorStore + +__version__ = "0.6.0.dev26" +__long_desc__ = """ + +# Swarmauri Annoy vector store Plugin + +Visit us at: https://swarmauri.com +Follow us at: https://github.com/swarmauri +Star us at: https://github.com/swarmauri/swarmauri-sdk + +""" diff --git a/pkgs/community/swarmauri_vectorstore_communityannoy/tests/unit/AnnoyVectorStore_test.py b/pkgs/community/swarmauri_vectorstore_communityannoy/tests/unit/AnnoyVectorStore_test.py new file mode 100644 index 000000000..a2ae2fa81 --- /dev/null +++ b/pkgs/community/swarmauri_vectorstore_communityannoy/tests/unit/AnnoyVectorStore_test.py @@ -0,0 +1,70 @@ +import pytest +from swarmauri_standard.documents.Document import Document +from swarmauri_vectorstore_communityannoy.AnnoyVectorStore import AnnoyVectorStore + + +# Fixture for 
creating an AnnoyVectorStore instance +@pytest.fixture +def vector_store(): + vs = AnnoyVectorStore( + collection_name="test_annoy", + vector_size=100, + ) + vs.connect() + yield vs + # Cleanup after tests + vs.delete() + + +@pytest.mark.unit +def test_ubc_resource(vector_store): + assert vector_store.resource == "VectorStore" + assert vector_store.embedder.resource == "Embedding" + + +@pytest.mark.unit +def test_ubc_type(vector_store): + assert vector_store.type == "AnnoyVectorStore" + + +@pytest.mark.unit +def test_serialization(vector_store): + assert ( + vector_store.id + == AnnoyVectorStore.model_validate_json(vector_store.model_dump_json()).id + ) + + +@pytest.mark.unit +def test_top_k(vector_store): + documents = [ + Document(content="test"), + Document(content="test1"), + Document(content="test2"), + Document(content="test3"), + ] + + vector_store.add_documents(documents) + results = vector_store.retrieve(query="test", top_k=2) + assert len(results) == 2 + + +@pytest.mark.unit +def test_document_count(vector_store): + documents = [ + Document(content="test1"), + Document(content="test2"), + Document(content="test3"), + ] + + vector_store.add_documents(documents) + assert vector_store.document_count() == 3 + + +@pytest.mark.unit +def test_get_document(vector_store): + doc = Document(content="test document") + vector_store.add_document(doc) + retrieved_doc = vector_store.get_document(doc.id) + assert retrieved_doc.id == doc.id + assert retrieved_doc.content == doc.content diff --git a/pkgs/community/swm_example_community_package/pyproject.toml b/pkgs/community/swm_example_community_package/pyproject.toml index 846946be2..923bca792 100644 --- a/pkgs/community/swm_example_community_package/pyproject.toml +++ b/pkgs/community/swm_example_community_package/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "swm_example_community_package" version = "0.6.0.dev1" -description = "Swarmauri Psutil Tool" +description = "example community package" authors = ["Jacob 
Stewart "] license = "Apache-2.0" readme = "README.md" @@ -20,9 +20,6 @@ python = ">=3.10,<3.13" swarmauri_core = { path = "../../core" } swarmauri_base = { path = "../../base" } -# Dependencies -psutil = "^6.1.0" - [tool.poetry.group.dev.dependencies] flake8 = "^7.0"