Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Swarmauri Annoy Vector Store community package #1079

Merged
merged 2 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pkgs/community/swarmauri_community/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ pypdf = { version = "^5.0.1", optional = true }
pypdftk = { version = "^0.5", optional = true }
weaviate-client = { version = "^4.9.2", optional = true }
#textblob = { version = "^0.18.0", optional = true }
torch = { version = "^2.4.1", optional = true}
scikit-learn = { version = "^1.5.2", optional = true }
#torch = { version = "^2.4.1", optional = true}
#scikit-learn = { version = "^1.5.2", optional = true }
#protobuf = { version = "^3.20.0", optional = true }

[tool.poetry.extras]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
from swarmauri.documents.concrete.Document import Document
from swarmauri_standard.documents.Document import Document
from swarmauri_community.vector_stores.concrete.AnnoyVectorStore import AnnoyVectorStore


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Swarmauri Annoy Vector Store Community Package
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Poetry project definition for the Annoy-backed vector store community package.
[tool.poetry]
name = "swarmauri_vectorstore_communityannoy"
# NOTE(review): the package's __init__ module declares its own __version__
# string; confirm the two are kept in sync.
version = "0.6.0.dev1"
description = "Swarmauri Annoy Vector Store"
authors = ["Jacob Stewart <[email protected]>"]
license = "Apache-2.0"
readme = "README.md"
# NOTE(review): plain-http URL; consider the https form.
repository = "http://github.com/swarmauri/swarmauri-sdk"
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12"
]

[tool.poetry.dependencies]
# Matches the three Python versions listed in the classifiers above.
python = ">=3.10,<3.13"

# Swarmauri
# Sibling packages are resolved through relative paths.
swarmauri_core = { path = "../../core" }
swarmauri_base = { path = "../../base" }
# NOTE(review): unlike core/base, this path names a directory called
# "standards" rather than one named after the package -- confirm it resolves
# to the swarmauri_vectorstore_doc2vec project.
# NOTE(review): AnnoyVectorStore also imports `swarmauri_standard` and `numpy`,
# neither of which is declared here -- presumably provided transitively; verify.
swarmauri_vectorstore_doc2vec = { path = "../../standards" }

# Dependencies
annoy = "^1.17.3"



# Tooling needed only for linting and running the test suite.
[tool.poetry.group.dev.dependencies]
flake8 = "^7.0"
pytest = "^8.0"
pytest-asyncio = ">=0.24.0"
pytest-xdist = "^3.6.1"
pytest-json-report = "^1.5.0"
python-dotenv = "*"
requests = "^2.32.3"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
# Directories pytest must not descend into during collection.
norecursedirs = ["combined", "scripts"]

# Custom markers, registered here so pytest recognises them.
markers = [
"test: standard test",
"unit: Unit tests",
"integration: Integration tests",
"acceptance: Acceptance tests",
"experimental: Experimental tests"
]
# Stream log records to the console while tests run.
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
asyncio_default_fixture_loop_scope = "function"

# Entry point registering the class under the "swarmauri.vector_stores" group.
[tool.poetry.plugins."swarmauri.vector_stores"]
AnnoyVectorStore = "swarmauri_vectorstore_communityannoy.AnnoyVectorStore:AnnoyVectorStore"
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
from typing import List, Union, Literal, Optional
import numpy as np
from annoy import AnnoyIndex
import os

from swarmauri_standard.documents.Document import Document
from swarmauri_vectorstore_doc2vec.Doc2VecEmbedding import Doc2VecEmbedding
from swarmauri_standard.distances.CosineDistance import CosineDistance

from swarmauri_base.vector_stores.VectorStoreBase import VectorStoreBase
from swarmauri_base.vector_stores.VectorStoreRetrieveMixin import (
VectorStoreRetrieveMixin,
)
from swarmauri_base.vector_stores.VectorStoreCloudMixin import VectorStoreCloudMixin
from swarmauri_base.vector_stores.VectorStoreSaveLoadMixin import (
VectorStoreSaveLoadMixin,
)


class AnnoyVectorStore(
    VectorStoreRetrieveMixin,
    VectorStoreCloudMixin,
    VectorStoreSaveLoadMixin,
    VectorStoreBase,
):
    """
    A vector store implementation using Annoy as the backend.

    Annoy only stores vectors, and an index stops accepting items as soon as
    it has been built or saved. This class therefore keeps the documents and
    their vectors in memory and treats the Annoy index as a derived artefact:
    every mutation marks the index stale, and the index is rebuilt from the
    in-memory vectors (and written to ``<collection_name>.ann``) when needed.
    """

    type: Literal["AnnoyVectorStore"] = "AnnoyVectorStore"
    api_key: str = (
        "not_required"  # Annoy doesn't need an API key, but base class requires it
    )

    def __init__(self, **kwargs):
        """
        Initialize the AnnoyVectorStore.

        Args:
            **kwargs: Additional keyword arguments forwarded to the base class.
        """
        # Set default api_key if not provided
        kwargs.setdefault("api_key", "not_required")

        super().__init__(**kwargs)
        self._embedder = Doc2VecEmbedding(vector_size=self.vector_size)
        self._distance = CosineDistance()
        self.client = None
        # Index settings; overwritten by connect() and reused on every rebuild.
        self._metric = "angular"
        self._n_trees = 10
        self._reset_state()

    # ------------------------------------------------------------------ #
    # Internal helpers
    # ------------------------------------------------------------------ #
    def _reset_state(self) -> None:
        """Forget every document, vector and id/index mapping held in memory."""
        self._documents = {}  # document id -> Document
        self._item_vectors = {}  # Annoy item index -> vector
        self._current_index = 0  # next unused Annoy item index
        self._id_to_index = {}  # document id -> Annoy item index
        self._index_to_id = {}  # Annoy item index -> document id
        self._index_stale = False  # True when memory is ahead of the index

    def _index_path(self) -> str:
        """Return the path of the on-disk Annoy index file."""
        return f"{self.collection_name}.ann"

    def _release_client(self) -> None:
        """Detach the current Annoy index, releasing its file mapping."""
        client, self.client = self.client, None
        if client is not None:
            client.unload()

    def _forget(self, id: str) -> bool:
        """
        Drop a document and its vector from memory.

        Returns:
            bool: True if the id was known and has been removed.
        """
        index = self._id_to_index.pop(id, None)
        if index is None:
            return False
        self._documents.pop(id, None)
        self._index_to_id.pop(index, None)
        self._item_vectors.pop(index, None)
        self._index_stale = True
        return True

    def _rebuild_index(self) -> None:
        """
        Rebuild the Annoy index from the in-memory vectors and persist it.

        A fresh index is always created because Annoy rejects ``add_item`` on
        an index that has already been built or saved.
        """
        self._release_client()
        index = AnnoyIndex(self.vector_size, self._metric)
        if self._item_vectors:
            for item, vector in self._item_vectors.items():
                index.add_item(item, vector)
            index.build(self._n_trees)
            index.save(self._index_path())
        else:
            # Nothing to index: remove a now-outdated file instead of trying
            # to build and save an empty index.
            try:
                os.remove(self._index_path())
            except FileNotFoundError:
                pass
        self.client = index
        self._index_stale = False

    # ------------------------------------------------------------------ #
    # Index lifecycle
    # ------------------------------------------------------------------ #
    def delete(self):
        """
        Delete the Annoy index file if it exists and reset all in-memory state.

        Raises:
            RuntimeError: If the index file cannot be removed.
        """
        try:
            # Release the index first so the file is no longer mapped when it
            # is removed.
            self._release_client()
            try:
                os.remove(self._index_path())
            except FileNotFoundError:
                pass  # no index file was ever written; nothing to delete
            self._reset_state()
        except Exception as e:
            raise RuntimeError(
                f"Failed to delete index {self.collection_name}: {str(e)}"
            ) from e

    def connect(self, metric: Optional[str] = "angular", n_trees: int = 10):
        """
        Connect to the Annoy index, creating it if it doesn't exist.

        The chosen settings are remembered and applied to every later rebuild.

        Args:
            metric (Optional[str]): The distance metric to use. Defaults to "angular".
            n_trees (int): Number of trees for the Annoy index. More trees = better accuracy but larger index.

        Raises:
            RuntimeError: If the index cannot be created or loaded.
        """
        try:
            self._metric = metric or "angular"
            self._n_trees = n_trees
            self._release_client()
            self.client = AnnoyIndex(self.vector_size, self._metric)
            if self._item_vectors:
                # Documents are already held in memory: re-index them under
                # the requested settings rather than trusting the file.
                self._index_stale = True
            elif os.path.exists(self._index_path()):
                self.client.load(self._index_path())
        except Exception as e:
            raise RuntimeError(
                f"Failed to connect to Annoy index {self.collection_name}: {str(e)}"
            ) from e

    def disconnect(self):
        """
        Disconnect from the Annoy index.

        In-memory documents are kept; the index is rebuilt from them on the
        next retrieval.

        Raises:
            RuntimeError: If releasing the index fails.
        """
        try:
            self._release_client()
        except Exception as e:
            raise RuntimeError(f"Error during disconnecting: {str(e)}") from e

    # ------------------------------------------------------------------ #
    # Document management
    # ------------------------------------------------------------------ #
    def _prepare_vector(self, document: Document) -> np.ndarray:
        """
        Prepare a vector for insertion into the Annoy index.

        Args:
            document (Document): The document to prepare.

        Returns:
            np.ndarray: The document's own embedding when it has one,
            otherwise an embedding computed from its content.
        """
        embedding = document.embedding
        if embedding is not None:
            # Vector objects expose their numbers through ``.value`` (as the
            # query vector does in retrieve()); plain sequences are used as-is.
            values = getattr(embedding, "value", embedding)
            if values is not None and len(values) > 0:
                return np.asarray(values, dtype=float)
        self._embedder.fit([document.content])
        return self._embedder.transform([document.content])[0].to_numpy()

    def add_document(self, document: Document, namespace: Optional[str] = "") -> None:
        """
        Add a single document to the store.

        The document is recorded in memory and the index is marked stale; it
        is rebuilt on the next retrieval (or by ``add_documents``). Adding a
        document whose id is already present replaces the earlier entry.

        Args:
            document (Document): The document to add.
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.

        Raises:
            RuntimeError: If the document cannot be embedded or stored.
        """
        try:
            vector = self._prepare_vector(document)
            # Drop any earlier entry for this id so no orphan vector stays
            # in the index.
            self._forget(document.id)
            index = self._current_index
            self._current_index += 1
            self._documents[document.id] = document
            self._item_vectors[index] = vector
            self._id_to_index[document.id] = index
            self._index_to_id[index] = document.id
            self._index_stale = True
        except Exception as e:
            raise RuntimeError(
                f"Failed to add document {document.id}: {str(e)}"
            ) from e

    def add_documents(
        self,
        documents: List[Document],
        namespace: Optional[str] = "",
        batch_size: int = 200,
    ) -> None:
        """
        Add multiple documents, then rebuild and save the index once.

        Args:
            documents (List[Document]): The list of documents to add.
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.
            batch_size (int): Not used in Annoy but kept for compatibility.

        Raises:
            RuntimeError: If a document cannot be added or the index cannot be built.
        """
        try:
            for document in documents:
                self.add_document(document, namespace)
            if self._index_stale:
                self._rebuild_index()
        except Exception as e:
            raise RuntimeError(f"Failed to add documents: {str(e)}") from e

    def get_document(
        self, id: str, namespace: Optional[str] = ""
    ) -> Union[Document, None]:
        """
        Retrieve a single document by its ID.

        Args:
            id (str): The ID of the document to retrieve.
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.

        Returns:
            Union[Document, None]: The retrieved document, or None if not found.
        """
        return self._documents.get(id)

    def get_all_documents(self, namespace: Optional[str] = "") -> List[Document]:
        """
        Retrieve all documents.

        Args:
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.

        Returns:
            List[Document]: A list of all documents.
        """
        return list(self._documents.values())

    def delete_document(self, id: str, namespace: Optional[str] = "") -> None:
        """
        Delete a single document and rebuild the index without it.

        Unknown ids are ignored. The rebuild reuses the stored vectors, so the
        remaining documents are not embedded again.

        Args:
            id (str): The ID of the document to delete.
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.

        Raises:
            RuntimeError: If the index cannot be rebuilt.
        """
        try:
            if self._forget(id):
                self._rebuild_index()
        except Exception as e:
            raise RuntimeError(f"Failed to delete document {id}: {str(e)}") from e

    def clear_documents(self, namespace: Optional[str] = "") -> None:
        """
        Delete all documents.

        Args:
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.

        Raises:
            RuntimeError: If the index cannot be deleted or recreated.
        """
        try:
            self.delete()
            # Reconnect with the settings chosen earlier, not the defaults.
            self.connect(self._metric, self._n_trees)
        except Exception as e:
            raise RuntimeError(f"Failed to clear documents: {str(e)}") from e

    def update_document(
        self, id: str, document: Document, namespace: Optional[str] = ""
    ) -> None:
        """
        Replace the document stored under ``id`` and rebuild the index.

        Args:
            id (str): The ID of the document to update.
            document (Document): The updated document.
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.

        Raises:
            RuntimeError: If the document cannot be stored or the index rebuilt.
        """
        try:
            # Remove and add purely in memory, then rebuild once: a saved
            # Annoy index cannot take the new item directly.
            self._forget(id)
            self.add_document(document, namespace)
            self._rebuild_index()
        except Exception as e:
            raise RuntimeError(f"Failed to update document {id}: {str(e)}") from e

    def document_count(self, namespace: Optional[str] = "") -> int:
        """
        Get the number of documents in the index.

        Args:
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.

        Returns:
            int: The number of documents in the index.
        """
        return len(self._documents)

    def retrieve(
        self, query: str, top_k: int = 5, namespace: Optional[str] = ""
    ) -> List[Document]:
        """
        Retrieve documents based on a query string.

        Args:
            query (str): The query string to search for.
            top_k (int): The number of results to return. Defaults to 5.
            namespace (Optional[str]): Not used in Annoy but kept for compatibility.

        Returns:
            List[Document]: A list of retrieved documents, nearest first.

        Raises:
            RuntimeError: If the query cannot be embedded or the search fails.
        """
        try:
            if not self._documents:
                return []
            # Bring the index up to date with any pending in-memory changes.
            if self._index_stale or self.client is None:
                self._rebuild_index()
            query_embedding = self._embedder.infer_vector(query).value
            indices = self.client.get_nns_by_vector(query_embedding, top_k)
            return [
                self._documents[self._index_to_id[idx]]
                for idx in indices
                if idx in self._index_to_id
            ]
        except Exception as e:
            raise RuntimeError(f"Failed to retrieve documents: {str(e)}") from e

    def model_dump_json(self, *args, **kwargs) -> str:
        """
        Override the model_dump_json method to ensure proper serialization.

        The store is disconnected before the dump, so the Annoy index object
        is not part of the serialized model.
        """
        self.disconnect()
        return super().model_dump_json(*args, **kwargs)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from importlib.metadata import PackageNotFoundError, version

from .AnnoyVectorStore import AnnoyVectorStore

# Read the version from the installed distribution metadata so it cannot drift
# from the value declared in pyproject.toml.
try:
    __version__ = version("swarmauri_vectorstore_communityannoy")
except PackageNotFoundError:
    # The package is being used from a source tree that was never installed.
    __version__ = "0.0.0"

__long_desc__ = """

# Swarmauri Annoy vector store Plugin

Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk

"""
Loading
Loading