Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python: add diskann index type #9677

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ class AzureCosmosDBMemoryStore(MemoryStoreBase):
m = None
ef_construction = None
ef_search = None
max_degree = None
l_build = None
l_search = None

def __init__(
self,
Expand All @@ -55,6 +58,9 @@ def __init__(
m: int = 16,
ef_construction: int = 64,
ef_search: int = 40,
max_degree: int = 32,
l_build: int = 50,
l_search: int = 40,
):
"""Initializes a new instance of the AzureCosmosDBMemoryStore class."""
if vector_dimensions <= 0:
Expand All @@ -72,6 +78,9 @@ def __init__(
self.m = m
self.ef_construction = ef_construction
self.ef_search = ef_search
self.max_degree = max_degree
self.l_build = l_build
self.l_search = l_search

@staticmethod
async def create(
Expand All @@ -84,6 +93,9 @@ async def create(
m: int,
ef_construction: int,
ef_search: int,
max_degree: int,
l_build: int,
l_search: int,
index_name: str | None = None,
cosmos_connstr: str | None = None,
application_name: str | None = None,
Expand Down Expand Up @@ -115,6 +127,9 @@ async def create(
m=m,
ef_construction=ef_construction,
ef_search=ef_search,
max_degree=max_degree,
l_build=l_build,
l_search=l_search,
)
else:
raise MemoryConnectorInitializationError(f"API type {cosmos_api} is not supported.")
Expand All @@ -130,6 +145,9 @@ async def create(
m,
ef_construction,
ef_search,
max_degree,
l_build,
l_search,
)
await store.create_collection(collection_name)
return store
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ class MongoStoreApi(AzureCosmosDBStoreApi):
m = None
ef_construction = None
ef_search = None
max_degree = None
l_build = None
l_search = None

"""
Args:
Expand All @@ -55,7 +58,8 @@ class MongoStoreApi(AzureCosmosDBStoreApi):
kind: Type of vector index to create.
Possible options are:
- vector-ivf
- vector-hnsw: available as a preview feature only,
- vector-hnsw
- vector-diskann: available as a preview feature only
to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features
m: The max number of connections per layer (16 by default, minimum
value is 2, maximum value is 100). Higher m is suitable for datasets
Expand All @@ -68,6 +72,12 @@ class MongoStoreApi(AzureCosmosDBStoreApi):
ef_construction has to be at least 2 * m
ef_search: The size of the dynamic candidate list for search (40 by default).
A higher value provides better recall at the cost of speed.
max_degree: Max number of neighbors for diskann index.
Default value is 32, range from 20 to 2048.
l_build: l value for diskann index building.
Default value is 50, range from 10 to 500.
l_search: l value for diskann index searching.
Default value is 40, range from 10 to 10000.
database: The Mongo Database object of the azure cosmos db mongo store
"""

Expand All @@ -82,6 +92,9 @@ def __init__(
m: int,
ef_construction: int,
ef_search: int,
max_degree: int,
l_build: int,
l_search: int,
database=None,
):
"""Initializes a new instance of the MongoStoreApi class."""
Expand All @@ -95,6 +108,9 @@ def __init__(
self.m = m
self.ef_construction = ef_construction
self.ef_search = ef_search
self.max_degree = max_degree
self.l_build = l_build
self.l_search = l_search

@override
async def create_collection(self, collection_name: str) -> None:
Expand All @@ -118,6 +134,15 @@ async def create_collection(self, collection_name: str) -> None:
self.similarity,
self.vector_dimensions,
)
elif self.kind == CosmosDBVectorSearchType.VECTOR_DISKANN:
create_index_commands = self._get_vector_index_diskann(
collection_name,
self.kind,
self.max_degree,
self.l_build,
self.similarity,
self.vector_dimensions,
)
# invoke the command from the database object
self.database.command(create_index_commands)
self.collection = self.database[collection_name]
Expand Down Expand Up @@ -161,6 +186,26 @@ def _get_vector_index_hnsw(
],
}

def _get_vector_index_diskann(
self, collection_name: str, kind: str, max_degree: int, l_build: int, similarity: str, dimensions: int
) -> dict[str, Any]:
return {
"createIndexes": collection_name,
"indexes": [
{
"name": self.index_name,
"key": {"embedding": "cosmosSearch"},
"cosmosSearchOptions": {
"kind": kind,
"maxDegree": max_degree,
"lBuild": l_build,
"similarity": similarity,
"dimensions": dimensions,
},
}
],
}

@override
async def get_collections(self) -> list[str]:
return self.database.list_collection_names()
Expand Down Expand Up @@ -254,6 +299,8 @@ async def get_nearest_matches(
pipeline = self._get_pipeline_vector_ivf(embedding.tolist(), limit)
elif self.kind == CosmosDBVectorSearchType.VECTOR_HNSW:
pipeline = self._get_pipeline_vector_hnsw(embedding.tolist(), limit, self.ef_search)
elif self.kind == CosmosDBVectorSearchType.VECTOR_DISKANN:
pipeline = self._get_pipeline_vector_diskann(embedding.tolist(), limit, self.l_search)

cursor = self.collection.aggregate(pipeline)

Expand Down Expand Up @@ -318,6 +365,29 @@ def _get_pipeline_vector_hnsw(
]
return pipeline

def _get_pipeline_vector_diskann(
self, embeddings: list[float], k: int = 4, l_search: int = 40
) -> list[dict[str, Any]]:
pipeline: list[dict[str, Any]] = [
{
"$search": {
"cosmosSearch": {
"vector": embeddings,
"path": "embedding",
"k": k,
"lSearch": l_search,
},
}
},
{
"$project": {
"similarityScore": {"$meta": "searchScore"},
"document": "$$ROOT",
}
},
]
return pipeline

@override
async def get_nearest_match(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@ class CosmosDBVectorSearchType(str, Enum):
"""IVF vector index"""
VECTOR_HNSW = "vector-hnsw"
"""HNSW vector index"""
VECTOR_DISKANN = "vector-diskann"
"""DISKANN vector index"""
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
application_name = "PYTHON_SEMANTIC_KERNEL"
cosmos_api = "mongo-vcore"
index_name = "sk_test_vector_search_index"
index_name_vector_diskann = "sk_test_vector_search_index_diskann"
vector_dimensions = 1536
num_lists = 1
similarity = CosmosDBSimilarityType.COS
Expand Down Expand Up @@ -109,6 +110,9 @@ async def azurecosmosdb_memorystore() -> MemoryStoreBase:
m=m,
ef_construction=ef_construction,
ef_search=ef_search,
max_degree=50,
l_build=40,
l_search=100,
)


Expand Down Expand Up @@ -199,3 +203,120 @@ async def test_get_nearest_matches(
assert all(result[i][0]._id in [memory_record1._id, memory_record2._id] for i in range(2))

await store.remove_batch("", [memory_record1._id, memory_record2._id, memory_record3._id])


"""
Test cases for the similarity algorithm using vector-diskann
"""


async def azurecosmosdb_memorystore_vector_diskann() -> MemoryStoreBase:
    """Create a memory store wired to a ``vector-diskann`` index for these tests."""
    # DiskANN-specific tuning parameters used by all diskann test cases below.
    diskann_params = {
        "max_degree": 50,
        "l_build": 40,
        "l_search": 100,
    }
    return await AzureCosmosDBMemoryStore.create(
        cosmos_connstr=cosmos_connstr,
        application_name=application_name,
        cosmos_api=cosmos_api,
        database_name=database_name,
        collection_name=collection_name,
        index_name=index_name_vector_diskann,
        vector_dimensions=vector_dimensions,
        num_lists=num_lists,
        similarity=similarity,
        kind=CosmosDBVectorSearchType.VECTOR_DISKANN,
        m=m,
        ef_construction=ef_construction,
        ef_search=ef_search,
        **diskann_params,
    )


@pytest.mark.asyncio
@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set")
async def test_create_get_drop_exists_collection_vector_diskann():
    """Round-trip create/list/delete/exists for a collection on the diskann-backed store."""
    memory_store = await azurecosmosdb_memorystore_vector_diskann()
    new_collection = "test_collection"

    await memory_store.create_collection(new_collection)
    assert new_collection in await memory_store.get_collections()

    await memory_store.delete_collection(new_collection)
    assert await memory_store.does_collection_exist(new_collection) is False


@pytest.mark.asyncio
@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set")
async def test_upsert_and_get_and_remove_vector_diskann(
    memory_record1: MemoryRecord,
):
    """Single-record upsert, fetch (with embedding), and removal on the diskann store."""
    memory_store = await azurecosmosdb_memorystore_vector_diskann()

    upserted_id = await memory_store.upsert("", memory_record1)
    assert upserted_id == memory_record1._id

    fetched = await memory_store.get("", memory_record1._id, with_embedding=True)
    assert fetched is not None
    assert fetched._id == memory_record1._id
    # Embedding must round-trip element-for-element.
    assert all(value == memory_record1._embedding[i] for i, value in enumerate(fetched._embedding))

    await memory_store.remove("", memory_record1._id)


@pytest.mark.asyncio
@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set")
async def test_upsert_batch_and_get_batch_remove_batch_vector_diskann(
    memory_record2: MemoryRecord, memory_record3: MemoryRecord
):
    """Batch upsert/get/remove of two records on the diskann store."""
    memory_store = await azurecosmosdb_memorystore_vector_diskann()
    expected_ids = [memory_record2._id, memory_record3._id]

    upserted_ids = await memory_store.upsert_batch("", [memory_record2, memory_record3])
    assert len(upserted_ids) == 2
    assert all(upserted in expected_ids for upserted in upserted_ids)

    fetched = await memory_store.get_batch("", expected_ids, with_embeddings=True)
    assert len(fetched) == 2
    assert all(record._id in expected_ids for record in fetched)

    await memory_store.remove_batch("", expected_ids)


@pytest.mark.asyncio
@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set")
async def test_get_nearest_match_vector_diskann(memory_record1: MemoryRecord, memory_record2: MemoryRecord):
    """Nearest-match lookup using a slightly perturbed copy of record1's embedding."""
    memory_store = await azurecosmosdb_memorystore_vector_diskann()
    await memory_store.upsert_batch("", [memory_record1, memory_record2])

    query_embedding = memory_record1.embedding.copy()
    query_embedding[0] = query_embedding[0] + 0.1

    # NOTE(review): sibling diskann tests pass "" as the collection name here —
    # confirm whether collection_name vs "" matters for this store.
    match = await memory_store.get_nearest_match(
        collection_name, query_embedding, min_relevance_score=0.0, with_embedding=True
    )

    assert match is not None
    assert match[0]._id == memory_record1._id
    assert all(value == memory_record1._embedding[i] for i, value in enumerate(match[0]._embedding))

    await memory_store.remove_batch("", [memory_record1._id, memory_record2._id])


@pytest.mark.asyncio
@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set")
async def test_get_nearest_matches_vector_diskann(
    memory_record1: MemoryRecord,
    memory_record2: MemoryRecord,
    memory_record3: MemoryRecord,
):
    """Top-2 nearest-matches query against three upserted records on the diskann index."""
    memory_store = await azurecosmosdb_memorystore_vector_diskann()
    await memory_store.upsert_batch("", [memory_record1, memory_record2, memory_record3])

    query_embedding = memory_record2.embedding.copy()
    # NOTE(review): element [4] is read here while the single-match test reads [0] —
    # looks like a copy/paste slip; either way the vector is only mildly perturbed. Confirm intent.
    query_embedding[0] = query_embedding[4] + 0.1

    matches = await memory_store.get_nearest_matches(
        "", query_embedding, limit=2, min_relevance_score=0.0, with_embeddings=True
    )
    assert len(matches) == 2
    assert all(matches[i][0]._id in [memory_record1._id, memory_record2._id] for i in range(2))

    await memory_store.remove_batch("", [memory_record1._id, memory_record2._id, memory_record3._id])
Loading