Skip to content

Commit

Permalink
Thorough rewrite and optimization of integration tests (#82)
Browse files Browse the repository at this point in the history
* complete removal of SomeEmbeddings for tests

* test_vectorstore_autodetect brought to rationality

* test_graphvectorstore is nice now

* halfway through test_vectorstore.py

* tests of from_ methods of vectorstore are now good

* most test_vectorstore done; missing only indexing and coreclients_init

* all of graph/vectorstores brought to order

* graph/vstore tests mostly hcd-compatible (wip)

* wip on fixing the hcd/apikey header thing

* completed rewrite of tests for graph/vectorstores

* chat message histories tested nicely

* deep restructuring of the caches testing

* test_document_loaders under control

* further improvement test document loader

* with test_storage it seems everything is done now.

* tiny docstr edit

* make openai key into a fixture to heal compile test

* clean info on IT prereqs
  • Loading branch information
hemidactylus authored Sep 27, 2024
1 parent 80ea19c commit 3eb73d7
Show file tree
Hide file tree
Showing 20 changed files with 3,554 additions and 3,250 deletions.
1 change: 0 additions & 1 deletion libs/astradb/codespell_ignore_words.txt
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
Haa
2 changes: 1 addition & 1 deletion libs/astradb/langchain_astradb/graph_vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def mmr_traversal_search( # noqa: C901
def get_adjacent(tags: set[str]) -> Iterable[_Edge]:
targets: dict[str, _Edge] = {}

# TODO: Would be better parralelized
# TODO: Would be better parallelized
for tag in tags:
m_filter = (metadata_filter or {}).copy()
m_filter[self.link_from_metadata_key] = tag
Expand Down
2 changes: 1 addition & 1 deletion libs/astradb/langchain_astradb/utils/mmr_traversal.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def __init__(

# List of the candidates.
self.candidates = []
# ND array of the candidate embeddings.
# numpy n-dimensional array of the candidate embeddings.
self.candidate_embeddings = np.ndarray((0, self.dimensions), dtype=np.float32)

self.best_score = NEG_INF
Expand Down
11 changes: 4 additions & 7 deletions libs/astradb/langchain_astradb/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,8 +460,6 @@ def __init__(
This is useful when the service is configured for the collection,
but no corresponding secret is stored within
Astra's key management system.
This parameter cannot be provided without
specifying ``collection_vector_service_options``.
content_field: name of the field containing the textual content
in the documents when saved on Astra DB. For vectorize collections,
this cannot be specified; for non-vectorize collection, defaults
Expand All @@ -473,7 +471,7 @@ def __init__(
Please understand the limitations of this method and get some
understanding of your data before passing ``"*"`` for this parameter.
ignore_invalid_documents: if False (default), exceptions are raised
when a document is found on the Astra DB collectin that does
when a document is found on the Astra DB collection that does
not have the expected shape. If set to True, such results
from the database are ignored and a warning is issued. Note
that in this case a similarity search may end up returning fewer
Expand Down Expand Up @@ -824,11 +822,10 @@ async def adelete(
raise ValueError(msg)

_max_workers = concurrency or self.bulk_delete_concurrency
return all(
await gather_with_concurrency(
_max_workers, *[self.adelete_by_document_id(doc_id) for doc_id in ids]
)
await gather_with_concurrency(
_max_workers, *[self.adelete_by_document_id(doc_id) for doc_id in ids]
)
return True

def delete_collection(self) -> None:
"""Completely delete the collection from the database.
Expand Down
37 changes: 24 additions & 13 deletions libs/astradb/testing.env.sample
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
export ASTRA_DB_APPLICATION_TOKEN="AstraCS:aaabbbccc..."
export ASTRA_DB_API_ENDPOINT="https://0123...-region.apps.astra.datastax.com"
export ASTRA_DB_KEYSPACE="default_keyspace"
# Optional (mostly for HCD and such):
# export ASTRA_DB_ENVIRONMENT="..."

# required to test vectorize with SHARED_SECRET. Comment on HCD and such.
export SHARED_SECRET_NAME_OPENAI="NAME_SUPPLIED_IN_ASTRA_KMS"
# required to test vectorize with HEADER
export OPENAI_API_KEY="sk-aaabbbccc..."

# change to "1" if nvidia server-side embeddings are available for the DB
export NVIDIA_VECTORIZE_AVAILABLE="0"
# ASTRA DB SETUP

ASTRA_DB_API_ENDPOINT=https://your_astra_db_id-your_region.apps.astra.datastax.com
ASTRA_DB_APPLICATION_TOKEN=AstraCS:your_astra_db_application_token
# ASTRA_DB_KEYSPACE=your_astra_db_keyspace
# ASTRA_DB_ENVIRONMENT="prod"

SHARED_SECRET_NAME_OPENAI="key_name_on_astra_kms"
OPENAI_API_KEY="..."


### For testing on HCD, SHARED_SECRET vectorize is not available; the settings will look something like:
#
#
#
# ASTRA_DB_APPLICATION_TOKEN="Cassandra:Y2Fzc2FuZHJh:Y2Fzc2FuZHJh"
# ASTRA_DB_API_ENDPOINT="http://localhost:8181"
# ASTRA_DB_KEYSPACE="keyspace_created_by_the_ci_for_testing"
# ASTRA_DB_ENVIRONMENT="hcd"
#
# OPENAI_API_KEY="..."
#
#
#
57 changes: 32 additions & 25 deletions libs/astradb/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,14 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any

from langchain_core.embeddings import Embeddings
from langchain_core.language_models import LLM
from typing_extensions import override


class SomeEmbeddings(Embeddings):
    """Map a sentence to an embedding vector in some fashion.

    How the vector is derived is unimportant; all that matters for the
    tests is that the mapping is deterministic.
    """

    def __init__(self, dimension: int) -> None:
        self.dimension = dimension

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        return [self.embed_query(text) for text in texts]

    async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
        return self.embed_documents(texts)

    def embed_query(self, text: str) -> list[float]:
        # Use the character codes of (up to) the first `dimension` chars,
        # then pad with a single 1 followed by zeros out to `dimension`,
        # and finally L2-normalize the resulting vector.
        raw = [ord(char) for char in text[: self.dimension]]
        padding = [1] + [0] * (self.dimension - 1 - len(raw))
        components = (raw + padding)[: self.dimension]
        norm = sum(value * value for value in components) ** 0.5
        return [value / norm for value in components]

    async def aembed_query(self, text: str) -> list[float]:
        return self.embed_query(text)
if TYPE_CHECKING:
from langchain_core.callbacks import CallbackManagerForLLMRun


class ParserEmbeddings(Embeddings):
Expand Down Expand Up @@ -61,3 +41,30 @@ def embed_query(self, text: str) -> list[float]:

async def aembed_query(self, text: str) -> list[float]:
return self.embed_query(text)


class IdentityLLM(LLM):
    """Fake LLM that echoes its prompt back, tracking invocation count."""

    # Number of times `_call` has been invoked.
    num_calls: int = 0

    @property
    @override
    def _llm_type(self) -> str:
        return "fake"

    @override
    def _call(
        self,
        prompt: str,
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> str:
        # Return the prompt unchanged; when a stop list is supplied,
        # return an uppercased, marker-wrapped variant instead.
        self.num_calls += 1
        if stop is None:
            return prompt
        return f"STOP<{prompt.upper()}>"

    @property
    @override
    def _identifying_params(self) -> dict[str, Any]:
        return {}
5 changes: 0 additions & 5 deletions libs/astradb/tests/integration_tests/.env.example

This file was deleted.

Loading

0 comments on commit 3eb73d7

Please sign in to comment.