Skip to content

Commit

Permalink
Thorough rewrite and optimization of integration tests (#82)
Browse files Browse the repository at this point in the history
* complete removal of SomeEmbeddings for tests

* test_vectorstore_autodetect brought to rationality

* test_graphvectorstore is nice now

* halfway through test_vectorstore.py

* tests of from_ methods of vectorstore are now good

* most test_vectorstore done; missing only indexing and coreclients_init

* all of graph/vectorstores brought to order

* graph/vstore tests mostly hcd-compatible (wip)

* wip on fixing the hcd/apikey header thing

* completed rewrite of tests for graph/vectorstores

* chat message histories tested nicely

* deep restructuring of the caches testing

* test_document_loaders under control

* further improvement test document loader

* with test_storage it seems everything is done now.

* tiny docstr edit

* make openai key into a fixture to heal compile test

* clean info on IT prereqs
  • Loading branch information
hemidactylus authored Sep 27, 2024
1 parent 80ea19c commit 3eb73d7
Show file tree
Hide file tree
Showing 20 changed files with 3,554 additions and 3,250 deletions.
1 change: 0 additions & 1 deletion libs/astradb/codespell_ignore_words.txt
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
Haa
2 changes: 1 addition & 1 deletion libs/astradb/langchain_astradb/graph_vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def mmr_traversal_search( # noqa: C901
def get_adjacent(tags: set[str]) -> Iterable[_Edge]:
targets: dict[str, _Edge] = {}

# TODO: Would be better parralelized
# TODO: Would be better parallelized
for tag in tags:
m_filter = (metadata_filter or {}).copy()
m_filter[self.link_from_metadata_key] = tag
Expand Down
2 changes: 1 addition & 1 deletion libs/astradb/langchain_astradb/utils/mmr_traversal.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def __init__(

# List of the candidates.
self.candidates = []
# ND array of the candidate embeddings.
# numpy n-dimensional array of the candidate embeddings.
self.candidate_embeddings = np.ndarray((0, self.dimensions), dtype=np.float32)

self.best_score = NEG_INF
Expand Down
11 changes: 4 additions & 7 deletions libs/astradb/langchain_astradb/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,8 +460,6 @@ def __init__(
This is useful when the service is configured for the collection,
but no corresponding secret is stored within
Astra's key management system.
This parameter cannot be provided without
specifying ``collection_vector_service_options``.
content_field: name of the field containing the textual content
in the documents when saved on Astra DB. For vectorize collections,
this cannot be specified; for non-vectorize collection, defaults
Expand All @@ -473,7 +471,7 @@ def __init__(
Please understand the limitations of this method and get some
understanding of your data before passing ``"*"`` for this parameter.
ignore_invalid_documents: if False (default), exceptions are raised
when a document is found on the Astra DB collectin that does
when a document is found on the Astra DB collection that does
not have the expected shape. If set to True, such results
from the database are ignored and a warning is issued. Note
that in this case a similarity search may end up returning fewer
Expand Down Expand Up @@ -824,11 +822,10 @@ async def adelete(
raise ValueError(msg)

_max_workers = concurrency or self.bulk_delete_concurrency
return all(
await gather_with_concurrency(
_max_workers, *[self.adelete_by_document_id(doc_id) for doc_id in ids]
)
await gather_with_concurrency(
_max_workers, *[self.adelete_by_document_id(doc_id) for doc_id in ids]
)
return True

def delete_collection(self) -> None:
"""Completely delete the collection from the database.
Expand Down
37 changes: 24 additions & 13 deletions libs/astradb/testing.env.sample
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
export ASTRA_DB_APPLICATION_TOKEN="AstraCS:aaabbbccc..."
export ASTRA_DB_API_ENDPOINT="https://0123...-region.apps.astra.datastax.com"
export ASTRA_DB_KEYSPACE="default_keyspace"
# Optional (mostly for HCD and such):
# export ASTRA_DB_ENVIRONMENT="..."

# required to test vectorize with SHARED_SECRET. Comment on HCD and such.
export SHARED_SECRET_NAME_OPENAI="NAME_SUPPLIED_IN_ASTRA_KMS"
# required to test vectorize with HEADER
export OPENAI_API_KEY="sk-aaabbbccc..."

# change to "1" if nvidia server-side embeddings are available for the DB
export NVIDIA_VECTORIZE_AVAILABLE="0"
# ASTRA DB SETUP

ASTRA_DB_API_ENDPOINT=https://your_astra_db_id-your_region.apps.astra.datastax.com
ASTRA_DB_APPLICATION_TOKEN=AstraCS:your_astra_db_application_token
# ASTRA_DB_KEYSPACE=your_astra_db_keyspace
# ASTRA_DB_ENVIRONMENT="prod"

SHARED_SECRET_NAME_OPENAI="key_name_on_astra_kms"
OPENAI_API_KEY="..."


### For testing on HCD, SHARED_SECRET vectorize is not available; the settings will look something like:
#
#
#
# ASTRA_DB_APPLICATION_TOKEN="Cassandra:Y2Fzc2FuZHJh:Y2Fzc2FuZHJh"
# ASTRA_DB_API_ENDPOINT="http://localhost:8181"
# ASTRA_DB_KEYSPACE="keyspace_created_by_the_ci_for_testing"
# ASTRA_DB_ENVIRONMENT="hcd"
#
# OPENAI_API_KEY="..."
#
#
#
57 changes: 32 additions & 25 deletions libs/astradb/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,14 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any

from langchain_core.embeddings import Embeddings
from langchain_core.language_models import LLM
from typing_extensions import override


class SomeEmbeddings(Embeddings):
    """Map a sentence to an embedding vector in some fashion.

    How the vector is derived is unimportant; all that matters for the
    tests is that the mapping is deterministic.
    """

    def __init__(self, dimension: int) -> None:
        self.dimension = dimension

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        return [self.embed_query(text) for text in texts]

    async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
        return self.embed_documents(texts)

    def embed_query(self, text: str) -> list[float]:
        # Use the character codes of (up to) the first `dimension` chars,
        # then pad with a single 1 followed by zeros out to `dimension`,
        # and finally L2-normalize the resulting vector.
        raw = [ord(char) for char in text[: self.dimension]]
        padding = [1] + [0] * (self.dimension - 1 - len(raw))
        components = (raw + padding)[: self.dimension]
        norm = sum(value * value for value in components) ** 0.5
        return [value / norm for value in components]

    async def aembed_query(self, text: str) -> list[float]:
        return self.embed_query(text)
if TYPE_CHECKING:
from langchain_core.callbacks import CallbackManagerForLLMRun


class ParserEmbeddings(Embeddings):
Expand Down Expand Up @@ -61,3 +41,30 @@ def embed_query(self, text: str) -> list[float]:

async def aembed_query(self, text: str) -> list[float]:
return self.embed_query(text)


class IdentityLLM(LLM):
    """Fake LLM that echoes its prompt back, tracking invocation count."""

    # Number of times `_call` has been invoked.
    num_calls: int = 0

    @property
    @override
    def _llm_type(self) -> str:
        return "fake"

    @override
    def _call(
        self,
        prompt: str,
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> str:
        # Return the prompt unchanged; when a stop list is supplied,
        # return an uppercased, marker-wrapped variant instead.
        self.num_calls += 1
        if stop is None:
            return prompt
        return f"STOP<{prompt.upper()}>"

    @property
    @override
    def _identifying_params(self) -> dict[str, Any]:
        return {}
5 changes: 0 additions & 5 deletions libs/astradb/tests/integration_tests/.env.example

This file was deleted.

Loading

0 comments on commit 3eb73d7

Please sign in to comment.