langchain-ai · orkhank · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/libs/partners/pinecone/langchain_pinecone/vectorstores.py b/libs/partners/pinecone/langchain_pinecone/vectorstores.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 import os
 import uuid
@@ -21,6 +22,7 @@
 from langchain_core.utils.iter import batch_iterate
 from langchain_core.vectorstores import VectorStore
 from pinecone import Pinecone as PineconeClient  # type: ignore
+from pinecone.exceptions import NotFoundException
 
 from langchain_pinecone._utilities import DistanceStrategy, maximal_marginal_relevance
 
@@ -223,6 +225,29 @@ def embeddings(self) -> Optional[Embeddings]:
         """Access the query embedding object if available."""
         return self._embedding
 
+    def _get_vector_count(self) -> int:
+        description = self._index.describe_index_stats()
+        return description["total_vector_count"]
+
+    async def _wait_on_index(self, expected_num_docs: int) -> None:
+        """
+        Wait for the Pinecone Index to be ready.
+
+        Blocks until the index has the expected number of documents.
+
+        Args:
+            expected_num_docs: The expected number of documents in the index.
+
+        """
+        ready = False
+        while not ready:
+            await asyncio.sleep(2)
+            try:
+                vector_count = self._get_vector_count()
+                ready = vector_count == expected_num_docs
+            except NotFoundException:
+                pass
+
     def add_texts(
         self,
         texts: Iterable[str],
@@ -269,6 +294,8 @@ def add_texts(
         for metadata, text in zip(metadatas, texts):
             metadata[self._text_key] = text
 
+        initial_vector_count = self._get_vector_count()
+
         # For loops to avoid memory issues and optimize when using HTTP based embeddings
         # The first loop runs the embeddings, it benefits when using OpenAI embeddings
         # The second loops runs the pinecone upsert asynchronously.
@@ -298,6 +325,8 @@ def add_texts(
                     **kwargs,
                 )
 
+        asyncio.run(self._wait_on_index(len(texts) + initial_vector_count))
+
         return ids
 
     def similarity_search_with_score(

diff --git a/libs/partners/pinecone/tests/integration_tests/test_vectorstores.py b/libs/partners/pinecone/tests/integration_tests/test_vectorstores.py
@@ -89,7 +89,6 @@ def test_from_texts(
             index_name=INDEX_NAME,
             namespace=NAMESPACE_NAME,
         )
-        time.sleep(DEFAULT_SLEEP)  # prevent race condition
         output = docsearch.similarity_search(unique_id, k=1, namespace=NAMESPACE_NAME)
         output[0].id = None  # overwrite ID for ease of comparison
         assert output == [Document(page_content=needs)]
@@ -113,7 +112,6 @@ def test_from_texts_with_metadatas(
             metadatas=metadatas,
             namespace=namespace,
         )
-        time.sleep(DEFAULT_SLEEP)  # prevent race condition
         output = docsearch.similarity_search(needs, k=1, namespace=namespace)
 
         output[0].id = None
@@ -133,7 +131,6 @@ def test_from_texts_with_scores(self, embedding_openai: OpenAIEmbeddings) -> Non
             namespace=NAMESPACE_NAME,
         )
         print(texts)  # noqa: T201
-        time.sleep(DEFAULT_SLEEP)  # prevent race condition
         output = docsearch.similarity_search_with_score(
             "foo", k=3, namespace=NAMESPACE_NAME
         )
@@ -178,8 +175,6 @@ def test_from_existing_index_with_namespaces(
             namespace=f"{INDEX_NAME}-2",
         )
 
-        time.sleep(DEFAULT_SLEEP)  # prevent race condition
-
         # Search with namespace
         docsearch = PineconeVectorStore.from_existing_index(
             index_name=INDEX_NAME,
@@ -203,7 +198,6 @@ def test_add_documents_with_ids(
             index_name=INDEX_NAME,
             namespace=NAMESPACE_NAME,
         )
-        time.sleep(DEFAULT_SLEEP)  # prevent race condition
         index_stats = self.index.describe_index_stats()
         assert index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == len(texts)
 
@@ -215,7 +209,6 @@ def test_add_documents_with_ids(
             index_name=INDEX_NAME,
             namespace=NAMESPACE_NAME,
         )
-        time.sleep(DEFAULT_SLEEP)  # prevent race condition
         index_stats = self.index.describe_index_stats()
         assert (
             index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == len(texts) * 2
@@ -234,8 +227,6 @@ def test_relevance_score_bound(self, embedding_openai: OpenAIEmbeddings) -> None
             index_name=INDEX_NAME,
             metadatas=metadatas,
         )
-        # wait for the index to be ready
-        time.sleep(DEFAULT_SLEEP)
         output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
         print(output)  # noqa: T201
         assert all(