diff --git a/libs/partners/pinecone/langchain_pinecone/vectorstores.py b/libs/partners/pinecone/langchain_pinecone/vectorstores.py index 05fbd9199541d..e4548f0852541 100644 --- a/libs/partners/pinecone/langchain_pinecone/vectorstores.py +++ b/libs/partners/pinecone/langchain_pinecone/vectorstores.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import logging import os import uuid @@ -21,6 +22,7 @@ from langchain_core.utils.iter import batch_iterate from langchain_core.vectorstores import VectorStore from pinecone import Pinecone as PineconeClient # type: ignore +from pinecone.exceptions import NotFoundException from langchain_pinecone._utilities import DistanceStrategy, maximal_marginal_relevance @@ -223,6 +225,29 @@ def embeddings(self) -> Optional[Embeddings]: """Access the query embedding object if available.""" return self._embedding + def _get_vector_count(self) -> int: + description = self._index.describe_index_stats() + return description["total_vector_count"] + + async def _wait_on_index(self, expected_num_docs: int) -> None: + """ + Wait for the Pinecone Index to be ready. + + Blocks until the index has the expected number of documents. + + Args: + expected_num_docs: The expected number of documents in the index. + + """ + ready = False + while not ready: + await asyncio.sleep(2) + try: + vector_count = self._get_vector_count() + ready = vector_count == expected_num_docs + except NotFoundException: + pass + def add_texts( self, texts: Iterable[str], @@ -269,6 +294,8 @@ def add_texts( for metadata, text in zip(metadatas, texts): metadata[self._text_key] = text + initial_vector_count = self._get_vector_count() + # For loops to avoid memory issues and optimize when using HTTP based embeddings # The first loop runs the embeddings, it benefits when using OpenAI embeddings # The second loops runs the pinecone upsert asynchronously. @@ -298,6 +325,8 @@ def add_texts( **kwargs, ) + asyncio.run(self._wait_on_index(len(texts) + initial_vector_count)) + return ids def similarity_search_with_score( diff --git a/libs/partners/pinecone/tests/integration_tests/test_vectorstores.py b/libs/partners/pinecone/tests/integration_tests/test_vectorstores.py index 184825a8174a8..e49ba444dd3a9 100644 --- a/libs/partners/pinecone/tests/integration_tests/test_vectorstores.py +++ b/libs/partners/pinecone/tests/integration_tests/test_vectorstores.py @@ -89,7 +89,6 @@ def test_from_texts( index_name=INDEX_NAME, namespace=NAMESPACE_NAME, ) - time.sleep(DEFAULT_SLEEP) # prevent race condition output = docsearch.similarity_search(unique_id, k=1, namespace=NAMESPACE_NAME) output[0].id = None # overwrite ID for ease of comparison assert output == [Document(page_content=needs)] @@ -113,7 +112,6 @@ def test_from_texts_with_metadatas( metadatas=metadatas, namespace=namespace, ) - time.sleep(DEFAULT_SLEEP) # prevent race condition output = docsearch.similarity_search(needs, k=1, namespace=namespace) output[0].id = None @@ -133,7 +131,6 @@ def test_from_texts_with_scores(self, embedding_openai: OpenAIEmbeddings) -> Non namespace=NAMESPACE_NAME, ) print(texts) # noqa: T201 - time.sleep(DEFAULT_SLEEP) # prevent race condition output = docsearch.similarity_search_with_score( "foo", k=3, namespace=NAMESPACE_NAME ) @@ -178,8 +175,6 @@ def test_from_existing_index_with_namespaces( namespace=f"{INDEX_NAME}-2", ) - time.sleep(DEFAULT_SLEEP) # prevent race condition - # Search with namespace docsearch = PineconeVectorStore.from_existing_index( index_name=INDEX_NAME, @@ -203,7 +198,6 @@ def test_add_documents_with_ids( index_name=INDEX_NAME, namespace=NAMESPACE_NAME, ) - time.sleep(DEFAULT_SLEEP) # prevent race condition index_stats = self.index.describe_index_stats() assert index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == len(texts) @@ -215,7 +209,6 @@ def test_add_documents_with_ids( index_name=INDEX_NAME, namespace=NAMESPACE_NAME, ) - time.sleep(DEFAULT_SLEEP) # prevent race condition index_stats = self.index.describe_index_stats() assert ( index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == len(texts) * 2 @@ -234,8 +227,6 @@ def test_relevance_score_bound(self, embedding_openai: OpenAIEmbeddings) -> None index_name=INDEX_NAME, metadatas=metadatas, ) - # wait for the index to be ready - time.sleep(DEFAULT_SLEEP) output = docsearch.similarity_search_with_relevance_scores("foo", k=3) print(output) # noqa: T201 assert all(