diff --git a/.github/labeler.yml b/.github/labeler.yml index f2dcedad2..1a41c2caf 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -44,6 +44,11 @@ integration:qdrant: - any-glob-to-any-file: "integrations/qdrant/**/*" - any-glob-to-any-file: ".github/workflows/qdrant.yml" +integration:pinecone: + - changed-files: + - any-glob-to-any-file: "integrations/pinecone/**/*" + - any-glob-to-any-file: ".github/workflows/pinecone.yml" + integration:unstructured-fileconverter: - changed-files: - any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*" diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml index 506795e7f..069dba1be 100644 --- a/integrations/pinecone/pyproject.toml +++ b/integrations/pinecone/pyproject.toml @@ -48,8 +48,10 @@ dependencies = [ "pytest-xdist", ] [tool.hatch.envs.default.scripts] -test = "pytest {args:tests}" -test-cov = "coverage run -m pytest {args:tests}" +# Pinecone tests are slow (require HTTP requests), so we run them in parallel +# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html) +test = "pytest -n auto --maxprocesses=3 {args:tests}" +test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}" cov-report = [ "- coverage combine", "coverage report", diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 98e9922c2..e5f0e032f 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -165,22 +165,30 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D return written_docs def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: - # we try to return all the matching documents but Pinecone has some limits + """ + Returns the documents that match the filters provided. 
+ + For a detailed specification of the filters, + refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) + + :param filters: The filters to apply to the document list. + :return: A list of Documents that match the given filters. + """ + + # Pinecone only performs vector similarity search + # here we are querying with a dummy vector and the max compatible top_k documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT) + + # when simply filtering, we don't want to return any scores + # furthermore, we are querying with a dummy vector, so the scores are meaningless for doc in documents: doc.score = None - #TODO: restart from here - total_docs_number = self.count_documents() - if total_docs_number > TOP_K_LIMIT: + if len(documents) == TOP_K_LIMIT: logger.warning( - f"PineconeDocumentStore can only return {TOP_K_LIMIT} documents. " - f"However, there are {total_docs_number} documents in the namespace. " + f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. " + f"It is likely that there are more matching documents in the document store. " ) - return documents - - documents = self._embedding_retrieval(query_embedding=self._dummy_vector, top_k=TOP_K_LIMIT, filters=filters) - return documents def delete_documents(self, document_ids: List[str]) -> None: @@ -195,7 +203,8 @@ def _embedding_retrieval( self, query_embedding: List[float], *, - filters: Optional[Dict[str, Any]] = None, + namespace: Optional[str] = None, + filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 (filters to be implemented) top_k: int = 10, ) -> List[Document]: """ @@ -206,6 +215,7 @@ def _embedding_retrieval( `PineconeEmbeddingRetriever` uses this method directly and is the public interface for it. :param query_embedding: Embedding of the query. + :param namespace: Pinecone namespace to query. Defaults to the namespace of the document store. 
:param filters: Filters applied to the retrieved Documents. Defaults to None. :param top_k: Maximum number of Documents to return, defaults to 10 @@ -223,8 +233,7 @@ def _embedding_retrieval( result = self._index.query( vector=query_embedding, top_k=top_k, - filter=filters, - namespace=self.namespace, + namespace=namespace or self.namespace, include_values=True, include_metadata=True, ) diff --git a/integrations/pinecone/tests/test_count.py b/integrations/pinecone/tests/test_count.py deleted file mode 100644 index 02462d422..000000000 --- a/integrations/pinecone/tests/test_count.py +++ /dev/null @@ -1,7 +0,0 @@ -from haystack.testing.document_store import ( - CountDocumentsTest, -) - - -class TestCountDocuments(CountDocumentsTest): - ... diff --git a/integrations/pinecone/tests/test_delete.py b/integrations/pinecone/tests/test_delete.py deleted file mode 100644 index 88b145704..000000000 --- a/integrations/pinecone/tests/test_delete.py +++ /dev/null @@ -1,7 +0,0 @@ -from haystack.testing.document_store import ( - DeleteDocumentsTest, -) - - -class TestDeleteDocuments(DeleteDocumentsTest): - ... 
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 2169cf026..76088e860 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -3,11 +3,24 @@ import numpy as np import pytest from haystack import Document +from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest from pinecone_haystack.document_store import PineconeDocumentStore -class TestDocumentStore: +class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest): + def test_write_documents(self, document_store: PineconeDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") + def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): + ... + + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") + def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): + ... 
+ @patch("pinecone_haystack.document_store.pinecone") def test_init(self, mock_pinecone): mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30} diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py deleted file mode 100644 index 7c04a93be..000000000 --- a/integrations/pinecone/tests/test_write.py +++ /dev/null @@ -1,21 +0,0 @@ -import pytest -from haystack import Document -from haystack.testing.document_store import ( - WriteDocumentsTest, -) - -from pinecone_haystack.document_store import PineconeDocumentStore - - -class TestWriteDocuments(WriteDocumentsTest): - def test_write_documents(self, document_store: PineconeDocumentStore): - docs = [Document(id="1")] - assert document_store.write_documents(docs) == 1 - - @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): - ... - - @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): - ...