Merge branch 'add_pinecone' into pinecone-filters

deepset-ai · Dec 22, 2023 · 8459d7b · 8459d7b
2 parents d13692a + 017cd75
commit 8459d7b
Show file tree

Hide file tree

Showing 7 changed files with 45 additions and 51 deletions.
diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -44,6 +44,11 @@ integration:qdrant:
       - any-glob-to-any-file: "integrations/qdrant/**/*"
       - any-glob-to-any-file: ".github/workflows/qdrant.yml"
 
+integration:pinecone:
+  - changed-files:
+      - any-glob-to-any-file: "integrations/pinecone/**/*"
+      - any-glob-to-any-file: ".github/workflows/pinecone.yml"
+
 integration:unstructured-fileconverter:
   - changed-files:
       - any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*"

diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml
@@ -48,8 +48,10 @@ dependencies = [
   "pytest-xdist",
 ]
 [tool.hatch.envs.default.scripts]
-test = "pytest {args:tests}"
-test-cov = "coverage run -m pytest {args:tests}"
+# Pinecone tests are slow (require HTTP requests), so we run them in parallel
+# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html)
+test = "pytest -n auto --maxprocesses=3 {args:tests}"
+test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}"
 cov-report = [
   "- coverage combine",
   "coverage report",

diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -165,22 +165,30 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
         return written_docs
 
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
-        # we try to return all the matching documents but Pinecone has some limits
+        """
+        Returns the documents that match the filters provided.
+
+        For a detailed specification of the filters,
+        refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering).
+
+        :param filters: The filters to apply to the document list.
+        :return: A list of Documents that match the given filters.
+        """
+
+        # Pinecone only performs vector similarity search, so to emulate plain filtering
+        # we query with a dummy vector and the maximum top_k that Pinecone allows
         documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT)
+
+        # when simply filtering, we don't want to return any scores
+        # furthermore, we are querying with a dummy vector, so the scores are meaningless
         for doc in documents:
             doc.score = None
 
-        #TODO: restart from here
-        total_docs_number = self.count_documents()
-        if total_docs_number > TOP_K_LIMIT:
+        if len(documents) == TOP_K_LIMIT:
             logger.warning(
-                f"PineconeDocumentStore can only return {TOP_K_LIMIT} documents. "
-                f"However, there are {total_docs_number} documents in the namespace. "
+                f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. "
+                f"It is likely that there are more matching documents in the document store. "
             )
-            return documents
-
-        documents = self._embedding_retrieval(query_embedding=self._dummy_vector, top_k=TOP_K_LIMIT, filters=filters)
-
         return documents
 
     def delete_documents(self, document_ids: List[str]) -> None:
@@ -195,7 +203,8 @@ def _embedding_retrieval(
         self,
         query_embedding: List[float],
         *,
-        filters: Optional[Dict[str, Any]] = None,
+        namespace: Optional[str] = None,
+        filters: Optional[Dict[str, Any]] = None,  # noqa: ARG002 (filters to be implemented)
         top_k: int = 10,
     ) -> List[Document]:
         """
@@ -206,6 +215,7 @@ def _embedding_retrieval(
         `PineconeEmbeddingRetriever` uses this method directly and is the public interface for it.
 
         :param query_embedding: Embedding of the query.
+        :param namespace: Pinecone namespace to query. Defaults to the namespace of the document store.
         :param filters: Filters applied to the retrieved Documents. Defaults to None.
         :param top_k: Maximum number of Documents to return, defaults to 10
 
@@ -223,8 +233,7 @@ def _embedding_retrieval(
         result = self._index.query(
             vector=query_embedding,
             top_k=top_k,
-            filter=filters,
-            namespace=self.namespace,
+            namespace=namespace or self.namespace,
             include_values=True,
             include_metadata=True,
         )

diff --git a/integrations/pinecone/tests/test_count.py b/integrations/pinecone/tests/test_count.py
diff --git a/integrations/pinecone/tests/test_delete.py b/integrations/pinecone/tests/test_delete.py
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
@@ -3,11 +3,24 @@
 import numpy as np
 import pytest
 from haystack import Document
+from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest
 
 from pinecone_haystack.document_store import PineconeDocumentStore
 
 
-class TestDocumentStore:
+class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest):
+    def test_write_documents(self, document_store: PineconeDocumentStore):
+        docs = [Document(id="1")]
+        assert document_store.write_documents(docs) == 1
+
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
+    def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore):
+        ...
+
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
+    def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore):
+        ...
+
     @patch("pinecone_haystack.document_store.pinecone")
     def test_init(self, mock_pinecone):
         mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}

diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py