Skip to content

Commit

Permalink
Merge branch 'add_pinecone' into pinecone-filters
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 committed Dec 22, 2023
2 parents d13692a + 017cd75 commit 8459d7b
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 51 deletions.
5 changes: 5 additions & 0 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ integration:qdrant:
- any-glob-to-any-file: "integrations/qdrant/**/*"
- any-glob-to-any-file: ".github/workflows/qdrant.yml"

integration:pinecone:
- changed-files:
- any-glob-to-any-file: "integrations/pinecone/**/*"
- any-glob-to-any-file: ".github/workflows/pinecone.yml"

integration:unstructured-fileconverter:
- changed-files:
- any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*"
Expand Down
6 changes: 4 additions & 2 deletions integrations/pinecone/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ dependencies = [
"pytest-xdist",
]
[tool.hatch.envs.default.scripts]
test = "pytest {args:tests}"
test-cov = "coverage run -m pytest {args:tests}"
# Pinecone tests are slow (require HTTP requests), so we run them in parallel
# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html)
test = "pytest -n auto --maxprocesses=3 {args:tests}"
test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}"
cov-report = [
"- coverage combine",
"coverage report",
Expand Down
35 changes: 22 additions & 13 deletions integrations/pinecone/src/pinecone_haystack/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,22 +165,30 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
return written_docs

def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
# we try to return all the matching documents but Pinecone has some limits
"""
Returns the documents that match the filters provided.
For a detailed specification of the filters,
refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
:param filters: The filters to apply to the document list.
:return: A list of Documents that match the given filters.
"""

# Pinecone only performs vector similarity search
# here we are querying with a dummy vector and the max compatible top_k
documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT)

# when simply filtering, we don't want to return any scores
# furthermore, we are querying with a dummy vector, so the scores are meaningless
for doc in documents:
doc.score = None

#TODO: restart from here
total_docs_number = self.count_documents()
if total_docs_number > TOP_K_LIMIT:
if len(documents) == TOP_K_LIMIT:
logger.warning(
f"PineconeDocumentStore can only return {TOP_K_LIMIT} documents. "
f"However, there are {total_docs_number} documents in the namespace. "
f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. "
f"It is likely that there are more matching documents in the document store. "
)
return documents

documents = self._embedding_retrieval(query_embedding=self._dummy_vector, top_k=TOP_K_LIMIT, filters=filters)

return documents

def delete_documents(self, document_ids: List[str]) -> None:
Expand All @@ -195,7 +203,8 @@ def _embedding_retrieval(
self,
query_embedding: List[float],
*,
filters: Optional[Dict[str, Any]] = None,
namespace: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 (filters to be implemented)
top_k: int = 10,
) -> List[Document]:
"""
Expand All @@ -206,6 +215,7 @@ def _embedding_retrieval(
`PineconeEmbeddingRetriever` uses this method directly and is the public interface for it.
:param query_embedding: Embedding of the query.
:param namespace: Pinecone namespace to query. Defaults to the namespace of the document store.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10
Expand All @@ -223,8 +233,7 @@ def _embedding_retrieval(
result = self._index.query(
vector=query_embedding,
top_k=top_k,
filter=filters,
namespace=self.namespace,
namespace=namespace or self.namespace,
include_values=True,
include_metadata=True,
)
Expand Down
7 changes: 0 additions & 7 deletions integrations/pinecone/tests/test_count.py

This file was deleted.

7 changes: 0 additions & 7 deletions integrations/pinecone/tests/test_delete.py

This file was deleted.

15 changes: 14 additions & 1 deletion integrations/pinecone/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,24 @@
import numpy as np
import pytest
from haystack import Document
from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest

from pinecone_haystack.document_store import PineconeDocumentStore


class TestDocumentStore:
class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest):
def test_write_documents(self, document_store: PineconeDocumentStore):
docs = [Document(id="1")]
assert document_store.write_documents(docs) == 1

@pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore):
...

@pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore):
...

@patch("pinecone_haystack.document_store.pinecone")
def test_init(self, mock_pinecone):
mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
Expand Down
21 changes: 0 additions & 21 deletions integrations/pinecone/tests/test_write.py

This file was deleted.

0 comments on commit 8459d7b

Please sign in to comment.