Skip to content

Commit

Permalink
labeler
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 committed Dec 22, 2023
1 parent 8e6f0e6 commit c759d10
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 15 deletions.
5 changes: 5 additions & 0 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ integration:qdrant:
- any-glob-to-any-file: "integrations/qdrant/**/*"
- any-glob-to-any-file: ".github/workflows/qdrant.yml"

integration:pinecone:
- changed-files:
- any-glob-to-any-file: "integrations/pinecone/**/*"
- any-glob-to-any-file: ".github/workflows/pinecone.yml"

integration:unstructured-fileconverter:
- changed-files:
- any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*"
Expand Down
39 changes: 24 additions & 15 deletions integrations/pinecone/src/pinecone_haystack/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,23 +162,31 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
return written_docs

def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
if not filters:
# in this case, we try to return all documents but Pinecone has some limits
documents = self._embedding_retrieval(
query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT
)
for doc in documents:
doc.score = None
"""
Returns the documents that match the filters provided.
total_docs_number = self.count_documents()
if total_docs_number > TOP_K_LIMIT:
logger.warning(
f"PineconeDocumentStore can only return {TOP_K_LIMIT} documents. "
f"However, there are {total_docs_number} documents in the namespace. "
)
return documents
For a detailed specification of the filters,
refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
return []
:param filters: The filters to apply to the document list.
:return: A list of Documents that match the given filters.
"""

# Pinecone only performs vector similarity search
# here we are querying with a dummy vector and the max compatible top_k
documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT)

# when simply filtering, we don't want to return any scores
# furthermore, we are querying with a dummy vector, so the scores are meaningless
for doc in documents:
doc.score = None

if len(documents) == TOP_K_LIMIT:
logger.warning(
f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. "
f"It is likely that there are more matching documents in the document store. "
)
return documents

def delete_documents(self, document_ids: List[str]) -> None:
"""
Expand All @@ -204,6 +212,7 @@ def _embedding_retrieval(
`PineconeEmbeddingRetriever` uses this method directly and is the public interface for it.
:param query_embedding: Embedding of the query.
:param namespace: Pinecone namespace to query. Defaults to the namespace of the document store.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10
Expand Down

0 comments on commit c759d10

Please sign in to comment.