diff --git a/integrations/pinecone/pydoc/config.yml b/integrations/pinecone/pydoc/config.yml index 51ef0ee15..fff835877 100644 --- a/integrations/pinecone/pydoc/config.yml +++ b/integrations/pinecone/pydoc/config.yml @@ -4,9 +4,7 @@ loaders: modules: [ "haystack_integrations.components.retrievers.pinecone.embedding_retriever", - "haystack_integrations.document_stores.pinecone.document_store", - "haystack_integrations.document_stores.pinecone.errors", - "haystack_integrations.document_stores.pinecone.filters", + "haystack_integrations.document_stores.pinecone.document_store" ] ignore_when_discovered: ["__init__"] processors: diff --git a/integrations/pinecone/src/haystack_integrations/components/retrievers/pinecone/embedding_retriever.py b/integrations/pinecone/src/haystack_integrations/components/retrievers/pinecone/embedding_retriever.py index 840c9e1f6..02c4a3a87 100644 --- a/integrations/pinecone/src/haystack_integrations/components/retrievers/pinecone/embedding_retriever.py +++ b/integrations/pinecone/src/haystack_integrations/components/retrievers/pinecone/embedding_retriever.py @@ -12,9 +12,41 @@ @component class PineconeEmbeddingRetriever: """ - Retrieves documents from the PineconeDocumentStore, based on their dense embeddings. + Retrieves documents from the `PineconeDocumentStore`, based on their dense embeddings. - Needs to be connected to the PineconeDocumentStore. 
+ Usage example: + ```python + import os + from haystack.document_stores.types import DuplicatePolicy + from haystack import Document + from haystack import Pipeline + from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder + from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever + from haystack_integrations.document_stores.pinecone import PineconeDocumentStore + + os.environ["PINECONE_API_KEY"] = "YOUR_PINECONE_API_KEY" + document_store = PineconeDocumentStore(index="my_index", namespace="my_namespace", dimension=768) + + documents = [Document(content="There are over 7,000 languages spoken around the world today."), + Document(content="Elephants have been observed to behave in a way that indicates..."), + Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")] + + document_embedder = SentenceTransformersDocumentEmbedder() + document_embedder.warm_up() + documents_with_embeddings = document_embedder.run(documents) + + document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE) + + query_pipeline = Pipeline() + query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) + query_pipeline.add_component("retriever", PineconeEmbeddingRetriever(document_store=document_store)) + query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + + query = "How many languages are there?" + + res = query_pipeline.run({"text_embedder": {"text": query}}) + assert res['retriever']['documents'][0].content == "There are over 7,000 languages spoken around the world today." + ``` """ def __init__( @@ -25,13 +57,11 @@ def __init__( top_k: int = 10, ): """ - Create the PineconeEmbeddingRetriever component. - - :param document_store: An instance of PineconeDocumentStore. - :param filters: Filters applied to the retrieved Documents. Defaults to None. 
- :param top_k: Maximum number of Documents to return, defaults to 10. + :param document_store: The Pinecone Document Store. + :param filters: Filters applied to the retrieved Documents. + :param top_k: Maximum number of Documents to return. - :raises ValueError: If `document_store` is not an instance of PineconeDocumentStore. + :raises ValueError: If `document_store` is not an instance of `PineconeDocumentStore`. """ if not isinstance(document_store, PineconeDocumentStore): msg = "document_store must be an instance of PineconeDocumentStore" @@ -42,6 +72,11 @@ def __init__( self.top_k = top_k def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, filters=self.filters, @@ -51,6 +86,13 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "PineconeEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ data["init_parameters"]["document_store"] = default_from_dict( PineconeDocumentStore, data["init_parameters"]["document_store"] ) @@ -59,10 +101,10 @@ def from_dict(cls, data: Dict[str, Any]) -> "PineconeEmbeddingRetriever": @component.output_types(documents=List[Document]) def run(self, query_embedding: List[float]): """ - Retrieve documents from the PineconeDocumentStore, based on their dense embeddings. + Retrieve documents from the `PineconeDocumentStore`, based on their dense embeddings. :param query_embedding: Embedding of the query. - :return: List of Document similar to `query_embedding`. + :returns: List of Document similar to `query_embedding`. 
""" docs = self.document_store._embedding_retrieval( query_embedding=query_embedding, diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py index 91364d7bf..a23cf80f6 100644 --- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py +++ b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py @@ -26,6 +26,10 @@ class PineconeDocumentStore: + """ + A Document Store using [Pinecone vector database](https://www.pinecone.io/). + """ + def __init__( self, *, @@ -42,20 +46,17 @@ def __init__( It is meant to be connected to a Pinecone index and namespace. :param api_key: The Pinecone API key. It can be explicitly provided or automatically read from the - environment variable PINECONE_API_KEY (recommended). - :param environment: The Pinecone environment to connect to. Defaults to "us-west1-gcp". + environment variable `PINECONE_API_KEY` (recommended). + :param environment: The Pinecone environment to connect to. :param index: The Pinecone index to connect to. If the index does not exist, it will be created. - Defaults to "default". :param namespace: The Pinecone namespace to connect to. If the namespace does not exist, it will be created - at the first write. Defaults to "default". - :param batch_size: The number of documents to write in a single batch. Defaults to 100, as recommended by - Pinecone. + at the first write. + :param batch_size: The number of documents to write in a single batch. When setting this parameter, + consider [documented Pinecone limits](https://docs.pinecone.io/docs/limits). :param dimension: The dimension of the embeddings. This parameter is only used when creating a new index. - Defaults to 768. :param index_creation_kwargs: Additional keyword arguments to pass to the index creation method. 
- For example, you can specify `metric`, `pods`, `replicas`... You can find the full list of supported arguments in the - [API reference](https://docs.pinecone.io/reference/create_index-1). + [API reference](https://docs.pinecone.io/reference/create_index). """ resolved_api_key = api_key.resolve_value() @@ -95,10 +96,22 @@ def __init__( @classmethod def from_dict(cls, data: Dict[str, Any]) -> "PineconeDocumentStore": + """ + Deserializes the component from a dictionary. + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, api_key=self.api_key.to_dict(), @@ -128,7 +141,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D :param policy: The duplicate policy to use when writing documents. PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`. - :return: The number of documents written to the document store. + :returns: The number of documents written to the document store. """ if len(documents) > 0 and not isinstance(documents[0], Document): msg = "param 'documents' must contain a list of objects of type Document" @@ -157,7 +170,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) :param filters: The filters to apply to the document list. - :return: A list of Documents that match the given filters. + :returns: A list of Documents that match the given filters. 
""" # Pinecone only performs vector similarity search @@ -178,7 +191,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc def delete_documents(self, document_ids: List[str]) -> None: """ - Deletes all documents with a matching document_ids from the document store. + Deletes documents that match the provided `document_ids` from the document store. :param document_ids: the document ids to delete """ @@ -197,14 +210,14 @@ def _embedding_retrieval( This method is not meant to be part of the public interface of `PineconeDocumentStore` nor called directly. - `PineconeDenseRetriever` uses this method directly and is the public interface for it. + `PineconeEmbeddingRetriever` uses this method directly and is the public interface for it. :param query_embedding: Embedding of the query. :param namespace: Pinecone namespace to query. Defaults to the namespace of the document store. - :param filters: Filters applied to the retrieved Documents. Defaults to None. - :param top_k: Maximum number of Documents to return, defaults to 10 + :param filters: Filters applied to the retrieved Documents. + :param top_k: Maximum number of Documents to return. - :return: List of Document that are most similar to `query_embedding` + :returns: List of Document that are most similar to `query_embedding` """ if not query_embedding: diff --git a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/errors.py b/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/errors.py deleted file mode 100644 index 994f34cf0..000000000 --- a/integrations/pinecone/src/haystack_integrations/document_stores/pinecone/errors.py +++ /dev/null @@ -1,10 +0,0 @@ -from haystack.document_stores.errors import DocumentStoreError -from haystack.errors import FilterError - - -class PineconeDocumentStoreError(DocumentStoreError): - pass - - -class PineconeDocumentStoreFilterError(FilterError): - pass