From c6da1208f469b4569545b856378afb6ac30f2c0c Mon Sep 17 00:00:00 2001
From: "David S. Batista"
Date: Tue, 5 Mar 2024 11:23:48 +0100
Subject: [PATCH] doc: fixing docstrings and API docs for gradient (#507)

* initial import

* adding returned Dict

* attending PR comments

* linting

* fixing doc
---
 .../gradient/gradient_document_embedder.py | 49 +++++++++++++++----
 .../gradient/gradient_text_embedder.py     | 31 +++++++++---
 .../components/generators/gradient/base.py | 21 ++++++--
 3 files changed, 78 insertions(+), 23 deletions(-)

diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py
index 4ccfb9da5..a868c6c1b 100644
--- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py
+++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py
@@ -23,16 +23,32 @@ def _alt_progress_bar(x: Any) -> Any:
 class GradientDocumentEmbedder:
     """
     A component for computing Document embeddings using Gradient AI API.
+    The embedding of each Document is stored in the `embedding` field of the Document.
 
+    Usage example:
     ```python
-    embedder = GradientDocumentEmbedder(model="bge_large")
-    p = Pipeline()
-    p.add_component(embedder, name="document_embedder")
-    p.add_component(instance=GradientDocumentEmbedder(
-    p.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="document_writer")
-    p.connect("document_embedder", "document_writer")
-    p.run({"document_embedder": {"documents": documents}})
+    from haystack import Pipeline
+    from haystack.document_stores.in_memory import InMemoryDocumentStore
+    from haystack.components.writers import DocumentWriter
+    from haystack import Document
+
+    from haystack_integrations.components.embedders.gradient import GradientDocumentEmbedder
+
+    documents = [
+        Document(content="My name is Jean and I live in Paris."),
+        Document(content="My name is Mark and I live in Berlin."),
+        Document(content="My name is Giorgio and I live in Rome."),
+    ]
+
+    indexing_pipeline = Pipeline()
+    indexing_pipeline.add_component(instance=GradientDocumentEmbedder(), name="document_embedder")
+    indexing_pipeline.add_component(
+        instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="document_writer"
+    )
+    indexing_pipeline.connect("document_embedder", "document_writer")
+    indexing_pipeline.run({"document_embedder": {"documents": documents}})
+    >>> {'document_writer': {'documents_written': 3}}
     ```
     """
@@ -53,7 +69,7 @@ def __init__(
         :param batch_size: Update cycle for tqdm progress bar, default is to update every 32_768 docs.
         :param access_token: The Gradient access token.
         :param workspace_id: The Gradient workspace ID.
-        :param host: The Gradient host. By default it uses https://api.gradient.ai/.
+        :param host: The Gradient host. By default, it uses [Gradient AI](https://api.gradient.ai/).
         :param progress_bar: Whether to show a progress bar while embedding the documents.
         """
         self._batch_size = batch_size
@@ -75,8 +91,12 @@ def _get_telemetry_data(self) -> Dict[str, Any]:
 
     def to_dict(self) -> dict:
         """
-        Serialize the component to a Python dictionary.
+        Serialize this component to a dictionary.
+
+        :returns:
+            The serialized component as a dictionary.
         """
+
         return default_to_dict(
             self,
             model=self._model_name,
@@ -91,13 +111,17 @@ def to_dict(self) -> dict:
     def from_dict(cls, data: Dict[str, Any]) -> "GradientDocumentEmbedder":
         """
         Deserialize this component from a dictionary.
+
+        :param data: The dictionary representation of this component.
+        :returns:
+            The deserialized component instance.
         """
         deserialize_secrets_inplace(data["init_parameters"], keys=["access_token", "workspace_id"])
         return default_from_dict(cls, data)
 
     def warm_up(self) -> None:
         """
-        Load the embedding model.
+        Initializes the component.
         """
         if not hasattr(self, "_embedding_model"):
             self._embedding_model = self._gradient.get_embeddings_model(slug=self._model_name)
@@ -125,9 +149,14 @@ def _generate_embeddings(self, documents: List[Document], batch_size: int) -> Li
     def run(self, documents: List[Document]):
         """
         Embed a list of Documents.
+
         The embedding of each Document is stored in the `embedding` field of the Document.
 
         :param documents: A list of Documents to embed.
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: The embedded Documents.
+
         """
         if not isinstance(documents, list) or documents and any(not isinstance(doc, Document) for doc in documents):
             msg = "GradientDocumentEmbedder expects a list of Documents as input.\
diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py
index 029d5c52f..77b2d6250 100644
--- a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py
+++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py
@@ -8,15 +8,22 @@
 @component
 class GradientTextEmbedder:
     """
-    A component for embedding strings using models hosted on Gradient AI (https://gradient.ai).
+    A component for embedding strings using models hosted on [Gradient AI](https://gradient.ai).
 
+    Usage example:
     ```python
-    embedder = GradientTextEmbedder(model="bge_large")
+    from haystack_integrations.components.embedders.gradient import GradientTextEmbedder
+    from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
+    from haystack.document_stores.in_memory import InMemoryDocumentStore
+    from haystack import Pipeline
+
     p = Pipeline()
-    p.add_component(instance=embedder, name="text_embedder")
-    p.add_component(instance=InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore()), name="retriever")
+    p.add_component("text_embedder", GradientTextEmbedder(model="bge-large"))
+    p.add_component("retriever", InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore()))
     p.connect("text_embedder", "retriever")
-    p.run("embed me!!!")
+    p.run(data={"text_embedder": {"text":"You can embed me but I'll return no matching documents"}})
+    >>> No Documents found with embeddings. Returning empty list. To generate embeddings, use a DocumentEmbedder.
+    >>> {'retriever': {'documents': []}}
     ```
     """
@@ -34,7 +42,7 @@ def __init__(
         :param model: The name of the model to use.
         :param access_token: The Gradient access token.
         :param workspace_id: The Gradient workspace ID.
-        :param host: The Gradient host. By default it uses https://api.gradient.ai/.
+        :param host: The Gradient host. By default, it uses [Gradient AI](https://api.gradient.ai/).
         """
         self._host = host
         self._model_name = model
@@ -53,7 +61,10 @@ def _get_telemetry_data(self) -> Dict[str, Any]:
 
     def to_dict(self) -> dict:
         """
-        Serialize the component to a Python dictionary.
+        Serialize this component to a dictionary.
+
+        :returns:
+            The serialized component as a dictionary.
         """
         return default_to_dict(
             self,
@@ -67,13 +78,17 @@ def to_dict(self) -> dict:
     def from_dict(cls, data: Dict[str, Any]) -> "GradientTextEmbedder":
         """
         Deserialize this component from a dictionary.
+
+        :param data: The dictionary representation of this component.
+        :returns:
+            The deserialized component instance.
         """
         deserialize_secrets_inplace(data["init_parameters"], keys=["access_token", "workspace_id"])
         return default_from_dict(cls, data)
 
     def warm_up(self) -> None:
         """
-        Load the embedding model.
+        Initializes the component.
         """
         if not hasattr(self, "_embedding_model"):
             self._embedding_model = self._gradient.get_embeddings_model(slug=self._model_name)
diff --git a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py
index 9176c3e4b..71b39d309 100644
--- a/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py
+++ b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py
@@ -16,7 +16,10 @@ class GradientGenerator:
     Queries the LLM using Gradient AI's SDK ('gradientai' package).
     See [Gradient AI API](https://docs.gradient.ai/docs/sdk-quickstart) for more details.
+    Usage example:
     ```python
+    from haystack_integrations.components.generators.gradient import GradientGenerator
+
     llm = GradientGenerator(base_model_slug="llama2-7b-chat")
     llm.warm_up()
     print(llm.run(prompt="What is the meaning of life?"))
@@ -40,17 +43,17 @@ def __init__(
         """
         Create a GradientGenerator component.
 
-        :param access_token: The Gradient access token. If not provided it's read from the environment
-            variable GRADIENT_ACCESS_TOKEN.
+        :param access_token: The Gradient access token as a `Secret`. If not provided it's read from the environment
+            variable `GRADIENT_ACCESS_TOKEN`.
         :param base_model_slug: The base model slug to use.
-        :param host: The Gradient host. By default it uses https://api.gradient.ai/.
+        :param host: The Gradient host. By default, it uses [Gradient AI](https://api.gradient.ai/).
         :param max_generated_token_count: The maximum number of tokens to generate.
         :param model_adapter_id: The model adapter ID to use.
         :param temperature: The temperature to use.
         :param top_k: The top k to use.
         :param top_p: The top p to use.
-        :param workspace_id: The Gradient workspace ID. If not provided it's read from the environment
-            variable GRADIENT_WORKSPACE_ID.
+        :param workspace_id: The Gradient workspace ID as a `Secret`. If not provided it's read from the environment
+            variable `GRADIENT_WORKSPACE_ID`.
         """
         self._access_token = access_token
         self._base_model_slug = base_model_slug
@@ -84,6 +87,9 @@ def __init__(
     def to_dict(self) -> Dict[str, Any]:
         """
         Serialize this component to a dictionary.
+
+        :returns:
+            The serialized component as a dictionary.
         """
         return default_to_dict(
             self,
@@ -102,7 +108,12 @@ def to_dict(self) -> Dict[str, Any]:
     def from_dict(cls, data: Dict[str, Any]) -> "GradientGenerator":
        """
        Deserialize this component from a dictionary.
+
+        :param data: The dictionary representation of this component.
+        :returns:
+            The deserialized component instance.
         """
+
         deserialize_secrets_inplace(data["init_parameters"], keys=["access_token", "workspace_id"])
         return default_from_dict(cls, data)