Commit
Merge branch 'tuana/mistral' of https://github.com/deepset-ai/haystack-core-integrations into tuana/mistral
TuanaCelik committed Feb 15, 2024
2 parents 08845fa + c21180f commit d59c233
Showing 78 changed files with 2,975 additions and 400 deletions.
5 changes: 5 additions & 0 deletions .github/labeler.yml
@@ -69,6 +69,11 @@ integration:mistral:
- any-glob-to-any-file: "integrations/mistral/**/*"
- any-glob-to-any-file: ".github/workflows/mistral.yml"

+integration:mongodb-atlas:
+- changed-files:
+- any-glob-to-any-file: "integrations/mongodb_atlas/**/*"
+- any-glob-to-any-file: ".github/workflows/mongodb_atlas.yml"

integration:ollama:
- changed-files:
- any-glob-to-any-file: "integrations/ollama/**/*"
58 changes: 58 additions & 0 deletions .github/workflows/mongodb_atlas.yml
@@ -0,0 +1,58 @@
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / mongodb_atlas

on:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- "integrations/mongodb_atlas/**"
- ".github/workflows/mongodb_atlas.yml"

defaults:
run:
working-directory: integrations/mongodb_atlas

concurrency:
group: mongodb-atlas-${{ github.head_ref }}
cancel-in-progress: true

env:
PYTHONUNBUFFERED: "1"
FORCE_COLOR: "1"
MONGO_CONNECTION_STRING: ${{ secrets.MONGO_CONNECTION_STRING }}

jobs:
run:
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ['3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install Hatch
run: pip install --upgrade hatch

- name: Lint
working-directory: integrations/mongodb_atlas
if: matrix.python-version == '3.9' && runner.os == 'Linux'
run: hatch run lint:all

- name: Generate docs
if: matrix.python-version == '3.9' && runner.os == 'Linux'
run: hatch run docs

- name: Run tests
working-directory: integrations/mongodb_atlas
run: hatch run cov
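
The workflow's one secret is `MONGO_CONNECTION_STRING`, exported as an environment variable so the tests can reach a live Atlas cluster. A hedged sketch of the pattern the integration tests presumably rely on (the `pymongo` client below is an assumption for illustration, not code from this commit):

```python
import os

from pymongo import MongoClient  # assumption: the document store builds on pymongo

# Mirrors the workflow's env block: local runs need
#   export MONGO_CONNECTION_STRING="mongodb+srv://..."
# before running `hatch run cov` inside integrations/mongodb_atlas.
client = MongoClient(os.environ["MONGO_CONNECTION_STRING"])
client.admin.command("ping")  # fails fast if the cluster is unreachable
```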
1 change: 1 addition & 0 deletions README.md
@@ -75,6 +75,7 @@ deepset-haystack
| [instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) |
| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / jina](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) |
| [llama-cpp-haystack](integrations/llama_cpp/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/llama-cpp-haystack) | [![Test / llama-cpp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml) |
+| [mongodb-atlas-haystack](integrations/mongodb_atlas/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/mongodb-atlas-haystack.svg?color=orange)](https://pypi.org/project/mongodb-atlas-haystack) | [![Test / mongodb-atlas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mongodb_atlas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mongodb_atlas.yml) |
| [ollama-haystack](integrations/ollama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) |
| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) |
| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) |
4 changes: 2 additions & 2 deletions integrations/astra/examples/example.py
@@ -47,7 +47,7 @@
p.add_component(instance=DocumentCleaner(), name="cleaner")
p.add_component(instance=DocumentSplitter(split_by="word", split_length=150, split_overlap=30), name="splitter")
p.add_component(
-    instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+    instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
p.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer")
@@ -63,7 +63,7 @@
# Create a querying pipeline on the indexed data
q = Pipeline()
q.add_component(
-    instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+    instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
q.add_component("retriever", AstraEmbeddingRetriever(document_store))
4 changes: 2 additions & 2 deletions integrations/astra/examples/pipeline_example.py
@@ -62,7 +62,7 @@
]
p = Pipeline()
p.add_component(
-    instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+    instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
p.add_component(instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="writer")
@@ -74,7 +74,7 @@
# Construct rag pipeline
rag_pipeline = Pipeline()
rag_pipeline.add_component(
-    instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+    instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
rag_pipeline.add_component(instance=AstraEmbeddingRetriever(document_store=document_store), name="retriever")
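
Both astra example edits track the same upstream rename: Haystack's sentence-transformers embedders take `model` where they previously took `model_name_or_path`. A minimal standalone sketch of the new keyword (assuming `haystack-ai` and `sentence-transformers` are installed; nothing else changes at the call site):

```python
from haystack.components.embedders import SentenceTransformersTextEmbedder

# `model` replaces the old `model_name_or_path` keyword; behavior is otherwise identical.
embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
embedder.warm_up()
embedding = embedder.run(text="What's the capital of France?")["embedding"]
print(len(embedding))  # all-MiniLM-L6-v2 yields 384-dimensional vectors
```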
5 changes: 1 addition & 4 deletions integrations/cohere/tests/test_cohere_chat_generator.py
@@ -274,10 +274,7 @@ def test_live_run(self):
@pytest.mark.integration
def test_live_run_wrong_model(self, chat_messages):
component = CohereChatGenerator(model="something-obviously-wrong")
-        with pytest.raises(
-            cohere.CohereAPIError,
-            match="model not found, make sure the correct model ID was used and that you have access to the model.",
-        ):
+        with pytest.raises(cohere.CohereAPIError):
component.run(chat_messages)

@pytest.mark.skipif(
5 changes: 1 addition & 4 deletions integrations/cohere/tests/test_cohere_generators.py
@@ -149,10 +149,7 @@ def test_cohere_generator_run_wrong_model(self):
import cohere

component = CohereGenerator(model="something-obviously-wrong")
-        with pytest.raises(
-            cohere.CohereAPIError,
-            match="model not found, make sure the correct model ID was used and that you have access to the model.",
-        ):
+        with pytest.raises(cohere.CohereAPIError):
component.run(prompt="What's the capital of France?")

@pytest.mark.skipif(
4 changes: 2 additions & 2 deletions integrations/cohere/tests/test_document_embedder.py
@@ -136,9 +136,9 @@ def test_run(self):
def test_run_wrong_input_format(self):
embedder = CohereDocumentEmbedder(api_key=Secret.from_token("test-api-key"))

with pytest.raises(TypeError, match="CohereDocumentEmbedder expects a list of Documents as input"):
with pytest.raises(TypeError):
embedder.run(documents="text")
with pytest.raises(TypeError, match="CohereDocumentEmbedder expects a list of Documents as input"):
with pytest.raises(TypeError):
embedder.run(documents=[1, 2, 3])

assert embedder.run(documents=[]) == {"documents": [], "meta": {}}
2 changes: 1 addition & 1 deletion integrations/cohere/tests/test_text_embedder.py
@@ -106,7 +106,7 @@ def test_run_wrong_input_format(self):
embedder = CohereTextEmbedder(api_key=Secret.from_token("test-api-key"))
list_integers_input = ["text_snippet_1", "text_snippet_2"]

with pytest.raises(TypeError, match="CohereTextEmbedder expects a string as input"):
with pytest.raises(TypeError):
embedder.run(text=list_integers_input)

@pytest.mark.skipif(
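
All four cohere test edits above follow one pattern: keep asserting the exception type, drop the `match=` regex, so the tests stop breaking whenever the library rewords an error message. A toy illustration of the trade-off (not code from this commit):

```python
import pytest

def call_api():
    # Stand-in for a client call whose error wording may change between releases.
    raise TypeError("CohereDocumentEmbedder expects a list of Documents as input")

# Brittle: couples the test to the exact message text.
with pytest.raises(TypeError, match="expects a list of Documents"):
    call_api()

# Robust: the form this commit switches to; only the exception type is checked.
with pytest.raises(TypeError):
    call_api()
```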
33 changes: 33 additions & 0 deletions integrations/fastembed/example/example.py
@@ -0,0 +1,33 @@
from haystack import Document, Pipeline
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder, FastembedTextEmbedder

document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

documents = [
Document(content="My name is Wolfgang and I live in Berlin"),
Document(content="I saw a black horse running"),
Document(content="Germany has many big cities"),
Document(content="fastembed is supported by and maintained by Qdrant."),
]

document_embedder = FastembedDocumentEmbedder()
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)["documents"]
document_store.write_documents(documents_with_embeddings)

query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", FastembedTextEmbedder())
query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

query = "Who supports fastembed?"

result = query_pipeline.run({"text_embedder": {"text": query}})

print(result["retriever"]["documents"][0]) # noqa: T201

# Document(id=...,
# content: 'fastembed is supported by and maintained by Qdrant.',
# score: 0.758..)
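
One asymmetry in the example is deliberate: the document embedder is warmed up explicitly because it runs outside a pipeline, while `Pipeline.run()` warms up the text embedder itself. A hedged sketch of using the text embedder standalone (assuming it follows the same warm-up contract as the document embedder):

```python
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder

text_embedder = FastembedTextEmbedder()
text_embedder.warm_up()  # required outside a Pipeline
embedding = text_embedder.run(text="Who supports fastembed?")["embedding"]
```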
3 changes: 3 additions & 0 deletions integrations/google_ai/pyproject.toml
@@ -152,6 +152,7 @@ ban-relative-imports = "parents"
"tests/**/*" = ["PLR2004", "S101", "TID252"]

[tool.coverage.run]
+source = ["haystack_integrations"]
branch = true
parallel = true

@@ -160,6 +161,8 @@ google_ai_haystack = ["src"]
tests = ["tests"]

[tool.coverage.report]
+omit = ["*/tests/*", "*/__init__.py"]
+show_missing=true
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
@@ -9,6 +9,7 @@
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.dataclasses.byte_stream import ByteStream
from haystack.dataclasses.chat_message import ChatMessage, ChatRole
+from haystack.utils import Secret, deserialize_secrets_inplace

logger = logging.getLogger(__name__)

@@ -20,11 +21,12 @@ class GoogleAIGeminiChatGenerator:
Sample usage:
```python
+from haystack.utils import Secret
from haystack.dataclasses.chat_message import ChatMessage
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiChatGenerator
-gemini_chat = GoogleAIGeminiChatGenerator(model="gemini-pro", api_key="<MY_API_KEY>")
+gemini_chat = GoogleAIGeminiChatGenerator(model="gemini-pro", api_key=Secret.from_token("<MY_API_KEY>"))
messages = [ChatMessage.from_user("What is the most interesting thing you know?")]
res = gemini_chat.run(messages=messages)
@@ -40,6 +42,7 @@ class GoogleAIGeminiChatGenerator:
This is a more advanced usage that also uses function calls:
```python
+from haystack.utils import Secret
from haystack.dataclasses.chat_message import ChatMessage
from google.ai.generativelanguage import FunctionDeclaration, Tool
@@ -73,7 +76,8 @@ def get_current_weather(location: str, unit: str = "celsius") -> str:
messages = [ChatMessage.from_user("What is the most interesting thing you know?")]
-gemini_chat = GoogleAIGeminiChatGenerator(model="gemini-pro", api_key="<MY_API_KEY>", tools=[tool])
+gemini_chat = GoogleAIGeminiChatGenerator(model="gemini-pro", api_key=Secret.from_token("<MY_API_KEY>"),
+                                          tools=[tool])
messages = [ChatMessage.from_user(content = "What is the temperature in celsius in Berlin?")]
res = gemini_chat.run(messages=messages)
@@ -95,15 +99,14 @@ def get_current_weather(location: str, unit: str = "celsius") -> str:
def __init__(
self,
*,
-        api_key: Optional[str] = None,
+        api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"),  # noqa: B008
model: str = "gemini-pro-vision",
generation_config: Optional[Union[GenerationConfig, Dict[str, Any]]] = None,
safety_settings: Optional[Dict[HarmCategory, HarmBlockThreshold]] = None,
tools: Optional[List[Tool]] = None,
):
"""
Initialize a GoogleAIGeminiChatGenerator instance.
-        If `api_key` is `None` it will use the `GOOGLE_API_KEY` env variable for authentication.
To get an API key, visit: https://makersuite.google.com
@@ -112,7 +115,7 @@ def __init__(
* `gemini-pro-vision`
* `gemini-ultra`
-        :param api_key: Google Makersuite API key, defaults to None
+        :param api_key: Google Makersuite API key.
:param model: Name of the model to use, defaults to "gemini-pro-vision"
:param generation_config: The generation config to use, defaults to None.
Can either be a GenerationConfig object or a dictionary of parameters.
@@ -130,8 +133,9 @@ def __init__(
"""

-        # Authenticate, if api_key is None it will use the GOOGLE_API_KEY env variable
-        genai.configure(api_key=api_key)
+        genai.configure(api_key=api_key.resolve_value())

+        self._api_key = api_key
self._model_name = model
self._generation_config = generation_config
self._safety_settings = safety_settings
@@ -153,6 +157,7 @@ def _generation_config_to_dict(self, config: Union[GenerationConfig, Dict[str, Any]]
def to_dict(self) -> Dict[str, Any]:
data = default_to_dict(
self,
+            api_key=self._api_key.to_dict(),
model=self._model_name,
generation_config=self._generation_config,
safety_settings=self._safety_settings,
@@ -168,6 +173,8 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "GoogleAIGeminiChatGenerator":
+        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])

if (tools := data["init_parameters"].get("tools")) is not None:
data["init_parameters"]["tools"] = [Tool.deserialize(t) for t in tools]
if (generation_config := data["init_parameters"].get("generation_config")) is not None:
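
The substance of the google_ai changes is the move from raw string keys to Haystack's `Secret` type: the key is resolved once at construction and serialized by reference rather than by value. A sketch of the round trip this enables, assuming `GOOGLE_API_KEY` is set (env-var secrets serialize; `Secret.from_token` ones deliberately do not):

```python
import os

from haystack_integrations.components.generators.google_ai import GoogleAIGeminiChatGenerator

os.environ["GOOGLE_API_KEY"] = "dummy-key-for-illustration"

# api_key defaults to Secret.from_env_var("GOOGLE_API_KEY")
gemini = GoogleAIGeminiChatGenerator(model="gemini-pro")

data = gemini.to_dict()
# The dict records where the key lives, never the key itself, e.g.
# data["init_parameters"]["api_key"] == {"type": "env_var", "env_vars": ["GOOGLE_API_KEY"], "strict": True}

restored = GoogleAIGeminiChatGenerator.from_dict(data)  # deserialize_secrets_inplace rebuilds the Secret
```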
@@ -9,6 +9,7 @@
from haystack.core.component.types import Variadic
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.dataclasses.byte_stream import ByteStream
+from haystack.utils import Secret, deserialize_secrets_inplace

logger = logging.getLogger(__name__)

@@ -20,9 +21,10 @@ class GoogleAIGeminiGenerator:
Sample usage:
```python
+from haystack.utils import Secret
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
gemini = GoogleAIGeminiGenerator(model="gemini-pro", api_key="<MY_API_KEY>")
gemini = GoogleAIGeminiGenerator(model="gemini-pro", api_key=Secret.from_token("<MY_API_KEY>"))
res = gemini.run(parts = ["What is the most interesting thing you know?"])
for answer in res["answers"]:
print(answer)
@@ -31,6 +33,7 @@ class GoogleAIGeminiGenerator:
This is a more advanced usage that also uses text and images as input:
```python
import requests
+from haystack.utils import Secret
from haystack.dataclasses.byte_stream import ByteStream
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
@@ -50,7 +53,7 @@ class GoogleAIGeminiGenerator:
for url in URLS
]
gemini = GoogleAIGeminiGenerator(model="gemini-pro-vision", api_key="<MY_API_KEY>")
gemini = GoogleAIGeminiGenerator(model="gemini-pro-vision", api_key=Secret.from_token("<MY_API_KEY>"))
result = gemini.run(parts = ["What can you tell me about these robots?", *images])
for answer in result["answers"]:
print(answer)
Expand All @@ -66,15 +69,14 @@ class GoogleAIGeminiGenerator:
def __init__(
self,
*,
-        api_key: Optional[str] = None,
+        api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"),  # noqa: B008
model: str = "gemini-pro-vision",
generation_config: Optional[Union[GenerationConfig, Dict[str, Any]]] = None,
safety_settings: Optional[Dict[HarmCategory, HarmBlockThreshold]] = None,
tools: Optional[List[Tool]] = None,
):
"""
Initialize a GoogleAIGeminiGenerator instance.
-        If `api_key` is `None` it will use the `GOOGLE_API_KEY` env variable for authentication.
To get an API key, visit: https://makersuite.google.com
@@ -83,7 +85,7 @@ def __init__(
* `gemini-pro-vision`
* `gemini-ultra`
-        :param api_key: Google Makersuite API key, defaults to None
+        :param api_key: Google Makersuite API key.
:param model: Name of the model to use, defaults to "gemini-pro-vision"
:param generation_config: The generation config to use, defaults to None.
Can either be a GenerationConfig object or a dictionary of parameters.
@@ -99,9 +101,9 @@ def __init__(
:param tools: The tools to use, defaults to None.
A list of Tool objects that can be used to modify the generation process.
"""
-        # Authenticate, if api_key is None it will use the GOOGLE_API_KEY env variable
-        genai.configure(api_key=api_key)
+        genai.configure(api_key=api_key.resolve_value())

+        self._api_key = api_key
self._model_name = model
self._generation_config = generation_config
self._safety_settings = safety_settings
@@ -123,6 +125,7 @@ def _generation_config_to_dict(self, config: Union[GenerationConfig, Dict[str, Any]]
def to_dict(self) -> Dict[str, Any]:
data = default_to_dict(
self,
+            api_key=self._api_key.to_dict(),
model=self._model_name,
generation_config=self._generation_config,
safety_settings=self._safety_settings,
@@ -138,6 +141,8 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "GoogleAIGeminiGenerator":
+        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])

if (tools := data["init_parameters"].get("tools")) is not None:
data["init_parameters"]["tools"] = [Tool.deserialize(t) for t in tools]
if (generation_config := data["init_parameters"].get("generation_config")) is not None:
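
For completeness, the `resolve_value()` call both generators now make is what preserves the old env-var fallback. A minimal sketch of the `Secret` semantics assumed throughout this diff (based on `haystack.utils.Secret`, not code in this repository):

```python
from haystack.utils import Secret

api_key = Secret.from_env_var("GOOGLE_API_KEY")
# Reads GOOGLE_API_KEY at call time; in the default strict mode this raises if
# the variable is unset, instead of passing None through to genai.configure().
key_string = api_key.resolve_value()

token = Secret.from_token("<MY_API_KEY>")
token.resolve_value()   # returns the token directly
# token.to_dict() would raise: token-backed secrets are not serializable by design.
```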