Merge branch 'main' into rm-pytorch-from-instructor-deps
anakin87 authored Aug 30, 2024
2 parents ad19916 + a902ae5 commit 3d9c11b
Showing 33 changed files with 1,593 additions and 255 deletions.
26 changes: 26 additions & 0 deletions .github/utils/pyproject_to_requirements.py
@@ -0,0 +1,26 @@
import argparse
import sys
from pathlib import Path
import toml

def main(pyproject_path: Path, exclude_optional_dependencies: bool = False):
    content = toml.load(pyproject_path)
    deps = set(content["project"]["dependencies"])

    if not exclude_optional_dependencies:
        optional_deps = content["project"].get("optional-dependencies", {})
        for dep_list in optional_deps.values():
            deps.update(dep_list)

    print("\n".join(sorted(deps)))

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="pyproject_to_requirements.py",
        description="Convert pyproject.toml to requirements.txt"
    )
    parser.add_argument("pyproject_path", type=Path, help="Path to pyproject.toml file")
    parser.add_argument("--exclude-optional-dependencies", action="store_true", help="Exclude optional dependencies")

    args = parser.parse_args()
    main(args.pyproject_path, args.exclude_optional_dependencies)
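For illustration, a small usage sketch for the script above (not part of the commit). It assumes the `toml` package is installed and that the commands run from the repository root; the pyproject contents and file name are invented:

import subprocess
from pathlib import Path

# Hypothetical pyproject.toml with one core and one optional dependency.
Path("demo_pyproject.toml").write_text(
    "[project]\n"
    'name = "demo"\n'
    'version = "0.0.1"\n'
    'dependencies = ["haystack-ai>=2.0.0"]\n'
    "\n"
    "[project.optional-dependencies]\n"
    'dev = ["pytest"]\n'
)

# Default behavior: core plus optional dependencies, deduplicated and sorted.
result = subprocess.run(
    ["python", ".github/utils/pyproject_to_requirements.py", "demo_pyproject.toml"],
    capture_output=True, text=True, check=True,
)
print(result.stdout)  # haystack-ai>=2.0.0 and pytest, one per line

# With the flag, optional dependencies are dropped.
result = subprocess.run(
    ["python", ".github/utils/pyproject_to_requirements.py", "demo_pyproject.toml",
     "--exclude-optional-dependencies"],
    capture_output=True, text=True, check=True,
)
print(result.stdout)  # haystack-ai>=2.0.0 only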
95 changes: 95 additions & 0 deletions .github/workflows/CI_license_compliance.yml
@@ -0,0 +1,95 @@
name: Core / License Compliance

on:
  pull_request:
    paths:
      - "integrations/**/pyproject.toml"
  # Since we test PRs, there is no need to run the workflow at each
  # merge on `main`. Let's use a cron job instead.
  schedule:
    - cron: "0 0 * * *" # every day at midnight

env:
  CORE_DATADOG_API_KEY: ${{ secrets.CORE_DATADOG_API_KEY }}
  PYTHON_VERSION: "3.10"
  EXCLUDE_PACKAGES: "(?i)^(deepeval|cohere|fastembed|ragas|tqdm|psycopg).*"

# Exclusions must be explicitly motivated
#
# - deepeval is Apache 2.0 but the license is not available on PyPI
# - cohere is MIT but the license is not available on PyPI
# - fastembed is Apache 2.0 but the license on PyPI is unclear ("Other/Proprietary License (Apache License)")
# - ragas is Apache 2.0 but the license is not available on PyPI
#
# - tqdm is MPL but there are no better alternatives
# - psycopg is LGPL-3.0 but FOSSA is fine with it

jobs:
  license_check_direct:
    name: Direct dependencies only
    env:
      REQUIREMENTS_FILE: requirements_direct.txt
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the code
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "${{ env.PYTHON_VERSION }}"

      - name: Get changed pyproject files (for pull requests only)
        if: ${{ github.event_name == 'pull_request' }}
        id: changed-files
        uses: tj-actions/changed-files@v45
        with:
          files: |
            integrations/**/pyproject.toml
      - name: Get direct dependencies from pyproject.toml files
        run: |
          pip install toml
          # Determine the list of pyproject.toml files to process
          if [ "${{ github.event_name }}" = "schedule" ]; then
            echo "Scheduled run: processing all pyproject.toml files..."
            FILES=$(find integrations -type f -name 'pyproject.toml')
          else
            echo "Pull request: processing changed pyproject.toml files..."
            FILES="${{ steps.changed-files.outputs.all_changed_files }}"
          fi
          for file in $FILES; do
            python .github/utils/pyproject_to_requirements.py $file >> ${{ env.REQUIREMENTS_FILE }}
            echo "" >> ${{ env.REQUIREMENTS_FILE }}
          done
      - name: Check Licenses
        id: license_check_report
        uses: pilosus/action-pip-license-checker@v2
        with:
          github-token: ${{ secrets.GH_ACCESS_TOKEN }}
          requirements: ${{ env.REQUIREMENTS_FILE }}
          fail: "Copyleft,Other,Error"
          exclude: "${{ env.EXCLUDE_PACKAGES }}"

      # We keep the license inventory on FOSSA
      - name: Send license report to Fossa
        uses: fossas/[email protected]
        continue-on-error: true # not critical
        with:
          api-key: ${{ secrets.FOSSA_LICENSE_SCAN_TOKEN }}

      - name: Print report
        if: ${{ always() }}
        run: echo "${{ steps.license_check_report.outputs.report }}"

      - name: Send event to Datadog for nightly failures
        if: failure() && github.event_name == 'schedule'
        uses: ./.github/actions/send_failure
        with:
          title: |
            Core integrations license compliance nightly failure: ${{ github.workflow }}
          api-key: ${{ secrets.CORE_DATADOG_API_KEY }}

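A note on the EXCLUDE_PACKAGES pattern above: the (?i) flag makes it case-insensitive, and the trailing .* lets it cover any distribution whose name starts with one of the listed prefixes. A quick sketch of the pattern's behavior in plain Python (not part of the commit; the sample package names are illustrative):

import re

# The same pattern the workflow passes to the license checker's `exclude` input.
exclude = re.compile(r"(?i)^(deepeval|cohere|fastembed|ragas|tqdm|psycopg).*")

for name in ["deepeval", "Cohere", "psycopg2-binary", "haystack-ai"]:
    verdict = "excluded" if exclude.match(name) else "checked"
    print(f"{name}: {verdict}")
# deepeval: excluded
# Cohere: excluded (case-insensitive)
# psycopg2-binary: excluded (prefix match via the trailing .*)
# haystack-ai: checked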
2 changes: 2 additions & 0 deletions README.md
@@ -22,6 +22,8 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta

## Inventory

[![License Compliance](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/CI_license_compliance.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/CI_license_compliance.yml)

| Package | Type | PyPI Package | Status |
|----------------------------------------------------------------------------------------------------------------|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [amazon-bedrock-haystack](integrations/amazon_bedrock/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-bedrock-haystack.svg)](https://pypi.org/project/amazon-bedrock-haystack) | [![Test / amazon_bedrock](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml) |
13 changes: 13 additions & 0 deletions integrations/google_ai/CHANGELOG.md
@@ -2,11 +2,24 @@

## [unreleased]

### 🐛 Bug Fixes

- Remove the use of deprecated gemini models (#1032)

### 🧪 Testing

- Do not retry tests in `hatch run test` command (#954)

### ⚙️ Miscellaneous Tasks

- Retry tests to reduce flakiness (#836)
- Update ruff invocation to include check parameter (#853)

### Docs

- Update GeminiGenerator docstrings (#964)
- Update GoogleChatGenerator docstrings (#962)

## [integrations/google_ai-v1.1.0] - 2024-06-05

### 🐛 Bug Fixes
integrations/google_ai/src/haystack_integrations/components/generators/google_ai/chat/gemini.py

@@ -1,16 +1,16 @@
 import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import google.generativeai as genai
 from google.ai.generativelanguage import Content, Part
 from google.ai.generativelanguage import Tool as ToolProto
 from google.generativeai import GenerationConfig, GenerativeModel
-from google.generativeai.types import HarmBlockThreshold, HarmCategory, Tool
+from google.generativeai.types import GenerateContentResponse, HarmBlockThreshold, HarmCategory, Tool
 from haystack.core.component import component
 from haystack.core.serialization import default_from_dict, default_to_dict
-from haystack.dataclasses.byte_stream import ByteStream
+from haystack.dataclasses import ByteStream, StreamingChunk
 from haystack.dataclasses.chat_message import ChatMessage, ChatRole
-from haystack.utils import Secret, deserialize_secrets_inplace
+from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
 
 logger = logging.getLogger(__name__)
 
@@ -21,10 +21,7 @@ class GoogleAIGeminiChatGenerator:
     Completes chats using multimodal Gemini models through Google AI Studio.
 
     It uses the [`ChatMessage`](https://docs.haystack.deepset.ai/docs/data-classes#chatmessage)
-    dataclass to interact with the model. You can use the following models:
-    - gemini-pro
-    - gemini-ultra
-    - gemini-pro-vision
+    dataclass to interact with the model.
 
     ### Usage example
 
@@ -103,27 +100,20 @@ def __init__(
         self,
         *,
         api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"),  # noqa: B008
-        model: str = "gemini-pro-vision",
+        model: str = "gemini-1.5-flash",
         generation_config: Optional[Union[GenerationConfig, Dict[str, Any]]] = None,
         safety_settings: Optional[Dict[HarmCategory, HarmBlockThreshold]] = None,
         tools: Optional[List[Tool]] = None,
+        streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
     ):
         """
         Initializes a `GoogleAIGeminiChatGenerator` instance.
 
         To get an API key, visit: https://makersuite.google.com
 
-        It supports the following models:
-        * `gemini-pro`
-        * `gemini-pro-vision`
-        * `gemini-ultra`
-
        :param api_key: Google AI Studio API key. To get a key,
            see [Google AI Studio](https://makersuite.google.com).
-        :param model: Name of the model to use. Supported models are:
-            - gemini-pro
-            - gemini-ultra
-            - gemini-pro-vision
+        :param model: Name of the model to use. For available models, see https://ai.google.dev/gemini-api/docs/models/gemini.
        :param generation_config: The generation configuration to use.
            This can either be a `GenerationConfig` object or a dictionary of parameters.
            For available parameters, see
@@ -132,6 +122,8 @@ def __init__(
             A dictionary with `HarmCategory` as keys and `HarmBlockThreshold` as values.
             For more information, see [the API reference](https://ai.google.dev/api)
         :param tools: A list of Tool objects that can be used for [Function calling](https://ai.google.dev/docs/function_calling).
+        :param streaming_callback: A callback function that is called when a new token is received from the stream.
+            The callback function accepts StreamingChunk as an argument.
         """
 
         genai.configure(api_key=api_key.resolve_value())
@@ -142,6 +134,7 @@ def __init__(
         self._safety_settings = safety_settings
         self._tools = tools
         self._model = GenerativeModel(self._model_name, tools=self._tools)
+        self._streaming_callback = streaming_callback
 
     def _generation_config_to_dict(self, config: Union[GenerationConfig, Dict[str, Any]]) -> Dict[str, Any]:
         if isinstance(config, dict):
@@ -162,13 +155,16 @@ def to_dict(self) -> Dict[str, Any]:
         :returns:
             Dictionary with serialized data.
         """
+        callback_name = serialize_callable(self._streaming_callback) if self._streaming_callback else None
+
         data = default_to_dict(
             self,
             api_key=self._api_key.to_dict(),
             model=self._model_name,
             generation_config=self._generation_config,
             safety_settings=self._safety_settings,
             tools=self._tools,
+            streaming_callback=callback_name,
         )
         if (tools := data["init_parameters"].get("tools")) is not None:
             data["init_parameters"]["tools"] = []
@@ -213,6 +209,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "GoogleAIGeminiChatGenerator":
             data["init_parameters"]["safety_settings"] = {
                 HarmCategory(k): HarmBlockThreshold(v) for k, v in safety_settings.items()
             }
+        if (serialized_callback_handler := data["init_parameters"].get("streaming_callback")) is not None:
+            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
         return default_from_dict(cls, data)
 
     def _convert_part(self, part: Union[str, ByteStream, Part]) -> Part:
@@ -274,16 +272,23 @@ def _message_to_content(self, message: ChatMessage) -> Content:
         return Content(parts=[part], role=role)
 
     @component.output_types(replies=List[ChatMessage])
-    def run(self, messages: List[ChatMessage]):
+    def run(
+        self,
+        messages: List[ChatMessage],
+        streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
+    ):
         """
         Generates text based on the provided messages.
 
         :param messages:
             A list of `ChatMessage` instances, representing the input messages.
+        :param streaming_callback:
+            A callback function that is called when a new token is received from the stream.
         :returns:
             A dictionary containing the following key:
             - `replies`: A list containing the generated responses as `ChatMessage` instances.
         """
+        streaming_callback = streaming_callback or self._streaming_callback
         history = [self._message_to_content(m) for m in messages[:-1]]
         session = self._model.start_chat(history=history)
 
@@ -292,10 +297,22 @@ def run(self, messages: List[ChatMessage]):
             content=new_message,
             generation_config=self._generation_config,
             safety_settings=self._safety_settings,
+            stream=streaming_callback is not None,
         )
 
+        replies = self._get_stream_response(res, streaming_callback) if streaming_callback else self._get_response(res)
+
+        return {"replies": replies}
+
+    def _get_response(self, response_body: GenerateContentResponse) -> List[ChatMessage]:
+        """
+        Extracts the responses from the Google AI response.
+
+        :param response_body: The response from Google AI request.
+        :returns: The extracted responses.
+        """
         replies = []
-        for candidate in res.candidates:
+        for candidate in response_body.candidates:
             for part in candidate.content.parts:
                 if part.text != "":
                     replies.append(ChatMessage.from_system(part.text))
@@ -307,5 +324,23 @@ def run(self, messages: List[ChatMessage]):
                             name=part.function_call.name,
                         )
                     )
+        return replies
 
-        return {"replies": replies}
+    def _get_stream_response(
+        self, stream: GenerateContentResponse, streaming_callback: Callable[[StreamingChunk], None]
+    ) -> List[ChatMessage]:
+        """
+        Extracts the responses from the Google AI streaming response.
+
+        :param stream: The streaming response from the Google AI request.
+        :param streaming_callback: The handler for the streaming response.
+        :returns: The extracted response with the content of all streaming chunks.
+        """
+        responses = []
+        for chunk in stream:
+            content = chunk.text if len(chunk.parts) > 0 and "text" in chunk.parts[0] else ""
+            streaming_callback(StreamingChunk(content=content, meta=chunk.to_dict()))
+            responses.append(content)
+
+        combined_response = "".join(responses).lstrip()
+        return [ChatMessage.from_system(content=combined_response)]
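To put the new streaming_callback parameter in context, a minimal usage sketch (not part of the commit). It assumes the google-ai-haystack integration is installed and GOOGLE_API_KEY is set in the environment:

from haystack.dataclasses import ChatMessage, StreamingChunk

from haystack_integrations.components.generators.google_ai import GoogleAIGeminiChatGenerator


def print_chunk(chunk: StreamingChunk) -> None:
    # Called once per streamed chunk; prints the partial text as it arrives.
    print(chunk.content, end="", flush=True)


generator = GoogleAIGeminiChatGenerator(
    model="gemini-1.5-flash",
    streaming_callback=print_chunk,  # can also be passed per call to run()
)
result = generator.run(messages=[ChatMessage.from_user("What is Haystack?")])
print()
print(result["replies"][0].content)

Passing the callback either at construction time or to run() enables stream=True on the underlying Gemini request, and the reply is then assembled from the streamed chunks by _get_stream_response.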
