From f9d0e77ae9f0e850d3e3ee5038ecfd0b77f98222 Mon Sep 17 00:00:00 2001
From: jlonge4 <91354480+jlonge4@users.noreply.github.com>
Date: Thu, 21 Nov 2024 12:00:20 -0500
Subject: [PATCH] feat: add `JinaReaderConnector` (#1150)

* begin rough draft

* begin rough draft

* begin rough draft

* small fixes

* Haystack document conversion

* git folder changes

* add pipeline functions

* correct mode map

* add reader mode Enum class file

* add docstrings

* add JINA url for ref

* add mode norm for mode map check in run method

* add mode norm for mode map check in run method

* add json_response and associated parsing

* ignore api key lint error

* ignore api key lint error

* reduce code redundancy

* reduce code redundancy

* add headers option to run method

* Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* update location / final edits

* Update integrations/jina/src/haystack_integrations/components/converters/jina/reader.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* update paths

* add descriptions for json response/headers

* lint

* unit tests for reader-connector

* unit tests for reader-connector

* unit tests for reader-connector

* fix circular import

* update header test

* update test

* update test

* update test

* update test

* update test

* update test

* update test

* update test

* refactoring + more tests

* example

* pydoc config

* examples can contain print

---------

Co-authored-by: anitha6g <anitha6g@gmail.com>
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
---
 .../jina/examples/jina_reader_connector.py    |  47 ++++++
 integrations/jina/pydoc/config.yml            |   1 +
 integrations/jina/pyproject.toml              |   7 +-
 .../components/connectors/jina/__init__.py    |   7 +
 .../components/connectors/jina/reader.py      | 141 ++++++++++++++++++
 .../components/connectors/jina/reader_mode.py |  40 +++++
 .../jina/tests/test_reader_connector.py       | 141 ++++++++++++++++++
 7 files changed, 383 insertions(+), 1 deletion(-)
 create mode 100644 integrations/jina/examples/jina_reader_connector.py
 create mode 100644 integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py
 create mode 100644 integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py
 create mode 100644 integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py
 create mode 100644 integrations/jina/tests/test_reader_connector.py
diff --git a/integrations/jina/examples/jina_reader_connector.py b/integrations/jina/examples/jina_reader_connector.py
new file mode 100644
index 000000000..24b6f5db3
--- /dev/null
+++ b/integrations/jina/examples/jina_reader_connector.py
@@ -0,0 +1,47 @@
+# to make use of the JinaReaderConnector, we first need to install the Haystack integration
+# pip install jina-haystack
+
+# then we must set the JINA_API_KEY environment variable
+# export JINA_API_KEY=<your-api-key>
+
+
+from haystack_integrations.components.connectors.jina import JinaReaderConnector
+
+# we can use the JinaReaderConnector to process a URL and return the textual content of the page
+reader = JinaReaderConnector(mode="read")
+query = "https://example.com"
+result = reader.run(query=query)
+
+print(result)
+# {'documents': [Document(id=fa3e51e4ca91828086dca4f359b6e1ea2881e358f83b41b53c84616cb0b2f7cf,
+# content: 'This domain is for use in illustrative examples in documents. You may use this domain in literature ...',
+# meta: {'title': 'Example Domain', 'description': '', 'url': 'https://example.com/', 'usage': {'tokens': 42}})]}
+
+
+# we can perform a web search by setting the mode to "search"
+reader = JinaReaderConnector(mode="search")
+query = "UEFA Champions League 2024"
+result = reader.run(query=query)
+
+print(result)
+# {'documents': [Document(id=6a71abf9955594232037321a476d39a835c0cb7bc575d886ee0087c973c95940,
+# content: '2024/25 UEFA Champions League: Matches, draw, final, key dates | UEFA Champions League | UEFA.com...',
+# meta: {'title': '2024/25 UEFA Champions League: Matches, draw, final, key dates',
+# 'description': 'What are the match dates? Where is the 2025 final? How will the competition work?',
+# 'url': 'https://www.uefa.com/uefachampionsleague/news/...',
+# 'usage': {'tokens': 5581}}), ...]}
+
+
+# finally, we can perform fact-checking by setting the mode to "ground" (experimental)
+reader = JinaReaderConnector(mode="ground")
+query = "ChatGPT was launched in 2017"
+result = reader.run(query=query)
+
+print(result)
+# {'documents': [Document(id=f0c964dbc1ebb2d6584c8032b657150b9aa6e421f714cc1b9f8093a159127f0c,
+# content: 'The statement that ChatGPT was launched in 2017 is incorrect. Multiple references confirm that ChatG...',
+# meta: {'factuality': 0, 'result': False, 'references': [
+# {'url': 'https://en.wikipedia.org/wiki/ChatGPT',
+# 'keyQuote': 'ChatGPT is a generative artificial intelligence (AI) chatbot developed by OpenAI and launched in 2022.',
+# 'isSupportive': False}, ...],
+# 'usage': {'tokens': 10188}})]}
diff --git a/integrations/jina/pydoc/config.yml b/integrations/jina/pydoc/config.yml
index 8c7a241f6..2d0ef4f87 100644
--- a/integrations/jina/pydoc/config.yml
+++ b/integrations/jina/pydoc/config.yml
@@ -6,6 +6,7 @@ loaders:
         "haystack_integrations.components.embedders.jina.document_embedder",
         "haystack_integrations.components.embedders.jina.text_embedder",
         "haystack_integrations.components.rankers.jina.ranker",
+        "haystack_integrations.components.connectors.jina.reader",
       ]
     ignore_when_discovered: ["__init__"]
 processors:
diff --git a/integrations/jina/pyproject.toml b/integrations/jina/pyproject.toml
index c89eeacb4..e3af086d0 100644
--- a/integrations/jina/pyproject.toml
+++ b/integrations/jina/pyproject.toml
@@ -132,18 +132,23 @@ ban-relative-imports = "parents"
 [tool.ruff.lint.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
 "tests/**/*" = ["PLR2004", "S101", "TID252"]
+# examples can contain "print" commands
+"examples/**/*" = ["T201"]
 
 [tool.coverage.run]
 source = ["haystack_integrations"]
 branch = true
 parallel = false
 
-
 [tool.coverage.report]
 omit = ["*/tests/*", "*/__init__.py"]
 show_missing = true
 exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
 
+[tool.pytest.ini_options]
+minversion = "6.0"
+markers = ["unit: unit tests", "integration: integration tests"]
+
 [[tool.mypy.overrides]]
 module = ["haystack.*", "haystack_integrations.*", "pytest.*"]
 ignore_missing_imports = true
diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py
new file mode 100644
index 000000000..95368df21
--- /dev/null
+++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+from .reader import JinaReaderConnector
+from .reader_mode import JinaReaderMode
+
+__all__ = ["JinaReaderConnector", "JinaReaderMode"]
diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py
new file mode 100644
index 000000000..eb53329f7
--- /dev/null
+++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py
@@ -0,0 +1,141 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+from typing import Any, Dict, List, Optional, Union
+from urllib.parse import quote
+
+import requests
+from haystack import Document, component, default_from_dict, default_to_dict
+from haystack.utils import Secret, deserialize_secrets_inplace
+
+from .reader_mode import JinaReaderMode
+
+READER_ENDPOINT_URL_BY_MODE = {
+    JinaReaderMode.READ: "https://r.jina.ai/",
+    JinaReaderMode.SEARCH: "https://s.jina.ai/",
+    JinaReaderMode.GROUND: "https://g.jina.ai/",
+}
+
+
+@component
+class JinaReaderConnector:
+    """
+    A component that interacts with Jina AI's reader service to process queries and return documents.
+
+    This component supports different modes of operation: `read`, `search`, and `ground`.
+
+    Usage example:
+    ```python
+    from haystack_integrations.components.connectors.jina import JinaReaderConnector
+
+    reader = JinaReaderConnector(mode="read")
+    query = "https://example.com"
+    result = reader.run(query=query)
+    document = result["documents"][0]
+    print(document.content)
+
+    >>> "This domain is for use in illustrative examples..."
+    ```
+    """
+
+    def __init__(
+        self,
+        mode: Union[JinaReaderMode, str],
+        api_key: Secret = Secret.from_env_var("JINA_API_KEY"),  # noqa: B008
+        json_response: bool = True,
+    ):
+        """
+        Initialize a JinaReaderConnector instance.
+
+        :param mode: The operation mode for the reader (`read`, `search` or `ground`).
+            - `read`: process a URL and return the textual content of the page.
+            - `search`: search the web and return textual content of the most relevant pages.
+            - `ground`: call the grounding engine to perform fact checking.
+            For more information on the modes, see the [Jina Reader documentation](https://jina.ai/reader/).
+        :param api_key: The Jina API key. It can be explicitly provided or automatically read from the
+            environment variable JINA_API_KEY (recommended).
+        :param json_response: Controls the response format from the Jina Reader API.
+            If `True`, requests a JSON response, resulting in Documents with rich structured metadata.
+            If `False`, requests a raw response, resulting in one Document with minimal metadata.
+        """
+        self.api_key = api_key
+        self.json_response = json_response
+
+        if isinstance(mode, str):
+            mode = JinaReaderMode.from_str(mode)
+        self.mode = mode
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            api_key=self.api_key.to_dict(),
+            mode=str(self.mode),
+            json_response=self.json_response,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "JinaReaderConnector":
+        """
+        Deserializes the component from a dictionary.
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
+        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
+        return default_from_dict(cls, data)
+
+    def _json_to_document(self, data: dict) -> Document:
+        """
+        Convert a JSON response/record to a Document, depending on the reader mode.
+        """
+        # pop the content from a shallow copy, so that the caller's dict is left untouched
+        meta = dict(data)
+        content_key = "reason" if self.mode == JinaReaderMode.GROUND else "content"
+        content = meta.pop(content_key)
+        return Document(content=content, meta=meta)
+
+    @component.output_types(documents=List[Document])
+    def run(self, query: str, headers: Optional[Dict[str, str]] = None):
+        """
+        Process the query/URL using the Jina AI reader service.
+
+        :param query: The query string or URL to process.
+        :param headers: Optional headers to include in the request for customization. Refer to the
+            [Jina Reader documentation](https://jina.ai/reader/) for more information.
+
+        :returns:
+            A dictionary with the following keys:
+                - `documents`: A list of `Document` objects.
+        """
+        # copy the headers, so that the caller's dict never receives the Authorization header
+        headers = dict(headers or {})
+        headers["Authorization"] = f"Bearer {self.api_key.resolve_value()}"
+
+        if self.json_response:
+            headers["Accept"] = "application/json"
+
+        endpoint_url = READER_ENDPOINT_URL_BY_MODE[self.mode]
+        encoded_target = quote(query, safe="")
+        url = f"{endpoint_url}{encoded_target}"
+
+        response = requests.get(url, headers=headers, timeout=60)
+
+        # raw response: we just return a single Document with the decoded text (not the raw bytes)
+        if not self.json_response:
+            meta = {"content_type": response.headers["Content-Type"], "query": query}
+            return {"documents": [Document(content=response.text, meta=meta)]}
+
+        response_json = json.loads(response.content).get("data", {})
+        if self.mode == JinaReaderMode.SEARCH:
+            documents = [self._json_to_document(record) for record in response_json]
+            return {"documents": documents}
+
+        return {"documents": [self._json_to_document(response_json)]}
diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py
new file mode 100644
index 000000000..2ccf7250b
--- /dev/null
+++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py
@@ -0,0 +1,40 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+from enum import Enum
+
+
+class JinaReaderMode(Enum):
+    """
+    Operating modes supported by the Jina Reader.
+
+    Members:
+        READ: Fetch a URL and return the textual content of the page.
+        SEARCH: Run a web search and return the textual content of the most relevant pages.
+        GROUND: Ask the grounding engine to fact-check a statement.
+    """
+
+    READ = "read"
+    SEARCH = "search"
+    GROUND = "ground"
+
+    def __str__(self):
+        """Return the plain string value of the mode."""
+        return self.value
+
+    @classmethod
+    def from_str(cls, string: str) -> "JinaReaderMode":
+        """
+        Look up the reader mode whose value equals the given string.
+
+        :param string:
+            Value of the wanted mode; the match is exact.
+        :returns:
+            The matching reader mode.
+        :raises ValueError: If no mode has that value.
+        """
+        modes_by_value = {member.value: member for member in JinaReaderMode}
+        if string in modes_by_value:
+            return modes_by_value[string]
+        msg = f"Unknown reader mode '{string}'. Supported modes are: {list(modes_by_value)}"
+        raise ValueError(msg)
diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py
new file mode 100644
index 000000000..449f73df8
--- /dev/null
+++ b/integrations/jina/tests/test_reader_connector.py
@@ -0,0 +1,141 @@
+import json
+import os
+from unittest.mock import patch
+
+import pytest
+from haystack import Document
+from haystack.utils import Secret
+
+from haystack_integrations.components.connectors.jina import JinaReaderConnector, JinaReaderMode
+
+
+class TestJinaReaderConnector:
+    """Unit tests (mocked HTTP) and integration tests (real API key needed) for JinaReaderConnector."""
+    def test_init_with_custom_parameters(self, monkeypatch):
+        monkeypatch.setenv("TEST_KEY", "test-api-key")
+        reader = JinaReaderConnector(mode="read", api_key=Secret.from_env_var("TEST_KEY"), json_response=False)
+
+        assert reader.mode == JinaReaderMode.READ
+        assert reader.api_key.resolve_value() == "test-api-key"
+        assert reader.json_response is False
+
+    def test_init_with_invalid_mode(self):
+        with pytest.raises(ValueError):
+            JinaReaderConnector(mode="INVALID")
+
+    def test_to_dict(self, monkeypatch):
+        monkeypatch.setenv("TEST_KEY", "test-api-key")
+        reader = JinaReaderConnector(mode="search", api_key=Secret.from_env_var("TEST_KEY"), json_response=True)
+
+        serialized = reader.to_dict()
+
+        assert serialized["type"] == "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector"
+        assert "init_parameters" in serialized
+
+        init_params = serialized["init_parameters"]
+        assert init_params["mode"] == "search"
+        assert init_params["json_response"] is True
+        assert "api_key" in init_params
+        assert init_params["api_key"]["type"] == "env_var"
+
+    def test_from_dict(self, monkeypatch):
+        monkeypatch.setenv("JINA_API_KEY", "test-api-key")
+        component_dict = {
+            "type": "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector",
+            "init_parameters": {
+                "api_key": {"type": "env_var", "env_vars": ["JINA_API_KEY"], "strict": True},
+                "mode": "read",
+                "json_response": True,
+            },
+        }
+
+        reader = JinaReaderConnector.from_dict(component_dict)
+
+        assert isinstance(reader, JinaReaderConnector)
+        assert reader.mode == JinaReaderMode.READ
+        assert reader.json_response is True
+        assert reader.api_key.resolve_value() == "test-api-key"
+
+    def test_json_to_document_read_mode(self, monkeypatch):
+        monkeypatch.setenv("JINA_API_KEY", "test-api-key")  # env var read by the default api_key
+        reader = JinaReaderConnector(mode="read")
+
+        data = {"content": "Mocked content", "title": "Mocked Title", "url": "https://example.com"}
+        document = reader._json_to_document(data)
+
+        assert isinstance(document, Document)
+        assert document.content == "Mocked content"
+        assert document.meta["title"] == "Mocked Title"
+        assert document.meta["url"] == "https://example.com"
+
+    def test_json_to_document_ground_mode(self, monkeypatch):
+        monkeypatch.setenv("JINA_API_KEY", "test-api-key")  # env var read by the default api_key
+        reader = JinaReaderConnector(mode="ground")
+
+        data = {
+            "factuality": 0,
+            "result": False,
+            "reason": "The statement is contradicted by...",
+            "references": [{"url": "https://example.com", "keyQuote": "Mocked key quote", "isSupportive": False}],
+        }
+        document = reader._json_to_document(data)
+        assert isinstance(document, Document)
+        assert document.content == "The statement is contradicted by..."
+        assert document.meta["factuality"] == 0
+        assert document.meta["result"] is False
+        assert document.meta["references"] == [
+            {"url": "https://example.com", "keyQuote": "Mocked key quote", "isSupportive": False}
+        ]
+
+    @patch("requests.get")
+    def test_run_with_mocked_response(self, mock_get, monkeypatch):
+        monkeypatch.setenv("JINA_API_KEY", "test-api-key")
+        mock_json_response = {
+            "data": {"content": "Mocked content", "title": "Mocked Title", "url": "https://example.com"}
+        }
+        mock_get.return_value.content = json.dumps(mock_json_response).encode("utf-8")
+        mock_get.return_value.headers = {"Content-Type": "application/json"}
+
+        reader = JinaReaderConnector(mode="read")
+        result = reader.run(query="https://example.com")
+
+        assert mock_get.call_count == 1
+        assert mock_get.call_args[0][0] == "https://r.jina.ai/https%3A%2F%2Fexample.com"
+        assert mock_get.call_args[1]["headers"] == {
+            "Authorization": "Bearer test-api-key",
+            "Accept": "application/json",
+        }
+
+        assert len(result["documents"]) == 1  # count the Documents, not the keys of the output dict
+        document = result["documents"][0]
+        assert isinstance(document, Document)
+        assert document.content == "Mocked content"
+        assert document.meta["title"] == "Mocked Title"
+        assert document.meta["url"] == "https://example.com"
+
+    @pytest.mark.skipif(not os.environ.get("JINA_API_KEY", None), reason="JINA_API_KEY env var not set")
+    @pytest.mark.integration
+    def test_run_reader_mode(self):
+        reader = JinaReaderConnector(mode="read")
+        result = reader.run(query="https://example.com")
+
+        assert len(result["documents"]) == 1  # count the Documents, not the keys of the output dict
+        document = result["documents"][0]
+        assert isinstance(document, Document)
+        assert "This domain is for use in illustrative examples" in document.content
+        assert document.meta["title"] == "Example Domain"
+        assert document.meta["url"] == "https://example.com/"
+
+    @pytest.mark.skipif(not os.environ.get("JINA_API_KEY", None), reason="JINA_API_KEY env var not set")
+    @pytest.mark.integration
+    def test_run_search_mode(self):
+        reader = JinaReaderConnector(mode="search")
+        result = reader.run(query="When was Jina AI founded?")
+
+        assert len(result["documents"]) >= 1  # len(result) is always 1 (the single "documents" key)
+        for doc in result["documents"]:
+            assert isinstance(doc, Document)
+            assert doc.content
+            assert "title" in doc.meta
+            assert "url" in doc.meta
+            assert "description" in doc.meta