From f9d0e77ae9f0e850d3e3ee5038ecfd0b77f98222 Mon Sep 17 00:00:00 2001 From: jlonge4 <91354480+jlonge4@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:00:20 -0500 Subject: [PATCH] feat: add `JinaReaderConnector` (#1150) * begin rough draft * begin rough draft * begin rough draft * small fixes * Haystack document conversion * git folder changes * add pipeline functions * correct mode map * add reader mode Enum class file * add docstrings * add JINA url for ref * add mode norm for mode map check in run method * add mode norm for mode map check in run method * add json_response and associated parsing * ignore api key lint error * ignore api key lint error * reduce code redundancy * reduce code redundancy * add headers option to run method * Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py Co-authored-by: Stefano Fiorucci * Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py Co-authored-by: Stefano Fiorucci * Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py Co-authored-by: Stefano Fiorucci * Update integrations/jina/src/haystack_integrations/components/reader/jina/reader.py Co-authored-by: Stefano Fiorucci * update location / final edits * Update integrations/jina/src/haystack_integrations/components/converters/jina/reader.py Co-authored-by: Stefano Fiorucci * update paths * add descriptions for json response/headers * lint * unit tests for reader-connector * unit tests for reader-connector * unit tests for reader-connector * fix circular import * update header test * update test * update test * update test * update test * update test * update test * update test * update test * refactoring + more tests * example * pydoc config * examples can contain print --------- Co-authored-by: anitha6g Co-authored-by: Stefano Fiorucci --- .../jina/examples/jina_reader_connector.py | 47 ++++++ integrations/jina/pydoc/config.yml | 1 + integrations/jina/pyproject.toml | 7 +- 
.../components/connectors/jina/__init__.py | 7 + .../components/connectors/jina/reader.py | 141 ++++++++++++++++++ .../components/connectors/jina/reader_mode.py | 40 +++++ .../jina/tests/test_reader_connector.py | 141 ++++++++++++++++++ 7 files changed, 383 insertions(+), 1 deletion(-) create mode 100644 integrations/jina/examples/jina_reader_connector.py create mode 100644 integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py create mode 100644 integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py create mode 100644 integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py create mode 100644 integrations/jina/tests/test_reader_connector.py diff --git a/integrations/jina/examples/jina_reader_connector.py b/integrations/jina/examples/jina_reader_connector.py new file mode 100644 index 000000000..24b6f5db3 --- /dev/null +++ b/integrations/jina/examples/jina_reader_connector.py @@ -0,0 +1,47 @@ +# to make use of the JinaReaderConnector, we first need to install the Haystack integration +# pip install jina-haystack + +# then we must set the JINA_API_KEY environment variable +# export JINA_API_KEY= + + +from haystack_integrations.components.connectors.jina import JinaReaderConnector + +# we can use the JinaReaderConnector to process a URL and return the textual content of the page +reader = JinaReaderConnector(mode="read") +query = "https://example.com" +result = reader.run(query=query) + +print(result) +# {'documents': [Document(id=fa3e51e4ca91828086dca4f359b6e1ea2881e358f83b41b53c84616cb0b2f7cf, +# content: 'This domain is for use in illustrative examples in documents. 
You may use this domain in literature ...', +# meta: {'title': 'Example Domain', 'description': '', 'url': 'https://example.com/', 'usage': {'tokens': 42}})]} + + +# we can perform a web search by setting the mode to "search" +reader = JinaReaderConnector(mode="search") +query = "UEFA Champions League 2024" +result = reader.run(query=query) + +print(result) +# {'documents': [Document(id=6a71abf9955594232037321a476d39a835c0cb7bc575d886ee0087c973c95940, +# content: '2024/25 UEFA Champions League: Matches, draw, final, key dates | UEFA Champions League | UEFA.com...', +# meta: {'title': '2024/25 UEFA Champions League: Matches, draw, final, key dates', +# 'description': 'What are the match dates? Where is the 2025 final? How will the competition work?', +# 'url': 'https://www.uefa.com/uefachampionsleague/news/...', +# 'usage': {'tokens': 5581}}), ...]} + + +# finally, we can perform fact-checking by setting the mode to "ground" (experimental) +reader = JinaReaderConnector(mode="ground") +query = "ChatGPT was launched in 2017" +result = reader.run(query=query) + +print(result) +# {'documents': [Document(id=f0c964dbc1ebb2d6584c8032b657150b9aa6e421f714cc1b9f8093a159127f0c, +# content: 'The statement that ChatGPT was launched in 2017 is incorrect. 
Multiple references confirm that ChatG...', +# meta: {'factuality': 0, 'result': False, 'references': [ +# {'url': 'https://en.wikipedia.org/wiki/ChatGPT', +# 'keyQuote': 'ChatGPT is a generative artificial intelligence (AI) chatbot developed by OpenAI and launched in 2022.', +# 'isSupportive': False}, ...], +# 'usage': {'tokens': 10188}})]} diff --git a/integrations/jina/pydoc/config.yml b/integrations/jina/pydoc/config.yml index 8c7a241f6..2d0ef4f87 100644 --- a/integrations/jina/pydoc/config.yml +++ b/integrations/jina/pydoc/config.yml @@ -6,6 +6,7 @@ loaders: "haystack_integrations.components.embedders.jina.document_embedder", "haystack_integrations.components.embedders.jina.text_embedder", "haystack_integrations.components.rankers.jina.ranker", + "haystack_integrations.components.connectors.jina.reader", ] ignore_when_discovered: ["__init__"] processors: diff --git a/integrations/jina/pyproject.toml b/integrations/jina/pyproject.toml index c89eeacb4..e3af086d0 100644 --- a/integrations/jina/pyproject.toml +++ b/integrations/jina/pyproject.toml @@ -132,18 +132,23 @@ ban-relative-imports = "parents" [tool.ruff.lint.per-file-ignores] # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] +# examples can contain "print" commands +"examples/**/*" = ["T201"] [tool.coverage.run] source = ["haystack_integrations"] branch = true parallel = false - [tool.coverage.report] omit = ["*/tests/*", "*/__init__.py"] show_missing = true exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] +[tool.pytest.ini_options] +minversion = "6.0" +markers = ["unit: unit tests", "integration: integration tests"] + [[tool.mypy.overrides]] module = ["haystack.*", "haystack_integrations.*", "pytest.*"] ignore_missing_imports = true diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py new file 
mode 100644 index 000000000..95368df21 --- /dev/null +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from .reader import JinaReaderConnector +from .reader_mode import JinaReaderMode + +__all__ = ["JinaReaderConnector", "JinaReaderMode"] diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py new file mode 100644 index 000000000..eb53329f7 --- /dev/null +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py @@ -0,0 +1,141 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import json +from typing import Any, Dict, List, Optional, Union +from urllib.parse import quote + +import requests +from haystack import Document, component, default_from_dict, default_to_dict +from haystack.utils import Secret, deserialize_secrets_inplace + +from .reader_mode import JinaReaderMode + +READER_ENDPOINT_URL_BY_MODE = { + JinaReaderMode.READ: "https://r.jina.ai/", + JinaReaderMode.SEARCH: "https://s.jina.ai/", + JinaReaderMode.GROUND: "https://g.jina.ai/", +} + + +@component +class JinaReaderConnector: + """ + A component that interacts with Jina AI's reader service to process queries and return documents. + + This component supports different modes of operation: `read`, `search`, and `ground`. + + Usage example: + ```python + from haystack_integrations.components.connectors.jina import JinaReaderConnector + + reader = JinaReaderConnector(mode="read") + query = "https://example.com" + result = reader.run(query=query) + document = result["documents"][0] + print(document.content) + + >>> "This domain is for use in illustrative examples..." 
+ ``` + """ + + def __init__( + self, + mode: Union[JinaReaderMode, str], + api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # noqa: B008 + json_response: bool = True, + ): + """ + Initialize a JinaReaderConnector instance. + + :param mode: The operation mode for the reader (`read`, `search` or `ground`). + - `read`: process a URL and return the textual content of the page. + - `search`: search the web and return textual content of the most relevant pages. + - `ground`: call the grounding engine to perform fact checking. + For more information on the modes, see the [Jina Reader documentation](https://jina.ai/reader/). + :param api_key: The Jina API key. It can be explicitly provided or automatically read from the + environment variable JINA_API_KEY (recommended). + :param json_response: Controls the response format from the Jina Reader API. + If `True`, requests a JSON response, resulting in Documents with rich structured metadata. + If `False`, requests a raw response, resulting in one Document with minimal metadata. + """ + self.api_key = api_key + self.json_response = json_response + + if isinstance(mode, str): + mode = JinaReaderMode.from_str(mode) + self.mode = mode + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + api_key=self.api_key.to_dict(), + mode=str(self.mode), + json_response=self.json_response, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "JinaReaderConnector": + """ + Deserializes the component from a dictionary. + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + def _json_to_document(self, data: dict) -> Document: + """ + Convert a JSON response/record to a Document, depending on the reader mode. 
+ """ + if self.mode == JinaReaderMode.GROUND: + content = data.pop("reason") + else: + content = data.pop("content") + document = Document(content=content, meta=data) + return document + + @component.output_types(documents=List[Document]) + def run(self, query: str, headers: Optional[Dict[str, str]] = None): + """ + Process the query/URL using the Jina AI reader service. + + :param query: The query string or URL to process. + :param headers: Optional headers to include in the request for customization. Refer to the + [Jina Reader documentation](https://jina.ai/reader/) for more information. + + :returns: + A dictionary with the following keys: + - `documents`: A list of `Document` objects. + """ + headers = dict(headers or {}) # copy, so the API key is never written into the caller's dict + headers["Authorization"] = f"Bearer {self.api_key.resolve_value()}" + + if self.json_response: + headers["Accept"] = "application/json" + + endpoint_url = READER_ENDPOINT_URL_BY_MODE[self.mode] + encoded_target = quote(query, safe="") + url = f"{endpoint_url}{encoded_target}" + + response = requests.get(url, headers=headers, timeout=60) + + # raw response: we just return a single Document with the decoded text (str, not bytes) + if not self.json_response: + meta = {"content_type": response.headers["Content-Type"], "query": query} + return {"documents": [Document(content=response.text, meta=meta)]} + + response_json = json.loads(response.content).get("data", {}) + if self.mode == JinaReaderMode.SEARCH: + documents = [self._json_to_document(record) for record in response_json] + return {"documents": documents} + + return {"documents": [self._json_to_document(response_json)]} diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py new file mode 100644 index 000000000..2ccf7250b --- /dev/null +++ b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader_mode.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: 2023-present deepset 
GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from enum import Enum + + +class JinaReaderMode(Enum): + """ + Enum representing modes for the Jina Reader. + + Modes: + READ: Process a URL and return the textual content of the page. + SEARCH: Search the web and return the textual content of the most relevant pages. + GROUND: Call the grounding engine to perform fact checking. + + """ + + READ = "read" + SEARCH = "search" + GROUND = "ground" + + def __str__(self): + return self.value + + @classmethod + def from_str(cls, string: str) -> "JinaReaderMode": + """ + Create the reader mode from a string. + + :param string: + String to convert. + :returns: + Reader mode. + """ + enum_map = {e.value: e for e in JinaReaderMode} + reader_mode = enum_map.get(string) + if reader_mode is None: + msg = f"Unknown reader mode '{string}'. Supported modes are: {list(enum_map.keys())}" + raise ValueError(msg) + return reader_mode diff --git a/integrations/jina/tests/test_reader_connector.py b/integrations/jina/tests/test_reader_connector.py new file mode 100644 index 000000000..449f73df8 --- /dev/null +++ b/integrations/jina/tests/test_reader_connector.py @@ -0,0 +1,141 @@ +import json +import os +from unittest.mock import patch + +import pytest +from haystack import Document +from haystack.utils import Secret + +from haystack_integrations.components.connectors.jina import JinaReaderConnector, JinaReaderMode + + +class TestJinaReaderConnector: + def test_init_with_custom_parameters(self, monkeypatch): + monkeypatch.setenv("TEST_KEY", "test-api-key") + reader = JinaReaderConnector(mode="read", api_key=Secret.from_env_var("TEST_KEY"), json_response=False) + + assert reader.mode == JinaReaderMode.READ + assert reader.api_key.resolve_value() == "test-api-key" + assert reader.json_response is False + + def test_init_with_invalid_mode(self): + with pytest.raises(ValueError): + JinaReaderConnector(mode="INVALID") + + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("TEST_KEY", 
"test-api-key") + reader = JinaReaderConnector(mode="search", api_key=Secret.from_env_var("TEST_KEY"), json_response=True) + + serialized = reader.to_dict() + + assert serialized["type"] == "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector" + assert "init_parameters" in serialized + + init_params = serialized["init_parameters"] + assert init_params["mode"] == "search" + assert init_params["json_response"] is True + assert "api_key" in init_params + assert init_params["api_key"]["type"] == "env_var" + + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("JINA_API_KEY", "test-api-key") + component_dict = { + "type": "haystack_integrations.components.connectors.jina.reader.JinaReaderConnector", + "init_parameters": { + "api_key": {"type": "env_var", "env_vars": ["JINA_API_KEY"], "strict": True}, + "mode": "read", + "json_response": True, + }, + } + + reader = JinaReaderConnector.from_dict(component_dict) + + assert isinstance(reader, JinaReaderConnector) + assert reader.mode == JinaReaderMode.READ + assert reader.json_response is True + assert reader.api_key.resolve_value() == "test-api-key" + + def test_json_to_document_read_mode(self, monkeypatch): + monkeypatch.setenv("TEST_KEY", "test-api-key") + reader = JinaReaderConnector(mode="read") + + data = {"content": "Mocked content", "title": "Mocked Title", "url": "https://example.com"} + document = reader._json_to_document(data) + + assert isinstance(document, Document) + assert document.content == "Mocked content" + assert document.meta["title"] == "Mocked Title" + assert document.meta["url"] == "https://example.com" + + def test_json_to_document_ground_mode(self, monkeypatch): + monkeypatch.setenv("TEST_KEY", "test-api-key") + reader = JinaReaderConnector(mode="ground") + + data = { + "factuality": 0, + "result": False, + "reason": "The statement is contradicted by...", + "references": [{"url": "https://example.com", "keyQuote": "Mocked key quote", "isSupportive": False}], + } + + 
document = reader._json_to_document(data) + assert isinstance(document, Document) + assert document.content == "The statement is contradicted by..." + assert document.meta["factuality"] == 0 + assert document.meta["result"] is False + assert document.meta["references"] == [ + {"url": "https://example.com", "keyQuote": "Mocked key quote", "isSupportive": False} + ] + + @patch("requests.get") + def test_run_with_mocked_response(self, mock_get, monkeypatch): + monkeypatch.setenv("JINA_API_KEY", "test-api-key") + mock_json_response = { + "data": {"content": "Mocked content", "title": "Mocked Title", "url": "https://example.com"} + } + mock_get.return_value.content = json.dumps(mock_json_response).encode("utf-8") + mock_get.return_value.headers = {"Content-Type": "application/json"} + + reader = JinaReaderConnector(mode="read") + result = reader.run(query="https://example.com") + + assert mock_get.call_count == 1 + assert mock_get.call_args[0][0] == "https://r.jina.ai/https%3A%2F%2Fexample.com" + assert mock_get.call_args[1]["headers"] == { + "Authorization": "Bearer test-api-key", + "Accept": "application/json", + } + + assert len(result) == 1 + document = result["documents"][0] + assert isinstance(document, Document) + assert document.content == "Mocked content" + assert document.meta["title"] == "Mocked Title" + assert document.meta["url"] == "https://example.com" + + @pytest.mark.skipif(not os.environ.get("JINA_API_KEY", None), reason="JINA_API_KEY env var not set") + @pytest.mark.integration + def test_run_reader_mode(self): + reader = JinaReaderConnector(mode="read") + result = reader.run(query="https://example.com") + + assert len(result) == 1 + document = result["documents"][0] + assert isinstance(document, Document) + assert "This domain is for use in illustrative examples" in document.content + assert document.meta["title"] == "Example Domain" + assert document.meta["url"] == "https://example.com/" + + @pytest.mark.skipif(not os.environ.get("JINA_API_KEY", 
None), reason="JINA_API_KEY env var not set") + @pytest.mark.integration + def test_run_search_mode(self): + reader = JinaReaderConnector(mode="search") + result = reader.run(query="When was Jina AI founded?") + + assert len(result) >= 1 + for doc in result["documents"]: + assert isinstance(doc, Document) + assert doc.content + assert "title" in doc.meta + assert "url" in doc.meta + assert "description" in doc.meta