docs: review and normalize haystack.components.websearch (#7236)

* docs: review and normalize `haystack.components.websearch`
* fix: use correct type annotations
* refactor: use type from protocol

Co-authored-by: Silvano Cerza <[email protected]>

* Revert "refactor: use type from protocol"

This reverts commit 23d6f45.

* docs: refactor according to comments
* build: correctly pin to 4.7

---------

Co-authored-by: Silvano Cerza <[email protected]>
deepset-ai · Feb 28, 2024 · f22d499 · f22d499
1 parent 20ebb46
commit f22d499
Show file tree

Hide file tree

Showing 6 changed files with 80 additions and 32 deletions.
diff --git a/docs/pydoc/config/websearch_api.yml b/docs/pydoc/config/websearch_api.yml
@@ -1,7 +1,7 @@
 loaders:
   - type: haystack_pydoc_tools.loaders.CustomPythonLoader
     search_path: [../../../haystack/components/websearch]
-    modules: ["serper_dev"]
+    modules: ["serper_dev", "searchapi"]
     ignore_when_discovered: ["__init__"]
 processors:
   - type: filter

diff --git a/haystack/components/websearch/searchapi.py b/haystack/components/websearch/searchapi.py
@@ -1,6 +1,6 @@
 import json
 import logging
-from typing import Dict, List, Optional, Any
+from typing import Dict, List, Optional, Any, Union
 
 import requests
 
@@ -20,9 +20,21 @@ class SearchApiError(ComponentError):
 @component
 class SearchApiWebSearch:
     """
-    Search engine using SearchApi API. Given a query, it returns a list of URLs that are the most relevant.
+    Uses [SearchApi](https://www.searchapi.io/) to search the web for relevant documents.
 
     See the [SearchApi website](https://www.searchapi.io/) for more details.
+
+    Usage example:
+    ```python
+    from haystack.components.websearch import SearchApiWebSearch
+    from haystack.utils import Secret
+
+    websearch = SearchApiWebSearch(top_k=10, api_key=Secret.from_token("test-api-key"))
+    results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")
+
+    assert results["documents"]
+    assert results["links"]
+    ```
     """
 
     def __init__(
@@ -37,8 +49,8 @@ def __init__(
         :param top_k: Number of documents to return.
         :param allowed_domains: List of domains to limit the search to.
         :param search_params: Additional parameters passed to the SearchApi API.
-        For example, you can set 'num' to 100 to increase the number of search results.
-        See the [SearchApi website](https://www.searchapi.io/) for more details.
+            For example, you can set 'num' to 100 to increase the number of search results.
+            See the [SearchApi website](https://www.searchapi.io/) for more details.
         """
 
         self.api_key = api_key
@@ -51,7 +63,10 @@ def __init__(
 
     def to_dict(self) -> Dict[str, Any]:
         """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
         """
         return default_to_dict(
             self,
@@ -64,17 +79,27 @@ def to_dict(self) -> Dict[str, Any]:
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "SearchApiWebSearch":
         """
-        Deserialize this component from a dictionary.
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
         """
         deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
         return default_from_dict(cls, data)
 
-    @component.output_types(documents=List[Document], links=List[str])
-    def run(self, query: str):
+    @component.output_types(documents=List[Document], links=Union[List[Document], List[str]])
+    def run(self, query: str) -> Dict[str, Union[List[Document], List[str]]]:
         """
-        Search the SearchApi API for the given query and return the results as a list of Documents and a list of links.
-
-        :param query: Query string.
+        Uses [SearchApi](https://www.searchapi.io/) to search the web.
+
+        :param query: Search query.
+        :returns: A dictionary with the following keys:
+            - "documents": List of documents returned by the search engine.
+            - "links": List of links returned by the search engine.
+        :raises TimeoutError: If the request to the SearchApi API times out.
+        :raises SearchApiError: If an error occurs while querying the SearchApi API.
         """
         query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""
 
@@ -84,8 +109,8 @@ def run(self, query: str):
         try:
             response = requests.get(SEARCHAPI_BASE_URL, headers=headers, params=payload, timeout=90)
             response.raise_for_status()  # Will raise an HTTPError for bad responses
-        except requests.Timeout:
-            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.")
+        except requests.Timeout as error:
+            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error
 
         except requests.RequestException as e:
             raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

diff --git a/haystack/components/websearch/serper_dev.py b/haystack/components/websearch/serper_dev.py
@@ -1,6 +1,6 @@
 import json
 import logging
-from typing import Dict, List, Optional, Any
+from typing import Dict, List, Optional, Any, Union
 
 import requests
 
@@ -20,9 +20,21 @@ class SerperDevError(ComponentError):
 @component
 class SerperDevWebSearch:
     """
-    Search engine using SerperDev API. Given a query, it returns a list of URLs that are the most relevant.
+    Uses [Serper](https://serper.dev/) to search the web for relevant documents.
 
     See the [Serper Dev website](https://serper.dev/) for more details.
+
+    Usage example:
+    ```python
+    from haystack.components.websearch import SerperDevWebSearch
+    from haystack.utils import Secret
+
+    websearch = SerperDevWebSearch(top_k=10, api_key=Secret.from_token("test-api-key"))
+    results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")
+
+    assert results["documents"]
+    assert results["links"]
+    ```
     """
 
     def __init__(
@@ -33,12 +45,12 @@ def __init__(
         search_params: Optional[Dict[str, Any]] = None,
     ):
         """
-        :param api_key: API key for the SerperDev API.
+        :param api_key: API key for the Serper API.
         :param top_k: Number of documents to return.
         :param allowed_domains: List of domains to limit the search to.
-        :param search_params: Additional parameters passed to the SerperDev API.
-        For example, you can set 'num' to 20 to increase the number of search results.
-        See the [Serper Dev website](https://serper.dev/) for more details.
+        :param search_params: Additional parameters passed to the Serper API.
+            For example, you can set 'num' to 20 to increase the number of search results.
+            See the [Serper website](https://serper.dev/) for more details.
         """
         self.api_key = api_key
         self.top_k = top_k
@@ -50,7 +62,10 @@ def __init__(
 
     def to_dict(self) -> Dict[str, Any]:
         """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
         """
         return default_to_dict(
             self,
@@ -63,17 +78,25 @@ def to_dict(self) -> Dict[str, Any]:
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "SerperDevWebSearch":
         """
-        Deserialize this component from a dictionary.
+        Deserializes the component from a dictionary.
+
+        :param data: The dictionary to deserialize from.
+        :returns: The deserialized component.
         """
         deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
         return default_from_dict(cls, data)
 
-    @component.output_types(documents=List[Document], links=List[str])
-    def run(self, query: str):
+    @component.output_types(documents=List[Document], links=Union[List[Document], List[str]])
+    def run(self, query: str) -> Dict[str, Union[List[Document], List[str]]]:
         """
-        Search the SerperDev API for the given query and return the results as a list of Documents and a list of links.
-
-        :param query: Query string.
+        Uses [Serper](https://serper.dev/) to search the web.
+
+        :param query: Search query.
+        :returns: A dictionary with the following keys:
+            - "documents": List of documents returned by the search engine.
+            - "links": List of links returned by the search engine.
+        :raises SerperDevError: If an error occurs while querying the Serper API.
+        :raises TimeoutError: If the request to the Serper API times out.
         """
         query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""
 
@@ -85,8 +108,8 @@ def run(self, query: str):
         try:
             response = requests.post(SERPERDEV_BASE_URL, headers=headers, data=payload, timeout=30)  # type: ignore
             response.raise_for_status()  # Will raise an HTTPError for bad responses
-        except requests.Timeout:
-            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.")
+        except requests.Timeout as error:
+            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error
 
         except requests.RequestException as e:
             raise SerperDevError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

diff --git a/pyproject.toml b/pyproject.toml
@@ -57,7 +57,7 @@ dependencies = [
   "pyyaml",
   "more-itertools",  # TextDocumentSplitter
   "networkx", # Pipeline graphs
-  "typing_extensions>=3.7", # typing support for Python 3.8
+  "typing_extensions>=4.7", # typing support for Python 3.8
   "boilerpy3", # Fulltext extraction from HTML pages
 ]
 

diff --git a/releasenotes/notes/pin-typing-extensions-c4026f59603445b7.yaml b/releasenotes/notes/pin-typing-extensions-c4026f59603445b7.yaml
@@ -1,5 +1,5 @@
 ---
 fixes:
   - |
-    Pin the `typing-extensions` package to versions >= 3.7 to avoid
+    Pin the `typing-extensions` package to versions >= 4.7 to avoid
     [incompatibilities with the `openai` package](https://community.openai.com/t/error-while-importing-openai-from-open-import-openai/578166/26).
diff --git a/test/components/websearch/test_serperdev.py b/test/components/websearch/test_serperdev.py
@@ -174,7 +174,7 @@ def test_web_search(self):
         ws = SerperDevWebSearch(top_k=10)
         results = ws.run(query="Who is the boyfriend of Olivia Wilde?")
         documents = results["documents"]
-        links = results["documents"]
+        links = results["links"]
         assert len(documents) == len(links) == 10
         assert all(isinstance(doc, Document) for doc in results)
         assert all(isinstance(link, str) for link in links)