Skip to content

Commit

Permalink
docs: review and normalize haystack.components.websearch (#7236)
Browse files Browse the repository at this point in the history
* docs: review and normalize `haystack.components.websearch`

* fix: use correct type annotations

* refactor: use type from protocol

Co-authored-by: Silvano Cerza <[email protected]>

* Revert "refactor: use type from protocol"

This reverts commit 23d6f45.

* docs: refactor according to comments

* build: correctly pin to 4.7

---------

Co-authored-by: Silvano Cerza <[email protected]>
  • Loading branch information
wochinge and silvanocerza authored Feb 28, 2024
1 parent 20ebb46 commit f22d499
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 32 deletions.
2 changes: 1 addition & 1 deletion docs/pydoc/config/websearch_api.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/websearch]
modules: ["serper_dev"]
modules: ["serper_dev", "searchapi"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
Expand Down
51 changes: 38 additions & 13 deletions haystack/components/websearch/searchapi.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import logging
from typing import Dict, List, Optional, Any
from typing import Dict, List, Optional, Any, Union

import requests

Expand All @@ -20,9 +20,21 @@ class SearchApiError(ComponentError):
@component
class SearchApiWebSearch:
"""
Search engine using SearchApi API. Given a query, it returns a list of URLs that are the most relevant.
Uses [SearchApi](https://www.searchapi.io/) to search the web for relevant documents.
See the [SearchApi website](https://www.searchapi.io/) for more details.
Usage example:
```python
from haystack.components.websearch import SearchApiWebSearch
from haystack.utils import Secret
websearch = SearchApiWebSearch(top_k=10, api_key=Secret.from_token("test-api-key"))
results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")
assert results["documents"]
assert results["links"]
```
"""

def __init__(
Expand All @@ -37,8 +49,8 @@ def __init__(
:param top_k: Number of documents to return.
:param allowed_domains: List of domains to limit the search to.
:param search_params: Additional parameters passed to the SearchApi API.
For example, you can set 'num' to 100 to increase the number of search results.
See the [SearchApi website](https://www.searchapi.io/) for more details.
For example, you can set 'num' to 100 to increase the number of search results.
See the [SearchApi website](https://www.searchapi.io/) for more details.
"""

self.api_key = api_key
Expand All @@ -51,7 +63,10 @@ def __init__(

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
Expand All @@ -64,17 +79,27 @@ def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SearchApiWebSearch":
"""
Deserialize this component from a dictionary.
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

@component.output_types(documents=List[Document], links=List[str])
def run(self, query: str):
@component.output_types(documents=List[Document], links=Union[List[Document], List[str]])
def run(self, query: str) -> Dict[str, Union[List[Document], List[str]]]:
"""
Search the SearchApi API for the given query and return the results as a list of Documents and a list of links.
:param query: Query string.
Uses [SearchApi](https://www.searchapi.io/) to search the web.
:param query: Search query.
:returns: A dictionary with the following keys:
- "documents": List of documents returned by the search engine.
- "links": List of links returned by the search engine.
:raises TimeoutError: If the request to the SearchApi API times out.
:raises SearchApiError: If an error occurs while querying the SearchApi API.
"""
query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""

Expand All @@ -84,8 +109,8 @@ def run(self, query: str):
try:
response = requests.get(SEARCHAPI_BASE_URL, headers=headers, params=payload, timeout=90)
response.raise_for_status() # Will raise an HTTPError for bad responses
except requests.Timeout:
raise TimeoutError(f"Request to {self.__class__.__name__} timed out.")
except requests.Timeout as error:
raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error

except requests.RequestException as e:
raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e
Expand Down
53 changes: 38 additions & 15 deletions haystack/components/websearch/serper_dev.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import logging
from typing import Dict, List, Optional, Any
from typing import Dict, List, Optional, Any, Union

import requests

Expand All @@ -20,9 +20,21 @@ class SerperDevError(ComponentError):
@component
class SerperDevWebSearch:
"""
Search engine using SerperDev API. Given a query, it returns a list of URLs that are the most relevant.
Uses [Serper](https://serper.dev/) to search the web for relevant documents.
See the [Serper website](https://serper.dev/) for more details.
Usage example:
```python
from haystack.components.websearch import SerperDevWebSearch
from haystack.utils import Secret
websearch = SerperDevWebSearch(top_k=10, api_key=Secret.from_token("test-api-key"))
results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")
assert results["documents"]
assert results["links"]
```
"""

def __init__(
Expand All @@ -33,12 +45,12 @@ def __init__(
search_params: Optional[Dict[str, Any]] = None,
):
"""
:param api_key: API key for the SerperDev API.
:param api_key: API key for the Serper API.
:param top_k: Number of documents to return.
:param allowed_domains: List of domains to limit the search to.
:param search_params: Additional parameters passed to the SerperDev API.
For example, you can set 'num' to 20 to increase the number of search results.
See the [Serper Dev website](https://serper.dev/) for more details.
:param search_params: Additional parameters passed to the Serper API.
For example, you can set 'num' to 20 to increase the number of search results.
See the [Serper website](https://serper.dev/) for more details.
"""
self.api_key = api_key
self.top_k = top_k
Expand All @@ -50,7 +62,10 @@ def __init__(

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
Expand All @@ -63,17 +78,25 @@ def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SerperDevWebSearch":
"""
Deserialize this component from a dictionary.
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

@component.output_types(documents=List[Document], links=List[str])
def run(self, query: str):
@component.output_types(documents=List[Document], links=Union[List[Document], List[str]])
def run(self, query: str) -> Dict[str, Union[List[Document], List[str]]]:
"""
Search the SerperDev API for the given query and return the results as a list of Documents and a list of links.
:param query: Query string.
Uses [Serper](https://serper.dev/) to search the web.
:param query: Search query.
:returns: A dictionary with the following keys:
- "documents": List of documents returned by the search engine.
- "links": List of links returned by the search engine.
:raises SerperDevError: If an error occurs while querying the Serper API.
:raises TimeoutError: If the request to the Serper API times out.
"""
query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""

Expand All @@ -85,8 +108,8 @@ def run(self, query: str):
try:
response = requests.post(SERPERDEV_BASE_URL, headers=headers, data=payload, timeout=30) # type: ignore
response.raise_for_status() # Will raise an HTTPError for bad responses
except requests.Timeout:
raise TimeoutError(f"Request to {self.__class__.__name__} timed out.")
except requests.Timeout as error:
raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error

except requests.RequestException as e:
raise SerperDevError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ dependencies = [
"pyyaml",
"more-itertools", # TextDocumentSplitter
"networkx", # Pipeline graphs
"typing_extensions>=3.7", # typing support for Python 3.8
"typing_extensions>=4.7", # typing support for Python 3.8
"boilerpy3", # Fulltext extraction from HTML pages
]

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
fixes:
- |
Pin the `typing-extensions` package to versions >= 3.7 to avoid
Pin the `typing-extensions` package to versions >= 4.7 to avoid
[incompatibilities with the `openai` package](https://community.openai.com/t/error-while-importing-openai-from-open-import-openai/578166/26).
2 changes: 1 addition & 1 deletion test/components/websearch/test_serperdev.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def test_web_search(self):
ws = SerperDevWebSearch(top_k=10)
results = ws.run(query="Who is the boyfriend of Olivia Wilde?")
documents = results["documents"]
links = results["documents"]
links = results["links"]
assert len(documents) == len(links) == 10
assert all(isinstance(doc, Document) for doc in results)
assert all(isinstance(link, str) for link in links)
Expand Down

0 comments on commit f22d499

Please sign in to comment.