Skip to content

Commit

Permalink
docs: Docstring update (#525)
Browse files Browse the repository at this point in the history
* Docstring update

* PR review - Julian

* pylint fixes
  • Loading branch information
vblagoje authored Mar 4, 2024
1 parent 710ac4d commit 9b98f60
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
@component
class ElasticsearchBM25Retriever:
"""
ElasticsearchBM25Retriever is a keyword-based retriever that uses BM25 to find the most
similar documents to a user's query.
ElasticsearchBM25Retriever retrieves documents from the ElasticsearchDocumentStore using the BM25 algorithm to find the
most similar documents to a user's query.
This retriever is only compatible with ElasticsearchDocumentStore.
Usage example:
Expand All @@ -35,7 +36,7 @@ class ElasticsearchBM25Retriever:
result = retriever.run(query="Who lives in Berlin?")
for doc in result["documents"]:
print(doc.text)
print(doc.content)
```
"""

Expand All @@ -55,8 +56,9 @@ def __init__(
:param filters: Filters applied to the retrieved Documents, for more info
see `ElasticsearchDocumentStore.filter_documents`, defaults to None
:param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
see the official documentation for valid values:
https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
See the official
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
for more details.
:param top_k: Maximum number of Documents to return, defaults to 10
:param scale_score: If `True`, scales the Documents' scores between 0 and 1, defaults to False
"""
Expand All @@ -72,6 +74,12 @@ def __init__(
self._scale_score = scale_score

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
filters=self._filters,
Expand All @@ -83,6 +91,14 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
data["init_parameters"]["document_store"]
)
Expand All @@ -96,7 +112,8 @@ def run(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optio
:param query: String to search in Documents' text.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:return: List of Documents that match the query.
:returns: A dictionary with the following keys:
- `documents`: List of Documents that match the query.
"""
docs = self._document_store._bm25_retrieval(
query=query,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,35 @@
@component
class ElasticsearchEmbeddingRetriever:
"""
Uses a vector similarity metric to retrieve documents from the ElasticsearchDocumentStore.
ElasticsearchEmbeddingRetriever retrieves documents from the ElasticsearchDocumentStore using vector similarity.
Needs to be connected to the ElasticsearchDocumentStore to run.
Usage example:
```python
from haystack import Document
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
retriever = ElasticsearchEmbeddingRetriever(document_store=document_store)
# Add documents to DocumentStore
documents = [
Document(content="My name is Carla and I live in Berlin"),
Document(content="My name is Paul and I live in New York"),
Document(content="My name is Silvano and I live in Matera"),
Document(content="My name is Usagi Tsukino and I live in Tokyo"),
]
document_store.write_documents(documents)
te = SentenceTransformersTextEmbedder()
te.warm_up()
query_embeddings = te.run("Who lives in Berlin?")["embedding"]
result = retriever.run(query_embedding=query_embeddings)
for doc in result["documents"]:
print(doc.content)
```
"""

def __init__(
Expand All @@ -33,8 +59,8 @@ def __init__(
:param top_k: Maximum number of Documents to return, defaults to 10
:param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
Increasing this value will improve search accuracy at the cost of slower search speeds.
You can read more about it in the Elasticsearch documentation:
https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
You can read more about it in the Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
:raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore.
"""
if not isinstance(document_store, ElasticsearchDocumentStore):
Expand All @@ -47,6 +73,12 @@ def __init__(
self._num_candidates = num_candidates

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
filters=self._filters,
Expand All @@ -57,6 +89,14 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
data["init_parameters"]["document_store"]
)
Expand All @@ -70,7 +110,8 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] =
:param query_embedding: Embedding of the query.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:return: List of Documents similar to `query_embedding`.
:returns: A dictionary with the following keys:
- `documents`: List of Documents most similar to the given `query_embedding`.
"""
docs = self._document_store._embedding_retrieval(
query_embedding=query_embedding,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,16 @@

class ElasticsearchDocumentStore:
"""
ElasticsearchDocumentStore is a Document Store for Elasticsearch.
It can be used with Elastic Cloud or your own Elasticsearch cluster.
ElasticsearchDocumentStore is a Document Store for Elasticsearch. It can be used with Elastic Cloud or your own
Elasticsearch cluster.
Simple usage with Elastic Cloud:
Usage example with Elastic Cloud:
```python
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(cloud_id="YOUR_CLOUD_ID", api_key="YOUR_API_KEY")
```
One can also connect to a self-hosted Elasticsearch instance:
Usage example with a self-hosted Elasticsearch instance:
```python
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
Expand All @@ -53,8 +53,8 @@ class ElasticsearchDocumentStore:
We strongly recommend enabling security so that only authorized users can access your data.
For more details on how to connect to Elasticsearch and configure security,
see the official Elasticsearch documentation:
https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
see the official Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html).
All extra keyword arguments will be passed to the Elasticsearch client.
"""
Expand All @@ -75,19 +75,19 @@ def __init__(
One can also set the similarity function used to compare Documents embeddings. This is mostly useful
when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`.
For more information on connection parameters, see the official Elasticsearch documentation:
https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
For more information on connection parameters, see the official Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
For the full list of supported kwargs, see the official Elasticsearch reference:
https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch
For the full list of supported kwargs, see the official Elasticsearch
[reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
:param hosts: List of hosts running the Elasticsearch client. Defaults to None
:param index: Name of index in Elasticsearch, if it doesn't exist it will be created. Defaults to "default"
:param embedding_similarity_function: The similarity function used to compare Documents embeddings.
Defaults to "cosine". This parameter only takes effect if the index does not yet exist and is created.
To choose the most appropriate function, look for information about your embedding model.
To understand how document scores are computed, see the Elasticsearch documentation:
https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params
To understand how document scores are computed, see the Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)
:param **kwargs: Optional arguments that ``Elasticsearch`` takes.
"""
self._hosts = hosts
Expand Down Expand Up @@ -115,6 +115,12 @@ def __init__(
self._client.indices.create(index=index, mappings=mappings)

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
# This is not the best solution to serialise this class but is the fastest to implement.
# Not all kwargs types can be serialised to text so this can fail. We must serialise each
# type explicitly to handle this properly.
Expand All @@ -128,11 +134,20 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchDocumentStore":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
return default_from_dict(cls, data)

def count_documents(self) -> int:
"""
Returns how many documents are present in the document store.
:returns: Number of documents in the document store.
"""
return self._client.count(index=self._index)["count"]

Expand Down Expand Up @@ -165,6 +180,14 @@ def _search_documents(self, **kwargs) -> List[Document]:
return documents

def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
"""
The main query method for the document store. It retrieves all documents that match the filters.
:param filters: A dictionary of filters to apply. For more information on the structure of the filters,
see the official Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
:returns: List of Documents that match the filters.
"""
if filters and "operator" not in filters and "conditions" not in filters:
filters = convert(filters)

Expand All @@ -175,8 +198,13 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
"""
Writes Documents to Elasticsearch.
If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with the
same ID already exists in the document store.
:param documents: List of Documents to write to the document store.
:param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
:returns: Number of documents written to the document store.
"""
if len(documents) > 0:
if not isinstance(documents[0], Document):
Expand Down Expand Up @@ -229,6 +257,9 @@ def _deserialize_document(self, hit: Dict[str, Any]) -> Document:
"""
Creates a Document from the search hit provided.
This is mostly useful in self.filter_documents().
:param hit: A search hit from Elasticsearch.
:returns: Document created from the search hit.
"""
data = hit["_source"]

Expand All @@ -242,7 +273,7 @@ def delete_documents(self, document_ids: List[str]) -> None:
"""
Deletes all documents with matching document_ids from the document store.
:param object_ids: the object_ids to delete
:param document_ids: the IDs of the documents to delete
"""

#
Expand Down Expand Up @@ -272,18 +303,19 @@ def _bm25_retrieval(
`ElasticsearchDocumentStore` nor called directly.
`ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
`query` must be a non empty string, otherwise a `ValueError` will be raised.
`query` must be a non-empty string, otherwise a `ValueError` will be raised.
:param query: String to search in saved Documents' text.
:param filters: Filters applied to the retrieved Documents, for more info
see `ElasticsearchDocumentStore.filter_documents`, defaults to None
:param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
see the official documentation for valid values:
https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
See the official
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
for valid values.
:param top_k: Maximum number of Documents to return, defaults to 10
:param scale_score: If `True`, scales the Documents' scores between 0 and 1, defaults to False
:raises ValueError: If `query` is an empty string
:return: List of Document that match `query`
:returns: List of Documents that match `query`
"""

if not query:
Expand Down Expand Up @@ -341,10 +373,10 @@ def _embedding_retrieval(
:param top_k: Maximum number of Documents to return, defaults to 10
:param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
Increasing this value will improve search accuracy at the cost of slower search speeds.
You can read more about it in the Elasticsearch documentation:
https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy
You can read more about it in the Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
:raises ValueError: If `query_embedding` is an empty list
:return: List of Document that are most similar to `query_embedding`
:returns: List of Documents that are most similar to `query_embedding`
"""

if not query_embedding:
Expand Down

0 comments on commit 9b98f60

Please sign in to comment.