From 746e7d1ebec0ed652042916cb4224bad7ee2f014 Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Thu, 13 Jun 2024 13:00:12 +0200 Subject: [PATCH 1/5] doc: add docstrings qdrant document store --- .../document_stores/qdrant/document_store.py | 258 ++++++++++++++++++ 1 file changed, 258 insertions(+) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index b3be56f40..5049bcae2 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -49,6 +49,45 @@ def get_batches_from_generator(iterable, n): class QdrantDocumentStore: + + """ + QdrantDocumentStore is a Document Store for Qdrant. It can be used with any Qdrant instance, in-memory, locally persisted, hosted, and the official Qdrant Cloud. + + Usage example by creating an in-memory instance: + + ```python + from haystack.dataclasses.document import Document + from haystack_integrations.document_stores.qdrant import QdrantDocumentStore + + document_store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + return_embedding=True, + wait_result_from_api=True, + ) + document_store.write_documents([ + Document(content="This is first", embedding=[0.0]*5), + Document(content="This is second", embedding=[0.1, 0.2, 0.3, 0.4, 0.5]) + ]) + ``` + + Usage example with Qdrant Cloud: + + ```python + from haystack.dataclasses.document import Document + from haystack_integrations.document_stores.qdrant import QdrantDocumentStore + + document_store = QdrantDocumentStore( + url="https://xxxxxx-xxxxx-xxxxx-xxxx-xxxxxxxxx.us-east.aws.cloud.qdrant.io:6333", + api_key="", + ) + document_store.write_documents([ + Document(content="This is first", embedding=[0.0]*5), + Document(content="This is second", embedding=[0.1, 0.2, 0.3, 0.4, 0.5]) + ]) + ``` + """ + 
SIMILARITY: ClassVar[Dict[str, str]] = { "cosine": rest.Distance.COSINE, "dot_product": rest.Distance.DOT, @@ -96,6 +135,91 @@ def __init__( scroll_size: int = 10_000, payload_fields_to_index: Optional[List[dict]] = None, ): + + """ + :param location: + If “:memory:” - use in-memory Qdrant instance. If str - use it as a url parameter. If None - use default values for host and port. + :param url: + Either host or str of “Optional[scheme], host, Optional[port], Optional[prefix]”. Default: None + :param port: + Port of the REST API interface. Default: 6333 + :param grpc_port: + Port of the gRPC interface. Default: 6334 + :param prefer_grpc: + If true - use gRPC interface whenever possible in custom methods. Default: False + :param https: + If true - use HTTPS(SSL) protocol. Default: None + :param api_key: + API key for authentication in Qdrant Cloud. Default: None + :param prefix: + If not None - add prefix to the REST URL path. Example: service/v1 will result in http://localhost:6333/service/v1/{qdrant-endpoint} for REST API. Default: None + :param timeout: + Timeout for REST and gRPC API requests. Default: 5 seconds for REST and unlimited for gRPC + :param host: + Host name of Qdrant service. If url and host are None, set to ‘localhost’. Default: None + :param path: + Persistence path for QdrantLocal. Default: None + :param force_disable_check_same_thread: + For QdrantLocal, force disable check_same_thread. Default: False. Only use this if you can guarantee that you can resolve the thread safety outside QdrantClient. + :param index: + Name of the index. Defaults to 'Document'. + :param embedding_dim: + Dimension of the embeddings. Default: 768 + :param on_disk: + Whether to store the collection on disk. Default: False. + :param content_field: + The field name for the document content. Default: 'content'. + :param name_field: + The field name for the document name. Default: 'name'. + :param embedding_field: + The field name for the document embeddings. 
Default: 'embedding'. + :param use_sparse_embedding: + If set to True, enables the use for 'Sparse Embedding' class. Default: False + :param similarity: + The similarity metric to use. Default: 'cosine' + :param return_embedding: + Whether to return embeddings in the search results. Default: False. + :param progress_bar: + Whether to show a progress bar or not. Default: True + :param duplicate_documents: + The policy for handling duplicate documents ("overwrite", "skip", or "fail"). Default: 'overwrite'. + :param recreate_index: + Whether to recreate the index. Default: False. + :param shard_number: + Number of shards in the collection. Default is 1, minimum is 1. + :param replication_factor: + Replication factor for the collection. Default is 1, minimum is 1. + Defines how many copies of each shard will be created. Effective only in distributed mode. + :param write_consistency_factor: + Write consistency factor for the collection. Default is 1, minimum is 1. + Defines how many replicas should apply the operation for it to be considered successful. + Increasing this number makes the collection more resilient to inconsistencies but will cause failures if not enough replicas are available. + Effective only in distributed mode. + :param on_disk_payload: + If true, the point's payload will not be stored in memory and will be read from the disk every time it is requested. + This setting saves RAM by slightly increasing response time. Note: indexed payload values remain in RAM. Deafault: None + :param hnsw_config: + Params for HNSW index. + :param optimizers_config: + Params for optimizer. + :param wal_config: + Params for Write-Ahead-Log. + :param quantization_config: + Params for quantization. If None, quantization will be disabled. Default: None + :param init_from: + Use data stored in another collection to initialize this collection. Default: None + :param wait_result_from_api: + Whether to wait for the result from the API after each request. Default: True. 
+ :param metadata: + Additional metadata to include with the documents. Default: None. + :param write_batch_size: + The batch size for writing documents. Default: 100. + :param scroll_size: + The scroll size for reading documents. Default: 10,000. + :param payload_fields_to_index: + List of payload fields to index. Default: None. + """ + self._client = None # Store the Qdrant client specific attributes @@ -172,6 +296,9 @@ def client(self): return self._client def count_documents(self) -> int: + """ + Returns the number of documents present in the DocumentStore. + """ try: response = self.client.count( collection_name=self.index, @@ -187,6 +314,15 @@ def filter_documents( self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None, ) -> List[Document]: + """ + Returns the documents that match the filters provided. + + For a detailed specification of the filters, refer to the + DocumentStore.filter_documents() protocol documentation. + + :param filters: The filters to apply to the document list. + :returns: A list of Documents that match the given filters. + """ if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter): msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`" raise ValueError(msg) @@ -204,6 +340,19 @@ def write_documents( documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL, ): + """ + Writes documents to Qdrant using the specified policy. + The QdrantDocumentStore can handle duplicate documents based on the given policy. The available policies are: + - `FAIL`: The operation will raise an error if any document already exists. + - `OVERWRITE`: Existing documents will be overwritten with the new ones. + - `SKIP`: Existing documents will be skipped, and only new documents will be added. + + This method validates the documents, sets up the collection, handles duplicates based on the policy, + and batches the documents for efficient upsertion into Qdrant. 
+ + :param documents: A list of Document objects to write to Qdrant. + :param policy: The policy for handling duplicate documents. Defaults to `FAIL`. + """ for doc in documents: if not isinstance(doc, Document): msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}." @@ -239,6 +388,11 @@ def write_documents( return len(document_objects) def delete_documents(self, ids: List[str]): + """ + Deletes all documents with matching IDs from the DocumentStore. + + :param ids: The IDs of the documents to delete. + """ ids = [convert_id(_id) for _id in ids] try: self.client.delete( @@ -253,10 +407,24 @@ def delete_documents(self, ids: List[str]): @classmethod def from_dict(cls, data: Dict[str, Any]) -> "QdrantDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. + """ deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ params = inspect.signature(self.__init__).parameters # type: ignore # All the __init__ params must be set as attributes # Set as init_parms without default values @@ -271,6 +439,13 @@ def get_documents_generator( self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None, ) -> Generator[Document, None, None]: + """ + Returns a generator that yields documents from Qdrant based on the provided filters. + + :param filters: Filters applied to the retrieved Documents. + :returns: A generator that yields Haystack Document objects retrieved from Qdrant. + """ + index = self.index qdrant_filters = convert_filters_to_qdrant(filters) @@ -299,6 +474,16 @@ def get_documents_by_id( ids: List[str], index: Optional[str] = None, ) -> List[Document]: + """ + Retrieves documents from Qdrant by their IDs. 
+ + :param ids: + A list of document IDs to retrieve. + :param index: + The name of the index to retrieve documents from. + :returns: + A list of Haystack Document objects. + """ index = index or self.index documents: List[Document] = [] @@ -325,6 +510,21 @@ def _query_by_sparse( scale_score: bool = True, return_embedding: bool = False, ) -> List[Document]: + """ + Queries Qdrant using sparse embeddings and returns the most relevant documents. + + :param query_sparse_embedding: Sparse embedding of the query. + :param filters: Filters applied to the retrieved Documents. + :param top_k: Maximum number of Documents to return. + :param scale_score: Whether to scale the scores of the retrieved documents. Default: True. + :param return_embedding: Whether to return the embeddings of the retrieved documents. + + :returns: List of Document that are most similar to `query_sparse_embedding`. + + :raises QdrantStoreError: + If the Document Store was initialized with `use_sparse_embeddings=False`. + """ + if not self.use_sparse_embeddings: message = ( "You are trying to query using sparse embeddings, but the Document Store " @@ -367,6 +567,17 @@ def _query_by_embedding( scale_score: bool = True, return_embedding: bool = False, ) -> List[Document]: + """ + Queries Qdrant using dense embeddings and returns the most relevant documents. + + :param query_embedding: Dense embedding of the query. + :param filters: Filters applied to the retrieved Documents. + :param top_k: Maximum number of Documents to return. + :param scale_score: Whether to scale the scores of the retrieved documents. Default: True. + :param return_embedding: Whether to return the embeddings of the retrieved documents. + + :returns: List of Document that are most similar to `query_embedding`. 
+ """ qdrant_filters = convert_filters_to_qdrant(filters) points = self.client.search( @@ -474,6 +685,16 @@ def _query_hybrid( return results def get_distance(self, similarity: str) -> rest.Distance: + """ + Retrieves the distance metric for the specified similarity measure. + + :param similarity: + The similarity measure to retrieve the distance. + :returns: + The corresponding rest.Distance object. + :raises QdrantStoreError: + If the provided similarity measure is not supported. + """ try: return self.SIMILARITY[similarity] except KeyError as ke: @@ -507,6 +728,29 @@ def _set_up_collection( on_disk: bool = False, payload_fields_to_index: Optional[List[dict]] = None, ): + """ + Sets up the Qdrant collection with the specified parameters. + :param collection_name: + The name of the collection to set up. + :param embedding_dim: + The dimension of the embeddings. + :param recreate_collection: + Whether to recreate the collection if it already exists. + :param similarity: + The similarity measure to use. + :param use_sparse_embeddings: + Whether to use sparse embeddings. + :param on_disk: + Whether to store the collection on disk. Default: False. + :param payload_fields_to_index: + List of payload fields to index. + + :raises QdrantStoreError: + If the collection exists with incompatible settings. + :raises ValueError: + If the collection exists with a different similarity measure or embedding dimension. + + """ distance = self.get_distance(similarity) if recreate_collection or not self.client.collection_exists(collection_name): @@ -576,6 +820,20 @@ def recreate_collection( on_disk: Optional[bool] = None, use_sparse_embeddings: Optional[bool] = None, ): + """ + Recreates the Qdrant collection with the specified parameters. + + :param collection_name: + The name of the collection to recreate. + :param distance: + The distance metric to use for the collection. + :param embedding_dim: + The dimension of the embeddings. 
+ :param on_disk: + Whether to store the collection on disk. Default: None. + :param use_sparse_embeddings: + Whether to use sparse embeddings. Default: None. + """ if on_disk is None: on_disk = self.on_disk From 2248b189f93d91d8f67db8a446ae2140d9db253c Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Thu, 13 Jun 2024 13:47:57 +0200 Subject: [PATCH 2/5] doc: add docstrings for qdrant document store --- .../document_stores/qdrant/document_store.py | 217 +++++++++--------- 1 file changed, 112 insertions(+), 105 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 5049bcae2..f30860d69 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -49,9 +49,9 @@ def get_batches_from_generator(iterable, n): class QdrantDocumentStore: - """ - QdrantDocumentStore is a Document Store for Qdrant. It can be used with any Qdrant instance, in-memory, locally persisted, hosted, and the official Qdrant Cloud. + QdrantDocumentStore is a Document Store for Qdrant. It can be used with any Qdrant instance: in-memory, + locally persisted, hosted, and the official Qdrant Cloud. 
Usage example by creating an in-memory instance: @@ -66,7 +66,7 @@ class QdrantDocumentStore: wait_result_from_api=True, ) document_store.write_documents([ - Document(content="This is first", embedding=[0.0]*5), + Document(content="This is first", embedding=[0.0]*5), Document(content="This is second", embedding=[0.1, 0.2, 0.3, 0.4, 0.5]) ]) ``` @@ -82,7 +82,7 @@ class QdrantDocumentStore: api_key="", ) document_store.write_documents([ - Document(content="This is first", embedding=[0.0]*5), + Document(content="This is first", embedding=[0.0]*5), Document(content="This is second", embedding=[0.1, 0.2, 0.3, 0.4, 0.5]) ]) ``` @@ -135,88 +135,94 @@ def __init__( scroll_size: int = 10_000, payload_fields_to_index: Optional[List[dict]] = None, ): - - """ - :param location: - If “:memory:” - use in-memory Qdrant instance. If str - use it as a url parameter. If None - use default values for host and port. - :param url: - Either host or str of “Optional[scheme], host, Optional[port], Optional[prefix]”. Default: None - :param port: - Port of the REST API interface. Default: 6333 - :param grpc_port: - Port of the gRPC interface. Default: 6334 - :param prefer_grpc: - If true - use gRPC interface whenever possible in custom methods. Default: False - :param https: - If true - use HTTPS(SSL) protocol. Default: None - :param api_key: - API key for authentication in Qdrant Cloud. Default: None - :param prefix: - If not None - add prefix to the REST URL path. Example: service/v1 will result in http://localhost:6333/service/v1/{qdrant-endpoint} for REST API. Default: None - :param timeout: - Timeout for REST and gRPC API requests. Default: 5 seconds for REST and unlimited for gRPC - :param host: - Host name of Qdrant service. If url and host are None, set to ‘localhost’. Default: None - :param path: - Persistence path for QdrantLocal. Default: None - :param force_disable_check_same_thread: - For QdrantLocal, force disable check_same_thread. Default: False. 
Only use this if you can guarantee that you can resolve the thread safety outside QdrantClient. + """ + :param location: + If “:memory:” - use in-memory Qdrant instance. If str - use it as a url parameter. + If None - use default values for host and port. + :param url: + Either host or str of “Optional[scheme], host, Optional[port], Optional[prefix]”. Default: None. + :param port: + Port of the REST API interface. Default: 6333. + :param grpc_port: + Port of the gRPC interface. Default: 6334. + :param prefer_grpc: + If true - use gRPC interface whenever possible in custom methods. Default: False. + :param https: + If true - use HTTPS(SSL) protocol. Default: None. + :param api_key: + API key for authentication in Qdrant Cloud. Default: None. + :param prefix: + If not None - add prefix to the REST URL path. + Example: service/v1 will result in http://localhost:6333/service/v1/{qdrant-endpoint} + for REST API. Default: None. + :param timeout: + Timeout for REST and gRPC API requests. Default: 5 seconds for REST and unlimited for gRPC. + :param host: + Host name of Qdrant service. If url and host are None, set to `localhost`. Default: None. + :param path: + Persistence path for QdrantLocal. Default: None. + :param force_disable_check_same_thread: + For QdrantLocal, force disable check_same_thread. Default: False. + Only use this if you can guarantee that you can resolve the thread safety outside QdrantClient. :param index: - Name of the index. Defaults to 'Document'. + Name of the index. Default: 'Document'. :param embedding_dim: - Dimension of the embeddings. Default: 768 + Dimension of the embeddings. Default: 768. :param on_disk: Whether to store the collection on disk. Default: False. :param content_field: - The field name for the document content. Default: 'content'. + The field for the document content. Default: 'content'. :param name_field: - The field name for the document name. Default: 'name'. + The field for the document name. Default: 'name'. 
:param embedding_field: - The field name for the document embeddings. Default: 'embedding'. + The field for the document embeddings. Default: 'embedding'. :param use_sparse_embedding: - If set to True, enables the use for 'Sparse Embedding' class. Default: False + If set to True, enables the use for 'Sparse Embedding' class. Default: False. :param similarity: - The similarity metric to use. Default: 'cosine' + The similarity metric to use. Default: 'cosine'. :param return_embedding: Whether to return embeddings in the search results. Default: False. :param progress_bar: - Whether to show a progress bar or not. Default: True + Whether to show a progress bar or not. Default: True. :param duplicate_documents: The policy for handling duplicate documents ("overwrite", "skip", or "fail"). Default: 'overwrite'. :param recreate_index: Whether to recreate the index. Default: False. - :param shard_number: + :param shard_number: Number of shards in the collection. Default is 1, minimum is 1. - :param replication_factor: - Replication factor for the collection. Default is 1, minimum is 1. + :param replication_factor: + Replication factor for the collection. Default is 1, minimum is 1. Defines how many copies of each shard will be created. Effective only in distributed mode. - :param write_consistency_factor: - Write consistency factor for the collection. Default is 1, minimum is 1. - Defines how many replicas should apply the operation for it to be considered successful. - Increasing this number makes the collection more resilient to inconsistencies but will cause failures if not enough replicas are available. - Effective only in distributed mode. - :param on_disk_payload: - If true, the point's payload will not be stored in memory and will be read from the disk every time it is requested. - This setting saves RAM by slightly increasing response time. Note: indexed payload values remain in RAM. Deafault: None - :param hnsw_config: - Params for HNSW index. 
- :param optimizers_config: - Params for optimizer. - :param wal_config: - Params for Write-Ahead-Log. - :param quantization_config: - Params for quantization. If None, quantization will be disabled. Default: None - :param init_from: - Use data stored in another collection to initialize this collection. Default: None - :param wait_result_from_api: + :param write_consistency_factor: + Write consistency factor for the collection. Default is 1, minimum is 1. + Defines how many replicas should apply to the operation for it to be considered successful. + Increasing this number makes the collection more resilient to inconsistencies + but will cause failures if not enough replicas are available. + Effective only in distributed mode. + :param on_disk_payload: + If true, the point's payload will not be stored in memory and + will be read from the disk every time it is requested. + This setting saves RAM by slightly increasing response time. + Note: indexed payload values remain in RAM. Deafault: None. + :param hnsw_config: + Params for HNSW index. + :param optimizers_config: + Params for optimizer. + :param wal_config: + Params for Write-Ahead-Log. + :param quantization_config: + Params for quantization. If None, quantization will be disabled. Default: None. + :param init_from: + Use data stored in another collection to initialize this collection. Default: None. + :param wait_result_from_api: Whether to wait for the result from the API after each request. Default: True. - :param metadata: + :param metadata: Additional metadata to include with the documents. Default: None. - :param write_batch_size: + :param write_batch_size: The batch size for writing documents. Default: 100. - :param scroll_size: + :param scroll_size: The scroll size for reading documents. Default: 10,000. - :param payload_fields_to_index: + :param payload_fields_to_index: List of payload fields to index. Default: None. 
""" @@ -297,7 +303,7 @@ def client(self): def count_documents(self) -> int: """ - Returns the number of documents present in the DocumentStore. + Returns the number of documents present in the Document Store. """ try: response = self.client.count( @@ -315,13 +321,13 @@ def filter_documents( filters: Optional[Union[Dict[str, Any], rest.Filter]] = None, ) -> List[Document]: """ - Returns the documents that match the filters provided. + Returns the documents that match the provided filters. For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation. :param filters: The filters to apply to the document list. - :returns: A list of Documents that match the given filters. + :returns: A list of documents that match the given filters. """ if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter): msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`" @@ -342,12 +348,13 @@ def write_documents( ): """ Writes documents to Qdrant using the specified policy. - The QdrantDocumentStore can handle duplicate documents based on the given policy. The available policies are: + The QdrantDocumentStore can handle duplicate documents based on the given policy. + The available policies are: - `FAIL`: The operation will raise an error if any document already exists. - `OVERWRITE`: Existing documents will be overwritten with the new ones. - `SKIP`: Existing documents will be skipped, and only new documents will be added. - This method validates the documents, sets up the collection, handles duplicates based on the policy, + This method validates the documents, sets up the collection, handles duplicates based on the policy, and batches the documents for efficient upsertion into Qdrant. :param documents: A list of Document objects to write to Qdrant. 
@@ -442,7 +449,7 @@ def get_documents_generator( """ Returns a generator that yields documents from Qdrant based on the provided filters. - :param filters: Filters applied to the retrieved Documents. + :param filters: Filters applied to the retrieved documents. :returns: A generator that yields Haystack Document objects retrieved from Qdrant. """ @@ -477,11 +484,11 @@ def get_documents_by_id( """ Retrieves documents from Qdrant by their IDs. - :param ids: + :param ids: A list of document IDs to retrieve. - :param index: - The name of the index to retrieve documents from. - :returns: + :param index: + The name of the index to retrieve documents from. + :returns: A list of Haystack Document objects. """ index = index or self.index @@ -514,12 +521,12 @@ def _query_by_sparse( Queries Qdrant using sparse embeddings and returns the most relevant documents. :param query_sparse_embedding: Sparse embedding of the query. - :param filters: Filters applied to the retrieved Documents. - :param top_k: Maximum number of Documents to return. + :param filters: Filters applied to the retrieved documents. + :param top_k: Maximum number of documents to return. :param scale_score: Whether to scale the scores of the retrieved documents. Default: True. :param return_embedding: Whether to return the embeddings of the retrieved documents. - :returns: List of Document that are most similar to `query_sparse_embedding`. + :returns: List of documents that are most similar to `query_sparse_embedding`. :raises QdrantStoreError: If the Document Store was initialized with `use_sparse_embeddings=False`. @@ -571,12 +578,12 @@ def _query_by_embedding( Queries Qdrant using dense embeddings and returns the most relevant documents. :param query_embedding: Dense embedding of the query. - :param filters: Filters applied to the retrieved Documents. - :param top_k: Maximum number of Documents to return. + :param filters: Filters applied to the retrieved documents. 
+ :param top_k: Maximum number of documents to return. :param scale_score: Whether to scale the scores of the retrieved documents. Default: True. :param return_embedding: Whether to return the embeddings of the retrieved documents. - :returns: List of Document that are most similar to `query_embedding`. + :returns: List of documents that are most similar to `query_embedding`. """ qdrant_filters = convert_filters_to_qdrant(filters) @@ -620,8 +627,8 @@ def _query_hybrid( :param query_embedding: Dense embedding of the query. :param query_sparse_embedding: Sparse embedding of the query. - :param filters: Filters applied to the retrieved Documents. - :param top_k: Maximum number of Documents to return. + :param filters: Filters applied to the retrieved documents. + :param top_k: Maximum number of documents to return. :param return_embedding: Whether to return the embeddings of the retrieved documents. :returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`. @@ -688,11 +695,11 @@ def get_distance(self, similarity: str) -> rest.Distance: """ Retrieves the distance metric for the specified similarity measure. - :param similarity: + :param similarity: The similarity measure to retrieve the distance. - :returns: + :returns: The corresponding rest.Distance object. - :raises QdrantStoreError: + :raises QdrantStoreError: If the provided similarity measure is not supported. """ try: @@ -730,26 +737,26 @@ def _set_up_collection( ): """ Sets up the Qdrant collection with the specified parameters. - :param collection_name: + :param collection_name: The name of the collection to set up. - :param embedding_dim: + :param embedding_dim: The dimension of the embeddings. - :param recreate_collection: + :param recreate_collection: Whether to recreate the collection if it already exists. - :param similarity: + :param similarity: The similarity measure to use. 
- :param use_sparse_embeddings: + :param use_sparse_embeddings: Whether to use sparse embeddings. - :param on_disk: + :param on_disk: Whether to store the collection on disk. Default: False. - :param payload_fields_to_index: + :param payload_fields_to_index: List of payload fields to index. - :raises QdrantStoreError: + :raises QdrantStoreError: If the collection exists with incompatible settings. - :raises ValueError: + :raises ValueError: If the collection exists with a different similarity measure or embedding dimension. - + """ distance = self.get_distance(similarity) @@ -823,15 +830,15 @@ def recreate_collection( """ Recreates the Qdrant collection with the specified parameters. - :param collection_name: + :param collection_name: The name of the collection to recreate. - :param distance: + :param distance: The distance metric to use for the collection. - :param embedding_dim: + :param embedding_dim: The dimension of the embeddings. - :param on_disk: + :param on_disk: Whether to store the collection on disk. Default: None. - :param use_sparse_embeddings: + :param use_sparse_embeddings: Whether to use sparse embeddings. Default: None. """ if on_disk is None: @@ -885,11 +892,11 @@ def _handle_duplicate_documents( :param documents: A list of Haystack Document objects. :param index: name of the index - :param duplicate_documents: Handle duplicates document based on parameter options. + :param duplicate_documents: Handle duplicate documents based on parameter options. Parameter options : ( 'skip','overwrite','fail') - skip (default option): Ignore the duplicates documents + skip (default option): Ignore the duplicates documents. overwrite: Update any existing documents with the same ID when adding documents. - fail: an error is raised if the document ID of the document being added already + fail: An error is raised if the document ID of the document being added already exists. :returns: A list of Haystack Document objects. 
""" @@ -910,10 +917,10 @@ def _handle_duplicate_documents( def _drop_duplicate_documents(self, documents: List[Document], index: Optional[str] = None) -> List[Document]: """ - Drop duplicates documents based on same hash ID + Drop duplicate documents based on same hash ID. :param documents: A list of Haystack Document objects. - :param index: name of the index + :param index: Name of the index. :returns: A list of Haystack Document objects. """ _hash_ids: Set = set() From b2ffd2ebcc56df4c19531f4a6ade2d4d14357434 Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Mon, 17 Jun 2024 23:42:18 +0200 Subject: [PATCH 3/5] Updated docstrings based on PR review --- .../document_stores/qdrant/document_store.py | 116 +++++++++--------- 1 file changed, 57 insertions(+), 59 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index f30860d69..57a4fb03b 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -50,8 +50,8 @@ def get_batches_from_generator(iterable, n): class QdrantDocumentStore: """ - QdrantDocumentStore is a Document Store for Qdrant. It can be used with any Qdrant instance: in-memory, - locally persisted, hosted, and the official Qdrant Cloud. + QdrantDocumentStore is a Document Store for Qdrant. + It can be used with any Qdrant instance: in-memory, disk-persisted, Docker-based, and Qdrant Cloud Cluster deployments. 
Usage example by creating an in-memory instance: @@ -61,9 +61,7 @@ class QdrantDocumentStore: document_store = QdrantDocumentStore( ":memory:", - recreate_index=True, - return_embedding=True, - wait_result_from_api=True, + recreate_index=True ) document_store.write_documents([ Document(content="This is first", embedding=[0.0]*5), @@ -137,73 +135,74 @@ def __init__( ): """ :param location: - If “:memory:” - use in-memory Qdrant instance. If str - use it as a url parameter. - If None - use default values for host and port. + If `memory` - use in-memory Qdrant instance. + If `str` - use it as a URL parameter. + If `None` - use default values for host and port. :param url: - Either host or str of “Optional[scheme], host, Optional[port], Optional[prefix]”. Default: None. + Either host or str of `Optional[scheme], host, Optional[port], Optional[prefix]`. :param port: - Port of the REST API interface. Default: 6333. + Port of the REST API interface. :param grpc_port: - Port of the gRPC interface. Default: 6334. + Port of the gRPC interface. :param prefer_grpc: - If true - use gRPC interface whenever possible in custom methods. Default: False. + If `True` - use gRPC interface whenever possible in custom methods. :param https: - If true - use HTTPS(SSL) protocol. Default: None. + If `True` - use HTTPS(SSL) protocol. :param api_key: - API key for authentication in Qdrant Cloud. Default: None. + API key for authentication in Qdrant Cloud. :param prefix: - If not None - add prefix to the REST URL path. + If not `None` - add prefix to the REST URL path. Example: service/v1 will result in http://localhost:6333/service/v1/{qdrant-endpoint} - for REST API. Default: None. + for REST API. :param timeout: - Timeout for REST and gRPC API requests. Default: 5 seconds for REST and unlimited for gRPC. + Timeout for REST and gRPC API requests. :param host: - Host name of Qdrant service. If url and host are None, set to `localhost`. Default: None. + Host name of Qdrant service. 
If url and host are None, set to `localhost`. :param path: - Persistence path for QdrantLocal. Default: None. + Persistence path for QdrantLocal. :param force_disable_check_same_thread: - For QdrantLocal, force disable check_same_thread. Default: False. + For QdrantLocal, force disable check_same_thread. Only use this if you can guarantee that you can resolve the thread safety outside QdrantClient. :param index: - Name of the index. Default: 'Document'. + Name of the index. :param embedding_dim: - Dimension of the embeddings. Default: 768. + Dimension of the embeddings. :param on_disk: - Whether to store the collection on disk. Default: False. + Whether to store the collection on disk. :param content_field: - The field for the document content. Default: 'content'. + The field for the document content. :param name_field: - The field for the document name. Default: 'name'. + The field for the document name. :param embedding_field: - The field for the document embeddings. Default: 'embedding'. + The field for the document embeddings. :param use_sparse_embedding: - If set to True, enables the use for 'Sparse Embedding' class. Default: False. + If set to True, enables the use for 'Sparse Embedding' class. :param similarity: - The similarity metric to use. Default: 'cosine'. + The similarity metric to use. :param return_embedding: - Whether to return embeddings in the search results. Default: False. + Whether to return embeddings in the search results. :param progress_bar: - Whether to show a progress bar or not. Default: True. + Whether to show a progress bar or not. :param duplicate_documents: - The policy for handling duplicate documents ("overwrite", "skip", or "fail"). Default: 'overwrite'. + The policy for handling duplicate documents ("overwrite", "skip", or "fail"). :param recreate_index: - Whether to recreate the index. Default: False. + Whether to recreate the index. :param shard_number: - Number of shards in the collection. Default is 1, minimum is 1. 
+ Number of shards in the collection. :param replication_factor: - Replication factor for the collection. Default is 1, minimum is 1. + Replication factor for the collection. Defines how many copies of each shard will be created. Effective only in distributed mode. :param write_consistency_factor: - Write consistency factor for the collection. Default is 1, minimum is 1. + Write consistency factor for the collection. Minimum value is 1. Defines how many replicas should apply to the operation for it to be considered successful. Increasing this number makes the collection more resilient to inconsistencies but will cause failures if not enough replicas are available. Effective only in distributed mode. :param on_disk_payload: - If true, the point's payload will not be stored in memory and + If `True`, the point's payload will not be stored in memory and will be read from the disk every time it is requested. This setting saves RAM by slightly increasing response time. - Note: indexed payload values remain in RAM. Deafault: None. + Note: indexed payload values remain in RAM. :param hnsw_config: Params for HNSW index. :param optimizers_config: @@ -211,19 +210,19 @@ def __init__( :param wal_config: Params for Write-Ahead-Log. :param quantization_config: - Params for quantization. If None, quantization will be disabled. Default: None. + Params for quantization. If None, quantization will be disabled. :param init_from: - Use data stored in another collection to initialize this collection. Default: None. + Use data stored in another collection to initialize this collection. :param wait_result_from_api: - Whether to wait for the result from the API after each request. Default: True. + Whether to wait for the result from the API after each request. :param metadata: - Additional metadata to include with the documents. Default: None. + Additional metadata to include with the documents. :param write_batch_size: - The batch size for writing documents. Default: 100. 
+ The batch size for writing documents. :param scroll_size: - The scroll size for reading documents. Default: 10,000. + The scroll size for reading documents. :param payload_fields_to_index: - List of payload fields to index. Default: None. + List of payload fields to index. """ self._client = None @@ -324,7 +323,7 @@ def filter_documents( Returns the documents that match the provided filters. For a detailed specification of the filters, refer to the - DocumentStore.filter_documents() protocol documentation. + [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering) :param filters: The filters to apply to the document list. :returns: A list of documents that match the given filters. @@ -354,11 +353,10 @@ def write_documents( - `OVERWRITE`: Existing documents will be overwritten with the new ones. - `SKIP`: Existing documents will be skipped, and only new documents will be added. - This method validates the documents, sets up the collection, handles duplicates based on the policy, - and batches the documents for efficient upsertion into Qdrant. - :param documents: A list of Document objects to write to Qdrant. - :param policy: The policy for handling duplicate documents. Defaults to `FAIL`. + :param policy: The policy for handling duplicate documents. + + :returns: The number of documents written to the document store. """ for doc in documents: if not isinstance(doc, Document): @@ -396,9 +394,9 @@ def write_documents( def delete_documents(self, ids: List[str]): """ - Deletes all documents with matching document_ids from the DocumentStore. + Deletes documents that match the provided `ids` from the document store. - :param document_ids: The object_ids to delete. + :param ids: The document IDs to delete. """ ids = [convert_id(_id) for _id in ids] try: @@ -450,7 +448,7 @@ def get_documents_generator( Returns a generator that yields documents from Qdrant based on the provided filters.
:param filters: Filters applied to the retrieved documents. - :returns: A generator that yields Haystack Document objects retrieved from Qdrant. + :returns: A generator that yields documents retrieved from Qdrant. """ index = self.index @@ -489,7 +487,7 @@ def get_documents_by_id( :param index: The name of the index to retrieve documents from. :returns: - A list of Haystack Document objects. + A list of documents. """ index = index or self.index @@ -518,12 +516,12 @@ def _query_by_sparse( return_embedding: bool = False, ) -> List[Document]: """ - Queries Qdrant using sparse embeddings and returns the most relevant documents. + Queries Qdrant using sparse a embedding and returns the most relevant documents. :param query_sparse_embedding: Sparse embedding of the query. :param filters: Filters applied to the retrieved documents. :param top_k: Maximum number of documents to return. - :param scale_score: Whether to scale the scores of the retrieved documents. Default: True. + :param scale_score: Whether to scale the scores of the retrieved documents. :param return_embedding: Whether to return the embeddings of the retrieved documents. :returns: List of documents that are most similar to `query_sparse_embedding`. @@ -575,12 +573,12 @@ def _query_by_embedding( return_embedding: bool = False, ) -> List[Document]: """ - Queries Qdrant using dense embeddings and returns the most relevant documents. + Queries Qdrant using a dense embedding and returns the most relevant documents. :param query_embedding: Dense embedding of the query. :param filters: Filters applied to the retrieved documents. :param top_k: Maximum number of documents to return. - :param scale_score: Whether to scale the scores of the retrieved documents. Default: True. + :param scale_score: Whether to scale the scores of the retrieved documents. :param return_embedding: Whether to return the embeddings of the retrieved documents. :returns: List of documents that are most similar to `query_embedding`. 
@@ -748,7 +746,7 @@ def _set_up_collection( :param use_sparse_embeddings: Whether to use sparse embeddings. :param on_disk: - Whether to store the collection on disk. Default: False. + Whether to store the collection on disk. :param payload_fields_to_index: List of payload fields to index. @@ -837,9 +835,9 @@ def recreate_collection( :param embedding_dim: The dimension of the embeddings. :param on_disk: - Whether to store the collection on disk. Default: None. + Whether to store the collection on disk. :param use_sparse_embeddings: - Whether to use sparse embeddings. Default: None. + Whether to use sparse embeddings. """ if on_disk is None: on_disk = self.on_disk From c7a829d1d159eb3d40603ff89a7967d0808057ff Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Mon, 17 Jun 2024 23:50:27 +0200 Subject: [PATCH 4/5] Fixed lint errors --- .../document_stores/qdrant/document_store.py | 81 ++++++++++--------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 57a4fb03b..4a5d91332 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -50,8 +50,9 @@ def get_batches_from_generator(iterable, n): class QdrantDocumentStore: """ - QdrantDocumentStore is a Document Store for Qdrant. - It can be used with any Qdrant instance: in-memory, disk-persisted, Docker-based, and Qdrant Cloud Cluster deployments. + QdrantDocumentStore is a Document Store for Qdrant. + It can be used with any Qdrant instance: in-memory, disk-persisted, Docker-based, + and Qdrant Cloud Cluster deployments. Usage example by creating an in-memory instance: @@ -135,62 +136,62 @@ def __init__( ): """ :param location: - If `memory` - use in-memory Qdrant instance. 
+ If `memory` - use in-memory Qdrant instance. If `str` - use it as a URL parameter. If `None` - use default values for host and port. :param url: Either host or str of `Optional[scheme], host, Optional[port], Optional[prefix]`. :param port: - Port of the REST API interface. + Port of the REST API interface. :param grpc_port: Port of the gRPC interface. :param prefer_grpc: - If `True` - use gRPC interface whenever possible in custom methods. + If `True` - use gRPC interface whenever possible in custom methods. :param https: - If `True` - use HTTPS(SSL) protocol. + If `True` - use HTTPS(SSL) protocol. :param api_key: - API key for authentication in Qdrant Cloud. + API key for authentication in Qdrant Cloud. :param prefix: If not `None` - add prefix to the REST URL path. Example: service/v1 will result in http://localhost:6333/service/v1/{qdrant-endpoint} - for REST API. + for REST API. :param timeout: - Timeout for REST and gRPC API requests. + Timeout for REST and gRPC API requests. :param host: - Host name of Qdrant service. If url and host are None, set to `localhost`. + Host name of Qdrant service. If url and host are None, set to `localhost`. :param path: - Persistence path for QdrantLocal. + Persistence path for QdrantLocal. :param force_disable_check_same_thread: - For QdrantLocal, force disable check_same_thread. + For QdrantLocal, force disable check_same_thread. Only use this if you can guarantee that you can resolve the thread safety outside QdrantClient. :param index: - Name of the index. + Name of the index. :param embedding_dim: - Dimension of the embeddings. + Dimension of the embeddings. :param on_disk: - Whether to store the collection on disk. + Whether to store the collection on disk. :param content_field: - The field for the document content. + The field for the document content. :param name_field: - The field for the document name. + The field for the document name. :param embedding_field: - The field for the document embeddings. 
+ The field for the document embeddings. :param use_sparse_embedding: - If set to True, enables the use for 'Sparse Embedding' class. + If set to True, enables the use for 'Sparse Embedding' class. :param similarity: - The similarity metric to use. + The similarity metric to use. :param return_embedding: - Whether to return embeddings in the search results. + Whether to return embeddings in the search results. :param progress_bar: - Whether to show a progress bar or not. + Whether to show a progress bar or not. :param duplicate_documents: - The policy for handling duplicate documents ("overwrite", "skip", or "fail"). + The policy for handling duplicate documents ("overwrite", "skip", or "fail"). :param recreate_index: - Whether to recreate the index. + Whether to recreate the index. :param shard_number: - Number of shards in the collection. + Number of shards in the collection. :param replication_factor: - Replication factor for the collection. + Replication factor for the collection. Defines how many copies of each shard will be created. Effective only in distributed mode. :param write_consistency_factor: Write consistency factor for the collection. Minimum value is 1. @@ -202,7 +203,7 @@ def __init__( If `True`, the point's payload will not be stored in memory and will be read from the disk every time it is requested. This setting saves RAM by slightly increasing response time. - Note: indexed payload values remain in RAM. + Note: indexed payload values remain in RAM. :param hnsw_config: Params for HNSW index. :param optimizers_config: @@ -210,19 +211,19 @@ def __init__( :param wal_config: Params for Write-Ahead-Log. :param quantization_config: - Params for quantization. If None, quantization will be disabled. + Params for quantization. If None, quantization will be disabled. :param init_from: - Use data stored in another collection to initialize this collection. + Use data stored in another collection to initialize this collection. 
:param wait_result_from_api: - Whether to wait for the result from the API after each request. + Whether to wait for the result from the API after each request. :param metadata: - Additional metadata to include with the documents. + Additional metadata to include with the documents. :param write_batch_size: - The batch size for writing documents. + The batch size for writing documents. :param scroll_size: - The scroll size for reading documents. + The scroll size for reading documents. :param payload_fields_to_index: - List of payload fields to index. + List of payload fields to index. """ self._client = None @@ -354,9 +355,9 @@ def write_documents( - `SKIP`: Existing documents will be skipped, and only new documents will be added. :param documents: A list of Document objects to write to Qdrant. - :param policy: The policy for handling duplicate documents. + :param policy: The policy for handling duplicate documents. - :returns: The number of documents written to the document store. + :returns: The number of documents written to the document store. """ for doc in documents: if not isinstance(doc, Document): @@ -578,7 +579,7 @@ def _query_by_embedding( :param query_embedding: Dense embedding of the query. :param filters: Filters applied to the retrieved documents. :param top_k: Maximum number of documents to return. - :param scale_score: Whether to scale the scores of the retrieved documents. + :param scale_score: Whether to scale the scores of the retrieved documents. :param return_embedding: Whether to return the embeddings of the retrieved documents. :returns: List of documents that are most similar to `query_embedding`. @@ -746,7 +747,7 @@ def _set_up_collection( :param use_sparse_embeddings: Whether to use sparse embeddings. :param on_disk: - Whether to store the collection on disk. + Whether to store the collection on disk. :param payload_fields_to_index: List of payload fields to index. 
@@ -835,9 +836,9 @@ def recreate_collection( :param embedding_dim: The dimension of the embeddings. :param on_disk: - Whether to store the collection on disk. + Whether to store the collection on disk. :param use_sparse_embeddings: - Whether to use sparse embeddings. + Whether to use sparse embeddings. """ if on_disk is None: on_disk = self.on_disk From fb8feed80b29b456ed3523c26d810a4272de8b7e Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Thu, 20 Jun 2024 13:10:04 +0200 Subject: [PATCH 5/5] Fixed minor typos --- .../document_stores/qdrant/document_store.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 4a5d91332..51d64e5e3 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -158,7 +158,7 @@ def __init__( :param timeout: Timeout for REST and gRPC API requests. :param host: - Host name of Qdrant service. If url and host are None, set to `localhost`. + Host name of Qdrant service. If `url` and `host` are `None`, set to `localhost`. :param path: Persistence path for QdrantLocal. :param force_disable_check_same_thread: @@ -177,7 +177,7 @@ def __init__( :param embedding_field: The field for the document embeddings. :param use_sparse_embedding: - If set to True, enables the use for 'Sparse Embedding' class. + If set to `True`, enables support for sparse embeddings. :param similarity: The similarity metric to use. :param return_embedding: @@ -185,7 +185,7 @@ def __init__( :param progress_bar: Whether to show a progress bar or not. :param duplicate_documents: - The policy for handling duplicate documents ("overwrite", "skip", or "fail"). + The parameter is not used and will be removed in a future release.
:param recreate_index: Whether to recreate the index. :param shard_number: @@ -211,7 +211,7 @@ def __init__( :param wal_config: Params for Write-Ahead-Log. :param quantization_config: - Params for quantization. If None, quantization will be disabled. + Params for quantization. If `None`, quantization will be disabled. :param init_from: Use data stored in another collection to initialize this collection. :param wait_result_from_api: @@ -517,7 +517,7 @@ def _query_by_sparse( return_embedding: bool = False, ) -> List[Document]: """ - Queries Qdrant using sparse a embedding and returns the most relevant documents. + Queries Qdrant using a sparse embedding and returns the most relevant documents. :param query_sparse_embedding: Sparse embedding of the query. :param filters: Filters applied to the retrieved documents.