feat: MongoDB Atlas keyword search #1200

Closed · wants to merge 15 commits
@@ -19,6 +19,8 @@

# To use the MongoDBAtlasDocumentStore, you must have a running MongoDB Atlas database.
# For details, see https://www.mongodb.com/docs/atlas/getting-started/
# NOTE: you need to manually create the vector search index and the full-text search
# index in your MongoDB Atlas database.

# Once your database is set up, set the environment variable `MONGO_CONNECTION_STRING`
# to the connection string of your MongoDB Atlas database.
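If you prefer not to export the variable in your shell, here is a minimal sketch of setting it from Python before the document store is created (the placeholder credentials below are assumptions, not working values):

```python
import os

# Assumption: replace the placeholders with your own Atlas username, password, and host.
# In most setups you would export MONGO_CONNECTION_STRING in your shell instead.
os.environ["MONGO_CONNECTION_STRING"] = (
    "mongodb+srv://<mongo_atlas_username>:<mongo_atlas_password>@<mongo_atlas_host>/?retryWrites=true&w=majority"
)
```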
@@ -29,12 +31,17 @@
database_name="haystack_test",
collection_name="test_collection",
vector_search_index="test_vector_search_index",
full_text_search_index="test_full_text_search_index",
)

# This is to avoid duplicates in the collection
print(f"Cleaning up collection {document_store.collection_name}")
document_store.collection.delete_many({})

# Create the indexing Pipeline and index some documents
file_paths = glob.glob("neural-search-pills/pills/*.md")


print("Creating indexing pipeline")
indexing = Pipeline()
indexing.add_component("converter", MarkdownToDocument())
indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
@@ -44,17 +51,20 @@
indexing.connect("splitter", "embedder")
indexing.connect("embedder", "writer")

print(f"Running indexing pipeline with {len(file_paths)} files")
indexing.run({"converter": {"sources": file_paths}})


# Create the querying Pipeline and try a query
print("Creating querying pipeline")
querying = Pipeline()
querying.add_component("embedder", SentenceTransformersTextEmbedder())
querying.add_component("retriever", MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=3))
querying.connect("embedder", "retriever")

query = "What is a cross-encoder?"
print(f"Running querying pipeline with query: '{query}'")
results = querying.run({"embedder": {"text": query}})

print(f"Results: {results}")
for doc in results["retriever"]["documents"]:
print(doc)
print("-" * 10)
80 changes: 80 additions & 0 deletions integrations/mongodb_atlas/examples/hybrid_retrieval.py
@@ -0,0 +1,80 @@
# Install required packages for this example, including mongodb-atlas-haystack and other libraries needed
# for Markdown conversion and embeddings generation. Use the following command:
#
# pip install mongodb-atlas-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0"
#
# Download some Markdown files to index.
# git clone https://github.com/anakin87/neural-search-pills

import glob

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter

from haystack_integrations.components.retrievers.mongodb_atlas import (
MongoDBAtlasEmbeddingRetriever,
MongoDBAtlasFullTextRetriever,
)
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore

# To use the MongoDBAtlasDocumentStore, you must have a running MongoDB Atlas database.
# For details, see https://www.mongodb.com/docs/atlas/getting-started/
# NOTE: you need to manually create the vector search index and the full-text search
# index in your MongoDB Atlas database.

# Once your database is set up, set the environment variable `MONGO_CONNECTION_STRING`
# to the connection string of your MongoDB Atlas database.
# format: "mongodb+srv://{mongo_atlas_username}:{mongo_atlas_password}@{mongo_atlas_host}/?{mongo_atlas_params_string}".

# Initialize the document store
document_store = MongoDBAtlasDocumentStore(
database_name="haystack_test",
collection_name="test_collection",
vector_search_index="test_vector_search_index",
full_text_search_index="test_full_text_search_index",
)
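The NOTE above says both search indexes must be created manually in Atlas. For reference, here is a minimal sketch of what the two index definitions could look like when pasted into the JSON editor of the Atlas UI; the `embedding` field path, the 768 dimensions, the cosine similarity, and the dynamic mapping are assumptions that must match your own embedding model and document fields:

```python
# Minimal sketch of the manually created index definitions (assumptions, not part of this PR).
vector_search_index_definition = {
    "fields": [
        {
            "type": "vector",
            "path": "embedding",    # field where the document embeddings are stored
            "numDimensions": 768,   # must match the embedding model's output size
            "similarity": "cosine",
        }
    ]
}

full_text_search_index_definition = {
    "mappings": {"dynamic": True}   # index all string fields, including the document content
}
```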

file_paths = glob.glob("neural-search-pills/pills/*.md")

# This is to avoid duplicates in the collection
print(f"Cleaning up collection {document_store.collection_name}")
document_store.collection.delete_many({})

print("Creating indexing pipeline")
indexing = Pipeline()
indexing.add_component("converter", MarkdownToDocument())
indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
indexing.add_component("document_embedder", SentenceTransformersDocumentEmbedder())
indexing.add_component("writer", DocumentWriter(document_store))
indexing.connect("converter", "splitter")
indexing.connect("splitter", "document_embedder")
indexing.connect("document_embedder", "writer")

print(f"Running indexing pipeline with {len(file_paths)} files")
indexing.run({"converter": {"sources": file_paths}})

print("Creating querying pipeline")
querying = Pipeline()
querying.add_component("text_embedder", SentenceTransformersTextEmbedder())
querying.add_component("embedding_retriever", MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=3))
querying.add_component("full_text_retriever", MongoDBAtlasFullTextRetriever(document_store=document_store, top_k=3))
querying.add_component(
"joiner",
DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3),
)
querying.connect("text_embedder", "embedding_retriever")
querying.connect("embedding_retriever", "joiner")
querying.connect("full_text_retriever", "joiner")

query = "cross-encoder"
print(f"Running querying pipeline with query '{query}'")
results = querying.run({"text_embedder": {"text": query}, "full_text_retriever": {"query": query}})

print(f"Results: {results}")
for doc in results["joiner"]["documents"]:
print(doc)
print("-" * 10)
@@ -1,3 +1,4 @@
from haystack_integrations.components.retrievers.mongodb_atlas.embedding_retriever import MongoDBAtlasEmbeddingRetriever
from haystack_integrations.components.retrievers.mongodb_atlas.full_text_retriever import MongoDBAtlasFullTextRetriever

__all__ = ["MongoDBAtlasEmbeddingRetriever"]
__all__ = ["MongoDBAtlasEmbeddingRetriever", "MongoDBAtlasFullTextRetriever"]
@@ -28,7 +28,8 @@ class MongoDBAtlasEmbeddingRetriever:

store = MongoDBAtlasDocumentStore(database_name="haystack_integration_test",
collection_name="test_embeddings_collection",
vector_search_index="cosine_index")
vector_search_index="cosine_index",
full_text_search_index="full_text_index")
retriever = MongoDBAtlasEmbeddingRetriever(document_store=store)

results = retriever.run(query_embedding=np.random.random(768).tolist())
@@ -0,0 +1,150 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Literal, Optional, Union

from haystack import component, default_from_dict, default_to_dict
from haystack.dataclasses import Document
from haystack.document_stores.types import FilterPolicy
from haystack.document_stores.types.filter_policy import apply_filter_policy

from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore


@component
class MongoDBAtlasFullTextRetriever:
"""
Retrieves documents from the MongoDBAtlasDocumentStore by full-text search.

The full-text search relies on the `full_text_search_index` configured in the MongoDBAtlasDocumentStore.
See MongoDBAtlasDocumentStore for more information.

Usage example:
```python
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
from haystack_integrations.components.retrievers.mongodb_atlas import MongoDBAtlasFullTextRetriever

store = MongoDBAtlasDocumentStore(database_name="your_existing_db",
collection_name="your_existing_collection",
vector_search_index="your_existing_index",
full_text_search_index="your_existing_index")
retriever = MongoDBAtlasFullTextRetriever(document_store=store)

results = retriever.run(query="Lorem ipsum")
print(results["documents"])
```

The example above retrieves the 10 documents that best match the query "Lorem ipsum" from the
MongoDBAtlasDocumentStore.
"""

def __init__(
self,
*,
document_store: MongoDBAtlasDocumentStore,
filters: Optional[Dict[str, Any]] = None,
top_k: int = 10,
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
):
"""
:param document_store: An instance of MongoDBAtlasDocumentStore.
:param filters: Filters applied to the retrieved Documents. Make sure that the fields used in the filters are
included in the configuration of the `full_text_search_index`. The configuration must be done manually
in the Web UI of MongoDB Atlas.
:param top_k: Maximum number of Documents to return.
:param filter_policy: Policy to determine how filters are applied.

:raises ValueError: If `document_store` is not an instance of MongoDBAtlasDocumentStore.
"""

if not isinstance(document_store, MongoDBAtlasDocumentStore):
msg = "document_store must be an instance of MongoDBAtlasDocumentStore"
raise ValueError(msg)

self.document_store = document_store
self.filters = filters or {}
self.top_k = top_k
self.filter_policy = (
filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
)

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.

:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
filters=self.filters,
top_k=self.top_k,
filter_policy=self.filter_policy.value,
document_store=self.document_store.to_dict(),
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "MongoDBAtlasFullTextRetriever":
"""
Deserializes the component from a dictionary.

:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
data["init_parameters"]["document_store"] = MongoDBAtlasDocumentStore.from_dict(
data["init_parameters"]["document_store"]
)

return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
def run(
self,
query: Union[str, List[str]],
fuzzy: Optional[Dict[str, int]] = None,
match_criteria: Optional[Literal["any", "all"]] = None,
score: Optional[Dict[str, Dict]] = None,
synonyms: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None,
top_k: Optional[int] = None,
) -> Dict[str, List[Document]]:
"""
Retrieve documents from the MongoDBAtlasDocumentStore by full-text search.

:param query: The query string or a list of query strings to search for.
If the query contains multiple terms, Atlas Search evaluates each term separately for matches.
:param fuzzy: Enables finding strings similar to the search term(s).
Note, `fuzzy` cannot be used with `synonyms`. Configurable options include `maxEdits`, `prefixLength`,
and `maxExpansions`. For more details refer to MongoDB Atlas
[documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
:param match_criteria: Defines how terms in the query are matched. Supported options are `"any"` and `"all"`.
For more details refer to MongoDB Atlas
[documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
:param score: Specifies the scoring method for matching results. Supported options include `boost`, `constant`,
and `function`. For more details refer to MongoDB Atlas
[documentation](https://www.mongodb.com/docs/atlas/atlas-search/text/#fields).
:param synonyms: The name of the synonym mapping definition in the index. This value cannot be an empty string.
Note, `synonyms` cannot be used with `fuzzy`.
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
the `filter_policy` chosen at retriever initialization. See init method docstring for more
details.
:param top_k: Maximum number of Documents to return. If provided, overrides the value specified at initialization.
:returns: A dictionary with the following keys:
- `documents`: List of Documents most similar to the given `query`
"""
filters = apply_filter_policy(self.filter_policy, self.filters, filters)
top_k = top_k or self.top_k

docs = self.document_store._fulltext_retrieval(
query=query,
fuzzy=fuzzy,
match_criteria=match_criteria,
score=score,
synonyms=synonyms,
filters=filters,
top_k=top_k,
)

return {"documents": docs}