From 8d1d7a752b84593c24678ec2f0e07aa925c57233 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 23 Feb 2024 15:30:40 +0100 Subject: [PATCH] MongoDB Atlas support in Haystack 2.0 (#180) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * MongoDB Atlas support in Haystack 2.0 * Update integrations/mongodb.md Co-authored-by: Bilge Yücel * pipeline example * Update integrations/mongodb.md * Update mongodb.md --------- Co-authored-by: Bilge Yücel --- integrations/mongodb.md | 125 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 3 deletions(-) diff --git a/integrations/mongodb.md b/integrations/mongodb.md index df53bc3a..1c7f2240 100644 --- a/integrations/mongodb.md +++ b/integrations/mongodb.md @@ -14,19 +14,138 @@ type: Document Store report_issue: https://github.com/deepset-ai/haystack/issues logo: /logos/mongodb.png toc: true +version: Haystack 2.0 --- +### Table of Contents + +- [Overview](#overview) +- [Haystack 2.0](#haystack-20) + - [Installation](#installation) + - [Usage](#usage) +- [Haystack 1.x](#haystack-1x) + - [Installation (1.x)](#installation-1x) + - [Usage (1.x)](#usage-1x) + - [Writing Documents to MongoDBAtlasDocumentStore](#writing-documents-to-mongodbstlasdocumentstore) + - [Indexing Pipeline](#indexing-pipeline) + - [Query Pipeline](#query-pipeline) + +## Overview + [MongoDB](https://www.mongodb.com/) is a document database designed for ease of application development and scaling. [MongoDB Atlas](https://www.mongodb.com/atlas) is a multi-cloud database service built by people behind MongoDB. MongoDB Atlas simplifies deploying and managing your databases while offering the versatility you need to build resilient and performant global applications on the cloud providers of your choice. For a detailed overview of all the available methods and settings for the `MongoDBAtlasDocumentStore`, visit the Haystack [Documentation](https://docs.haystack.deepset.ai/docs/document_store#initialization). -## Installation +## Haystack 2.0 + + +### Installation + +```bash +pip install mongodb-atlas-haystack +``` + +### Usage + +To use MongoDB Atlas as your data storage for your Haystack LLM pipelines, you must have a running database at MongoDB Atlas. For details, see [Get Started with Atlas](https://www.mongodb.com/docs/atlas/getting-started/). + +Once your database is set, first export an environment variable called `MONGO_CONNECTION_STRING`, which should look more or less like this: + +```bash +export MONGO_CONNECTION_STRING="mongodb+srv://:@.gwkckbk.mongodb.net/?retryWrites=true&w=majority" +``` + +And then you can initialize a [`MongoDBAtlasDocumentStore`](https://docs.haystack.deepset.ai/v2.0/docs/mongodbatlasdocumentstore) for Haystack with the required configurations: + +```python +from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore + +document_store = MongoDBAtlasDocumentStore( + database_name="haystack_test", + collection_name="test_collection", +) +``` + +### Example pipelines + +Here is some example code of an end-to-end RAG app built on MongoDB Atlas: one indexing pipeline that embeds the documents, +and a generative pipeline that can be used for question answering. + +```python +from haystack import Pipeline, Document +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.writers import DocumentWriter +from haystack.components.generators import OpenAIGenerator +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder +from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore +from haystack_integrations.components.embedders.mongodb_atlas import MongoDBAtlasEmbeddingRetriever + +# Create some example documents +documents = [ + Document(content="My name is Jean and I live in Paris."), + Document(content="My name is Mark and I live in Berlin."), + Document(content="My name is Giorgio and I live in Rome."), +] + +# We support many different databases. Here we load a simple and lightweight in-memory document store. +document_store = MongoDBAtlasDocumentStore() + +# Define some more components +doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP) +doc_embedder = SentenceTransformersDocumentEmbedder(model="intfloat/e5-base-v2") +query_embedder = SentenceTransformersTextEmbedder(model="intfloat/e5-base-v2") + +# Pipeline that ingests document for retrieval +indexing_pipe = Pipeline() +indexing_pipe.add_component(instance=doc_embedder, name="doc_embedder") +indexing_pipe.add_component(instance=doc_writer, name="doc_writer") + +indexing_pipe.connect("doc_embedder.documents", "doc_writer.documents") +indexing_pipe.run({"doc_embedder": {"documents": documents}}) + +# Build a RAG pipeline with a Retriever to get relevant documents to +# the query and a OpenAIGenerator interacting with LLMs using a custom prompt. +prompt_template = """ +Given these documents, answer the question.\nDocuments: +{% for doc in documents %} + {{ doc.content }} +{% endfor %} + +\nQuestion: {{question}} +\nAnswer: +""" +rag_pipeline = Pipeline() +rag_pipeline.add_component(instance=query_embedder, name="query_embedder") +rag_pipeline.add_component(instance=MongoDBAtlasEmbeddingRetriever(document_store=document_store), name="retriever") +rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +rag_pipeline.add_component(instance=OpenAIGenerator(), name="llm") +rag_pipeline.connect("query_embedder", "retriever.query_embedding") +rag_pipeline.connect("embedding_retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder", "llm") + +# Ask a question on the data you just added. +question = "Where does Mark live?" +result = rag_pipeline.run( + { + "query_embedder": {"text": question}, + "prompt_builder": {"question": question}, + } +) + +# For details, like which documents were used to generate the answer, look into the GeneratedAnswer object +print(result["answer_builder"]["answers"]) +``` + +## Haystack 1.x + +### Installation ```bash pip install farm-haystack[mongodb] ``` -## Usage +### Usage To use MongoDB Atlas as your data storage for your Haystack LLM pipelines, you must have a running database at MongoDB Atlas. For details, see [Get Started with Atlas](https://www.mongodb.com/docs/atlas/getting-started/). @@ -48,7 +167,7 @@ ds=MongoDBAtlasDocumentStore( To write documents to your `MongoDBAtlasDocumentStore`, create an indexing pipeline, or use the `write_documents()` function. For this step, you may make use of the available [FileConverters](https://docs.haystack.deepset.ai/docs/file_converters) and [PreProcessors](https://docs.haystack.deepset.ai/docs/preprocessor), as well as other [Integrations](/integrations) that might help you fetch data from other resources. Below is an example indexing pipeline that indexes your Markdown files into a MongoDB Atlas instance. -#### Indexing Pipeline +### Indexing Pipeline ```python from haystack import Pipeline