From 60d025b83be4d4f884c67819904383ccd89cff87 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sat, 4 Nov 2023 10:16:02 -0700 Subject: [PATCH] mongo parent document retrieval (#12887) --- .../mongo-parent-document-retrieval/LICENSE | 21 +++ .../mongo-parent-document-retrieval/README.md | 178 ++++++++++++++++++ .../mongo-parent-document-retrieval/ingest.py | 59 ++++++ .../__init__.py | 3 + .../mongo_parent_document_retrieval/chain.py | 91 +++++++++ .../pyproject.toml | 27 +++ .../tests/__init__.py | 0 7 files changed, 379 insertions(+) create mode 100644 templates/mongo-parent-document-retrieval/LICENSE create mode 100644 templates/mongo-parent-document-retrieval/README.md create mode 100644 templates/mongo-parent-document-retrieval/ingest.py create mode 100644 templates/mongo-parent-document-retrieval/mongo_parent_document_retrieval/__init__.py create mode 100644 templates/mongo-parent-document-retrieval/mongo_parent_document_retrieval/chain.py create mode 100644 templates/mongo-parent-document-retrieval/pyproject.toml create mode 100644 templates/mongo-parent-document-retrieval/tests/__init__.py diff --git a/templates/mongo-parent-document-retrieval/LICENSE b/templates/mongo-parent-document-retrieval/LICENSE new file mode 100644 index 0000000000000..426b65090341f --- /dev/null +++ b/templates/mongo-parent-document-retrieval/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 LangChain, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/templates/mongo-parent-document-retrieval/README.md b/templates/mongo-parent-document-retrieval/README.md new file mode 100644 index 0000000000000..ea4a01472530d --- /dev/null +++ b/templates/mongo-parent-document-retrieval/README.md @@ -0,0 +1,178 @@ +# mongo-parent-document-retrieval + +This template performs RAG using MongoDB and OpenAI. +It does a more advanced form of RAG called Parent-Document Retrieval. + +In this form of retrieval, a large document is first split into medium-sized chunks. +From there, those medium-sized chunks are split into small chunks. +Embeddings are created for the small chunks. +When a query comes in, an embedding is created for that query and compared to the small chunks. +But rather than passing the small chunks directly to the LLM for generation, the medium-sized chunks +from whence the smaller chunks came are passed. +This enables finer-grained search while still passing larger context to the LLM (which can be useful during generation). + +## Environment Setup + +You should export two environment variables, one being your MongoDB URI, the other being your OpenAI API KEY. +If you do not have a MongoDB URI, see the `MongoDB Setup` section at the bottom for instructions on how to do so. + +```shell +export MONGO_URI=... +export OPENAI_API_KEY=... 
+``` + +## Usage + +To use this package, you should first have the LangChain CLI installed: + +```shell +pip install -U langchain-cli +``` + +To create a new LangChain project and install this as the only package, you can do: + +```shell +langchain app new my-app --package mongo-parent-document-retrieval +``` + +If you want to add this to an existing project, you can just run: + +```shell +langchain app add mongo-parent-document-retrieval +``` + +And add the following code to your `server.py` file: +```python +from mongo_parent_document_retrieval import chain as mongo_parent_document_retrieval_chain + +add_routes(app, mongo_parent_document_retrieval_chain, path="/mongo-parent-document-retrieval") +``` + +(Optional) Let's now configure LangSmith. +LangSmith will help us trace, monitor and debug LangChain applications. +LangSmith is currently in private beta; you can sign up [here](https://smith.langchain.com/). +If you don't have access, you can skip this section. + + +```shell +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY= +export LANGCHAIN_PROJECT= # if not specified, defaults to "default" +``` + +If you DO NOT already have a Mongo Search Index you want to connect to, see the `MongoDB Setup` section below before proceeding. +Note that because Parent Document Retrieval uses a different indexing strategy, it's likely you will want to run this new setup. 
+ +If you DO have a MongoDB Search index you want to connect to, edit the connection details in `mongo_parent_document_retrieval/chain.py`. + +If you are inside this directory, then you can spin up a LangServe instance directly by: + +```shell +langchain serve +``` + +This will start the FastAPI app with a server running locally at +[http://localhost:8000](http://localhost:8000) + +We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) +We can access the playground at [http://127.0.0.1:8000/mongo-parent-document-retrieval/playground](http://127.0.0.1:8000/mongo-parent-document-retrieval/playground) + +We can access the template from code with: + +```python +from langserve.client import RemoteRunnable + +runnable = RemoteRunnable("http://localhost:8000/mongo-parent-document-retrieval") +``` + +For additional context, please refer to [this notebook](https://colab.research.google.com/drive/1cr2HBAHyBmwKUerJq2if0JaNhy-hIq7I#scrollTo=TZp7_CBfxTOB). + + +## MongoDB Setup + +Use this step if you need to set up your MongoDB account and ingest data. +We will first follow the standard MongoDB Atlas setup instructions [here](https://www.mongodb.com/docs/atlas/getting-started/). + +1. Create an account (if not already done) +2. Create a new project (if not already done) +3. Locate your MongoDB URI. + +This can be done by going to the deployment overview page and connecting to your database + +![connect.png](_images/connect.png) + +We then look at the drivers available + +![driver.png](_images/driver.png) + +Among which we will see our URI listed + +![uri.png](_images/uri.png) + +Let's then set that as an environment variable locally: + +```shell +export MONGO_URI=... +``` + +4. Let's also set an environment variable for OpenAI (which we will use as an LLM) + +```shell +export OPENAI_API_KEY=... +``` + +5. Let's now ingest some data! 
We can do that by moving into this directory and running the code in `ingest.py`, eg: + +```shell +python ingest.py +``` + +Note that you can (and should!) change this to ingest data of your choice + +6. We now need to set up a vector index on our data. + +We can first connect to the cluster where our database lives + +![cluster.png](_images%2Fcluster.png) + +We can then navigate to where all our collections are listed + +![collections.png](_images%2Fcollections.png) + +We can then find the collection we want and look at the search indexes for that collection + +![search-indexes.png](_images%2Fsearch-indexes.png) + +That should likely be empty, and we want to create a new one: + +![create.png](_images%2Fcreate.png) + +We will use the JSON editor to create it + +![json_editor.png](_images%2Fjson_editor.png) + +And we will paste the following JSON in: + +```text +{ + "mappings": { + "dynamic": true, + "fields": { + "doc_level": [ + { + "type": "token" + } + ], + "embedding": { + "dimensions": 1536, + "similarity": "cosine", + "type": "knnVector" + } + } + } +} +``` +![json.png](_images%2Fjson.png) + +From there, hit "Next" and then "Create Search Index". It will take a little bit but you should then have an index over your data! 
def parent_child_splitter(
    data,
    id_key=PARENT_DOC_ID_KEY,
    parent_chunk_size=2000,
    child_chunk_size=400,
):
    """Split documents into parent chunks and smaller, linked child chunks.

    Each document in ``data`` is first split into parent chunks. Every parent
    chunk is then split again into child chunks. A fresh UUID string is
    generated per parent and written to ``metadata[id_key]`` of the parent and
    of all of its children, and ``metadata["doc_level"]`` is set to
    ``"parent"`` or ``"child"`` respectively.

    Args:
        data: Documents to split (passed to
            ``RecursiveCharacterTextSplitter.split_documents``).
        id_key: Metadata key under which the shared parent id is stored.
        parent_chunk_size: ``chunk_size`` used for the parent splitter.
        child_chunk_size: ``chunk_size`` used for the child splitter; must be
            smaller than ``parent_chunk_size``.

    Returns:
        A ``(parent_docs, child_docs)`` tuple of lists.

    Raises:
        ValueError: If ``child_chunk_size`` is not smaller than
            ``parent_chunk_size``.
    """
    if child_chunk_size >= parent_chunk_size:
        raise ValueError(
            "child_chunk_size must be smaller than parent_chunk_size "
            f"(got {child_chunk_size} >= {parent_chunk_size})"
        )

    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    # This splitter creates the child documents, which are the ones searched
    # by embedding; they must be smaller than their parent.
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)

    parent_docs = parent_splitter.split_documents(data)
    child_docs = []
    for parent in parent_docs:
        parent_id = str(uuid.uuid4())
        children = child_splitter.split_documents([parent])
        for child in children:
            child.metadata[id_key] = parent_id
            child.metadata["doc_level"] = "child"
        child_docs.extend(children)
        # Tag the parent only after its children were derived from it, as the
        # original implementation did.
        parent.metadata[id_key] = parent_id
        parent.metadata["doc_level"] = "parent"
    return parent_docs, child_docs
MongoDBAtlasVectorSearch.from_documents( + documents=parent_docs + child_docs, + embedding=OpenAIEmbeddings(disallowed_special=()), + collection=MONGODB_COLLECTION, + index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME, + ) diff --git a/templates/mongo-parent-document-retrieval/mongo_parent_document_retrieval/__init__.py b/templates/mongo-parent-document-retrieval/mongo_parent_document_retrieval/__init__.py new file mode 100644 index 0000000000000..01e197dabbe03 --- /dev/null +++ b/templates/mongo-parent-document-retrieval/mongo_parent_document_retrieval/__init__.py @@ -0,0 +1,3 @@ +from mongo_parent_document_retrieval.chain import chain + +__all__ = ["chain"] diff --git a/templates/mongo-parent-document-retrieval/mongo_parent_document_retrieval/chain.py b/templates/mongo-parent-document-retrieval/mongo_parent_document_retrieval/chain.py new file mode 100644 index 0000000000000..7307e60b66b1d --- /dev/null +++ b/templates/mongo-parent-document-retrieval/mongo_parent_document_retrieval/chain.py @@ -0,0 +1,91 @@ +import os + +from langchain.chat_models import ChatOpenAI +from langchain.embeddings import OpenAIEmbeddings +from langchain.prompts import ChatPromptTemplate +from langchain.pydantic_v1 import BaseModel +from langchain.schema.document import Document +from langchain.schema.output_parser import StrOutputParser +from langchain.schema.runnable import RunnableParallel, RunnablePassthrough +from langchain.vectorstores import MongoDBAtlasVectorSearch +from pymongo import MongoClient + +MONGO_URI = os.environ["MONGO_URI"] +PARENT_DOC_ID_KEY = "parent_doc_id" +# Note that if you change this, you also need to change it in `ingest.py` +DB_NAME = "langchain-test-2" +COLLECTION_NAME = "test" +ATLAS_VECTOR_SEARCH_INDEX_NAME = "default" +EMBEDDING_FIELD_NAME = "embedding" +client = MongoClient(MONGO_URI) +db = client[DB_NAME] +MONGODB_COLLECTION = db[COLLECTION_NAME] + + +vector_search = MongoDBAtlasVectorSearch.from_connection_string( + MONGO_URI, + DB_NAME + "." 
def retrieve(query: str):
    """Return the distinct parent chunks of the child chunks closest to *query*.

    Runs a similarity search restricted to documents whose ``doc_level`` is
    ``"child"`` (``k=4``). A ``$lookup`` stage then joins each hit with the
    document in the same collection that shares its ``PARENT_DOC_ID_KEY`` value
    and has ``doc_level == "parent"``, exposing it as ``parent_context``.

    Args:
        query: The user question to search for.

    Returns:
        A list of ``Document`` objects, one per distinct parent id, in the
        order their children were returned by the search. Hits whose lookup
        produced no parent are skipped.
    """
    results = vector_search.similarity_search(
        query,
        k=4,
        pre_filter={"doc_level": {"$eq": "child"}},
        post_filter_pipeline=[
            {"$project": {"embedding": 0}},
            {
                "$lookup": {
                    "from": COLLECTION_NAME,
                    "localField": PARENT_DOC_ID_KEY,
                    "foreignField": PARENT_DOC_ID_KEY,
                    "as": "parent_context",
                    "pipeline": [
                        {"$match": {"doc_level": "parent"}},
                        {"$limit": 1},
                        {"$project": {"embedding": 0}},
                    ],
                }
            },
        ],
    )

    parent_docs = []
    seen_parent_ids = set()
    for result in results:
        parents = result.metadata.get("parent_context")
        if not parents:
            # The lookup matched no parent for this child; indexing [0] here
            # would raise, so skip the hit instead of failing the whole query.
            continue
        # Work on a copy so the search result's own metadata is left intact.
        parent = dict(parents[0])
        parent_id = parent[PARENT_DOC_ID_KEY]
        if parent_id in seen_parent_ids:
            continue
        seen_parent_ids.add(parent_id)
        text = parent.pop("text")
        # The MongoDB `_id` value causes serialization issues downstream.
        parent.pop("_id", None)
        parent_docs.append(Document(page_content=text, metadata=parent))
    return parent_docs
+pymongo = "^4.6.0" +pypdf = "^3.17.0" +tiktoken = "^0.5.1" + +[tool.poetry.group.dev.dependencies] +langchain-cli = ">=0.0.4" +fastapi = "^0.104.0" +sse-starlette = "^1.6.5" + +[tool.langserve] +export_module = "mongo_parent_document_retrieval" +export_attr = "chain" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/templates/mongo-parent-document-retrieval/tests/__init__.py b/templates/mongo-parent-document-retrieval/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d